diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,172692 +1,196148 @@ { - "best_metric": 0.2483615279197693, - "best_model_checkpoint": "./checkpoints/ckpt_8_6_r32_claude_all_fp16_lr5e-05_batch8/checkpoint-141500", - "epoch": 4.995953980930937, + "best_metric": 0.427830308675766, + "best_model_checkpoint": "../checkpoints/iccad-contest-results/checkpoint-110000", + "epoch": 4.991530615922442, "eval_steps": 500, - "global_step": 142000, + "global_step": 138500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0, - "learning_rate": 5.863039399624766e-08, - "loss": 0.796, + "epoch": 0.0001801996612246369, + "grad_norm": 0.279486745595932, + "learning_rate": 6.005284650492435e-08, + "loss": 1.012, "step": 5 }, { - "epoch": 0.0, - "learning_rate": 1.1726078799249532e-07, - "loss": 0.7443, + "epoch": 0.0003603993224492738, + "grad_norm": 0.22080670297145844, + "learning_rate": 1.201056930098487e-07, + "loss": 0.9384, "step": 10 }, { - "epoch": 0.0, - "learning_rate": 1.7589118198874298e-07, - "loss": 0.8128, + "epoch": 0.0005405989836739107, + "grad_norm": 0.16033534705638885, + "learning_rate": 1.80158539514773e-07, + "loss": 0.993, "step": 15 }, { - "epoch": 0.0, - "learning_rate": 2.3452157598499064e-07, - "loss": 0.8111, + "epoch": 0.0007207986448985476, + "grad_norm": 0.21021625399589539, + "learning_rate": 2.402113860196974e-07, + "loss": 0.9073, "step": 20 }, { - "epoch": 0.0, - "learning_rate": 2.931519699812383e-07, - "loss": 0.6772, + "epoch": 0.0009009983061231845, + "grad_norm": 0.1960284262895584, + "learning_rate": 3.002642325246217e-07, + "loss": 0.9033, "step": 25 }, { - "epoch": 0.0, - "learning_rate": 3.5178236397748596e-07, - "loss": 0.8012, + "epoch": 0.0010811979673478213, + "grad_norm": 0.31484127044677734, + "learning_rate": 3.60317079029546e-07, + "loss": 1.0241, "step": 30 }, { - "epoch": 0.0, - "learning_rate": 4.1041275797373365e-07, - "loss": 0.7822, + "epoch": 0.0012613976285724582, + "grad_norm": 0.35444363951683044, + "learning_rate": 4.2036992553447037e-07, + "loss": 0.9982, "step": 35 }, { - "epoch": 0.0, - "learning_rate": 4.690431519699813e-07, - "loss": 0.7554, + "epoch": 0.0014415972897970951, + "grad_norm": 0.25069698691368103, + "learning_rate": 4.804227720393948e-07, + "loss": 0.9843, "step": 40 }, { - "epoch": 0.0, - "learning_rate": 5.276735459662289e-07, - "loss": 0.7366, + "epoch": 0.001621796951021732, + "grad_norm": 0.23986054956912994, + "learning_rate": 5.40475618544319e-07, + "loss": 0.9416, "step": 45 }, { - "epoch": 0.0, - "learning_rate": 5.863039399624765e-07, - "loss": 0.7378, + "epoch": 0.001801996612246369, + "grad_norm": 0.2893970012664795, + "learning_rate": 6.005284650492434e-07, + "loss": 0.9448, "step": 50 }, { - "epoch": 0.0, - "learning_rate": 6.449343339587242e-07, - "loss": 0.8267, + "epoch": 0.0019821962734710057, + "grad_norm": 0.19641688466072083, + "learning_rate": 6.605813115541676e-07, + "loss": 0.8997, "step": 55 }, { - "epoch": 0.0, - "learning_rate": 7.035647279549719e-07, - "loss": 0.8, + "epoch": 0.0021623959346956426, + "grad_norm": 0.2509602904319763, + "learning_rate": 7.20634158059092e-07, + "loss": 0.9826, "step": 60 }, { - "epoch": 0.0, - "learning_rate": 7.621951219512196e-07, - "loss": 0.8192, + "epoch": 0.0023425955959202795, + "grad_norm": 0.25828176736831665, + "learning_rate": 7.806870045640163e-07, + "loss": 0.9222, "step": 65 }, { - "epoch": 0.0, - "learning_rate": 
8.208255159474673e-07, - "loss": 0.8117, + "epoch": 0.0025227952571449165, + "grad_norm": 0.19463542103767395, + "learning_rate": 8.407398510689407e-07, + "loss": 0.8906, "step": 70 }, { - "epoch": 0.0, - "learning_rate": 8.794559099437148e-07, - "loss": 0.7438, + "epoch": 0.0027029949183695534, + "grad_norm": 0.1942373514175415, + "learning_rate": 9.007926975738651e-07, + "loss": 0.9204, "step": 75 }, { - "epoch": 0.0, - "learning_rate": 9.380863039399626e-07, - "loss": 0.7744, + "epoch": 0.0028831945795941903, + "grad_norm": 0.2071942389011383, + "learning_rate": 9.608455440787895e-07, + "loss": 0.9334, "step": 80 }, { - "epoch": 0.0, - "learning_rate": 9.9671669793621e-07, - "loss": 0.7994, + "epoch": 0.0030633942408188272, + "grad_norm": 0.2774100601673126, + "learning_rate": 1.0208983905837138e-06, + "loss": 0.9543, "step": 85 }, { - "epoch": 0.0, - "learning_rate": 1.0553470919324578e-06, - "loss": 0.7964, + "epoch": 0.003243593902043464, + "grad_norm": 0.21016931533813477, + "learning_rate": 1.080951237088638e-06, + "loss": 0.9632, "step": 90 }, { - "epoch": 0.0, - "learning_rate": 1.1139774859287056e-06, - "loss": 0.7616, + "epoch": 0.003423793563268101, + "grad_norm": 0.24393318593502045, + "learning_rate": 1.1410040835935624e-06, + "loss": 0.8989, "step": 95 }, { - "epoch": 0.0, - "learning_rate": 1.172607879924953e-06, - "loss": 0.7486, + "epoch": 0.003603993224492738, + "grad_norm": 0.22489190101623535, + "learning_rate": 1.2010569300984867e-06, + "loss": 0.9678, "step": 100 }, { - "epoch": 0.0, - "learning_rate": 1.2312382739212008e-06, - "loss": 0.7867, + "epoch": 0.003784192885717375, + "grad_norm": 0.2544313669204712, + "learning_rate": 1.261109776603411e-06, + "loss": 0.9295, "step": 105 }, { - "epoch": 0.0, - "learning_rate": 1.2898686679174484e-06, - "loss": 0.7258, + "epoch": 0.003964392546942011, + "grad_norm": 0.21903349459171295, + "learning_rate": 1.3211626231083353e-06, + "loss": 0.9345, "step": 110 }, { - "epoch": 0.0, - "learning_rate": 1.348499061913696e-06, - "loss": 0.7761, + "epoch": 0.004144592208166649, + "grad_norm": 0.2654012143611908, + "learning_rate": 1.3812154696132596e-06, + "loss": 0.9683, "step": 115 }, { - "epoch": 0.0, - "learning_rate": 1.4071294559099438e-06, - "loss": 0.7119, + "epoch": 0.004324791869391285, + "grad_norm": 0.25590232014656067, + "learning_rate": 1.441268316118184e-06, + "loss": 0.9614, "step": 120 }, { - "epoch": 0.0, - "learning_rate": 1.4657598499061916e-06, - "loss": 0.7707, + "epoch": 0.004504991530615923, + "grad_norm": 0.25431954860687256, + "learning_rate": 1.5013211626231084e-06, + "loss": 1.0293, "step": 125 }, { - "epoch": 0.0, - "learning_rate": 1.5243902439024391e-06, - "loss": 0.7129, + "epoch": 0.004685191191840559, + "grad_norm": 0.18761098384857178, + "learning_rate": 1.5613740091280327e-06, + "loss": 0.8426, "step": 130 }, { - "epoch": 0.0, - "learning_rate": 1.5830206378986866e-06, - "loss": 0.7411, + "epoch": 0.004865390853065196, + "grad_norm": 0.21217921376228333, + "learning_rate": 1.621426855632957e-06, + "loss": 0.9276, "step": 135 }, { - "epoch": 0.0, - "learning_rate": 1.6416510318949346e-06, - "loss": 0.781, + "epoch": 0.005045590514289833, + "grad_norm": 0.23869900405406952, + "learning_rate": 1.6814797021378815e-06, + "loss": 0.8987, "step": 140 }, { - "epoch": 0.01, - "learning_rate": 1.7002814258911821e-06, - "loss": 0.7319, + "epoch": 0.00522579017551447, + "grad_norm": 0.18920210003852844, + "learning_rate": 1.7415325486428058e-06, + "loss": 0.9036, "step": 145 }, { - "epoch": 0.01, - 
"learning_rate": 1.7589118198874296e-06, - "loss": 0.7559, + "epoch": 0.005405989836739107, + "grad_norm": 0.2015441209077835, + "learning_rate": 1.8015853951477303e-06, + "loss": 0.8491, "step": 150 }, { - "epoch": 0.01, - "learning_rate": 1.8175422138836774e-06, - "loss": 0.8009, + "epoch": 0.005586189497963744, + "grad_norm": 0.18248215317726135, + "learning_rate": 1.8616382416526544e-06, + "loss": 0.9322, "step": 155 }, { - "epoch": 0.01, - "learning_rate": 1.8761726078799251e-06, - "loss": 0.7054, + "epoch": 0.005766389159188381, + "grad_norm": 0.23022598028182983, + "learning_rate": 1.921691088157579e-06, + "loss": 0.8488, "step": 160 }, { - "epoch": 0.01, - "learning_rate": 1.934803001876173e-06, - "loss": 0.6913, + "epoch": 0.005946588820413018, + "grad_norm": 0.18739941716194153, + "learning_rate": 1.981743934662503e-06, + "loss": 0.8359, "step": 165 }, { - "epoch": 0.01, - "learning_rate": 1.99343339587242e-06, - "loss": 0.6887, + "epoch": 0.0061267884816376544, + "grad_norm": 0.21552830934524536, + "learning_rate": 2.0417967811674277e-06, + "loss": 0.8829, "step": 170 }, { - "epoch": 0.01, - "learning_rate": 2.052063789868668e-06, - "loss": 0.6825, + "epoch": 0.006306988142862292, + "grad_norm": 0.15109090507030487, + "learning_rate": 2.101849627672352e-06, + "loss": 0.7759, "step": 175 }, { - "epoch": 0.01, - "learning_rate": 2.1106941838649157e-06, - "loss": 0.7041, + "epoch": 0.006487187804086928, + "grad_norm": 0.2417735457420349, + "learning_rate": 2.161902474177276e-06, + "loss": 0.9047, "step": 180 }, { - "epoch": 0.01, - "learning_rate": 2.1693245778611634e-06, - "loss": 0.686, + "epoch": 0.006667387465311566, + "grad_norm": 0.23814047873020172, + "learning_rate": 2.2219553206822005e-06, + "loss": 0.8425, "step": 185 }, { - "epoch": 0.01, - "learning_rate": 2.227954971857411e-06, - "loss": 0.7251, + "epoch": 0.006847587126536202, + "grad_norm": 0.23794889450073242, + "learning_rate": 2.282008167187125e-06, + "loss": 0.7744, "step": 190 }, { - "epoch": 0.01, - "learning_rate": 2.2865853658536584e-06, - "loss": 0.6662, + "epoch": 0.007027786787760839, + "grad_norm": 0.18542109429836273, + "learning_rate": 2.342061013692049e-06, + "loss": 0.7658, "step": 195 }, { - "epoch": 0.01, - "learning_rate": 2.345215759849906e-06, - "loss": 0.7285, + "epoch": 0.007207986448985476, + "grad_norm": 0.18955494463443756, + "learning_rate": 2.4021138601969734e-06, + "loss": 0.8884, "step": 200 }, { - "epoch": 0.01, - "learning_rate": 2.403846153846154e-06, - "loss": 0.6286, + "epoch": 0.0073881861102101124, + "grad_norm": 0.17132216691970825, + "learning_rate": 2.4621667067018977e-06, + "loss": 0.8408, "step": 205 }, { - "epoch": 0.01, - "learning_rate": 2.4624765478424017e-06, - "loss": 0.6775, + "epoch": 0.00756838577143475, + "grad_norm": 0.20857086777687073, + "learning_rate": 2.522219553206822e-06, + "loss": 0.826, "step": 210 }, { - "epoch": 0.01, - "learning_rate": 2.5211069418386494e-06, - "loss": 0.6458, + "epoch": 0.007748585432659386, + "grad_norm": 0.23217755556106567, + "learning_rate": 2.5822723997117463e-06, + "loss": 0.7692, "step": 215 }, { - "epoch": 0.01, - "learning_rate": 2.5797373358348967e-06, - "loss": 0.6852, + "epoch": 0.007928785093884023, + "grad_norm": 0.26247748732566833, + "learning_rate": 2.6423252462166706e-06, + "loss": 0.758, "step": 220 }, { - "epoch": 0.01, - "learning_rate": 2.638367729831145e-06, - "loss": 0.736, + "epoch": 0.008108984755108661, + "grad_norm": 0.1590353399515152, + "learning_rate": 2.7023780927215953e-06, + "loss": 0.7827, "step": 225 
}, { - "epoch": 0.01, - "learning_rate": 2.696998123827392e-06, - "loss": 0.6177, + "epoch": 0.008289184416333297, + "grad_norm": 0.1754877269268036, + "learning_rate": 2.762430939226519e-06, + "loss": 0.8534, "step": 230 }, { - "epoch": 0.01, - "learning_rate": 2.75562851782364e-06, - "loss": 0.6665, + "epoch": 0.008469384077557934, + "grad_norm": 0.2057289332151413, + "learning_rate": 2.822483785731444e-06, + "loss": 0.7856, "step": 235 }, { - "epoch": 0.01, - "learning_rate": 2.8142589118198877e-06, - "loss": 0.6038, + "epoch": 0.00864958373878257, + "grad_norm": 0.15028229355812073, + "learning_rate": 2.882536632236368e-06, + "loss": 0.7429, "step": 240 }, { - "epoch": 0.01, - "learning_rate": 2.872889305816135e-06, - "loss": 0.6792, + "epoch": 0.008829783400007209, + "grad_norm": 0.20948627591133118, + "learning_rate": 2.9425894787412925e-06, + "loss": 0.7745, "step": 245 }, { - "epoch": 0.01, - "learning_rate": 2.931519699812383e-06, - "loss": 0.6346, + "epoch": 0.009009983061231845, + "grad_norm": 0.2172427922487259, + "learning_rate": 3.0026423252462168e-06, + "loss": 0.7531, "step": 250 }, { - "epoch": 0.01, - "learning_rate": 2.9901500938086305e-06, - "loss": 0.5861, + "epoch": 0.009190182722456482, + "grad_norm": 0.2265809327363968, + "learning_rate": 3.062695171751141e-06, + "loss": 0.8168, "step": 255 }, { - "epoch": 0.01, - "learning_rate": 3.0487804878048782e-06, - "loss": 0.6147, + "epoch": 0.009370382383681118, + "grad_norm": 0.216866135597229, + "learning_rate": 3.1227480182560654e-06, + "loss": 0.7149, "step": 260 }, { - "epoch": 0.01, - "learning_rate": 3.1074108818011255e-06, - "loss": 0.5976, + "epoch": 0.009550582044905756, + "grad_norm": 0.21732446551322937, + "learning_rate": 3.1828008647609896e-06, + "loss": 0.6542, "step": 265 }, { - "epoch": 0.01, - "learning_rate": 3.1660412757973733e-06, - "loss": 0.6033, + "epoch": 0.009730781706130393, + "grad_norm": 0.21542949974536896, + "learning_rate": 3.242853711265914e-06, + "loss": 0.7051, "step": 270 }, { - "epoch": 0.01, - "learning_rate": 3.224671669793621e-06, - "loss": 0.6113, + "epoch": 0.00991098136735503, + "grad_norm": 0.19147822260856628, + "learning_rate": 3.3029065577708387e-06, + "loss": 0.7055, "step": 275 }, { - "epoch": 0.01, - "learning_rate": 3.283302063789869e-06, - "loss": 0.5681, + "epoch": 0.010091181028579666, + "grad_norm": 0.24729080498218536, + "learning_rate": 3.362959404275763e-06, + "loss": 0.7251, "step": 280 }, { - "epoch": 0.01, - "learning_rate": 3.3419324577861165e-06, - "loss": 0.5717, + "epoch": 0.010271380689804302, + "grad_norm": 0.21902547776699066, + "learning_rate": 3.423012250780687e-06, + "loss": 0.7087, "step": 285 }, { - "epoch": 0.01, - "learning_rate": 3.4005628517823642e-06, - "loss": 0.5843, + "epoch": 0.01045158035102894, + "grad_norm": 0.1997164785861969, + "learning_rate": 3.4830650972856115e-06, + "loss": 0.7187, "step": 290 }, { - "epoch": 0.01, - "learning_rate": 3.4591932457786115e-06, - "loss": 0.5899, + "epoch": 0.010631780012253577, + "grad_norm": 0.16925355792045593, + "learning_rate": 3.543117943790536e-06, + "loss": 0.6483, "step": 295 }, { - "epoch": 0.01, - "learning_rate": 3.5178236397748593e-06, - "loss": 0.5319, + "epoch": 0.010811979673478214, + "grad_norm": 0.11117614805698395, + "learning_rate": 3.6031707902954605e-06, + "loss": 0.6396, "step": 300 }, { - "epoch": 0.01, - "learning_rate": 3.5764540337711074e-06, - "loss": 0.6013, + "epoch": 0.01099217933470285, + "grad_norm": 0.15845130383968353, + "learning_rate": 3.6632236368003844e-06, + "loss": 
0.6586, "step": 305 }, { - "epoch": 0.01, - "learning_rate": 3.6350844277673548e-06, - "loss": 0.6063, + "epoch": 0.011172378995927488, + "grad_norm": 0.15920360386371613, + "learning_rate": 3.7232764833053087e-06, + "loss": 0.6414, "step": 310 }, { - "epoch": 0.01, - "learning_rate": 3.6937148217636025e-06, - "loss": 0.5724, + "epoch": 0.011352578657152125, + "grad_norm": 0.1967982053756714, + "learning_rate": 3.7833293298102334e-06, + "loss": 0.6027, "step": 315 }, { - "epoch": 0.01, - "learning_rate": 3.7523452157598502e-06, - "loss": 0.5849, + "epoch": 0.011532778318376761, + "grad_norm": 0.1459701806306839, + "learning_rate": 3.843382176315158e-06, + "loss": 0.6115, "step": 320 }, { - "epoch": 0.01, - "learning_rate": 3.8109756097560976e-06, - "loss": 0.5275, + "epoch": 0.011712977979601398, + "grad_norm": 0.1541188508272171, + "learning_rate": 3.903435022820082e-06, + "loss": 0.6935, "step": 325 }, { - "epoch": 0.01, - "learning_rate": 3.869606003752346e-06, - "loss": 0.5455, + "epoch": 0.011893177640826036, + "grad_norm": 0.15788233280181885, + "learning_rate": 3.963487869325006e-06, + "loss": 0.6226, "step": 330 }, { - "epoch": 0.01, - "learning_rate": 3.928236397748593e-06, - "loss": 0.518, + "epoch": 0.012073377302050672, + "grad_norm": 0.15022051334381104, + "learning_rate": 4.023540715829931e-06, + "loss": 0.6914, "step": 335 }, { - "epoch": 0.01, - "learning_rate": 3.98686679174484e-06, - "loss": 0.5309, + "epoch": 0.012253576963275309, + "grad_norm": 0.16778388619422913, + "learning_rate": 4.083593562334855e-06, + "loss": 0.6599, "step": 340 }, { - "epoch": 0.01, - "learning_rate": 4.0454971857410885e-06, - "loss": 0.5405, + "epoch": 0.012433776624499945, + "grad_norm": 0.154036745429039, + "learning_rate": 4.143646408839779e-06, + "loss": 0.6181, "step": 345 }, { - "epoch": 0.01, - "learning_rate": 4.104127579737336e-06, - "loss": 0.4894, + "epoch": 0.012613976285724584, + "grad_norm": 0.15167488157749176, + "learning_rate": 4.203699255344704e-06, + "loss": 0.6736, "step": 350 }, { - "epoch": 0.01, - "learning_rate": 4.162757973733584e-06, - "loss": 0.5709, + "epoch": 0.01279417594694922, + "grad_norm": 0.1559118628501892, + "learning_rate": 4.263752101849628e-06, + "loss": 0.6087, "step": 355 }, { - "epoch": 0.01, - "learning_rate": 4.221388367729831e-06, - "loss": 0.5289, + "epoch": 0.012974375608173857, + "grad_norm": 0.11976916342973709, + "learning_rate": 4.323804948354552e-06, + "loss": 0.6182, "step": 360 }, { - "epoch": 0.01, - "learning_rate": 4.280018761726079e-06, - "loss": 0.5217, + "epoch": 0.013154575269398493, + "grad_norm": 0.12271628528833389, + "learning_rate": 4.383857794859476e-06, + "loss": 0.594, "step": 365 }, { - "epoch": 0.01, - "learning_rate": 4.338649155722327e-06, - "loss": 0.5165, + "epoch": 0.013334774930623131, + "grad_norm": 0.16896136105060577, + "learning_rate": 4.443910641364401e-06, + "loss": 0.6635, "step": 370 }, { - "epoch": 0.01, - "learning_rate": 4.397279549718574e-06, - "loss": 0.4879, + "epoch": 0.013514974591847768, + "grad_norm": 0.1389530748128891, + "learning_rate": 4.503963487869326e-06, + "loss": 0.62, "step": 375 }, { - "epoch": 0.01, - "learning_rate": 4.455909943714822e-06, - "loss": 0.5202, + "epoch": 0.013695174253072404, + "grad_norm": 0.1768377125263214, + "learning_rate": 4.56401633437425e-06, + "loss": 0.6179, "step": 380 }, { - "epoch": 0.01, - "learning_rate": 4.51454033771107e-06, - "loss": 0.4872, + "epoch": 0.01387537391429704, + "grad_norm": 0.12114609777927399, + "learning_rate": 4.6240691808791735e-06, + 
"loss": 0.5923, "step": 385 }, { - "epoch": 0.01, - "learning_rate": 4.573170731707317e-06, - "loss": 0.493, + "epoch": 0.014055573575521677, + "grad_norm": 0.13337160646915436, + "learning_rate": 4.684122027384098e-06, + "loss": 0.6363, "step": 390 }, { - "epoch": 0.01, - "learning_rate": 4.631801125703565e-06, - "loss": 0.4825, + "epoch": 0.014235773236746315, + "grad_norm": 0.12390133738517761, + "learning_rate": 4.744174873889023e-06, + "loss": 0.5967, "step": 395 }, { - "epoch": 0.01, - "learning_rate": 4.690431519699812e-06, - "loss": 0.456, + "epoch": 0.014415972897970952, + "grad_norm": 0.13579072058200836, + "learning_rate": 4.804227720393947e-06, + "loss": 0.6168, "step": 400 }, { - "epoch": 0.01, - "learning_rate": 4.7490619136960605e-06, - "loss": 0.4817, + "epoch": 0.014596172559195588, + "grad_norm": 0.13865922391414642, + "learning_rate": 4.8642805668988715e-06, + "loss": 0.6619, "step": 405 }, { - "epoch": 0.01, - "learning_rate": 4.807692307692308e-06, - "loss": 0.4762, + "epoch": 0.014776372220420225, + "grad_norm": 0.13145041465759277, + "learning_rate": 4.924333413403795e-06, + "loss": 0.6132, "step": 410 }, { - "epoch": 0.01, - "learning_rate": 4.866322701688555e-06, - "loss": 0.4508, + "epoch": 0.014956571881644863, + "grad_norm": 0.13919544219970703, + "learning_rate": 4.98438625990872e-06, + "loss": 0.6158, "step": 415 }, { - "epoch": 0.01, - "learning_rate": 4.924953095684803e-06, - "loss": 0.4522, + "epoch": 0.0151367715428695, + "grad_norm": 0.14985574781894684, + "learning_rate": 5.044439106413644e-06, + "loss": 0.6162, "step": 420 }, { - "epoch": 0.01, - "learning_rate": 4.983583489681051e-06, - "loss": 0.4894, + "epoch": 0.015316971204094136, + "grad_norm": 0.16457071900367737, + "learning_rate": 5.104491952918569e-06, + "loss": 0.6272, "step": 425 }, { - "epoch": 0.02, - "learning_rate": 5.042213883677299e-06, - "loss": 0.4979, + "epoch": 0.015497170865318773, + "grad_norm": 0.09835664927959442, + "learning_rate": 5.164544799423493e-06, + "loss": 0.5721, "step": 430 }, { - "epoch": 0.02, - "learning_rate": 5.100844277673546e-06, - "loss": 0.4948, + "epoch": 0.01567737052654341, + "grad_norm": 0.12352655827999115, + "learning_rate": 5.224597645928417e-06, + "loss": 0.5868, "step": 435 }, { - "epoch": 0.02, - "learning_rate": 5.1594746716697934e-06, - "loss": 0.4799, + "epoch": 0.015857570187768046, + "grad_norm": 0.11715588718652725, + "learning_rate": 5.284650492433341e-06, + "loss": 0.5729, "step": 440 }, { - "epoch": 0.02, - "learning_rate": 5.218105065666042e-06, - "loss": 0.4551, + "epoch": 0.016037769848992686, + "grad_norm": 0.14185546338558197, + "learning_rate": 5.344703338938266e-06, + "loss": 0.6224, "step": 445 }, { - "epoch": 0.02, - "learning_rate": 5.27673545966229e-06, - "loss": 0.4232, + "epoch": 0.016217969510217322, + "grad_norm": 0.10686635971069336, + "learning_rate": 5.404756185443191e-06, + "loss": 0.5762, "step": 450 }, { - "epoch": 0.02, - "learning_rate": 5.335365853658537e-06, - "loss": 0.4446, + "epoch": 0.01639816917144196, + "grad_norm": 0.1367388218641281, + "learning_rate": 5.4648090319481145e-06, + "loss": 0.6108, "step": 455 }, { - "epoch": 0.02, - "learning_rate": 5.393996247654784e-06, - "loss": 0.4542, + "epoch": 0.016578368832666595, + "grad_norm": 0.1376735419034958, + "learning_rate": 5.524861878453038e-06, + "loss": 0.6249, "step": 460 }, { - "epoch": 0.02, - "learning_rate": 5.452626641651032e-06, - "loss": 0.4734, + "epoch": 0.01675856849389123, + "grad_norm": 0.1213606670498848, + "learning_rate": 
5.584914724957963e-06, + "loss": 0.5733, "step": 465 }, { - "epoch": 0.02, - "learning_rate": 5.51125703564728e-06, - "loss": 0.4564, + "epoch": 0.016938768155115868, + "grad_norm": 0.14439453184604645, + "learning_rate": 5.644967571462888e-06, + "loss": 0.636, "step": 470 }, { - "epoch": 0.02, - "learning_rate": 5.569887429643528e-06, - "loss": 0.4991, + "epoch": 0.017118967816340504, + "grad_norm": 0.1540927290916443, + "learning_rate": 5.7050204179678125e-06, + "loss": 0.5924, "step": 475 }, { - "epoch": 0.02, - "learning_rate": 5.628517823639775e-06, - "loss": 0.4756, + "epoch": 0.01729916747756514, + "grad_norm": 0.13569463789463043, + "learning_rate": 5.765073264472736e-06, + "loss": 0.5908, "step": 480 }, { - "epoch": 0.02, - "learning_rate": 5.687148217636023e-06, - "loss": 0.474, + "epoch": 0.017479367138789777, + "grad_norm": 0.13307730853557587, + "learning_rate": 5.82512611097766e-06, + "loss": 0.5795, "step": 485 }, { - "epoch": 0.02, - "learning_rate": 5.74577861163227e-06, - "loss": 0.4501, + "epoch": 0.017659566800014417, + "grad_norm": 0.17746399343013763, + "learning_rate": 5.885178957482585e-06, + "loss": 0.5938, "step": 490 }, { - "epoch": 0.02, - "learning_rate": 5.804409005628518e-06, - "loss": 0.4726, + "epoch": 0.017839766461239054, + "grad_norm": 0.11576178669929504, + "learning_rate": 5.94523180398751e-06, + "loss": 0.5914, "step": 495 }, { - "epoch": 0.02, - "learning_rate": 5.863039399624766e-06, - "loss": 0.4845, + "epoch": 0.01801996612246369, + "grad_norm": 0.12623916566371918, + "learning_rate": 6.0052846504924335e-06, + "loss": 0.5679, "step": 500 }, { - "epoch": 0.02, - "eval_loss": 0.45026421546936035, - "eval_runtime": 10.5589, - "eval_samples_per_second": 9.471, - "eval_steps_per_second": 9.471, + "epoch": 0.01801996612246369, + "eval_loss": 0.6039146184921265, + "eval_runtime": 3.5675, + "eval_samples_per_second": 28.031, + "eval_steps_per_second": 7.008, "step": 500 }, { - "epoch": 0.02, - "learning_rate": 5.921669793621013e-06, - "loss": 0.4809, + "epoch": 0.018200165783688327, + "grad_norm": 0.13079528510570526, + "learning_rate": 6.065337496997358e-06, + "loss": 0.5946, "step": 505 }, { - "epoch": 0.02, - "learning_rate": 5.980300187617261e-06, - "loss": 0.4719, + "epoch": 0.018380365444912963, + "grad_norm": 0.1083323135972023, + "learning_rate": 6.125390343502282e-06, + "loss": 0.5892, "step": 510 }, { - "epoch": 0.02, - "learning_rate": 6.038930581613509e-06, - "loss": 0.4421, + "epoch": 0.0185605651061376, + "grad_norm": 0.16727234423160553, + "learning_rate": 6.185443190007207e-06, + "loss": 0.5907, "step": 515 }, { - "epoch": 0.02, - "learning_rate": 6.0975609756097564e-06, - "loss": 0.4295, + "epoch": 0.018740764767362236, + "grad_norm": 0.12499808520078659, + "learning_rate": 6.245496036512131e-06, + "loss": 0.6076, "step": 520 }, { - "epoch": 0.02, - "learning_rate": 6.156191369606005e-06, - "loss": 0.4448, + "epoch": 0.018920964428586873, + "grad_norm": 0.16318917274475098, + "learning_rate": 6.3055488830170554e-06, + "loss": 0.5995, "step": 525 }, { - "epoch": 0.02, - "learning_rate": 6.214821763602251e-06, - "loss": 0.4497, + "epoch": 0.019101164089811513, + "grad_norm": 0.12780463695526123, + "learning_rate": 6.365601729521979e-06, + "loss": 0.5618, "step": 530 }, { - "epoch": 0.02, - "learning_rate": 6.2734521575985e-06, - "loss": 0.4438, + "epoch": 0.01928136375103615, + "grad_norm": 0.11932645738124847, + "learning_rate": 6.425654576026904e-06, + "loss": 0.5856, "step": 535 }, { - "epoch": 0.02, - "learning_rate": 
6.3320825515947465e-06, - "loss": 0.4549, + "epoch": 0.019461563412260786, + "grad_norm": 0.1256912350654602, + "learning_rate": 6.485707422531828e-06, + "loss": 0.5552, "step": 540 }, { - "epoch": 0.02, - "learning_rate": 6.390712945590995e-06, - "loss": 0.4767, + "epoch": 0.019641763073485422, + "grad_norm": 0.09648074209690094, + "learning_rate": 6.545760269036752e-06, + "loss": 0.5326, "step": 545 }, { - "epoch": 0.02, - "learning_rate": 6.449343339587242e-06, - "loss": 0.4291, + "epoch": 0.01982196273471006, + "grad_norm": 0.14793181419372559, + "learning_rate": 6.605813115541677e-06, + "loss": 0.5664, "step": 550 }, { - "epoch": 0.02, - "learning_rate": 6.50797373358349e-06, - "loss": 0.4421, + "epoch": 0.020002162395934695, + "grad_norm": 0.12733140587806702, + "learning_rate": 6.665865962046601e-06, + "loss": 0.571, "step": 555 }, { - "epoch": 0.02, - "learning_rate": 6.566604127579738e-06, - "loss": 0.4297, + "epoch": 0.02018236205715933, + "grad_norm": 0.12296765297651291, + "learning_rate": 6.725918808551526e-06, + "loss": 0.5242, "step": 560 }, { - "epoch": 0.02, - "learning_rate": 6.625234521575985e-06, - "loss": 0.4256, + "epoch": 0.020362561718383968, + "grad_norm": 0.13922452926635742, + "learning_rate": 6.78597165505645e-06, + "loss": 0.5367, "step": 565 }, { - "epoch": 0.02, - "learning_rate": 6.683864915572233e-06, - "loss": 0.4481, + "epoch": 0.020542761379608605, + "grad_norm": 0.14491942524909973, + "learning_rate": 6.846024501561374e-06, + "loss": 0.5708, "step": 570 }, { - "epoch": 0.02, - "learning_rate": 6.74249530956848e-06, - "loss": 0.4512, + "epoch": 0.020722961040833245, + "grad_norm": 0.1201525330543518, + "learning_rate": 6.906077348066299e-06, + "loss": 0.6137, "step": 575 }, { - "epoch": 0.02, - "learning_rate": 6.8011257035647285e-06, - "loss": 0.4377, + "epoch": 0.02090316070205788, + "grad_norm": 0.11606668680906296, + "learning_rate": 6.966130194571223e-06, + "loss": 0.5537, "step": 580 }, { - "epoch": 0.02, - "learning_rate": 6.859756097560977e-06, - "loss": 0.4654, + "epoch": 0.021083360363282518, + "grad_norm": 0.14227990806102753, + "learning_rate": 7.026183041076147e-06, + "loss": 0.5886, "step": 585 }, { - "epoch": 0.02, - "learning_rate": 6.918386491557223e-06, - "loss": 0.4026, + "epoch": 0.021263560024507154, + "grad_norm": 0.13029618561267853, + "learning_rate": 7.086235887581072e-06, + "loss": 0.5999, "step": 590 }, { - "epoch": 0.02, - "learning_rate": 6.977016885553471e-06, - "loss": 0.4547, + "epoch": 0.02144375968573179, + "grad_norm": 0.11412752419710159, + "learning_rate": 7.1462887340859955e-06, + "loss": 0.5678, "step": 595 }, { - "epoch": 0.02, - "learning_rate": 7.0356472795497186e-06, - "loss": 0.4349, + "epoch": 0.021623959346956427, + "grad_norm": 0.12800736725330353, + "learning_rate": 7.206341580590921e-06, + "loss": 0.6086, "step": 600 }, { - "epoch": 0.02, - "learning_rate": 7.094277673545967e-06, - "loss": 0.4776, + "epoch": 0.021804159008181064, + "grad_norm": 0.14154621958732605, + "learning_rate": 7.266394427095845e-06, + "loss": 0.539, "step": 605 }, { - "epoch": 0.02, - "learning_rate": 7.152908067542215e-06, - "loss": 0.4121, + "epoch": 0.0219843586694057, + "grad_norm": 0.10308407992124557, + "learning_rate": 7.326447273600769e-06, + "loss": 0.5246, "step": 610 }, { - "epoch": 0.02, - "learning_rate": 7.211538461538461e-06, - "loss": 0.4229, + "epoch": 0.02216455833063034, + "grad_norm": 0.15192610025405884, + "learning_rate": 7.3865001201056936e-06, + "loss": 0.5498, "step": 615 }, { - "epoch": 0.02, - 
"learning_rate": 7.2701688555347095e-06, - "loss": 0.445, + "epoch": 0.022344757991854976, + "grad_norm": 0.15246666967868805, + "learning_rate": 7.446552966610617e-06, + "loss": 0.5671, "step": 620 }, { - "epoch": 0.02, - "learning_rate": 7.328799249530957e-06, - "loss": 0.4528, + "epoch": 0.022524957653079613, + "grad_norm": 0.14070971310138702, + "learning_rate": 7.506605813115541e-06, + "loss": 0.5181, "step": 625 }, { - "epoch": 0.02, - "learning_rate": 7.387429643527205e-06, - "loss": 0.4382, + "epoch": 0.02270515731430425, + "grad_norm": 0.1048644632101059, + "learning_rate": 7.566658659620467e-06, + "loss": 0.5661, "step": 630 }, { - "epoch": 0.02, - "learning_rate": 7.446060037523453e-06, - "loss": 0.4237, + "epoch": 0.022885356975528886, + "grad_norm": 0.12176632136106491, + "learning_rate": 7.626711506125391e-06, + "loss": 0.6039, "step": 635 }, { - "epoch": 0.02, - "learning_rate": 7.5046904315197005e-06, - "loss": 0.4303, + "epoch": 0.023065556636753522, + "grad_norm": 0.11277682334184647, + "learning_rate": 7.686764352630316e-06, + "loss": 0.5394, "step": 640 }, { - "epoch": 0.02, - "learning_rate": 7.563320825515948e-06, - "loss": 0.4365, + "epoch": 0.02324575629797816, + "grad_norm": 0.12266416847705841, + "learning_rate": 7.74681719913524e-06, + "loss": 0.5814, "step": 645 }, { - "epoch": 0.02, - "learning_rate": 7.621951219512195e-06, - "loss": 0.4397, + "epoch": 0.023425955959202795, + "grad_norm": 0.1094772145152092, + "learning_rate": 7.806870045640164e-06, + "loss": 0.5854, "step": 650 }, { - "epoch": 0.02, - "learning_rate": 7.680581613508442e-06, - "loss": 0.4157, + "epoch": 0.023606155620427435, + "grad_norm": 0.16953280568122864, + "learning_rate": 7.866922892145088e-06, + "loss": 0.5464, "step": 655 }, { - "epoch": 0.02, - "learning_rate": 7.739212007504691e-06, - "loss": 0.4351, + "epoch": 0.023786355281652072, + "grad_norm": 0.16109640896320343, + "learning_rate": 7.926975738650012e-06, + "loss": 0.5889, "step": 660 }, { - "epoch": 0.02, - "learning_rate": 7.797842401500939e-06, - "loss": 0.4226, + "epoch": 0.02396655494287671, + "grad_norm": 0.1488220989704132, + "learning_rate": 7.987028585154936e-06, + "loss": 0.57, "step": 665 }, { - "epoch": 0.02, - "learning_rate": 7.856472795497186e-06, - "loss": 0.4086, + "epoch": 0.024146754604101345, + "grad_norm": 0.15228040516376495, + "learning_rate": 8.047081431659861e-06, + "loss": 0.5437, "step": 670 }, { - "epoch": 0.02, - "learning_rate": 7.915103189493433e-06, - "loss": 0.4167, + "epoch": 0.02432695426532598, + "grad_norm": 0.11605563014745712, + "learning_rate": 8.107134278164785e-06, + "loss": 0.5315, "step": 675 }, { - "epoch": 0.02, - "learning_rate": 7.97373358348968e-06, - "loss": 0.436, + "epoch": 0.024507153926550618, + "grad_norm": 0.1317128986120224, + "learning_rate": 8.16718712466971e-06, + "loss": 0.5731, "step": 680 }, { - "epoch": 0.02, - "learning_rate": 8.03236397748593e-06, - "loss": 0.4404, + "epoch": 0.024687353587775254, + "grad_norm": 0.14042159914970398, + "learning_rate": 8.227239971174635e-06, + "loss": 0.5542, "step": 685 }, { - "epoch": 0.02, - "learning_rate": 8.090994371482177e-06, - "loss": 0.4268, + "epoch": 0.02486755324899989, + "grad_norm": 0.16029533743858337, + "learning_rate": 8.287292817679558e-06, + "loss": 0.5782, "step": 690 }, { - "epoch": 0.02, - "learning_rate": 8.149624765478424e-06, - "loss": 0.4325, + "epoch": 0.025047752910224527, + "grad_norm": 0.1571711301803589, + "learning_rate": 8.347345664184484e-06, + "loss": 0.5533, "step": 695 }, { - "epoch": 0.02, - 
"learning_rate": 8.208255159474672e-06, - "loss": 0.41, + "epoch": 0.025227952571449167, + "grad_norm": 0.13758143782615662, + "learning_rate": 8.407398510689408e-06, + "loss": 0.5316, "step": 700 }, { - "epoch": 0.02, - "learning_rate": 8.266885553470919e-06, - "loss": 0.4027, + "epoch": 0.025408152232673804, + "grad_norm": 0.11138833314180374, + "learning_rate": 8.467451357194332e-06, + "loss": 0.5075, "step": 705 }, { - "epoch": 0.02, - "learning_rate": 8.325515947467168e-06, - "loss": 0.4532, + "epoch": 0.02558835189389844, + "grad_norm": 0.14913310110569, + "learning_rate": 8.527504203699256e-06, + "loss": 0.5363, "step": 710 }, { - "epoch": 0.03, - "learning_rate": 8.384146341463415e-06, - "loss": 0.4342, + "epoch": 0.025768551555123077, + "grad_norm": 0.11564284563064575, + "learning_rate": 8.58755705020418e-06, + "loss": 0.5616, "step": 715 }, { - "epoch": 0.03, - "learning_rate": 8.442776735459663e-06, - "loss": 0.4072, + "epoch": 0.025948751216347713, + "grad_norm": 0.15782320499420166, + "learning_rate": 8.647609896709103e-06, + "loss": 0.5368, "step": 720 }, { - "epoch": 0.03, - "learning_rate": 8.50140712945591e-06, - "loss": 0.3962, + "epoch": 0.02612895087757235, + "grad_norm": 0.16397272050380707, + "learning_rate": 8.707662743214029e-06, + "loss": 0.6325, "step": 725 }, { - "epoch": 0.03, - "learning_rate": 8.560037523452157e-06, - "loss": 0.4224, + "epoch": 0.026309150538796986, + "grad_norm": 0.16753605008125305, + "learning_rate": 8.767715589718953e-06, + "loss": 0.5532, "step": 730 }, { - "epoch": 0.03, - "learning_rate": 8.618667917448406e-06, - "loss": 0.443, + "epoch": 0.026489350200021623, + "grad_norm": 0.14978809654712677, + "learning_rate": 8.827768436223878e-06, + "loss": 0.5431, "step": 735 }, { - "epoch": 0.03, - "learning_rate": 8.677298311444654e-06, - "loss": 0.4262, + "epoch": 0.026669549861246263, + "grad_norm": 0.15160630643367767, + "learning_rate": 8.887821282728802e-06, + "loss": 0.576, "step": 740 }, { - "epoch": 0.03, - "learning_rate": 8.735928705440901e-06, - "loss": 0.4276, + "epoch": 0.0268497495224709, + "grad_norm": 0.1296982318162918, + "learning_rate": 8.947874129233726e-06, + "loss": 0.5488, "step": 745 }, { - "epoch": 0.03, - "learning_rate": 8.794559099437148e-06, - "loss": 0.4333, + "epoch": 0.027029949183695536, + "grad_norm": 0.19447606801986694, + "learning_rate": 9.007926975738652e-06, + "loss": 0.5878, "step": 750 }, { - "epoch": 0.03, - "learning_rate": 8.853189493433396e-06, - "loss": 0.4199, + "epoch": 0.027210148844920172, + "grad_norm": 0.13086961209774017, + "learning_rate": 9.067979822243575e-06, + "loss": 0.5713, "step": 755 }, { - "epoch": 0.03, - "learning_rate": 8.911819887429645e-06, - "loss": 0.4505, + "epoch": 0.02739034850614481, + "grad_norm": 0.1761922836303711, + "learning_rate": 9.1280326687485e-06, + "loss": 0.5605, "step": 760 }, { - "epoch": 0.03, - "learning_rate": 8.970450281425892e-06, - "loss": 0.4286, + "epoch": 0.027570548167369445, + "grad_norm": 0.13089734315872192, + "learning_rate": 9.188085515253423e-06, + "loss": 0.5187, "step": 765 }, { - "epoch": 0.03, - "learning_rate": 9.02908067542214e-06, - "loss": 0.4126, + "epoch": 0.02775074782859408, + "grad_norm": 0.14569701254367828, + "learning_rate": 9.248138361758347e-06, + "loss": 0.5301, "step": 770 }, { - "epoch": 0.03, - "learning_rate": 9.087711069418386e-06, - "loss": 0.3909, + "epoch": 0.027930947489818718, + "grad_norm": 0.16195152699947357, + "learning_rate": 9.308191208263273e-06, + "loss": 0.5554, "step": 775 }, { - "epoch": 0.03, - 
"learning_rate": 9.146341463414634e-06, - "loss": 0.4132, + "epoch": 0.028111147151043354, + "grad_norm": 0.13685005903244019, + "learning_rate": 9.368244054768196e-06, + "loss": 0.5719, "step": 780 }, { - "epoch": 0.03, - "learning_rate": 9.204971857410883e-06, - "loss": 0.3725, + "epoch": 0.028291346812267994, + "grad_norm": 0.1750328093767166, + "learning_rate": 9.42829690127312e-06, + "loss": 0.5635, "step": 785 }, { - "epoch": 0.03, - "learning_rate": 9.26360225140713e-06, - "loss": 0.3996, + "epoch": 0.02847154647349263, + "grad_norm": 0.16423344612121582, + "learning_rate": 9.488349747778046e-06, + "loss": 0.6248, "step": 790 }, { - "epoch": 0.03, - "learning_rate": 9.322232645403377e-06, - "loss": 0.434, + "epoch": 0.028651746134717267, + "grad_norm": 0.15460895001888275, + "learning_rate": 9.54840259428297e-06, + "loss": 0.5727, "step": 795 }, { - "epoch": 0.03, - "learning_rate": 9.380863039399625e-06, - "loss": 0.432, + "epoch": 0.028831945795941904, + "grad_norm": 0.15125848352909088, + "learning_rate": 9.608455440787894e-06, + "loss": 0.5535, "step": 800 }, { - "epoch": 0.03, - "learning_rate": 9.439493433395872e-06, - "loss": 0.407, + "epoch": 0.02901214545716654, + "grad_norm": 0.18755701184272766, + "learning_rate": 9.66850828729282e-06, + "loss": 0.5292, "step": 805 }, { - "epoch": 0.03, - "learning_rate": 9.498123827392121e-06, - "loss": 0.3913, + "epoch": 0.029192345118391177, + "grad_norm": 0.14782796800136566, + "learning_rate": 9.728561133797743e-06, + "loss": 0.5155, "step": 810 }, { - "epoch": 0.03, - "learning_rate": 9.556754221388368e-06, - "loss": 0.4433, + "epoch": 0.029372544779615813, + "grad_norm": 0.1731768250465393, + "learning_rate": 9.788613980302667e-06, + "loss": 0.5752, "step": 815 }, { - "epoch": 0.03, - "learning_rate": 9.615384615384616e-06, - "loss": 0.4153, + "epoch": 0.02955274444084045, + "grad_norm": 0.17426101863384247, + "learning_rate": 9.84866682680759e-06, + "loss": 0.5796, "step": 820 }, { - "epoch": 0.03, - "learning_rate": 9.674015009380863e-06, - "loss": 0.4182, + "epoch": 0.02973294410206509, + "grad_norm": 0.1710687130689621, + "learning_rate": 9.908719673312515e-06, + "loss": 0.5363, "step": 825 }, { - "epoch": 0.03, - "learning_rate": 9.73264540337711e-06, - "loss": 0.4045, + "epoch": 0.029913143763289726, + "grad_norm": 0.12665097415447235, + "learning_rate": 9.96877251981744e-06, + "loss": 0.5302, "step": 830 }, { - "epoch": 0.03, - "learning_rate": 9.79127579737336e-06, - "loss": 0.4041, + "epoch": 0.030093343424514363, + "grad_norm": 0.20195025205612183, + "learning_rate": 1.0028825366322364e-05, + "loss": 0.6092, "step": 835 }, { - "epoch": 0.03, - "learning_rate": 9.849906191369607e-06, - "loss": 0.3941, + "epoch": 0.030273543085739, + "grad_norm": 0.18237920105457306, + "learning_rate": 1.0088878212827288e-05, + "loss": 0.5281, "step": 840 }, { - "epoch": 0.03, - "learning_rate": 9.908536585365854e-06, - "loss": 0.3963, + "epoch": 0.030453742746963636, + "grad_norm": 0.17698000371456146, + "learning_rate": 1.0148931059332214e-05, + "loss": 0.5705, "step": 845 }, { - "epoch": 0.03, - "learning_rate": 9.967166979362101e-06, - "loss": 0.432, + "epoch": 0.030633942408188272, + "grad_norm": 0.16873596608638763, + "learning_rate": 1.0208983905837137e-05, + "loss": 0.5534, "step": 850 }, { - "epoch": 0.03, - "learning_rate": 1.0025797373358349e-05, - "loss": 0.4181, + "epoch": 0.03081414206941291, + "grad_norm": 0.18587124347686768, + "learning_rate": 1.0269036752342063e-05, + "loss": 0.5482, "step": 855 }, { - "epoch": 0.03, - 
"learning_rate": 1.0084427767354598e-05, - "loss": 0.4433, + "epoch": 0.030994341730637545, + "grad_norm": 0.15320129692554474, + "learning_rate": 1.0329089598846985e-05, + "loss": 0.584, "step": 860 }, { - "epoch": 0.03, - "learning_rate": 1.0143058161350845e-05, - "loss": 0.401, + "epoch": 0.03117454139186218, + "grad_norm": 0.15630275011062622, + "learning_rate": 1.0389142445351909e-05, + "loss": 0.5771, "step": 865 }, { - "epoch": 0.03, - "learning_rate": 1.0201688555347092e-05, - "loss": 0.3956, + "epoch": 0.03135474105308682, + "grad_norm": 0.20800472795963287, + "learning_rate": 1.0449195291856835e-05, + "loss": 0.5642, "step": 870 }, { - "epoch": 0.03, - "learning_rate": 1.0260318949343341e-05, - "loss": 0.4052, + "epoch": 0.031534940714311455, + "grad_norm": 0.12712302803993225, + "learning_rate": 1.0509248138361758e-05, + "loss": 0.5427, "step": 875 }, { - "epoch": 0.03, - "learning_rate": 1.0318949343339587e-05, - "loss": 0.4146, + "epoch": 0.03171514037553609, + "grad_norm": 0.1484389752149582, + "learning_rate": 1.0569300984866682e-05, + "loss": 0.5385, "step": 880 }, { - "epoch": 0.03, - "learning_rate": 1.0377579737335836e-05, - "loss": 0.4049, + "epoch": 0.03189534003676073, + "grad_norm": 0.12968121469020844, + "learning_rate": 1.0629353831371608e-05, + "loss": 0.5315, "step": 885 }, { - "epoch": 0.03, - "learning_rate": 1.0436210131332083e-05, - "loss": 0.4177, + "epoch": 0.03207553969798537, + "grad_norm": 0.14982539415359497, + "learning_rate": 1.0689406677876532e-05, + "loss": 0.5571, "step": 890 }, { - "epoch": 0.03, - "learning_rate": 1.049484052532833e-05, - "loss": 0.4537, + "epoch": 0.03225573935921001, + "grad_norm": 0.16306400299072266, + "learning_rate": 1.0749459524381456e-05, + "loss": 0.5379, "step": 895 }, { - "epoch": 0.03, - "learning_rate": 1.055347091932458e-05, - "loss": 0.4175, + "epoch": 0.032435939020434644, + "grad_norm": 0.19171880185604095, + "learning_rate": 1.0809512370886381e-05, + "loss": 0.5652, "step": 900 }, { - "epoch": 0.03, - "learning_rate": 1.0612101313320825e-05, - "loss": 0.3695, + "epoch": 0.03261613868165928, + "grad_norm": 0.1954376995563507, + "learning_rate": 1.0869565217391305e-05, + "loss": 0.5635, "step": 905 }, { - "epoch": 0.03, - "learning_rate": 1.0670731707317074e-05, - "loss": 0.4037, + "epoch": 0.03279633834288392, + "grad_norm": 0.1810484379529953, + "learning_rate": 1.0929618063896229e-05, + "loss": 0.5046, "step": 910 }, { - "epoch": 0.03, - "learning_rate": 1.0729362101313321e-05, - "loss": 0.4141, + "epoch": 0.03297653800410855, + "grad_norm": 0.18302740156650543, + "learning_rate": 1.0989670910401153e-05, + "loss": 0.5501, "step": 915 }, { - "epoch": 0.03, - "learning_rate": 1.0787992495309569e-05, - "loss": 0.4023, + "epoch": 0.03315673766533319, + "grad_norm": 0.1687111258506775, + "learning_rate": 1.1049723756906077e-05, + "loss": 0.5069, "step": 920 }, { - "epoch": 0.03, - "learning_rate": 1.0846622889305818e-05, - "loss": 0.3647, + "epoch": 0.033336937326557826, + "grad_norm": 0.16762186586856842, + "learning_rate": 1.1109776603411002e-05, + "loss": 0.5167, "step": 925 }, { - "epoch": 0.03, - "learning_rate": 1.0905253283302063e-05, - "loss": 0.4172, + "epoch": 0.03351713698778246, + "grad_norm": 0.22583533823490143, + "learning_rate": 1.1169829449915926e-05, + "loss": 0.5251, "step": 930 }, { - "epoch": 0.03, - "learning_rate": 1.0963883677298312e-05, - "loss": 0.3895, + "epoch": 0.0336973366490071, + "grad_norm": 0.19438394904136658, + "learning_rate": 1.122988229642085e-05, + "loss": 0.6164, "step": 935 
}, { - "epoch": 0.03, - "learning_rate": 1.102251407129456e-05, - "loss": 0.3876, + "epoch": 0.033877536310231736, + "grad_norm": 0.1634751260280609, + "learning_rate": 1.1289935142925776e-05, + "loss": 0.5606, "step": 940 }, { - "epoch": 0.03, - "learning_rate": 1.1081144465290807e-05, - "loss": 0.3759, + "epoch": 0.03405773597145637, + "grad_norm": 0.1744074523448944, + "learning_rate": 1.13499879894307e-05, + "loss": 0.5853, "step": 945 }, { - "epoch": 0.03, - "learning_rate": 1.1139774859287056e-05, - "loss": 0.4043, + "epoch": 0.03423793563268101, + "grad_norm": 0.18406233191490173, + "learning_rate": 1.1410040835935625e-05, + "loss": 0.5132, "step": 950 }, { - "epoch": 0.03, - "learning_rate": 1.1198405253283302e-05, - "loss": 0.3948, + "epoch": 0.034418135293905645, + "grad_norm": 0.14555774629116058, + "learning_rate": 1.1470093682440549e-05, + "loss": 0.546, "step": 955 }, { - "epoch": 0.03, - "learning_rate": 1.125703564727955e-05, - "loss": 0.4055, + "epoch": 0.03459833495513028, + "grad_norm": 0.14365147054195404, + "learning_rate": 1.1530146528945473e-05, + "loss": 0.5515, "step": 960 }, { - "epoch": 0.03, - "learning_rate": 1.1315666041275798e-05, - "loss": 0.4142, + "epoch": 0.03477853461635492, + "grad_norm": 0.19130977988243103, + "learning_rate": 1.1590199375450397e-05, + "loss": 0.5576, "step": 965 }, { - "epoch": 0.03, - "learning_rate": 1.1374296435272045e-05, - "loss": 0.4328, + "epoch": 0.034958734277579555, + "grad_norm": 0.1700170785188675, + "learning_rate": 1.165025222195532e-05, + "loss": 0.5344, "step": 970 }, { - "epoch": 0.03, - "learning_rate": 1.1432926829268294e-05, - "loss": 0.4047, + "epoch": 0.0351389339388042, + "grad_norm": 0.14722155034542084, + "learning_rate": 1.1710305068460244e-05, + "loss": 0.5644, "step": 975 }, { - "epoch": 0.03, - "learning_rate": 1.149155722326454e-05, - "loss": 0.408, + "epoch": 0.035319133600028835, + "grad_norm": 0.1315588653087616, + "learning_rate": 1.177035791496517e-05, + "loss": 0.5136, "step": 980 }, { - "epoch": 0.03, - "learning_rate": 1.1550187617260787e-05, - "loss": 0.405, + "epoch": 0.03549933326125347, + "grad_norm": 0.22789514064788818, + "learning_rate": 1.1830410761470094e-05, + "loss": 0.5709, "step": 985 }, { - "epoch": 0.03, - "learning_rate": 1.1608818011257036e-05, - "loss": 0.4044, + "epoch": 0.03567953292247811, + "grad_norm": 0.19900259375572205, + "learning_rate": 1.189046360797502e-05, + "loss": 0.5815, "step": 990 }, { - "epoch": 0.04, - "learning_rate": 1.1667448405253284e-05, - "loss": 0.4023, + "epoch": 0.035859732583702744, + "grad_norm": 0.20573367178440094, + "learning_rate": 1.1950516454479943e-05, + "loss": 0.5188, "step": 995 }, { - "epoch": 0.04, - "learning_rate": 1.1726078799249533e-05, - "loss": 0.3951, + "epoch": 0.03603993224492738, + "grad_norm": 0.17396429181098938, + "learning_rate": 1.2010569300984867e-05, + "loss": 0.5032, "step": 1000 }, { - "epoch": 0.04, - "eval_loss": 0.3985450863838196, - "eval_runtime": 10.5677, - "eval_samples_per_second": 9.463, - "eval_steps_per_second": 9.463, + "epoch": 0.03603993224492738, + "eval_loss": 0.5565588474273682, + "eval_runtime": 3.5193, + "eval_samples_per_second": 28.415, + "eval_steps_per_second": 7.104, "step": 1000 }, { - "epoch": 0.04, - "learning_rate": 1.178470919324578e-05, - "loss": 0.3828, + "epoch": 0.03622013190615202, + "grad_norm": 0.1656351536512375, + "learning_rate": 1.2070622147489793e-05, + "loss": 0.5045, "step": 1005 }, { - "epoch": 0.04, - "learning_rate": 1.1843339587242026e-05, - "loss": 0.3952, + "epoch": 
0.036400331567376654, + "grad_norm": 0.15851475298404694, + "learning_rate": 1.2130674993994717e-05, + "loss": 0.5142, "step": 1010 }, { - "epoch": 0.04, - "learning_rate": 1.1901969981238275e-05, - "loss": 0.4074, + "epoch": 0.03658053122860129, + "grad_norm": 0.18795153498649597, + "learning_rate": 1.219072784049964e-05, + "loss": 0.5408, "step": 1015 }, { - "epoch": 0.04, - "learning_rate": 1.1960600375234522e-05, - "loss": 0.3446, + "epoch": 0.03676073088982593, + "grad_norm": 0.18724803626537323, + "learning_rate": 1.2250780687004564e-05, + "loss": 0.5132, "step": 1020 }, { - "epoch": 0.04, - "learning_rate": 1.2019230769230771e-05, - "loss": 0.3972, + "epoch": 0.03694093055105056, + "grad_norm": 0.17133454978466034, + "learning_rate": 1.2310833533509488e-05, + "loss": 0.5231, "step": 1025 }, { - "epoch": 0.04, - "learning_rate": 1.2077861163227018e-05, - "loss": 0.3501, + "epoch": 0.0371211302122752, + "grad_norm": 0.19401347637176514, + "learning_rate": 1.2370886380014414e-05, + "loss": 0.5672, "step": 1030 }, { - "epoch": 0.04, - "learning_rate": 1.2136491557223264e-05, - "loss": 0.4003, + "epoch": 0.037301329873499836, + "grad_norm": 0.23305334150791168, + "learning_rate": 1.2430939226519338e-05, + "loss": 0.5496, "step": 1035 }, { - "epoch": 0.04, - "learning_rate": 1.2195121951219513e-05, - "loss": 0.3998, + "epoch": 0.03748152953472447, + "grad_norm": 0.1804538369178772, + "learning_rate": 1.2490992073024261e-05, + "loss": 0.5241, "step": 1040 }, { - "epoch": 0.04, - "learning_rate": 1.225375234521576e-05, - "loss": 0.3966, + "epoch": 0.03766172919594911, + "grad_norm": 0.16487692296504974, + "learning_rate": 1.2551044919529187e-05, + "loss": 0.537, "step": 1045 }, { - "epoch": 0.04, - "learning_rate": 1.231238273921201e-05, - "loss": 0.4103, + "epoch": 0.037841928857173746, + "grad_norm": 0.17555435001850128, + "learning_rate": 1.2611097766034111e-05, + "loss": 0.5381, "step": 1050 }, { - "epoch": 0.04, - "learning_rate": 1.2371013133208257e-05, - "loss": 0.4126, + "epoch": 0.03802212851839838, + "grad_norm": 0.19943825900554657, + "learning_rate": 1.2671150612539035e-05, + "loss": 0.5353, "step": 1055 }, { - "epoch": 0.04, - "learning_rate": 1.2429643527204502e-05, - "loss": 0.4029, + "epoch": 0.038202328179623025, + "grad_norm": 0.19467459619045258, + "learning_rate": 1.2731203459043959e-05, + "loss": 0.5634, "step": 1060 }, { - "epoch": 0.04, - "learning_rate": 1.2488273921200751e-05, - "loss": 0.4234, + "epoch": 0.03838252784084766, + "grad_norm": 0.1945466697216034, + "learning_rate": 1.2791256305548884e-05, + "loss": 0.5709, "step": 1065 }, { - "epoch": 0.04, - "learning_rate": 1.2546904315197e-05, - "loss": 0.4048, + "epoch": 0.0385627275020723, + "grad_norm": 0.17574776709079742, + "learning_rate": 1.2851309152053808e-05, + "loss": 0.496, "step": 1070 }, { - "epoch": 0.04, - "learning_rate": 1.2605534709193246e-05, - "loss": 0.4306, + "epoch": 0.038742927163296935, + "grad_norm": 0.19129513204097748, + "learning_rate": 1.2911361998558732e-05, + "loss": 0.4818, "step": 1075 }, { - "epoch": 0.04, - "learning_rate": 1.2664165103189493e-05, - "loss": 0.4036, + "epoch": 0.03892312682452157, + "grad_norm": 0.16488486528396606, + "learning_rate": 1.2971414845063656e-05, + "loss": 0.5433, "step": 1080 }, { - "epoch": 0.04, - "learning_rate": 1.272279549718574e-05, - "loss": 0.3782, + "epoch": 0.03910332648574621, + "grad_norm": 0.1613866090774536, + "learning_rate": 1.303146769156858e-05, + "loss": 0.5058, "step": 1085 }, { - "epoch": 0.04, - "learning_rate": 
1.278142589118199e-05, - "loss": 0.408, + "epoch": 0.039283526146970844, + "grad_norm": 0.21093931794166565, + "learning_rate": 1.3091520538073504e-05, + "loss": 0.5199, "step": 1090 }, { - "epoch": 0.04, - "learning_rate": 1.2840056285178237e-05, - "loss": 0.4104, + "epoch": 0.03946372580819548, + "grad_norm": 0.1792885661125183, + "learning_rate": 1.315157338457843e-05, + "loss": 0.5298, "step": 1095 }, { - "epoch": 0.04, - "learning_rate": 1.2898686679174484e-05, - "loss": 0.3797, + "epoch": 0.03964392546942012, + "grad_norm": 0.20177467167377472, + "learning_rate": 1.3211626231083355e-05, + "loss": 0.5183, "step": 1100 }, { - "epoch": 0.04, - "learning_rate": 1.2957317073170733e-05, - "loss": 0.4013, + "epoch": 0.039824125130644754, + "grad_norm": 0.1867968887090683, + "learning_rate": 1.3271679077588279e-05, + "loss": 0.5268, "step": 1105 }, { - "epoch": 0.04, - "learning_rate": 1.301594746716698e-05, - "loss": 0.401, + "epoch": 0.04000432479186939, + "grad_norm": 0.17943917214870453, + "learning_rate": 1.3331731924093202e-05, + "loss": 0.555, "step": 1110 }, { - "epoch": 0.04, - "learning_rate": 1.307457786116323e-05, - "loss": 0.4132, + "epoch": 0.04018452445309403, + "grad_norm": 0.24387885630130768, + "learning_rate": 1.3391784770598126e-05, + "loss": 0.5367, "step": 1115 }, { - "epoch": 0.04, - "learning_rate": 1.3133208255159477e-05, - "loss": 0.3838, + "epoch": 0.04036472411431866, + "grad_norm": 0.27169400453567505, + "learning_rate": 1.3451837617103052e-05, + "loss": 0.5293, "step": 1120 }, { - "epoch": 0.04, - "learning_rate": 1.3191838649155722e-05, - "loss": 0.4303, + "epoch": 0.0405449237755433, + "grad_norm": 0.22766202688217163, + "learning_rate": 1.3511890463607976e-05, + "loss": 0.5438, "step": 1125 }, { - "epoch": 0.04, - "learning_rate": 1.325046904315197e-05, - "loss": 0.4177, + "epoch": 0.040725123436767936, + "grad_norm": 0.17502576112747192, + "learning_rate": 1.35719433101129e-05, + "loss": 0.482, "step": 1130 }, { - "epoch": 0.04, - "learning_rate": 1.3309099437148217e-05, - "loss": 0.3646, + "epoch": 0.04090532309799257, + "grad_norm": 0.1690770983695984, + "learning_rate": 1.3631996156617823e-05, + "loss": 0.5331, "step": 1135 }, { - "epoch": 0.04, - "learning_rate": 1.3367729831144466e-05, - "loss": 0.3996, + "epoch": 0.04108552275921721, + "grad_norm": 0.18362893164157867, + "learning_rate": 1.3692049003122747e-05, + "loss": 0.4705, "step": 1140 }, { - "epoch": 0.04, - "learning_rate": 1.3426360225140713e-05, - "loss": 0.3913, + "epoch": 0.04126572242044185, + "grad_norm": 0.2055690884590149, + "learning_rate": 1.3752101849627675e-05, + "loss": 0.4958, "step": 1145 }, { - "epoch": 0.04, - "learning_rate": 1.348499061913696e-05, - "loss": 0.3774, + "epoch": 0.04144592208166649, + "grad_norm": 0.20133014023303986, + "learning_rate": 1.3812154696132598e-05, + "loss": 0.5269, "step": 1150 }, { - "epoch": 0.04, - "learning_rate": 1.354362101313321e-05, - "loss": 0.4188, + "epoch": 0.041626121742891126, + "grad_norm": 0.18268916010856628, + "learning_rate": 1.3872207542637522e-05, + "loss": 0.5104, "step": 1155 }, { - "epoch": 0.04, - "learning_rate": 1.3602251407129457e-05, - "loss": 0.3658, + "epoch": 0.04180632140411576, + "grad_norm": 0.22504407167434692, + "learning_rate": 1.3932260389142446e-05, + "loss": 0.5622, "step": 1160 }, { - "epoch": 0.04, - "learning_rate": 1.3660881801125704e-05, - "loss": 0.387, + "epoch": 0.0419865210653404, + "grad_norm": 0.2050849348306656, + "learning_rate": 1.399231323564737e-05, + "loss": 0.5176, "step": 1165 }, { - 
"epoch": 0.04, - "learning_rate": 1.3719512195121953e-05, - "loss": 0.3991, + "epoch": 0.042166720726565035, + "grad_norm": 0.1802973598241806, + "learning_rate": 1.4052366082152294e-05, + "loss": 0.5322, "step": 1170 }, { - "epoch": 0.04, - "learning_rate": 1.37781425891182e-05, - "loss": 0.3824, + "epoch": 0.04234692038778967, + "grad_norm": 0.19005241990089417, + "learning_rate": 1.411241892865722e-05, + "loss": 0.5582, "step": 1175 }, { - "epoch": 0.04, - "learning_rate": 1.3836772983114446e-05, - "loss": 0.4034, + "epoch": 0.04252712004901431, + "grad_norm": 0.2026328444480896, + "learning_rate": 1.4172471775162143e-05, + "loss": 0.5726, "step": 1180 }, { - "epoch": 0.04, - "learning_rate": 1.3895403377110693e-05, - "loss": 0.3582, + "epoch": 0.042707319710238945, + "grad_norm": 0.22983752191066742, + "learning_rate": 1.4232524621667067e-05, + "loss": 0.5469, "step": 1185 }, { - "epoch": 0.04, - "learning_rate": 1.3954033771106943e-05, - "loss": 0.3816, + "epoch": 0.04288751937146358, + "grad_norm": 0.15863169729709625, + "learning_rate": 1.4292577468171991e-05, + "loss": 0.5368, "step": 1190 }, { - "epoch": 0.04, - "learning_rate": 1.401266416510319e-05, - "loss": 0.38, + "epoch": 0.04306771903268822, + "grad_norm": 0.19948942959308624, + "learning_rate": 1.4352630314676915e-05, + "loss": 0.518, "step": 1195 }, { - "epoch": 0.04, - "learning_rate": 1.4071294559099437e-05, - "loss": 0.3887, + "epoch": 0.043247918693912854, + "grad_norm": 0.20229573547840118, + "learning_rate": 1.4412683161181842e-05, + "loss": 0.5379, "step": 1200 }, { - "epoch": 0.04, - "learning_rate": 1.4129924953095686e-05, - "loss": 0.4033, + "epoch": 0.04342811835513749, + "grad_norm": 0.22024467587471008, + "learning_rate": 1.4472736007686766e-05, + "loss": 0.5648, "step": 1205 }, { - "epoch": 0.04, - "learning_rate": 1.4188555347091933e-05, - "loss": 0.3784, + "epoch": 0.04360831801636213, + "grad_norm": 0.22408129274845123, + "learning_rate": 1.453278885419169e-05, + "loss": 0.5228, "step": 1210 }, { - "epoch": 0.04, - "learning_rate": 1.424718574108818e-05, - "loss": 0.3889, + "epoch": 0.043788517677586763, + "grad_norm": 0.21704581379890442, + "learning_rate": 1.4592841700696614e-05, + "loss": 0.5289, "step": 1215 }, { - "epoch": 0.04, - "learning_rate": 1.430581613508443e-05, - "loss": 0.3671, + "epoch": 0.0439687173388114, + "grad_norm": 0.1930111199617386, + "learning_rate": 1.4652894547201538e-05, + "loss": 0.5176, "step": 1220 }, { - "epoch": 0.04, - "learning_rate": 1.4364446529080677e-05, - "loss": 0.3575, + "epoch": 0.044148917000036036, + "grad_norm": 0.23059917986392975, + "learning_rate": 1.4712947393706462e-05, + "loss": 0.5323, "step": 1225 }, { - "epoch": 0.04, - "learning_rate": 1.4423076923076923e-05, - "loss": 0.3871, + "epoch": 0.04432911666126068, + "grad_norm": 0.21172671020030975, + "learning_rate": 1.4773000240211387e-05, + "loss": 0.544, "step": 1230 }, { - "epoch": 0.04, - "learning_rate": 1.448170731707317e-05, - "loss": 0.4034, + "epoch": 0.044509316322485316, + "grad_norm": 0.20656728744506836, + "learning_rate": 1.4833053086716311e-05, + "loss": 0.5019, "step": 1235 }, { - "epoch": 0.04, - "learning_rate": 1.4540337711069419e-05, - "loss": 0.396, + "epoch": 0.04468951598370995, + "grad_norm": 0.19450949132442474, + "learning_rate": 1.4893105933221235e-05, + "loss": 0.5262, "step": 1240 }, { - "epoch": 0.04, - "learning_rate": 1.4598968105065666e-05, - "loss": 0.3741, + "epoch": 0.04486971564493459, + "grad_norm": 0.18495753407478333, + "learning_rate": 1.4953158779726159e-05, + 
"loss": 0.4986, "step": 1245 }, { - "epoch": 0.04, - "learning_rate": 1.4657598499061914e-05, - "loss": 0.3861, + "epoch": 0.045049915306159226, + "grad_norm": 0.20766963064670563, + "learning_rate": 1.5013211626231083e-05, + "loss": 0.5081, "step": 1250 }, { - "epoch": 0.04, - "learning_rate": 1.4716228893058163e-05, - "loss": 0.4007, + "epoch": 0.04523011496738386, + "grad_norm": 0.18297769129276276, + "learning_rate": 1.507326447273601e-05, + "loss": 0.5523, "step": 1255 }, { - "epoch": 0.04, - "learning_rate": 1.477485928705441e-05, - "loss": 0.367, + "epoch": 0.0454103146286085, + "grad_norm": 0.18521635234355927, + "learning_rate": 1.5133317319240934e-05, + "loss": 0.5279, "step": 1260 }, { - "epoch": 0.04, - "learning_rate": 1.4833489681050657e-05, - "loss": 0.355, + "epoch": 0.045590514289833135, + "grad_norm": 0.22829121351242065, + "learning_rate": 1.5193370165745858e-05, + "loss": 0.5055, "step": 1265 }, { - "epoch": 0.04, - "learning_rate": 1.4892120075046906e-05, - "loss": 0.4029, + "epoch": 0.04577071395105777, + "grad_norm": 0.21849414706230164, + "learning_rate": 1.5253423012250781e-05, + "loss": 0.539, "step": 1270 }, { - "epoch": 0.04, - "learning_rate": 1.4950750469043154e-05, - "loss": 0.3722, + "epoch": 0.04595091361228241, + "grad_norm": 0.21421493589878082, + "learning_rate": 1.5313475858755704e-05, + "loss": 0.5275, "step": 1275 }, { - "epoch": 0.05, - "learning_rate": 1.5009380863039401e-05, - "loss": 0.3901, + "epoch": 0.046131113273507045, + "grad_norm": 0.23160327970981598, + "learning_rate": 1.5373528705260633e-05, + "loss": 0.5409, "step": 1280 }, { - "epoch": 0.05, - "learning_rate": 1.5068011257035647e-05, - "loss": 0.4027, + "epoch": 0.04631131293473168, + "grad_norm": 0.16949445009231567, + "learning_rate": 1.5433581551765555e-05, + "loss": 0.5284, "step": 1285 }, { - "epoch": 0.05, - "learning_rate": 1.5126641651031896e-05, - "loss": 0.3618, + "epoch": 0.04649151259595632, + "grad_norm": 0.18024949729442596, + "learning_rate": 1.549363439827048e-05, + "loss": 0.5375, "step": 1290 }, { - "epoch": 0.05, - "learning_rate": 1.5185272045028143e-05, - "loss": 0.3801, + "epoch": 0.046671712257180954, + "grad_norm": 0.22767312824726105, + "learning_rate": 1.5553687244775402e-05, + "loss": 0.5214, "step": 1295 }, { - "epoch": 0.05, - "learning_rate": 1.524390243902439e-05, - "loss": 0.4104, + "epoch": 0.04685191191840559, + "grad_norm": 0.21435782313346863, + "learning_rate": 1.5613740091280328e-05, + "loss": 0.4854, "step": 1300 }, { - "epoch": 0.05, - "learning_rate": 1.530253283302064e-05, - "loss": 0.4028, + "epoch": 0.04703211157963023, + "grad_norm": 0.22171509265899658, + "learning_rate": 1.567379293778525e-05, + "loss": 0.5017, "step": 1305 }, { - "epoch": 0.05, - "learning_rate": 1.5361163227016885e-05, - "loss": 0.3972, + "epoch": 0.04721231124085487, + "grad_norm": 0.23257885873317719, + "learning_rate": 1.5733845784290176e-05, + "loss": 0.5319, "step": 1310 }, { - "epoch": 0.05, - "learning_rate": 1.5419793621013134e-05, - "loss": 0.4118, + "epoch": 0.04739251090207951, + "grad_norm": 0.24968785047531128, + "learning_rate": 1.57938986307951e-05, + "loss": 0.5511, "step": 1315 }, { - "epoch": 0.05, - "learning_rate": 1.5478424015009383e-05, - "loss": 0.3466, + "epoch": 0.047572710563304144, + "grad_norm": 0.19016103446483612, + "learning_rate": 1.5853951477300024e-05, + "loss": 0.5075, "step": 1320 }, { - "epoch": 0.05, - "learning_rate": 1.5537054409005632e-05, - "loss": 0.4084, + "epoch": 0.04775291022452878, + "grad_norm": 0.19356337189674377, + 
"learning_rate": 1.591400432380495e-05, + "loss": 0.5068, "step": 1325 }, { - "epoch": 0.05, - "learning_rate": 1.5595684803001878e-05, - "loss": 0.3734, + "epoch": 0.04793310988575342, + "grad_norm": 0.28713881969451904, + "learning_rate": 1.597405717030987e-05, + "loss": 0.5403, "step": 1330 }, { - "epoch": 0.05, - "learning_rate": 1.5654315196998123e-05, - "loss": 0.4115, + "epoch": 0.04811330954697805, + "grad_norm": 0.18049970269203186, + "learning_rate": 1.60341100168148e-05, + "loss": 0.5184, "step": 1335 }, { - "epoch": 0.05, - "learning_rate": 1.5712945590994372e-05, - "loss": 0.382, + "epoch": 0.04829350920820269, + "grad_norm": 0.22419008612632751, + "learning_rate": 1.6094162863319722e-05, + "loss": 0.5497, "step": 1340 }, { - "epoch": 0.05, - "learning_rate": 1.5771575984990618e-05, - "loss": 0.361, + "epoch": 0.048473708869427326, + "grad_norm": 0.24275891482830048, + "learning_rate": 1.6154215709824648e-05, + "loss": 0.5375, "step": 1345 }, { - "epoch": 0.05, - "learning_rate": 1.5830206378986867e-05, - "loss": 0.413, + "epoch": 0.04865390853065196, + "grad_norm": 0.21218125522136688, + "learning_rate": 1.621426855632957e-05, + "loss": 0.5069, "step": 1350 }, { - "epoch": 0.05, - "learning_rate": 1.5888836772983116e-05, - "loss": 0.3922, + "epoch": 0.0488341081918766, + "grad_norm": 0.17719610035419464, + "learning_rate": 1.6274321402834496e-05, + "loss": 0.5194, "step": 1355 }, { - "epoch": 0.05, - "learning_rate": 1.594746716697936e-05, - "loss": 0.3477, + "epoch": 0.049014307853101236, + "grad_norm": 0.27654048800468445, + "learning_rate": 1.633437424933942e-05, + "loss": 0.5144, "step": 1360 }, { - "epoch": 0.05, - "learning_rate": 1.600609756097561e-05, - "loss": 0.386, + "epoch": 0.04919450751432587, + "grad_norm": 0.20031145215034485, + "learning_rate": 1.6394427095844343e-05, + "loss": 0.5053, "step": 1365 }, { - "epoch": 0.05, - "learning_rate": 1.606472795497186e-05, - "loss": 0.3448, + "epoch": 0.04937470717555051, + "grad_norm": 0.21770700812339783, + "learning_rate": 1.645447994234927e-05, + "loss": 0.5087, "step": 1370 }, { - "epoch": 0.05, - "learning_rate": 1.6123358348968105e-05, - "loss": 0.3657, + "epoch": 0.049554906836775145, + "grad_norm": 0.26108381152153015, + "learning_rate": 1.651453278885419e-05, + "loss": 0.5614, "step": 1375 }, { - "epoch": 0.05, - "learning_rate": 1.6181988742964354e-05, - "loss": 0.3743, + "epoch": 0.04973510649799978, + "grad_norm": 0.21475346386432648, + "learning_rate": 1.6574585635359117e-05, + "loss": 0.533, "step": 1380 }, { - "epoch": 0.05, - "learning_rate": 1.62406191369606e-05, - "loss": 0.392, + "epoch": 0.04991530615922442, + "grad_norm": 0.2189481258392334, + "learning_rate": 1.663463848186404e-05, + "loss": 0.5106, "step": 1385 }, { - "epoch": 0.05, - "learning_rate": 1.629924953095685e-05, - "loss": 0.3744, + "epoch": 0.050095505820449054, + "grad_norm": 0.22523830831050873, + "learning_rate": 1.6694691328368968e-05, + "loss": 0.5047, "step": 1390 }, { - "epoch": 0.05, - "learning_rate": 1.6357879924953094e-05, - "loss": 0.383, + "epoch": 0.0502757054816737, + "grad_norm": 0.2538076639175415, + "learning_rate": 1.675474417487389e-05, + "loss": 0.4805, "step": 1395 }, { - "epoch": 0.05, - "learning_rate": 1.6416510318949343e-05, - "loss": 0.3739, + "epoch": 0.050455905142898334, + "grad_norm": 0.23015667498111725, + "learning_rate": 1.6814797021378816e-05, + "loss": 0.5236, "step": 1400 }, { - "epoch": 0.05, - "learning_rate": 1.6475140712945592e-05, - "loss": 0.3644, + "epoch": 0.05063610480412297, + "grad_norm": 
0.26876676082611084, + "learning_rate": 1.6874849867883738e-05, + "loss": 0.541, "step": 1405 }, { - "epoch": 0.05, - "learning_rate": 1.6533771106941838e-05, - "loss": 0.3429, + "epoch": 0.05081630446534761, + "grad_norm": 0.25886017084121704, + "learning_rate": 1.6934902714388663e-05, + "loss": 0.5363, "step": 1410 }, { - "epoch": 0.05, - "learning_rate": 1.6592401500938087e-05, - "loss": 0.3876, + "epoch": 0.050996504126572244, + "grad_norm": 0.22038790583610535, + "learning_rate": 1.699495556089359e-05, + "loss": 0.4836, "step": 1415 }, { - "epoch": 0.05, - "learning_rate": 1.6651031894934336e-05, - "loss": 0.3877, + "epoch": 0.05117670378779688, + "grad_norm": 0.22495049238204956, + "learning_rate": 1.705500840739851e-05, + "loss": 0.5435, "step": 1420 }, { - "epoch": 0.05, - "learning_rate": 1.670966228893058e-05, - "loss": 0.4003, + "epoch": 0.05135690344902152, + "grad_norm": 0.22736889123916626, + "learning_rate": 1.7115061253903437e-05, + "loss": 0.4876, "step": 1425 }, { - "epoch": 0.05, - "learning_rate": 1.676829268292683e-05, - "loss": 0.3824, + "epoch": 0.05153710311024615, + "grad_norm": 0.21810127794742584, + "learning_rate": 1.717511410040836e-05, + "loss": 0.5353, "step": 1430 }, { - "epoch": 0.05, - "learning_rate": 1.682692307692308e-05, - "loss": 0.375, + "epoch": 0.05171730277147079, + "grad_norm": 0.19743861258029938, + "learning_rate": 1.7235166946913284e-05, + "loss": 0.5042, "step": 1435 }, { - "epoch": 0.05, - "learning_rate": 1.6885553470919325e-05, - "loss": 0.3732, + "epoch": 0.051897502432695426, + "grad_norm": 0.22972045838832855, + "learning_rate": 1.7295219793418207e-05, + "loss": 0.4953, "step": 1440 }, { - "epoch": 0.05, - "learning_rate": 1.694418386491557e-05, - "loss": 0.3874, + "epoch": 0.05207770209392006, + "grad_norm": 0.19471551477909088, + "learning_rate": 1.7355272639923136e-05, + "loss": 0.5055, "step": 1445 }, { - "epoch": 0.05, - "learning_rate": 1.700281425891182e-05, - "loss": 0.3687, + "epoch": 0.0522579017551447, + "grad_norm": 0.29399409890174866, + "learning_rate": 1.7415325486428058e-05, + "loss": 0.547, "step": 1450 }, { - "epoch": 0.05, - "learning_rate": 1.706144465290807e-05, - "loss": 0.3991, + "epoch": 0.052438101416369336, + "grad_norm": 0.24069805443286896, + "learning_rate": 1.7475378332932983e-05, + "loss": 0.5267, "step": 1455 }, { - "epoch": 0.05, - "learning_rate": 1.7120075046904315e-05, - "loss": 0.3493, + "epoch": 0.05261830107759397, + "grad_norm": 0.23223073780536652, + "learning_rate": 1.7535431179437905e-05, + "loss": 0.4945, "step": 1460 }, { - "epoch": 0.05, - "learning_rate": 1.7178705440900564e-05, - "loss": 0.3338, + "epoch": 0.05279850073881861, + "grad_norm": 0.19976119697093964, + "learning_rate": 1.759548402594283e-05, + "loss": 0.5135, "step": 1465 }, { - "epoch": 0.05, - "learning_rate": 1.7237335834896813e-05, - "loss": 0.3733, + "epoch": 0.052978700400043245, + "grad_norm": 0.2490244060754776, + "learning_rate": 1.7655536872447757e-05, + "loss": 0.5031, "step": 1470 }, { - "epoch": 0.05, - "learning_rate": 1.7295966228893058e-05, - "loss": 0.3407, + "epoch": 0.05315890006126788, + "grad_norm": 0.23295678198337555, + "learning_rate": 1.771558971895268e-05, + "loss": 0.5652, "step": 1475 }, { - "epoch": 0.05, - "learning_rate": 1.7354596622889307e-05, - "loss": 0.3743, + "epoch": 0.053339099722492525, + "grad_norm": 0.22554819285869598, + "learning_rate": 1.7775642565457604e-05, + "loss": 0.5279, "step": 1480 }, { - "epoch": 0.05, - "learning_rate": 1.7413227016885556e-05, - "loss": 0.3911, + "epoch": 
0.05351929938371716, + "grad_norm": 0.2634689509868622, + "learning_rate": 1.7835695411962526e-05, + "loss": 0.5486, "step": 1485 }, { - "epoch": 0.05, - "learning_rate": 1.7471857410881802e-05, - "loss": 0.3591, + "epoch": 0.0536994990449418, + "grad_norm": 0.1896762251853943, + "learning_rate": 1.7895748258467452e-05, + "loss": 0.5419, "step": 1490 }, { - "epoch": 0.05, - "learning_rate": 1.7530487804878047e-05, - "loss": 0.3793, + "epoch": 0.053879698706166435, + "grad_norm": 0.21280480921268463, + "learning_rate": 1.7955801104972378e-05, + "loss": 0.5541, "step": 1495 }, { - "epoch": 0.05, - "learning_rate": 1.7589118198874296e-05, - "loss": 0.3851, + "epoch": 0.05405989836739107, + "grad_norm": 0.19312447309494019, + "learning_rate": 1.8015853951477303e-05, + "loss": 0.5315, "step": 1500 }, { - "epoch": 0.05, - "eval_loss": 0.3775494694709778, - "eval_runtime": 10.5367, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 0.05405989836739107, + "eval_loss": 0.5359941720962524, + "eval_runtime": 3.5176, + "eval_samples_per_second": 28.429, + "eval_steps_per_second": 7.107, "step": 1500 }, { - "epoch": 0.05, - "learning_rate": 1.7647748592870545e-05, - "loss": 0.3634, + "epoch": 0.05424009802861571, + "grad_norm": 0.22356046736240387, + "learning_rate": 1.8075906797982225e-05, + "loss": 0.5439, "step": 1505 }, { - "epoch": 0.05, - "learning_rate": 1.770637898686679e-05, - "loss": 0.4114, + "epoch": 0.054420297689840344, + "grad_norm": 0.24172358214855194, + "learning_rate": 1.813595964448715e-05, + "loss": 0.5436, "step": 1510 }, { - "epoch": 0.05, - "learning_rate": 1.776500938086304e-05, - "loss": 0.378, + "epoch": 0.05460049735106498, + "grad_norm": 0.25496965646743774, + "learning_rate": 1.8196012490992073e-05, + "loss": 0.5427, "step": 1515 }, { - "epoch": 0.05, - "learning_rate": 1.782363977485929e-05, - "loss": 0.3722, + "epoch": 0.05478069701228962, + "grad_norm": 0.21974816918373108, + "learning_rate": 1.8256065337497e-05, + "loss": 0.5261, "step": 1520 }, { - "epoch": 0.05, - "learning_rate": 1.7882270168855535e-05, - "loss": 0.3827, + "epoch": 0.05496089667351425, + "grad_norm": 0.2528958320617676, + "learning_rate": 1.8316118184001924e-05, + "loss": 0.5123, "step": 1525 }, { - "epoch": 0.05, - "learning_rate": 1.7940900562851784e-05, - "loss": 0.383, + "epoch": 0.05514109633473889, + "grad_norm": 0.2141297310590744, + "learning_rate": 1.8376171030506846e-05, + "loss": 0.4936, "step": 1530 }, { - "epoch": 0.05, - "learning_rate": 1.7999530956848033e-05, - "loss": 0.3992, + "epoch": 0.055321295995963526, + "grad_norm": 0.2709108293056488, + "learning_rate": 1.8436223877011772e-05, + "loss": 0.5397, "step": 1535 }, { - "epoch": 0.05, - "learning_rate": 1.805816135084428e-05, - "loss": 0.3733, + "epoch": 0.05550149565718816, + "grad_norm": 0.2779485285282135, + "learning_rate": 1.8496276723516694e-05, + "loss": 0.5001, "step": 1540 }, { - "epoch": 0.05, - "learning_rate": 1.8116791744840524e-05, - "loss": 0.3744, + "epoch": 0.0556816953184128, + "grad_norm": 0.22084900736808777, + "learning_rate": 1.855632957002162e-05, + "loss": 0.4713, "step": 1545 }, { - "epoch": 0.05, - "learning_rate": 1.8175422138836773e-05, - "loss": 0.3748, + "epoch": 0.055861894979637436, + "grad_norm": 0.1823149174451828, + "learning_rate": 1.8616382416526545e-05, + "loss": 0.4974, "step": 1550 }, { - "epoch": 0.05, - "learning_rate": 1.8234052532833022e-05, - "loss": 0.3561, + "epoch": 0.05604209464086207, + "grad_norm": 0.20994675159454346, + "learning_rate": 
1.867643526303147e-05, + "loss": 0.5241, "step": 1555 }, { - "epoch": 0.05, - "learning_rate": 1.8292682926829268e-05, - "loss": 0.3616, + "epoch": 0.05622229430208671, + "grad_norm": 0.24369290471076965, + "learning_rate": 1.8736488109536393e-05, + "loss": 0.5478, "step": 1560 }, { - "epoch": 0.06, - "learning_rate": 1.8351313320825517e-05, - "loss": 0.399, + "epoch": 0.05640249396331135, + "grad_norm": 0.259897381067276, + "learning_rate": 1.879654095604132e-05, + "loss": 0.4991, "step": 1565 }, { - "epoch": 0.06, - "learning_rate": 1.8409943714821766e-05, - "loss": 0.375, + "epoch": 0.05658269362453599, + "grad_norm": 0.25176161527633667, + "learning_rate": 1.885659380254624e-05, + "loss": 0.5066, "step": 1570 }, { - "epoch": 0.06, - "learning_rate": 1.846857410881801e-05, - "loss": 0.3917, + "epoch": 0.056762893285760625, + "grad_norm": 0.2606624364852905, + "learning_rate": 1.8916646649051163e-05, + "loss": 0.4857, "step": 1575 }, { - "epoch": 0.06, - "learning_rate": 1.852720450281426e-05, - "loss": 0.3619, + "epoch": 0.05694309294698526, + "grad_norm": 0.197222039103508, + "learning_rate": 1.8976699495556092e-05, + "loss": 0.5225, "step": 1580 }, { - "epoch": 0.06, - "learning_rate": 1.858583489681051e-05, - "loss": 0.3954, + "epoch": 0.0571232926082099, + "grad_norm": 0.24445274472236633, + "learning_rate": 1.9036752342061014e-05, + "loss": 0.4929, "step": 1585 }, { - "epoch": 0.06, - "learning_rate": 1.8644465290806755e-05, - "loss": 0.3946, + "epoch": 0.057303492269434535, + "grad_norm": 0.21949337422847748, + "learning_rate": 1.909680518856594e-05, + "loss": 0.5149, "step": 1590 }, { - "epoch": 0.06, - "learning_rate": 1.8703095684803e-05, - "loss": 0.3951, + "epoch": 0.05748369193065917, + "grad_norm": 0.19175410270690918, + "learning_rate": 1.9156858035070862e-05, + "loss": 0.4939, "step": 1595 }, { - "epoch": 0.06, - "learning_rate": 1.876172607879925e-05, - "loss": 0.3481, + "epoch": 0.05766389159188381, + "grad_norm": 0.2463282346725464, + "learning_rate": 1.9216910881575787e-05, + "loss": 0.5435, "step": 1600 }, { - "epoch": 0.06, - "learning_rate": 1.88203564727955e-05, - "loss": 0.3811, + "epoch": 0.057844091253108444, + "grad_norm": 0.2281898409128189, + "learning_rate": 1.9276963728080713e-05, + "loss": 0.5538, "step": 1605 }, { - "epoch": 0.06, - "learning_rate": 1.8878986866791744e-05, - "loss": 0.3951, + "epoch": 0.05802429091433308, + "grad_norm": 0.2389582097530365, + "learning_rate": 1.933701657458564e-05, + "loss": 0.5149, "step": 1610 }, { - "epoch": 0.06, - "learning_rate": 1.8937617260787993e-05, - "loss": 0.3794, + "epoch": 0.05820449057555772, + "grad_norm": 0.23086562752723694, + "learning_rate": 1.939706942109056e-05, + "loss": 0.5109, "step": 1615 }, { - "epoch": 0.06, - "learning_rate": 1.8996247654784242e-05, - "loss": 0.4044, + "epoch": 0.058384690236782354, + "grad_norm": 0.2583639621734619, + "learning_rate": 1.9457122267595486e-05, + "loss": 0.5174, "step": 1620 }, { - "epoch": 0.06, - "learning_rate": 1.9054878048780488e-05, - "loss": 0.3489, + "epoch": 0.05856488989800699, + "grad_norm": 0.20627906918525696, + "learning_rate": 1.951717511410041e-05, + "loss": 0.5224, "step": 1625 }, { - "epoch": 0.06, - "learning_rate": 1.9113508442776737e-05, - "loss": 0.3395, + "epoch": 0.05874508955923163, + "grad_norm": 0.30713996291160583, + "learning_rate": 1.9577227960605334e-05, + "loss": 0.5218, "step": 1630 }, { - "epoch": 0.06, - "learning_rate": 1.9172138836772986e-05, - "loss": 0.4209, + "epoch": 0.05892528922045626, + "grad_norm": 
0.26830917596817017, + "learning_rate": 1.963728080711026e-05, + "loss": 0.5101, "step": 1635 }, { - "epoch": 0.06, - "learning_rate": 1.923076923076923e-05, - "loss": 0.3852, + "epoch": 0.0591054888816809, + "grad_norm": 0.21486251056194305, + "learning_rate": 1.969733365361518e-05, + "loss": 0.4967, "step": 1640 }, { - "epoch": 0.06, - "learning_rate": 1.928939962476548e-05, - "loss": 0.3539, + "epoch": 0.059285688542905536, + "grad_norm": 0.26688727736473083, + "learning_rate": 1.9757386500120107e-05, + "loss": 0.5322, "step": 1645 }, { - "epoch": 0.06, - "learning_rate": 1.9348030018761726e-05, - "loss": 0.4027, + "epoch": 0.05946588820413018, + "grad_norm": 0.23560921847820282, + "learning_rate": 1.981743934662503e-05, + "loss": 0.491, "step": 1650 }, { - "epoch": 0.06, - "learning_rate": 1.9406660412757975e-05, - "loss": 0.369, + "epoch": 0.059646087865354816, + "grad_norm": 0.27083316445350647, + "learning_rate": 1.9877492193129955e-05, + "loss": 0.5475, "step": 1655 }, { - "epoch": 0.06, - "learning_rate": 1.946529080675422e-05, - "loss": 0.3624, + "epoch": 0.05982628752657945, + "grad_norm": 0.24693989753723145, + "learning_rate": 1.993754503963488e-05, + "loss": 0.5349, "step": 1660 }, { - "epoch": 0.06, - "learning_rate": 1.952392120075047e-05, - "loss": 0.365, + "epoch": 0.06000648718780409, + "grad_norm": 0.2298416942358017, + "learning_rate": 1.9997597886139806e-05, + "loss": 0.5048, "step": 1665 }, { - "epoch": 0.06, - "learning_rate": 1.958255159474672e-05, - "loss": 0.3642, + "epoch": 0.060186686849028725, + "grad_norm": 0.2370372712612152, + "learning_rate": 2.0057650732644728e-05, + "loss": 0.4959, "step": 1670 }, { - "epoch": 0.06, - "learning_rate": 1.9641181988742964e-05, - "loss": 0.3889, + "epoch": 0.06036688651025336, + "grad_norm": 0.25343507528305054, + "learning_rate": 2.011770357914965e-05, + "loss": 0.5119, "step": 1675 }, { - "epoch": 0.06, - "learning_rate": 1.9699812382739213e-05, - "loss": 0.3828, + "epoch": 0.060547086171478, + "grad_norm": 0.28533029556274414, + "learning_rate": 2.0177756425654576e-05, + "loss": 0.5539, "step": 1680 }, { - "epoch": 0.06, - "learning_rate": 1.9758442776735462e-05, - "loss": 0.3699, + "epoch": 0.060727285832702635, + "grad_norm": 0.2781274914741516, + "learning_rate": 2.02378092721595e-05, + "loss": 0.4953, "step": 1685 }, { - "epoch": 0.06, - "learning_rate": 1.9817073170731708e-05, - "loss": 0.3726, + "epoch": 0.06090748549392727, + "grad_norm": 0.33222508430480957, + "learning_rate": 2.0297862118664427e-05, + "loss": 0.5152, "step": 1690 }, { - "epoch": 0.06, - "learning_rate": 1.9875703564727957e-05, - "loss": 0.3941, + "epoch": 0.06108768515515191, + "grad_norm": 0.2313944697380066, + "learning_rate": 2.035791496516935e-05, + "loss": 0.5293, "step": 1695 }, { - "epoch": 0.06, - "learning_rate": 1.9934333958724203e-05, - "loss": 0.3476, + "epoch": 0.061267884816376544, + "grad_norm": 0.28105735778808594, + "learning_rate": 2.0417967811674275e-05, + "loss": 0.5141, "step": 1700 }, { - "epoch": 0.06, - "learning_rate": 1.999296435272045e-05, - "loss": 0.3631, + "epoch": 0.06144808447760118, + "grad_norm": 0.2256317436695099, + "learning_rate": 2.0478020658179197e-05, + "loss": 0.4813, "step": 1705 }, { - "epoch": 0.06, - "learning_rate": 2.0051594746716697e-05, - "loss": 0.3619, + "epoch": 0.06162828413882582, + "grad_norm": 0.23726746439933777, + "learning_rate": 2.0538073504684126e-05, + "loss": 0.5469, "step": 1710 }, { - "epoch": 0.06, - "learning_rate": 2.0110225140712946e-05, - "loss": 0.3699, + "epoch": 
0.061808483800050454, + "grad_norm": 0.27420201897621155, + "learning_rate": 2.0598126351189048e-05, + "loss": 0.4634, "step": 1715 }, { - "epoch": 0.06, - "learning_rate": 2.0168855534709195e-05, - "loss": 0.3832, + "epoch": 0.06198868346127509, + "grad_norm": 0.2140614092350006, + "learning_rate": 2.065817919769397e-05, + "loss": 0.5011, "step": 1720 }, { - "epoch": 0.06, - "learning_rate": 2.022748592870544e-05, - "loss": 0.3747, + "epoch": 0.06216888312249973, + "grad_norm": 0.21357890963554382, + "learning_rate": 2.0718232044198896e-05, + "loss": 0.4775, "step": 1725 }, { - "epoch": 0.06, - "learning_rate": 2.028611632270169e-05, - "loss": 0.3727, + "epoch": 0.06234908278372436, + "grad_norm": 0.2980591058731079, + "learning_rate": 2.0778284890703818e-05, + "loss": 0.5457, "step": 1730 }, { - "epoch": 0.06, - "learning_rate": 2.034474671669794e-05, - "loss": 0.3725, + "epoch": 0.062529282444949, + "grad_norm": 0.2486668825149536, + "learning_rate": 2.0838337737208744e-05, + "loss": 0.4759, "step": 1735 }, { - "epoch": 0.06, - "learning_rate": 2.0403377110694185e-05, - "loss": 0.3771, + "epoch": 0.06270948210617364, + "grad_norm": 0.24250219762325287, + "learning_rate": 2.089839058371367e-05, + "loss": 0.5026, "step": 1740 }, { - "epoch": 0.06, - "learning_rate": 2.0462007504690434e-05, - "loss": 0.3689, + "epoch": 0.06288968176739827, + "grad_norm": 0.2787621319293976, + "learning_rate": 2.0958443430218595e-05, + "loss": 0.5464, "step": 1745 }, { - "epoch": 0.06, - "learning_rate": 2.0520637898686683e-05, - "loss": 0.3746, + "epoch": 0.06306988142862291, + "grad_norm": 0.23946985602378845, + "learning_rate": 2.1018496276723517e-05, + "loss": 0.5052, "step": 1750 }, { - "epoch": 0.06, - "learning_rate": 2.0579268292682928e-05, - "loss": 0.3559, + "epoch": 0.06325008108984755, + "grad_norm": 0.26933711767196655, + "learning_rate": 2.1078549123228443e-05, + "loss": 0.5287, "step": 1755 }, { - "epoch": 0.06, - "learning_rate": 2.0637898686679174e-05, - "loss": 0.3715, + "epoch": 0.06343028075107218, + "grad_norm": 0.2348264902830124, + "learning_rate": 2.1138601969733365e-05, + "loss": 0.5193, "step": 1760 }, { - "epoch": 0.06, - "learning_rate": 2.0696529080675423e-05, - "loss": 0.3851, + "epoch": 0.06361048041229682, + "grad_norm": 0.26587164402008057, + "learning_rate": 2.1198654816238294e-05, + "loss": 0.5169, "step": 1765 }, { - "epoch": 0.06, - "learning_rate": 2.0755159474671672e-05, - "loss": 0.3851, + "epoch": 0.06379068007352146, + "grad_norm": 0.2152547687292099, + "learning_rate": 2.1258707662743216e-05, + "loss": 0.5034, "step": 1770 }, { - "epoch": 0.06, - "learning_rate": 2.0813789868667917e-05, - "loss": 0.3639, + "epoch": 0.0639708797347461, + "grad_norm": 0.27491626143455505, + "learning_rate": 2.1318760509248138e-05, + "loss": 0.5476, "step": 1775 }, { - "epoch": 0.06, - "learning_rate": 2.0872420262664166e-05, - "loss": 0.3669, + "epoch": 0.06415107939597074, + "grad_norm": 0.22118522226810455, + "learning_rate": 2.1378813355753064e-05, + "loss": 0.5291, "step": 1780 }, { - "epoch": 0.06, - "learning_rate": 2.0931050656660415e-05, - "loss": 0.3942, + "epoch": 0.06433127905719538, + "grad_norm": 0.2910890281200409, + "learning_rate": 2.1438866202257986e-05, + "loss": 0.5183, "step": 1785 }, { - "epoch": 0.06, - "learning_rate": 2.098968105065666e-05, - "loss": 0.3495, + "epoch": 0.06451147871842002, + "grad_norm": 0.2631491720676422, + "learning_rate": 2.149891904876291e-05, + "loss": 0.4692, "step": 1790 }, { - "epoch": 0.06, - "learning_rate": 2.104831144465291e-05, - 
"loss": 0.3516, + "epoch": 0.06469167837964465, + "grad_norm": 0.2912589907646179, + "learning_rate": 2.1558971895267837e-05, + "loss": 0.5036, "step": 1795 }, { - "epoch": 0.06, - "learning_rate": 2.110694183864916e-05, - "loss": 0.3966, + "epoch": 0.06487187804086929, + "grad_norm": 0.2575485408306122, + "learning_rate": 2.1619024741772762e-05, + "loss": 0.4924, "step": 1800 }, { - "epoch": 0.06, - "learning_rate": 2.1165572232645405e-05, - "loss": 0.3839, + "epoch": 0.06505207770209392, + "grad_norm": 0.24182820320129395, + "learning_rate": 2.1679077588277685e-05, + "loss": 0.4854, "step": 1805 }, { - "epoch": 0.06, - "learning_rate": 2.122420262664165e-05, - "loss": 0.3659, + "epoch": 0.06523227736331856, + "grad_norm": 0.23146407306194305, + "learning_rate": 2.173913043478261e-05, + "loss": 0.4882, "step": 1810 }, { - "epoch": 0.06, - "learning_rate": 2.12828330206379e-05, - "loss": 0.3885, + "epoch": 0.0654124770245432, + "grad_norm": 0.2757367789745331, + "learning_rate": 2.1799183281287532e-05, + "loss": 0.4886, "step": 1815 }, { - "epoch": 0.06, - "learning_rate": 2.134146341463415e-05, - "loss": 0.3648, + "epoch": 0.06559267668576783, + "grad_norm": 0.22752000391483307, + "learning_rate": 2.1859236127792458e-05, + "loss": 0.49, "step": 1820 }, { - "epoch": 0.06, - "learning_rate": 2.1400093808630394e-05, - "loss": 0.38, + "epoch": 0.06577287634699247, + "grad_norm": 0.2603028416633606, + "learning_rate": 2.1919288974297383e-05, + "loss": 0.4861, "step": 1825 }, { - "epoch": 0.06, - "learning_rate": 2.1458724202626643e-05, - "loss": 0.3699, + "epoch": 0.0659530760082171, + "grad_norm": 0.226707324385643, + "learning_rate": 2.1979341820802306e-05, + "loss": 0.4598, "step": 1830 }, { - "epoch": 0.06, - "learning_rate": 2.1517354596622892e-05, - "loss": 0.3835, + "epoch": 0.06613327566944174, + "grad_norm": 0.2270716428756714, + "learning_rate": 2.203939466730723e-05, + "loss": 0.5174, "step": 1835 }, { - "epoch": 0.06, - "learning_rate": 2.1575984990619138e-05, - "loss": 0.3604, + "epoch": 0.06631347533066638, + "grad_norm": 0.2932356595993042, + "learning_rate": 2.2099447513812153e-05, + "loss": 0.5836, "step": 1840 }, { - "epoch": 0.06, - "learning_rate": 2.1634615384615387e-05, - "loss": 0.3876, + "epoch": 0.06649367499189102, + "grad_norm": 0.24968333542346954, + "learning_rate": 2.2159500360317082e-05, + "loss": 0.5215, "step": 1845 }, { - "epoch": 0.07, - "learning_rate": 2.1693245778611636e-05, - "loss": 0.3233, + "epoch": 0.06667387465311565, + "grad_norm": 0.3082146942615509, + "learning_rate": 2.2219553206822005e-05, + "loss": 0.5236, "step": 1850 }, { - "epoch": 0.07, - "learning_rate": 2.175187617260788e-05, - "loss": 0.3619, + "epoch": 0.06685407431434029, + "grad_norm": 0.2535383999347687, + "learning_rate": 2.227960605332693e-05, + "loss": 0.4725, "step": 1855 }, { - "epoch": 0.07, - "learning_rate": 2.1810506566604127e-05, - "loss": 0.3788, + "epoch": 0.06703427397556493, + "grad_norm": 0.380136102437973, + "learning_rate": 2.2339658899831852e-05, + "loss": 0.5161, "step": 1860 }, { - "epoch": 0.07, - "learning_rate": 2.1869136960600376e-05, - "loss": 0.3812, + "epoch": 0.06721447363678956, + "grad_norm": 0.24711187183856964, + "learning_rate": 2.2399711746336778e-05, + "loss": 0.5302, "step": 1865 }, { - "epoch": 0.07, - "learning_rate": 2.1927767354596625e-05, - "loss": 0.3977, + "epoch": 0.0673946732980142, + "grad_norm": 0.285430908203125, + "learning_rate": 2.24597645928417e-05, + "loss": 0.5437, "step": 1870 }, { - "epoch": 0.07, - "learning_rate": 
2.198639774859287e-05, - "loss": 0.3496, + "epoch": 0.06757487295923884, + "grad_norm": 0.2444837987422943, + "learning_rate": 2.2519817439346626e-05, + "loss": 0.4988, "step": 1875 }, { - "epoch": 0.07, - "learning_rate": 2.204502814258912e-05, - "loss": 0.3678, + "epoch": 0.06775507262046347, + "grad_norm": 0.3459201455116272, + "learning_rate": 2.257987028585155e-05, + "loss": 0.5126, "step": 1880 }, { - "epoch": 0.07, - "learning_rate": 2.210365853658537e-05, - "loss": 0.3521, + "epoch": 0.06793527228168811, + "grad_norm": 0.23254404962062836, + "learning_rate": 2.2639923132356473e-05, + "loss": 0.5229, "step": 1885 }, { - "epoch": 0.07, - "learning_rate": 2.2162288930581614e-05, - "loss": 0.3765, + "epoch": 0.06811547194291274, + "grad_norm": 0.3149000406265259, + "learning_rate": 2.26999759788614e-05, + "loss": 0.5091, "step": 1890 }, { - "epoch": 0.07, - "learning_rate": 2.2220919324577863e-05, - "loss": 0.3787, + "epoch": 0.06829567160413738, + "grad_norm": 0.31007108092308044, + "learning_rate": 2.276002882536632e-05, + "loss": 0.487, "step": 1895 }, { - "epoch": 0.07, - "learning_rate": 2.2279549718574112e-05, - "loss": 0.3749, + "epoch": 0.06847587126536202, + "grad_norm": 0.25598689913749695, + "learning_rate": 2.282008167187125e-05, + "loss": 0.5061, "step": 1900 }, { - "epoch": 0.07, - "learning_rate": 2.2338180112570358e-05, - "loss": 0.3536, + "epoch": 0.06865607092658665, + "grad_norm": 0.27555733919143677, + "learning_rate": 2.2880134518376172e-05, + "loss": 0.5177, "step": 1905 }, { - "epoch": 0.07, - "learning_rate": 2.2396810506566603e-05, - "loss": 0.367, + "epoch": 0.06883627058781129, + "grad_norm": 0.28343528509140015, + "learning_rate": 2.2940187364881098e-05, + "loss": 0.5267, "step": 1910 }, { - "epoch": 0.07, - "learning_rate": 2.2455440900562852e-05, - "loss": 0.3746, + "epoch": 0.06901647024903593, + "grad_norm": 0.1938653290271759, + "learning_rate": 2.300024021138602e-05, + "loss": 0.4926, "step": 1915 }, { - "epoch": 0.07, - "learning_rate": 2.25140712945591e-05, - "loss": 0.3574, + "epoch": 0.06919666991026056, + "grad_norm": 0.2610265910625458, + "learning_rate": 2.3060293057890945e-05, + "loss": 0.5187, "step": 1920 }, { - "epoch": 0.07, - "learning_rate": 2.2572701688555347e-05, - "loss": 0.3853, + "epoch": 0.0693768695714852, + "grad_norm": 0.20911858975887299, + "learning_rate": 2.3120345904395868e-05, + "loss": 0.4979, "step": 1925 }, { - "epoch": 0.07, - "learning_rate": 2.2631332082551596e-05, - "loss": 0.3839, + "epoch": 0.06955706923270984, + "grad_norm": 0.2716982662677765, + "learning_rate": 2.3180398750900793e-05, + "loss": 0.4976, "step": 1930 }, { - "epoch": 0.07, - "learning_rate": 2.2689962476547845e-05, - "loss": 0.3809, + "epoch": 0.06973726889393447, + "grad_norm": 0.3215219974517822, + "learning_rate": 2.324045159740572e-05, + "loss": 0.5193, "step": 1935 }, { - "epoch": 0.07, - "learning_rate": 2.274859287054409e-05, - "loss": 0.3989, + "epoch": 0.06991746855515911, + "grad_norm": 0.3229633867740631, + "learning_rate": 2.330050444391064e-05, + "loss": 0.5057, "step": 1940 }, { - "epoch": 0.07, - "learning_rate": 2.280722326454034e-05, - "loss": 0.3575, + "epoch": 0.07009766821638376, + "grad_norm": 0.20286689698696136, + "learning_rate": 2.3360557290415567e-05, + "loss": 0.4657, "step": 1945 }, { - "epoch": 0.07, - "learning_rate": 2.286585365853659e-05, - "loss": 0.3877, + "epoch": 0.0702778678776084, + "grad_norm": 0.30475685000419617, + "learning_rate": 2.342061013692049e-05, + "loss": 0.5316, "step": 1950 }, { - "epoch": 0.07, - 
"learning_rate": 2.2924484052532834e-05, - "loss": 0.3573, + "epoch": 0.07045806753883303, + "grad_norm": 0.273539662361145, + "learning_rate": 2.3480662983425418e-05, + "loss": 0.5268, "step": 1955 }, { - "epoch": 0.07, - "learning_rate": 2.298311444652908e-05, - "loss": 0.3377, + "epoch": 0.07063826720005767, + "grad_norm": 0.2909047305583954, + "learning_rate": 2.354071582993034e-05, + "loss": 0.4885, "step": 1960 }, { - "epoch": 0.07, - "learning_rate": 2.304174484052533e-05, - "loss": 0.3755, + "epoch": 0.0708184668612823, + "grad_norm": 0.23149296641349792, + "learning_rate": 2.3600768676435265e-05, + "loss": 0.534, "step": 1965 }, { - "epoch": 0.07, - "learning_rate": 2.3100375234521575e-05, - "loss": 0.373, + "epoch": 0.07099866652250694, + "grad_norm": 0.32449668645858765, + "learning_rate": 2.3660821522940188e-05, + "loss": 0.508, "step": 1970 }, { - "epoch": 0.07, - "learning_rate": 2.3159005628517824e-05, - "loss": 0.3544, + "epoch": 0.07117886618373158, + "grad_norm": 0.2767580449581146, + "learning_rate": 2.3720874369445113e-05, + "loss": 0.5106, "step": 1975 }, { - "epoch": 0.07, - "learning_rate": 2.3217636022514073e-05, - "loss": 0.3509, + "epoch": 0.07135906584495622, + "grad_norm": 0.3347299098968506, + "learning_rate": 2.378092721595004e-05, + "loss": 0.4898, "step": 1980 }, { - "epoch": 0.07, - "learning_rate": 2.327626641651032e-05, - "loss": 0.3458, + "epoch": 0.07153926550618085, + "grad_norm": 0.2920180559158325, + "learning_rate": 2.384098006245496e-05, + "loss": 0.5102, "step": 1985 }, { - "epoch": 0.07, - "learning_rate": 2.3334896810506567e-05, - "loss": 0.3339, + "epoch": 0.07171946516740549, + "grad_norm": 0.2103186845779419, + "learning_rate": 2.3901032908959886e-05, + "loss": 0.5, "step": 1990 }, { - "epoch": 0.07, - "learning_rate": 2.3393527204502816e-05, - "loss": 0.3466, + "epoch": 0.07189966482863012, + "grad_norm": 0.276729941368103, + "learning_rate": 2.396108575546481e-05, + "loss": 0.4737, "step": 1995 }, { - "epoch": 0.07, - "learning_rate": 2.3452157598499065e-05, - "loss": 0.3545, + "epoch": 0.07207986448985476, + "grad_norm": 0.25289759039878845, + "learning_rate": 2.4021138601969734e-05, + "loss": 0.4869, "step": 2000 }, { - "epoch": 0.07, - "eval_loss": 0.3645670711994171, - "eval_runtime": 10.5377, - "eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 0.07207986448985476, + "eval_loss": 0.5271518230438232, + "eval_runtime": 3.5218, + "eval_samples_per_second": 28.395, + "eval_steps_per_second": 7.099, "step": 2000 }, { - "epoch": 0.07, - "learning_rate": 2.351078799249531e-05, - "loss": 0.3561, + "epoch": 0.0722600641510794, + "grad_norm": 0.2662065923213959, + "learning_rate": 2.4081191448474656e-05, + "loss": 0.5005, "step": 2005 }, { - "epoch": 0.07, - "learning_rate": 2.356941838649156e-05, - "loss": 0.3915, + "epoch": 0.07244026381230403, + "grad_norm": 0.2379211187362671, + "learning_rate": 2.4141244294979585e-05, + "loss": 0.4959, "step": 2010 }, { - "epoch": 0.07, - "learning_rate": 2.3628048780487806e-05, - "loss": 0.3427, + "epoch": 0.07262046347352867, + "grad_norm": 0.25165653228759766, + "learning_rate": 2.4201297141484507e-05, + "loss": 0.4869, "step": 2015 }, { - "epoch": 0.07, - "learning_rate": 2.368667917448405e-05, - "loss": 0.3641, + "epoch": 0.07280066313475331, + "grad_norm": 0.21869604289531708, + "learning_rate": 2.4261349987989433e-05, + "loss": 0.4938, "step": 2020 }, { - "epoch": 0.07, - "learning_rate": 2.37453095684803e-05, - "loss": 0.3611, + "epoch": 0.07298086279597794, + "grad_norm": 
0.21399500966072083, + "learning_rate": 2.4321402834494355e-05, + "loss": 0.5029, "step": 2025 }, { - "epoch": 0.07, - "learning_rate": 2.380393996247655e-05, - "loss": 0.3538, + "epoch": 0.07316106245720258, + "grad_norm": 0.24735891819000244, + "learning_rate": 2.438145568099928e-05, + "loss": 0.503, "step": 2030 }, { - "epoch": 0.07, - "learning_rate": 2.3862570356472795e-05, - "loss": 0.3584, + "epoch": 0.07334126211842722, + "grad_norm": 0.27130669355392456, + "learning_rate": 2.4441508527504206e-05, + "loss": 0.4726, "step": 2035 }, { - "epoch": 0.07, - "learning_rate": 2.3921200750469044e-05, - "loss": 0.3742, + "epoch": 0.07352146177965185, + "grad_norm": 0.33456870913505554, + "learning_rate": 2.450156137400913e-05, + "loss": 0.4906, "step": 2040 }, { - "epoch": 0.07, - "learning_rate": 2.3979831144465293e-05, - "loss": 0.3686, + "epoch": 0.07370166144087649, + "grad_norm": 0.3058684766292572, + "learning_rate": 2.4561614220514054e-05, + "loss": 0.5345, "step": 2045 }, { - "epoch": 0.07, - "learning_rate": 2.4038461538461542e-05, - "loss": 0.4034, + "epoch": 0.07388186110210113, + "grad_norm": 0.24727894365787506, + "learning_rate": 2.4621667067018976e-05, + "loss": 0.5108, "step": 2050 }, { - "epoch": 0.07, - "learning_rate": 2.4097091932457787e-05, - "loss": 0.3634, + "epoch": 0.07406206076332576, + "grad_norm": 0.32694023847579956, + "learning_rate": 2.4681719913523902e-05, + "loss": 0.4917, "step": 2055 }, { - "epoch": 0.07, - "learning_rate": 2.4155722326454036e-05, - "loss": 0.3832, + "epoch": 0.0742422604245504, + "grad_norm": 0.3927519917488098, + "learning_rate": 2.4741772760028827e-05, + "loss": 0.4762, "step": 2060 }, { - "epoch": 0.07, - "learning_rate": 2.4214352720450282e-05, - "loss": 0.364, + "epoch": 0.07442246008577504, + "grad_norm": 0.22874149680137634, + "learning_rate": 2.4801825606533753e-05, + "loss": 0.4849, "step": 2065 }, { - "epoch": 0.07, - "learning_rate": 2.4272983114446528e-05, - "loss": 0.3563, + "epoch": 0.07460265974699967, + "grad_norm": 0.24440160393714905, + "learning_rate": 2.4861878453038675e-05, + "loss": 0.4888, "step": 2070 }, { - "epoch": 0.07, - "learning_rate": 2.4331613508442777e-05, - "loss": 0.3542, + "epoch": 0.07478285940822431, + "grad_norm": 0.2987866997718811, + "learning_rate": 2.49219312995436e-05, + "loss": 0.4926, "step": 2075 }, { - "epoch": 0.07, - "learning_rate": 2.4390243902439026e-05, - "loss": 0.382, + "epoch": 0.07496305906944895, + "grad_norm": 0.2661680281162262, + "learning_rate": 2.4981984146048523e-05, + "loss": 0.4854, "step": 2080 }, { - "epoch": 0.07, - "learning_rate": 2.444887429643527e-05, - "loss": 0.3793, + "epoch": 0.07514325873067358, + "grad_norm": 0.23432804644107819, + "learning_rate": 2.504203699255345e-05, + "loss": 0.4452, "step": 2085 }, { - "epoch": 0.07, - "learning_rate": 2.450750469043152e-05, - "loss": 0.3624, + "epoch": 0.07532345839189822, + "grad_norm": 0.28125226497650146, + "learning_rate": 2.5102089839058374e-05, + "loss": 0.4941, "step": 2090 }, { - "epoch": 0.07, - "learning_rate": 2.456613508442777e-05, - "loss": 0.3771, + "epoch": 0.07550365805312285, + "grad_norm": 0.2805047631263733, + "learning_rate": 2.5162142685563296e-05, + "loss": 0.4885, "step": 2095 }, { - "epoch": 0.07, - "learning_rate": 2.462476547842402e-05, - "loss": 0.3546, + "epoch": 0.07568385771434749, + "grad_norm": 0.24063357710838318, + "learning_rate": 2.5222195532068222e-05, + "loss": 0.4973, "step": 2100 }, { - "epoch": 0.07, - "learning_rate": 2.4683395872420264e-05, - "loss": 0.3705, + "epoch": 
0.07586405737557213, + "grad_norm": 0.27712422609329224, + "learning_rate": 2.5282248378573147e-05, + "loss": 0.5314, "step": 2105 }, { - "epoch": 0.07, - "learning_rate": 2.4742026266416513e-05, - "loss": 0.3835, + "epoch": 0.07604425703679676, + "grad_norm": 0.2542542517185211, + "learning_rate": 2.534230122507807e-05, + "loss": 0.5142, "step": 2110 }, { - "epoch": 0.07, - "learning_rate": 2.4800656660412762e-05, - "loss": 0.3706, + "epoch": 0.07622445669802141, + "grad_norm": 0.19768337905406952, + "learning_rate": 2.5402354071582995e-05, + "loss": 0.4794, "step": 2115 }, { - "epoch": 0.07, - "learning_rate": 2.4859287054409004e-05, - "loss": 0.3756, + "epoch": 0.07640465635924605, + "grad_norm": 0.2631590664386749, + "learning_rate": 2.5462406918087917e-05, + "loss": 0.5131, "step": 2120 }, { - "epoch": 0.07, - "learning_rate": 2.4917917448405253e-05, - "loss": 0.3632, + "epoch": 0.07658485602047069, + "grad_norm": 0.29528722167015076, + "learning_rate": 2.5522459764592843e-05, + "loss": 0.5186, "step": 2125 }, { - "epoch": 0.07, - "learning_rate": 2.4976547842401502e-05, - "loss": 0.3488, + "epoch": 0.07676505568169532, + "grad_norm": 0.22340206801891327, + "learning_rate": 2.558251261109777e-05, + "loss": 0.4905, "step": 2130 }, { - "epoch": 0.08, - "learning_rate": 2.5035178236397748e-05, - "loss": 0.3553, + "epoch": 0.07694525534291996, + "grad_norm": 0.21741612255573273, + "learning_rate": 2.564256545760269e-05, + "loss": 0.4528, "step": 2135 }, { - "epoch": 0.08, - "learning_rate": 2.5093808630394e-05, - "loss": 0.3656, + "epoch": 0.0771254550041446, + "grad_norm": 0.2538367807865143, + "learning_rate": 2.5702618304107616e-05, + "loss": 0.5053, "step": 2140 }, { - "epoch": 0.08, - "learning_rate": 2.5152439024390246e-05, - "loss": 0.3598, + "epoch": 0.07730565466536923, + "grad_norm": 0.21461638808250427, + "learning_rate": 2.5762671150612538e-05, + "loss": 0.5069, "step": 2145 }, { - "epoch": 0.08, - "learning_rate": 2.521106941838649e-05, - "loss": 0.3478, + "epoch": 0.07748585432659387, + "grad_norm": 0.2098444551229477, + "learning_rate": 2.5822723997117464e-05, + "loss": 0.4934, "step": 2150 }, { - "epoch": 0.08, - "learning_rate": 2.526969981238274e-05, - "loss": 0.3963, + "epoch": 0.0776660539878185, + "grad_norm": 0.26539066433906555, + "learning_rate": 2.5882776843622393e-05, + "loss": 0.525, "step": 2155 }, { - "epoch": 0.08, - "learning_rate": 2.5328330206378986e-05, - "loss": 0.3649, + "epoch": 0.07784625364904314, + "grad_norm": 0.23766759037971497, + "learning_rate": 2.594282969012731e-05, + "loss": 0.473, "step": 2160 }, { - "epoch": 0.08, - "learning_rate": 2.538696060037524e-05, - "loss": 0.3606, + "epoch": 0.07802645331026778, + "grad_norm": 0.35496985912323, + "learning_rate": 2.600288253663224e-05, + "loss": 0.5096, "step": 2165 }, { - "epoch": 0.08, - "learning_rate": 2.544559099437148e-05, - "loss": 0.383, + "epoch": 0.07820665297149242, + "grad_norm": 0.23349033296108246, + "learning_rate": 2.606293538313716e-05, + "loss": 0.5294, "step": 2170 }, { - "epoch": 0.08, - "learning_rate": 2.5504221388367733e-05, - "loss": 0.3918, + "epoch": 0.07838685263271705, + "grad_norm": 0.26091858744621277, + "learning_rate": 2.6122988229642088e-05, + "loss": 0.4993, "step": 2175 }, { - "epoch": 0.08, - "learning_rate": 2.556285178236398e-05, - "loss": 0.3658, + "epoch": 0.07856705229394169, + "grad_norm": 0.22804076969623566, + "learning_rate": 2.6183041076147007e-05, + "loss": 0.4726, "step": 2180 }, { - "epoch": 0.08, - "learning_rate": 2.5621482176360228e-05, - "loss": 
0.3517, + "epoch": 0.07874725195516633, + "grad_norm": 0.24472366273403168, + "learning_rate": 2.6243093922651936e-05, + "loss": 0.4801, "step": 2185 }, { - "epoch": 0.08, - "learning_rate": 2.5680112570356473e-05, - "loss": 0.3743, + "epoch": 0.07892745161639096, + "grad_norm": 0.30233868956565857, + "learning_rate": 2.630314676915686e-05, + "loss": 0.5144, "step": 2190 }, { - "epoch": 0.08, - "learning_rate": 2.573874296435272e-05, - "loss": 0.3722, + "epoch": 0.0791076512776156, + "grad_norm": 0.2996428608894348, + "learning_rate": 2.6363199615661784e-05, + "loss": 0.518, "step": 2195 }, { - "epoch": 0.08, - "learning_rate": 2.5797373358348968e-05, - "loss": 0.4099, + "epoch": 0.07928785093884023, + "grad_norm": 0.25679364800453186, + "learning_rate": 2.642325246216671e-05, + "loss": 0.4516, "step": 2200 }, { - "epoch": 0.08, - "learning_rate": 2.5856003752345214e-05, - "loss": 0.3321, + "epoch": 0.07946805060006487, + "grad_norm": 0.311570942401886, + "learning_rate": 2.648330530867163e-05, + "loss": 0.5341, "step": 2205 }, { - "epoch": 0.08, - "learning_rate": 2.5914634146341466e-05, - "loss": 0.3664, + "epoch": 0.07964825026128951, + "grad_norm": 0.2966238260269165, + "learning_rate": 2.6543358155176557e-05, + "loss": 0.4725, "step": 2210 }, { - "epoch": 0.08, - "learning_rate": 2.5973264540337712e-05, - "loss": 0.4203, + "epoch": 0.07982844992251414, + "grad_norm": 0.254808247089386, + "learning_rate": 2.6603411001681483e-05, + "loss": 0.4943, "step": 2215 }, { - "epoch": 0.08, - "learning_rate": 2.603189493433396e-05, - "loss": 0.3258, + "epoch": 0.08000864958373878, + "grad_norm": 0.23695941269397736, + "learning_rate": 2.6663463848186405e-05, + "loss": 0.5048, "step": 2220 }, { - "epoch": 0.08, - "learning_rate": 2.6090525328330206e-05, - "loss": 0.368, + "epoch": 0.08018884924496342, + "grad_norm": 0.29264283180236816, + "learning_rate": 2.672351669469133e-05, + "loss": 0.4968, "step": 2225 }, { - "epoch": 0.08, - "learning_rate": 2.614915572232646e-05, - "loss": 0.3918, + "epoch": 0.08036904890618805, + "grad_norm": 0.24488191306591034, + "learning_rate": 2.6783569541196252e-05, + "loss": 0.4911, "step": 2230 }, { - "epoch": 0.08, - "learning_rate": 2.62077861163227e-05, - "loss": 0.348, + "epoch": 0.08054924856741269, + "grad_norm": 0.281082421541214, + "learning_rate": 2.6843622387701178e-05, + "loss": 0.4946, "step": 2235 }, { - "epoch": 0.08, - "learning_rate": 2.6266416510318953e-05, - "loss": 0.3674, + "epoch": 0.08072944822863733, + "grad_norm": 0.2860982418060303, + "learning_rate": 2.6903675234206104e-05, + "loss": 0.5168, "step": 2240 }, { - "epoch": 0.08, - "learning_rate": 2.63250469043152e-05, - "loss": 0.3862, + "epoch": 0.08090964788986196, + "grad_norm": 0.2372538298368454, + "learning_rate": 2.6963728080711026e-05, + "loss": 0.4603, "step": 2245 }, { - "epoch": 0.08, - "learning_rate": 2.6383677298311445e-05, - "loss": 0.3807, + "epoch": 0.0810898475510866, + "grad_norm": 0.3054511845111847, + "learning_rate": 2.702378092721595e-05, + "loss": 0.4975, "step": 2250 }, { - "epoch": 0.08, - "learning_rate": 2.6442307692307694e-05, - "loss": 0.3581, + "epoch": 0.08127004721231124, + "grad_norm": 0.3154446482658386, + "learning_rate": 2.7083833773720874e-05, + "loss": 0.5038, "step": 2255 }, { - "epoch": 0.08, - "learning_rate": 2.650093808630394e-05, - "loss": 0.3731, + "epoch": 0.08145024687353587, + "grad_norm": 0.2663957476615906, + "learning_rate": 2.71438866202258e-05, + "loss": 0.5158, "step": 2260 }, { - "epoch": 0.08, - "learning_rate": 
2.6559568480300188e-05, - "loss": 0.399, + "epoch": 0.08163044653476051, + "grad_norm": 0.2612464427947998, + "learning_rate": 2.7203939466730728e-05, + "loss": 0.5184, "step": 2265 }, { - "epoch": 0.08, - "learning_rate": 2.6618198874296434e-05, - "loss": 0.3581, + "epoch": 0.08181064619598515, + "grad_norm": 0.22396814823150635, + "learning_rate": 2.7263992313235647e-05, + "loss": 0.5009, "step": 2270 }, { - "epoch": 0.08, - "learning_rate": 2.6676829268292686e-05, - "loss": 0.3314, + "epoch": 0.08199084585720978, + "grad_norm": 0.2750130295753479, + "learning_rate": 2.7324045159740576e-05, + "loss": 0.5154, "step": 2275 }, { - "epoch": 0.08, - "learning_rate": 2.6735459662288932e-05, - "loss": 0.3697, + "epoch": 0.08217104551843442, + "grad_norm": 0.29694536328315735, + "learning_rate": 2.7384098006245495e-05, + "loss": 0.4796, "step": 2280 }, { - "epoch": 0.08, - "learning_rate": 2.679409005628518e-05, - "loss": 0.3752, + "epoch": 0.08235124517965907, + "grad_norm": 0.20941223204135895, + "learning_rate": 2.7444150852750424e-05, + "loss": 0.4525, "step": 2285 }, { - "epoch": 0.08, - "learning_rate": 2.6852720450281427e-05, - "loss": 0.3518, + "epoch": 0.0825314448408837, + "grad_norm": 0.3018248677253723, + "learning_rate": 2.750420369925535e-05, + "loss": 0.5211, "step": 2290 }, { - "epoch": 0.08, - "learning_rate": 2.691135084427768e-05, - "loss": 0.3563, + "epoch": 0.08271164450210834, + "grad_norm": 0.22867879271507263, + "learning_rate": 2.756425654576027e-05, + "loss": 0.4702, "step": 2295 }, { - "epoch": 0.08, - "learning_rate": 2.696998123827392e-05, - "loss": 0.3996, + "epoch": 0.08289184416333298, + "grad_norm": 0.26447781920433044, + "learning_rate": 2.7624309392265197e-05, + "loss": 0.4893, "step": 2300 }, { - "epoch": 0.08, - "learning_rate": 2.7028611632270167e-05, - "loss": 0.3122, + "epoch": 0.08307204382455761, + "grad_norm": 0.22963860630989075, + "learning_rate": 2.768436223877012e-05, + "loss": 0.509, "step": 2305 }, { - "epoch": 0.08, - "learning_rate": 2.708724202626642e-05, - "loss": 0.353, + "epoch": 0.08325224348578225, + "grad_norm": 0.34469008445739746, + "learning_rate": 2.7744415085275045e-05, + "loss": 0.4985, "step": 2310 }, { - "epoch": 0.08, - "learning_rate": 2.7145872420262665e-05, - "loss": 0.3473, + "epoch": 0.08343244314700689, + "grad_norm": 0.23541411757469177, + "learning_rate": 2.7804467931779963e-05, + "loss": 0.4892, "step": 2315 }, { - "epoch": 0.08, - "learning_rate": 2.7204502814258914e-05, - "loss": 0.3672, + "epoch": 0.08361264280823152, + "grad_norm": 0.2870751619338989, + "learning_rate": 2.7864520778284892e-05, + "loss": 0.4806, "step": 2320 }, { - "epoch": 0.08, - "learning_rate": 2.726313320825516e-05, - "loss": 0.3672, + "epoch": 0.08379284246945616, + "grad_norm": 0.24497903883457184, + "learning_rate": 2.7924573624789818e-05, + "loss": 0.4938, "step": 2325 }, { - "epoch": 0.08, - "learning_rate": 2.732176360225141e-05, - "loss": 0.3638, + "epoch": 0.0839730421306808, + "grad_norm": 0.29667189717292786, + "learning_rate": 2.798462647129474e-05, + "loss": 0.5338, "step": 2330 }, { - "epoch": 0.08, - "learning_rate": 2.7380393996247654e-05, - "loss": 0.3466, + "epoch": 0.08415324179190543, + "grad_norm": 0.2696093022823334, + "learning_rate": 2.8044679317799666e-05, + "loss": 0.4821, "step": 2335 }, { - "epoch": 0.08, - "learning_rate": 2.7439024390243906e-05, - "loss": 0.358, + "epoch": 0.08433344145313007, + "grad_norm": 0.2720743715763092, + "learning_rate": 2.8104732164304588e-05, + "loss": 0.4867, "step": 2340 }, { - "epoch": 
0.08, - "learning_rate": 2.7497654784240152e-05, - "loss": 0.3614, + "epoch": 0.0845136411143547, + "grad_norm": 0.3151923418045044, + "learning_rate": 2.8164785010809513e-05, + "loss": 0.4833, "step": 2345 }, { - "epoch": 0.08, - "learning_rate": 2.75562851782364e-05, - "loss": 0.3242, + "epoch": 0.08469384077557934, + "grad_norm": 0.2476382553577423, + "learning_rate": 2.822483785731444e-05, + "loss": 0.4882, "step": 2350 }, { - "epoch": 0.08, - "learning_rate": 2.7614915572232647e-05, - "loss": 0.3651, + "epoch": 0.08487404043680398, + "grad_norm": 0.2863009572029114, + "learning_rate": 2.828489070381936e-05, + "loss": 0.4852, "step": 2355 }, { - "epoch": 0.08, - "learning_rate": 2.7673545966228892e-05, - "loss": 0.3657, + "epoch": 0.08505424009802862, + "grad_norm": 0.26534658670425415, + "learning_rate": 2.8344943550324287e-05, + "loss": 0.5093, "step": 2360 }, { - "epoch": 0.08, - "learning_rate": 2.773217636022514e-05, - "loss": 0.3347, + "epoch": 0.08523443975925325, + "grad_norm": 0.2766381800174713, + "learning_rate": 2.840499639682921e-05, + "loss": 0.4981, "step": 2365 }, { - "epoch": 0.08, - "learning_rate": 2.7790806754221387e-05, - "loss": 0.3429, + "epoch": 0.08541463942047789, + "grad_norm": 0.3110561966896057, + "learning_rate": 2.8465049243334134e-05, + "loss": 0.5176, "step": 2370 }, { - "epoch": 0.08, - "learning_rate": 2.784943714821764e-05, - "loss": 0.3619, + "epoch": 0.08559483908170253, + "grad_norm": 0.2818463444709778, + "learning_rate": 2.8525102089839063e-05, + "loss": 0.4864, "step": 2375 }, { - "epoch": 0.08, - "learning_rate": 2.7908067542213885e-05, - "loss": 0.3741, + "epoch": 0.08577503874292716, + "grad_norm": 0.2455795407295227, + "learning_rate": 2.8585154936343982e-05, + "loss": 0.5048, "step": 2380 }, { - "epoch": 0.08, - "learning_rate": 2.7966697936210134e-05, - "loss": 0.321, + "epoch": 0.0859552384041518, + "grad_norm": 0.3062375783920288, + "learning_rate": 2.864520778284891e-05, + "loss": 0.5028, "step": 2385 }, { - "epoch": 0.08, - "learning_rate": 2.802532833020638e-05, - "loss": 0.3701, + "epoch": 0.08613543806537644, + "grad_norm": 0.29522883892059326, + "learning_rate": 2.870526062935383e-05, + "loss": 0.5034, "step": 2390 }, { - "epoch": 0.08, - "learning_rate": 2.808395872420263e-05, - "loss": 0.3606, + "epoch": 0.08631563772660107, + "grad_norm": 0.3509838283061981, + "learning_rate": 2.876531347585876e-05, + "loss": 0.461, "step": 2395 }, { - "epoch": 0.08, - "learning_rate": 2.8142589118198874e-05, - "loss": 0.3868, + "epoch": 0.08649583738782571, + "grad_norm": 0.2348550707101822, + "learning_rate": 2.8825366322363684e-05, + "loss": 0.5092, "step": 2400 }, { - "epoch": 0.08, - "learning_rate": 2.820121951219512e-05, - "loss": 0.324, + "epoch": 0.08667603704905034, + "grad_norm": 0.28098031878471375, + "learning_rate": 2.8885419168868607e-05, + "loss": 0.5046, "step": 2405 }, { - "epoch": 0.08, - "learning_rate": 2.8259849906191372e-05, - "loss": 0.3593, + "epoch": 0.08685623671027498, + "grad_norm": 0.29102039337158203, + "learning_rate": 2.8945472015373532e-05, + "loss": 0.4669, "step": 2410 }, { - "epoch": 0.08, - "learning_rate": 2.8318480300187618e-05, - "loss": 0.3776, + "epoch": 0.08703643637149962, + "grad_norm": 0.273709774017334, + "learning_rate": 2.900552486187845e-05, + "loss": 0.5121, "step": 2415 }, { - "epoch": 0.09, - "learning_rate": 2.8377110694183867e-05, - "loss": 0.3843, + "epoch": 0.08721663603272425, + "grad_norm": 0.3082577586174011, + "learning_rate": 2.906557770838338e-05, + "loss": 0.5158, "step": 2420 }, 
{ - "epoch": 0.09, - "learning_rate": 2.8435741088180113e-05, - "loss": 0.3398, + "epoch": 0.08739683569394889, + "grad_norm": 0.26482829451560974, + "learning_rate": 2.9125630554888305e-05, + "loss": 0.4774, "step": 2425 }, { - "epoch": 0.09, - "learning_rate": 2.849437148217636e-05, - "loss": 0.3671, + "epoch": 0.08757703535517353, + "grad_norm": 0.26536524295806885, + "learning_rate": 2.9185683401393228e-05, + "loss": 0.4929, "step": 2430 }, { - "epoch": 0.09, - "learning_rate": 2.8553001876172607e-05, - "loss": 0.353, + "epoch": 0.08775723501639816, + "grad_norm": 0.26633599400520325, + "learning_rate": 2.9245736247898153e-05, + "loss": 0.4826, "step": 2435 }, { - "epoch": 0.09, - "learning_rate": 2.861163227016886e-05, - "loss": 0.3404, + "epoch": 0.0879374346776228, + "grad_norm": 0.2031547874212265, + "learning_rate": 2.9305789094403075e-05, + "loss": 0.4906, "step": 2440 }, { - "epoch": 0.09, - "learning_rate": 2.8670262664165105e-05, - "loss": 0.3265, + "epoch": 0.08811763433884744, + "grad_norm": 0.2550112307071686, + "learning_rate": 2.9365841940908e-05, + "loss": 0.4968, "step": 2445 }, { - "epoch": 0.09, - "learning_rate": 2.8728893058161354e-05, - "loss": 0.3741, + "epoch": 0.08829783400007207, + "grad_norm": 0.28357723355293274, + "learning_rate": 2.9425894787412923e-05, + "loss": 0.4956, "step": 2450 }, { - "epoch": 0.09, - "learning_rate": 2.87875234521576e-05, - "loss": 0.3315, + "epoch": 0.08847803366129672, + "grad_norm": 0.32640430331230164, + "learning_rate": 2.948594763391785e-05, + "loss": 0.5074, "step": 2455 }, { - "epoch": 0.09, - "learning_rate": 2.8846153846153845e-05, - "loss": 0.3615, + "epoch": 0.08865823332252136, + "grad_norm": 0.240611732006073, + "learning_rate": 2.9546000480422774e-05, + "loss": 0.4661, "step": 2460 }, { - "epoch": 0.09, - "learning_rate": 2.8904784240150094e-05, - "loss": 0.3375, + "epoch": 0.088838432983746, + "grad_norm": 0.27230045199394226, + "learning_rate": 2.9606053326927696e-05, + "loss": 0.4959, "step": 2465 }, { - "epoch": 0.09, - "learning_rate": 2.896341463414634e-05, - "loss": 0.3464, + "epoch": 0.08901863264497063, + "grad_norm": 0.20566551387310028, + "learning_rate": 2.9666106173432622e-05, + "loss": 0.518, "step": 2470 }, { - "epoch": 0.09, - "learning_rate": 2.9022045028142592e-05, - "loss": 0.405, + "epoch": 0.08919883230619527, + "grad_norm": 0.2713625729084015, + "learning_rate": 2.9726159019937544e-05, + "loss": 0.5178, "step": 2475 }, { - "epoch": 0.09, - "learning_rate": 2.9080675422138838e-05, - "loss": 0.3392, + "epoch": 0.0893790319674199, + "grad_norm": 0.2567339837551117, + "learning_rate": 2.978621186644247e-05, + "loss": 0.498, "step": 2480 }, { - "epoch": 0.09, - "learning_rate": 2.9139305816135087e-05, - "loss": 0.3461, + "epoch": 0.08955923162864454, + "grad_norm": 0.27121180295944214, + "learning_rate": 2.98462647129474e-05, + "loss": 0.5194, "step": 2485 }, { - "epoch": 0.09, - "learning_rate": 2.9197936210131333e-05, - "loss": 0.348, + "epoch": 0.08973943128986918, + "grad_norm": 0.35053110122680664, + "learning_rate": 2.9906317559452317e-05, + "loss": 0.533, "step": 2490 }, { - "epoch": 0.09, - "learning_rate": 2.9256566604127582e-05, - "loss": 0.368, + "epoch": 0.08991963095109382, + "grad_norm": 0.2476474940776825, + "learning_rate": 2.9966370405957246e-05, + "loss": 0.4895, "step": 2495 }, { - "epoch": 0.09, - "learning_rate": 2.9315196998123827e-05, - "loss": 0.367, + "epoch": 0.09009983061231845, + "grad_norm": 0.22306469082832336, + "learning_rate": 3.0026423252462165e-05, + "loss": 0.4847, 
"step": 2500 }, { - "epoch": 0.09, - "eval_loss": 0.3577040433883667, - "eval_runtime": 10.5307, - "eval_samples_per_second": 9.496, - "eval_steps_per_second": 9.496, + "epoch": 0.09009983061231845, + "eval_loss": 0.5190762877464294, + "eval_runtime": 3.513, + "eval_samples_per_second": 28.466, + "eval_steps_per_second": 7.116, "step": 2500 }, { - "epoch": 0.09, - "learning_rate": 2.937382739212008e-05, - "loss": 0.3809, + "epoch": 0.09028003027354309, + "grad_norm": 0.2834235429763794, + "learning_rate": 3.008647609896709e-05, + "loss": 0.5589, "step": 2505 }, { - "epoch": 0.09, - "learning_rate": 2.9432457786116325e-05, - "loss": 0.3426, + "epoch": 0.09046022993476772, + "grad_norm": 0.24811461567878723, + "learning_rate": 3.014652894547202e-05, + "loss": 0.4679, "step": 2510 }, { - "epoch": 0.09, - "learning_rate": 2.949108818011257e-05, - "loss": 0.3887, + "epoch": 0.09064042959599236, + "grad_norm": 0.25787869095802307, + "learning_rate": 3.020658179197694e-05, + "loss": 0.4961, "step": 2515 }, { - "epoch": 0.09, - "learning_rate": 2.954971857410882e-05, - "loss": 0.3503, + "epoch": 0.090820629257217, + "grad_norm": 0.1960582286119461, + "learning_rate": 3.0266634638481867e-05, + "loss": 0.5253, "step": 2520 }, { - "epoch": 0.09, - "learning_rate": 2.9608348968105066e-05, - "loss": 0.3561, + "epoch": 0.09100082891844163, + "grad_norm": 0.28364789485931396, + "learning_rate": 3.0326687484986786e-05, + "loss": 0.4797, "step": 2525 }, { - "epoch": 0.09, - "learning_rate": 2.9666979362101315e-05, - "loss": 0.3581, + "epoch": 0.09118102857966627, + "grad_norm": 0.30647027492523193, + "learning_rate": 3.0386740331491715e-05, + "loss": 0.5131, "step": 2530 }, { - "epoch": 0.09, - "learning_rate": 2.972560975609756e-05, - "loss": 0.3724, + "epoch": 0.09136122824089091, + "grad_norm": 0.2918342351913452, + "learning_rate": 3.044679317799664e-05, + "loss": 0.4649, "step": 2535 }, { - "epoch": 0.09, - "learning_rate": 2.9784240150093813e-05, - "loss": 0.378, + "epoch": 0.09154142790211554, + "grad_norm": 0.34838005900382996, + "learning_rate": 3.0506846024501563e-05, + "loss": 0.5218, "step": 2540 }, { - "epoch": 0.09, - "learning_rate": 2.9842870544090058e-05, - "loss": 0.3481, + "epoch": 0.09172162756334018, + "grad_norm": 0.22675694525241852, + "learning_rate": 3.056689887100649e-05, + "loss": 0.5072, "step": 2545 }, { - "epoch": 0.09, - "learning_rate": 2.9901500938086307e-05, - "loss": 0.3424, + "epoch": 0.09190182722456482, + "grad_norm": 0.3220086991786957, + "learning_rate": 3.062695171751141e-05, + "loss": 0.4664, "step": 2550 }, { - "epoch": 0.09, - "learning_rate": 2.9960131332082553e-05, - "loss": 0.3445, + "epoch": 0.09208202688578945, + "grad_norm": 0.2494743913412094, + "learning_rate": 3.068700456401633e-05, + "loss": 0.5067, "step": 2555 }, { - "epoch": 0.09, - "learning_rate": 3.0018761726078802e-05, - "loss": 0.3466, + "epoch": 0.09226222654701409, + "grad_norm": 0.35692298412323, + "learning_rate": 3.0747057410521265e-05, + "loss": 0.5213, "step": 2560 }, { - "epoch": 0.09, - "learning_rate": 3.0077392120075048e-05, - "loss": 0.3691, + "epoch": 0.09244242620823873, + "grad_norm": 0.352923721075058, + "learning_rate": 3.0807110257026184e-05, + "loss": 0.5052, "step": 2565 }, { - "epoch": 0.09, - "learning_rate": 3.0136022514071293e-05, - "loss": 0.3692, + "epoch": 0.09262262586946336, + "grad_norm": 0.2893941402435303, + "learning_rate": 3.086716310353111e-05, + "loss": 0.4696, "step": 2570 }, { - "epoch": 0.09, - "learning_rate": 3.0194652908067546e-05, - "loss": 0.3332, + 
"epoch": 0.092802825530688, + "grad_norm": 0.26177549362182617, + "learning_rate": 3.092721595003603e-05, + "loss": 0.5036, "step": 2575 }, { - "epoch": 0.09, - "learning_rate": 3.025328330206379e-05, - "loss": 0.3502, + "epoch": 0.09298302519191264, + "grad_norm": 0.22046709060668945, + "learning_rate": 3.098726879654096e-05, + "loss": 0.4604, "step": 2580 }, { - "epoch": 0.09, - "learning_rate": 3.031191369606004e-05, - "loss": 0.3345, + "epoch": 0.09316322485313727, + "grad_norm": 0.28608959913253784, + "learning_rate": 3.104732164304588e-05, + "loss": 0.5139, "step": 2585 }, { - "epoch": 0.09, - "learning_rate": 3.0370544090056286e-05, - "loss": 0.3595, + "epoch": 0.09334342451436191, + "grad_norm": 0.31415843963623047, + "learning_rate": 3.1107374489550805e-05, + "loss": 0.4915, "step": 2590 }, { - "epoch": 0.09, - "learning_rate": 3.0429174484052535e-05, - "loss": 0.3554, + "epoch": 0.09352362417558654, + "grad_norm": 0.24263805150985718, + "learning_rate": 3.116742733605573e-05, + "loss": 0.5423, "step": 2595 }, { - "epoch": 0.09, - "learning_rate": 3.048780487804878e-05, - "loss": 0.3632, + "epoch": 0.09370382383681118, + "grad_norm": 0.30432233214378357, + "learning_rate": 3.1227480182560656e-05, + "loss": 0.4955, "step": 2600 }, { - "epoch": 0.09, - "learning_rate": 3.054643527204503e-05, - "loss": 0.3838, + "epoch": 0.09388402349803582, + "grad_norm": 0.2743891775608063, + "learning_rate": 3.128753302906558e-05, + "loss": 0.5167, "step": 2605 }, { - "epoch": 0.09, - "learning_rate": 3.060506566604128e-05, - "loss": 0.3539, + "epoch": 0.09406422315926045, + "grad_norm": 0.2813490629196167, + "learning_rate": 3.13475858755705e-05, + "loss": 0.5374, "step": 2610 }, { - "epoch": 0.09, - "learning_rate": 3.0663696060037524e-05, - "loss": 0.3716, + "epoch": 0.09424442282048509, + "grad_norm": 0.3482913076877594, + "learning_rate": 3.1407638722075426e-05, + "loss": 0.4621, "step": 2615 }, { - "epoch": 0.09, - "learning_rate": 3.072232645403377e-05, - "loss": 0.3561, + "epoch": 0.09442462248170974, + "grad_norm": 0.3260897696018219, + "learning_rate": 3.146769156858035e-05, + "loss": 0.5003, "step": 2620 }, { - "epoch": 0.09, - "learning_rate": 3.0780956848030015e-05, - "loss": 0.3504, + "epoch": 0.09460482214293438, + "grad_norm": 0.27140331268310547, + "learning_rate": 3.152774441508528e-05, + "loss": 0.5161, "step": 2625 }, { - "epoch": 0.09, - "learning_rate": 3.083958724202627e-05, - "loss": 0.3515, + "epoch": 0.09478502180415901, + "grad_norm": 0.24953608214855194, + "learning_rate": 3.15877972615902e-05, + "loss": 0.5059, "step": 2630 }, { - "epoch": 0.09, - "learning_rate": 3.089821763602251e-05, - "loss": 0.3483, + "epoch": 0.09496522146538365, + "grad_norm": 0.2428867667913437, + "learning_rate": 3.164785010809512e-05, + "loss": 0.5285, "step": 2635 }, { - "epoch": 0.09, - "learning_rate": 3.0956848030018766e-05, - "loss": 0.3907, + "epoch": 0.09514542112660829, + "grad_norm": 0.2688390910625458, + "learning_rate": 3.170790295460005e-05, + "loss": 0.4971, "step": 2640 }, { - "epoch": 0.09, - "learning_rate": 3.101547842401501e-05, - "loss": 0.3511, + "epoch": 0.09532562078783292, + "grad_norm": 0.292272686958313, + "learning_rate": 3.176795580110498e-05, + "loss": 0.4781, "step": 2645 }, { - "epoch": 0.09, - "learning_rate": 3.1074108818011264e-05, - "loss": 0.3438, + "epoch": 0.09550582044905756, + "grad_norm": 0.22563514113426208, + "learning_rate": 3.18280086476099e-05, + "loss": 0.4319, "step": 2650 }, { - "epoch": 0.09, - "learning_rate": 3.11327392120075e-05, - "loss": 
0.3856, + "epoch": 0.0956860201102822, + "grad_norm": 0.31794285774230957, + "learning_rate": 3.1888061494114824e-05, + "loss": 0.5238, "step": 2655 }, { - "epoch": 0.09, - "learning_rate": 3.1191369606003755e-05, - "loss": 0.33, + "epoch": 0.09586621977150683, + "grad_norm": 0.36745190620422363, + "learning_rate": 3.194811434061974e-05, + "loss": 0.4985, "step": 2660 }, { - "epoch": 0.09, - "learning_rate": 3.125e-05, - "loss": 0.373, + "epoch": 0.09604641943273147, + "grad_norm": 0.2950117588043213, + "learning_rate": 3.200816718712467e-05, + "loss": 0.5128, "step": 2665 }, { - "epoch": 0.09, - "learning_rate": 3.1308630393996246e-05, - "loss": 0.3941, + "epoch": 0.0962266190939561, + "grad_norm": 0.30142202973365784, + "learning_rate": 3.20682200336296e-05, + "loss": 0.4884, "step": 2670 }, { - "epoch": 0.09, - "learning_rate": 3.13672607879925e-05, - "loss": 0.3685, + "epoch": 0.09640681875518074, + "grad_norm": 0.34995657205581665, + "learning_rate": 3.212827288013452e-05, + "loss": 0.5074, "step": 2675 }, { - "epoch": 0.09, - "learning_rate": 3.1425891181988744e-05, - "loss": 0.3595, + "epoch": 0.09658701841640538, + "grad_norm": 0.2538526952266693, + "learning_rate": 3.2188325726639445e-05, + "loss": 0.4964, "step": 2680 }, { - "epoch": 0.09, - "learning_rate": 3.148452157598499e-05, - "loss": 0.3686, + "epoch": 0.09676721807763002, + "grad_norm": 0.3425311744213104, + "learning_rate": 3.2248378573144364e-05, + "loss": 0.4721, "step": 2685 }, { - "epoch": 0.09, - "learning_rate": 3.1543151969981236e-05, - "loss": 0.3551, + "epoch": 0.09694741773885465, + "grad_norm": 0.3075348138809204, + "learning_rate": 3.2308431419649296e-05, + "loss": 0.5145, "step": 2690 }, { - "epoch": 0.09, - "learning_rate": 3.160178236397749e-05, - "loss": 0.357, + "epoch": 0.09712761740007929, + "grad_norm": 0.24702075123786926, + "learning_rate": 3.236848426615422e-05, + "loss": 0.4793, "step": 2695 }, { - "epoch": 0.09, - "learning_rate": 3.1660412757973734e-05, - "loss": 0.3788, + "epoch": 0.09730781706130393, + "grad_norm": 0.2921466827392578, + "learning_rate": 3.242853711265914e-05, + "loss": 0.5097, "step": 2700 }, { - "epoch": 0.1, - "learning_rate": 3.1719043151969986e-05, - "loss": 0.346, + "epoch": 0.09748801672252856, + "grad_norm": 0.22841855883598328, + "learning_rate": 3.2488589959164066e-05, + "loss": 0.4905, "step": 2705 }, { - "epoch": 0.1, - "learning_rate": 3.177767354596623e-05, - "loss": 0.3612, + "epoch": 0.0976682163837532, + "grad_norm": 0.24366632103919983, + "learning_rate": 3.254864280566899e-05, + "loss": 0.4773, "step": 2710 }, { - "epoch": 0.1, - "learning_rate": 3.1836303939962484e-05, - "loss": 0.3425, + "epoch": 0.09784841604497783, + "grad_norm": 0.2745325565338135, + "learning_rate": 3.260869565217392e-05, + "loss": 0.492, "step": 2715 }, { - "epoch": 0.1, - "learning_rate": 3.189493433395872e-05, - "loss": 0.3502, + "epoch": 0.09802861570620247, + "grad_norm": 0.22801122069358826, + "learning_rate": 3.266874849867884e-05, + "loss": 0.4883, "step": 2720 }, { - "epoch": 0.1, - "learning_rate": 3.195356472795497e-05, - "loss": 0.3617, + "epoch": 0.09820881536742711, + "grad_norm": 0.316967636346817, + "learning_rate": 3.272880134518376e-05, + "loss": 0.4811, "step": 2725 }, { - "epoch": 0.1, - "learning_rate": 3.201219512195122e-05, - "loss": 0.3544, + "epoch": 0.09838901502865174, + "grad_norm": 0.28945192694664, + "learning_rate": 3.278885419168869e-05, + "loss": 0.4782, "step": 2730 }, { - "epoch": 0.1, - "learning_rate": 3.2070825515947466e-05, - "loss": 0.3392, + 
"epoch": 0.09856921468987638, + "grad_norm": 0.25695520639419556, + "learning_rate": 3.284890703819361e-05, + "loss": 0.5038, "step": 2735 }, { - "epoch": 0.1, - "learning_rate": 3.212945590994372e-05, - "loss": 0.3688, + "epoch": 0.09874941435110102, + "grad_norm": 0.22989587485790253, + "learning_rate": 3.290895988469854e-05, + "loss": 0.4745, "step": 2740 }, { - "epoch": 0.1, - "learning_rate": 3.2188086303939964e-05, - "loss": 0.3643, + "epoch": 0.09892961401232565, + "grad_norm": 0.28844088315963745, + "learning_rate": 3.296901273120346e-05, + "loss": 0.4886, "step": 2745 }, { - "epoch": 0.1, - "learning_rate": 3.224671669793621e-05, - "loss": 0.3766, + "epoch": 0.09910981367355029, + "grad_norm": 0.21202966570854187, + "learning_rate": 3.302906557770838e-05, + "loss": 0.4926, "step": 2750 }, { - "epoch": 0.1, - "learning_rate": 3.2305347091932456e-05, - "loss": 0.3322, + "epoch": 0.09929001333477493, + "grad_norm": 0.4138276278972626, + "learning_rate": 3.308911842421331e-05, + "loss": 0.5199, "step": 2755 }, { - "epoch": 0.1, - "learning_rate": 3.236397748592871e-05, - "loss": 0.3508, + "epoch": 0.09947021299599956, + "grad_norm": 0.22605939209461212, + "learning_rate": 3.3149171270718233e-05, + "loss": 0.4753, "step": 2760 }, { - "epoch": 0.1, - "learning_rate": 3.2422607879924954e-05, - "loss": 0.3747, + "epoch": 0.0996504126572242, + "grad_norm": 0.25468626618385315, + "learning_rate": 3.320922411722316e-05, + "loss": 0.483, "step": 2765 }, { - "epoch": 0.1, - "learning_rate": 3.24812382739212e-05, - "loss": 0.3758, + "epoch": 0.09983061231844884, + "grad_norm": 0.25252848863601685, + "learning_rate": 3.326927696372808e-05, + "loss": 0.5189, "step": 2770 }, { - "epoch": 0.1, - "learning_rate": 3.253986866791745e-05, - "loss": 0.3588, + "epoch": 0.10001081197967347, + "grad_norm": 0.29405850172042847, + "learning_rate": 3.3329329810233003e-05, + "loss": 0.4849, "step": 2775 }, { - "epoch": 0.1, - "learning_rate": 3.25984990619137e-05, - "loss": 0.3316, + "epoch": 0.10019101164089811, + "grad_norm": 0.37302690744400024, + "learning_rate": 3.3389382656737936e-05, + "loss": 0.4963, "step": 2780 }, { - "epoch": 0.1, - "learning_rate": 3.265712945590994e-05, - "loss": 0.3572, + "epoch": 0.10037121130212275, + "grad_norm": 0.23915189504623413, + "learning_rate": 3.3449435503242855e-05, + "loss": 0.5278, "step": 2785 }, { - "epoch": 0.1, - "learning_rate": 3.271575984990619e-05, - "loss": 0.3326, + "epoch": 0.1005514109633474, + "grad_norm": 0.3481108546257019, + "learning_rate": 3.350948834974778e-05, + "loss": 0.4938, "step": 2790 }, { - "epoch": 0.1, - "learning_rate": 3.277439024390244e-05, - "loss": 0.3435, + "epoch": 0.10073161062457203, + "grad_norm": 0.23707103729248047, + "learning_rate": 3.35695411962527e-05, + "loss": 0.4766, "step": 2795 }, { - "epoch": 0.1, - "learning_rate": 3.283302063789869e-05, - "loss": 0.3519, + "epoch": 0.10091181028579667, + "grad_norm": 0.25163257122039795, + "learning_rate": 3.362959404275763e-05, + "loss": 0.4613, "step": 2800 }, { - "epoch": 0.1, - "learning_rate": 3.289165103189494e-05, - "loss": 0.3572, + "epoch": 0.1010920099470213, + "grad_norm": 0.2651137113571167, + "learning_rate": 3.368964688926256e-05, + "loss": 0.4841, "step": 2805 }, { - "epoch": 0.1, - "learning_rate": 3.2950281425891185e-05, - "loss": 0.3398, + "epoch": 0.10127220960824594, + "grad_norm": 0.32314032316207886, + "learning_rate": 3.3749699735767476e-05, + "loss": 0.4957, "step": 2810 }, { - "epoch": 0.1, - "learning_rate": 3.300891181988743e-05, - "loss": 0.3444, + 
"epoch": 0.10145240926947058, + "grad_norm": 0.3319461941719055, + "learning_rate": 3.38097525822724e-05, + "loss": 0.492, "step": 2815 }, { - "epoch": 0.1, - "learning_rate": 3.3067542213883676e-05, - "loss": 0.3467, + "epoch": 0.10163260893069521, + "grad_norm": 0.24138695001602173, + "learning_rate": 3.386980542877733e-05, + "loss": 0.511, "step": 2820 }, { - "epoch": 0.1, - "learning_rate": 3.312617260787992e-05, - "loss": 0.3668, + "epoch": 0.10181280859191985, + "grad_norm": 0.3053799271583557, + "learning_rate": 3.392985827528225e-05, + "loss": 0.4911, "step": 2825 }, { - "epoch": 0.1, - "learning_rate": 3.3184803001876174e-05, - "loss": 0.3473, + "epoch": 0.10199300825314449, + "grad_norm": 0.26419997215270996, + "learning_rate": 3.398991112178718e-05, + "loss": 0.4836, "step": 2830 }, { - "epoch": 0.1, - "learning_rate": 3.324343339587242e-05, - "loss": 0.3297, + "epoch": 0.10217320791436912, + "grad_norm": 0.2777937352657318, + "learning_rate": 3.4049963968292097e-05, + "loss": 0.5124, "step": 2835 }, { - "epoch": 0.1, - "learning_rate": 3.330206378986867e-05, - "loss": 0.3245, + "epoch": 0.10235340757559376, + "grad_norm": 0.2962092459201813, + "learning_rate": 3.411001681479702e-05, + "loss": 0.4948, "step": 2840 }, { - "epoch": 0.1, - "learning_rate": 3.336069418386492e-05, - "loss": 0.3357, + "epoch": 0.1025336072368184, + "grad_norm": 0.3595232665538788, + "learning_rate": 3.417006966130195e-05, + "loss": 0.5122, "step": 2845 }, { - "epoch": 0.1, - "learning_rate": 3.341932457786116e-05, - "loss": 0.3226, + "epoch": 0.10271380689804303, + "grad_norm": 0.20213566720485687, + "learning_rate": 3.423012250780687e-05, + "loss": 0.4886, "step": 2850 }, { - "epoch": 0.1, - "learning_rate": 3.347795497185741e-05, - "loss": 0.3611, + "epoch": 0.10289400655926767, + "grad_norm": 0.3061508536338806, + "learning_rate": 3.42901753543118e-05, + "loss": 0.5068, "step": 2855 }, { - "epoch": 0.1, - "learning_rate": 3.353658536585366e-05, - "loss": 0.3491, + "epoch": 0.1030742062204923, + "grad_norm": 0.27155801653862, + "learning_rate": 3.435022820081672e-05, + "loss": 0.4583, "step": 2860 }, { - "epoch": 0.1, - "learning_rate": 3.359521575984991e-05, - "loss": 0.3388, + "epoch": 0.10325440588171694, + "grad_norm": 0.2824406623840332, + "learning_rate": 3.441028104732164e-05, + "loss": 0.4857, "step": 2865 }, { - "epoch": 0.1, - "learning_rate": 3.365384615384616e-05, - "loss": 0.3568, + "epoch": 0.10343460554294158, + "grad_norm": 0.2800888121128082, + "learning_rate": 3.447033389382657e-05, + "loss": 0.4928, "step": 2870 }, { - "epoch": 0.1, - "learning_rate": 3.3712476547842405e-05, - "loss": 0.3603, + "epoch": 0.10361480520416622, + "grad_norm": 0.2530428469181061, + "learning_rate": 3.4530386740331494e-05, + "loss": 0.4841, "step": 2875 }, { - "epoch": 0.1, - "learning_rate": 3.377110694183865e-05, - "loss": 0.3639, + "epoch": 0.10379500486539085, + "grad_norm": 0.2559506595134735, + "learning_rate": 3.459043958683641e-05, + "loss": 0.4919, "step": 2880 }, { - "epoch": 0.1, - "learning_rate": 3.3829737335834896e-05, - "loss": 0.3595, + "epoch": 0.10397520452661549, + "grad_norm": 0.35989275574684143, + "learning_rate": 3.465049243334134e-05, + "loss": 0.5059, "step": 2885 }, { - "epoch": 0.1, - "learning_rate": 3.388836772983114e-05, - "loss": 0.3682, + "epoch": 0.10415540418784013, + "grad_norm": 0.2567185163497925, + "learning_rate": 3.471054527984627e-05, + "loss": 0.4752, "step": 2890 }, { - "epoch": 0.1, - "learning_rate": 3.3946998123827394e-05, - "loss": 0.3693, + "epoch": 
0.10433560384906476, + "grad_norm": 0.3426368832588196, + "learning_rate": 3.477059812635119e-05, + "loss": 0.5043, "step": 2895 }, { - "epoch": 0.1, - "learning_rate": 3.400562851782364e-05, - "loss": 0.3433, + "epoch": 0.1045158035102894, + "grad_norm": 0.26572245359420776, + "learning_rate": 3.4830650972856115e-05, + "loss": 0.5032, "step": 2900 }, { - "epoch": 0.1, - "learning_rate": 3.406425891181989e-05, - "loss": 0.3549, + "epoch": 0.10469600317151403, + "grad_norm": 0.2190892994403839, + "learning_rate": 3.4890703819361034e-05, + "loss": 0.4637, "step": 2905 }, { - "epoch": 0.1, - "learning_rate": 3.412288930581614e-05, - "loss": 0.3357, + "epoch": 0.10487620283273867, + "grad_norm": 0.3017270267009735, + "learning_rate": 3.4950756665865967e-05, + "loss": 0.4957, "step": 2910 }, { - "epoch": 0.1, - "learning_rate": 3.418151969981238e-05, - "loss": 0.3879, + "epoch": 0.10505640249396331, + "grad_norm": 0.2628999650478363, + "learning_rate": 3.501080951237089e-05, + "loss": 0.469, "step": 2915 }, { - "epoch": 0.1, - "learning_rate": 3.424015009380863e-05, - "loss": 0.355, + "epoch": 0.10523660215518794, + "grad_norm": 0.26654675602912903, + "learning_rate": 3.507086235887581e-05, + "loss": 0.4872, "step": 2920 }, { - "epoch": 0.1, - "learning_rate": 3.429878048780488e-05, - "loss": 0.3463, + "epoch": 0.10541680181641258, + "grad_norm": 0.33271774649620056, + "learning_rate": 3.5130915205380736e-05, + "loss": 0.465, "step": 2925 }, { - "epoch": 0.1, - "learning_rate": 3.435741088180113e-05, - "loss": 0.3445, + "epoch": 0.10559700147763722, + "grad_norm": 0.2927808165550232, + "learning_rate": 3.519096805188566e-05, + "loss": 0.5051, "step": 2930 }, { - "epoch": 0.1, - "learning_rate": 3.441604127579737e-05, - "loss": 0.3564, + "epoch": 0.10577720113886185, + "grad_norm": 0.2696301341056824, + "learning_rate": 3.525102089839059e-05, + "loss": 0.459, "step": 2935 }, { - "epoch": 0.1, - "learning_rate": 3.4474671669793625e-05, - "loss": 0.3453, + "epoch": 0.10595740080008649, + "grad_norm": 0.21873407065868378, + "learning_rate": 3.531107374489551e-05, + "loss": 0.4857, "step": 2940 }, { - "epoch": 0.1, - "learning_rate": 3.453330206378987e-05, - "loss": 0.3659, + "epoch": 0.10613760046131113, + "grad_norm": 0.22248321771621704, + "learning_rate": 3.537112659140043e-05, + "loss": 0.4911, "step": 2945 }, { - "epoch": 0.1, - "learning_rate": 3.4591932457786116e-05, - "loss": 0.3705, + "epoch": 0.10631780012253576, + "grad_norm": 0.2554193139076233, + "learning_rate": 3.543117943790536e-05, + "loss": 0.5309, "step": 2950 }, { - "epoch": 0.1, - "learning_rate": 3.465056285178236e-05, - "loss": 0.3214, + "epoch": 0.1064979997837604, + "grad_norm": 0.27886924147605896, + "learning_rate": 3.549123228441028e-05, + "loss": 0.4637, "step": 2955 }, { - "epoch": 0.1, - "learning_rate": 3.4709193245778614e-05, - "loss": 0.374, + "epoch": 0.10667819944498505, + "grad_norm": 0.23546633124351501, + "learning_rate": 3.555128513091521e-05, + "loss": 0.4657, "step": 2960 }, { - "epoch": 0.1, - "learning_rate": 3.476782363977486e-05, - "loss": 0.3363, + "epoch": 0.10685839910620969, + "grad_norm": 0.2205863744020462, + "learning_rate": 3.5611337977420134e-05, + "loss": 0.493, "step": 2965 }, { - "epoch": 0.1, - "learning_rate": 3.482645403377111e-05, - "loss": 0.3535, + "epoch": 0.10703859876743432, + "grad_norm": 0.27504095435142517, + "learning_rate": 3.567139082392505e-05, + "loss": 0.4543, "step": 2970 }, { - "epoch": 0.1, - "learning_rate": 3.488508442776736e-05, - "loss": 0.3567, + "epoch": 
0.10721879842865896, + "grad_norm": 0.27117300033569336, + "learning_rate": 3.573144367042998e-05, + "loss": 0.4604, "step": 2975 }, { - "epoch": 0.1, - "learning_rate": 3.4943714821763604e-05, - "loss": 0.3518, + "epoch": 0.1073989980898836, + "grad_norm": 0.29087886214256287, + "learning_rate": 3.5791496516934904e-05, + "loss": 0.515, "step": 2980 }, { - "epoch": 0.11, - "learning_rate": 3.500234521575985e-05, - "loss": 0.3385, + "epoch": 0.10757919775110823, + "grad_norm": 0.2880057096481323, + "learning_rate": 3.585154936343983e-05, + "loss": 0.5116, "step": 2985 }, { - "epoch": 0.11, - "learning_rate": 3.5060975609756095e-05, - "loss": 0.3581, + "epoch": 0.10775939741233287, + "grad_norm": 0.26126590371131897, + "learning_rate": 3.5911602209944755e-05, + "loss": 0.5085, "step": 2990 }, { - "epoch": 0.11, - "learning_rate": 3.511960600375235e-05, - "loss": 0.3654, + "epoch": 0.1079395970735575, + "grad_norm": 0.34922003746032715, + "learning_rate": 3.5971655056449674e-05, + "loss": 0.5063, "step": 2995 }, { - "epoch": 0.11, - "learning_rate": 3.517823639774859e-05, - "loss": 0.353, + "epoch": 0.10811979673478214, + "grad_norm": 0.34584909677505493, + "learning_rate": 3.6031707902954606e-05, + "loss": 0.5323, "step": 3000 }, { - "epoch": 0.11, - "eval_loss": 0.3522051274776459, - "eval_runtime": 10.5471, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 0.10811979673478214, + "eval_loss": 0.515049397945404, + "eval_runtime": 3.5163, + "eval_samples_per_second": 28.439, + "eval_steps_per_second": 7.11, "step": 3000 }, { - "epoch": 0.11, - "learning_rate": 3.5236866791744845e-05, - "loss": 0.3403, + "epoch": 0.10829999639600678, + "grad_norm": 0.22614799439907074, + "learning_rate": 3.6091760749459525e-05, + "loss": 0.4944, "step": 3005 }, { - "epoch": 0.11, - "learning_rate": 3.529549718574109e-05, - "loss": 0.3401, + "epoch": 0.10848019605723142, + "grad_norm": 0.2877223789691925, + "learning_rate": 3.615181359596445e-05, + "loss": 0.5352, "step": 3010 }, { - "epoch": 0.11, - "learning_rate": 3.5354127579737336e-05, - "loss": 0.3447, + "epoch": 0.10866039571845605, + "grad_norm": 0.2686571478843689, + "learning_rate": 3.621186644246937e-05, + "loss": 0.4605, "step": 3015 }, { - "epoch": 0.11, - "learning_rate": 3.541275797373358e-05, - "loss": 0.3644, + "epoch": 0.10884059537968069, + "grad_norm": 0.3369406759738922, + "learning_rate": 3.62719192889743e-05, + "loss": 0.477, "step": 3020 }, { - "epoch": 0.11, - "learning_rate": 3.5471388367729835e-05, - "loss": 0.3646, + "epoch": 0.10902079504090532, + "grad_norm": 0.27363601326942444, + "learning_rate": 3.633197213547923e-05, + "loss": 0.503, "step": 3025 }, { - "epoch": 0.11, - "learning_rate": 3.553001876172608e-05, - "loss": 0.3469, + "epoch": 0.10920099470212996, + "grad_norm": 0.21706698834896088, + "learning_rate": 3.6392024981984146e-05, + "loss": 0.4783, "step": 3030 }, { - "epoch": 0.11, - "learning_rate": 3.5588649155722326e-05, - "loss": 0.3766, + "epoch": 0.1093811943633546, + "grad_norm": 0.2553368806838989, + "learning_rate": 3.645207782848907e-05, + "loss": 0.514, "step": 3035 }, { - "epoch": 0.11, - "learning_rate": 3.564727954971858e-05, - "loss": 0.3477, + "epoch": 0.10956139402457923, + "grad_norm": 0.22971071302890778, + "learning_rate": 3.6512130674994e-05, + "loss": 0.4765, "step": 3040 }, { - "epoch": 0.11, - "learning_rate": 3.5705909943714824e-05, - "loss": 0.349, + "epoch": 0.10974159368580387, + "grad_norm": 0.2552284598350525, + "learning_rate": 3.657218352149892e-05, + "loss": 
0.4497, "step": 3045 }, { - "epoch": 0.11, - "learning_rate": 3.576454033771107e-05, - "loss": 0.3357, + "epoch": 0.1099217933470285, + "grad_norm": 0.336775541305542, + "learning_rate": 3.663223636800385e-05, + "loss": 0.5162, "step": 3050 }, { - "epoch": 0.11, - "learning_rate": 3.5823170731707315e-05, - "loss": 0.3592, + "epoch": 0.11010199300825314, + "grad_norm": 0.27510499954223633, + "learning_rate": 3.669228921450877e-05, + "loss": 0.4996, "step": 3055 }, { - "epoch": 0.11, - "learning_rate": 3.588180112570357e-05, - "loss": 0.3172, + "epoch": 0.11028219266947778, + "grad_norm": 0.2974676787853241, + "learning_rate": 3.675234206101369e-05, + "loss": 0.4634, "step": 3060 }, { - "epoch": 0.11, - "learning_rate": 3.594043151969981e-05, - "loss": 0.3552, + "epoch": 0.11046239233070242, + "grad_norm": 0.2689826190471649, + "learning_rate": 3.681239490751862e-05, + "loss": 0.5105, "step": 3065 }, { - "epoch": 0.11, - "learning_rate": 3.5999061913696065e-05, - "loss": 0.356, + "epoch": 0.11064259199192705, + "grad_norm": 0.27652692794799805, + "learning_rate": 3.6872447754023544e-05, + "loss": 0.5259, "step": 3070 }, { - "epoch": 0.11, - "learning_rate": 3.605769230769231e-05, - "loss": 0.3345, + "epoch": 0.11082279165315169, + "grad_norm": 0.29832860827445984, + "learning_rate": 3.693250060052847e-05, + "loss": 0.5042, "step": 3075 }, { - "epoch": 0.11, - "learning_rate": 3.611632270168856e-05, - "loss": 0.3301, + "epoch": 0.11100299131437633, + "grad_norm": 0.28680893778800964, + "learning_rate": 3.699255344703339e-05, + "loss": 0.5209, "step": 3080 }, { - "epoch": 0.11, - "learning_rate": 3.61749530956848e-05, - "loss": 0.3499, + "epoch": 0.11118319097560096, + "grad_norm": 0.3082677125930786, + "learning_rate": 3.7052606293538314e-05, + "loss": 0.503, "step": 3085 }, { - "epoch": 0.11, - "learning_rate": 3.623358348968105e-05, - "loss": 0.3707, + "epoch": 0.1113633906368256, + "grad_norm": 0.2548987865447998, + "learning_rate": 3.711265914004324e-05, + "loss": 0.4714, "step": 3090 }, { - "epoch": 0.11, - "learning_rate": 3.62922138836773e-05, - "loss": 0.3458, + "epoch": 0.11154359029805024, + "grad_norm": 0.39584243297576904, + "learning_rate": 3.7172711986548165e-05, + "loss": 0.5219, "step": 3095 }, { - "epoch": 0.11, - "learning_rate": 3.6350844277673546e-05, - "loss": 0.3501, + "epoch": 0.11172378995927487, + "grad_norm": 0.36074209213256836, + "learning_rate": 3.723276483305309e-05, + "loss": 0.5255, "step": 3100 }, { - "epoch": 0.11, - "learning_rate": 3.64094746716698e-05, - "loss": 0.3591, + "epoch": 0.11190398962049951, + "grad_norm": 0.22818799316883087, + "learning_rate": 3.729281767955801e-05, + "loss": 0.4651, "step": 3105 }, { - "epoch": 0.11, - "learning_rate": 3.6468105065666044e-05, - "loss": 0.3478, + "epoch": 0.11208418928172414, + "grad_norm": 0.22764305770397186, + "learning_rate": 3.735287052606294e-05, + "loss": 0.4482, "step": 3110 }, { - "epoch": 0.11, - "learning_rate": 3.652673545966229e-05, - "loss": 0.3257, + "epoch": 0.11226438894294878, + "grad_norm": 0.2548796236515045, + "learning_rate": 3.741292337256786e-05, + "loss": 0.4823, "step": 3115 }, { - "epoch": 0.11, - "learning_rate": 3.6585365853658535e-05, - "loss": 0.3472, + "epoch": 0.11244458860417342, + "grad_norm": 0.2939002811908722, + "learning_rate": 3.7472976219072786e-05, + "loss": 0.4909, "step": 3120 }, { - "epoch": 0.11, - "learning_rate": 3.664399624765479e-05, - "loss": 0.3647, + "epoch": 0.11262478826539805, + "grad_norm": 0.3162885308265686, + "learning_rate": 3.753302906557771e-05, + 
"loss": 0.4936, "step": 3125 }, { - "epoch": 0.11, - "learning_rate": 3.670262664165103e-05, - "loss": 0.3712, + "epoch": 0.1128049879266227, + "grad_norm": 0.2588067352771759, + "learning_rate": 3.759308191208264e-05, + "loss": 0.4921, "step": 3130 }, { - "epoch": 0.11, - "learning_rate": 3.6761257035647286e-05, - "loss": 0.3607, + "epoch": 0.11298518758784734, + "grad_norm": 0.3024102747440338, + "learning_rate": 3.765313475858756e-05, + "loss": 0.4931, "step": 3135 }, { - "epoch": 0.11, - "learning_rate": 3.681988742964353e-05, - "loss": 0.3935, + "epoch": 0.11316538724907198, + "grad_norm": 0.40092089772224426, + "learning_rate": 3.771318760509248e-05, + "loss": 0.5415, "step": 3140 }, { - "epoch": 0.11, - "learning_rate": 3.687851782363978e-05, - "loss": 0.3593, + "epoch": 0.11334558691029661, + "grad_norm": 0.29862499237060547, + "learning_rate": 3.777324045159741e-05, + "loss": 0.4613, "step": 3145 }, { - "epoch": 0.11, - "learning_rate": 3.693714821763602e-05, - "loss": 0.3689, + "epoch": 0.11352578657152125, + "grad_norm": 0.3131415843963623, + "learning_rate": 3.7833293298102326e-05, + "loss": 0.5001, "step": 3150 }, { - "epoch": 0.11, - "learning_rate": 3.699577861163227e-05, - "loss": 0.3612, + "epoch": 0.11370598623274589, + "grad_norm": 0.2650774419307709, + "learning_rate": 3.789334614460726e-05, + "loss": 0.4916, "step": 3155 }, { - "epoch": 0.11, - "learning_rate": 3.705440900562852e-05, - "loss": 0.3765, + "epoch": 0.11388618589397052, + "grad_norm": 0.20322157442569733, + "learning_rate": 3.7953398991112184e-05, + "loss": 0.4615, "step": 3160 }, { - "epoch": 0.11, - "learning_rate": 3.7113039399624766e-05, - "loss": 0.3804, + "epoch": 0.11406638555519516, + "grad_norm": 0.2679648995399475, + "learning_rate": 3.80134518376171e-05, + "loss": 0.5039, "step": 3165 }, { - "epoch": 0.11, - "learning_rate": 3.717166979362102e-05, - "loss": 0.3414, + "epoch": 0.1142465852164198, + "grad_norm": 0.3185202181339264, + "learning_rate": 3.807350468412203e-05, + "loss": 0.5249, "step": 3170 }, { - "epoch": 0.11, - "learning_rate": 3.7230300187617264e-05, - "loss": 0.3391, + "epoch": 0.11442678487764443, + "grad_norm": 0.31188127398490906, + "learning_rate": 3.8133557530626954e-05, + "loss": 0.4923, "step": 3175 }, { - "epoch": 0.11, - "learning_rate": 3.728893058161351e-05, - "loss": 0.39, + "epoch": 0.11460698453886907, + "grad_norm": 0.2713908553123474, + "learning_rate": 3.819361037713188e-05, + "loss": 0.4823, "step": 3180 }, { - "epoch": 0.11, - "learning_rate": 3.7347560975609755e-05, - "loss": 0.3589, + "epoch": 0.1147871842000937, + "grad_norm": 0.2642289102077484, + "learning_rate": 3.8253663223636805e-05, + "loss": 0.4711, "step": 3185 }, { - "epoch": 0.11, - "learning_rate": 3.7406191369606e-05, - "loss": 0.3554, + "epoch": 0.11496738386131834, + "grad_norm": 0.30651018023490906, + "learning_rate": 3.8313716070141724e-05, + "loss": 0.5242, "step": 3190 }, { - "epoch": 0.11, - "learning_rate": 3.7464821763602253e-05, - "loss": 0.3238, + "epoch": 0.11514758352254298, + "grad_norm": 0.31043529510498047, + "learning_rate": 3.837376891664665e-05, + "loss": 0.4922, "step": 3195 }, { - "epoch": 0.11, - "learning_rate": 3.75234521575985e-05, - "loss": 0.3561, + "epoch": 0.11532778318376762, + "grad_norm": 0.29356715083122253, + "learning_rate": 3.8433821763151575e-05, + "loss": 0.46, "step": 3200 }, { - "epoch": 0.11, - "learning_rate": 3.758208255159475e-05, - "loss": 0.3526, + "epoch": 0.11550798284499225, + "grad_norm": 0.3174915611743927, + "learning_rate": 
3.84938746096565e-05, + "loss": 0.4916, "step": 3205 }, { - "epoch": 0.11, - "learning_rate": 3.7640712945591e-05, - "loss": 0.3791, + "epoch": 0.11568818250621689, + "grad_norm": 0.2561566233634949, + "learning_rate": 3.8553927456161426e-05, + "loss": 0.4666, "step": 3210 }, { - "epoch": 0.11, - "learning_rate": 3.769934333958724e-05, - "loss": 0.3545, + "epoch": 0.11586838216744152, + "grad_norm": 0.24369752407073975, + "learning_rate": 3.8613980302666345e-05, + "loss": 0.483, "step": 3215 }, { - "epoch": 0.11, - "learning_rate": 3.775797373358349e-05, - "loss": 0.3523, + "epoch": 0.11604858182866616, + "grad_norm": 0.24509058892726898, + "learning_rate": 3.867403314917128e-05, + "loss": 0.4709, "step": 3220 }, { - "epoch": 0.11, - "learning_rate": 3.781660412757974e-05, - "loss": 0.3347, + "epoch": 0.1162287814898908, + "grad_norm": 0.23607365787029266, + "learning_rate": 3.8734085995676196e-05, + "loss": 0.5009, "step": 3225 }, { - "epoch": 0.11, - "learning_rate": 3.7875234521575986e-05, - "loss": 0.3463, + "epoch": 0.11640898115111543, + "grad_norm": 0.30435293912887573, + "learning_rate": 3.879413884218112e-05, + "loss": 0.4995, "step": 3230 }, { - "epoch": 0.11, - "learning_rate": 3.793386491557224e-05, - "loss": 0.3371, + "epoch": 0.11658918081234007, + "grad_norm": 0.20363330841064453, + "learning_rate": 3.885419168868605e-05, + "loss": 0.4809, "step": 3235 }, { - "epoch": 0.11, - "learning_rate": 3.7992495309568484e-05, - "loss": 0.3451, + "epoch": 0.11676938047356471, + "grad_norm": 0.25914672017097473, + "learning_rate": 3.891424453519097e-05, + "loss": 0.4811, "step": 3240 }, { - "epoch": 0.11, - "learning_rate": 3.805112570356473e-05, - "loss": 0.3648, + "epoch": 0.11694958013478934, + "grad_norm": 0.2945806682109833, + "learning_rate": 3.89742973816959e-05, + "loss": 0.4874, "step": 3245 }, { - "epoch": 0.11, - "learning_rate": 3.8109756097560976e-05, - "loss": 0.372, + "epoch": 0.11712977979601398, + "grad_norm": 0.25838515162467957, + "learning_rate": 3.903435022820082e-05, + "loss": 0.493, "step": 3250 }, { - "epoch": 0.11, - "learning_rate": 3.816838649155722e-05, - "loss": 0.3444, + "epoch": 0.11730997945723862, + "grad_norm": 0.27623122930526733, + "learning_rate": 3.909440307470574e-05, + "loss": 0.4873, "step": 3255 }, { - "epoch": 0.11, - "learning_rate": 3.8227016885553474e-05, - "loss": 0.355, + "epoch": 0.11749017911846325, + "grad_norm": 0.24453672766685486, + "learning_rate": 3.915445592121067e-05, + "loss": 0.4419, "step": 3260 }, { - "epoch": 0.11, - "learning_rate": 3.828564727954972e-05, - "loss": 0.3656, + "epoch": 0.11767037877968789, + "grad_norm": 0.26414886116981506, + "learning_rate": 3.9214508767715593e-05, + "loss": 0.4597, "step": 3265 }, { - "epoch": 0.12, - "learning_rate": 3.834427767354597e-05, - "loss": 0.3422, + "epoch": 0.11785057844091253, + "grad_norm": 0.246024951338768, + "learning_rate": 3.927456161422052e-05, + "loss": 0.4923, "step": 3270 }, { - "epoch": 0.12, - "learning_rate": 3.840290806754222e-05, - "loss": 0.361, + "epoch": 0.11803077810213716, + "grad_norm": 0.25583988428115845, + "learning_rate": 3.933461446072544e-05, + "loss": 0.4346, "step": 3275 }, { - "epoch": 0.12, - "learning_rate": 3.846153846153846e-05, - "loss": 0.3587, + "epoch": 0.1182109777633618, + "grad_norm": 0.2518952190876007, + "learning_rate": 3.939466730723036e-05, + "loss": 0.5041, "step": 3280 }, { - "epoch": 0.12, - "learning_rate": 3.852016885553471e-05, - "loss": 0.3556, + "epoch": 0.11839117742458644, + "grad_norm": 0.29746881127357483, + 
"learning_rate": 3.945472015373529e-05, + "loss": 0.5066, "step": 3285 }, { - "epoch": 0.12, - "learning_rate": 3.857879924953096e-05, - "loss": 0.3273, + "epoch": 0.11857137708581107, + "grad_norm": 0.2888292074203491, + "learning_rate": 3.9514773000240214e-05, + "loss": 0.4976, "step": 3290 }, { - "epoch": 0.12, - "learning_rate": 3.8637429643527207e-05, - "loss": 0.339, + "epoch": 0.11875157674703571, + "grad_norm": 0.2132648229598999, + "learning_rate": 3.957482584674514e-05, + "loss": 0.4901, "step": 3295 }, { - "epoch": 0.12, - "learning_rate": 3.869606003752345e-05, - "loss": 0.3469, + "epoch": 0.11893177640826036, + "grad_norm": 0.2614684998989105, + "learning_rate": 3.963487869325006e-05, + "loss": 0.5054, "step": 3300 }, { - "epoch": 0.12, - "learning_rate": 3.8754690431519705e-05, - "loss": 0.3426, + "epoch": 0.119111976069485, + "grad_norm": 0.2751060128211975, + "learning_rate": 3.9694931539754984e-05, + "loss": 0.5719, "step": 3305 }, { - "epoch": 0.12, - "learning_rate": 3.881332082551595e-05, - "loss": 0.3597, + "epoch": 0.11929217573070963, + "grad_norm": 0.2760029435157776, + "learning_rate": 3.975498438625991e-05, + "loss": 0.4718, "step": 3310 }, { - "epoch": 0.12, - "learning_rate": 3.8871951219512196e-05, - "loss": 0.3435, + "epoch": 0.11947237539193427, + "grad_norm": 0.3538672626018524, + "learning_rate": 3.9815037232764836e-05, + "loss": 0.5002, "step": 3315 }, { - "epoch": 0.12, - "learning_rate": 3.893058161350844e-05, - "loss": 0.3399, + "epoch": 0.1196525750531589, + "grad_norm": 0.34897580742836, + "learning_rate": 3.987509007926976e-05, + "loss": 0.4935, "step": 3320 }, { - "epoch": 0.12, - "learning_rate": 3.8989212007504694e-05, - "loss": 0.3429, + "epoch": 0.11983277471438354, + "grad_norm": 0.3150857388973236, + "learning_rate": 3.993514292577468e-05, + "loss": 0.5232, "step": 3325 }, { - "epoch": 0.12, - "learning_rate": 3.904784240150094e-05, - "loss": 0.3387, + "epoch": 0.12001297437560818, + "grad_norm": 0.22704921662807465, + "learning_rate": 3.999519577227961e-05, + "loss": 0.4848, "step": 3330 }, { - "epoch": 0.12, - "learning_rate": 3.910647279549719e-05, - "loss": 0.3409, + "epoch": 0.12019317403683281, + "grad_norm": 0.25814276933670044, + "learning_rate": 4.005524861878453e-05, + "loss": 0.4773, "step": 3335 }, { - "epoch": 0.12, - "learning_rate": 3.916510318949344e-05, - "loss": 0.3602, + "epoch": 0.12037337369805745, + "grad_norm": 0.2505935728549957, + "learning_rate": 4.0115301465289457e-05, + "loss": 0.5039, "step": 3340 }, { - "epoch": 0.12, - "learning_rate": 3.922373358348968e-05, - "loss": 0.3468, + "epoch": 0.12055357335928209, + "grad_norm": 0.26671484112739563, + "learning_rate": 4.017535431179438e-05, + "loss": 0.486, "step": 3345 }, { - "epoch": 0.12, - "learning_rate": 3.928236397748593e-05, - "loss": 0.3489, + "epoch": 0.12073377302050672, + "grad_norm": 0.289005309343338, + "learning_rate": 4.02354071582993e-05, + "loss": 0.5201, "step": 3350 }, { - "epoch": 0.12, - "learning_rate": 3.9340994371482174e-05, - "loss": 0.3501, + "epoch": 0.12091397268173136, + "grad_norm": 0.284462571144104, + "learning_rate": 4.029546000480423e-05, + "loss": 0.5013, "step": 3355 }, { - "epoch": 0.12, - "learning_rate": 3.939962476547843e-05, - "loss": 0.3368, + "epoch": 0.121094172342956, + "grad_norm": 0.29563307762145996, + "learning_rate": 4.035551285130915e-05, + "loss": 0.5042, "step": 3360 }, { - "epoch": 0.12, - "learning_rate": 3.945825515947467e-05, - "loss": 0.372, + "epoch": 0.12127437200418063, + "grad_norm": 0.2727985680103302, + 
"learning_rate": 4.041556569781408e-05, + "loss": 0.46, "step": 3365 }, { - "epoch": 0.12, - "learning_rate": 3.9516885553470925e-05, - "loss": 0.3321, + "epoch": 0.12145457166540527, + "grad_norm": 0.24392500519752502, + "learning_rate": 4.0475618544319e-05, + "loss": 0.4903, "step": 3370 }, { - "epoch": 0.12, - "learning_rate": 3.957551594746717e-05, - "loss": 0.3535, + "epoch": 0.1216347713266299, + "grad_norm": 0.261043518781662, + "learning_rate": 4.053567139082393e-05, + "loss": 0.4818, "step": 3375 }, { - "epoch": 0.12, - "learning_rate": 3.9634146341463416e-05, - "loss": 0.3559, + "epoch": 0.12181497098785454, + "grad_norm": 0.294289767742157, + "learning_rate": 4.0595724237328854e-05, + "loss": 0.48, "step": 3380 }, { - "epoch": 0.12, - "learning_rate": 3.969277673545966e-05, - "loss": 0.3842, + "epoch": 0.12199517064907918, + "grad_norm": 0.300879567861557, + "learning_rate": 4.065577708383377e-05, + "loss": 0.4833, "step": 3385 }, { - "epoch": 0.12, - "learning_rate": 3.9751407129455914e-05, - "loss": 0.3575, + "epoch": 0.12217537031030382, + "grad_norm": 0.23128901422023773, + "learning_rate": 4.07158299303387e-05, + "loss": 0.5047, "step": 3390 }, { - "epoch": 0.12, - "learning_rate": 3.981003752345216e-05, - "loss": 0.3682, + "epoch": 0.12235556997152845, + "grad_norm": 0.3030316233634949, + "learning_rate": 4.0775882776843624e-05, + "loss": 0.478, "step": 3395 }, { - "epoch": 0.12, - "learning_rate": 3.9868667917448405e-05, - "loss": 0.3297, + "epoch": 0.12253576963275309, + "grad_norm": 0.25957420468330383, + "learning_rate": 4.083593562334855e-05, + "loss": 0.5159, "step": 3400 }, { - "epoch": 0.12, - "learning_rate": 3.992729831144466e-05, - "loss": 0.3697, + "epoch": 0.12271596929397773, + "grad_norm": 0.2559145390987396, + "learning_rate": 4.0895988469853475e-05, + "loss": 0.4766, "step": 3405 }, { - "epoch": 0.12, - "learning_rate": 3.99859287054409e-05, - "loss": 0.3504, + "epoch": 0.12289616895520236, + "grad_norm": 0.3006746768951416, + "learning_rate": 4.0956041316358394e-05, + "loss": 0.4874, "step": 3410 }, { - "epoch": 0.12, - "learning_rate": 4.004455909943715e-05, - "loss": 0.3731, + "epoch": 0.123076368616427, + "grad_norm": 0.3356633484363556, + "learning_rate": 4.101609416286332e-05, + "loss": 0.5086, "step": 3415 }, { - "epoch": 0.12, - "learning_rate": 4.0103189493433394e-05, - "loss": 0.3451, + "epoch": 0.12325656827765163, + "grad_norm": 0.3298927843570709, + "learning_rate": 4.107614700936825e-05, + "loss": 0.5317, "step": 3420 }, { - "epoch": 0.12, - "learning_rate": 4.016181988742965e-05, - "loss": 0.3504, + "epoch": 0.12343676793887627, + "grad_norm": 0.30408477783203125, + "learning_rate": 4.113619985587317e-05, + "loss": 0.5221, "step": 3425 }, { - "epoch": 0.12, - "learning_rate": 4.022045028142589e-05, - "loss": 0.3507, + "epoch": 0.12361696760010091, + "grad_norm": 0.300484299659729, + "learning_rate": 4.1196252702378096e-05, + "loss": 0.4728, "step": 3430 }, { - "epoch": 0.12, - "learning_rate": 4.0279080675422145e-05, - "loss": 0.331, + "epoch": 0.12379716726132554, + "grad_norm": 0.3350473642349243, + "learning_rate": 4.1256305548883015e-05, + "loss": 0.4809, "step": 3435 }, { - "epoch": 0.12, - "learning_rate": 4.033771106941839e-05, - "loss": 0.3374, + "epoch": 0.12397736692255018, + "grad_norm": 0.23786677420139313, + "learning_rate": 4.131635839538794e-05, + "loss": 0.4957, "step": 3440 }, { - "epoch": 0.12, - "learning_rate": 4.0396341463414636e-05, - "loss": 0.3286, + "epoch": 0.12415756658377482, + "grad_norm": 0.29893916845321655, + 
"learning_rate": 4.1376411241892866e-05, + "loss": 0.5178, "step": 3445 }, { - "epoch": 0.12, - "learning_rate": 4.045497185741088e-05, - "loss": 0.3469, + "epoch": 0.12433776624499945, + "grad_norm": 0.29565131664276123, + "learning_rate": 4.143646408839779e-05, + "loss": 0.4833, "step": 3450 }, { - "epoch": 0.12, - "learning_rate": 4.051360225140713e-05, - "loss": 0.3475, + "epoch": 0.12451796590622409, + "grad_norm": 0.27151501178741455, + "learning_rate": 4.149651693490272e-05, + "loss": 0.4696, "step": 3455 }, { - "epoch": 0.12, - "learning_rate": 4.057223264540338e-05, - "loss": 0.3546, + "epoch": 0.12469816556744873, + "grad_norm": 0.23224970698356628, + "learning_rate": 4.1556569781407636e-05, + "loss": 0.4969, "step": 3460 }, { - "epoch": 0.12, - "learning_rate": 4.0630863039399625e-05, - "loss": 0.3629, + "epoch": 0.12487836522867338, + "grad_norm": 0.20471958816051483, + "learning_rate": 4.161662262791257e-05, + "loss": 0.4783, "step": 3465 }, { - "epoch": 0.12, - "learning_rate": 4.068949343339588e-05, - "loss": 0.3342, + "epoch": 0.125058564889898, + "grad_norm": 0.24922746419906616, + "learning_rate": 4.167667547441749e-05, + "loss": 0.475, "step": 3470 }, { - "epoch": 0.12, - "learning_rate": 4.0748123827392123e-05, - "loss": 0.3458, + "epoch": 0.12523876455112265, + "grad_norm": 0.29032933712005615, + "learning_rate": 4.173672832092241e-05, + "loss": 0.4964, "step": 3475 }, { - "epoch": 0.12, - "learning_rate": 4.080675422138837e-05, - "loss": 0.3519, + "epoch": 0.12541896421234727, + "grad_norm": 0.2781813144683838, + "learning_rate": 4.179678116742734e-05, + "loss": 0.5235, "step": 3480 }, { - "epoch": 0.12, - "learning_rate": 4.0865384615384615e-05, - "loss": 0.3356, + "epoch": 0.12559916387357192, + "grad_norm": 0.20956522226333618, + "learning_rate": 4.1856834013932264e-05, + "loss": 0.4646, "step": 3485 }, { - "epoch": 0.12, - "learning_rate": 4.092401500938087e-05, - "loss": 0.3324, + "epoch": 0.12577936353479655, + "grad_norm": 0.2584211528301239, + "learning_rate": 4.191688686043719e-05, + "loss": 0.5034, "step": 3490 }, { - "epoch": 0.12, - "learning_rate": 4.098264540337711e-05, - "loss": 0.3544, + "epoch": 0.1259595631960212, + "grad_norm": 0.2762541174888611, + "learning_rate": 4.197693970694211e-05, + "loss": 0.4318, "step": 3495 }, { - "epoch": 0.12, - "learning_rate": 4.1041275797373365e-05, - "loss": 0.3438, + "epoch": 0.12613976285724582, + "grad_norm": 0.27284300327301025, + "learning_rate": 4.2036992553447034e-05, + "loss": 0.5069, "step": 3500 }, { - "epoch": 0.12, - "eval_loss": 0.3493571877479553, - "eval_runtime": 10.5406, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 0.12613976285724582, + "eval_loss": 0.5114214420318604, + "eval_runtime": 3.5175, + "eval_samples_per_second": 28.429, + "eval_steps_per_second": 7.107, "step": 3500 }, { - "epoch": 0.12, - "learning_rate": 4.109990619136961e-05, - "loss": 0.3457, + "epoch": 0.12631996251847047, + "grad_norm": 0.27490076422691345, + "learning_rate": 4.209704539995196e-05, + "loss": 0.5157, "step": 3505 }, { - "epoch": 0.12, - "learning_rate": 4.1158536585365856e-05, - "loss": 0.332, + "epoch": 0.1265001621796951, + "grad_norm": 0.27542826533317566, + "learning_rate": 4.2157098246456885e-05, + "loss": 0.4931, "step": 3510 }, { - "epoch": 0.12, - "learning_rate": 4.12171669793621e-05, - "loss": 0.3388, + "epoch": 0.12668036184091974, + "grad_norm": 0.25971895456314087, + "learning_rate": 4.221715109296181e-05, + "loss": 0.4658, "step": 3515 }, { - "epoch": 0.12, - 
"learning_rate": 4.127579737335835e-05, - "loss": 0.3509, + "epoch": 0.12686056150214436, + "grad_norm": 0.26690730452537537, + "learning_rate": 4.227720393946673e-05, + "loss": 0.4838, "step": 3520 }, { - "epoch": 0.12, - "learning_rate": 4.13344277673546e-05, - "loss": 0.3456, + "epoch": 0.12704076116336901, + "grad_norm": 0.3250514566898346, + "learning_rate": 4.2337256785971655e-05, + "loss": 0.5006, "step": 3525 }, { - "epoch": 0.12, - "learning_rate": 4.1393058161350846e-05, - "loss": 0.335, + "epoch": 0.12722096082459364, + "grad_norm": 0.23514829576015472, + "learning_rate": 4.239730963247659e-05, + "loss": 0.4743, "step": 3530 }, { - "epoch": 0.12, - "learning_rate": 4.14516885553471e-05, - "loss": 0.3253, + "epoch": 0.1274011604858183, + "grad_norm": 0.2410300076007843, + "learning_rate": 4.2457362478981506e-05, + "loss": 0.4643, "step": 3535 }, { - "epoch": 0.12, - "learning_rate": 4.1510318949343344e-05, - "loss": 0.3525, + "epoch": 0.1275813601470429, + "grad_norm": 0.3281730115413666, + "learning_rate": 4.251741532548643e-05, + "loss": 0.4714, "step": 3540 }, { - "epoch": 0.12, - "learning_rate": 4.156894934333959e-05, - "loss": 0.3493, + "epoch": 0.12776155980826756, + "grad_norm": 0.28623196482658386, + "learning_rate": 4.257746817199135e-05, + "loss": 0.5064, "step": 3545 }, { - "epoch": 0.12, - "learning_rate": 4.1627579737335835e-05, - "loss": 0.3619, + "epoch": 0.1279417594694922, + "grad_norm": 0.2539494037628174, + "learning_rate": 4.2637521018496276e-05, + "loss": 0.4777, "step": 3550 }, { - "epoch": 0.13, - "learning_rate": 4.168621013133208e-05, - "loss": 0.3241, + "epoch": 0.12812195913071683, + "grad_norm": 0.26674479246139526, + "learning_rate": 4.269757386500121e-05, + "loss": 0.4719, "step": 3555 }, { - "epoch": 0.13, - "learning_rate": 4.174484052532833e-05, - "loss": 0.3642, + "epoch": 0.12830215879194148, + "grad_norm": 0.23590758442878723, + "learning_rate": 4.275762671150613e-05, + "loss": 0.4984, "step": 3560 }, { - "epoch": 0.13, - "learning_rate": 4.180347091932458e-05, - "loss": 0.3621, + "epoch": 0.1284823584531661, + "grad_norm": 0.26926812529563904, + "learning_rate": 4.281767955801105e-05, + "loss": 0.5151, "step": 3565 }, { - "epoch": 0.13, - "learning_rate": 4.186210131332083e-05, - "loss": 0.3247, + "epoch": 0.12866255811439076, + "grad_norm": 0.2809188663959503, + "learning_rate": 4.287773240451597e-05, + "loss": 0.5129, "step": 3570 }, { - "epoch": 0.13, - "learning_rate": 4.1920731707317077e-05, - "loss": 0.3477, + "epoch": 0.12884275777561538, + "grad_norm": 0.2685711681842804, + "learning_rate": 4.2937785251020904e-05, + "loss": 0.4916, "step": 3575 }, { - "epoch": 0.13, - "learning_rate": 4.197936210131332e-05, - "loss": 0.3352, + "epoch": 0.12902295743684003, + "grad_norm": 0.2880760431289673, + "learning_rate": 4.299783809752582e-05, + "loss": 0.504, "step": 3580 }, { - "epoch": 0.13, - "learning_rate": 4.203799249530957e-05, - "loss": 0.3672, + "epoch": 0.12920315709806465, + "grad_norm": 0.28242960572242737, + "learning_rate": 4.305789094403075e-05, + "loss": 0.51, "step": 3585 }, { - "epoch": 0.13, - "learning_rate": 4.209662288930582e-05, - "loss": 0.3714, + "epoch": 0.1293833567592893, + "grad_norm": 0.22377252578735352, + "learning_rate": 4.3117943790535674e-05, + "loss": 0.471, "step": 3590 }, { - "epoch": 0.13, - "learning_rate": 4.2155253283302066e-05, - "loss": 0.3332, + "epoch": 0.12956355642051393, + "grad_norm": 0.23554405570030212, + "learning_rate": 4.31779966370406e-05, + "loss": 0.497, "step": 3595 }, { - "epoch": 0.13, 
- "learning_rate": 4.221388367729832e-05, - "loss": 0.3222, + "epoch": 0.12974375608173858, + "grad_norm": 0.3754778802394867, + "learning_rate": 4.3238049483545525e-05, + "loss": 0.4746, "step": 3600 }, { - "epoch": 0.13, - "learning_rate": 4.2272514071294564e-05, - "loss": 0.3322, + "epoch": 0.1299239557429632, + "grad_norm": 0.2620472013950348, + "learning_rate": 4.3298102330050444e-05, + "loss": 0.4682, "step": 3605 }, { - "epoch": 0.13, - "learning_rate": 4.233114446529081e-05, - "loss": 0.345, + "epoch": 0.13010415540418785, + "grad_norm": 0.25988104939460754, + "learning_rate": 4.335815517655537e-05, + "loss": 0.4681, "step": 3610 }, { - "epoch": 0.13, - "learning_rate": 4.2389774859287055e-05, - "loss": 0.3589, + "epoch": 0.13028435506541247, + "grad_norm": 0.2066163271665573, + "learning_rate": 4.3418208023060295e-05, + "loss": 0.4465, "step": 3615 }, { - "epoch": 0.13, - "learning_rate": 4.24484052532833e-05, - "loss": 0.3631, + "epoch": 0.13046455472663712, + "grad_norm": 0.28050124645233154, + "learning_rate": 4.347826086956522e-05, + "loss": 0.4737, "step": 3620 }, { - "epoch": 0.13, - "learning_rate": 4.250703564727955e-05, - "loss": 0.3355, + "epoch": 0.13064475438786174, + "grad_norm": 0.32841894030570984, + "learning_rate": 4.3538313716070146e-05, + "loss": 0.4578, "step": 3625 }, { - "epoch": 0.13, - "learning_rate": 4.25656660412758e-05, - "loss": 0.3479, + "epoch": 0.1308249540490864, + "grad_norm": 0.2937125861644745, + "learning_rate": 4.3598366562575065e-05, + "loss": 0.4655, "step": 3630 }, { - "epoch": 0.13, - "learning_rate": 4.262429643527205e-05, - "loss": 0.3446, + "epoch": 0.13100515371031102, + "grad_norm": 0.2880992889404297, + "learning_rate": 4.365841940907999e-05, + "loss": 0.48, "step": 3635 }, { - "epoch": 0.13, - "learning_rate": 4.26829268292683e-05, - "loss": 0.3448, + "epoch": 0.13118535337153567, + "grad_norm": 0.23047612607479095, + "learning_rate": 4.3718472255584916e-05, + "loss": 0.5327, "step": 3640 }, { - "epoch": 0.13, - "learning_rate": 4.274155722326454e-05, - "loss": 0.3515, + "epoch": 0.1313655530327603, + "grad_norm": 0.27166831493377686, + "learning_rate": 4.377852510208984e-05, + "loss": 0.4772, "step": 3645 }, { - "epoch": 0.13, - "learning_rate": 4.280018761726079e-05, - "loss": 0.3433, + "epoch": 0.13154575269398494, + "grad_norm": 0.3133472502231598, + "learning_rate": 4.383857794859477e-05, + "loss": 0.5161, "step": 3650 }, { - "epoch": 0.13, - "learning_rate": 4.285881801125704e-05, - "loss": 0.3223, + "epoch": 0.13172595235520956, + "grad_norm": 0.25573498010635376, + "learning_rate": 4.3898630795099686e-05, + "loss": 0.5252, "step": 3655 }, { - "epoch": 0.13, - "learning_rate": 4.2917448405253286e-05, - "loss": 0.3672, + "epoch": 0.1319061520164342, + "grad_norm": 0.23422662913799286, + "learning_rate": 4.395868364160461e-05, + "loss": 0.4795, "step": 3660 }, { - "epoch": 0.13, - "learning_rate": 4.297607879924953e-05, - "loss": 0.3445, + "epoch": 0.13208635167765884, + "grad_norm": 0.19992312788963318, + "learning_rate": 4.4018736488109544e-05, + "loss": 0.5103, "step": 3665 }, { - "epoch": 0.13, - "learning_rate": 4.3034709193245784e-05, - "loss": 0.3612, + "epoch": 0.1322665513388835, + "grad_norm": 0.22987040877342224, + "learning_rate": 4.407878933461446e-05, + "loss": 0.5037, "step": 3670 }, { - "epoch": 0.13, - "learning_rate": 4.309333958724203e-05, - "loss": 0.3635, + "epoch": 0.1324467510001081, + "grad_norm": 0.35055404901504517, + "learning_rate": 4.413884218111939e-05, + "loss": 0.5178, "step": 3675 }, { - 
"epoch": 0.13, - "learning_rate": 4.3151969981238275e-05, - "loss": 0.3712, + "epoch": 0.13262695066133276, + "grad_norm": 0.2555431127548218, + "learning_rate": 4.419889502762431e-05, + "loss": 0.4925, "step": 3680 }, { - "epoch": 0.13, - "learning_rate": 4.321060037523452e-05, - "loss": 0.3576, + "epoch": 0.13280715032255738, + "grad_norm": 0.2736440598964691, + "learning_rate": 4.425894787412924e-05, + "loss": 0.5068, "step": 3685 }, { - "epoch": 0.13, - "learning_rate": 4.326923076923077e-05, - "loss": 0.3451, + "epoch": 0.13298734998378203, + "grad_norm": 0.20614711940288544, + "learning_rate": 4.4319000720634165e-05, + "loss": 0.4919, "step": 3690 }, { - "epoch": 0.13, - "learning_rate": 4.332786116322702e-05, - "loss": 0.337, + "epoch": 0.13316754964500666, + "grad_norm": 0.29815688729286194, + "learning_rate": 4.4379053567139083e-05, + "loss": 0.5117, "step": 3695 }, { - "epoch": 0.13, - "learning_rate": 4.338649155722327e-05, - "loss": 0.3955, + "epoch": 0.1333477493062313, + "grad_norm": 0.23376838862895966, + "learning_rate": 4.443910641364401e-05, + "loss": 0.5105, "step": 3700 }, { - "epoch": 0.13, - "learning_rate": 4.344512195121952e-05, - "loss": 0.3417, + "epoch": 0.13352794896745593, + "grad_norm": 0.27070969343185425, + "learning_rate": 4.4499159260148935e-05, + "loss": 0.4692, "step": 3705 }, { - "epoch": 0.13, - "learning_rate": 4.350375234521576e-05, - "loss": 0.353, + "epoch": 0.13370814862868058, + "grad_norm": 0.2089190036058426, + "learning_rate": 4.455921210665386e-05, + "loss": 0.4596, "step": 3710 }, { - "epoch": 0.13, - "learning_rate": 4.356238273921201e-05, - "loss": 0.3516, + "epoch": 0.1338883482899052, + "grad_norm": 0.331286758184433, + "learning_rate": 4.461926495315878e-05, + "loss": 0.5144, "step": 3715 }, { - "epoch": 0.13, - "learning_rate": 4.3621013133208254e-05, - "loss": 0.3788, + "epoch": 0.13406854795112985, + "grad_norm": 0.2310916930437088, + "learning_rate": 4.4679317799663705e-05, + "loss": 0.5188, "step": 3720 }, { - "epoch": 0.13, - "learning_rate": 4.3679643527204506e-05, - "loss": 0.3611, + "epoch": 0.1342487476123545, + "grad_norm": 0.21956689655780792, + "learning_rate": 4.473937064616863e-05, + "loss": 0.4799, "step": 3725 }, { - "epoch": 0.13, - "learning_rate": 4.373827392120075e-05, - "loss": 0.3479, + "epoch": 0.13442894727357912, + "grad_norm": 0.2846265733242035, + "learning_rate": 4.4799423492673556e-05, + "loss": 0.525, "step": 3730 }, { - "epoch": 0.13, - "learning_rate": 4.3796904315197004e-05, - "loss": 0.3494, + "epoch": 0.13460914693480378, + "grad_norm": 0.2660646140575409, + "learning_rate": 4.485947633917848e-05, + "loss": 0.451, "step": 3735 }, { - "epoch": 0.13, - "learning_rate": 4.385553470919325e-05, - "loss": 0.3495, + "epoch": 0.1347893465960284, + "grad_norm": 0.30038926005363464, + "learning_rate": 4.49195291856834e-05, + "loss": 0.4577, "step": 3740 }, { - "epoch": 0.13, - "learning_rate": 4.3914165103189495e-05, - "loss": 0.3221, + "epoch": 0.13496954625725305, + "grad_norm": 0.23106199502944946, + "learning_rate": 4.4979582032188326e-05, + "loss": 0.5007, "step": 3745 }, { - "epoch": 0.13, - "learning_rate": 4.397279549718574e-05, - "loss": 0.3679, + "epoch": 0.13514974591847767, + "grad_norm": 0.24195370078086853, + "learning_rate": 4.503963487869325e-05, + "loss": 0.4571, "step": 3750 }, { - "epoch": 0.13, - "learning_rate": 4.4031425891181993e-05, - "loss": 0.3789, + "epoch": 0.13532994557970232, + "grad_norm": 0.3315066993236542, + "learning_rate": 4.509968772519818e-05, + "loss": 0.5199, "step": 3755 
}, { - "epoch": 0.13, - "learning_rate": 4.409005628517824e-05, - "loss": 0.3436, + "epoch": 0.13551014524092694, + "grad_norm": 0.23671256005764008, + "learning_rate": 4.51597405717031e-05, + "loss": 0.486, "step": 3760 }, { - "epoch": 0.13, - "learning_rate": 4.4148686679174485e-05, - "loss": 0.3464, + "epoch": 0.1356903449021516, + "grad_norm": 0.2768378257751465, + "learning_rate": 4.521979341820802e-05, + "loss": 0.478, "step": 3765 }, { - "epoch": 0.13, - "learning_rate": 4.420731707317074e-05, - "loss": 0.3243, + "epoch": 0.13587054456337622, + "grad_norm": 0.20859333872795105, + "learning_rate": 4.5279846264712947e-05, + "loss": 0.4651, "step": 3770 }, { - "epoch": 0.13, - "learning_rate": 4.426594746716698e-05, - "loss": 0.3591, + "epoch": 0.13605074422460087, + "grad_norm": 0.28573447465896606, + "learning_rate": 4.533989911121788e-05, + "loss": 0.4645, "step": 3775 }, { - "epoch": 0.13, - "learning_rate": 4.432457786116323e-05, - "loss": 0.3671, + "epoch": 0.1362309438858255, + "grad_norm": 0.2684060335159302, + "learning_rate": 4.53999519577228e-05, + "loss": 0.4407, "step": 3780 }, { - "epoch": 0.13, - "learning_rate": 4.4383208255159474e-05, - "loss": 0.3528, + "epoch": 0.13641114354705014, + "grad_norm": 0.24884513020515442, + "learning_rate": 4.546000480422772e-05, + "loss": 0.4747, "step": 3785 }, { - "epoch": 0.13, - "learning_rate": 4.4441838649155726e-05, - "loss": 0.3637, + "epoch": 0.13659134320827476, + "grad_norm": 0.25004905462265015, + "learning_rate": 4.552005765073264e-05, + "loss": 0.4725, "step": 3790 }, { - "epoch": 0.13, - "learning_rate": 4.450046904315197e-05, - "loss": 0.3185, + "epoch": 0.1367715428694994, + "grad_norm": 0.28418663144111633, + "learning_rate": 4.5580110497237574e-05, + "loss": 0.5091, "step": 3795 }, { - "epoch": 0.13, - "learning_rate": 4.4559099437148224e-05, - "loss": 0.3577, + "epoch": 0.13695174253072404, + "grad_norm": 0.2636741101741791, + "learning_rate": 4.56401633437425e-05, + "loss": 0.4824, "step": 3800 }, { - "epoch": 0.13, - "learning_rate": 4.461772983114447e-05, - "loss": 0.347, + "epoch": 0.13713194219194869, + "grad_norm": 0.22315765917301178, + "learning_rate": 4.570021619024742e-05, + "loss": 0.4479, "step": 3805 }, { - "epoch": 0.13, - "learning_rate": 4.4676360225140716e-05, - "loss": 0.3868, + "epoch": 0.1373121418531733, + "grad_norm": 0.26647093892097473, + "learning_rate": 4.5760269036752344e-05, + "loss": 0.4664, "step": 3810 }, { - "epoch": 0.13, - "learning_rate": 4.473499061913696e-05, - "loss": 0.3359, + "epoch": 0.13749234151439796, + "grad_norm": 0.26308777928352356, + "learning_rate": 4.582032188325727e-05, + "loss": 0.5274, "step": 3815 }, { - "epoch": 0.13, - "learning_rate": 4.479362101313321e-05, - "loss": 0.3245, + "epoch": 0.13767254117562258, + "grad_norm": 0.2349170744419098, + "learning_rate": 4.5880374729762195e-05, + "loss": 0.4634, "step": 3820 }, { - "epoch": 0.13, - "learning_rate": 4.485225140712946e-05, - "loss": 0.3556, + "epoch": 0.13785274083684723, + "grad_norm": 0.21153903007507324, + "learning_rate": 4.594042757626712e-05, + "loss": 0.4549, "step": 3825 }, { - "epoch": 0.13, - "learning_rate": 4.4910881801125705e-05, - "loss": 0.3418, + "epoch": 0.13803294049807185, + "grad_norm": 0.282583624124527, + "learning_rate": 4.600048042277204e-05, + "loss": 0.5048, "step": 3830 }, { - "epoch": 0.13, - "learning_rate": 4.496951219512196e-05, - "loss": 0.3455, + "epoch": 0.1382131401592965, + "grad_norm": 0.24750690162181854, + "learning_rate": 4.6060533269276965e-05, + "loss": 0.5016, 
"step": 3835 }, { - "epoch": 0.14, - "learning_rate": 4.50281425891182e-05, - "loss": 0.3431, + "epoch": 0.13839333982052113, + "grad_norm": 0.28650569915771484, + "learning_rate": 4.612058611578189e-05, + "loss": 0.503, "step": 3840 }, { - "epoch": 0.14, - "learning_rate": 4.508677298311445e-05, - "loss": 0.3509, + "epoch": 0.13857353948174578, + "grad_norm": 0.28856590390205383, + "learning_rate": 4.6180638962286817e-05, + "loss": 0.4748, "step": 3845 }, { - "epoch": 0.14, - "learning_rate": 4.5145403377110694e-05, - "loss": 0.3297, + "epoch": 0.1387537391429704, + "grad_norm": 0.24171088635921478, + "learning_rate": 4.6240691808791735e-05, + "loss": 0.4835, "step": 3850 }, { - "epoch": 0.14, - "learning_rate": 4.5204033771106947e-05, - "loss": 0.3432, + "epoch": 0.13893393880419505, + "grad_norm": 0.3100346028804779, + "learning_rate": 4.630074465529666e-05, + "loss": 0.4756, "step": 3855 }, { - "epoch": 0.14, - "learning_rate": 4.526266416510319e-05, - "loss": 0.3348, + "epoch": 0.13911413846541967, + "grad_norm": 0.18996688723564148, + "learning_rate": 4.6360797501801586e-05, + "loss": 0.5116, "step": 3860 }, { - "epoch": 0.14, - "learning_rate": 4.5321294559099445e-05, - "loss": 0.3428, + "epoch": 0.13929433812664432, + "grad_norm": 0.2496083527803421, + "learning_rate": 4.642085034830651e-05, + "loss": 0.4598, "step": 3865 }, { - "epoch": 0.14, - "learning_rate": 4.537992495309569e-05, - "loss": 0.3588, + "epoch": 0.13947453778786895, + "grad_norm": 0.27395179867744446, + "learning_rate": 4.648090319481144e-05, + "loss": 0.4652, "step": 3870 }, { - "epoch": 0.14, - "learning_rate": 4.543855534709193e-05, - "loss": 0.3464, + "epoch": 0.1396547374490936, + "grad_norm": 0.24019189178943634, + "learning_rate": 4.6540956041316356e-05, + "loss": 0.4794, "step": 3875 }, { - "epoch": 0.14, - "learning_rate": 4.549718574108818e-05, - "loss": 0.3704, + "epoch": 0.13983493711031822, + "grad_norm": 0.24794286489486694, + "learning_rate": 4.660100888782128e-05, + "loss": 0.4893, "step": 3880 }, { - "epoch": 0.14, - "learning_rate": 4.555581613508443e-05, - "loss": 0.3672, + "epoch": 0.14001513677154287, + "grad_norm": 0.2861761450767517, + "learning_rate": 4.6661061734326214e-05, + "loss": 0.5465, "step": 3885 }, { - "epoch": 0.14, - "learning_rate": 4.561444652908068e-05, - "loss": 0.3704, + "epoch": 0.14019533643276752, + "grad_norm": 0.24302253127098083, + "learning_rate": 4.672111458083113e-05, + "loss": 0.5152, "step": 3890 }, { - "epoch": 0.14, - "learning_rate": 4.5673076923076925e-05, - "loss": 0.3336, + "epoch": 0.14037553609399214, + "grad_norm": 0.25176292657852173, + "learning_rate": 4.678116742733606e-05, + "loss": 0.5149, "step": 3895 }, { - "epoch": 0.14, - "learning_rate": 4.573170731707318e-05, - "loss": 0.3272, + "epoch": 0.1405557357552168, + "grad_norm": 0.3168286085128784, + "learning_rate": 4.684122027384098e-05, + "loss": 0.5241, "step": 3900 }, { - "epoch": 0.14, - "learning_rate": 4.579033771106942e-05, - "loss": 0.3223, + "epoch": 0.14073593541644142, + "grad_norm": 0.3177297115325928, + "learning_rate": 4.690127312034591e-05, + "loss": 0.5131, "step": 3905 }, { - "epoch": 0.14, - "learning_rate": 4.584896810506567e-05, - "loss": 0.361, + "epoch": 0.14091613507766607, + "grad_norm": 0.30368152260780334, + "learning_rate": 4.6961325966850835e-05, + "loss": 0.5248, "step": 3910 }, { - "epoch": 0.14, - "learning_rate": 4.5907598499061914e-05, - "loss": 0.3281, + "epoch": 0.1410963347388907, + "grad_norm": 0.2546997368335724, + "learning_rate": 4.7021378813355754e-05, + 
"loss": 0.4857, "step": 3915 }, { - "epoch": 0.14, - "learning_rate": 4.596622889305816e-05, - "loss": 0.3614, + "epoch": 0.14127653440011534, + "grad_norm": 0.23904703557491302, + "learning_rate": 4.708143165986068e-05, + "loss": 0.4906, "step": 3920 }, { - "epoch": 0.14, - "learning_rate": 4.602485928705441e-05, - "loss": 0.3411, + "epoch": 0.14145673406133996, + "grad_norm": 0.26533734798431396, + "learning_rate": 4.7141484506365605e-05, + "loss": 0.4906, "step": 3925 }, { - "epoch": 0.14, - "learning_rate": 4.608348968105066e-05, - "loss": 0.3313, + "epoch": 0.1416369337225646, + "grad_norm": 0.3533400297164917, + "learning_rate": 4.720153735287053e-05, + "loss": 0.4932, "step": 3930 }, { - "epoch": 0.14, - "learning_rate": 4.614212007504691e-05, - "loss": 0.3629, + "epoch": 0.14181713338378923, + "grad_norm": 0.31380826234817505, + "learning_rate": 4.7261590199375456e-05, + "loss": 0.4983, "step": 3935 }, { - "epoch": 0.14, - "learning_rate": 4.620075046904315e-05, - "loss": 0.3631, + "epoch": 0.14199733304501388, + "grad_norm": 0.25796714425086975, + "learning_rate": 4.7321643045880375e-05, + "loss": 0.4602, "step": 3940 }, { - "epoch": 0.14, - "learning_rate": 4.62593808630394e-05, - "loss": 0.3539, + "epoch": 0.1421775327062385, + "grad_norm": 0.2745930850505829, + "learning_rate": 4.73816958923853e-05, + "loss": 0.4592, "step": 3945 }, { - "epoch": 0.14, - "learning_rate": 4.631801125703565e-05, - "loss": 0.3254, + "epoch": 0.14235773236746316, + "grad_norm": 0.24087023735046387, + "learning_rate": 4.7441748738890226e-05, + "loss": 0.4606, "step": 3950 }, { - "epoch": 0.14, - "learning_rate": 4.63766416510319e-05, - "loss": 0.3704, + "epoch": 0.14253793202868778, + "grad_norm": 0.2588431239128113, + "learning_rate": 4.750180158539515e-05, + "loss": 0.4881, "step": 3955 }, { - "epoch": 0.14, - "learning_rate": 4.6435272045028145e-05, - "loss": 0.3478, + "epoch": 0.14271813168991243, + "grad_norm": 0.26731452345848083, + "learning_rate": 4.756185443190008e-05, + "loss": 0.4715, "step": 3960 }, { - "epoch": 0.14, - "learning_rate": 4.64939024390244e-05, - "loss": 0.3576, + "epoch": 0.14289833135113705, + "grad_norm": 0.2952512204647064, + "learning_rate": 4.7621907278404996e-05, + "loss": 0.4721, "step": 3965 }, { - "epoch": 0.14, - "learning_rate": 4.655253283302064e-05, - "loss": 0.3851, + "epoch": 0.1430785310123617, + "grad_norm": 0.19790342450141907, + "learning_rate": 4.768196012490992e-05, + "loss": 0.473, "step": 3970 }, { - "epoch": 0.14, - "learning_rate": 4.661116322701688e-05, - "loss": 0.3313, + "epoch": 0.14325873067358633, + "grad_norm": 0.23839952051639557, + "learning_rate": 4.774201297141485e-05, + "loss": 0.5295, "step": 3975 }, { - "epoch": 0.14, - "learning_rate": 4.6669793621013135e-05, - "loss": 0.3325, + "epoch": 0.14343893033481098, + "grad_norm": 0.2507927715778351, + "learning_rate": 4.780206581791977e-05, + "loss": 0.4815, "step": 3980 }, { - "epoch": 0.14, - "learning_rate": 4.672842401500938e-05, - "loss": 0.3399, + "epoch": 0.1436191299960356, + "grad_norm": 0.24117940664291382, + "learning_rate": 4.786211866442469e-05, + "loss": 0.4546, "step": 3985 }, { - "epoch": 0.14, - "learning_rate": 4.678705440900563e-05, - "loss": 0.3315, + "epoch": 0.14379932965726025, + "grad_norm": 0.2980533838272095, + "learning_rate": 4.792217151092962e-05, + "loss": 0.4767, "step": 3990 }, { - "epoch": 0.14, - "learning_rate": 4.684568480300188e-05, - "loss": 0.3511, + "epoch": 0.14397952931848487, + "grad_norm": 0.2951893210411072, + "learning_rate": 
4.798222435743455e-05, + "loss": 0.4729, "step": 3995 }, { - "epoch": 0.14, - "learning_rate": 4.690431519699813e-05, - "loss": 0.3507, + "epoch": 0.14415972897970952, + "grad_norm": 0.19777187705039978, + "learning_rate": 4.804227720393947e-05, + "loss": 0.4814, "step": 4000 }, { - "epoch": 0.14, - "eval_loss": 0.34727856516838074, - "eval_runtime": 10.5437, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 0.14415972897970952, + "eval_loss": 0.5076904296875, + "eval_runtime": 3.5171, + "eval_samples_per_second": 28.432, + "eval_steps_per_second": 7.108, "step": 4000 }, { - "epoch": 0.14, - "learning_rate": 4.696294559099437e-05, - "loss": 0.3252, + "epoch": 0.14433992864093415, + "grad_norm": 0.21895861625671387, + "learning_rate": 4.8102330050444394e-05, + "loss": 0.495, "step": 4005 }, { - "epoch": 0.14, - "learning_rate": 4.702157598499062e-05, - "loss": 0.3467, + "epoch": 0.1445201283021588, + "grad_norm": 0.2850351631641388, + "learning_rate": 4.816238289694931e-05, + "loss": 0.4767, "step": 4010 }, { - "epoch": 0.14, - "learning_rate": 4.708020637898687e-05, - "loss": 0.3564, + "epoch": 0.14470032796338342, + "grad_norm": 0.26362937688827515, + "learning_rate": 4.8222435743454245e-05, + "loss": 0.5319, "step": 4015 }, { - "epoch": 0.14, - "learning_rate": 4.713883677298312e-05, - "loss": 0.3565, + "epoch": 0.14488052762460807, + "grad_norm": 0.25553131103515625, + "learning_rate": 4.828248858995917e-05, + "loss": 0.4739, "step": 4020 }, { - "epoch": 0.14, - "learning_rate": 4.7197467166979365e-05, - "loss": 0.3467, + "epoch": 0.1450607272858327, + "grad_norm": 0.2785678505897522, + "learning_rate": 4.834254143646409e-05, + "loss": 0.5092, "step": 4025 }, { - "epoch": 0.14, - "learning_rate": 4.725609756097561e-05, - "loss": 0.3248, + "epoch": 0.14524092694705734, + "grad_norm": 0.25108572840690613, + "learning_rate": 4.8402594282969015e-05, + "loss": 0.496, "step": 4030 }, { - "epoch": 0.14, - "learning_rate": 4.7314727954971863e-05, - "loss": 0.3628, + "epoch": 0.14542112660828196, + "grad_norm": 0.21198877692222595, + "learning_rate": 4.8462647129473934e-05, + "loss": 0.4941, "step": 4035 }, { - "epoch": 0.14, - "learning_rate": 4.73733583489681e-05, - "loss": 0.3153, + "epoch": 0.14560132626950661, + "grad_norm": 0.23192477226257324, + "learning_rate": 4.8522699975978866e-05, + "loss": 0.486, "step": 4040 }, { - "epoch": 0.14, - "learning_rate": 4.7431988742964355e-05, - "loss": 0.3289, + "epoch": 0.14578152593073124, + "grad_norm": 0.25340479612350464, + "learning_rate": 4.858275282248379e-05, + "loss": 0.5056, "step": 4045 }, { - "epoch": 0.14, - "learning_rate": 4.74906191369606e-05, - "loss": 0.3446, + "epoch": 0.1459617255919559, + "grad_norm": 0.28752046823501587, + "learning_rate": 4.864280566898871e-05, + "loss": 0.4944, "step": 4050 }, { - "epoch": 0.14, - "learning_rate": 4.754924953095685e-05, - "loss": 0.3631, + "epoch": 0.1461419252531805, + "grad_norm": 0.2980968952178955, + "learning_rate": 4.8702858515493636e-05, + "loss": 0.4604, "step": 4055 }, { - "epoch": 0.14, - "learning_rate": 4.76078799249531e-05, - "loss": 0.3446, + "epoch": 0.14632212491440516, + "grad_norm": 0.2499280720949173, + "learning_rate": 4.876291136199856e-05, + "loss": 0.52, "step": 4060 }, { - "epoch": 0.14, - "learning_rate": 4.766651031894935e-05, - "loss": 0.3435, + "epoch": 0.1465023245756298, + "grad_norm": 0.2250458002090454, + "learning_rate": 4.882296420850349e-05, + "loss": 0.475, "step": 4065 }, { - "epoch": 0.14, - "learning_rate": 
4.772514071294559e-05, - "loss": 0.3314, + "epoch": 0.14668252423685443, + "grad_norm": 0.20067259669303894, + "learning_rate": 4.888301705500841e-05, + "loss": 0.4465, "step": 4070 }, { - "epoch": 0.14, - "learning_rate": 4.778377110694184e-05, - "loss": 0.3587, + "epoch": 0.14686272389807908, + "grad_norm": 0.2999953329563141, + "learning_rate": 4.894306990151333e-05, + "loss": 0.5123, "step": 4075 }, { - "epoch": 0.14, - "learning_rate": 4.784240150093809e-05, - "loss": 0.3589, + "epoch": 0.1470429235593037, + "grad_norm": 0.26574891805648804, + "learning_rate": 4.900312274801826e-05, + "loss": 0.4827, "step": 4080 }, { - "epoch": 0.14, - "learning_rate": 4.790103189493433e-05, - "loss": 0.3404, + "epoch": 0.14722312322052836, + "grad_norm": 0.2696743309497833, + "learning_rate": 4.906317559452318e-05, + "loss": 0.4896, "step": 4085 }, { - "epoch": 0.14, - "learning_rate": 4.7959662288930586e-05, - "loss": 0.3554, + "epoch": 0.14740332288175298, + "grad_norm": 0.3257639706134796, + "learning_rate": 4.912322844102811e-05, + "loss": 0.5203, "step": 4090 }, { - "epoch": 0.14, - "learning_rate": 4.801829268292683e-05, - "loss": 0.3638, + "epoch": 0.14758352254297763, + "grad_norm": 0.2311761975288391, + "learning_rate": 4.9183281287533034e-05, + "loss": 0.502, "step": 4095 }, { - "epoch": 0.14, - "learning_rate": 4.8076923076923084e-05, - "loss": 0.3528, + "epoch": 0.14776372220420225, + "grad_norm": 0.26127567887306213, + "learning_rate": 4.924333413403795e-05, + "loss": 0.4544, "step": 4100 }, { - "epoch": 0.14, - "learning_rate": 4.813555347091932e-05, - "loss": 0.3536, + "epoch": 0.1479439218654269, + "grad_norm": 0.25755074620246887, + "learning_rate": 4.9303386980542885e-05, + "loss": 0.4996, "step": 4105 }, { - "epoch": 0.14, - "learning_rate": 4.8194183864915575e-05, - "loss": 0.3846, + "epoch": 0.14812412152665153, + "grad_norm": 0.23005364835262299, + "learning_rate": 4.9363439827047804e-05, + "loss": 0.5183, "step": 4110 }, { - "epoch": 0.14, - "learning_rate": 4.825281425891182e-05, - "loss": 0.3407, + "epoch": 0.14830432118787618, + "grad_norm": 0.20392531156539917, + "learning_rate": 4.942349267355273e-05, + "loss": 0.4814, "step": 4115 }, { - "epoch": 0.14, - "learning_rate": 4.831144465290807e-05, - "loss": 0.3505, + "epoch": 0.1484845208491008, + "grad_norm": 0.2658933401107788, + "learning_rate": 4.9483545520057655e-05, + "loss": 0.5149, "step": 4120 }, { - "epoch": 0.15, - "learning_rate": 4.837007504690432e-05, - "loss": 0.3477, + "epoch": 0.14866472051032545, + "grad_norm": 0.19606181979179382, + "learning_rate": 4.9543598366562574e-05, + "loss": 0.4621, "step": 4125 }, { - "epoch": 0.15, - "learning_rate": 4.8428705440900564e-05, - "loss": 0.3686, + "epoch": 0.14884492017155007, + "grad_norm": 0.25544169545173645, + "learning_rate": 4.9603651213067506e-05, + "loss": 0.5054, "step": 4130 }, { - "epoch": 0.15, - "learning_rate": 4.8487335834896817e-05, - "loss": 0.3406, + "epoch": 0.14902511983277472, + "grad_norm": 0.25996389985084534, + "learning_rate": 4.9663704059572425e-05, + "loss": 0.4607, "step": 4135 }, { - "epoch": 0.15, - "learning_rate": 4.8545966228893055e-05, - "loss": 0.3638, + "epoch": 0.14920531949399934, + "grad_norm": 0.22277650237083435, + "learning_rate": 4.972375690607735e-05, + "loss": 0.4594, "step": 4140 }, { - "epoch": 0.15, - "learning_rate": 4.860459662288931e-05, - "loss": 0.3467, + "epoch": 0.149385519155224, + "grad_norm": 0.24612875282764435, + "learning_rate": 4.978380975258227e-05, + "loss": 0.4942, "step": 4145 }, { - "epoch": 0.15, - 
"learning_rate": 4.8663227016885553e-05, - "loss": 0.3367, + "epoch": 0.14956571881644862, + "grad_norm": 0.3200278878211975, + "learning_rate": 4.98438625990872e-05, + "loss": 0.497, "step": 4150 }, { - "epoch": 0.15, - "learning_rate": 4.8721857410881806e-05, - "loss": 0.3366, + "epoch": 0.14974591847767327, + "grad_norm": 0.2536075711250305, + "learning_rate": 4.990391544559213e-05, + "loss": 0.4744, "step": 4155 }, { - "epoch": 0.15, - "learning_rate": 4.878048780487805e-05, - "loss": 0.3646, + "epoch": 0.1499261181388979, + "grad_norm": 0.22233828902244568, + "learning_rate": 4.9963968292097046e-05, + "loss": 0.4684, "step": 4160 }, { - "epoch": 0.15, - "learning_rate": 4.8839118198874304e-05, - "loss": 0.3653, + "epoch": 0.15010631780012254, + "grad_norm": 0.2499028444290161, + "learning_rate": 4.999999997275039e-05, + "loss": 0.4484, "step": 4165 }, { - "epoch": 0.15, - "learning_rate": 4.889774859287054e-05, - "loss": 0.3176, + "epoch": 0.15028651746134716, + "grad_norm": 0.2228631228208542, + "learning_rate": 4.9999999666192246e-05, + "loss": 0.4891, "step": 4170 }, { - "epoch": 0.15, - "learning_rate": 4.8956378986866795e-05, - "loss": 0.3331, + "epoch": 0.1504667171225718, + "grad_norm": 0.25544434785842896, + "learning_rate": 4.9999999019013944e-05, + "loss": 0.4893, "step": 4175 }, { - "epoch": 0.15, - "learning_rate": 4.901500938086304e-05, - "loss": 0.3376, + "epoch": 0.15064691678379644, + "grad_norm": 0.22523342072963715, + "learning_rate": 4.99999980312155e-05, + "loss": 0.4708, "step": 4180 }, { - "epoch": 0.15, - "learning_rate": 4.9073639774859286e-05, - "loss": 0.3516, + "epoch": 0.1508271164450211, + "grad_norm": 0.2670489549636841, + "learning_rate": 4.999999670279692e-05, + "loss": 0.4553, "step": 4185 }, { - "epoch": 0.15, - "learning_rate": 4.913227016885554e-05, - "loss": 0.3702, + "epoch": 0.1510073161062457, + "grad_norm": 0.19969885051250458, + "learning_rate": 4.999999503375823e-05, + "loss": 0.4742, "step": 4190 }, { - "epoch": 0.15, - "learning_rate": 4.9190900562851784e-05, - "loss": 0.3589, + "epoch": 0.15118751576747036, + "grad_norm": 0.26136794686317444, + "learning_rate": 4.9999993024099446e-05, + "loss": 0.5093, "step": 4195 }, { - "epoch": 0.15, - "learning_rate": 4.924953095684804e-05, - "loss": 0.3326, + "epoch": 0.15136771542869498, + "grad_norm": 0.265652596950531, + "learning_rate": 4.999999067382059e-05, + "loss": 0.4894, "step": 4200 }, { - "epoch": 0.15, - "learning_rate": 4.9308161350844276e-05, - "loss": 0.3562, + "epoch": 0.15154791508991963, + "grad_norm": 0.22276762127876282, + "learning_rate": 4.999998798292171e-05, + "loss": 0.4643, "step": 4205 }, { - "epoch": 0.15, - "learning_rate": 4.936679174484053e-05, - "loss": 0.3464, + "epoch": 0.15172811475114426, + "grad_norm": 0.18586856126785278, + "learning_rate": 4.9999984951402834e-05, + "loss": 0.4719, "step": 4210 }, { - "epoch": 0.15, - "learning_rate": 4.9425422138836774e-05, - "loss": 0.3674, + "epoch": 0.1519083144123689, + "grad_norm": 0.2172873467206955, + "learning_rate": 4.9999981579263997e-05, + "loss": 0.4829, "step": 4215 }, { - "epoch": 0.15, - "learning_rate": 4.9484052532833026e-05, - "loss": 0.3546, + "epoch": 0.15208851407359353, + "grad_norm": 0.24789679050445557, + "learning_rate": 4.999997786650525e-05, + "loss": 0.4808, "step": 4220 }, { - "epoch": 0.15, - "learning_rate": 4.954268292682927e-05, - "loss": 0.3918, + "epoch": 0.15226871373481818, + "grad_norm": 0.2855437397956848, + "learning_rate": 4.9999973813126654e-05, + "loss": 0.4822, "step": 4225 }, { - 
"epoch": 0.15, - "learning_rate": 4.9601313320825524e-05, - "loss": 0.3447, + "epoch": 0.15244891339604283, + "grad_norm": 0.29781678318977356, + "learning_rate": 4.999996941912825e-05, + "loss": 0.5047, "step": 4230 }, { - "epoch": 0.15, - "learning_rate": 4.965994371482176e-05, - "loss": 0.3669, + "epoch": 0.15262911305726745, + "grad_norm": 0.25594308972358704, + "learning_rate": 4.9999964684510104e-05, + "loss": 0.5076, "step": 4235 }, { - "epoch": 0.15, - "learning_rate": 4.971857410881801e-05, - "loss": 0.3438, + "epoch": 0.1528093127184921, + "grad_norm": 0.21112053096294403, + "learning_rate": 4.999995960927228e-05, + "loss": 0.4764, "step": 4240 }, { - "epoch": 0.15, - "learning_rate": 4.977720450281426e-05, - "loss": 0.3425, + "epoch": 0.15298951237971672, + "grad_norm": 0.29285240173339844, + "learning_rate": 4.999995419341485e-05, + "loss": 0.4841, "step": 4245 }, { - "epoch": 0.15, - "learning_rate": 4.9835834896810507e-05, - "loss": 0.3144, + "epoch": 0.15316971204094137, + "grad_norm": 0.25095129013061523, + "learning_rate": 4.9999948436937873e-05, + "loss": 0.4723, "step": 4250 }, { - "epoch": 0.15, - "learning_rate": 4.989446529080676e-05, - "loss": 0.3529, + "epoch": 0.153349911702166, + "grad_norm": 0.2660590410232544, + "learning_rate": 4.999994233984145e-05, + "loss": 0.4527, "step": 4255 }, { - "epoch": 0.15, - "learning_rate": 4.9953095684803005e-05, - "loss": 0.3431, + "epoch": 0.15353011136339065, + "grad_norm": 0.19300931692123413, + "learning_rate": 4.999993590212564e-05, + "loss": 0.4346, "step": 4260 }, { - "epoch": 0.15, - "learning_rate": 4.999999999350783e-05, - "loss": 0.3392, + "epoch": 0.15371031102461527, + "grad_norm": 0.2573135197162628, + "learning_rate": 4.999992912379055e-05, + "loss": 0.4827, "step": 4265 }, { - "epoch": 0.15, - "learning_rate": 4.999999976628187e-05, - "loss": 0.3406, + "epoch": 0.15389051068583992, + "grad_norm": 0.2267155647277832, + "learning_rate": 4.999992200483626e-05, + "loss": 0.4743, "step": 4270 }, { - "epoch": 0.15, - "learning_rate": 4.9999999214447376e-05, - "loss": 0.3477, + "epoch": 0.15407071034706454, + "grad_norm": 0.22252587974071503, + "learning_rate": 4.999991454526289e-05, + "loss": 0.4899, "step": 4275 }, { - "epoch": 0.15, - "learning_rate": 4.9999998338004374e-05, - "loss": 0.3506, + "epoch": 0.1542509100082892, + "grad_norm": 0.24364939332008362, + "learning_rate": 4.9999906745070515e-05, + "loss": 0.4584, "step": 4280 }, { - "epoch": 0.15, - "learning_rate": 4.999999713695287e-05, - "loss": 0.3389, + "epoch": 0.15443110966951382, + "grad_norm": 0.2882271409034729, + "learning_rate": 4.999989860425924e-05, + "loss": 0.4897, "step": 4285 }, { - "epoch": 0.15, - "learning_rate": 4.999999561129288e-05, - "loss": 0.3461, + "epoch": 0.15461130933073847, + "grad_norm": 0.22930704057216644, + "learning_rate": 4.9999890122829205e-05, + "loss": 0.482, "step": 4290 }, { - "epoch": 0.15, - "learning_rate": 4.999999376102442e-05, - "loss": 0.3139, + "epoch": 0.1547915089919631, + "grad_norm": 0.22965483367443085, + "learning_rate": 4.9999881300780495e-05, + "loss": 0.4587, "step": 4295 }, { - "epoch": 0.15, - "learning_rate": 4.999999158614751e-05, - "loss": 0.3506, + "epoch": 0.15497170865318774, + "grad_norm": 0.2293003350496292, + "learning_rate": 4.999987213811325e-05, + "loss": 0.532, "step": 4300 }, { - "epoch": 0.15, - "learning_rate": 4.999998908666219e-05, - "loss": 0.3673, + "epoch": 0.15515190831441236, + "grad_norm": 0.3403257727622986, + "learning_rate": 4.999986263482758e-05, + "loss": 0.4722, "step": 
4305 }, { - "epoch": 0.15, - "learning_rate": 4.9999986262568495e-05, - "loss": 0.3583, + "epoch": 0.155332107975637, + "grad_norm": 0.27816566824913025, + "learning_rate": 4.9999852790923626e-05, + "loss": 0.4603, "step": 4310 }, { - "epoch": 0.15, - "learning_rate": 4.9999983113866445e-05, - "loss": 0.341, + "epoch": 0.15551230763686164, + "grad_norm": 0.24829629063606262, + "learning_rate": 4.9999842606401516e-05, + "loss": 0.5108, "step": 4315 }, { - "epoch": 0.15, - "learning_rate": 4.9999979640556095e-05, - "loss": 0.3345, + "epoch": 0.15569250729808629, + "grad_norm": 0.1773914247751236, + "learning_rate": 4.999983208126139e-05, + "loss": 0.4665, "step": 4320 }, { - "epoch": 0.15, - "learning_rate": 4.9999975842637484e-05, - "loss": 0.3396, + "epoch": 0.1558727069593109, + "grad_norm": 0.22621670365333557, + "learning_rate": 4.9999821215503396e-05, + "loss": 0.4857, "step": 4325 }, { - "epoch": 0.15, - "learning_rate": 4.999997172011066e-05, - "loss": 0.3524, + "epoch": 0.15605290662053556, + "grad_norm": 0.232326477766037, + "learning_rate": 4.999981000912767e-05, + "loss": 0.4696, "step": 4330 }, { - "epoch": 0.15, - "learning_rate": 4.999996727297568e-05, - "loss": 0.3158, + "epoch": 0.15623310628176018, + "grad_norm": 0.30691394209861755, + "learning_rate": 4.999979846213438e-05, + "loss": 0.5046, "step": 4335 }, { - "epoch": 0.15, - "learning_rate": 4.99999625012326e-05, - "loss": 0.37, + "epoch": 0.15641330594298483, + "grad_norm": 0.2561699151992798, + "learning_rate": 4.9999786574523675e-05, + "loss": 0.4833, "step": 4340 }, { - "epoch": 0.15, - "learning_rate": 4.999995740488148e-05, - "loss": 0.3554, + "epoch": 0.15659350560420945, + "grad_norm": 0.33284783363342285, + "learning_rate": 4.9999774346295716e-05, + "loss": 0.5305, "step": 4345 }, { - "epoch": 0.15, - "learning_rate": 4.999995198392239e-05, - "loss": 0.3597, + "epoch": 0.1567737052654341, + "grad_norm": 0.18784891068935394, + "learning_rate": 4.999976177745067e-05, + "loss": 0.4514, "step": 4350 }, { - "epoch": 0.15, - "learning_rate": 4.9999946238355405e-05, - "loss": 0.3107, + "epoch": 0.15695390492665873, + "grad_norm": 0.23666001856327057, + "learning_rate": 4.999974886798872e-05, + "loss": 0.5089, "step": 4355 }, { - "epoch": 0.15, - "learning_rate": 4.999994016818059e-05, - "loss": 0.3513, + "epoch": 0.15713410458788338, + "grad_norm": 0.2393905371427536, + "learning_rate": 4.999973561791002e-05, + "loss": 0.508, "step": 4360 }, { - "epoch": 0.15, - "learning_rate": 4.999993377339802e-05, - "loss": 0.3312, + "epoch": 0.157314304249108, + "grad_norm": 0.2510823905467987, + "learning_rate": 4.999972202721477e-05, + "loss": 0.4712, "step": 4365 }, { - "epoch": 0.15, - "learning_rate": 4.99999270540078e-05, - "loss": 0.3627, + "epoch": 0.15749450391033265, + "grad_norm": 0.21815423667430878, + "learning_rate": 4.999970809590314e-05, + "loss": 0.4702, "step": 4370 }, { - "epoch": 0.15, - "learning_rate": 4.9999920010009995e-05, - "loss": 0.3232, + "epoch": 0.15767470357155727, + "grad_norm": 0.26132404804229736, + "learning_rate": 4.999969382397534e-05, + "loss": 0.4778, "step": 4375 }, { - "epoch": 0.15, - "learning_rate": 4.99999126414047e-05, - "loss": 0.3548, + "epoch": 0.15785490323278192, + "grad_norm": 0.21386702358722687, + "learning_rate": 4.999967921143154e-05, + "loss": 0.4859, "step": 4380 }, { - "epoch": 0.15, - "learning_rate": 4.9999904948192025e-05, - "loss": 0.3475, + "epoch": 0.15803510289400655, + "grad_norm": 0.3148122727870941, + "learning_rate": 4.999966425827195e-05, + "loss": 0.5291, 
"step": 4385 }, { - "epoch": 0.15, - "learning_rate": 4.999989693037205e-05, - "loss": 0.3552, + "epoch": 0.1582153025552312, + "grad_norm": 0.26083052158355713, + "learning_rate": 4.999964896449678e-05, + "loss": 0.4753, "step": 4390 }, { - "epoch": 0.15, - "learning_rate": 4.99998885879449e-05, - "loss": 0.3655, + "epoch": 0.15839550221645585, + "grad_norm": 0.24049343168735504, + "learning_rate": 4.9999633330106234e-05, + "loss": 0.5006, "step": 4395 }, { - "epoch": 0.15, - "learning_rate": 4.9999879920910664e-05, - "loss": 0.3671, + "epoch": 0.15857570187768047, + "grad_norm": 0.31498703360557556, + "learning_rate": 4.999961735510052e-05, + "loss": 0.4786, "step": 4400 }, { - "epoch": 0.15, - "learning_rate": 4.999987092926946e-05, - "loss": 0.3337, + "epoch": 0.15875590153890512, + "grad_norm": 0.29424259066581726, + "learning_rate": 4.999960103947986e-05, + "loss": 0.5437, "step": 4405 }, { - "epoch": 0.16, - "learning_rate": 4.9999861613021416e-05, - "loss": 0.3321, + "epoch": 0.15893610120012974, + "grad_norm": 0.28579822182655334, + "learning_rate": 4.999958438324448e-05, + "loss": 0.4814, "step": 4410 }, { - "epoch": 0.16, - "learning_rate": 4.999985197216663e-05, - "loss": 0.3543, + "epoch": 0.1591163008613544, + "grad_norm": 0.2311839759349823, + "learning_rate": 4.9999567386394595e-05, + "loss": 0.4655, "step": 4415 }, { - "epoch": 0.16, - "learning_rate": 4.999984200670526e-05, - "loss": 0.3368, + "epoch": 0.15929650052257902, + "grad_norm": 0.3129260241985321, + "learning_rate": 4.9999550048930455e-05, + "loss": 0.4614, "step": 4420 }, { - "epoch": 0.16, - "learning_rate": 4.99998317166374e-05, - "loss": 0.351, + "epoch": 0.15947670018380367, + "grad_norm": 0.2093297243118286, + "learning_rate": 4.999953237085228e-05, + "loss": 0.4925, "step": 4425 }, { - "epoch": 0.16, - "learning_rate": 4.999982110196322e-05, - "loss": 0.3441, + "epoch": 0.1596568998450283, + "grad_norm": 0.2211044728755951, + "learning_rate": 4.999951435216032e-05, + "loss": 0.4632, "step": 4430 }, { - "epoch": 0.16, - "learning_rate": 4.9999810162682824e-05, - "loss": 0.3481, + "epoch": 0.15983709950625294, + "grad_norm": 0.2064119130373001, + "learning_rate": 4.9999495992854806e-05, + "loss": 0.4794, "step": 4435 }, { - "epoch": 0.16, - "learning_rate": 4.999979889879637e-05, - "loss": 0.3649, + "epoch": 0.16001729916747756, + "grad_norm": 0.19835302233695984, + "learning_rate": 4.999947729293601e-05, + "loss": 0.4831, "step": 4440 }, { - "epoch": 0.16, - "learning_rate": 4.9999787310304e-05, - "loss": 0.3493, + "epoch": 0.1601974988287022, + "grad_norm": 0.2594761550426483, + "learning_rate": 4.9999458252404176e-05, + "loss": 0.4982, "step": 4445 }, { - "epoch": 0.16, - "learning_rate": 4.9999775397205875e-05, - "loss": 0.3405, + "epoch": 0.16037769848992683, + "grad_norm": 0.22020448744297028, + "learning_rate": 4.999943887125955e-05, + "loss": 0.4638, "step": 4450 }, { - "epoch": 0.16, - "learning_rate": 4.9999763159502146e-05, - "loss": 0.3482, + "epoch": 0.16055789815115148, + "grad_norm": 0.16716046631336212, + "learning_rate": 4.9999419149502426e-05, + "loss": 0.4716, "step": 4455 }, { - "epoch": 0.16, - "learning_rate": 4.999975059719296e-05, - "loss": 0.3288, + "epoch": 0.1607380978123761, + "grad_norm": 0.2861575782299042, + "learning_rate": 4.9999399087133044e-05, + "loss": 0.4905, "step": 4460 }, { - "epoch": 0.16, - "learning_rate": 4.9999737710278485e-05, - "loss": 0.3454, + "epoch": 0.16091829747360076, + "grad_norm": 0.22994907200336456, + "learning_rate": 4.99993786841517e-05, + "loss": 
0.486, "step": 4465 }, { - "epoch": 0.16, - "learning_rate": 4.99997244987589e-05, - "loss": 0.3426, + "epoch": 0.16109849713482538, + "grad_norm": 0.17052839696407318, + "learning_rate": 4.9999357940558656e-05, + "loss": 0.4374, "step": 4470 }, { - "epoch": 0.16, - "learning_rate": 4.999971096263436e-05, - "loss": 0.3203, + "epoch": 0.16127869679605003, + "grad_norm": 0.20029401779174805, + "learning_rate": 4.99993368563542e-05, + "loss": 0.4667, "step": 4475 }, { - "epoch": 0.16, - "learning_rate": 4.999969710190505e-05, - "loss": 0.3542, + "epoch": 0.16145889645727465, + "grad_norm": 0.26389387249946594, + "learning_rate": 4.9999315431538616e-05, + "loss": 0.4532, "step": 4480 }, { - "epoch": 0.16, - "learning_rate": 4.999968291657115e-05, - "loss": 0.3207, + "epoch": 0.1616390961184993, + "grad_norm": 0.27325350046157837, + "learning_rate": 4.999929366611221e-05, + "loss": 0.4707, "step": 4485 }, { - "epoch": 0.16, - "learning_rate": 4.999966840663285e-05, - "loss": 0.336, + "epoch": 0.16181929577972393, + "grad_norm": 0.28567302227020264, + "learning_rate": 4.9999271560075256e-05, + "loss": 0.4888, "step": 4490 }, { - "epoch": 0.16, - "learning_rate": 4.999965357209032e-05, - "loss": 0.3624, + "epoch": 0.16199949544094858, + "grad_norm": 0.16461628675460815, + "learning_rate": 4.999924911342807e-05, + "loss": 0.4651, "step": 4495 }, { - "epoch": 0.16, - "learning_rate": 4.999963841294377e-05, - "loss": 0.3367, + "epoch": 0.1621796951021732, + "grad_norm": 0.1844627559185028, + "learning_rate": 4.999922632617096e-05, + "loss": 0.4724, "step": 4500 }, { - "epoch": 0.16, - "eval_loss": 0.3431379199028015, - "eval_runtime": 10.5416, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 0.1621796951021732, + "eval_loss": 0.5034152865409851, + "eval_runtime": 3.5158, + "eval_samples_per_second": 28.443, + "eval_steps_per_second": 7.111, "step": 4500 }, { - "epoch": 0.16, - "learning_rate": 4.999962292919339e-05, - "loss": 0.3374, + "epoch": 0.16235989476339785, + "grad_norm": 0.21949437260627747, + "learning_rate": 4.999920319830423e-05, + "loss": 0.4663, "step": 4505 }, { - "epoch": 0.16, - "learning_rate": 4.9999607120839374e-05, - "loss": 0.345, + "epoch": 0.16254009442462247, + "grad_norm": 0.23475132882595062, + "learning_rate": 4.9999179729828195e-05, + "loss": 0.4849, "step": 4510 }, { - "epoch": 0.16, - "learning_rate": 4.9999590987881944e-05, - "loss": 0.3261, + "epoch": 0.16272029408584712, + "grad_norm": 0.20308546721935272, + "learning_rate": 4.999915592074318e-05, + "loss": 0.4665, "step": 4515 }, { - "epoch": 0.16, - "learning_rate": 4.99995745303213e-05, - "loss": 0.303, + "epoch": 0.16290049374707175, + "grad_norm": 0.18522381782531738, + "learning_rate": 4.9999131771049496e-05, + "loss": 0.4843, "step": 4520 }, { - "epoch": 0.16, - "learning_rate": 4.999955774815766e-05, - "loss": 0.3219, + "epoch": 0.1630806934082964, + "grad_norm": 0.2702092230319977, + "learning_rate": 4.9999107280747484e-05, + "loss": 0.5005, "step": 4525 }, { - "epoch": 0.16, - "learning_rate": 4.999954064139122e-05, - "loss": 0.3604, + "epoch": 0.16326089306952102, + "grad_norm": 0.20005741715431213, + "learning_rate": 4.999908244983748e-05, + "loss": 0.4915, "step": 4530 }, { - "epoch": 0.16, - "learning_rate": 4.999952321002224e-05, - "loss": 0.3528, + "epoch": 0.16344109273074567, + "grad_norm": 0.2156675010919571, + "learning_rate": 4.9999057278319817e-05, + "loss": 0.4701, "step": 4535 }, { - "epoch": 0.16, - "learning_rate": 4.9999505454050915e-05, - "loss": 0.3385, + 
"epoch": 0.1636212923919703, + "grad_norm": 0.2057947814464569, + "learning_rate": 4.999903176619484e-05, + "loss": 0.4371, "step": 4540 }, { - "epoch": 0.16, - "learning_rate": 4.9999487373477486e-05, - "loss": 0.344, + "epoch": 0.16380149205319494, + "grad_norm": 0.20522654056549072, + "learning_rate": 4.9999005913462896e-05, + "loss": 0.474, "step": 4545 }, { - "epoch": 0.16, - "learning_rate": 4.99994689683022e-05, - "loss": 0.3397, + "epoch": 0.16398169171441956, + "grad_norm": 0.20008191466331482, + "learning_rate": 4.999897972012433e-05, + "loss": 0.4826, "step": 4550 }, { - "epoch": 0.16, - "learning_rate": 4.999945023852527e-05, - "loss": 0.3112, + "epoch": 0.16416189137564421, + "grad_norm": 0.2456914186477661, + "learning_rate": 4.999895318617951e-05, + "loss": 0.47, "step": 4555 }, { - "epoch": 0.16, - "learning_rate": 4.999943118414696e-05, - "loss": 0.361, + "epoch": 0.16434209103686884, + "grad_norm": 0.21113859117031097, + "learning_rate": 4.999892631162879e-05, + "loss": 0.4505, "step": 4560 }, { - "epoch": 0.16, - "learning_rate": 4.9999411805167515e-05, - "loss": 0.3396, + "epoch": 0.1645222906980935, + "grad_norm": 0.2364521026611328, + "learning_rate": 4.999889909647254e-05, + "loss": 0.4626, "step": 4565 }, { - "epoch": 0.16, - "learning_rate": 4.999939210158718e-05, - "loss": 0.3423, + "epoch": 0.16470249035931814, + "grad_norm": 0.29241201281547546, + "learning_rate": 4.9998871540711126e-05, + "loss": 0.5229, "step": 4570 }, { - "epoch": 0.16, - "learning_rate": 4.999937207340622e-05, - "loss": 0.3222, + "epoch": 0.16488269002054276, + "grad_norm": 0.22053126990795135, + "learning_rate": 4.9998843644344926e-05, + "loss": 0.5066, "step": 4575 }, { - "epoch": 0.16, - "learning_rate": 4.999935172062488e-05, - "loss": 0.2994, + "epoch": 0.1650628896817674, + "grad_norm": 0.2373570203781128, + "learning_rate": 4.999881540737433e-05, + "loss": 0.4642, "step": 4580 }, { - "epoch": 0.16, - "learning_rate": 4.9999331043243436e-05, - "loss": 0.3463, + "epoch": 0.16524308934299203, + "grad_norm": 0.24440720677375793, + "learning_rate": 4.999878682979972e-05, + "loss": 0.4755, "step": 4585 }, { - "epoch": 0.16, - "learning_rate": 4.9999310041262154e-05, - "loss": 0.3634, + "epoch": 0.16542328900421668, + "grad_norm": 0.29946649074554443, + "learning_rate": 4.999875791162146e-05, + "loss": 0.4903, "step": 4590 }, { - "epoch": 0.16, - "learning_rate": 4.9999288714681306e-05, - "loss": 0.3839, + "epoch": 0.1656034886654413, + "grad_norm": 0.2605338394641876, + "learning_rate": 4.9998728652839974e-05, + "loss": 0.478, "step": 4595 }, { - "epoch": 0.16, - "learning_rate": 4.999926706350117e-05, - "loss": 0.345, + "epoch": 0.16578368832666596, + "grad_norm": 0.20642036199569702, + "learning_rate": 4.999869905345565e-05, + "loss": 0.4739, "step": 4600 }, { - "epoch": 0.16, - "learning_rate": 4.999924508772202e-05, - "loss": 0.3378, + "epoch": 0.16596388798789058, + "grad_norm": 0.2667411267757416, + "learning_rate": 4.999866911346889e-05, + "loss": 0.4881, "step": 4605 }, { - "epoch": 0.16, - "learning_rate": 4.9999222787344156e-05, - "loss": 0.3724, + "epoch": 0.16614408764911523, + "grad_norm": 0.20082050561904907, + "learning_rate": 4.999863883288011e-05, + "loss": 0.4852, "step": 4610 }, { - "epoch": 0.16, - "learning_rate": 4.999920016236786e-05, - "loss": 0.3219, + "epoch": 0.16632428731033985, + "grad_norm": 0.27822422981262207, + "learning_rate": 4.99986082116897e-05, + "loss": 0.4941, "step": 4615 }, { - "epoch": 0.16, - "learning_rate": 4.9999177212793424e-05, - "loss": 
0.353, + "epoch": 0.1665044869715645, + "grad_norm": 0.28651654720306396, + "learning_rate": 4.999857724989811e-05, + "loss": 0.5, "step": 4620 }, { - "epoch": 0.16, - "learning_rate": 4.999915393862115e-05, - "loss": 0.3162, + "epoch": 0.16668468663278913, + "grad_norm": 0.2600594162940979, + "learning_rate": 4.9998545947505734e-05, + "loss": 0.48, "step": 4625 }, { - "epoch": 0.16, - "learning_rate": 4.9999130339851325e-05, - "loss": 0.3491, + "epoch": 0.16686488629401378, + "grad_norm": 0.2715243697166443, + "learning_rate": 4.999851430451301e-05, + "loss": 0.472, "step": 4630 }, { - "epoch": 0.16, - "learning_rate": 4.9999106416484274e-05, - "loss": 0.3276, + "epoch": 0.1670450859552384, + "grad_norm": 0.259139746427536, + "learning_rate": 4.999848232092037e-05, + "loss": 0.4986, "step": 4635 }, { - "epoch": 0.16, - "learning_rate": 4.99990821685203e-05, - "loss": 0.3685, + "epoch": 0.16722528561646305, + "grad_norm": 0.24041247367858887, + "learning_rate": 4.999844999672825e-05, + "loss": 0.4546, "step": 4640 }, { - "epoch": 0.16, - "learning_rate": 4.999905759595972e-05, - "loss": 0.3249, + "epoch": 0.16740548527768767, + "grad_norm": 0.2729252278804779, + "learning_rate": 4.9998417331937086e-05, + "loss": 0.4713, "step": 4645 }, { - "epoch": 0.16, - "learning_rate": 4.9999032698802853e-05, - "loss": 0.3665, + "epoch": 0.16758568493891232, + "grad_norm": 0.25324806571006775, + "learning_rate": 4.999838432654733e-05, + "loss": 0.4919, "step": 4650 }, { - "epoch": 0.16, - "learning_rate": 4.999900747705002e-05, - "loss": 0.3448, + "epoch": 0.16776588460013694, + "grad_norm": 0.25361695885658264, + "learning_rate": 4.9998350980559427e-05, + "loss": 0.4811, "step": 4655 }, { - "epoch": 0.16, - "learning_rate": 4.9998981930701556e-05, - "loss": 0.3512, + "epoch": 0.1679460842613616, + "grad_norm": 0.24803782999515533, + "learning_rate": 4.999831729397383e-05, + "loss": 0.4998, "step": 4660 }, { - "epoch": 0.16, - "learning_rate": 4.999895605975778e-05, - "loss": 0.3621, + "epoch": 0.16812628392258622, + "grad_norm": 0.28923875093460083, + "learning_rate": 4.9998283266791e-05, + "loss": 0.4713, "step": 4665 }, { - "epoch": 0.16, - "learning_rate": 4.9998929864219035e-05, - "loss": 0.3259, + "epoch": 0.16830648358381087, + "grad_norm": 0.17523469030857086, + "learning_rate": 4.9998248899011405e-05, + "loss": 0.4995, "step": 4670 }, { - "epoch": 0.16, - "learning_rate": 4.999890334408566e-05, - "loss": 0.3641, + "epoch": 0.1684866832450355, + "grad_norm": 0.21582381427288055, + "learning_rate": 4.9998214190635495e-05, + "loss": 0.4943, "step": 4675 }, { - "epoch": 0.16, - "learning_rate": 4.9998876499358e-05, - "loss": 0.3532, + "epoch": 0.16866688290626014, + "grad_norm": 0.35510435700416565, + "learning_rate": 4.999817914166377e-05, + "loss": 0.5125, "step": 4680 }, { - "epoch": 0.16, - "learning_rate": 4.9998849330036414e-05, - "loss": 0.3338, + "epoch": 0.16884708256748476, + "grad_norm": 0.255281001329422, + "learning_rate": 4.9998143752096684e-05, + "loss": 0.5037, "step": 4685 }, { - "epoch": 0.17, - "learning_rate": 4.999882183612123e-05, - "loss": 0.3298, + "epoch": 0.1690272822287094, + "grad_norm": 0.26469287276268005, + "learning_rate": 4.999810802193473e-05, + "loss": 0.4958, "step": 4690 }, { - "epoch": 0.17, - "learning_rate": 4.999879401761283e-05, - "loss": 0.3281, + "epoch": 0.16920748188993404, + "grad_norm": 0.23432576656341553, + "learning_rate": 4.9998071951178405e-05, + "loss": 0.4646, "step": 4695 }, { - "epoch": 0.17, - "learning_rate": 4.9998765874511554e-05, - 
"loss": 0.3606, + "epoch": 0.1693876815511587, + "grad_norm": 0.19142989814281464, + "learning_rate": 4.999803553982818e-05, + "loss": 0.4931, "step": 4700 }, { - "epoch": 0.17, - "learning_rate": 4.999873740681779e-05, - "loss": 0.3785, + "epoch": 0.1695678812123833, + "grad_norm": 0.21128664910793304, + "learning_rate": 4.999799878788456e-05, + "loss": 0.4901, "step": 4705 }, { - "epoch": 0.17, - "learning_rate": 4.9998708614531885e-05, - "loss": 0.3164, + "epoch": 0.16974808087360796, + "grad_norm": 0.22904910147190094, + "learning_rate": 4.999796169534805e-05, + "loss": 0.4975, "step": 4710 }, { - "epoch": 0.17, - "learning_rate": 4.999867949765422e-05, - "loss": 0.334, + "epoch": 0.16992828053483258, + "grad_norm": 0.27078938484191895, + "learning_rate": 4.999792426221915e-05, + "loss": 0.4434, "step": 4715 }, { - "epoch": 0.17, - "learning_rate": 4.999865005618519e-05, - "loss": 0.3884, + "epoch": 0.17010848019605723, + "grad_norm": 0.2444290965795517, + "learning_rate": 4.9997886488498374e-05, + "loss": 0.4923, "step": 4720 }, { - "epoch": 0.17, - "learning_rate": 4.999862029012516e-05, - "loss": 0.3537, + "epoch": 0.17028867985728185, + "grad_norm": 0.2548987865447998, + "learning_rate": 4.999784837418623e-05, + "loss": 0.4474, "step": 4725 }, { - "epoch": 0.17, - "learning_rate": 4.9998590199474516e-05, - "loss": 0.2878, + "epoch": 0.1704688795185065, + "grad_norm": 0.2576419711112976, + "learning_rate": 4.999780991928325e-05, + "loss": 0.5217, "step": 4730 }, { - "epoch": 0.17, - "learning_rate": 4.999855978423365e-05, - "loss": 0.3349, + "epoch": 0.17064907917973116, + "grad_norm": 0.2640141546726227, + "learning_rate": 4.999777112378994e-05, + "loss": 0.4983, "step": 4735 }, { - "epoch": 0.17, - "learning_rate": 4.999852904440296e-05, - "loss": 0.3547, + "epoch": 0.17082927884095578, + "grad_norm": 0.2303144633769989, + "learning_rate": 4.999773198770684e-05, + "loss": 0.477, "step": 4740 }, { - "epoch": 0.17, - "learning_rate": 4.999849797998286e-05, - "loss": 0.3466, + "epoch": 0.17100947850218043, + "grad_norm": 0.22392189502716064, + "learning_rate": 4.999769251103449e-05, + "loss": 0.4617, "step": 4745 }, { - "epoch": 0.17, - "learning_rate": 4.999846659097373e-05, - "loss": 0.3519, + "epoch": 0.17118967816340505, + "grad_norm": 0.26157307624816895, + "learning_rate": 4.9997652693773414e-05, + "loss": 0.4668, "step": 4750 }, { - "epoch": 0.17, - "learning_rate": 4.999843487737598e-05, - "loss": 0.3389, + "epoch": 0.1713698778246297, + "grad_norm": 0.24970059096813202, + "learning_rate": 4.999761253592415e-05, + "loss": 0.4901, "step": 4755 }, { - "epoch": 0.17, - "learning_rate": 4.999840283919003e-05, - "loss": 0.3458, + "epoch": 0.17155007748585432, + "grad_norm": 0.25207483768463135, + "learning_rate": 4.999757203748727e-05, + "loss": 0.4811, "step": 4760 }, { - "epoch": 0.17, - "learning_rate": 4.999837047641629e-05, - "loss": 0.3598, + "epoch": 0.17173027714707897, + "grad_norm": 0.23375752568244934, + "learning_rate": 4.99975311984633e-05, + "loss": 0.4795, "step": 4765 }, { - "epoch": 0.17, - "learning_rate": 4.9998337789055195e-05, - "loss": 0.3349, + "epoch": 0.1719104768083036, + "grad_norm": 0.2202087789773941, + "learning_rate": 4.999749001885281e-05, + "loss": 0.5194, "step": 4770 }, { - "epoch": 0.17, - "learning_rate": 4.9998304777107155e-05, - "loss": 0.3621, + "epoch": 0.17209067646952825, + "grad_norm": 0.17962399125099182, + "learning_rate": 4.999744849865636e-05, + "loss": 0.4766, "step": 4775 }, { - "epoch": 0.17, - "learning_rate": 
4.99982714405726e-05, - "loss": 0.3359, + "epoch": 0.17227087613075287, + "grad_norm": 0.18169254064559937, + "learning_rate": 4.999740663787452e-05, + "loss": 0.5046, "step": 4780 }, { - "epoch": 0.17, - "learning_rate": 4.999823777945196e-05, - "loss": 0.3329, + "epoch": 0.17245107579197752, + "grad_norm": 0.21856611967086792, + "learning_rate": 4.999736443650784e-05, + "loss": 0.4918, "step": 4785 }, { - "epoch": 0.17, - "learning_rate": 4.999820379374568e-05, - "loss": 0.3453, + "epoch": 0.17263127545320214, + "grad_norm": 0.24348682165145874, + "learning_rate": 4.999732189455692e-05, + "loss": 0.4751, "step": 4790 }, { - "epoch": 0.17, - "learning_rate": 4.99981694834542e-05, - "loss": 0.3564, + "epoch": 0.1728114751144268, + "grad_norm": 0.22899436950683594, + "learning_rate": 4.9997279012022324e-05, + "loss": 0.4198, "step": 4795 }, { - "epoch": 0.17, - "learning_rate": 4.999813484857796e-05, - "loss": 0.3436, + "epoch": 0.17299167477565142, + "grad_norm": 0.26833006739616394, + "learning_rate": 4.9997235788904646e-05, + "loss": 0.4782, "step": 4800 }, { - "epoch": 0.17, - "learning_rate": 4.9998099889117416e-05, - "loss": 0.35, + "epoch": 0.17317187443687607, + "grad_norm": 0.2631605565547943, + "learning_rate": 4.9997192225204466e-05, + "loss": 0.4366, "step": 4805 }, { - "epoch": 0.17, - "learning_rate": 4.9998064605073025e-05, - "loss": 0.336, + "epoch": 0.1733520740981007, + "grad_norm": 0.22278451919555664, + "learning_rate": 4.999714832092238e-05, + "loss": 0.4783, "step": 4810 }, { - "epoch": 0.17, - "learning_rate": 4.999802899644524e-05, - "loss": 0.3788, + "epoch": 0.17353227375932534, + "grad_norm": 0.22488220036029816, + "learning_rate": 4.999710407605899e-05, + "loss": 0.4497, "step": 4815 }, { - "epoch": 0.17, - "learning_rate": 4.9997993063234505e-05, - "loss": 0.3505, + "epoch": 0.17371247342054996, + "grad_norm": 0.19599469006061554, + "learning_rate": 4.99970594906149e-05, + "loss": 0.5147, "step": 4820 }, { - "epoch": 0.17, - "learning_rate": 4.9997956805441316e-05, - "loss": 0.36, + "epoch": 0.1738926730817746, + "grad_norm": 0.22859926521778107, + "learning_rate": 4.9997014564590706e-05, + "loss": 0.4891, "step": 4825 }, { - "epoch": 0.17, - "learning_rate": 4.999792022306613e-05, - "loss": 0.3545, + "epoch": 0.17407287274299924, + "grad_norm": 0.29516199231147766, + "learning_rate": 4.9996969297987036e-05, + "loss": 0.454, "step": 4830 }, { - "epoch": 0.17, - "learning_rate": 4.999788331610943e-05, - "loss": 0.3646, + "epoch": 0.17425307240422389, + "grad_norm": 0.2656870484352112, + "learning_rate": 4.9996923690804495e-05, + "loss": 0.4491, "step": 4835 }, { - "epoch": 0.17, - "learning_rate": 4.999784608457169e-05, - "loss": 0.3414, + "epoch": 0.1744332720654485, + "grad_norm": 0.2088516801595688, + "learning_rate": 4.999687774304371e-05, + "loss": 0.489, "step": 4840 }, { - "epoch": 0.17, - "learning_rate": 4.9997808528453386e-05, - "loss": 0.3378, + "epoch": 0.17461347172667316, + "grad_norm": 0.20423473417758942, + "learning_rate": 4.99968314547053e-05, + "loss": 0.5235, "step": 4845 }, { - "epoch": 0.17, - "learning_rate": 4.999777064775502e-05, - "loss": 0.3516, + "epoch": 0.17479367138789778, + "grad_norm": 0.2307443767786026, + "learning_rate": 4.999678482578991e-05, + "loss": 0.4527, "step": 4850 }, { - "epoch": 0.17, - "learning_rate": 4.999773244247706e-05, - "loss": 0.343, + "epoch": 0.17497387104912243, + "grad_norm": 0.16717872023582458, + "learning_rate": 4.9996737856298156e-05, + "loss": 0.4769, "step": 4855 }, { - "epoch": 0.17, - 
"learning_rate": 4.9997693912620035e-05, - "loss": 0.3429, + "epoch": 0.17515407071034705, + "grad_norm": 0.23562130331993103, + "learning_rate": 4.99966905462307e-05, + "loss": 0.4787, "step": 4860 }, { - "epoch": 0.17, - "learning_rate": 4.999765505818442e-05, - "loss": 0.3423, + "epoch": 0.1753342703715717, + "grad_norm": 0.2515668272972107, + "learning_rate": 4.9996642895588166e-05, + "loss": 0.4721, "step": 4865 }, { - "epoch": 0.17, - "learning_rate": 4.999761587917073e-05, - "loss": 0.321, + "epoch": 0.17551447003279633, + "grad_norm": 0.20563237369060516, + "learning_rate": 4.9996594904371215e-05, + "loss": 0.481, "step": 4870 }, { - "epoch": 0.17, - "learning_rate": 4.999757637557948e-05, - "loss": 0.3474, + "epoch": 0.17569466969402098, + "grad_norm": 0.24153757095336914, + "learning_rate": 4.99965465725805e-05, + "loss": 0.49, "step": 4875 }, { - "epoch": 0.17, - "learning_rate": 4.9997536547411165e-05, - "loss": 0.3708, + "epoch": 0.1758748693552456, + "grad_norm": 0.19717536866664886, + "learning_rate": 4.999649790021667e-05, + "loss": 0.4529, "step": 4880 }, { - "epoch": 0.17, - "learning_rate": 4.999749639466632e-05, - "loss": 0.3307, + "epoch": 0.17605506901647025, + "grad_norm": 0.26905906200408936, + "learning_rate": 4.99964488872804e-05, + "loss": 0.4775, "step": 4885 }, { - "epoch": 0.17, - "learning_rate": 4.999745591734545e-05, - "loss": 0.3124, + "epoch": 0.17623526867769487, + "grad_norm": 0.18942050635814667, + "learning_rate": 4.999639953377235e-05, + "loss": 0.4558, "step": 4890 }, { - "epoch": 0.17, - "learning_rate": 4.999741511544909e-05, - "loss": 0.3249, + "epoch": 0.17641546833891952, + "grad_norm": 0.25289055705070496, + "learning_rate": 4.99963498396932e-05, + "loss": 0.4752, "step": 4895 }, { - "epoch": 0.17, - "learning_rate": 4.9997373988977776e-05, - "loss": 0.321, + "epoch": 0.17659566800014415, + "grad_norm": 0.28434181213378906, + "learning_rate": 4.9996299805043625e-05, + "loss": 0.498, "step": 4900 }, { - "epoch": 0.17, - "learning_rate": 4.999733253793203e-05, - "loss": 0.3264, + "epoch": 0.1767758676613688, + "grad_norm": 0.2828446328639984, + "learning_rate": 4.99962494298243e-05, + "loss": 0.498, "step": 4905 }, { - "epoch": 0.17, - "learning_rate": 4.999729076231239e-05, - "loss": 0.3436, + "epoch": 0.17695606732259345, + "grad_norm": 0.24182403087615967, + "learning_rate": 4.999619871403592e-05, + "loss": 0.4959, "step": 4910 }, { - "epoch": 0.17, - "learning_rate": 4.999724866211941e-05, - "loss": 0.3249, + "epoch": 0.17713626698381807, + "grad_norm": 0.2429310828447342, + "learning_rate": 4.999614765767917e-05, + "loss": 0.4828, "step": 4915 }, { - "epoch": 0.17, - "learning_rate": 4.999720623735363e-05, - "loss": 0.3331, + "epoch": 0.17731646664504272, + "grad_norm": 0.2273981124162674, + "learning_rate": 4.999609626075475e-05, + "loss": 0.4961, "step": 4920 }, { - "epoch": 0.17, - "learning_rate": 4.9997163488015595e-05, - "loss": 0.3406, + "epoch": 0.17749666630626734, + "grad_norm": 0.24011553823947906, + "learning_rate": 4.999604452326335e-05, + "loss": 0.4782, "step": 4925 }, { - "epoch": 0.17, - "learning_rate": 4.999712041410587e-05, - "loss": 0.3397, + "epoch": 0.177676865967492, + "grad_norm": 0.22934234142303467, + "learning_rate": 4.999599244520569e-05, + "loss": 0.5014, "step": 4930 }, { - "epoch": 0.17, - "learning_rate": 4.9997077015625007e-05, - "loss": 0.3426, + "epoch": 0.17785706562871662, + "grad_norm": 0.19686634838581085, + "learning_rate": 4.999594002658247e-05, + "loss": 0.446, "step": 4935 }, { - "epoch": 0.17, - 
"learning_rate": 4.9997033292573575e-05, - "loss": 0.3314, + "epoch": 0.17803726528994127, + "grad_norm": 0.2651787996292114, + "learning_rate": 4.999588726739441e-05, + "loss": 0.5142, "step": 4940 }, { - "epoch": 0.17, - "learning_rate": 4.999698924495214e-05, - "loss": 0.3547, + "epoch": 0.1782174649511659, + "grad_norm": 0.1980464607477188, + "learning_rate": 4.999583416764222e-05, + "loss": 0.4975, "step": 4945 }, { - "epoch": 0.17, - "learning_rate": 4.999694487276128e-05, - "loss": 0.3362, + "epoch": 0.17839766461239054, + "grad_norm": 0.24422787129878998, + "learning_rate": 4.999578072732664e-05, + "loss": 0.455, "step": 4950 }, { - "epoch": 0.17, - "learning_rate": 4.999690017600155e-05, - "loss": 0.3154, + "epoch": 0.17857786427361516, + "grad_norm": 0.31826359033584595, + "learning_rate": 4.9995726946448375e-05, + "loss": 0.5055, "step": 4955 }, { - "epoch": 0.17, - "learning_rate": 4.999685515467355e-05, - "loss": 0.3549, + "epoch": 0.1787580639348398, + "grad_norm": 0.26964086294174194, + "learning_rate": 4.999567282500817e-05, + "loss": 0.4871, "step": 4960 }, { - "epoch": 0.17, - "learning_rate": 4.999680980877787e-05, - "loss": 0.3302, + "epoch": 0.17893826359606443, + "grad_norm": 0.18171383440494537, + "learning_rate": 4.999561836300676e-05, + "loss": 0.4757, "step": 4965 }, { - "epoch": 0.17, - "learning_rate": 4.999676413831507e-05, - "loss": 0.3415, + "epoch": 0.17911846325728908, + "grad_norm": 0.26201415061950684, + "learning_rate": 4.99955635604449e-05, + "loss": 0.4891, "step": 4970 }, { - "epoch": 0.18, - "learning_rate": 4.999671814328577e-05, - "loss": 0.3449, + "epoch": 0.1792986629185137, + "grad_norm": 0.24988138675689697, + "learning_rate": 4.999550841732331e-05, + "loss": 0.4845, "step": 4975 }, { - "epoch": 0.18, - "learning_rate": 4.9996671823690565e-05, - "loss": 0.3202, + "epoch": 0.17947886257973836, + "grad_norm": 0.21902771294116974, + "learning_rate": 4.999545293364277e-05, + "loss": 0.4645, "step": 4980 }, { - "epoch": 0.18, - "learning_rate": 4.9996625179530046e-05, - "loss": 0.3373, + "epoch": 0.17965906224096298, + "grad_norm": 0.1887025535106659, + "learning_rate": 4.999539710940402e-05, + "loss": 0.4777, "step": 4985 }, { - "epoch": 0.18, - "learning_rate": 4.999657821080481e-05, - "loss": 0.3344, + "epoch": 0.17983926190218763, + "grad_norm": 0.21480071544647217, + "learning_rate": 4.9995340944607824e-05, + "loss": 0.499, "step": 4990 }, { - "epoch": 0.18, - "learning_rate": 4.999653091751549e-05, - "loss": 0.3466, + "epoch": 0.18001946156341225, + "grad_norm": 0.2067321240901947, + "learning_rate": 4.9995284439254944e-05, + "loss": 0.4782, "step": 4995 }, { - "epoch": 0.18, - "learning_rate": 4.9996483299662694e-05, - "loss": 0.3277, + "epoch": 0.1801996612246369, + "grad_norm": 0.22930487990379333, + "learning_rate": 4.999522759334616e-05, + "loss": 0.5074, "step": 5000 }, { - "epoch": 0.18, - "eval_loss": 0.33953267335891724, - "eval_runtime": 10.5352, - "eval_samples_per_second": 9.492, - "eval_steps_per_second": 9.492, + "epoch": 0.1801996612246369, + "eval_loss": 0.5006341338157654, + "eval_runtime": 3.5194, + "eval_samples_per_second": 28.414, + "eval_steps_per_second": 7.104, "step": 5000 }, { - "epoch": 0.18, - "learning_rate": 4.999643535724703e-05, - "loss": 0.345, + "epoch": 0.18037986088586153, + "grad_norm": 0.23178088665008545, + "learning_rate": 4.999517040688223e-05, + "loss": 0.4822, "step": 5005 }, { - "epoch": 0.18, - "learning_rate": 4.999638709026913e-05, - "loss": 0.3589, + "epoch": 0.18056006054708618, + "grad_norm": 
0.21988582611083984, + "learning_rate": 4.999511287986395e-05, + "loss": 0.4559, "step": 5010 }, { - "epoch": 0.18, - "learning_rate": 4.999633849872961e-05, - "loss": 0.3331, + "epoch": 0.1807402602083108, + "grad_norm": 0.25729215145111084, + "learning_rate": 4.999505501229209e-05, + "loss": 0.4693, "step": 5015 }, { - "epoch": 0.18, - "learning_rate": 4.999628958262911e-05, - "loss": 0.3244, + "epoch": 0.18092045986953545, + "grad_norm": 0.23540961742401123, + "learning_rate": 4.999499680416745e-05, + "loss": 0.5181, "step": 5020 }, { - "epoch": 0.18, - "learning_rate": 4.9996240341968265e-05, - "loss": 0.3285, + "epoch": 0.18110065953076007, + "grad_norm": 0.17820635437965393, + "learning_rate": 4.9994938255490814e-05, + "loss": 0.4853, "step": 5025 }, { - "epoch": 0.18, - "learning_rate": 4.999619077674771e-05, - "loss": 0.3572, + "epoch": 0.18128085919198472, + "grad_norm": 0.2419961541891098, + "learning_rate": 4.999487936626299e-05, + "loss": 0.4927, "step": 5030 }, { - "epoch": 0.18, - "learning_rate": 4.999614088696809e-05, - "loss": 0.3537, + "epoch": 0.18146105885320934, + "grad_norm": 0.2524636685848236, + "learning_rate": 4.999482013648476e-05, + "loss": 0.4986, "step": 5035 }, { - "epoch": 0.18, - "learning_rate": 4.9996090672630056e-05, - "loss": 0.3523, + "epoch": 0.181641258514434, + "grad_norm": 0.18317490816116333, + "learning_rate": 4.999476056615696e-05, + "loss": 0.4806, "step": 5040 }, { - "epoch": 0.18, - "learning_rate": 4.999604013373426e-05, - "loss": 0.3402, + "epoch": 0.18182145817565862, + "grad_norm": 0.2592169940471649, + "learning_rate": 4.999470065528038e-05, + "loss": 0.4741, "step": 5045 }, { - "epoch": 0.18, - "learning_rate": 4.999598927028135e-05, - "loss": 0.3613, + "epoch": 0.18200165783688327, + "grad_norm": 0.24432708323001862, + "learning_rate": 4.999464040385584e-05, + "loss": 0.4726, "step": 5050 }, { - "epoch": 0.18, - "learning_rate": 4.9995938082271996e-05, - "loss": 0.356, + "epoch": 0.1821818574981079, + "grad_norm": 0.23248982429504395, + "learning_rate": 4.999457981188417e-05, + "loss": 0.4759, "step": 5055 }, { - "epoch": 0.18, - "learning_rate": 4.999588656970686e-05, - "loss": 0.3402, + "epoch": 0.18236205715933254, + "grad_norm": 0.23487761616706848, + "learning_rate": 4.999451887936618e-05, + "loss": 0.4953, "step": 5060 }, { - "epoch": 0.18, - "learning_rate": 4.999583473258661e-05, - "loss": 0.3801, + "epoch": 0.18254225682055716, + "grad_norm": 0.28242379426956177, + "learning_rate": 4.9994457606302714e-05, + "loss": 0.4772, "step": 5065 }, { - "epoch": 0.18, - "learning_rate": 4.999578257091192e-05, - "loss": 0.341, + "epoch": 0.18272245648178181, + "grad_norm": 0.1956668496131897, + "learning_rate": 4.999439599269461e-05, + "loss": 0.5024, "step": 5070 }, { - "epoch": 0.18, - "learning_rate": 4.999573008468347e-05, - "loss": 0.3114, + "epoch": 0.18290265614300646, + "grad_norm": 0.25298696756362915, + "learning_rate": 4.999433403854269e-05, + "loss": 0.5046, "step": 5075 }, { - "epoch": 0.18, - "learning_rate": 4.9995677273901924e-05, - "loss": 0.356, + "epoch": 0.1830828558042311, + "grad_norm": 0.2178865671157837, + "learning_rate": 4.999427174384781e-05, + "loss": 0.5063, "step": 5080 }, { - "epoch": 0.18, - "learning_rate": 4.999562413856799e-05, - "loss": 0.3366, + "epoch": 0.18326305546545574, + "grad_norm": 0.2657603621482849, + "learning_rate": 4.9994209108610814e-05, + "loss": 0.4846, "step": 5085 }, { - "epoch": 0.18, - "learning_rate": 4.999557067868235e-05, - "loss": 0.3245, + "epoch": 0.18344325512668036, + 
"grad_norm": 0.23558183014392853, + "learning_rate": 4.999414613283256e-05, + "loss": 0.4673, "step": 5090 }, { - "epoch": 0.18, - "learning_rate": 4.999551689424569e-05, - "loss": 0.3187, + "epoch": 0.183623454787905, + "grad_norm": 0.23601466417312622, + "learning_rate": 4.999408281651391e-05, + "loss": 0.5041, "step": 5095 }, { - "epoch": 0.18, - "learning_rate": 4.999546278525872e-05, - "loss": 0.3288, + "epoch": 0.18380365444912963, + "grad_norm": 0.22025693953037262, + "learning_rate": 4.999401915965572e-05, + "loss": 0.4699, "step": 5100 }, { - "epoch": 0.18, - "learning_rate": 4.9995408351722145e-05, - "loss": 0.35, + "epoch": 0.18398385411035428, + "grad_norm": 0.23485559225082397, + "learning_rate": 4.9993955162258844e-05, + "loss": 0.4927, "step": 5105 }, { - "epoch": 0.18, - "learning_rate": 4.999535359363666e-05, - "loss": 0.3453, + "epoch": 0.1841640537715789, + "grad_norm": 0.2333470582962036, + "learning_rate": 4.999389082432417e-05, + "loss": 0.472, "step": 5110 }, { - "epoch": 0.18, - "learning_rate": 4.999529851100298e-05, - "loss": 0.3534, + "epoch": 0.18434425343280356, + "grad_norm": 0.20295509696006775, + "learning_rate": 4.999382614585258e-05, + "loss": 0.4978, "step": 5115 }, { - "epoch": 0.18, - "learning_rate": 4.999524310382182e-05, - "loss": 0.3667, + "epoch": 0.18452445309402818, + "grad_norm": 0.23132069408893585, + "learning_rate": 4.999376112684495e-05, + "loss": 0.4796, "step": 5120 }, { - "epoch": 0.18, - "learning_rate": 4.9995187372093906e-05, - "loss": 0.3592, + "epoch": 0.18470465275525283, + "grad_norm": 0.19237832725048065, + "learning_rate": 4.9993695767302165e-05, + "loss": 0.5032, "step": 5125 }, { - "epoch": 0.18, - "learning_rate": 4.999513131581995e-05, - "loss": 0.3338, + "epoch": 0.18488485241647745, + "grad_norm": 0.19899766147136688, + "learning_rate": 4.999363006722511e-05, + "loss": 0.4785, "step": 5130 }, { - "epoch": 0.18, - "learning_rate": 4.9995074935000694e-05, - "loss": 0.3393, + "epoch": 0.1850650520777021, + "grad_norm": 0.22194600105285645, + "learning_rate": 4.999356402661469e-05, + "loss": 0.5207, "step": 5135 }, { - "epoch": 0.18, - "learning_rate": 4.999501822963686e-05, - "loss": 0.3725, + "epoch": 0.18524525173892673, + "grad_norm": 0.22985856235027313, + "learning_rate": 4.999349764547179e-05, + "loss": 0.4558, "step": 5140 }, { - "epoch": 0.18, - "learning_rate": 4.999496119972918e-05, - "loss": 0.3396, + "epoch": 0.18542545140015138, + "grad_norm": 0.2950509488582611, + "learning_rate": 4.9993430923797324e-05, + "loss": 0.4956, "step": 5145 }, { - "epoch": 0.18, - "learning_rate": 4.999490384527841e-05, - "loss": 0.3536, + "epoch": 0.185605651061376, + "grad_norm": 0.2260800004005432, + "learning_rate": 4.9993363861592204e-05, + "loss": 0.435, "step": 5150 }, { - "epoch": 0.18, - "learning_rate": 4.9994846166285284e-05, - "loss": 0.3628, + "epoch": 0.18578585072260065, + "grad_norm": 0.18863672018051147, + "learning_rate": 4.999329645885734e-05, + "loss": 0.5241, "step": 5155 }, { - "epoch": 0.18, - "learning_rate": 4.999478816275054e-05, - "loss": 0.343, + "epoch": 0.18596605038382527, + "grad_norm": 0.24623405933380127, + "learning_rate": 4.999322871559365e-05, + "loss": 0.4892, "step": 5160 }, { - "epoch": 0.18, - "learning_rate": 4.999472983467496e-05, - "loss": 0.3212, + "epoch": 0.18614625004504992, + "grad_norm": 0.2146414965391159, + "learning_rate": 4.999316063180206e-05, + "loss": 0.4762, "step": 5165 }, { - "epoch": 0.18, - "learning_rate": 4.9994671182059274e-05, - "loss": 0.3293, + "epoch": 
0.18632644970627454, + "grad_norm": 0.24258659780025482, + "learning_rate": 4.999309220748349e-05, + "loss": 0.4825, "step": 5170 }, { - "epoch": 0.18, - "learning_rate": 4.999461220490427e-05, - "loss": 0.3619, + "epoch": 0.1865066493674992, + "grad_norm": 0.24239155650138855, + "learning_rate": 4.999302344263889e-05, + "loss": 0.4796, "step": 5175 }, { - "epoch": 0.18, - "learning_rate": 4.999455290321068e-05, - "loss": 0.3588, + "epoch": 0.18668684902872382, + "grad_norm": 0.25734949111938477, + "learning_rate": 4.999295433726917e-05, + "loss": 0.5216, "step": 5180 }, { - "epoch": 0.18, - "learning_rate": 4.9994493276979304e-05, - "loss": 0.3385, + "epoch": 0.18686704868994847, + "grad_norm": 0.2938891649246216, + "learning_rate": 4.999288489137529e-05, + "loss": 0.4744, "step": 5185 }, { - "epoch": 0.18, - "learning_rate": 4.99944333262109e-05, - "loss": 0.3556, + "epoch": 0.1870472483511731, + "grad_norm": 0.20838968455791473, + "learning_rate": 4.999281510495819e-05, + "loss": 0.4786, "step": 5190 }, { - "epoch": 0.18, - "learning_rate": 4.999437305090625e-05, - "loss": 0.3072, + "epoch": 0.18722744801239774, + "grad_norm": 0.25164559483528137, + "learning_rate": 4.999274497801883e-05, + "loss": 0.523, "step": 5195 }, { - "epoch": 0.18, - "learning_rate": 4.999431245106615e-05, - "loss": 0.3087, + "epoch": 0.18740764767362236, + "grad_norm": 0.23008379340171814, + "learning_rate": 4.999267451055815e-05, + "loss": 0.4591, "step": 5200 }, { - "epoch": 0.18, - "learning_rate": 4.999425152669136e-05, - "loss": 0.3562, + "epoch": 0.187587847334847, + "grad_norm": 0.21487128734588623, + "learning_rate": 4.9992603702577124e-05, + "loss": 0.4903, "step": 5205 }, { - "epoch": 0.18, - "learning_rate": 4.9994190277782696e-05, - "loss": 0.3484, + "epoch": 0.18776804699607164, + "grad_norm": 0.2193828970193863, + "learning_rate": 4.999253255407671e-05, + "loss": 0.4357, "step": 5210 }, { - "epoch": 0.18, - "learning_rate": 4.9994128704340936e-05, - "loss": 0.3419, + "epoch": 0.1879482466572963, + "grad_norm": 0.20415742695331573, + "learning_rate": 4.999246106505788e-05, + "loss": 0.4428, "step": 5215 }, { - "epoch": 0.18, - "learning_rate": 4.999406680636689e-05, - "loss": 0.3368, + "epoch": 0.1881284463185209, + "grad_norm": 0.21222014725208282, + "learning_rate": 4.9992389235521606e-05, + "loss": 0.4969, "step": 5220 }, { - "epoch": 0.18, - "learning_rate": 4.9994004583861356e-05, - "loss": 0.3485, + "epoch": 0.18830864597974556, + "grad_norm": 0.2923332750797272, + "learning_rate": 4.999231706546886e-05, + "loss": 0.5121, "step": 5225 }, { - "epoch": 0.18, - "learning_rate": 4.999394203682515e-05, - "loss": 0.3424, + "epoch": 0.18848884564097018, + "grad_norm": 0.26699545979499817, + "learning_rate": 4.999224455490065e-05, + "loss": 0.4731, "step": 5230 }, { - "epoch": 0.18, - "learning_rate": 4.9993879165259074e-05, - "loss": 0.3392, + "epoch": 0.18866904530219483, + "grad_norm": 0.27180203795433044, + "learning_rate": 4.999217170381794e-05, + "loss": 0.4727, "step": 5235 }, { - "epoch": 0.18, - "learning_rate": 4.999381596916395e-05, - "loss": 0.3408, + "epoch": 0.18884924496341948, + "grad_norm": 0.23332376778125763, + "learning_rate": 4.999209851222172e-05, + "loss": 0.4673, "step": 5240 }, { - "epoch": 0.18, - "learning_rate": 4.9993752448540596e-05, - "loss": 0.3023, + "epoch": 0.1890294446246441, + "grad_norm": 0.19692248106002808, + "learning_rate": 4.9992024980113e-05, + "loss": 0.4728, "step": 5245 }, { - "epoch": 0.18, - "learning_rate": 4.999368860338983e-05, - "loss": 0.3318, + 
"epoch": 0.18920964428586876, + "grad_norm": 0.22370965778827667, + "learning_rate": 4.999195110749278e-05, + "loss": 0.4951, "step": 5250 }, { - "epoch": 0.18, - "learning_rate": 4.99936244337125e-05, - "loss": 0.3265, + "epoch": 0.18938984394709338, + "grad_norm": 0.20484954118728638, + "learning_rate": 4.9991876894362064e-05, + "loss": 0.4878, "step": 5255 }, { - "epoch": 0.19, - "learning_rate": 4.9993559939509426e-05, - "loss": 0.3451, + "epoch": 0.18957004360831803, + "grad_norm": 0.18314266204833984, + "learning_rate": 4.9991802340721866e-05, + "loss": 0.5089, "step": 5260 }, { - "epoch": 0.19, - "learning_rate": 4.9993495120781445e-05, - "loss": 0.3405, + "epoch": 0.18975024326954265, + "grad_norm": 0.1618998497724533, + "learning_rate": 4.99917274465732e-05, + "loss": 0.5125, "step": 5265 }, { - "epoch": 0.19, - "learning_rate": 4.9993429977529405e-05, - "loss": 0.3369, + "epoch": 0.1899304429307673, + "grad_norm": 0.18222755193710327, + "learning_rate": 4.9991652211917086e-05, + "loss": 0.5117, "step": 5270 }, { - "epoch": 0.19, - "learning_rate": 4.999336450975414e-05, - "loss": 0.3678, + "epoch": 0.19011064259199192, + "grad_norm": 0.25865301489830017, + "learning_rate": 4.999157663675454e-05, + "loss": 0.4643, "step": 5275 }, { - "epoch": 0.19, - "learning_rate": 4.999329871745652e-05, - "loss": 0.3559, + "epoch": 0.19029084225321657, + "grad_norm": 0.21296486258506775, + "learning_rate": 4.999150072108661e-05, + "loss": 0.4639, "step": 5280 }, { - "epoch": 0.19, - "learning_rate": 4.9993232600637374e-05, - "loss": 0.3529, + "epoch": 0.1904710419144412, + "grad_norm": 0.19290651381015778, + "learning_rate": 4.9991424464914324e-05, + "loss": 0.4825, "step": 5285 }, { - "epoch": 0.19, - "learning_rate": 4.999316615929759e-05, - "loss": 0.3264, + "epoch": 0.19065124157566585, + "grad_norm": 0.28650933504104614, + "learning_rate": 4.9991347868238714e-05, + "loss": 0.5179, "step": 5290 }, { - "epoch": 0.19, - "learning_rate": 4.9993099393438006e-05, - "loss": 0.3242, + "epoch": 0.19083144123689047, + "grad_norm": 0.1798352748155594, + "learning_rate": 4.9991270931060826e-05, + "loss": 0.4681, "step": 5295 }, { - "epoch": 0.19, - "learning_rate": 4.9993032303059486e-05, - "loss": 0.326, + "epoch": 0.19101164089811512, + "grad_norm": 0.238149955868721, + "learning_rate": 4.9991193653381704e-05, + "loss": 0.4844, "step": 5300 }, { - "epoch": 0.19, - "learning_rate": 4.999296488816293e-05, - "loss": 0.3212, + "epoch": 0.19119184055933974, + "grad_norm": 0.27041110396385193, + "learning_rate": 4.9991116035202425e-05, + "loss": 0.5273, "step": 5305 }, { - "epoch": 0.19, - "learning_rate": 4.999289714874918e-05, - "loss": 0.341, + "epoch": 0.1913720402205644, + "grad_norm": 0.2148786038160324, + "learning_rate": 4.999103807652401e-05, + "loss": 0.4877, "step": 5310 }, { - "epoch": 0.19, - "learning_rate": 4.999282908481915e-05, - "loss": 0.3389, + "epoch": 0.19155223988178902, + "grad_norm": 0.20174206793308258, + "learning_rate": 4.999095977734755e-05, + "loss": 0.4858, "step": 5315 }, { - "epoch": 0.19, - "learning_rate": 4.9992760696373695e-05, - "loss": 0.3637, + "epoch": 0.19173243954301367, + "grad_norm": 0.26444557309150696, + "learning_rate": 4.9990881137674103e-05, + "loss": 0.4992, "step": 5320 }, { - "epoch": 0.19, - "learning_rate": 4.999269198341372e-05, - "loss": 0.3309, + "epoch": 0.1919126392042383, + "grad_norm": 0.21838323771953583, + "learning_rate": 4.9990802157504734e-05, + "loss": 0.4098, "step": 5325 }, { - "epoch": 0.19, - "learning_rate": 4.99926229459401e-05, - 
"loss": 0.3434, + "epoch": 0.19209283886546294, + "grad_norm": 0.23885351419448853, + "learning_rate": 4.999072283684052e-05, + "loss": 0.4674, "step": 5330 }, { - "epoch": 0.19, - "learning_rate": 4.999255358395374e-05, - "loss": 0.3455, + "epoch": 0.19227303852668756, + "grad_norm": 0.24196289479732513, + "learning_rate": 4.9990643175682554e-05, + "loss": 0.5008, "step": 5335 }, { - "epoch": 0.19, - "learning_rate": 4.999248389745556e-05, - "loss": 0.342, + "epoch": 0.1924532381879122, + "grad_norm": 0.18864691257476807, + "learning_rate": 4.999056317403191e-05, + "loss": 0.4603, "step": 5340 }, { - "epoch": 0.19, - "learning_rate": 4.999241388644643e-05, - "loss": 0.337, + "epoch": 0.19263343784913683, + "grad_norm": 0.2206612378358841, + "learning_rate": 4.9990482831889685e-05, + "loss": 0.4875, "step": 5345 }, { - "epoch": 0.19, - "learning_rate": 4.999234355092728e-05, - "loss": 0.3357, + "epoch": 0.19281363751036149, + "grad_norm": 0.18795616924762726, + "learning_rate": 4.9990402149256964e-05, + "loss": 0.4718, "step": 5350 }, { - "epoch": 0.19, - "learning_rate": 4.9992272890899024e-05, - "loss": 0.3281, + "epoch": 0.1929938371715861, + "grad_norm": 0.28613436222076416, + "learning_rate": 4.999032112613485e-05, + "loss": 0.4476, "step": 5355 }, { - "epoch": 0.19, - "learning_rate": 4.999220190636257e-05, - "loss": 0.3208, + "epoch": 0.19317403683281076, + "grad_norm": 0.22328707575798035, + "learning_rate": 4.999023976252445e-05, + "loss": 0.4631, "step": 5360 }, { - "epoch": 0.19, - "learning_rate": 4.999213059731885e-05, - "loss": 0.3598, + "epoch": 0.19335423649403538, + "grad_norm": 0.20430655777454376, + "learning_rate": 4.9990158058426875e-05, + "loss": 0.523, "step": 5365 }, { - "epoch": 0.19, - "learning_rate": 4.999205896376878e-05, - "loss": 0.3522, + "epoch": 0.19353443615526003, + "grad_norm": 0.19645529985427856, + "learning_rate": 4.999007601384324e-05, + "loss": 0.4338, "step": 5370 }, { - "epoch": 0.19, - "learning_rate": 4.99919870057133e-05, - "loss": 0.3178, + "epoch": 0.19371463581648465, + "grad_norm": 0.25969836115837097, + "learning_rate": 4.998999362877464e-05, + "loss": 0.4712, "step": 5375 }, { - "epoch": 0.19, - "learning_rate": 4.999191472315333e-05, - "loss": 0.3168, + "epoch": 0.1938948354777093, + "grad_norm": 0.19297480583190918, + "learning_rate": 4.9989910903222234e-05, + "loss": 0.471, "step": 5380 }, { - "epoch": 0.19, - "learning_rate": 4.999184211608983e-05, - "loss": 0.3462, + "epoch": 0.19407503513893393, + "grad_norm": 0.24897582828998566, + "learning_rate": 4.998982783718712e-05, + "loss": 0.4915, "step": 5385 }, { - "epoch": 0.19, - "learning_rate": 4.999176918452372e-05, - "loss": 0.351, + "epoch": 0.19425523480015858, + "grad_norm": 0.2619524896144867, + "learning_rate": 4.998974443067044e-05, + "loss": 0.5119, "step": 5390 }, { - "epoch": 0.19, - "learning_rate": 4.999169592845596e-05, - "loss": 0.3342, + "epoch": 0.1944354344613832, + "grad_norm": 0.17749615013599396, + "learning_rate": 4.998966068367334e-05, + "loss": 0.4782, "step": 5395 }, { - "epoch": 0.19, - "learning_rate": 4.99916223478875e-05, - "loss": 0.3408, + "epoch": 0.19461563412260785, + "grad_norm": 0.20441953837871552, + "learning_rate": 4.998957659619694e-05, + "loss": 0.4346, "step": 5400 }, { - "epoch": 0.19, - "learning_rate": 4.99915484428193e-05, - "loss": 0.3531, + "epoch": 0.19479583378383247, + "grad_norm": 0.22351542115211487, + "learning_rate": 4.9989492168242405e-05, + "loss": 0.4833, "step": 5405 }, { - "epoch": 0.19, - "learning_rate": 
4.999147421325231e-05, - "loss": 0.359, + "epoch": 0.19497603344505712, + "grad_norm": 0.19759854674339294, + "learning_rate": 4.998940739981087e-05, + "loss": 0.5104, "step": 5410 }, { - "epoch": 0.19, - "learning_rate": 4.999139965918749e-05, - "loss": 0.3325, + "epoch": 0.19515623310628177, + "grad_norm": 0.2504650354385376, + "learning_rate": 4.9989322290903504e-05, + "loss": 0.4585, "step": 5415 }, { - "epoch": 0.19, - "learning_rate": 4.999132478062582e-05, - "loss": 0.3474, + "epoch": 0.1953364327675064, + "grad_norm": 0.2255609631538391, + "learning_rate": 4.998923684152145e-05, + "loss": 0.4881, "step": 5420 }, { - "epoch": 0.19, - "learning_rate": 4.9991249577568264e-05, - "loss": 0.3529, + "epoch": 0.19551663242873105, + "grad_norm": 0.25729531049728394, + "learning_rate": 4.99891510516659e-05, + "loss": 0.4705, "step": 5425 }, { - "epoch": 0.19, - "learning_rate": 4.999117405001581e-05, - "loss": 0.3142, + "epoch": 0.19569683208995567, + "grad_norm": 0.24127353727817535, + "learning_rate": 4.998906492133798e-05, + "loss": 0.4939, "step": 5430 }, { - "epoch": 0.19, - "learning_rate": 4.9991098197969426e-05, - "loss": 0.3436, + "epoch": 0.19587703175118032, + "grad_norm": 0.2850863039493561, + "learning_rate": 4.99889784505389e-05, + "loss": 0.4461, "step": 5435 }, { - "epoch": 0.19, - "learning_rate": 4.99910220214301e-05, - "loss": 0.3514, + "epoch": 0.19605723141240494, + "grad_norm": 0.1722259372472763, + "learning_rate": 4.998889163926983e-05, + "loss": 0.4819, "step": 5440 }, { - "epoch": 0.19, - "learning_rate": 4.999094552039883e-05, - "loss": 0.331, + "epoch": 0.1962374310736296, + "grad_norm": 0.21394529938697815, + "learning_rate": 4.998880448753195e-05, + "loss": 0.4985, "step": 5445 }, { - "epoch": 0.19, - "learning_rate": 4.999086869487659e-05, - "loss": 0.3635, + "epoch": 0.19641763073485421, + "grad_norm": 0.18665440380573273, + "learning_rate": 4.998871699532644e-05, + "loss": 0.5014, "step": 5450 }, { - "epoch": 0.19, - "learning_rate": 4.9990791544864404e-05, - "loss": 0.3251, + "epoch": 0.19659783039607887, + "grad_norm": 0.226554274559021, + "learning_rate": 4.998862916265451e-05, + "loss": 0.4596, "step": 5455 }, { - "epoch": 0.19, - "learning_rate": 4.999071407036325e-05, - "loss": 0.3262, + "epoch": 0.1967780300573035, + "grad_norm": 0.2326064258813858, + "learning_rate": 4.998854098951734e-05, + "loss": 0.4634, "step": 5460 }, { - "epoch": 0.19, - "learning_rate": 4.9990636271374144e-05, - "loss": 0.3029, + "epoch": 0.19695822971852814, + "grad_norm": 0.20283064246177673, + "learning_rate": 4.9988452475916134e-05, + "loss": 0.4553, "step": 5465 }, { - "epoch": 0.19, - "learning_rate": 4.99905581478981e-05, - "loss": 0.3293, + "epoch": 0.19713842937975276, + "grad_norm": 0.19399242103099823, + "learning_rate": 4.998836362185211e-05, + "loss": 0.472, "step": 5470 }, { - "epoch": 0.19, - "learning_rate": 4.999047969993612e-05, - "loss": 0.3535, + "epoch": 0.1973186290409774, + "grad_norm": 0.24312461912631989, + "learning_rate": 4.998827442732646e-05, + "loss": 0.4887, "step": 5475 }, { - "epoch": 0.19, - "learning_rate": 4.9990400927489234e-05, - "loss": 0.3354, + "epoch": 0.19749882870220203, + "grad_norm": 0.23328492045402527, + "learning_rate": 4.9988184892340414e-05, + "loss": 0.4683, "step": 5480 }, { - "epoch": 0.19, - "learning_rate": 4.999032183055846e-05, - "loss": 0.3349, + "epoch": 0.19767902836342668, + "grad_norm": 0.1696726530790329, + "learning_rate": 4.998809501689518e-05, + "loss": 0.4872, "step": 5485 }, { - "epoch": 0.19, - 
"learning_rate": 4.9990242409144835e-05, - "loss": 0.3415, + "epoch": 0.1978592280246513, + "grad_norm": 0.20958632230758667, + "learning_rate": 4.998800480099199e-05, + "loss": 0.4747, "step": 5490 }, { - "epoch": 0.19, - "learning_rate": 4.9990162663249373e-05, - "loss": 0.3273, + "epoch": 0.19803942768587596, + "grad_norm": 0.23482240736484528, + "learning_rate": 4.998791424463208e-05, + "loss": 0.4519, "step": 5495 }, { - "epoch": 0.19, - "learning_rate": 4.999008259287312e-05, - "loss": 0.3508, + "epoch": 0.19821962734710058, + "grad_norm": 0.20664586126804352, + "learning_rate": 4.998782334781668e-05, + "loss": 0.4516, "step": 5500 }, { - "epoch": 0.19, - "eval_loss": 0.3391436040401459, - "eval_runtime": 10.541, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 0.19821962734710058, + "eval_loss": 0.49830162525177, + "eval_runtime": 3.5194, + "eval_samples_per_second": 28.414, + "eval_steps_per_second": 7.103, "step": 5500 }, { - "epoch": 0.19, - "learning_rate": 4.999000219801712e-05, - "loss": 0.3461, + "epoch": 0.19839982700832523, + "grad_norm": 0.2016601264476776, + "learning_rate": 4.998773211054701e-05, + "loss": 0.5032, "step": 5505 }, { - "epoch": 0.19, - "learning_rate": 4.99899214786824e-05, - "loss": 0.3272, + "epoch": 0.19858002666954985, + "grad_norm": 0.18409425020217896, + "learning_rate": 4.998764053282433e-05, + "loss": 0.4853, "step": 5510 }, { - "epoch": 0.19, - "learning_rate": 4.9989840434870025e-05, - "loss": 0.3521, + "epoch": 0.1987602263307745, + "grad_norm": 0.18686065077781677, + "learning_rate": 4.998754861464989e-05, + "loss": 0.4274, "step": 5515 }, { - "epoch": 0.19, - "learning_rate": 4.998975906658103e-05, - "loss": 0.3151, + "epoch": 0.19894042599199913, + "grad_norm": 0.20691046118736267, + "learning_rate": 4.9987456356024944e-05, + "loss": 0.5276, "step": 5520 }, { - "epoch": 0.19, - "learning_rate": 4.9989677373816494e-05, - "loss": 0.3279, + "epoch": 0.19912062565322378, + "grad_norm": 0.1962127685546875, + "learning_rate": 4.9987363756950736e-05, + "loss": 0.4697, "step": 5525 }, { - "epoch": 0.19, - "learning_rate": 4.9989595356577465e-05, - "loss": 0.3475, + "epoch": 0.1993008253144484, + "grad_norm": 0.20173537731170654, + "learning_rate": 4.9987270817428535e-05, + "loss": 0.434, "step": 5530 }, { - "epoch": 0.19, - "learning_rate": 4.998951301486501e-05, - "loss": 0.3282, + "epoch": 0.19948102497567305, + "grad_norm": 0.21217285096645355, + "learning_rate": 4.998717753745961e-05, + "loss": 0.4603, "step": 5535 }, { - "epoch": 0.19, - "learning_rate": 4.9989430348680186e-05, - "loss": 0.3457, + "epoch": 0.19966122463689767, + "grad_norm": 0.2284974455833435, + "learning_rate": 4.998708391704522e-05, + "loss": 0.476, "step": 5540 }, { - "epoch": 0.2, - "learning_rate": 4.998934735802409e-05, - "loss": 0.3311, + "epoch": 0.19984142429812232, + "grad_norm": 0.22764502465724945, + "learning_rate": 4.998698995618666e-05, + "loss": 0.5092, "step": 5545 }, { - "epoch": 0.2, - "learning_rate": 4.998926404289779e-05, - "loss": 0.3496, + "epoch": 0.20002162395934694, + "grad_norm": 0.19202855229377747, + "learning_rate": 4.998689565488519e-05, + "loss": 0.4573, "step": 5550 }, { - "epoch": 0.2, - "learning_rate": 4.998918040330235e-05, - "loss": 0.3349, + "epoch": 0.2002018236205716, + "grad_norm": 0.20149993896484375, + "learning_rate": 4.998680101314211e-05, + "loss": 0.4563, "step": 5555 }, { - "epoch": 0.2, - "learning_rate": 4.9989096439238884e-05, - "loss": 0.3327, + "epoch": 0.20038202328179622, + "grad_norm": 
0.1978190690279007, + "learning_rate": 4.9986706030958705e-05, + "loss": 0.4785, "step": 5560 }, { - "epoch": 0.2, - "learning_rate": 4.998901215070846e-05, - "loss": 0.3649, + "epoch": 0.20056222294302087, + "grad_norm": 0.186653271317482, + "learning_rate": 4.998661070833627e-05, + "loss": 0.4688, "step": 5565 }, { - "epoch": 0.2, - "learning_rate": 4.998892753771219e-05, - "loss": 0.3336, + "epoch": 0.2007424226042455, + "grad_norm": 0.20869889855384827, + "learning_rate": 4.9986515045276094e-05, + "loss": 0.4916, "step": 5570 }, { - "epoch": 0.2, - "learning_rate": 4.998884260025116e-05, - "loss": 0.3531, + "epoch": 0.20092262226547014, + "grad_norm": 0.2684619128704071, + "learning_rate": 4.9986419041779485e-05, + "loss": 0.4965, "step": 5575 }, { - "epoch": 0.2, - "learning_rate": 4.9988757338326474e-05, - "loss": 0.3581, + "epoch": 0.2011028219266948, + "grad_norm": 0.2171335518360138, + "learning_rate": 4.9986322697847765e-05, + "loss": 0.4418, "step": 5580 }, { - "epoch": 0.2, - "learning_rate": 4.998867175193924e-05, - "loss": 0.3178, + "epoch": 0.20128302158791941, + "grad_norm": 0.24311456084251404, + "learning_rate": 4.998622601348223e-05, + "loss": 0.4689, "step": 5585 }, { - "epoch": 0.2, - "learning_rate": 4.998858584109059e-05, - "loss": 0.3431, + "epoch": 0.20146322124914406, + "grad_norm": 0.19631347060203552, + "learning_rate": 4.998612898868421e-05, + "loss": 0.4882, "step": 5590 }, { - "epoch": 0.2, - "learning_rate": 4.99884996057816e-05, - "loss": 0.3362, + "epoch": 0.2016434209103687, + "grad_norm": 0.2597336173057556, + "learning_rate": 4.9986031623455006e-05, + "loss": 0.5101, "step": 5595 }, { - "epoch": 0.2, - "learning_rate": 4.998841304601342e-05, - "loss": 0.359, + "epoch": 0.20182362057159334, + "grad_norm": 0.2971448600292206, + "learning_rate": 4.9985933917795966e-05, + "loss": 0.4788, "step": 5600 }, { - "epoch": 0.2, - "learning_rate": 4.998832616178716e-05, - "loss": 0.331, + "epoch": 0.20200382023281796, + "grad_norm": 0.2276962250471115, + "learning_rate": 4.998583587170842e-05, + "loss": 0.4641, "step": 5605 }, { - "epoch": 0.2, - "learning_rate": 4.998823895310396e-05, - "loss": 0.3425, + "epoch": 0.2021840198940426, + "grad_norm": 0.16654905676841736, + "learning_rate": 4.99857374851937e-05, + "loss": 0.4612, "step": 5610 }, { - "epoch": 0.2, - "learning_rate": 4.9988151419964935e-05, - "loss": 0.3234, + "epoch": 0.20236421955526723, + "grad_norm": 0.17817789316177368, + "learning_rate": 4.998563875825313e-05, + "loss": 0.4524, "step": 5615 }, { - "epoch": 0.2, - "learning_rate": 4.998806356237124e-05, - "loss": 0.3417, + "epoch": 0.20254441921649188, + "grad_norm": 0.1932527720928192, + "learning_rate": 4.998553969088807e-05, + "loss": 0.4654, "step": 5620 }, { - "epoch": 0.2, - "learning_rate": 4.9987975380324e-05, - "loss": 0.351, + "epoch": 0.2027246188777165, + "grad_norm": 0.22446228563785553, + "learning_rate": 4.9985440283099885e-05, + "loss": 0.5021, "step": 5625 }, { - "epoch": 0.2, - "learning_rate": 4.9987886873824374e-05, - "loss": 0.352, + "epoch": 0.20290481853894116, + "grad_norm": 0.22577793896198273, + "learning_rate": 4.99853405348899e-05, + "loss": 0.5042, "step": 5630 }, { - "epoch": 0.2, - "learning_rate": 4.99877980428735e-05, - "loss": 0.364, + "epoch": 0.20308501820016578, + "grad_norm": 0.19390153884887695, + "learning_rate": 4.998524044625949e-05, + "loss": 0.4851, "step": 5635 }, { - "epoch": 0.2, - "learning_rate": 4.998770888747254e-05, - "loss": 0.3268, + "epoch": 0.20326521786139043, + "grad_norm": 
0.22741837799549103, + "learning_rate": 4.998514001721002e-05, + "loss": 0.4954, "step": 5640 }, { - "epoch": 0.2, - "learning_rate": 4.998761940762265e-05, - "loss": 0.3856, + "epoch": 0.20344541752261505, + "grad_norm": 0.20795418322086334, + "learning_rate": 4.998503924774285e-05, + "loss": 0.4748, "step": 5645 }, { - "epoch": 0.2, - "learning_rate": 4.998752960332498e-05, - "loss": 0.3381, + "epoch": 0.2036256171838397, + "grad_norm": 0.20817185938358307, + "learning_rate": 4.998493813785936e-05, + "loss": 0.5023, "step": 5650 }, { - "epoch": 0.2, - "learning_rate": 4.998743947458072e-05, - "loss": 0.3492, + "epoch": 0.20380581684506432, + "grad_norm": 0.24967476725578308, + "learning_rate": 4.9984836687560924e-05, + "loss": 0.5128, "step": 5655 }, { - "epoch": 0.2, - "learning_rate": 4.998734902139102e-05, - "loss": 0.3318, + "epoch": 0.20398601650628898, + "grad_norm": 0.2160884588956833, + "learning_rate": 4.998473489684892e-05, + "loss": 0.4386, "step": 5660 }, { - "epoch": 0.2, - "learning_rate": 4.9987258243757055e-05, - "loss": 0.3393, + "epoch": 0.2041662161675136, + "grad_norm": 0.19086240231990814, + "learning_rate": 4.998463276572475e-05, + "loss": 0.4722, "step": 5665 }, { - "epoch": 0.2, - "learning_rate": 4.998716714168e-05, - "loss": 0.3467, + "epoch": 0.20434641582873825, + "grad_norm": 0.21120908856391907, + "learning_rate": 4.9984530294189794e-05, + "loss": 0.4662, "step": 5670 }, { - "epoch": 0.2, - "learning_rate": 4.9987075715161064e-05, - "loss": 0.3667, + "epoch": 0.20452661548996287, + "grad_norm": 0.20728902518749237, + "learning_rate": 4.9984427482245445e-05, + "loss": 0.4166, "step": 5675 }, { - "epoch": 0.2, - "learning_rate": 4.998698396420141e-05, - "loss": 0.3488, + "epoch": 0.20470681515118752, + "grad_norm": 0.24865606427192688, + "learning_rate": 4.998432432989311e-05, + "loss": 0.49, "step": 5680 }, { - "epoch": 0.2, - "learning_rate": 4.998689188880223e-05, - "loss": 0.3166, + "epoch": 0.20488701481241214, + "grad_norm": 0.3118506968021393, + "learning_rate": 4.99842208371342e-05, + "loss": 0.5066, "step": 5685 }, { - "epoch": 0.2, - "learning_rate": 4.998679948896474e-05, - "loss": 0.3351, + "epoch": 0.2050672144736368, + "grad_norm": 0.21482154726982117, + "learning_rate": 4.998411700397011e-05, + "loss": 0.5173, "step": 5690 }, { - "epoch": 0.2, - "learning_rate": 4.998670676469012e-05, - "loss": 0.3503, + "epoch": 0.20524741413486142, + "grad_norm": 0.2417951226234436, + "learning_rate": 4.998401283040226e-05, + "loss": 0.4688, "step": 5695 }, { - "epoch": 0.2, - "learning_rate": 4.998661371597958e-05, - "loss": 0.3226, + "epoch": 0.20542761379608607, + "grad_norm": 0.22649191319942474, + "learning_rate": 4.9983908316432084e-05, + "loss": 0.4748, "step": 5700 }, { - "epoch": 0.2, - "learning_rate": 4.998652034283432e-05, - "loss": 0.3403, + "epoch": 0.2056078134573107, + "grad_norm": 0.28141918778419495, + "learning_rate": 4.998380346206099e-05, + "loss": 0.4986, "step": 5705 }, { - "epoch": 0.2, - "learning_rate": 4.998642664525557e-05, - "loss": 0.3472, + "epoch": 0.20578801311853534, + "grad_norm": 0.25580498576164246, + "learning_rate": 4.99836982672904e-05, + "loss": 0.5207, "step": 5710 }, { - "epoch": 0.2, - "learning_rate": 4.998633262324453e-05, - "loss": 0.3504, + "epoch": 0.20596821277975996, + "grad_norm": 0.2193039357662201, + "learning_rate": 4.998359273212177e-05, + "loss": 0.5007, "step": 5715 }, { - "epoch": 0.2, - "learning_rate": 4.9986238276802435e-05, - "loss": 0.3345, + "epoch": 0.2061484124409846, + "grad_norm": 
0.22306658327579498, + "learning_rate": 4.998348685655653e-05, + "loss": 0.4909, "step": 5720 }, { - "epoch": 0.2, - "learning_rate": 4.998614360593049e-05, - "loss": 0.3224, + "epoch": 0.20632861210220924, + "grad_norm": 0.20013074576854706, + "learning_rate": 4.998338064059611e-05, + "loss": 0.472, "step": 5725 }, { - "epoch": 0.2, - "learning_rate": 4.998604861062995e-05, - "loss": 0.3267, + "epoch": 0.20650881176343389, + "grad_norm": 0.2772786319255829, + "learning_rate": 4.998327408424196e-05, + "loss": 0.4913, "step": 5730 }, { - "epoch": 0.2, - "learning_rate": 4.9985953290902024e-05, - "loss": 0.3438, + "epoch": 0.2066890114246585, + "grad_norm": 0.23623016476631165, + "learning_rate": 4.998316718749555e-05, + "loss": 0.4423, "step": 5735 }, { - "epoch": 0.2, - "learning_rate": 4.998585764674797e-05, - "loss": 0.3184, + "epoch": 0.20686921108588316, + "grad_norm": 0.2390371859073639, + "learning_rate": 4.9983059950358316e-05, + "loss": 0.4747, "step": 5740 }, { - "epoch": 0.2, - "learning_rate": 4.998576167816902e-05, - "loss": 0.3406, + "epoch": 0.20704941074710778, + "grad_norm": 0.16959403455257416, + "learning_rate": 4.9982952372831724e-05, + "loss": 0.4875, "step": 5745 }, { - "epoch": 0.2, - "learning_rate": 4.9985665385166416e-05, - "loss": 0.3495, + "epoch": 0.20722961040833243, + "grad_norm": 0.1819695085287094, + "learning_rate": 4.998284445491726e-05, + "loss": 0.4731, "step": 5750 }, { - "epoch": 0.2, - "learning_rate": 4.998556876774141e-05, - "loss": 0.3361, + "epoch": 0.20740981006955708, + "grad_norm": 0.21502423286437988, + "learning_rate": 4.998273619661636e-05, + "loss": 0.4587, "step": 5755 }, { - "epoch": 0.2, - "learning_rate": 4.998547182589527e-05, - "loss": 0.3761, + "epoch": 0.2075900097307817, + "grad_norm": 0.1912410408258438, + "learning_rate": 4.998262759793052e-05, + "loss": 0.4497, "step": 5760 }, { - "epoch": 0.2, - "learning_rate": 4.998537455962924e-05, - "loss": 0.3232, + "epoch": 0.20777020939200636, + "grad_norm": 0.21954499185085297, + "learning_rate": 4.9982518658861224e-05, + "loss": 0.4768, "step": 5765 }, { - "epoch": 0.2, - "learning_rate": 4.998527696894458e-05, - "loss": 0.3331, + "epoch": 0.20795040905323098, + "grad_norm": 0.23011371493339539, + "learning_rate": 4.998240937940993e-05, + "loss": 0.4931, "step": 5770 }, { - "epoch": 0.2, - "learning_rate": 4.998517905384258e-05, - "loss": 0.3593, + "epoch": 0.20813060871445563, + "grad_norm": 0.18826115131378174, + "learning_rate": 4.998229975957816e-05, + "loss": 0.4547, "step": 5775 }, { - "epoch": 0.2, - "learning_rate": 4.998508081432448e-05, - "loss": 0.3332, + "epoch": 0.20831080837568025, + "grad_norm": 0.23759086430072784, + "learning_rate": 4.998218979936739e-05, + "loss": 0.4687, "step": 5780 }, { - "epoch": 0.2, - "learning_rate": 4.998498225039158e-05, - "loss": 0.3465, + "epoch": 0.2084910080369049, + "grad_norm": 0.2513539493083954, + "learning_rate": 4.9982079498779125e-05, + "loss": 0.4908, "step": 5785 }, { - "epoch": 0.2, - "learning_rate": 4.998488336204515e-05, - "loss": 0.3313, + "epoch": 0.20867120769812952, + "grad_norm": 0.23931877315044403, + "learning_rate": 4.998196885781485e-05, + "loss": 0.4848, "step": 5790 }, { - "epoch": 0.2, - "learning_rate": 4.998478414928647e-05, - "loss": 0.3309, + "epoch": 0.20885140735935417, + "grad_norm": 0.21470136940479279, + "learning_rate": 4.99818578764761e-05, + "loss": 0.4397, "step": 5795 }, { - "epoch": 0.2, - "learning_rate": 4.998468461211684e-05, - "loss": 0.3517, + "epoch": 0.2090316070205788, + "grad_norm": 
0.181662455201149, + "learning_rate": 4.9981746554764366e-05, + "loss": 0.4663, "step": 5800 }, { - "epoch": 0.2, - "learning_rate": 4.998458475053754e-05, - "loss": 0.3301, + "epoch": 0.20921180668180345, + "grad_norm": 0.20043453574180603, + "learning_rate": 4.9981634892681175e-05, + "loss": 0.5032, "step": 5805 }, { - "epoch": 0.2, - "learning_rate": 4.9984484564549884e-05, - "loss": 0.3438, + "epoch": 0.20939200634302807, + "grad_norm": 0.21683253347873688, + "learning_rate": 4.998152289022804e-05, + "loss": 0.4492, "step": 5810 }, { - "epoch": 0.2, - "learning_rate": 4.998438405415515e-05, - "loss": 0.334, + "epoch": 0.20957220600425272, + "grad_norm": 0.24924011528491974, + "learning_rate": 4.998141054740649e-05, + "loss": 0.4736, "step": 5815 }, { - "epoch": 0.2, - "learning_rate": 4.998428321935466e-05, - "loss": 0.3086, + "epoch": 0.20975240566547734, + "grad_norm": 0.2400321364402771, + "learning_rate": 4.998129786421807e-05, + "loss": 0.4451, "step": 5820 }, { - "epoch": 0.2, - "learning_rate": 4.998418206014971e-05, - "loss": 0.3291, + "epoch": 0.209932605326702, + "grad_norm": 0.19398939609527588, + "learning_rate": 4.9981184840664294e-05, + "loss": 0.4288, "step": 5825 }, { - "epoch": 0.21, - "learning_rate": 4.9984080576541636e-05, - "loss": 0.3316, + "epoch": 0.21011280498792662, + "grad_norm": 0.20897026360034943, + "learning_rate": 4.9981071476746717e-05, + "loss": 0.4522, "step": 5830 }, { - "epoch": 0.21, - "learning_rate": 4.9983978768531724e-05, - "loss": 0.3494, + "epoch": 0.21029300464915127, + "grad_norm": 0.19112837314605713, + "learning_rate": 4.998095777246687e-05, + "loss": 0.4699, "step": 5835 }, { - "epoch": 0.21, - "learning_rate": 4.9983876636121325e-05, - "loss": 0.3372, + "epoch": 0.2104732043103759, + "grad_norm": 0.27279478311538696, + "learning_rate": 4.998084372782631e-05, + "loss": 0.4821, "step": 5840 }, { - "epoch": 0.21, - "learning_rate": 4.998377417931175e-05, - "loss": 0.3628, + "epoch": 0.21065340397160054, + "grad_norm": 0.17838047444820404, + "learning_rate": 4.99807293428266e-05, + "loss": 0.4548, "step": 5845 }, { - "epoch": 0.21, - "learning_rate": 4.998367139810433e-05, - "loss": 0.3209, + "epoch": 0.21083360363282516, + "grad_norm": 0.23038989305496216, + "learning_rate": 4.9980614617469284e-05, + "loss": 0.4893, "step": 5850 }, { - "epoch": 0.21, - "learning_rate": 4.99835682925004e-05, - "loss": 0.3382, + "epoch": 0.2110138032940498, + "grad_norm": 0.21518467366695404, + "learning_rate": 4.998049955175593e-05, + "loss": 0.4707, "step": 5855 }, { - "epoch": 0.21, - "learning_rate": 4.998346486250131e-05, - "loss": 0.3258, + "epoch": 0.21119400295527443, + "grad_norm": 0.2731831669807434, + "learning_rate": 4.998038414568811e-05, + "loss": 0.4649, "step": 5860 }, { - "epoch": 0.21, - "learning_rate": 4.998336110810838e-05, - "loss": 0.3392, + "epoch": 0.21137420261649909, + "grad_norm": 0.2282124161720276, + "learning_rate": 4.998026839926738e-05, + "loss": 0.4721, "step": 5865 }, { - "epoch": 0.21, - "learning_rate": 4.9983257029322975e-05, - "loss": 0.3324, + "epoch": 0.2115544022777237, + "grad_norm": 0.24522550404071808, + "learning_rate": 4.9980152312495345e-05, + "loss": 0.4881, "step": 5870 }, { - "epoch": 0.21, - "learning_rate": 4.998315262614645e-05, - "loss": 0.3524, + "epoch": 0.21173460193894836, + "grad_norm": 0.24952565133571625, + "learning_rate": 4.998003588537356e-05, + "loss": 0.4917, "step": 5875 }, { - "epoch": 0.21, - "learning_rate": 4.9983047898580134e-05, - "loss": 0.3644, + "epoch": 0.21191480160017298, + 
"grad_norm": 0.19938194751739502, + "learning_rate": 4.997991911790363e-05, + "loss": 0.4714, "step": 5880 }, { - "epoch": 0.21, - "learning_rate": 4.998294284662542e-05, - "loss": 0.3748, + "epoch": 0.21209500126139763, + "grad_norm": 0.25844866037368774, + "learning_rate": 4.997980201008713e-05, + "loss": 0.485, "step": 5885 }, { - "epoch": 0.21, - "learning_rate": 4.998283747028366e-05, - "loss": 0.326, + "epoch": 0.21227520092262225, + "grad_norm": 0.19596533477306366, + "learning_rate": 4.9979684561925663e-05, + "loss": 0.4782, "step": 5890 }, { - "epoch": 0.21, - "learning_rate": 4.9982731769556216e-05, - "loss": 0.3364, + "epoch": 0.2124554005838469, + "grad_norm": 0.17095403373241425, + "learning_rate": 4.9979566773420836e-05, + "loss": 0.4786, "step": 5895 }, { - "epoch": 0.21, - "learning_rate": 4.9982625744444464e-05, - "loss": 0.3307, + "epoch": 0.21263560024507153, + "grad_norm": 0.20486395061016083, + "learning_rate": 4.9979448644574254e-05, + "loss": 0.4701, "step": 5900 }, { - "epoch": 0.21, - "learning_rate": 4.9982519394949776e-05, - "loss": 0.3832, + "epoch": 0.21281579990629618, + "grad_norm": 0.19251783192157745, + "learning_rate": 4.997933017538751e-05, + "loss": 0.4164, "step": 5905 }, { - "epoch": 0.21, - "learning_rate": 4.998241272107355e-05, - "loss": 0.3344, + "epoch": 0.2129959995675208, + "grad_norm": 0.21283471584320068, + "learning_rate": 4.9979211365862235e-05, + "loss": 0.4842, "step": 5910 }, { - "epoch": 0.21, - "learning_rate": 4.998230572281715e-05, - "loss": 0.3497, + "epoch": 0.21317619922874545, + "grad_norm": 0.21387127041816711, + "learning_rate": 4.997909221600003e-05, + "loss": 0.4786, "step": 5915 }, { - "epoch": 0.21, - "learning_rate": 4.9982198400181975e-05, - "loss": 0.3361, + "epoch": 0.2133563988899701, + "grad_norm": 0.22376859188079834, + "learning_rate": 4.997897272580254e-05, + "loss": 0.4504, "step": 5920 }, { - "epoch": 0.21, - "learning_rate": 4.998209075316942e-05, - "loss": 0.3129, + "epoch": 0.21353659855119472, + "grad_norm": 0.15793459117412567, + "learning_rate": 4.997885289527139e-05, + "loss": 0.4533, "step": 5925 }, { - "epoch": 0.21, - "learning_rate": 4.998198278178088e-05, - "loss": 0.3266, + "epoch": 0.21371679821241937, + "grad_norm": 0.2030140906572342, + "learning_rate": 4.9978732724408195e-05, + "loss": 0.4771, "step": 5930 }, { - "epoch": 0.21, - "learning_rate": 4.9981874486017746e-05, - "loss": 0.3299, + "epoch": 0.213896997873644, + "grad_norm": 0.19730666279792786, + "learning_rate": 4.997861221321461e-05, + "loss": 0.4598, "step": 5935 }, { - "epoch": 0.21, - "learning_rate": 4.9981765865881444e-05, - "loss": 0.3378, + "epoch": 0.21407719753486865, + "grad_norm": 0.28604263067245483, + "learning_rate": 4.9978491361692255e-05, + "loss": 0.5014, "step": 5940 }, { - "epoch": 0.21, - "learning_rate": 4.998165692137338e-05, - "loss": 0.3558, + "epoch": 0.21425739719609327, + "grad_norm": 0.2171340137720108, + "learning_rate": 4.99783701698428e-05, + "loss": 0.4597, "step": 5945 }, { - "epoch": 0.21, - "learning_rate": 4.998154765249496e-05, - "loss": 0.3249, + "epoch": 0.21443759685731792, + "grad_norm": 0.24565933644771576, + "learning_rate": 4.9978248637667883e-05, + "loss": 0.4856, "step": 5950 }, { - "epoch": 0.21, - "learning_rate": 4.998143805924761e-05, - "loss": 0.3064, + "epoch": 0.21461779651854254, + "grad_norm": 0.20074883103370667, + "learning_rate": 4.997812676516917e-05, + "loss": 0.4681, "step": 5955 }, { - "epoch": 0.21, - "learning_rate": 4.998132814163275e-05, - "loss": 0.3358, + "epoch": 
0.2147979961797672, + "grad_norm": 0.17820361256599426, + "learning_rate": 4.9978004552348314e-05, + "loss": 0.485, "step": 5960 }, { - "epoch": 0.21, - "learning_rate": 4.998121789965181e-05, - "loss": 0.3356, + "epoch": 0.21497819584099181, + "grad_norm": 0.19465869665145874, + "learning_rate": 4.997788199920698e-05, + "loss": 0.4693, "step": 5965 }, { - "epoch": 0.21, - "learning_rate": 4.998110733330621e-05, - "loss": 0.3289, + "epoch": 0.21515839550221647, + "grad_norm": 0.22670099139213562, + "learning_rate": 4.997775910574685e-05, + "loss": 0.5077, "step": 5970 }, { - "epoch": 0.21, - "learning_rate": 4.99809964425974e-05, - "loss": 0.3755, + "epoch": 0.2153385951634411, + "grad_norm": 0.14773029088974, + "learning_rate": 4.997763587196957e-05, + "loss": 0.4664, "step": 5975 }, { - "epoch": 0.21, - "learning_rate": 4.998088522752681e-05, - "loss": 0.3381, + "epoch": 0.21551879482466574, + "grad_norm": 0.26467835903167725, + "learning_rate": 4.997751229787685e-05, + "loss": 0.5078, "step": 5980 }, { - "epoch": 0.21, - "learning_rate": 4.9980773688095896e-05, - "loss": 0.3545, + "epoch": 0.21569899448589036, + "grad_norm": 0.20489193499088287, + "learning_rate": 4.9977388383470356e-05, + "loss": 0.4256, "step": 5985 }, { - "epoch": 0.21, - "learning_rate": 4.99806618243061e-05, - "loss": 0.3409, + "epoch": 0.215879194147115, + "grad_norm": 0.2557709217071533, + "learning_rate": 4.997726412875178e-05, + "loss": 0.4677, "step": 5990 }, { - "epoch": 0.21, - "learning_rate": 4.998054963615886e-05, - "loss": 0.336, + "epoch": 0.21605939380833963, + "grad_norm": 0.20980533957481384, + "learning_rate": 4.997713953372282e-05, + "loss": 0.4954, "step": 5995 }, { - "epoch": 0.21, - "learning_rate": 4.998043712365566e-05, - "loss": 0.3559, + "epoch": 0.21623959346956428, + "grad_norm": 0.20784328877925873, + "learning_rate": 4.997701459838517e-05, + "loss": 0.515, "step": 6000 }, { - "epoch": 0.21, - "eval_loss": 0.3358333706855774, - "eval_runtime": 10.5554, - "eval_samples_per_second": 9.474, - "eval_steps_per_second": 9.474, + "epoch": 0.21623959346956428, + "eval_loss": 0.4947231411933899, + "eval_runtime": 3.5181, + "eval_samples_per_second": 28.424, + "eval_steps_per_second": 7.106, "step": 6000 }, { - "epoch": 0.21, - "learning_rate": 4.9980324286797935e-05, - "loss": 0.3347, + "epoch": 0.2164197931307889, + "grad_norm": 0.2340945601463318, + "learning_rate": 4.997688932274053e-05, + "loss": 0.4411, "step": 6005 }, { - "epoch": 0.21, - "learning_rate": 4.9980211125587166e-05, - "loss": 0.3317, + "epoch": 0.21659999279201356, + "grad_norm": 0.1668865829706192, + "learning_rate": 4.997676370679061e-05, + "loss": 0.4891, "step": 6010 }, { - "epoch": 0.21, - "learning_rate": 4.998009764002482e-05, - "loss": 0.3474, + "epoch": 0.21678019245323818, + "grad_norm": 0.22756971418857574, + "learning_rate": 4.997663775053712e-05, + "loss": 0.4992, "step": 6015 }, { - "epoch": 0.21, - "learning_rate": 4.9979983830112366e-05, - "loss": 0.3507, + "epoch": 0.21696039211446283, + "grad_norm": 0.21588781476020813, + "learning_rate": 4.997651145398177e-05, + "loss": 0.4388, "step": 6020 }, { - "epoch": 0.21, - "learning_rate": 4.997986969585129e-05, - "loss": 0.3373, + "epoch": 0.21714059177568745, + "grad_norm": 0.1796243041753769, + "learning_rate": 4.9976384817126295e-05, + "loss": 0.4314, "step": 6025 }, { - "epoch": 0.21, - "learning_rate": 4.997975523724306e-05, - "loss": 0.3213, + "epoch": 0.2173207914369121, + "grad_norm": 0.2105339765548706, + "learning_rate": 4.9976257839972406e-05, + "loss": 
0.4898, "step": 6030 }, { - "epoch": 0.21, - "learning_rate": 4.9979640454289175e-05, - "loss": 0.3132, + "epoch": 0.21750099109813673, + "grad_norm": 0.23153090476989746, + "learning_rate": 4.9976130522521845e-05, + "loss": 0.4549, "step": 6035 }, { - "epoch": 0.21, - "learning_rate": 4.9979525346991126e-05, - "loss": 0.3358, + "epoch": 0.21768119075936138, + "grad_norm": 0.240932434797287, + "learning_rate": 4.997600286477634e-05, + "loss": 0.4623, "step": 6040 }, { - "epoch": 0.21, - "learning_rate": 4.997940991535039e-05, - "loss": 0.3218, + "epoch": 0.217861390420586, + "grad_norm": 0.19893620908260345, + "learning_rate": 4.997587486673763e-05, + "loss": 0.4703, "step": 6045 }, { - "epoch": 0.21, - "learning_rate": 4.997929415936849e-05, - "loss": 0.3567, + "epoch": 0.21804159008181065, + "grad_norm": 0.17426280677318573, + "learning_rate": 4.9975746528407466e-05, + "loss": 0.4738, "step": 6050 }, { - "epoch": 0.21, - "learning_rate": 4.9979178079046914e-05, - "loss": 0.3256, + "epoch": 0.21822178974303527, + "grad_norm": 0.20304343104362488, + "learning_rate": 4.997561784978758e-05, + "loss": 0.4971, "step": 6055 }, { - "epoch": 0.21, - "learning_rate": 4.997906167438717e-05, - "loss": 0.3522, + "epoch": 0.21840198940425992, + "grad_norm": 0.20934487879276276, + "learning_rate": 4.997548883087974e-05, + "loss": 0.4552, "step": 6060 }, { - "epoch": 0.21, - "learning_rate": 4.997894494539077e-05, - "loss": 0.351, + "epoch": 0.21858218906548454, + "grad_norm": 0.17177551984786987, + "learning_rate": 4.99753594716857e-05, + "loss": 0.4991, "step": 6065 }, { - "epoch": 0.21, - "learning_rate": 4.997882789205924e-05, - "loss": 0.3127, + "epoch": 0.2187623887267092, + "grad_norm": 0.2459387630224228, + "learning_rate": 4.9975229772207224e-05, + "loss": 0.4845, "step": 6070 }, { - "epoch": 0.21, - "learning_rate": 4.997871051439409e-05, - "loss": 0.3374, + "epoch": 0.21894258838793382, + "grad_norm": 0.23840418457984924, + "learning_rate": 4.9975099732446085e-05, + "loss": 0.4877, "step": 6075 }, { - "epoch": 0.21, - "learning_rate": 4.997859281239684e-05, - "loss": 0.3133, + "epoch": 0.21912278804915847, + "grad_norm": 0.2193634808063507, + "learning_rate": 4.9974969352404036e-05, + "loss": 0.4465, "step": 6080 }, { - "epoch": 0.21, - "learning_rate": 4.997847478606902e-05, - "loss": 0.3224, + "epoch": 0.21930298771038312, + "grad_norm": 0.15216214954853058, + "learning_rate": 4.997483863208288e-05, + "loss": 0.4538, "step": 6085 }, { - "epoch": 0.21, - "learning_rate": 4.997835643541218e-05, - "loss": 0.3394, + "epoch": 0.21948318737160774, + "grad_norm": 0.1821216493844986, + "learning_rate": 4.997470757148437e-05, + "loss": 0.4328, "step": 6090 }, { - "epoch": 0.21, - "learning_rate": 4.997823776042782e-05, - "loss": 0.3583, + "epoch": 0.2196633870328324, + "grad_norm": 0.2372681200504303, + "learning_rate": 4.99745761706103e-05, + "loss": 0.4391, "step": 6095 }, { - "epoch": 0.21, - "learning_rate": 4.997811876111753e-05, - "loss": 0.3395, + "epoch": 0.219843586694057, + "grad_norm": 0.1972285658121109, + "learning_rate": 4.9974444429462476e-05, + "loss": 0.4586, "step": 6100 }, { - "epoch": 0.21, - "learning_rate": 4.997799943748281e-05, - "loss": 0.3493, + "epoch": 0.22002378635528166, + "grad_norm": 0.20691488683223724, + "learning_rate": 4.997431234804267e-05, + "loss": 0.5046, "step": 6105 }, { - "epoch": 0.21, - "learning_rate": 4.997787978952524e-05, - "loss": 0.3356, + "epoch": 0.2202039860165063, + "grad_norm": 0.20699431002140045, + "learning_rate": 4.9974179926352706e-05, + 
"loss": 0.4709, "step": 6110 }, { - "epoch": 0.22, - "learning_rate": 4.997775981724636e-05, - "loss": 0.3484, + "epoch": 0.22038418567773094, + "grad_norm": 0.24073725938796997, + "learning_rate": 4.997404716439438e-05, + "loss": 0.4774, "step": 6115 }, { - "epoch": 0.22, - "learning_rate": 4.9977639520647735e-05, - "loss": 0.3517, + "epoch": 0.22056438533895556, + "grad_norm": 0.19937781989574432, + "learning_rate": 4.997391406216948e-05, + "loss": 0.4387, "step": 6120 }, { - "epoch": 0.22, - "learning_rate": 4.9977518899730915e-05, - "loss": 0.3185, + "epoch": 0.2207445850001802, + "grad_norm": 0.203308567404747, + "learning_rate": 4.997378061967984e-05, + "loss": 0.4535, "step": 6125 }, { - "epoch": 0.22, - "learning_rate": 4.997739795449747e-05, - "loss": 0.3593, + "epoch": 0.22092478466140483, + "grad_norm": 0.1698613315820694, + "learning_rate": 4.997364683692728e-05, + "loss": 0.4852, "step": 6130 }, { - "epoch": 0.22, - "learning_rate": 4.997727668494898e-05, - "loss": 0.3254, + "epoch": 0.22110498432262948, + "grad_norm": 0.22055956721305847, + "learning_rate": 4.997351271391362e-05, + "loss": 0.5054, "step": 6135 }, { - "epoch": 0.22, - "learning_rate": 4.9977155091087e-05, - "loss": 0.3356, + "epoch": 0.2212851839838541, + "grad_norm": 0.25628286600112915, + "learning_rate": 4.997337825064068e-05, + "loss": 0.4846, "step": 6140 }, { - "epoch": 0.22, - "learning_rate": 4.997703317291313e-05, - "loss": 0.355, + "epoch": 0.22146538364507876, + "grad_norm": 0.17073556780815125, + "learning_rate": 4.9973243447110294e-05, + "loss": 0.4608, "step": 6145 }, { - "epoch": 0.22, - "learning_rate": 4.997691093042896e-05, - "loss": 0.3249, + "epoch": 0.22164558330630338, + "grad_norm": 0.13316601514816284, + "learning_rate": 4.99731083033243e-05, + "loss": 0.4658, "step": 6150 }, { - "epoch": 0.22, - "learning_rate": 4.997678836363604e-05, - "loss": 0.3273, + "epoch": 0.22182578296752803, + "grad_norm": 0.22222983837127686, + "learning_rate": 4.997297281928455e-05, + "loss": 0.4558, "step": 6155 }, { - "epoch": 0.22, - "learning_rate": 4.997666547253599e-05, - "loss": 0.3471, + "epoch": 0.22200598262875265, + "grad_norm": 0.21334873139858246, + "learning_rate": 4.997283699499287e-05, + "loss": 0.4884, "step": 6160 }, { - "epoch": 0.22, - "learning_rate": 4.99765422571304e-05, - "loss": 0.3326, + "epoch": 0.2221861822899773, + "grad_norm": 0.1857357621192932, + "learning_rate": 4.997270083045112e-05, + "loss": 0.4372, "step": 6165 }, { - "epoch": 0.22, - "learning_rate": 4.997641871742087e-05, - "loss": 0.3423, + "epoch": 0.22236638195120192, + "grad_norm": 0.18568672239780426, + "learning_rate": 4.997256432566116e-05, + "loss": 0.488, "step": 6170 }, { - "epoch": 0.22, - "learning_rate": 4.9976294853409e-05, - "loss": 0.3468, + "epoch": 0.22254658161242657, + "grad_norm": 0.19653025269508362, + "learning_rate": 4.997242748062485e-05, + "loss": 0.4907, "step": 6175 }, { - "epoch": 0.22, - "learning_rate": 4.9976170665096405e-05, - "loss": 0.3496, + "epoch": 0.2227267812736512, + "grad_norm": 0.19124481081962585, + "learning_rate": 4.9972290295344046e-05, + "loss": 0.5021, "step": 6180 }, { - "epoch": 0.22, - "learning_rate": 4.997604615248469e-05, - "loss": 0.3701, + "epoch": 0.22290698093487585, + "grad_norm": 0.20403480529785156, + "learning_rate": 4.997215276982062e-05, + "loss": 0.4855, "step": 6185 }, { - "epoch": 0.22, - "learning_rate": 4.997592131557548e-05, - "loss": 0.34, + "epoch": 0.22308718059610047, + "grad_norm": 0.18618778884410858, + "learning_rate": 4.9972014904056446e-05, 
+ "loss": 0.4315, "step": 6190 }, { - "epoch": 0.22, - "learning_rate": 4.997579615437039e-05, - "loss": 0.3093, + "epoch": 0.22326738025732512, + "grad_norm": 0.24033483862876892, + "learning_rate": 4.997187669805341e-05, + "loss": 0.4607, "step": 6195 }, { - "epoch": 0.22, - "learning_rate": 4.997567066887105e-05, - "loss": 0.3141, + "epoch": 0.22344757991854974, + "grad_norm": 0.26656222343444824, + "learning_rate": 4.997173815181339e-05, + "loss": 0.5022, "step": 6200 }, { - "epoch": 0.22, - "learning_rate": 4.9975544859079074e-05, - "loss": 0.3394, + "epoch": 0.2236277795797744, + "grad_norm": 0.22280102968215942, + "learning_rate": 4.997159926533826e-05, + "loss": 0.4928, "step": 6205 }, { - "epoch": 0.22, - "learning_rate": 4.997541872499611e-05, - "loss": 0.3649, + "epoch": 0.22380797924099902, + "grad_norm": 0.2294902205467224, + "learning_rate": 4.997146003862994e-05, + "loss": 0.5015, "step": 6210 }, { - "epoch": 0.22, - "learning_rate": 4.997529226662381e-05, - "loss": 0.3132, + "epoch": 0.22398817890222367, + "grad_norm": 0.22047486901283264, + "learning_rate": 4.9971320471690295e-05, + "loss": 0.4545, "step": 6215 }, { - "epoch": 0.22, - "learning_rate": 4.997516548396378e-05, - "loss": 0.3583, + "epoch": 0.2241683785634483, + "grad_norm": 0.21225684881210327, + "learning_rate": 4.9971180564521254e-05, + "loss": 0.4758, "step": 6220 }, { - "epoch": 0.22, - "learning_rate": 4.9975038377017705e-05, - "loss": 0.3419, + "epoch": 0.22434857822467294, + "grad_norm": 0.18276849389076233, + "learning_rate": 4.9971040317124706e-05, + "loss": 0.4767, "step": 6225 }, { - "epoch": 0.22, - "learning_rate": 4.99749109457872e-05, - "loss": 0.3315, + "epoch": 0.22452877788589756, + "grad_norm": 0.19218437373638153, + "learning_rate": 4.9970899729502576e-05, + "loss": 0.466, "step": 6230 }, { - "epoch": 0.22, - "learning_rate": 4.997478319027394e-05, - "loss": 0.3451, + "epoch": 0.2247089775471222, + "grad_norm": 0.22989146411418915, + "learning_rate": 4.997075880165677e-05, + "loss": 0.4543, "step": 6235 }, { - "epoch": 0.22, - "learning_rate": 4.9974655110479576e-05, - "loss": 0.3336, + "epoch": 0.22488917720834684, + "grad_norm": 0.25233665108680725, + "learning_rate": 4.997061753358921e-05, + "loss": 0.501, "step": 6240 }, { - "epoch": 0.22, - "learning_rate": 4.997452670640578e-05, - "loss": 0.3494, + "epoch": 0.22506937686957149, + "grad_norm": 0.23017632961273193, + "learning_rate": 4.997047592530182e-05, + "loss": 0.463, "step": 6245 }, { - "epoch": 0.22, - "learning_rate": 4.997439797805421e-05, - "loss": 0.3194, + "epoch": 0.2252495765307961, + "grad_norm": 0.18937349319458008, + "learning_rate": 4.9970333976796526e-05, + "loss": 0.4387, "step": 6250 }, { - "epoch": 0.22, - "learning_rate": 4.997426892542654e-05, - "loss": 0.3357, + "epoch": 0.22542977619202076, + "grad_norm": 0.15060578286647797, + "learning_rate": 4.997019168807527e-05, + "loss": 0.4603, "step": 6255 }, { - "epoch": 0.22, - "learning_rate": 4.997413954852445e-05, - "loss": 0.3278, + "epoch": 0.2256099758532454, + "grad_norm": 0.2040947675704956, + "learning_rate": 4.997004905913998e-05, + "loss": 0.4982, "step": 6260 }, { - "epoch": 0.22, - "learning_rate": 4.997400984734961e-05, - "loss": 0.3397, + "epoch": 0.22579017551447003, + "grad_norm": 0.2734396159648895, + "learning_rate": 4.9969906089992616e-05, + "loss": 0.4854, "step": 6265 }, { - "epoch": 0.22, - "learning_rate": 4.997387982190372e-05, - "loss": 0.3387, + "epoch": 0.22597037517569468, + "grad_norm": 0.17175772786140442, + "learning_rate": 
4.996976278063511e-05, + "loss": 0.4451, "step": 6270 }, { - "epoch": 0.22, - "learning_rate": 4.997374947218845e-05, - "loss": 0.3167, + "epoch": 0.2261505748369193, + "grad_norm": 0.21332839131355286, + "learning_rate": 4.996961913106942e-05, + "loss": 0.4869, "step": 6275 }, { - "epoch": 0.22, - "learning_rate": 4.997361879820551e-05, - "loss": 0.2976, + "epoch": 0.22633077449814396, + "grad_norm": 0.16605186462402344, + "learning_rate": 4.9969475141297504e-05, + "loss": 0.4448, "step": 6280 }, { - "epoch": 0.22, - "learning_rate": 4.997348779995659e-05, - "loss": 0.3262, + "epoch": 0.22651097415936858, + "grad_norm": 0.18335530161857605, + "learning_rate": 4.9969330811321325e-05, + "loss": 0.4778, "step": 6285 }, { - "epoch": 0.22, - "learning_rate": 4.997335647744338e-05, - "loss": 0.3545, + "epoch": 0.22669117382059323, + "grad_norm": 0.20451930165290833, + "learning_rate": 4.996918614114285e-05, + "loss": 0.4643, "step": 6290 }, { - "epoch": 0.22, - "learning_rate": 4.997322483066759e-05, - "loss": 0.3226, + "epoch": 0.22687137348181785, + "grad_norm": 0.1627047061920166, + "learning_rate": 4.9969041130764046e-05, + "loss": 0.4595, "step": 6295 }, { - "epoch": 0.22, - "learning_rate": 4.9973092859630945e-05, - "loss": 0.35, + "epoch": 0.2270515731430425, + "grad_norm": 0.18496522307395935, + "learning_rate": 4.9968895780186884e-05, + "loss": 0.4552, "step": 6300 }, { - "epoch": 0.22, - "learning_rate": 4.997296056433514e-05, - "loss": 0.3216, + "epoch": 0.22723177280426712, + "grad_norm": 0.209730327129364, + "learning_rate": 4.9968750089413365e-05, + "loss": 0.4515, "step": 6305 }, { - "epoch": 0.22, - "learning_rate": 4.997282794478191e-05, - "loss": 0.3178, + "epoch": 0.22741197246549177, + "grad_norm": 0.21571126580238342, + "learning_rate": 4.996860405844545e-05, + "loss": 0.4874, "step": 6310 }, { - "epoch": 0.22, - "learning_rate": 4.997269500097296e-05, - "loss": 0.3279, + "epoch": 0.2275921721267164, + "grad_norm": 0.20048457384109497, + "learning_rate": 4.996845768728514e-05, + "loss": 0.4921, "step": 6315 }, { - "epoch": 0.22, - "learning_rate": 4.997256173291002e-05, - "loss": 0.3349, + "epoch": 0.22777237178794105, + "grad_norm": 0.211773082613945, + "learning_rate": 4.996831097593443e-05, + "loss": 0.4907, "step": 6320 }, { - "epoch": 0.22, - "learning_rate": 4.9972428140594824e-05, - "loss": 0.3049, + "epoch": 0.22795257144916567, + "grad_norm": 0.22399649024009705, + "learning_rate": 4.996816392439532e-05, + "loss": 0.4853, "step": 6325 }, { - "epoch": 0.22, - "learning_rate": 4.99722942240291e-05, - "loss": 0.3463, + "epoch": 0.22813277111039032, + "grad_norm": 0.2886163294315338, + "learning_rate": 4.9968016532669805e-05, + "loss": 0.4592, "step": 6330 }, { - "epoch": 0.22, - "learning_rate": 4.9972159983214605e-05, - "loss": 0.3329, + "epoch": 0.22831297077161494, + "grad_norm": 0.21131619811058044, + "learning_rate": 4.9967868800759895e-05, + "loss": 0.4624, "step": 6335 }, { - "epoch": 0.22, - "learning_rate": 4.9972025418153065e-05, - "loss": 0.3367, + "epoch": 0.2284931704328396, + "grad_norm": 0.2376997023820877, + "learning_rate": 4.996772072866762e-05, + "loss": 0.4935, "step": 6340 }, { - "epoch": 0.22, - "learning_rate": 4.9971890528846235e-05, - "loss": 0.337, + "epoch": 0.22867337009406422, + "grad_norm": 0.24256786704063416, + "learning_rate": 4.996757231639497e-05, + "loss": 0.4595, "step": 6345 }, { - "epoch": 0.22, - "learning_rate": 4.997175531529586e-05, - "loss": 0.3383, + "epoch": 0.22885356975528887, + "grad_norm": 0.22123675048351288, + 
"learning_rate": 4.9967423563943994e-05, + "loss": 0.4759, "step": 6350 }, { - "epoch": 0.22, - "learning_rate": 4.9971619777503705e-05, - "loss": 0.3563, + "epoch": 0.2290337694165135, + "grad_norm": 0.22762177884578705, + "learning_rate": 4.996727447131669e-05, + "loss": 0.4417, "step": 6355 }, { - "epoch": 0.22, - "learning_rate": 4.9971483915471516e-05, - "loss": 0.3604, + "epoch": 0.22921396907773814, + "grad_norm": 0.1603131741285324, + "learning_rate": 4.9967125038515116e-05, + "loss": 0.4618, "step": 6360 }, { - "epoch": 0.22, - "learning_rate": 4.997134772920107e-05, - "loss": 0.3429, + "epoch": 0.22939416873896276, + "grad_norm": 0.2037096470594406, + "learning_rate": 4.99669752655413e-05, + "loss": 0.4785, "step": 6365 }, { - "epoch": 0.22, - "learning_rate": 4.997121121869414e-05, - "loss": 0.3422, + "epoch": 0.2295743684001874, + "grad_norm": 0.25867146253585815, + "learning_rate": 4.996682515239728e-05, + "loss": 0.451, "step": 6370 }, { - "epoch": 0.22, - "learning_rate": 4.997107438395247e-05, - "loss": 0.3272, + "epoch": 0.22975456806141203, + "grad_norm": 0.15876412391662598, + "learning_rate": 4.996667469908509e-05, + "loss": 0.4362, "step": 6375 }, { - "epoch": 0.22, - "learning_rate": 4.997093722497787e-05, - "loss": 0.3286, + "epoch": 0.22993476772263668, + "grad_norm": 0.22342482209205627, + "learning_rate": 4.99665239056068e-05, + "loss": 0.4629, "step": 6380 }, { - "epoch": 0.22, - "learning_rate": 4.99707997417721e-05, - "loss": 0.3103, + "epoch": 0.2301149673838613, + "grad_norm": 0.1261013001203537, + "learning_rate": 4.9966372771964456e-05, + "loss": 0.4501, "step": 6385 }, { - "epoch": 0.22, - "learning_rate": 4.997066193433695e-05, - "loss": 0.3118, + "epoch": 0.23029516704508596, + "grad_norm": 0.20957724750041962, + "learning_rate": 4.996622129816011e-05, + "loss": 0.4736, "step": 6390 }, { - "epoch": 0.22, - "learning_rate": 4.997052380267421e-05, - "loss": 0.3341, + "epoch": 0.23047536670631058, + "grad_norm": 0.23294906318187714, + "learning_rate": 4.996606948419583e-05, + "loss": 0.4565, "step": 6395 }, { - "epoch": 0.23, - "learning_rate": 4.997038534678568e-05, - "loss": 0.35, + "epoch": 0.23065556636753523, + "grad_norm": 0.18201923370361328, + "learning_rate": 4.99659173300737e-05, + "loss": 0.5131, "step": 6400 }, { - "epoch": 0.23, - "learning_rate": 4.997024656667315e-05, - "loss": 0.3533, + "epoch": 0.23083576602875985, + "grad_norm": 0.2046813815832138, + "learning_rate": 4.996576483579577e-05, + "loss": 0.4632, "step": 6405 }, { - "epoch": 0.23, - "learning_rate": 4.997010746233843e-05, - "loss": 0.3333, + "epoch": 0.2310159656899845, + "grad_norm": 0.17519819736480713, + "learning_rate": 4.9965612001364124e-05, + "loss": 0.5046, "step": 6410 }, { - "epoch": 0.23, - "learning_rate": 4.996996803378331e-05, - "loss": 0.336, + "epoch": 0.23119616535120913, + "grad_norm": 0.19435188174247742, + "learning_rate": 4.996545882678086e-05, + "loss": 0.4378, "step": 6415 }, { - "epoch": 0.23, - "learning_rate": 4.996982828100962e-05, - "loss": 0.3255, + "epoch": 0.23137636501243378, + "grad_norm": 0.2328794300556183, + "learning_rate": 4.9965305312048043e-05, + "loss": 0.4884, "step": 6420 }, { - "epoch": 0.23, - "learning_rate": 4.996968820401916e-05, - "loss": 0.3319, + "epoch": 0.23155656467365843, + "grad_norm": 0.1869356334209442, + "learning_rate": 4.996515145716778e-05, + "loss": 0.4929, "step": 6425 }, { - "epoch": 0.23, - "learning_rate": 4.996954780281375e-05, - "loss": 0.358, + "epoch": 0.23173676433488305, + "grad_norm": 0.18540406227111816, 
+ "learning_rate": 4.996499726214216e-05, + "loss": 0.4421, "step": 6430 }, { - "epoch": 0.23, - "learning_rate": 4.996940707739523e-05, - "loss": 0.3364, + "epoch": 0.2319169639961077, + "grad_norm": 0.23471662402153015, + "learning_rate": 4.9964842726973286e-05, + "loss": 0.5024, "step": 6435 }, { - "epoch": 0.23, - "learning_rate": 4.9969266027765404e-05, - "loss": 0.3389, + "epoch": 0.23209716365733232, + "grad_norm": 0.1900070160627365, + "learning_rate": 4.996468785166326e-05, + "loss": 0.4652, "step": 6440 }, { - "epoch": 0.23, - "learning_rate": 4.9969124653926127e-05, - "loss": 0.3334, + "epoch": 0.23227736331855697, + "grad_norm": 0.23058246076107025, + "learning_rate": 4.99645326362142e-05, + "loss": 0.5072, "step": 6445 }, { - "epoch": 0.23, - "learning_rate": 4.996898295587921e-05, - "loss": 0.344, + "epoch": 0.2324575629797816, + "grad_norm": 0.20412348210811615, + "learning_rate": 4.9964377080628215e-05, + "loss": 0.4847, "step": 6450 }, { - "epoch": 0.23, - "learning_rate": 4.9968840933626506e-05, - "loss": 0.3552, + "epoch": 0.23263776264100625, + "grad_norm": 0.20064546167850494, + "learning_rate": 4.9964221184907424e-05, + "loss": 0.4501, "step": 6455 }, { - "epoch": 0.23, - "learning_rate": 4.996869858716986e-05, - "loss": 0.3372, + "epoch": 0.23281796230223087, + "grad_norm": 0.16838903725147247, + "learning_rate": 4.996406494905396e-05, + "loss": 0.4665, "step": 6460 }, { - "epoch": 0.23, - "learning_rate": 4.9968555916511114e-05, - "loss": 0.3244, + "epoch": 0.23299816196345552, + "grad_norm": 0.17399536073207855, + "learning_rate": 4.9963908373069935e-05, + "loss": 0.4616, "step": 6465 }, { - "epoch": 0.23, - "learning_rate": 4.996841292165213e-05, - "loss": 0.3458, + "epoch": 0.23317836162468014, + "grad_norm": 0.18575124442577362, + "learning_rate": 4.9963751456957494e-05, + "loss": 0.4336, "step": 6470 }, { - "epoch": 0.23, - "learning_rate": 4.996826960259475e-05, - "loss": 0.3342, + "epoch": 0.2333585612859048, + "grad_norm": 0.22949109971523285, + "learning_rate": 4.996359420071877e-05, + "loss": 0.5, "step": 6475 }, { - "epoch": 0.23, - "learning_rate": 4.996812595934085e-05, - "loss": 0.3492, + "epoch": 0.23353876094712941, + "grad_norm": 0.20516782999038696, + "learning_rate": 4.9963436604355916e-05, + "loss": 0.4611, "step": 6480 }, { - "epoch": 0.23, - "learning_rate": 4.996798199189229e-05, - "loss": 0.343, + "epoch": 0.23371896060835406, + "grad_norm": 0.2293572872877121, + "learning_rate": 4.996327866787106e-05, + "loss": 0.501, "step": 6485 }, { - "epoch": 0.23, - "learning_rate": 4.996783770025094e-05, - "loss": 0.3175, + "epoch": 0.2338991602695787, + "grad_norm": 0.18813565373420715, + "learning_rate": 4.9963120391266376e-05, + "loss": 0.4347, "step": 6490 }, { - "epoch": 0.23, - "learning_rate": 4.9967693084418665e-05, - "loss": 0.3195, + "epoch": 0.23407935993080334, + "grad_norm": 0.2553117275238037, + "learning_rate": 4.9962961774544006e-05, + "loss": 0.4794, "step": 6495 }, { - "epoch": 0.23, - "learning_rate": 4.9967548144397356e-05, - "loss": 0.3236, + "epoch": 0.23425955959202796, + "grad_norm": 0.20878198742866516, + "learning_rate": 4.996280281770611e-05, + "loss": 0.4326, "step": 6500 }, { - "epoch": 0.23, - "eval_loss": 0.3339012861251831, - "eval_runtime": 10.5429, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 0.23425955959202796, + "eval_loss": 0.4936707615852356, + "eval_runtime": 3.5146, + "eval_samples_per_second": 28.453, + "eval_steps_per_second": 7.113, "step": 6500 }, { - "epoch": 0.23, - 
"learning_rate": 4.9967402880188884e-05, - "loss": 0.3493, + "epoch": 0.2344397592532526, + "grad_norm": 0.20102368295192719, + "learning_rate": 4.996264352075487e-05, + "loss": 0.4871, "step": 6505 }, { - "epoch": 0.23, - "learning_rate": 4.9967257291795147e-05, - "loss": 0.3457, + "epoch": 0.23461995891447723, + "grad_norm": 0.20165607333183289, + "learning_rate": 4.996248388369243e-05, + "loss": 0.4737, "step": 6510 }, { - "epoch": 0.23, - "learning_rate": 4.996711137921802e-05, - "loss": 0.3331, + "epoch": 0.23480015857570188, + "grad_norm": 0.1898958534002304, + "learning_rate": 4.996232390652099e-05, + "loss": 0.4562, "step": 6515 }, { - "epoch": 0.23, - "learning_rate": 4.996696514245941e-05, - "loss": 0.3379, + "epoch": 0.2349803582369265, + "grad_norm": 0.2387424260377884, + "learning_rate": 4.996216358924272e-05, + "loss": 0.4935, "step": 6520 }, { - "epoch": 0.23, - "learning_rate": 4.9966818581521214e-05, - "loss": 0.3319, + "epoch": 0.23516055789815116, + "grad_norm": 0.2199474722146988, + "learning_rate": 4.99620029318598e-05, + "loss": 0.4808, "step": 6525 }, { - "epoch": 0.23, - "learning_rate": 4.996667169640533e-05, - "loss": 0.3487, + "epoch": 0.23534075755937578, + "grad_norm": 0.20181065797805786, + "learning_rate": 4.996184193437442e-05, + "loss": 0.4586, "step": 6530 }, { - "epoch": 0.23, - "learning_rate": 4.996652448711366e-05, - "loss": 0.3356, + "epoch": 0.23552095722060043, + "grad_norm": 0.21171115338802338, + "learning_rate": 4.9961680596788784e-05, + "loss": 0.451, "step": 6535 }, { - "epoch": 0.23, - "learning_rate": 4.996637695364813e-05, - "loss": 0.3379, + "epoch": 0.23570115688182505, + "grad_norm": 0.22186800837516785, + "learning_rate": 4.996151891910508e-05, + "loss": 0.4912, "step": 6540 }, { - "epoch": 0.23, - "learning_rate": 4.996622909601065e-05, - "loss": 0.3411, + "epoch": 0.2358813565430497, + "grad_norm": 0.20023906230926514, + "learning_rate": 4.9961356901325515e-05, + "loss": 0.481, "step": 6545 }, { - "epoch": 0.23, - "learning_rate": 4.996608091420314e-05, - "loss": 0.323, + "epoch": 0.23606155620427433, + "grad_norm": 0.2267458438873291, + "learning_rate": 4.9961194543452296e-05, + "loss": 0.4896, "step": 6550 }, { - "epoch": 0.23, - "learning_rate": 4.996593240822752e-05, - "loss": 0.3342, + "epoch": 0.23624175586549898, + "grad_norm": 0.1733904927968979, + "learning_rate": 4.996103184548763e-05, + "loss": 0.4545, "step": 6555 }, { - "epoch": 0.23, - "learning_rate": 4.996578357808572e-05, - "loss": 0.336, + "epoch": 0.2364219555267236, + "grad_norm": 0.1978592723608017, + "learning_rate": 4.9960868807433734e-05, + "loss": 0.4491, "step": 6560 }, { - "epoch": 0.23, - "learning_rate": 4.996563442377967e-05, - "loss": 0.3418, + "epoch": 0.23660215518794825, + "grad_norm": 0.1597129851579666, + "learning_rate": 4.9960705429292836e-05, + "loss": 0.4196, "step": 6565 }, { - "epoch": 0.23, - "learning_rate": 4.996548494531131e-05, - "loss": 0.3527, + "epoch": 0.23678235484917287, + "grad_norm": 0.19397877156734467, + "learning_rate": 4.996054171106716e-05, + "loss": 0.4751, "step": 6570 }, { - "epoch": 0.23, - "learning_rate": 4.996533514268259e-05, - "loss": 0.3421, + "epoch": 0.23696255451039752, + "grad_norm": 0.20515479147434235, + "learning_rate": 4.996037765275894e-05, + "loss": 0.4897, "step": 6575 }, { - "epoch": 0.23, - "learning_rate": 4.996518501589545e-05, - "loss": 0.3535, + "epoch": 0.23714275417162214, + "grad_norm": 0.240402951836586, + "learning_rate": 4.99602132543704e-05, + "loss": 0.4842, "step": 6580 }, { - "epoch": 0.23, 
- "learning_rate": 4.996503456495182e-05, - "loss": 0.3448, + "epoch": 0.2373229538328468, + "grad_norm": 0.20333066582679749, + "learning_rate": 4.996004851590379e-05, + "loss": 0.4708, "step": 6585 }, { - "epoch": 0.23, - "learning_rate": 4.996488378985367e-05, - "loss": 0.3458, + "epoch": 0.23750315349407142, + "grad_norm": 0.2151239812374115, + "learning_rate": 4.995988343736135e-05, + "loss": 0.4748, "step": 6590 }, { - "epoch": 0.23, - "learning_rate": 4.9964732690602964e-05, - "loss": 0.3429, + "epoch": 0.23768335315529607, + "grad_norm": 0.1680813729763031, + "learning_rate": 4.995971801874533e-05, + "loss": 0.4726, "step": 6595 }, { - "epoch": 0.23, - "learning_rate": 4.9964581267201646e-05, - "loss": 0.3458, + "epoch": 0.23786355281652072, + "grad_norm": 0.18834345042705536, + "learning_rate": 4.995955226005799e-05, + "loss": 0.4443, "step": 6600 }, { - "epoch": 0.23, - "learning_rate": 4.99644295196517e-05, - "loss": 0.3492, + "epoch": 0.23804375247774534, + "grad_norm": 0.21085232496261597, + "learning_rate": 4.995938616130158e-05, + "loss": 0.4451, "step": 6605 }, { - "epoch": 0.23, - "learning_rate": 4.996427744795509e-05, - "loss": 0.3352, + "epoch": 0.23822395213897, + "grad_norm": 0.20523454248905182, + "learning_rate": 4.9959219722478365e-05, + "loss": 0.4476, "step": 6610 }, { - "epoch": 0.23, - "learning_rate": 4.996412505211378e-05, - "loss": 0.3193, + "epoch": 0.2384041518001946, + "grad_norm": 0.20655976235866547, + "learning_rate": 4.995908632657321e-05, + "loss": 0.445, "step": 6615 }, { - "epoch": 0.23, - "learning_rate": 4.996397233212978e-05, - "loss": 0.3528, + "epoch": 0.23858435146141926, + "grad_norm": 0.1544501781463623, + "learning_rate": 4.995891927563548e-05, + "loss": 0.454, "step": 6620 }, { - "epoch": 0.23, - "learning_rate": 4.996381928800503e-05, - "loss": 0.3292, + "epoch": 0.2387645511226439, + "grad_norm": 0.20439325273036957, + "learning_rate": 4.9958751884637286e-05, + "loss": 0.4784, "step": 6625 }, { - "epoch": 0.23, - "learning_rate": 4.996366591974155e-05, - "loss": 0.3336, + "epoch": 0.23894475078386854, + "grad_norm": 0.15891632437705994, + "learning_rate": 4.9958584153580933e-05, + "loss": 0.4433, "step": 6630 }, { - "epoch": 0.23, - "learning_rate": 4.9963512227341315e-05, - "loss": 0.3314, + "epoch": 0.23912495044509316, + "grad_norm": 0.20959921181201935, + "learning_rate": 4.995841608246871e-05, + "loss": 0.4862, "step": 6635 }, { - "epoch": 0.23, - "learning_rate": 4.996335821080632e-05, - "loss": 0.3274, + "epoch": 0.2393051501063178, + "grad_norm": 0.17294225096702576, + "learning_rate": 4.995824767130289e-05, + "loss": 0.436, "step": 6640 }, { - "epoch": 0.23, - "learning_rate": 4.996320387013858e-05, - "loss": 0.3435, + "epoch": 0.23948534976754243, + "grad_norm": 0.26996639370918274, + "learning_rate": 4.995807892008578e-05, + "loss": 0.4688, "step": 6645 }, { - "epoch": 0.23, - "learning_rate": 4.996304920534008e-05, - "loss": 0.3747, + "epoch": 0.23966554942876708, + "grad_norm": 0.20224200189113617, + "learning_rate": 4.995790982881968e-05, + "loss": 0.4688, "step": 6650 }, { - "epoch": 0.23, - "learning_rate": 4.9962894216412845e-05, - "loss": 0.3473, + "epoch": 0.2398457490899917, + "grad_norm": 0.17374387383460999, + "learning_rate": 4.995774039750689e-05, + "loss": 0.4692, "step": 6655 }, { - "epoch": 0.23, - "learning_rate": 4.996273890335888e-05, - "loss": 0.32, + "epoch": 0.24002594875121636, + "grad_norm": 0.14203490316867828, + "learning_rate": 4.995757062614972e-05, + "loss": 0.4516, "step": 6660 }, { - "epoch": 
0.23, - "learning_rate": 4.9962583266180194e-05, - "loss": 0.3586, + "epoch": 0.24020614841244098, + "grad_norm": 0.19781635701656342, + "learning_rate": 4.9957400514750484e-05, + "loss": 0.5077, "step": 6665 }, { - "epoch": 0.23, - "learning_rate": 4.9962427304878825e-05, - "loss": 0.3452, + "epoch": 0.24038634807366563, + "grad_norm": 0.2605617344379425, + "learning_rate": 4.99572300633115e-05, + "loss": 0.4947, "step": 6670 }, { - "epoch": 0.23, - "learning_rate": 4.996227101945678e-05, - "loss": 0.3305, + "epoch": 0.24056654773489025, + "grad_norm": 0.22600892186164856, + "learning_rate": 4.995705927183508e-05, + "loss": 0.4973, "step": 6675 }, { - "epoch": 0.24, - "learning_rate": 4.9962114409916105e-05, - "loss": 0.2937, + "epoch": 0.2407467473961149, + "grad_norm": 0.1905760020017624, + "learning_rate": 4.995688814032357e-05, + "loss": 0.5074, "step": 6680 }, { - "epoch": 0.24, - "learning_rate": 4.996195747625881e-05, - "loss": 0.3321, + "epoch": 0.24092694705733952, + "grad_norm": 0.19111768901348114, + "learning_rate": 4.995671666877928e-05, + "loss": 0.4408, "step": 6685 }, { - "epoch": 0.24, - "learning_rate": 4.9961800218486965e-05, - "loss": 0.3408, + "epoch": 0.24110714671856417, + "grad_norm": 0.16061218082904816, + "learning_rate": 4.995654485720456e-05, + "loss": 0.4824, "step": 6690 }, { - "epoch": 0.24, - "learning_rate": 4.996164263660258e-05, - "loss": 0.3322, + "epoch": 0.2412873463797888, + "grad_norm": 0.21395018696784973, + "learning_rate": 4.9956372705601754e-05, + "loss": 0.4909, "step": 6695 }, { - "epoch": 0.24, - "learning_rate": 4.996148473060773e-05, - "loss": 0.3353, + "epoch": 0.24146754604101345, + "grad_norm": 0.18952535092830658, + "learning_rate": 4.9956200213973195e-05, + "loss": 0.476, "step": 6700 }, { - "epoch": 0.24, - "learning_rate": 4.996132650050444e-05, - "loss": 0.3284, + "epoch": 0.24164774570223807, + "grad_norm": 0.23430414497852325, + "learning_rate": 4.9956027382321244e-05, + "loss": 0.4684, "step": 6705 }, { - "epoch": 0.24, - "learning_rate": 4.996116794629478e-05, - "loss": 0.3554, + "epoch": 0.24182794536346272, + "grad_norm": 0.23189213871955872, + "learning_rate": 4.9955854210648246e-05, + "loss": 0.4627, "step": 6710 }, { - "epoch": 0.24, - "learning_rate": 4.99610090679808e-05, - "loss": 0.34, + "epoch": 0.24200814502468734, + "grad_norm": 0.19272887706756592, + "learning_rate": 4.995568069895657e-05, + "loss": 0.4993, "step": 6715 }, { - "epoch": 0.24, - "learning_rate": 4.9960849865564576e-05, - "loss": 0.3264, + "epoch": 0.242188344685912, + "grad_norm": 0.24815817177295685, + "learning_rate": 4.995550684724858e-05, + "loss": 0.4526, "step": 6720 }, { - "epoch": 0.24, - "learning_rate": 4.996069033904816e-05, - "loss": 0.3286, + "epoch": 0.24236854434713662, + "grad_norm": 0.2101772576570511, + "learning_rate": 4.995533265552663e-05, + "loss": 0.4711, "step": 6725 }, { - "epoch": 0.24, - "learning_rate": 4.9960530488433634e-05, - "loss": 0.3279, + "epoch": 0.24254874400836127, + "grad_norm": 0.18738162517547607, + "learning_rate": 4.995515812379311e-05, + "loss": 0.4603, "step": 6730 }, { - "epoch": 0.24, - "learning_rate": 4.996037031372306e-05, - "loss": 0.3283, + "epoch": 0.2427289436695859, + "grad_norm": 0.20007184147834778, + "learning_rate": 4.9954983252050393e-05, + "loss": 0.4957, "step": 6735 }, { - "epoch": 0.24, - "learning_rate": 4.9960209814918534e-05, - "loss": 0.3508, + "epoch": 0.24290914333081054, + "grad_norm": 0.22411781549453735, + "learning_rate": 4.995480804030086e-05, + "loss": 0.4503, "step": 6740 }, 
{ - "epoch": 0.24, - "learning_rate": 4.996004899202213e-05, - "loss": 0.3734, + "epoch": 0.24308934299203516, + "grad_norm": 0.19585049152374268, + "learning_rate": 4.99546324885469e-05, + "loss": 0.4572, "step": 6745 }, { - "epoch": 0.24, - "learning_rate": 4.995988784503595e-05, - "loss": 0.3299, + "epoch": 0.2432695426532598, + "grad_norm": 0.1979917585849762, + "learning_rate": 4.99544565967909e-05, + "loss": 0.436, "step": 6750 }, { - "epoch": 0.24, - "learning_rate": 4.995972637396206e-05, - "loss": 0.3035, + "epoch": 0.24344974231448444, + "grad_norm": 0.16787096858024597, + "learning_rate": 4.995428036503527e-05, + "loss": 0.4745, "step": 6755 }, { - "epoch": 0.24, - "learning_rate": 4.995956457880257e-05, - "loss": 0.3312, + "epoch": 0.24362994197570909, + "grad_norm": 0.1978481262922287, + "learning_rate": 4.995410379328239e-05, + "loss": 0.4553, "step": 6760 }, { - "epoch": 0.24, - "learning_rate": 4.99594024595596e-05, - "loss": 0.319, + "epoch": 0.24381014163693374, + "grad_norm": 0.22523565590381622, + "learning_rate": 4.995392688153468e-05, + "loss": 0.4355, "step": 6765 }, { - "epoch": 0.24, - "learning_rate": 4.9959240016235233e-05, - "loss": 0.3293, + "epoch": 0.24399034129815836, + "grad_norm": 0.16378116607666016, + "learning_rate": 4.995374962979455e-05, + "loss": 0.4136, "step": 6770 }, { - "epoch": 0.24, - "learning_rate": 4.995907724883158e-05, - "loss": 0.3427, + "epoch": 0.244170540959383, + "grad_norm": 0.17417633533477783, + "learning_rate": 4.995357203806441e-05, + "loss": 0.4231, "step": 6775 }, { - "epoch": 0.24, - "learning_rate": 4.995891415735075e-05, - "loss": 0.3499, + "epoch": 0.24435074062060763, + "grad_norm": 0.20466962456703186, + "learning_rate": 4.9953394106346686e-05, + "loss": 0.5051, "step": 6780 }, { - "epoch": 0.24, - "learning_rate": 4.995875074179488e-05, - "loss": 0.355, + "epoch": 0.24453094028183228, + "grad_norm": 0.1599748432636261, + "learning_rate": 4.99532158346438e-05, + "loss": 0.4419, "step": 6785 }, { - "epoch": 0.24, - "learning_rate": 4.995858700216607e-05, - "loss": 0.3269, + "epoch": 0.2447111399430569, + "grad_norm": 0.18782804906368256, + "learning_rate": 4.995303722295816e-05, + "loss": 0.4459, "step": 6790 }, { - "epoch": 0.24, - "learning_rate": 4.995842293846647e-05, - "loss": 0.3251, + "epoch": 0.24489133960428155, + "grad_norm": 0.19041045010089874, + "learning_rate": 4.995285827129224e-05, + "loss": 0.458, "step": 6795 }, { - "epoch": 0.24, - "learning_rate": 4.995825855069818e-05, - "loss": 0.3315, + "epoch": 0.24507153926550618, + "grad_norm": 0.2037457972764969, + "learning_rate": 4.995267897964845e-05, + "loss": 0.4666, "step": 6800 }, { - "epoch": 0.24, - "learning_rate": 4.995809383886336e-05, - "loss": 0.3505, + "epoch": 0.24525173892673083, + "grad_norm": 0.2316879779100418, + "learning_rate": 4.995249934802925e-05, + "loss": 0.4435, "step": 6805 }, { - "epoch": 0.24, - "learning_rate": 4.9957928802964135e-05, - "loss": 0.3231, + "epoch": 0.24543193858795545, + "grad_norm": 0.19015929102897644, + "learning_rate": 4.995231937643706e-05, + "loss": 0.4936, "step": 6810 }, { - "epoch": 0.24, - "learning_rate": 4.995776344300266e-05, - "loss": 0.3082, + "epoch": 0.2456121382491801, + "grad_norm": 0.17666086554527283, + "learning_rate": 4.995213906487436e-05, + "loss": 0.443, "step": 6815 }, { - "epoch": 0.24, - "learning_rate": 4.995759775898107e-05, - "loss": 0.3222, + "epoch": 0.24579233791040472, + "grad_norm": 0.17423413693904877, + "learning_rate": 4.995195841334359e-05, + "loss": 0.4887, "step": 6820 }, { 
- "epoch": 0.24, - "learning_rate": 4.995743175090152e-05, - "loss": 0.327, + "epoch": 0.24597253757162937, + "grad_norm": 0.19055670499801636, + "learning_rate": 4.9951777421847225e-05, + "loss": 0.4609, "step": 6825 }, { - "epoch": 0.24, - "learning_rate": 4.9957265418766165e-05, - "loss": 0.3357, + "epoch": 0.246152737232854, + "grad_norm": 0.21230706572532654, + "learning_rate": 4.995159609038772e-05, + "loss": 0.4283, "step": 6830 }, { - "epoch": 0.24, - "learning_rate": 4.9957098762577174e-05, - "loss": 0.3527, + "epoch": 0.24633293689407865, + "grad_norm": 0.20157566666603088, + "learning_rate": 4.995141441896754e-05, + "loss": 0.4772, "step": 6835 }, { - "epoch": 0.24, - "learning_rate": 4.9956931782336694e-05, - "loss": 0.3178, + "epoch": 0.24651313655530327, + "grad_norm": 0.177265927195549, + "learning_rate": 4.995123240758919e-05, + "loss": 0.4816, "step": 6840 }, { - "epoch": 0.24, - "learning_rate": 4.99567644780469e-05, - "loss": 0.3072, + "epoch": 0.24669333621652792, + "grad_norm": 0.20768406987190247, + "learning_rate": 4.995105005625511e-05, + "loss": 0.482, "step": 6845 }, { - "epoch": 0.24, - "learning_rate": 4.995659684970997e-05, - "loss": 0.338, + "epoch": 0.24687353587775254, + "grad_norm": 0.18862581253051758, + "learning_rate": 4.9950867364967814e-05, + "loss": 0.4796, "step": 6850 }, { - "epoch": 0.24, - "learning_rate": 4.995642889732808e-05, - "loss": 0.3291, + "epoch": 0.2470537355389772, + "grad_norm": 0.20289580523967743, + "learning_rate": 4.995068433372978e-05, + "loss": 0.509, "step": 6855 }, { - "epoch": 0.24, - "learning_rate": 4.9956260620903406e-05, - "loss": 0.3177, + "epoch": 0.24723393520020182, + "grad_norm": 0.23247657716274261, + "learning_rate": 4.9950500962543503e-05, + "loss": 0.4609, "step": 6860 }, { - "epoch": 0.24, - "learning_rate": 4.995609202043814e-05, - "loss": 0.3261, + "epoch": 0.24741413486142647, + "grad_norm": 0.17218773066997528, + "learning_rate": 4.995031725141147e-05, + "loss": 0.4785, "step": 6865 }, { - "epoch": 0.24, - "learning_rate": 4.9955923095934454e-05, - "loss": 0.3423, + "epoch": 0.2475943345226511, + "grad_norm": 0.21331791579723358, + "learning_rate": 4.99501332003362e-05, + "loss": 0.4784, "step": 6870 }, { - "epoch": 0.24, - "learning_rate": 4.995575384739456e-05, - "loss": 0.3177, + "epoch": 0.24777453418387574, + "grad_norm": 0.25006669759750366, + "learning_rate": 4.99499488093202e-05, + "loss": 0.4931, "step": 6875 }, { - "epoch": 0.24, - "learning_rate": 4.995558427482065e-05, - "loss": 0.3263, + "epoch": 0.24795473384510036, + "grad_norm": 0.18483485281467438, + "learning_rate": 4.994976407836598e-05, + "loss": 0.4738, "step": 6880 }, { - "epoch": 0.24, - "learning_rate": 4.995541437821492e-05, - "loss": 0.3622, + "epoch": 0.248134933506325, + "grad_norm": 0.18278944492340088, + "learning_rate": 4.994957900747606e-05, + "loss": 0.4416, "step": 6885 }, { - "epoch": 0.24, - "learning_rate": 4.995524415757958e-05, - "loss": 0.3131, + "epoch": 0.24831513316754963, + "grad_norm": 0.17712728679180145, + "learning_rate": 4.9949393596652936e-05, + "loss": 0.5227, "step": 6890 }, { - "epoch": 0.24, - "learning_rate": 4.995507361291685e-05, - "loss": 0.3471, + "epoch": 0.24849533282877428, + "grad_norm": 0.18737031519412994, + "learning_rate": 4.994920784589917e-05, + "loss": 0.4913, "step": 6895 }, { - "epoch": 0.24, - "learning_rate": 4.995490274422893e-05, - "loss": 0.3, + "epoch": 0.2486755324899989, + "grad_norm": 0.2273300439119339, + "learning_rate": 4.9949021755217276e-05, + "loss": 0.4476, "step": 6900 
}, { - "epoch": 0.24, - "learning_rate": 4.995473155151804e-05, - "loss": 0.339, + "epoch": 0.24885573215122356, + "grad_norm": 0.238374263048172, + "learning_rate": 4.9948835324609786e-05, + "loss": 0.4756, "step": 6905 }, { - "epoch": 0.24, - "learning_rate": 4.995456003478641e-05, - "loss": 0.3626, + "epoch": 0.24903593181244818, + "grad_norm": 0.21403105556964874, + "learning_rate": 4.9948648554079246e-05, + "loss": 0.448, "step": 6910 }, { - "epoch": 0.24, - "learning_rate": 4.995438819403626e-05, - "loss": 0.2978, + "epoch": 0.24921613147367283, + "grad_norm": 0.23275822401046753, + "learning_rate": 4.9948461443628205e-05, + "loss": 0.48, "step": 6915 }, { - "epoch": 0.24, - "learning_rate": 4.995421602926983e-05, - "loss": 0.3304, + "epoch": 0.24939633113489745, + "grad_norm": 0.13692793250083923, + "learning_rate": 4.99482739932592e-05, + "loss": 0.4495, "step": 6920 }, { - "epoch": 0.24, - "learning_rate": 4.9954043540489346e-05, - "loss": 0.2999, + "epoch": 0.2495765307961221, + "grad_norm": 0.2151869684457779, + "learning_rate": 4.9948086202974795e-05, + "loss": 0.4986, "step": 6925 }, { - "epoch": 0.24, - "learning_rate": 4.995387072769706e-05, - "loss": 0.3274, + "epoch": 0.24975673045734675, + "grad_norm": 0.19741404056549072, + "learning_rate": 4.9947898072777557e-05, + "loss": 0.4688, "step": 6930 }, { - "epoch": 0.24, - "learning_rate": 4.99536975908952e-05, - "loss": 0.3499, + "epoch": 0.24993693011857138, + "grad_norm": 0.1841607540845871, + "learning_rate": 4.994770960267002e-05, + "loss": 0.4252, "step": 6935 }, { - "epoch": 0.24, - "learning_rate": 4.995352413008603e-05, - "loss": 0.337, + "epoch": 0.250117129779796, + "grad_norm": 0.16864712536334991, + "learning_rate": 4.994752079265478e-05, + "loss": 0.4635, "step": 6940 }, { - "epoch": 0.24, - "learning_rate": 4.995335034527178e-05, - "loss": 0.3155, + "epoch": 0.2502973294410207, + "grad_norm": 0.1859603226184845, + "learning_rate": 4.9947331642734394e-05, + "loss": 0.4722, "step": 6945 }, { - "epoch": 0.24, - "learning_rate": 4.995317623645474e-05, - "loss": 0.3544, + "epoch": 0.2504775291022453, + "grad_norm": 0.19181598722934723, + "learning_rate": 4.994714215291144e-05, + "loss": 0.5074, "step": 6950 }, { - "epoch": 0.24, - "learning_rate": 4.9953001803637136e-05, - "loss": 0.3439, + "epoch": 0.2506577287634699, + "grad_norm": 0.17568619549274445, + "learning_rate": 4.9946952323188514e-05, + "loss": 0.4396, "step": 6955 }, { - "epoch": 0.24, - "learning_rate": 4.995282704682125e-05, - "loss": 0.3484, + "epoch": 0.25083792842469455, + "grad_norm": 0.18716076016426086, + "learning_rate": 4.9946762153568195e-05, + "loss": 0.4776, "step": 6960 }, { - "epoch": 0.25, - "learning_rate": 4.995265196600935e-05, - "loss": 0.3429, + "epoch": 0.2510181280859192, + "grad_norm": 0.18289898335933685, + "learning_rate": 4.994657164405306e-05, + "loss": 0.4569, "step": 6965 }, { - "epoch": 0.25, - "learning_rate": 4.9952476561203706e-05, - "loss": 0.3532, + "epoch": 0.25119832774714385, + "grad_norm": 0.1580553501844406, + "learning_rate": 4.994638079464572e-05, + "loss": 0.4865, "step": 6970 }, { - "epoch": 0.25, - "learning_rate": 4.9952300832406605e-05, - "loss": 0.3107, + "epoch": 0.25137852740836847, + "grad_norm": 0.200215145945549, + "learning_rate": 4.9946189605348775e-05, + "loss": 0.4281, "step": 6975 }, { - "epoch": 0.25, - "learning_rate": 4.9952124779620324e-05, - "loss": 0.3309, + "epoch": 0.2515587270695931, + "grad_norm": 0.16953638195991516, + "learning_rate": 4.9945998076164824e-05, + "loss": 0.4802, "step": 
6980 }, { - "epoch": 0.25, - "learning_rate": 4.995194840284714e-05, - "loss": 0.2985, + "epoch": 0.25173892673081777, + "grad_norm": 0.22101880609989166, + "learning_rate": 4.994580620709648e-05, + "loss": 0.4825, "step": 6985 }, { - "epoch": 0.25, - "learning_rate": 4.995177170208935e-05, - "loss": 0.3145, + "epoch": 0.2519191263920424, + "grad_norm": 0.1716253012418747, + "learning_rate": 4.9945613998146356e-05, + "loss": 0.4562, "step": 6990 }, { - "epoch": 0.25, - "learning_rate": 4.995159467734925e-05, - "loss": 0.3284, + "epoch": 0.252099326053267, + "grad_norm": 0.19001296162605286, + "learning_rate": 4.9945421449317065e-05, + "loss": 0.4797, "step": 6995 }, { - "epoch": 0.25, - "learning_rate": 4.9951417328629136e-05, - "loss": 0.3456, + "epoch": 0.25227952571449164, + "grad_norm": 0.1752406656742096, + "learning_rate": 4.9945228560611244e-05, + "loss": 0.4398, "step": 7000 }, { - "epoch": 0.25, - "eval_loss": 0.3314030170440674, - "eval_runtime": 10.5578, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 0.25227952571449164, + "eval_loss": 0.49143746495246887, + "eval_runtime": 3.5206, + "eval_samples_per_second": 28.404, + "eval_steps_per_second": 7.101, "step": 7000 }, { - "epoch": 0.25, - "learning_rate": 4.9951239655931314e-05, - "loss": 0.3508, + "epoch": 0.2524597253757163, + "grad_norm": 0.15203414857387543, + "learning_rate": 4.994503533203151e-05, + "loss": 0.4602, "step": 7005 }, { - "epoch": 0.25, - "learning_rate": 4.995106165925809e-05, - "loss": 0.3357, + "epoch": 0.25263992503694094, + "grad_norm": 0.1956150233745575, + "learning_rate": 4.9944841763580505e-05, + "loss": 0.4639, "step": 7010 }, { - "epoch": 0.25, - "learning_rate": 4.9950883338611765e-05, - "loss": 0.3471, + "epoch": 0.25282012469816556, + "grad_norm": 0.18748140335083008, + "learning_rate": 4.9944647855260854e-05, + "loss": 0.4597, "step": 7015 }, { - "epoch": 0.25, - "learning_rate": 4.9950704693994673e-05, - "loss": 0.2958, + "epoch": 0.2530003243593902, + "grad_norm": 0.19696061313152313, + "learning_rate": 4.99444536070752e-05, + "loss": 0.449, "step": 7020 }, { - "epoch": 0.25, - "learning_rate": 4.995052572540912e-05, - "loss": 0.3492, + "epoch": 0.25318052402061486, + "grad_norm": 0.1847335398197174, + "learning_rate": 4.9944259019026207e-05, + "loss": 0.4875, "step": 7025 }, { - "epoch": 0.25, - "learning_rate": 4.995034643285743e-05, - "loss": 0.3428, + "epoch": 0.2533607236818395, + "grad_norm": 0.17691460251808167, + "learning_rate": 4.99440640911165e-05, + "loss": 0.4793, "step": 7030 }, { - "epoch": 0.25, - "learning_rate": 4.995016681634194e-05, - "loss": 0.3175, + "epoch": 0.2535409233430641, + "grad_norm": 0.18585610389709473, + "learning_rate": 4.994386882334877e-05, + "loss": 0.4646, "step": 7035 }, { - "epoch": 0.25, - "learning_rate": 4.9949986875864976e-05, - "loss": 0.3196, + "epoch": 0.25372112300428873, + "grad_norm": 0.2865478992462158, + "learning_rate": 4.994367321572564e-05, + "loss": 0.491, "step": 7040 }, { - "epoch": 0.25, - "learning_rate": 4.994980661142887e-05, - "loss": 0.3415, + "epoch": 0.2539013226655134, + "grad_norm": 0.20224061608314514, + "learning_rate": 4.9943477268249796e-05, + "loss": 0.5204, "step": 7045 }, { - "epoch": 0.25, - "learning_rate": 4.9949626023035974e-05, - "loss": 0.3353, + "epoch": 0.25408152232673803, + "grad_norm": 0.181314155459404, + "learning_rate": 4.99432809809239e-05, + "loss": 0.4963, "step": 7050 }, { - "epoch": 0.25, - "learning_rate": 4.994944511068862e-05, - "loss": 0.3419, + "epoch": 
0.25426172198796265, + "grad_norm": 0.22092019021511078, + "learning_rate": 4.9943084353750635e-05, + "loss": 0.4489, "step": 7055 }, { - "epoch": 0.25, - "learning_rate": 4.9949263874389176e-05, - "loss": 0.3174, + "epoch": 0.2544419216491873, + "grad_norm": 0.19176073372364044, + "learning_rate": 4.9942887386732676e-05, + "loss": 0.4806, "step": 7060 }, { - "epoch": 0.25, - "learning_rate": 4.994908231413997e-05, - "loss": 0.3193, + "epoch": 0.25462212131041195, + "grad_norm": 0.2104840874671936, + "learning_rate": 4.99426900798727e-05, + "loss": 0.4741, "step": 7065 }, { - "epoch": 0.25, - "learning_rate": 4.994890042994337e-05, - "loss": 0.351, + "epoch": 0.2548023209716366, + "grad_norm": 0.14947918057441711, + "learning_rate": 4.9942492433173405e-05, + "loss": 0.5013, "step": 7070 }, { - "epoch": 0.25, - "learning_rate": 4.994871822180175e-05, - "loss": 0.3311, + "epoch": 0.2549825206328612, + "grad_norm": 0.1898050606250763, + "learning_rate": 4.9942294446637486e-05, + "loss": 0.4995, "step": 7075 }, { - "epoch": 0.25, - "learning_rate": 4.994853568971746e-05, - "loss": 0.3302, + "epoch": 0.2551627202940858, + "grad_norm": 0.18579892814159393, + "learning_rate": 4.9942096120267626e-05, + "loss": 0.4634, "step": 7080 }, { - "epoch": 0.25, - "learning_rate": 4.994835283369288e-05, - "loss": 0.372, + "epoch": 0.2553429199553105, + "grad_norm": 0.2083805650472641, + "learning_rate": 4.9941897454066535e-05, + "loss": 0.4602, "step": 7085 }, { - "epoch": 0.25, - "learning_rate": 4.994816965373038e-05, - "loss": 0.3315, + "epoch": 0.2555231196165351, + "grad_norm": 0.22849465906620026, + "learning_rate": 4.9941698448036916e-05, + "loss": 0.4593, "step": 7090 }, { - "epoch": 0.25, - "learning_rate": 4.994798614983234e-05, - "loss": 0.3142, + "epoch": 0.25570331927775974, + "grad_norm": 0.2611923813819885, + "learning_rate": 4.994149910218149e-05, + "loss": 0.4715, "step": 7095 }, { - "epoch": 0.25, - "learning_rate": 4.994780232200113e-05, - "loss": 0.3445, + "epoch": 0.2558835189389844, + "grad_norm": 0.20796480774879456, + "learning_rate": 4.994129941650296e-05, + "loss": 0.4678, "step": 7100 }, { - "epoch": 0.25, - "learning_rate": 4.9947618170239154e-05, - "loss": 0.3185, + "epoch": 0.25606371860020904, + "grad_norm": 0.16699600219726562, + "learning_rate": 4.994109939100406e-05, + "loss": 0.4886, "step": 7105 }, { - "epoch": 0.25, - "learning_rate": 4.99474336945488e-05, - "loss": 0.3178, + "epoch": 0.25624391826143367, + "grad_norm": 0.18829451501369476, + "learning_rate": 4.994089902568751e-05, + "loss": 0.4502, "step": 7110 }, { - "epoch": 0.25, - "learning_rate": 4.9947248894932464e-05, - "loss": 0.3449, + "epoch": 0.2564241179226583, + "grad_norm": 0.20573166012763977, + "learning_rate": 4.994069832055604e-05, + "loss": 0.4985, "step": 7115 }, { - "epoch": 0.25, - "learning_rate": 4.994706377139253e-05, - "loss": 0.3358, + "epoch": 0.25660431758388297, + "grad_norm": 0.22231273353099823, + "learning_rate": 4.994049727561239e-05, + "loss": 0.4926, "step": 7120 }, { - "epoch": 0.25, - "learning_rate": 4.994687832393142e-05, - "loss": 0.3166, + "epoch": 0.2567845172451076, + "grad_norm": 0.19772711396217346, + "learning_rate": 4.994029589085929e-05, + "loss": 0.4781, "step": 7125 }, { - "epoch": 0.25, - "learning_rate": 4.994669255255154e-05, - "loss": 0.3177, + "epoch": 0.2569647169063322, + "grad_norm": 0.24097611010074615, + "learning_rate": 4.9940094166299486e-05, + "loss": 0.4474, "step": 7130 }, { - "epoch": 0.25, - "learning_rate": 4.994650645725529e-05, - "loss": 0.3748, + 
"epoch": 0.25714491656755684, + "grad_norm": 0.2717370390892029, + "learning_rate": 4.9939892101935723e-05, + "loss": 0.473, "step": 7135 }, { - "epoch": 0.25, - "learning_rate": 4.99463200380451e-05, - "loss": 0.3148, + "epoch": 0.2573251162287815, + "grad_norm": 0.16408710181713104, + "learning_rate": 4.993968969777076e-05, + "loss": 0.4656, "step": 7140 }, { - "epoch": 0.25, - "learning_rate": 4.9946133294923384e-05, - "loss": 0.3303, + "epoch": 0.25750531589000614, + "grad_norm": 0.19900165498256683, + "learning_rate": 4.993948695380736e-05, + "loss": 0.4309, "step": 7145 }, { - "epoch": 0.25, - "learning_rate": 4.9945946227892566e-05, - "loss": 0.3862, + "epoch": 0.25768551555123076, + "grad_norm": 0.19811104238033295, + "learning_rate": 4.993928387004827e-05, + "loss": 0.4194, "step": 7150 }, { - "epoch": 0.25, - "learning_rate": 4.9945758836955075e-05, - "loss": 0.334, + "epoch": 0.2578657152124554, + "grad_norm": 0.20136548578739166, + "learning_rate": 4.9939080446496264e-05, + "loss": 0.4728, "step": 7155 }, { - "epoch": 0.25, - "learning_rate": 4.9945571122113344e-05, - "loss": 0.3192, + "epoch": 0.25804591487368006, + "grad_norm": 0.20853722095489502, + "learning_rate": 4.993887668315413e-05, + "loss": 0.436, "step": 7160 }, { - "epoch": 0.25, - "learning_rate": 4.994538308336981e-05, - "loss": 0.3359, + "epoch": 0.2582261145349047, + "grad_norm": 0.18556027114391327, + "learning_rate": 4.9938672580024615e-05, + "loss": 0.4321, "step": 7165 }, { - "epoch": 0.25, - "learning_rate": 4.9945194720726926e-05, - "loss": 0.3084, + "epoch": 0.2584063141961293, + "grad_norm": 0.24166804552078247, + "learning_rate": 4.993846813711052e-05, + "loss": 0.4749, "step": 7170 }, { - "epoch": 0.25, - "learning_rate": 4.9945006034187124e-05, - "loss": 0.3323, + "epoch": 0.25858651385735393, + "grad_norm": 0.17454582452774048, + "learning_rate": 4.9938263354414626e-05, + "loss": 0.4782, "step": 7175 }, { - "epoch": 0.25, - "learning_rate": 4.9944817023752853e-05, - "loss": 0.3196, + "epoch": 0.2587667135185786, + "grad_norm": 0.18085728585720062, + "learning_rate": 4.993805823193972e-05, + "loss": 0.4631, "step": 7180 }, { - "epoch": 0.25, - "learning_rate": 4.9944627689426574e-05, - "loss": 0.3358, + "epoch": 0.25894691317980323, + "grad_norm": 0.168624609708786, + "learning_rate": 4.99378527696886e-05, + "loss": 0.4665, "step": 7185 }, { - "epoch": 0.25, - "learning_rate": 4.994443803121075e-05, - "loss": 0.3596, + "epoch": 0.25912711284102785, + "grad_norm": 0.17559649050235748, + "learning_rate": 4.9937646967664066e-05, + "loss": 0.4262, "step": 7190 }, { - "epoch": 0.25, - "learning_rate": 4.994424804910783e-05, - "loss": 0.3423, + "epoch": 0.2593073125022525, + "grad_norm": 0.1460302621126175, + "learning_rate": 4.993744082586891e-05, + "loss": 0.4484, "step": 7195 }, { - "epoch": 0.25, - "learning_rate": 4.9944057743120284e-05, - "loss": 0.3223, + "epoch": 0.25948751216347715, + "grad_norm": 0.250219464302063, + "learning_rate": 4.9937234344305964e-05, + "loss": 0.477, "step": 7200 }, { - "epoch": 0.25, - "learning_rate": 4.99438671132506e-05, - "loss": 0.3438, + "epoch": 0.2596677118247018, + "grad_norm": 0.17748834192752838, + "learning_rate": 4.993702752297802e-05, + "loss": 0.4931, "step": 7205 }, { - "epoch": 0.25, - "learning_rate": 4.994367615950123e-05, - "loss": 0.2991, + "epoch": 0.2598479114859264, + "grad_norm": 0.21896113455295563, + "learning_rate": 4.99368203618879e-05, + "loss": 0.4621, "step": 7210 }, { - "epoch": 0.25, - "learning_rate": 4.994348488187467e-05, - "loss": 
0.3617, + "epoch": 0.260028111147151, + "grad_norm": 0.25870126485824585, + "learning_rate": 4.9936612861038446e-05, + "loss": 0.4634, "step": 7215 }, { - "epoch": 0.25, - "learning_rate": 4.9943293280373395e-05, - "loss": 0.3359, + "epoch": 0.2602083108083757, + "grad_norm": 0.1914265900850296, + "learning_rate": 4.993640502043246e-05, + "loss": 0.4604, "step": 7220 }, { - "epoch": 0.25, - "learning_rate": 4.994310135499989e-05, - "loss": 0.3333, + "epoch": 0.2603885104696003, + "grad_norm": 0.16129115223884583, + "learning_rate": 4.993619684007278e-05, + "loss": 0.4598, "step": 7225 }, { - "epoch": 0.25, - "learning_rate": 4.994290910575666e-05, - "loss": 0.3275, + "epoch": 0.26056871013082494, + "grad_norm": 0.19293855130672455, + "learning_rate": 4.993598831996225e-05, + "loss": 0.464, "step": 7230 }, { - "epoch": 0.25, - "learning_rate": 4.994271653264619e-05, - "loss": 0.3242, + "epoch": 0.26074890979204957, + "grad_norm": 0.1789807379245758, + "learning_rate": 4.993577946010371e-05, + "loss": 0.4816, "step": 7235 }, { - "epoch": 0.25, - "learning_rate": 4.994252363567098e-05, - "loss": 0.3308, + "epoch": 0.26092910945327424, + "grad_norm": 0.19150225818157196, + "learning_rate": 4.99355702605e-05, + "loss": 0.4875, "step": 7240 }, { - "epoch": 0.25, - "learning_rate": 4.9942330414833546e-05, - "loss": 0.3246, + "epoch": 0.26110930911449887, + "grad_norm": 0.2224695235490799, + "learning_rate": 4.9935360721153965e-05, + "loss": 0.4727, "step": 7245 }, { - "epoch": 0.26, - "learning_rate": 4.9942136870136383e-05, - "loss": 0.3392, + "epoch": 0.2612895087757235, + "grad_norm": 0.17574471235275269, + "learning_rate": 4.993515084206848e-05, + "loss": 0.4247, "step": 7250 }, { - "epoch": 0.26, - "learning_rate": 4.9941943001582015e-05, - "loss": 0.331, + "epoch": 0.2614697084369481, + "grad_norm": 0.16948924958705902, + "learning_rate": 4.9934940623246387e-05, + "loss": 0.4704, "step": 7255 }, { - "epoch": 0.26, - "learning_rate": 4.994174880917295e-05, - "loss": 0.3131, + "epoch": 0.2616499080981728, + "grad_norm": 0.17520290613174438, + "learning_rate": 4.993473006469055e-05, + "loss": 0.4646, "step": 7260 }, { - "epoch": 0.26, - "learning_rate": 4.9941554292911715e-05, - "loss": 0.3576, + "epoch": 0.2618301077593974, + "grad_norm": 0.20636965334415436, + "learning_rate": 4.993451916640386e-05, + "loss": 0.4463, "step": 7265 }, { - "epoch": 0.26, - "learning_rate": 4.9941359452800826e-05, - "loss": 0.3267, + "epoch": 0.26201030742062204, + "grad_norm": 0.1967248171567917, + "learning_rate": 4.9934307928389154e-05, + "loss": 0.4631, "step": 7270 }, { - "epoch": 0.26, - "learning_rate": 4.994116428884283e-05, - "loss": 0.3251, + "epoch": 0.2621905070818467, + "grad_norm": 0.22940480709075928, + "learning_rate": 4.993409635064934e-05, + "loss": 0.4354, "step": 7275 }, { - "epoch": 0.26, - "learning_rate": 4.994096880104026e-05, - "loss": 0.3205, + "epoch": 0.26237070674307134, + "grad_norm": 0.2190198004245758, + "learning_rate": 4.9933884433187295e-05, + "loss": 0.4826, "step": 7280 }, { - "epoch": 0.26, - "learning_rate": 4.994077298939564e-05, - "loss": 0.3455, + "epoch": 0.26255090640429596, + "grad_norm": 0.16079477965831757, + "learning_rate": 4.9933672176005894e-05, + "loss": 0.4499, "step": 7285 }, { - "epoch": 0.26, - "learning_rate": 4.994057685391152e-05, - "loss": 0.3158, + "epoch": 0.2627311060655206, + "grad_norm": 0.24084745347499847, + "learning_rate": 4.993345957910804e-05, + "loss": 0.5024, "step": 7290 }, { - "epoch": 0.26, - "learning_rate": 4.994038039459045e-05, - 
"loss": 0.3288, + "epoch": 0.26291130572674526, + "grad_norm": 0.16274696588516235, + "learning_rate": 4.9933246642496626e-05, + "loss": 0.4463, "step": 7295 }, { - "epoch": 0.26, - "learning_rate": 4.994018361143497e-05, - "loss": 0.3069, + "epoch": 0.2630915053879699, + "grad_norm": 0.21243201196193695, + "learning_rate": 4.9933033366174554e-05, + "loss": 0.4764, "step": 7300 }, { - "epoch": 0.26, - "learning_rate": 4.993998650444764e-05, - "loss": 0.3741, + "epoch": 0.2632717050491945, + "grad_norm": 0.20116925239562988, + "learning_rate": 4.9932819750144734e-05, + "loss": 0.484, "step": 7305 }, { - "epoch": 0.26, - "learning_rate": 4.9939789073631025e-05, - "loss": 0.3298, + "epoch": 0.2634519047104191, + "grad_norm": 0.20278726518154144, + "learning_rate": 4.993260579441006e-05, + "loss": 0.4757, "step": 7310 }, { - "epoch": 0.26, - "learning_rate": 4.993959131898769e-05, - "loss": 0.3459, + "epoch": 0.2636321043716438, + "grad_norm": 0.17847001552581787, + "learning_rate": 4.993239149897347e-05, + "loss": 0.4309, "step": 7315 }, { - "epoch": 0.26, - "learning_rate": 4.993939324052019e-05, - "loss": 0.3345, + "epoch": 0.2638123040328684, + "grad_norm": 0.1934574991464615, + "learning_rate": 4.993217686383787e-05, + "loss": 0.4267, "step": 7320 }, { - "epoch": 0.26, - "learning_rate": 4.9939194838231104e-05, - "loss": 0.3655, + "epoch": 0.26399250369409305, + "grad_norm": 0.1816255897283554, + "learning_rate": 4.993196188900618e-05, + "loss": 0.4676, "step": 7325 }, { - "epoch": 0.26, - "learning_rate": 4.993899611212302e-05, - "loss": 0.3267, + "epoch": 0.2641727033553177, + "grad_norm": 0.1440691500902176, + "learning_rate": 4.993174657448135e-05, + "loss": 0.4451, "step": 7330 }, { - "epoch": 0.26, - "learning_rate": 4.993879706219849e-05, - "loss": 0.3244, + "epoch": 0.26435290301654235, + "grad_norm": 0.18416348099708557, + "learning_rate": 4.993153092026629e-05, + "loss": 0.4692, "step": 7335 }, { - "epoch": 0.26, - "learning_rate": 4.993859768846013e-05, - "loss": 0.3159, + "epoch": 0.264533102677767, + "grad_norm": 0.1908058077096939, + "learning_rate": 4.9931314926363945e-05, + "loss": 0.438, "step": 7340 }, { - "epoch": 0.26, - "learning_rate": 4.99383979909105e-05, - "loss": 0.3231, + "epoch": 0.2647133023389916, + "grad_norm": 0.17665372788906097, + "learning_rate": 4.993109859277727e-05, + "loss": 0.4475, "step": 7345 }, { - "epoch": 0.26, - "learning_rate": 4.993819796955221e-05, - "loss": 0.3526, + "epoch": 0.2648935020002162, + "grad_norm": 0.20020587742328644, + "learning_rate": 4.99308819195092e-05, + "loss": 0.466, "step": 7350 }, { - "epoch": 0.26, - "learning_rate": 4.9937997624387864e-05, - "loss": 0.329, + "epoch": 0.2650737016614409, + "grad_norm": 0.20081447064876556, + "learning_rate": 4.9930664906562695e-05, + "loss": 0.4793, "step": 7355 }, { - "epoch": 0.26, - "learning_rate": 4.993779695542005e-05, - "loss": 0.3234, + "epoch": 0.2652539013226655, + "grad_norm": 0.1721320003271103, + "learning_rate": 4.993049105163899e-05, + "loss": 0.4658, "step": 7360 }, { - "epoch": 0.26, - "learning_rate": 4.993759596265138e-05, - "loss": 0.3473, + "epoch": 0.26543410098389014, + "grad_norm": 0.24417546391487122, + "learning_rate": 4.993027342727875e-05, + "loss": 0.4484, "step": 7365 }, { - "epoch": 0.26, - "learning_rate": 4.9937394646084454e-05, - "loss": 0.3387, + "epoch": 0.26561430064511476, + "grad_norm": 0.2776060700416565, + "learning_rate": 4.993005546324836e-05, + "loss": 0.4846, "step": 7370 }, { - "epoch": 0.26, - "learning_rate": 4.993719300572189e-05, - 
"loss": 0.3333, + "epoch": 0.26579450030633944, + "grad_norm": 0.19747385382652283, + "learning_rate": 4.9929837159550784e-05, + "loss": 0.4603, "step": 7375 }, { - "epoch": 0.26, - "learning_rate": 4.993699104156632e-05, - "loss": 0.3357, + "epoch": 0.26597469996756407, + "grad_norm": 0.1888158768415451, + "learning_rate": 4.9929618516189e-05, + "loss": 0.4405, "step": 7380 }, { - "epoch": 0.26, - "learning_rate": 4.993678875362036e-05, - "loss": 0.338, + "epoch": 0.2661548996287887, + "grad_norm": 0.20889566838741302, + "learning_rate": 4.9929399533166e-05, + "loss": 0.4846, "step": 7385 }, { - "epoch": 0.26, - "learning_rate": 4.993658614188662e-05, - "loss": 0.3469, + "epoch": 0.2663350992900133, + "grad_norm": 0.19640065729618073, + "learning_rate": 4.992918021048475e-05, + "loss": 0.4725, "step": 7390 }, { - "epoch": 0.26, - "learning_rate": 4.9936383206367745e-05, - "loss": 0.3189, + "epoch": 0.266515298951238, + "grad_norm": 0.20840995013713837, + "learning_rate": 4.992896054814825e-05, + "loss": 0.4594, "step": 7395 }, { - "epoch": 0.26, - "learning_rate": 4.993617994706637e-05, - "loss": 0.3178, + "epoch": 0.2666954986124626, + "grad_norm": 0.2129368633031845, + "learning_rate": 4.992874054615949e-05, + "loss": 0.4721, "step": 7400 }, { - "epoch": 0.26, - "learning_rate": 4.9935976363985126e-05, - "loss": 0.3372, + "epoch": 0.26687569827368723, + "grad_norm": 0.13430319726467133, + "learning_rate": 4.992852020452147e-05, + "loss": 0.4479, "step": 7405 }, { - "epoch": 0.26, - "learning_rate": 4.993577245712666e-05, - "loss": 0.3349, + "epoch": 0.26705589793491186, + "grad_norm": 0.2261311113834381, + "learning_rate": 4.992829952323718e-05, + "loss": 0.4972, "step": 7410 }, { - "epoch": 0.26, - "learning_rate": 4.993556822649363e-05, - "loss": 0.3583, + "epoch": 0.26723609759613653, + "grad_norm": 0.21001654863357544, + "learning_rate": 4.992807850230964e-05, + "loss": 0.5047, "step": 7415 }, { - "epoch": 0.26, - "learning_rate": 4.993536367208868e-05, - "loss": 0.318, + "epoch": 0.26741629725736116, + "grad_norm": 0.2056044489145279, + "learning_rate": 4.992785714174185e-05, + "loss": 0.4793, "step": 7420 }, { - "epoch": 0.26, - "learning_rate": 4.993515879391447e-05, - "loss": 0.3181, + "epoch": 0.2675964969185858, + "grad_norm": 0.156636044383049, + "learning_rate": 4.9927635441536844e-05, + "loss": 0.4914, "step": 7425 }, { - "epoch": 0.26, - "learning_rate": 4.993495359197364e-05, - "loss": 0.3264, + "epoch": 0.2677766965798104, + "grad_norm": 0.18658673763275146, + "learning_rate": 4.9927413401697625e-05, + "loss": 0.4451, "step": 7430 }, { - "epoch": 0.26, - "learning_rate": 4.993474806626889e-05, - "loss": 0.2904, + "epoch": 0.2679568962410351, + "grad_norm": 0.14643876254558563, + "learning_rate": 4.992719102222723e-05, + "loss": 0.4108, "step": 7435 }, { - "epoch": 0.26, - "learning_rate": 4.9934542216802856e-05, - "loss": 0.3285, + "epoch": 0.2681370959022597, + "grad_norm": 0.2012728452682495, + "learning_rate": 4.9926968303128674e-05, + "loss": 0.4497, "step": 7440 }, { - "epoch": 0.26, - "learning_rate": 4.9934336043578225e-05, - "loss": 0.3649, + "epoch": 0.2683172955634843, + "grad_norm": 0.19251321256160736, + "learning_rate": 4.9926745244405e-05, + "loss": 0.4938, "step": 7445 }, { - "epoch": 0.26, - "learning_rate": 4.993412954659767e-05, - "loss": 0.3389, + "epoch": 0.268497495224709, + "grad_norm": 0.19038186967372894, + "learning_rate": 4.992652184605926e-05, + "loss": 0.4739, "step": 7450 }, { - "epoch": 0.26, - "learning_rate": 4.993392272586388e-05, - 
"loss": 0.3359, + "epoch": 0.2686776948859336, + "grad_norm": 0.21992941200733185, + "learning_rate": 4.992629810809448e-05, + "loss": 0.4839, "step": 7455 }, { - "epoch": 0.26, - "learning_rate": 4.9933715581379536e-05, - "loss": 0.35, + "epoch": 0.26885789454715825, + "grad_norm": 0.208626389503479, + "learning_rate": 4.992607403051371e-05, + "loss": 0.4498, "step": 7460 }, { - "epoch": 0.26, - "learning_rate": 4.993350811314733e-05, - "loss": 0.3076, + "epoch": 0.26903809420838287, + "grad_norm": 0.14760567247867584, + "learning_rate": 4.9925849613320006e-05, + "loss": 0.4185, "step": 7465 }, { - "epoch": 0.26, - "learning_rate": 4.993330032116995e-05, - "loss": 0.353, + "epoch": 0.26921829386960755, + "grad_norm": 0.2208520770072937, + "learning_rate": 4.992562485651644e-05, + "loss": 0.4734, "step": 7470 }, { - "epoch": 0.26, - "learning_rate": 4.9933092205450094e-05, - "loss": 0.3349, + "epoch": 0.2693984935308322, + "grad_norm": 0.18603351712226868, + "learning_rate": 4.992539976010605e-05, + "loss": 0.5101, "step": 7475 }, { - "epoch": 0.26, - "learning_rate": 4.9932883765990466e-05, - "loss": 0.3446, + "epoch": 0.2695786931920568, + "grad_norm": 0.2119520753622055, + "learning_rate": 4.992517432409192e-05, + "loss": 0.4816, "step": 7480 }, { - "epoch": 0.26, - "learning_rate": 4.993267500279378e-05, - "loss": 0.3393, + "epoch": 0.2697588928532814, + "grad_norm": 0.17017479240894318, + "learning_rate": 4.99249485484771e-05, + "loss": 0.4189, "step": 7485 }, { - "epoch": 0.26, - "learning_rate": 4.993246591586274e-05, - "loss": 0.3423, + "epoch": 0.2699390925145061, + "grad_norm": 0.2112009972333908, + "learning_rate": 4.99247224332647e-05, + "loss": 0.4506, "step": 7490 }, { - "epoch": 0.26, - "learning_rate": 4.993225650520006e-05, - "loss": 0.3387, + "epoch": 0.2701192921757307, + "grad_norm": 0.22888222336769104, + "learning_rate": 4.992449597845777e-05, + "loss": 0.444, "step": 7495 }, { - "epoch": 0.26, - "learning_rate": 4.9932046770808454e-05, - "loss": 0.3369, + "epoch": 0.27029949183695534, + "grad_norm": 0.19122794270515442, + "learning_rate": 4.992426918405941e-05, + "loss": 0.4645, "step": 7500 }, { - "epoch": 0.26, - "eval_loss": 0.3308403789997101, - "eval_runtime": 10.5623, - "eval_samples_per_second": 9.468, - "eval_steps_per_second": 9.468, + "epoch": 0.27029949183695534, + "eval_loss": 0.48843687772750854, + "eval_runtime": 3.5217, + "eval_samples_per_second": 28.395, + "eval_steps_per_second": 7.099, "step": 7500 }, { - "epoch": 0.26, - "learning_rate": 4.993183671269066e-05, - "loss": 0.3551, + "epoch": 0.27047969149817996, + "grad_norm": 0.18610242009162903, + "learning_rate": 4.992404205007272e-05, + "loss": 0.4546, "step": 7505 }, { - "epoch": 0.26, - "learning_rate": 4.993162633084939e-05, - "loss": 0.3412, + "epoch": 0.27065989115940464, + "grad_norm": 0.23450550436973572, + "learning_rate": 4.992381457650077e-05, + "loss": 0.4374, "step": 7510 }, { - "epoch": 0.26, - "learning_rate": 4.993141562528739e-05, - "loss": 0.3525, + "epoch": 0.27084009082062926, + "grad_norm": 0.2130429446697235, + "learning_rate": 4.9923586763346674e-05, + "loss": 0.4876, "step": 7515 }, { - "epoch": 0.26, - "learning_rate": 4.9931204596007386e-05, - "loss": 0.314, + "epoch": 0.2710202904818539, + "grad_norm": 0.18153907358646393, + "learning_rate": 4.992335861061354e-05, + "loss": 0.4945, "step": 7520 }, { - "epoch": 0.26, - "learning_rate": 4.9930993243012114e-05, - "loss": 0.3427, + "epoch": 0.2712004901430785, + "grad_norm": 0.16226467490196228, + "learning_rate": 
4.992313011830446e-05, + "loss": 0.4293, "step": 7525 }, { - "epoch": 0.26, - "learning_rate": 4.993078156630433e-05, - "loss": 0.3562, + "epoch": 0.2713806898043032, + "grad_norm": 0.18704953789710999, + "learning_rate": 4.992290128642257e-05, + "loss": 0.4861, "step": 7530 }, { - "epoch": 0.27, - "learning_rate": 4.9930569565886775e-05, - "loss": 0.3455, + "epoch": 0.2715608894655278, + "grad_norm": 0.24006149172782898, + "learning_rate": 4.992267211497097e-05, + "loss": 0.5093, "step": 7535 }, { - "epoch": 0.27, - "learning_rate": 4.993035724176221e-05, - "loss": 0.3324, + "epoch": 0.27174108912675243, + "grad_norm": 0.17990513145923615, + "learning_rate": 4.992244260395278e-05, + "loss": 0.5006, "step": 7540 }, { - "epoch": 0.27, - "learning_rate": 4.993014459393338e-05, - "loss": 0.3426, + "epoch": 0.27192128878797706, + "grad_norm": 0.19324089586734772, + "learning_rate": 4.992221275337115e-05, + "loss": 0.4631, "step": 7545 }, { - "epoch": 0.27, - "learning_rate": 4.992993162240306e-05, - "loss": 0.328, + "epoch": 0.27210148844920173, + "grad_norm": 0.1453637182712555, + "learning_rate": 4.992198256322918e-05, + "loss": 0.4758, "step": 7550 }, { - "epoch": 0.27, - "learning_rate": 4.9929718327174004e-05, - "loss": 0.3342, + "epoch": 0.27228168811042636, + "grad_norm": 0.18295510113239288, + "learning_rate": 4.992175203353003e-05, + "loss": 0.4832, "step": 7555 }, { - "epoch": 0.27, - "learning_rate": 4.992950470824898e-05, - "loss": 0.3309, + "epoch": 0.272461887771651, + "grad_norm": 0.20734231173992157, + "learning_rate": 4.992152116427683e-05, + "loss": 0.4669, "step": 7560 }, { - "epoch": 0.27, - "learning_rate": 4.9929290765630766e-05, - "loss": 0.3365, + "epoch": 0.2726420874328756, + "grad_norm": 0.17811109125614166, + "learning_rate": 4.992128995547274e-05, + "loss": 0.4197, "step": 7565 }, { - "epoch": 0.27, - "learning_rate": 4.9929076499322145e-05, - "loss": 0.342, + "epoch": 0.2728222870941003, + "grad_norm": 0.16642598807811737, + "learning_rate": 4.992105840712089e-05, + "loss": 0.4575, "step": 7570 }, { - "epoch": 0.27, - "learning_rate": 4.9928861909325894e-05, - "loss": 0.3079, + "epoch": 0.2730024867553249, + "grad_norm": 0.2581879794597626, + "learning_rate": 4.992082651922444e-05, + "loss": 0.4631, "step": 7575 }, { - "epoch": 0.27, - "learning_rate": 4.9928646995644804e-05, - "loss": 0.3414, + "epoch": 0.2731826864165495, + "grad_norm": 0.17311778664588928, + "learning_rate": 4.992059429178656e-05, + "loss": 0.4674, "step": 7580 }, { - "epoch": 0.27, - "learning_rate": 4.992843175828165e-05, - "loss": 0.3426, + "epoch": 0.27336288607777415, + "grad_norm": 0.17181158065795898, + "learning_rate": 4.992036172481041e-05, + "loss": 0.4439, "step": 7585 }, { - "epoch": 0.27, - "learning_rate": 4.992821619723924e-05, - "loss": 0.3287, + "epoch": 0.2735430857389988, + "grad_norm": 0.21754592657089233, + "learning_rate": 4.992012881829915e-05, + "loss": 0.4399, "step": 7590 }, { - "epoch": 0.27, - "learning_rate": 4.992800031252038e-05, - "loss": 0.3117, + "epoch": 0.27372328540022345, + "grad_norm": 0.18678173422813416, + "learning_rate": 4.9919895572255956e-05, + "loss": 0.4932, "step": 7595 }, { - "epoch": 0.27, - "learning_rate": 4.9927784104127864e-05, - "loss": 0.3147, + "epoch": 0.27390348506144807, + "grad_norm": 0.16362018883228302, + "learning_rate": 4.9919661986684024e-05, + "loss": 0.4359, "step": 7600 }, { - "epoch": 0.27, - "learning_rate": 4.992756757206449e-05, - "loss": 0.3535, + "epoch": 0.27408368472267275, + "grad_norm": 0.2467060089111328, + 
"learning_rate": 4.991942806158652e-05, + "loss": 0.4648, "step": 7605 }, { - "epoch": 0.27, - "learning_rate": 4.992735071633308e-05, - "loss": 0.3252, + "epoch": 0.27426388438389737, + "grad_norm": 0.22692877054214478, + "learning_rate": 4.991919379696662e-05, + "loss": 0.5261, "step": 7610 }, { - "epoch": 0.27, - "learning_rate": 4.992713353693646e-05, - "loss": 0.3412, + "epoch": 0.274444084045122, + "grad_norm": 0.1850813329219818, + "learning_rate": 4.9918959192827534e-05, + "loss": 0.4305, "step": 7615 }, { - "epoch": 0.27, - "learning_rate": 4.992691603387742e-05, - "loss": 0.3453, + "epoch": 0.2746242837063466, + "grad_norm": 0.183277890086174, + "learning_rate": 4.9918724249172454e-05, + "loss": 0.4175, "step": 7620 }, { - "epoch": 0.27, - "learning_rate": 4.992669820715882e-05, - "loss": 0.3312, + "epoch": 0.2748044833675713, + "grad_norm": 0.18612994253635406, + "learning_rate": 4.9918488966004587e-05, + "loss": 0.4802, "step": 7625 }, { - "epoch": 0.27, - "learning_rate": 4.9926480056783465e-05, - "loss": 0.3548, + "epoch": 0.2749846830287959, + "grad_norm": 0.1732574850320816, + "learning_rate": 4.9918253343327123e-05, + "loss": 0.4322, "step": 7630 }, { - "epoch": 0.27, - "learning_rate": 4.99262615827542e-05, - "loss": 0.3218, + "epoch": 0.27516488269002054, + "grad_norm": 0.20043861865997314, + "learning_rate": 4.991801738114329e-05, + "loss": 0.4696, "step": 7635 }, { - "epoch": 0.27, - "learning_rate": 4.992604278507385e-05, - "loss": 0.3271, + "epoch": 0.27534508235124516, + "grad_norm": 0.2102193683385849, + "learning_rate": 4.991778107945629e-05, + "loss": 0.4652, "step": 7640 }, { - "epoch": 0.27, - "learning_rate": 4.9925823663745255e-05, - "loss": 0.3429, + "epoch": 0.27552528201246984, + "grad_norm": 0.20501478016376495, + "learning_rate": 4.9917544438269346e-05, + "loss": 0.4636, "step": 7645 }, { - "epoch": 0.27, - "learning_rate": 4.992560421877127e-05, - "loss": 0.3242, + "epoch": 0.27570548167369446, + "grad_norm": 0.17633172869682312, + "learning_rate": 4.991730745758568e-05, + "loss": 0.4995, "step": 7650 }, { - "epoch": 0.27, - "learning_rate": 4.992538445015474e-05, - "loss": 0.3192, + "epoch": 0.2758856813349191, + "grad_norm": 0.18572551012039185, + "learning_rate": 4.991707013740853e-05, + "loss": 0.4752, "step": 7655 }, { - "epoch": 0.27, - "learning_rate": 4.992516435789852e-05, - "loss": 0.3249, + "epoch": 0.2760658809961437, + "grad_norm": 0.14463745057582855, + "learning_rate": 4.991683247774113e-05, + "loss": 0.4448, "step": 7660 }, { - "epoch": 0.27, - "learning_rate": 4.992494394200548e-05, - "loss": 0.3245, + "epoch": 0.2762460806573684, + "grad_norm": 0.2664778232574463, + "learning_rate": 4.99165944785867e-05, + "loss": 0.4605, "step": 7665 }, { - "epoch": 0.27, - "learning_rate": 4.992472320247845e-05, - "loss": 0.3199, + "epoch": 0.276426280318593, + "grad_norm": 0.20322363078594208, + "learning_rate": 4.991635613994849e-05, + "loss": 0.4676, "step": 7670 }, { - "epoch": 0.27, - "learning_rate": 4.992450213932033e-05, - "loss": 0.3396, + "epoch": 0.27660647997981763, + "grad_norm": 0.21015766263008118, + "learning_rate": 4.991611746182977e-05, + "loss": 0.4449, "step": 7675 }, { - "epoch": 0.27, - "learning_rate": 4.9924280752533965e-05, - "loss": 0.3112, + "epoch": 0.27678667964104225, + "grad_norm": 0.23488974571228027, + "learning_rate": 4.991587844423376e-05, + "loss": 0.4613, "step": 7680 }, { - "epoch": 0.27, - "learning_rate": 4.992405904212224e-05, - "loss": 0.3247, + "epoch": 0.27696687930226693, + "grad_norm": 0.2240959256887436, 
+ "learning_rate": 4.9915639087163736e-05, + "loss": 0.4601, "step": 7685 }, { - "epoch": 0.27, - "learning_rate": 4.9923837008088046e-05, - "loss": 0.3239, + "epoch": 0.27714707896349156, + "grad_norm": 0.18855224549770355, + "learning_rate": 4.991539939062295e-05, + "loss": 0.4584, "step": 7690 }, { - "epoch": 0.27, - "learning_rate": 4.9923614650434245e-05, - "loss": 0.3494, + "epoch": 0.2773272786247162, + "grad_norm": 0.20485761761665344, + "learning_rate": 4.9915159354614674e-05, + "loss": 0.4444, "step": 7695 }, { - "epoch": 0.27, - "learning_rate": 4.992339196916373e-05, - "loss": 0.3242, + "epoch": 0.2775074782859408, + "grad_norm": 0.18977908790111542, + "learning_rate": 4.9914918979142163e-05, + "loss": 0.4626, "step": 7700 }, { - "epoch": 0.27, - "learning_rate": 4.99231689642794e-05, - "loss": 0.3402, + "epoch": 0.2776876779471655, + "grad_norm": 0.15290701389312744, + "learning_rate": 4.991467826420872e-05, + "loss": 0.4448, "step": 7705 }, { - "epoch": 0.27, - "learning_rate": 4.992294563578415e-05, - "loss": 0.3341, + "epoch": 0.2778678776083901, + "grad_norm": 0.22318267822265625, + "learning_rate": 4.99144372098176e-05, + "loss": 0.4937, "step": 7710 }, { - "epoch": 0.27, - "learning_rate": 4.992272198368086e-05, - "loss": 0.3508, + "epoch": 0.2780480772696147, + "grad_norm": 0.2257704883813858, + "learning_rate": 4.9914195815972104e-05, + "loss": 0.4445, "step": 7715 }, { - "epoch": 0.27, - "learning_rate": 4.992249800797247e-05, - "loss": 0.3563, + "epoch": 0.27822827693083935, + "grad_norm": 0.2357822060585022, + "learning_rate": 4.991395408267551e-05, + "loss": 0.4546, "step": 7720 }, { - "epoch": 0.27, - "learning_rate": 4.992227370866186e-05, - "loss": 0.3281, + "epoch": 0.278408476592064, + "grad_norm": 0.17252907156944275, + "learning_rate": 4.991371200993111e-05, + "loss": 0.445, "step": 7725 }, { - "epoch": 0.27, - "learning_rate": 4.992204908575194e-05, - "loss": 0.3573, + "epoch": 0.27858867625328865, + "grad_norm": 0.16278617084026337, + "learning_rate": 4.991346959774221e-05, + "loss": 0.4839, "step": 7730 }, { - "epoch": 0.27, - "learning_rate": 4.9921824139245656e-05, - "loss": 0.3089, + "epoch": 0.27876887591451327, + "grad_norm": 0.1887577474117279, + "learning_rate": 4.9913226846112114e-05, + "loss": 0.4232, "step": 7735 }, { - "epoch": 0.27, - "learning_rate": 4.992159886914589e-05, - "loss": 0.3237, + "epoch": 0.2789490755757379, + "grad_norm": 0.17083100974559784, + "learning_rate": 4.991298375504413e-05, + "loss": 0.4863, "step": 7740 }, { - "epoch": 0.27, - "learning_rate": 4.9921373275455594e-05, - "loss": 0.3421, + "epoch": 0.27912927523696257, + "grad_norm": 0.1833745837211609, + "learning_rate": 4.991274032454156e-05, + "loss": 0.4544, "step": 7745 }, { - "epoch": 0.27, - "learning_rate": 4.99211473581777e-05, - "loss": 0.3322, + "epoch": 0.2793094748981872, + "grad_norm": 0.15337827801704407, + "learning_rate": 4.991249655460773e-05, + "loss": 0.4621, "step": 7750 }, { - "epoch": 0.27, - "learning_rate": 4.992092111731512e-05, - "loss": 0.3562, + "epoch": 0.2794896745594118, + "grad_norm": 0.22126014530658722, + "learning_rate": 4.991225244524595e-05, + "loss": 0.4861, "step": 7755 }, { - "epoch": 0.27, - "learning_rate": 4.992069455287081e-05, - "loss": 0.3329, + "epoch": 0.27966987422063644, + "grad_norm": 0.203162282705307, + "learning_rate": 4.991200799645955e-05, + "loss": 0.4432, "step": 7760 }, { - "epoch": 0.27, - "learning_rate": 4.99204676648477e-05, - "loss": 0.3719, + "epoch": 0.2798500738818611, + "grad_norm": 0.1990215927362442, 
+ "learning_rate": 4.991176320825188e-05, + "loss": 0.4583, "step": 7765 }, { - "epoch": 0.27, - "learning_rate": 4.992024045324874e-05, - "loss": 0.3369, + "epoch": 0.28003027354308574, + "grad_norm": 0.18323101103305817, + "learning_rate": 4.991151808062625e-05, + "loss": 0.4382, "step": 7770 }, { - "epoch": 0.27, - "learning_rate": 4.992001291807688e-05, - "loss": 0.3362, + "epoch": 0.28021047320431036, + "grad_norm": 0.1830572485923767, + "learning_rate": 4.9911272613586006e-05, + "loss": 0.4663, "step": 7775 }, { - "epoch": 0.27, - "learning_rate": 4.991978505933508e-05, - "loss": 0.3308, + "epoch": 0.28039067286553504, + "grad_norm": 0.1886363923549652, + "learning_rate": 4.99110268071345e-05, + "loss": 0.4462, "step": 7780 }, { - "epoch": 0.27, - "learning_rate": 4.9919556877026284e-05, - "loss": 0.3392, + "epoch": 0.28057087252675966, + "grad_norm": 0.15756884217262268, + "learning_rate": 4.991078066127508e-05, + "loss": 0.4707, "step": 7785 }, { - "epoch": 0.27, - "learning_rate": 4.991932837115347e-05, - "loss": 0.338, + "epoch": 0.2807510721879843, + "grad_norm": 0.17909304797649384, + "learning_rate": 4.991053417601109e-05, + "loss": 0.497, "step": 7790 }, { - "epoch": 0.27, - "learning_rate": 4.991909954171961e-05, - "loss": 0.3218, + "epoch": 0.2809312718492089, + "grad_norm": 0.22559069097042084, + "learning_rate": 4.99102873513459e-05, + "loss": 0.5026, "step": 7795 }, { - "epoch": 0.27, - "learning_rate": 4.991887038872766e-05, - "loss": 0.3144, + "epoch": 0.2811114715104336, + "grad_norm": 0.19916412234306335, + "learning_rate": 4.991004018728286e-05, + "loss": 0.4825, "step": 7800 }, { - "epoch": 0.27, - "learning_rate": 4.991864091218059e-05, - "loss": 0.3482, + "epoch": 0.2812916711716582, + "grad_norm": 0.23998232185840607, + "learning_rate": 4.990979268382535e-05, + "loss": 0.5141, "step": 7805 }, { - "epoch": 0.27, - "learning_rate": 4.9918411112081395e-05, - "loss": 0.3249, + "epoch": 0.28147187083288283, + "grad_norm": 0.17623983323574066, + "learning_rate": 4.9909544840976744e-05, + "loss": 0.4464, "step": 7810 }, { - "epoch": 0.27, - "learning_rate": 4.991818098843305e-05, - "loss": 0.3433, + "epoch": 0.28165207049410745, + "grad_norm": 0.2640053927898407, + "learning_rate": 4.99092966587404e-05, + "loss": 0.4738, "step": 7815 }, { - "epoch": 0.28, - "learning_rate": 4.991795054123856e-05, - "loss": 0.3122, + "epoch": 0.28183227015533213, + "grad_norm": 0.15781162679195404, + "learning_rate": 4.990904813711972e-05, + "loss": 0.4283, "step": 7820 }, { - "epoch": 0.28, - "learning_rate": 4.991771977050089e-05, - "loss": 0.3412, + "epoch": 0.28201246981655675, + "grad_norm": 0.15870793163776398, + "learning_rate": 4.990879927611808e-05, + "loss": 0.4477, "step": 7825 }, { - "epoch": 0.28, - "learning_rate": 4.991748867622306e-05, - "loss": 0.3673, + "epoch": 0.2821926694777814, + "grad_norm": 0.23389936983585358, + "learning_rate": 4.990855007573887e-05, + "loss": 0.4772, "step": 7830 }, { - "epoch": 0.28, - "learning_rate": 4.991725725840805e-05, - "loss": 0.3109, + "epoch": 0.282372869139006, + "grad_norm": 0.24188333749771118, + "learning_rate": 4.9908300535985486e-05, + "loss": 0.4548, "step": 7835 }, { - "epoch": 0.28, - "learning_rate": 4.991702551705888e-05, - "loss": 0.3558, + "epoch": 0.2825530688002307, + "grad_norm": 0.1669192612171173, + "learning_rate": 4.990805065686133e-05, + "loss": 0.4476, "step": 7840 }, { - "epoch": 0.28, - "learning_rate": 4.9916793452178554e-05, - "loss": 0.3088, + "epoch": 0.2827332684614553, + "grad_norm": 
0.15673232078552246, + "learning_rate": 4.990780043836981e-05, + "loss": 0.4181, "step": 7845 }, { - "epoch": 0.28, - "learning_rate": 4.9916561063770084e-05, - "loss": 0.3375, + "epoch": 0.2829134681226799, + "grad_norm": 0.146693155169487, + "learning_rate": 4.9907549880514334e-05, + "loss": 0.4416, "step": 7850 }, { - "epoch": 0.28, - "learning_rate": 4.991632835183649e-05, - "loss": 0.3374, + "epoch": 0.28309366778390455, + "grad_norm": 0.24487614631652832, + "learning_rate": 4.990729898329831e-05, + "loss": 0.4169, "step": 7855 }, { - "epoch": 0.28, - "learning_rate": 4.99160953163808e-05, - "loss": 0.3386, + "epoch": 0.2832738674451292, + "grad_norm": 0.2468927651643753, + "learning_rate": 4.9907047746725154e-05, + "loss": 0.4804, "step": 7860 }, { - "epoch": 0.28, - "learning_rate": 4.991586195740603e-05, - "loss": 0.3391, + "epoch": 0.28345406710635385, + "grad_norm": 0.22746756672859192, + "learning_rate": 4.99067961707983e-05, + "loss": 0.4499, "step": 7865 }, { - "epoch": 0.28, - "learning_rate": 4.991562827491521e-05, - "loss": 0.2986, + "epoch": 0.28363426676757847, + "grad_norm": 0.17610248923301697, + "learning_rate": 4.990654425552117e-05, + "loss": 0.4824, "step": 7870 }, { - "epoch": 0.28, - "learning_rate": 4.9915394268911375e-05, - "loss": 0.3754, + "epoch": 0.2838144664288031, + "grad_norm": 0.20157018303871155, + "learning_rate": 4.9906292000897196e-05, + "loss": 0.4641, "step": 7875 }, { - "epoch": 0.28, - "learning_rate": 4.9915159939397574e-05, - "loss": 0.315, + "epoch": 0.28399466609002777, + "grad_norm": 0.23462137579917908, + "learning_rate": 4.990603940692982e-05, + "loss": 0.4813, "step": 7880 }, { - "epoch": 0.28, - "learning_rate": 4.991492528637683e-05, - "loss": 0.3335, + "epoch": 0.2841748657512524, + "grad_norm": 0.19886912405490875, + "learning_rate": 4.990578647362247e-05, + "loss": 0.4413, "step": 7885 }, { - "epoch": 0.28, - "learning_rate": 4.9914690309852205e-05, - "loss": 0.3211, + "epoch": 0.284355065412477, + "grad_norm": 0.17550869286060333, + "learning_rate": 4.9905533200978606e-05, + "loss": 0.4518, "step": 7890 }, { - "epoch": 0.28, - "learning_rate": 4.991445500982675e-05, - "loss": 0.3403, + "epoch": 0.28453526507370164, + "grad_norm": 0.17420583963394165, + "learning_rate": 4.9905279589001674e-05, + "loss": 0.4741, "step": 7895 }, { - "epoch": 0.28, - "learning_rate": 4.9914219386303513e-05, - "loss": 0.3454, + "epoch": 0.2847154647349263, + "grad_norm": 0.2519500255584717, + "learning_rate": 4.990502563769514e-05, + "loss": 0.4974, "step": 7900 }, { - "epoch": 0.28, - "learning_rate": 4.991398343928555e-05, - "loss": 0.3164, + "epoch": 0.28489566439615094, + "grad_norm": 0.21269458532333374, + "learning_rate": 4.990477134706244e-05, + "loss": 0.527, "step": 7905 }, { - "epoch": 0.28, - "learning_rate": 4.9913747168775934e-05, - "loss": 0.3341, + "epoch": 0.28507586405737556, + "grad_norm": 0.2123083919286728, + "learning_rate": 4.990451671710705e-05, + "loss": 0.4385, "step": 7910 }, { - "epoch": 0.28, - "learning_rate": 4.991351057477774e-05, - "loss": 0.3248, + "epoch": 0.2852560637186002, + "grad_norm": 0.17294515669345856, + "learning_rate": 4.990426174783245e-05, + "loss": 0.4442, "step": 7915 }, { - "epoch": 0.28, - "learning_rate": 4.9913273657294025e-05, - "loss": 0.3214, + "epoch": 0.28543626337982486, + "grad_norm": 0.14767326414585114, + "learning_rate": 4.99040064392421e-05, + "loss": 0.4443, "step": 7920 }, { - "epoch": 0.28, - "learning_rate": 4.991303641632786e-05, - "loss": 0.3329, + "epoch": 0.2856164630410495, + 
"grad_norm": 0.19468598067760468, + "learning_rate": 4.9903750791339485e-05, + "loss": 0.475, "step": 7925 }, { - "epoch": 0.28, - "learning_rate": 4.991279885188235e-05, - "loss": 0.3253, + "epoch": 0.2857966627022741, + "grad_norm": 0.20397444069385529, + "learning_rate": 4.990349480412809e-05, + "loss": 0.4989, "step": 7930 }, { - "epoch": 0.28, - "learning_rate": 4.991256096396055e-05, - "loss": 0.3295, + "epoch": 0.28597686236349873, + "grad_norm": 0.18751677870750427, + "learning_rate": 4.990323847761139e-05, + "loss": 0.46, "step": 7935 }, { - "epoch": 0.28, - "learning_rate": 4.991232275256558e-05, - "loss": 0.3242, + "epoch": 0.2861570620247234, + "grad_norm": 0.20503003895282745, + "learning_rate": 4.99029818117929e-05, + "loss": 0.4274, "step": 7940 }, { - "epoch": 0.28, - "learning_rate": 4.991208421770051e-05, - "loss": 0.3328, + "epoch": 0.28633726168594803, + "grad_norm": 0.20516186952590942, + "learning_rate": 4.9902724806676094e-05, + "loss": 0.4503, "step": 7945 }, { - "epoch": 0.28, - "learning_rate": 4.9911845359368434e-05, - "loss": 0.3206, + "epoch": 0.28651746134717265, + "grad_norm": 0.21163448691368103, + "learning_rate": 4.990246746226449e-05, + "loss": 0.5434, "step": 7950 }, { - "epoch": 0.28, - "learning_rate": 4.9911606177572476e-05, - "loss": 0.3497, + "epoch": 0.28669766100839733, + "grad_norm": 0.21273358166217804, + "learning_rate": 4.9902209778561585e-05, + "loss": 0.4581, "step": 7955 }, { - "epoch": 0.28, - "learning_rate": 4.991136667231572e-05, - "loss": 0.3293, + "epoch": 0.28687786066962195, + "grad_norm": 0.2407582700252533, + "learning_rate": 4.9901951755570896e-05, + "loss": 0.4555, "step": 7960 }, { - "epoch": 0.28, - "learning_rate": 4.991112684360129e-05, - "loss": 0.3348, + "epoch": 0.2870580603308466, + "grad_norm": 0.19239749014377594, + "learning_rate": 4.9901693393295935e-05, + "loss": 0.4809, "step": 7965 }, { - "epoch": 0.28, - "learning_rate": 4.99108866914323e-05, - "loss": 0.3222, + "epoch": 0.2872382599920712, + "grad_norm": 0.20676837861537933, + "learning_rate": 4.990143469174022e-05, + "loss": 0.4315, "step": 7970 }, { - "epoch": 0.28, - "learning_rate": 4.9910646215811854e-05, - "loss": 0.3057, + "epoch": 0.2874184596532959, + "grad_norm": 0.26478490233421326, + "learning_rate": 4.990117565090728e-05, + "loss": 0.4561, "step": 7975 }, { - "epoch": 0.28, - "learning_rate": 4.9910405416743086e-05, - "loss": 0.3665, + "epoch": 0.2875986593145205, + "grad_norm": 0.19174893200397491, + "learning_rate": 4.990091627080065e-05, + "loss": 0.525, "step": 7980 }, { - "epoch": 0.28, - "learning_rate": 4.991016429422912e-05, - "loss": 0.329, + "epoch": 0.2877788589757451, + "grad_norm": 0.18685384094715118, + "learning_rate": 4.9900656551423844e-05, + "loss": 0.4683, "step": 7985 }, { - "epoch": 0.28, - "learning_rate": 4.990992284827309e-05, - "loss": 0.3296, + "epoch": 0.28795905863696974, + "grad_norm": 0.17788560688495636, + "learning_rate": 4.990039649278042e-05, + "loss": 0.4609, "step": 7990 }, { - "epoch": 0.28, - "learning_rate": 4.9909681078878136e-05, - "loss": 0.3494, + "epoch": 0.2881392582981944, + "grad_norm": 0.2256275713443756, + "learning_rate": 4.990013609487391e-05, + "loss": 0.4687, "step": 7995 }, { - "epoch": 0.28, - "learning_rate": 4.9909438986047375e-05, - "loss": 0.3159, + "epoch": 0.28831945795941905, + "grad_norm": 0.2363923341035843, + "learning_rate": 4.989987535770787e-05, + "loss": 0.4783, "step": 8000 }, { - "epoch": 0.28, - "eval_loss": 0.3288015127182007, - "eval_runtime": 10.5609, - 
"eval_samples_per_second": 9.469, - "eval_steps_per_second": 9.469, + "epoch": 0.28831945795941905, + "eval_loss": 0.48639407753944397, + "eval_runtime": 3.5338, + "eval_samples_per_second": 28.298, + "eval_steps_per_second": 7.075, "step": 8000 }, { - "epoch": 0.28, - "learning_rate": 4.9909196569783975e-05, - "loss": 0.3127, + "epoch": 0.28849965762064367, + "grad_norm": 0.20103095471858978, + "learning_rate": 4.9899614281285856e-05, + "loss": 0.4766, "step": 8005 }, { - "epoch": 0.28, - "learning_rate": 4.9908953830091066e-05, - "loss": 0.3201, + "epoch": 0.2886798572818683, + "grad_norm": 0.19780664145946503, + "learning_rate": 4.98993528656114e-05, + "loss": 0.5028, "step": 8010 }, { - "epoch": 0.28, - "learning_rate": 4.990871076697181e-05, - "loss": 0.3433, + "epoch": 0.28886005694309297, + "grad_norm": 0.19557078182697296, + "learning_rate": 4.9899091110688104e-05, + "loss": 0.4693, "step": 8015 }, { - "epoch": 0.28, - "learning_rate": 4.990846738042936e-05, - "loss": 0.3307, + "epoch": 0.2890402566043176, + "grad_norm": 0.18514655530452728, + "learning_rate": 4.98988290165195e-05, + "loss": 0.4637, "step": 8020 }, { - "epoch": 0.28, - "learning_rate": 4.990822367046688e-05, - "loss": 0.3218, + "epoch": 0.2892204562655422, + "grad_norm": 0.14113621413707733, + "learning_rate": 4.9898566583109174e-05, + "loss": 0.4324, "step": 8025 }, { - "epoch": 0.28, - "learning_rate": 4.990797963708752e-05, - "loss": 0.3419, + "epoch": 0.28940065592676684, + "grad_norm": 0.20794130861759186, + "learning_rate": 4.989830381046069e-05, + "loss": 0.5302, "step": 8030 }, { - "epoch": 0.28, - "learning_rate": 4.990773528029446e-05, - "loss": 0.338, + "epoch": 0.2895808555879915, + "grad_norm": 0.18446312844753265, + "learning_rate": 4.9898040698577655e-05, + "loss": 0.4497, "step": 8035 }, { - "epoch": 0.28, - "learning_rate": 4.990749060009089e-05, - "loss": 0.3297, + "epoch": 0.28976105524921614, + "grad_norm": 0.20031079649925232, + "learning_rate": 4.9897777247463615e-05, + "loss": 0.4522, "step": 8040 }, { - "epoch": 0.28, - "learning_rate": 4.9907245596479956e-05, - "loss": 0.3106, + "epoch": 0.28994125491044076, + "grad_norm": 0.21210455894470215, + "learning_rate": 4.989751345712219e-05, + "loss": 0.4408, "step": 8045 }, { - "epoch": 0.28, - "learning_rate": 4.990700026946485e-05, - "loss": 0.3051, + "epoch": 0.2901214545716654, + "grad_norm": 0.18559224903583527, + "learning_rate": 4.989724932755697e-05, + "loss": 0.4555, "step": 8050 }, { - "epoch": 0.28, - "learning_rate": 4.9906754619048767e-05, - "loss": 0.3461, + "epoch": 0.29030165423289006, + "grad_norm": 0.21420493721961975, + "learning_rate": 4.9896984858771546e-05, + "loss": 0.4763, "step": 8055 }, { - "epoch": 0.28, - "learning_rate": 4.9906508645234885e-05, - "loss": 0.323, + "epoch": 0.2904818538941147, + "grad_norm": 0.1991835981607437, + "learning_rate": 4.9896720050769516e-05, + "loss": 0.4475, "step": 8060 }, { - "epoch": 0.28, - "learning_rate": 4.990626234802641e-05, - "loss": 0.3196, + "epoch": 0.2906620535553393, + "grad_norm": 0.23717185854911804, + "learning_rate": 4.98964549035545e-05, + "loss": 0.4635, "step": 8065 }, { - "epoch": 0.28, - "learning_rate": 4.990601572742652e-05, - "loss": 0.3257, + "epoch": 0.29084225321656393, + "grad_norm": 0.2004196047782898, + "learning_rate": 4.989618941713011e-05, + "loss": 0.4411, "step": 8070 }, { - "epoch": 0.28, - "learning_rate": 4.990576878343844e-05, - "loss": 0.3501, + "epoch": 0.2910224528777886, + "grad_norm": 0.21253125369548798, + "learning_rate": 
4.9895923591499954e-05, + "loss": 0.4925, "step": 8075 }, { - "epoch": 0.28, - "learning_rate": 4.9905521516065356e-05, - "loss": 0.3417, + "epoch": 0.29120265253901323, + "grad_norm": 0.253714382648468, + "learning_rate": 4.9895657426667666e-05, + "loss": 0.4723, "step": 8080 }, { - "epoch": 0.28, - "learning_rate": 4.99052739253105e-05, - "loss": 0.3415, + "epoch": 0.29138285220023785, + "grad_norm": 0.21997293829917908, + "learning_rate": 4.9895390922636854e-05, + "loss": 0.4838, "step": 8085 }, { - "epoch": 0.28, - "learning_rate": 4.990502601117707e-05, - "loss": 0.3378, + "epoch": 0.2915630518614625, + "grad_norm": 0.17473578453063965, + "learning_rate": 4.989512407941117e-05, + "loss": 0.4264, "step": 8090 }, { - "epoch": 0.28, - "learning_rate": 4.990477777366829e-05, - "loss": 0.3518, + "epoch": 0.29174325152268715, + "grad_norm": 0.1931363344192505, + "learning_rate": 4.989485689699423e-05, + "loss": 0.485, "step": 8095 }, { - "epoch": 0.28, - "learning_rate": 4.990452921278738e-05, - "loss": 0.35, + "epoch": 0.2919234511839118, + "grad_norm": 0.16745588183403015, + "learning_rate": 4.98945893753897e-05, + "loss": 0.4865, "step": 8100 }, { - "epoch": 0.29, - "learning_rate": 4.990428032853757e-05, - "loss": 0.3323, + "epoch": 0.2921036508451364, + "grad_norm": 0.19875845313072205, + "learning_rate": 4.98943215146012e-05, + "loss": 0.4319, "step": 8105 }, { - "epoch": 0.29, - "learning_rate": 4.99040311209221e-05, - "loss": 0.3318, + "epoch": 0.292283850506361, + "grad_norm": 0.21367180347442627, + "learning_rate": 4.989405331463239e-05, + "loss": 0.4779, "step": 8110 }, { - "epoch": 0.29, - "learning_rate": 4.9903781589944193e-05, - "loss": 0.3153, + "epoch": 0.2924640501675857, + "grad_norm": 0.16339702904224396, + "learning_rate": 4.989378477548692e-05, + "loss": 0.4386, "step": 8115 }, { - "epoch": 0.29, - "learning_rate": 4.99035317356071e-05, - "loss": 0.312, + "epoch": 0.2926442498288103, + "grad_norm": 0.15709401667118073, + "learning_rate": 4.9893515897168455e-05, + "loss": 0.4246, "step": 8120 }, { - "epoch": 0.29, - "learning_rate": 4.990328155791405e-05, - "loss": 0.3254, + "epoch": 0.29282444949003494, + "grad_norm": 0.23660311102867126, + "learning_rate": 4.989324667968066e-05, + "loss": 0.4658, "step": 8125 }, { - "epoch": 0.29, - "learning_rate": 4.9903031056868306e-05, - "loss": 0.3438, + "epoch": 0.2930046491512596, + "grad_norm": 0.17152658104896545, + "learning_rate": 4.9892977123027194e-05, + "loss": 0.4546, "step": 8130 }, { - "epoch": 0.29, - "learning_rate": 4.990278023247311e-05, - "loss": 0.331, + "epoch": 0.29318484881248424, + "grad_norm": 0.25571581721305847, + "learning_rate": 4.989270722721173e-05, + "loss": 0.4507, "step": 8135 }, { - "epoch": 0.29, - "learning_rate": 4.990252908473174e-05, - "loss": 0.3616, + "epoch": 0.29336504847370887, + "grad_norm": 0.16954413056373596, + "learning_rate": 4.989243699223796e-05, + "loss": 0.4685, "step": 8140 }, { - "epoch": 0.29, - "learning_rate": 4.990227761364743e-05, - "loss": 0.3103, + "epoch": 0.2935452481349335, + "grad_norm": 0.19579976797103882, + "learning_rate": 4.989216641810955e-05, + "loss": 0.5074, "step": 8145 }, { - "epoch": 0.29, - "learning_rate": 4.990202581922345e-05, - "loss": 0.3174, + "epoch": 0.29372544779615817, + "grad_norm": 0.16534093022346497, + "learning_rate": 4.989189550483019e-05, + "loss": 0.4594, "step": 8150 }, { - "epoch": 0.29, - "learning_rate": 4.990177370146308e-05, - "loss": 0.3259, + "epoch": 0.2939056474573828, + "grad_norm": 0.19394879043102264, + "learning_rate": 
4.9891624252403574e-05, + "loss": 0.4696, "step": 8155 }, { - "epoch": 0.29, - "learning_rate": 4.99015212603696e-05, - "loss": 0.3287, + "epoch": 0.2940858471186074, + "grad_norm": 0.2109389305114746, + "learning_rate": 4.98913526608334e-05, + "loss": 0.4606, "step": 8160 }, { - "epoch": 0.29, - "learning_rate": 4.990126849594627e-05, - "loss": 0.3591, + "epoch": 0.29426604677983204, + "grad_norm": 0.19720369577407837, + "learning_rate": 4.9891080730123365e-05, + "loss": 0.4608, "step": 8165 }, { - "epoch": 0.29, - "learning_rate": 4.9901015408196375e-05, - "loss": 0.3171, + "epoch": 0.2944462464410567, + "grad_norm": 0.16822995245456696, + "learning_rate": 4.9890808460277163e-05, + "loss": 0.4925, "step": 8170 }, { - "epoch": 0.29, - "learning_rate": 4.9900761997123216e-05, - "loss": 0.3008, + "epoch": 0.29462644610228134, + "grad_norm": 0.1735120564699173, + "learning_rate": 4.9890535851298526e-05, + "loss": 0.4465, "step": 8175 }, { - "epoch": 0.29, - "learning_rate": 4.990050826273006e-05, - "loss": 0.3335, + "epoch": 0.29480664576350596, + "grad_norm": 0.16720019280910492, + "learning_rate": 4.989026290319115e-05, + "loss": 0.4391, "step": 8180 }, { - "epoch": 0.29, - "learning_rate": 4.990025420502022e-05, - "loss": 0.3139, + "epoch": 0.2949868454247306, + "grad_norm": 0.19565287232398987, + "learning_rate": 4.988998961595876e-05, + "loss": 0.4996, "step": 8185 }, { - "epoch": 0.29, - "learning_rate": 4.989999982399699e-05, - "loss": 0.3203, + "epoch": 0.29516704508595526, + "grad_norm": 0.17741116881370544, + "learning_rate": 4.988971598960509e-05, + "loss": 0.4652, "step": 8190 }, { - "epoch": 0.29, - "learning_rate": 4.9899745119663674e-05, - "loss": 0.3375, + "epoch": 0.2953472447471799, + "grad_norm": 0.16898617148399353, + "learning_rate": 4.988944202413386e-05, + "loss": 0.478, "step": 8195 }, { - "epoch": 0.29, - "learning_rate": 4.989949009202357e-05, - "loss": 0.362, + "epoch": 0.2955274444084045, + "grad_norm": 0.2120852917432785, + "learning_rate": 4.988916771954879e-05, + "loss": 0.4719, "step": 8200 }, { - "epoch": 0.29, - "learning_rate": 4.9899234741080006e-05, - "loss": 0.313, + "epoch": 0.2957076440696291, + "grad_norm": 0.16278114914894104, + "learning_rate": 4.988889307585364e-05, + "loss": 0.4799, "step": 8205 }, { - "epoch": 0.29, - "learning_rate": 4.989897906683628e-05, - "loss": 0.3655, + "epoch": 0.2958878437308538, + "grad_norm": 0.18301096558570862, + "learning_rate": 4.988861809305213e-05, + "loss": 0.4712, "step": 8210 }, { - "epoch": 0.29, - "learning_rate": 4.9898723069295725e-05, - "loss": 0.349, + "epoch": 0.29606804339207843, + "grad_norm": 0.15939167141914368, + "learning_rate": 4.988834277114802e-05, + "loss": 0.4389, "step": 8215 }, { - "epoch": 0.29, - "learning_rate": 4.9898466748461653e-05, - "loss": 0.3174, + "epoch": 0.29624824305330305, + "grad_norm": 0.1933128535747528, + "learning_rate": 4.988806711014505e-05, + "loss": 0.4845, "step": 8220 }, { - "epoch": 0.29, - "learning_rate": 4.989821010433741e-05, - "loss": 0.3315, + "epoch": 0.2964284427145277, + "grad_norm": 0.1766856461763382, + "learning_rate": 4.9887791110047e-05, + "loss": 0.4751, "step": 8225 }, { - "epoch": 0.29, - "learning_rate": 4.98979531369263e-05, - "loss": 0.3352, + "epoch": 0.29660864237575235, + "grad_norm": 0.1650475561618805, + "learning_rate": 4.9887514770857605e-05, + "loss": 0.416, "step": 8230 }, { - "epoch": 0.29, - "learning_rate": 4.9897695846231696e-05, - "loss": 0.2966, + "epoch": 0.296788842036977, + "grad_norm": 0.16024361550807953, + "learning_rate": 
4.988723809258064e-05, + "loss": 0.4831, "step": 8235 }, { - "epoch": 0.29, - "learning_rate": 4.989743823225691e-05, - "loss": 0.3223, + "epoch": 0.2969690416982016, + "grad_norm": 0.1936086267232895, + "learning_rate": 4.9886961075219885e-05, + "loss": 0.4589, "step": 8240 }, { - "epoch": 0.29, - "learning_rate": 4.98971802950053e-05, - "loss": 0.3506, + "epoch": 0.2971492413594262, + "grad_norm": 0.20437081158161163, + "learning_rate": 4.988668371877909e-05, + "loss": 0.5085, "step": 8245 }, { - "epoch": 0.29, - "learning_rate": 4.989692203448021e-05, - "loss": 0.334, + "epoch": 0.2973294410206509, + "grad_norm": 0.2507721185684204, + "learning_rate": 4.988640602326205e-05, + "loss": 0.4888, "step": 8250 }, { - "epoch": 0.29, - "learning_rate": 4.9896663450684986e-05, - "loss": 0.3311, + "epoch": 0.2975096406818755, + "grad_norm": 0.2515529692173004, + "learning_rate": 4.9886127988672554e-05, + "loss": 0.4617, "step": 8255 }, { - "epoch": 0.29, - "learning_rate": 4.989640454362301e-05, - "loss": 0.3512, + "epoch": 0.29768984034310014, + "grad_norm": 0.18619805574417114, + "learning_rate": 4.988584961501438e-05, + "loss": 0.4843, "step": 8260 }, { - "epoch": 0.29, - "learning_rate": 4.9896145313297614e-05, - "loss": 0.3332, + "epoch": 0.29787004000432477, + "grad_norm": 0.18921491503715515, + "learning_rate": 4.988557090229132e-05, + "loss": 0.4308, "step": 8265 }, { - "epoch": 0.29, - "learning_rate": 4.989588575971219e-05, - "loss": 0.3331, + "epoch": 0.29805023966554944, + "grad_norm": 0.17561018466949463, + "learning_rate": 4.988529185050717e-05, + "loss": 0.4686, "step": 8270 }, { - "epoch": 0.29, - "learning_rate": 4.989562588287009e-05, - "loss": 0.3218, + "epoch": 0.29823043932677407, + "grad_norm": 0.18587404489517212, + "learning_rate": 4.988501245966574e-05, + "loss": 0.4476, "step": 8275 }, { - "epoch": 0.29, - "learning_rate": 4.98953656827747e-05, - "loss": 0.3283, + "epoch": 0.2984106389879987, + "grad_norm": 0.22094377875328064, + "learning_rate": 4.988473272977083e-05, + "loss": 0.4372, "step": 8280 }, { - "epoch": 0.29, - "learning_rate": 4.989510515942939e-05, - "loss": 0.3352, + "epoch": 0.29859083864922337, + "grad_norm": 0.23441436886787415, + "learning_rate": 4.988445266082626e-05, + "loss": 0.4697, "step": 8285 }, { - "epoch": 0.29, - "learning_rate": 4.989484431283755e-05, - "loss": 0.3431, + "epoch": 0.298771038310448, + "grad_norm": 0.2000618875026703, + "learning_rate": 4.988417225283585e-05, + "loss": 0.4548, "step": 8290 }, { - "epoch": 0.29, - "learning_rate": 4.989458314300256e-05, - "loss": 0.3274, + "epoch": 0.2989512379716726, + "grad_norm": 0.16407370567321777, + "learning_rate": 4.9883891505803394e-05, + "loss": 0.4583, "step": 8295 }, { - "epoch": 0.29, - "learning_rate": 4.989432164992781e-05, - "loss": 0.291, + "epoch": 0.29913143763289723, + "grad_norm": 0.1786169558763504, + "learning_rate": 4.988361041973274e-05, + "loss": 0.4534, "step": 8300 }, { - "epoch": 0.29, - "learning_rate": 4.98940598336167e-05, - "loss": 0.3318, + "epoch": 0.2993116372941219, + "grad_norm": 0.22652758657932281, + "learning_rate": 4.988332899462771e-05, + "loss": 0.4826, "step": 8305 }, { - "epoch": 0.29, - "learning_rate": 4.989379769407263e-05, - "loss": 0.3251, + "epoch": 0.29949183695534654, + "grad_norm": 0.23459400236606598, + "learning_rate": 4.9883047230492144e-05, + "loss": 0.4353, "step": 8310 }, { - "epoch": 0.29, - "learning_rate": 4.9893535231299004e-05, - "loss": 0.334, + "epoch": 0.29967203661657116, + "grad_norm": 0.19752709567546844, + "learning_rate": 
4.988276512732987e-05, + "loss": 0.4551, "step": 8315 }, { - "epoch": 0.29, - "learning_rate": 4.9893272445299236e-05, - "loss": 0.3066, + "epoch": 0.2998522362777958, + "grad_norm": 0.184610515832901, + "learning_rate": 4.988248268514475e-05, + "loss": 0.4713, "step": 8320 }, { - "epoch": 0.29, - "learning_rate": 4.9893009336076716e-05, - "loss": 0.3556, + "epoch": 0.30003243593902046, + "grad_norm": 0.16262328624725342, + "learning_rate": 4.988219990394061e-05, + "loss": 0.472, "step": 8325 }, { - "epoch": 0.29, - "learning_rate": 4.989274590363489e-05, - "loss": 0.3614, + "epoch": 0.3002126356002451, + "grad_norm": 0.1739642322063446, + "learning_rate": 4.988191678372132e-05, + "loss": 0.4622, "step": 8330 }, { - "epoch": 0.29, - "learning_rate": 4.989248214797715e-05, - "loss": 0.3364, + "epoch": 0.3003928352614697, + "grad_norm": 0.15458573400974274, + "learning_rate": 4.988163332449073e-05, + "loss": 0.4829, "step": 8335 }, { - "epoch": 0.29, - "learning_rate": 4.9892218069106944e-05, - "loss": 0.3103, + "epoch": 0.3005730349226943, + "grad_norm": 0.18645693361759186, + "learning_rate": 4.9881349526252694e-05, + "loss": 0.4414, "step": 8340 }, { - "epoch": 0.29, - "learning_rate": 4.989195366702768e-05, - "loss": 0.3487, + "epoch": 0.300753234583919, + "grad_norm": 0.17546844482421875, + "learning_rate": 4.988106538901109e-05, + "loss": 0.4612, "step": 8345 }, { - "epoch": 0.29, - "learning_rate": 4.989168894174281e-05, - "loss": 0.3232, + "epoch": 0.3009334342451436, + "grad_norm": 0.1612381488084793, + "learning_rate": 4.9880780912769796e-05, + "loss": 0.4759, "step": 8350 }, { - "epoch": 0.29, - "learning_rate": 4.989142389325576e-05, - "loss": 0.3302, + "epoch": 0.30111363390636825, + "grad_norm": 0.179169699549675, + "learning_rate": 4.988049609753268e-05, + "loss": 0.4535, "step": 8355 }, { - "epoch": 0.29, - "learning_rate": 4.989115852156998e-05, - "loss": 0.3242, + "epoch": 0.3012938335675929, + "grad_norm": 0.1993793547153473, + "learning_rate": 4.988021094330362e-05, + "loss": 0.4463, "step": 8360 }, { - "epoch": 0.29, - "learning_rate": 4.989089282668891e-05, - "loss": 0.3235, + "epoch": 0.30147403322881755, + "grad_norm": 0.22224834561347961, + "learning_rate": 4.987992545008649e-05, + "loss": 0.4579, "step": 8365 }, { - "epoch": 0.29, - "learning_rate": 4.989062680861599e-05, - "loss": 0.3434, + "epoch": 0.3016542328900422, + "grad_norm": 0.17759408056735992, + "learning_rate": 4.987963961788521e-05, + "loss": 0.4718, "step": 8370 }, { - "epoch": 0.29, - "learning_rate": 4.9890360467354694e-05, - "loss": 0.3097, + "epoch": 0.3018344325512668, + "grad_norm": 0.1582093983888626, + "learning_rate": 4.9879353446703655e-05, + "loss": 0.4568, "step": 8375 }, { - "epoch": 0.29, - "learning_rate": 4.9890093802908466e-05, - "loss": 0.2919, + "epoch": 0.3020146322124914, + "grad_norm": 0.20277364552021027, + "learning_rate": 4.987906693654572e-05, + "loss": 0.4607, "step": 8380 }, { - "epoch": 0.3, - "learning_rate": 4.988982681528078e-05, - "loss": 0.3215, + "epoch": 0.3021948318737161, + "grad_norm": 0.20033441483974457, + "learning_rate": 4.987878008741531e-05, + "loss": 0.4503, "step": 8385 }, { - "epoch": 0.3, - "learning_rate": 4.988955950447509e-05, - "loss": 0.3283, + "epoch": 0.3023750315349407, + "grad_norm": 0.2945716977119446, + "learning_rate": 4.9878492899316346e-05, + "loss": 0.5069, "step": 8390 }, { - "epoch": 0.3, - "learning_rate": 4.988929187049488e-05, - "loss": 0.3286, + "epoch": 0.30255523119616534, + "grad_norm": 0.16416752338409424, + "learning_rate": 
4.987820537225273e-05, + "loss": 0.4901, "step": 8395 }, { - "epoch": 0.3, - "learning_rate": 4.9889023913343605e-05, - "loss": 0.3343, + "epoch": 0.30273543085738996, + "grad_norm": 0.25771409273147583, + "learning_rate": 4.9877917506228386e-05, + "loss": 0.471, "step": 8400 }, { - "epoch": 0.3, - "learning_rate": 4.988875563302477e-05, - "loss": 0.3434, + "epoch": 0.30291563051861464, + "grad_norm": 0.18886184692382812, + "learning_rate": 4.987762930124723e-05, + "loss": 0.4543, "step": 8405 }, { - "epoch": 0.3, - "learning_rate": 4.988848702954184e-05, - "loss": 0.3349, + "epoch": 0.30309583017983927, + "grad_norm": 0.17397350072860718, + "learning_rate": 4.987734075731319e-05, + "loss": 0.4644, "step": 8410 }, { - "epoch": 0.3, - "learning_rate": 4.9888218102898305e-05, - "loss": 0.3262, + "epoch": 0.3032760298410639, + "grad_norm": 0.19836938381195068, + "learning_rate": 4.9877051874430204e-05, + "loss": 0.4784, "step": 8415 }, { - "epoch": 0.3, - "learning_rate": 4.988794885309766e-05, - "loss": 0.3224, + "epoch": 0.3034562295022885, + "grad_norm": 0.1647557020187378, + "learning_rate": 4.987676265260219e-05, + "loss": 0.4518, "step": 8420 }, { - "epoch": 0.3, - "learning_rate": 4.9887679280143404e-05, - "loss": 0.3409, + "epoch": 0.3036364291635132, + "grad_norm": 0.21178211271762848, + "learning_rate": 4.987647309183311e-05, + "loss": 0.4945, "step": 8425 }, { - "epoch": 0.3, - "learning_rate": 4.988740938403903e-05, - "loss": 0.3289, + "epoch": 0.3038166288247378, + "grad_norm": 0.19628405570983887, + "learning_rate": 4.98761831921269e-05, + "loss": 0.4792, "step": 8430 }, { - "epoch": 0.3, - "learning_rate": 4.9887139164788056e-05, - "loss": 0.331, + "epoch": 0.30399682848596243, + "grad_norm": 0.20497670769691467, + "learning_rate": 4.987589295348751e-05, + "loss": 0.4518, "step": 8435 }, { - "epoch": 0.3, - "learning_rate": 4.988686862239397e-05, - "loss": 0.3153, + "epoch": 0.30417702814718706, + "grad_norm": 0.2078750729560852, + "learning_rate": 4.987560237591889e-05, + "loss": 0.5137, "step": 8440 }, { - "epoch": 0.3, - "learning_rate": 4.98865977568603e-05, - "loss": 0.3435, + "epoch": 0.30435722780841173, + "grad_norm": 0.15747526288032532, + "learning_rate": 4.987531145942501e-05, + "loss": 0.4622, "step": 8445 }, { - "epoch": 0.3, - "learning_rate": 4.988632656819057e-05, - "loss": 0.3379, + "epoch": 0.30453742746963636, + "grad_norm": 0.19545340538024902, + "learning_rate": 4.987502020400983e-05, + "loss": 0.46, "step": 8450 }, { - "epoch": 0.3, - "learning_rate": 4.988605505638827e-05, - "loss": 0.3532, + "epoch": 0.304717627130861, + "grad_norm": 0.19771957397460938, + "learning_rate": 4.9874728609677316e-05, + "loss": 0.4655, "step": 8455 }, { - "epoch": 0.3, - "learning_rate": 4.9885783221456963e-05, - "loss": 0.3551, + "epoch": 0.30489782679208566, + "grad_norm": 0.1792868971824646, + "learning_rate": 4.9874436676431435e-05, + "loss": 0.452, "step": 8460 }, { - "epoch": 0.3, - "learning_rate": 4.988551106340015e-05, - "loss": 0.3478, + "epoch": 0.3050780264533103, + "grad_norm": 0.2185121327638626, + "learning_rate": 4.9874144404276165e-05, + "loss": 0.4803, "step": 8465 }, { - "epoch": 0.3, - "learning_rate": 4.988523858222138e-05, - "loss": 0.3282, + "epoch": 0.3052582261145349, + "grad_norm": 0.18078531324863434, + "learning_rate": 4.98738517932155e-05, + "loss": 0.4395, "step": 8470 }, { - "epoch": 0.3, - "learning_rate": 4.9884965777924185e-05, - "loss": 0.3283, + "epoch": 0.3054384257757595, + "grad_norm": 0.14936190843582153, + "learning_rate": 
4.987355884325342e-05, + "loss": 0.4672, "step": 8475 }, { - "epoch": 0.3, - "learning_rate": 4.988469265051211e-05, - "loss": 0.3209, + "epoch": 0.3056186254369842, + "grad_norm": 0.19642381370067596, + "learning_rate": 4.987326555439392e-05, + "loss": 0.4507, "step": 8480 }, { - "epoch": 0.3, - "learning_rate": 4.9884419199988705e-05, - "loss": 0.3086, + "epoch": 0.3057988250982088, + "grad_norm": 0.1511949896812439, + "learning_rate": 4.987297192664099e-05, + "loss": 0.426, "step": 8485 }, { - "epoch": 0.3, - "learning_rate": 4.988414542635751e-05, - "loss": 0.2962, + "epoch": 0.30597902475943345, + "grad_norm": 0.20524610579013824, + "learning_rate": 4.9872677959998626e-05, + "loss": 0.4387, "step": 8490 }, { - "epoch": 0.3, - "learning_rate": 4.988387132962209e-05, - "loss": 0.314, + "epoch": 0.30615922442065807, + "grad_norm": 0.19021369516849518, + "learning_rate": 4.987238365447084e-05, + "loss": 0.49, "step": 8495 }, { - "epoch": 0.3, - "learning_rate": 4.9883596909786e-05, - "loss": 0.351, + "epoch": 0.30633942408188275, + "grad_norm": 0.2122432291507721, + "learning_rate": 4.987208901006165e-05, + "loss": 0.4536, "step": 8500 }, { - "epoch": 0.3, - "eval_loss": 0.32749566435813904, - "eval_runtime": 10.5451, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 0.30633942408188275, + "eval_loss": 0.4835846722126007, + "eval_runtime": 3.5343, + "eval_samples_per_second": 28.294, + "eval_steps_per_second": 7.074, "step": 8500 }, { - "epoch": 0.3, - "learning_rate": 4.98833221668528e-05, - "loss": 0.3367, + "epoch": 0.3065196237431074, + "grad_norm": 0.2556203603744507, + "learning_rate": 4.9871794026775067e-05, + "loss": 0.4393, "step": 8505 }, { - "epoch": 0.3, - "learning_rate": 4.9883047100826064e-05, - "loss": 0.3108, + "epoch": 0.306699823404332, + "grad_norm": 0.16816875338554382, + "learning_rate": 4.98714987046151e-05, + "loss": 0.4726, "step": 8510 }, { - "epoch": 0.3, - "learning_rate": 4.988277171170936e-05, - "loss": 0.3493, + "epoch": 0.3068800230655566, + "grad_norm": 0.2908528745174408, + "learning_rate": 4.9871203043585774e-05, + "loss": 0.4778, "step": 8515 }, { - "epoch": 0.3, - "learning_rate": 4.988249599950626e-05, - "loss": 0.3309, + "epoch": 0.3070602227267813, + "grad_norm": 0.2010086327791214, + "learning_rate": 4.987090704369112e-05, + "loss": 0.4649, "step": 8520 }, { - "epoch": 0.3, - "learning_rate": 4.988221996422035e-05, - "loss": 0.3633, + "epoch": 0.3072404223880059, + "grad_norm": 0.19618774950504303, + "learning_rate": 4.9870610704935185e-05, + "loss": 0.5132, "step": 8525 }, { - "epoch": 0.3, - "learning_rate": 4.9881943605855215e-05, - "loss": 0.3119, + "epoch": 0.30742062204923054, + "grad_norm": 0.15902240574359894, + "learning_rate": 4.9870314027321984e-05, + "loss": 0.4017, "step": 8530 }, { - "epoch": 0.3, - "learning_rate": 4.9881666924414436e-05, - "loss": 0.3409, + "epoch": 0.30760082171045516, + "grad_norm": 0.22079305350780487, + "learning_rate": 4.9870017010855576e-05, + "loss": 0.4651, "step": 8535 }, { - "epoch": 0.3, - "learning_rate": 4.988138991990161e-05, - "loss": 0.3129, + "epoch": 0.30778102137167984, + "grad_norm": 0.19805465638637543, + "learning_rate": 4.9869719655539994e-05, + "loss": 0.4438, "step": 8540 }, { - "epoch": 0.3, - "learning_rate": 4.9881112592320333e-05, - "loss": 0.3077, + "epoch": 0.30796122103290446, + "grad_norm": 0.17793111503124237, + "learning_rate": 4.98694219613793e-05, + "loss": 0.4655, "step": 8545 }, { - "epoch": 0.3, - "learning_rate": 4.988083494167421e-05, - "loss": 
0.3065, + "epoch": 0.3081414206941291, + "grad_norm": 0.16948306560516357, + "learning_rate": 4.986912392837755e-05, + "loss": 0.4485, "step": 8550 }, { - "epoch": 0.3, - "learning_rate": 4.988055696796683e-05, - "loss": 0.3331, + "epoch": 0.3083216203553537, + "grad_norm": 0.16313959658145905, + "learning_rate": 4.98688255565388e-05, + "loss": 0.4846, "step": 8555 }, { - "epoch": 0.3, - "learning_rate": 4.988027867120183e-05, - "loss": 0.3345, + "epoch": 0.3085018200165784, + "grad_norm": 0.20121654868125916, + "learning_rate": 4.9868526845867115e-05, + "loss": 0.4565, "step": 8560 }, { - "epoch": 0.3, - "learning_rate": 4.9880000051382804e-05, - "loss": 0.315, + "epoch": 0.308682019677803, + "grad_norm": 0.17432835698127747, + "learning_rate": 4.9868227796366566e-05, + "loss": 0.4505, "step": 8565 }, { - "epoch": 0.3, - "learning_rate": 4.9879721108513364e-05, - "loss": 0.331, + "epoch": 0.30886221933902763, + "grad_norm": 0.2507334053516388, + "learning_rate": 4.986792840804122e-05, + "loss": 0.4353, "step": 8570 }, { - "epoch": 0.3, - "learning_rate": 4.987944184259715e-05, - "loss": 0.3345, + "epoch": 0.30904241900025226, + "grad_norm": 0.12277240306138992, + "learning_rate": 4.986762868089517e-05, + "loss": 0.4637, "step": 8575 }, { - "epoch": 0.3, - "learning_rate": 4.987916225363778e-05, - "loss": 0.3136, + "epoch": 0.30922261866147693, + "grad_norm": 0.20205271244049072, + "learning_rate": 4.98673286149325e-05, + "loss": 0.4189, "step": 8580 }, { - "epoch": 0.3, - "learning_rate": 4.9878882341638886e-05, - "loss": 0.3176, + "epoch": 0.30940281832270156, + "grad_norm": 0.16052600741386414, + "learning_rate": 4.986702821015729e-05, + "loss": 0.4499, "step": 8585 }, { - "epoch": 0.3, - "learning_rate": 4.9878602106604097e-05, - "loss": 0.3392, + "epoch": 0.3095830179839262, + "grad_norm": 0.16914494335651398, + "learning_rate": 4.9866727466573634e-05, + "loss": 0.4447, "step": 8590 }, { - "epoch": 0.3, - "learning_rate": 4.987832154853706e-05, - "loss": 0.3551, + "epoch": 0.3097632176451508, + "grad_norm": 0.19733648002147675, + "learning_rate": 4.986642638418563e-05, + "loss": 0.4586, "step": 8595 }, { - "epoch": 0.3, - "learning_rate": 4.9878040667441405e-05, - "loss": 0.3136, + "epoch": 0.3099434173063755, + "grad_norm": 0.19025246798992157, + "learning_rate": 4.986612496299738e-05, + "loss": 0.5077, "step": 8600 }, { - "epoch": 0.3, - "learning_rate": 4.9877759463320795e-05, - "loss": 0.3382, + "epoch": 0.3101236169676001, + "grad_norm": 0.1786302626132965, + "learning_rate": 4.986582320301299e-05, + "loss": 0.4415, "step": 8605 }, { - "epoch": 0.3, - "learning_rate": 4.9877477936178864e-05, - "loss": 0.3103, + "epoch": 0.3103038166288247, + "grad_norm": 0.18163448572158813, + "learning_rate": 4.9865521104236575e-05, + "loss": 0.465, "step": 8610 }, { - "epoch": 0.3, - "learning_rate": 4.9877196086019286e-05, - "loss": 0.321, + "epoch": 0.31048401629004935, + "grad_norm": 0.19647279381752014, + "learning_rate": 4.986521866667225e-05, + "loss": 0.4357, "step": 8615 }, { - "epoch": 0.3, - "learning_rate": 4.9876913912845705e-05, - "loss": 0.3064, + "epoch": 0.310664215951274, + "grad_norm": 0.15463513135910034, + "learning_rate": 4.9864915890324136e-05, + "loss": 0.3975, "step": 8620 }, { - "epoch": 0.3, - "learning_rate": 4.9876631416661796e-05, - "loss": 0.3302, + "epoch": 0.31084441561249865, + "grad_norm": 0.1965053528547287, + "learning_rate": 4.986461277519635e-05, + "loss": 0.463, "step": 8625 }, { - "epoch": 0.3, - "learning_rate": 4.9876348597471224e-05, - "loss": 0.3565, 
+ "epoch": 0.31102461527372327, + "grad_norm": 0.20269662141799927, + "learning_rate": 4.9864309321293035e-05, + "loss": 0.441, "step": 8630 }, { - "epoch": 0.3, - "learning_rate": 4.9876065455277654e-05, - "loss": 0.3379, + "epoch": 0.31120481493494795, + "grad_norm": 0.2272135317325592, + "learning_rate": 4.986400552861832e-05, + "loss": 0.5056, "step": 8635 }, { - "epoch": 0.3, - "learning_rate": 4.9875781990084766e-05, - "loss": 0.3173, + "epoch": 0.31138501459617257, + "grad_norm": 0.1746290922164917, + "learning_rate": 4.986370139717634e-05, + "loss": 0.4583, "step": 8640 }, { - "epoch": 0.3, - "learning_rate": 4.9875498201896254e-05, - "loss": 0.3347, + "epoch": 0.3115652142573972, + "grad_norm": 0.17649775743484497, + "learning_rate": 4.9863396926971245e-05, + "loss": 0.4873, "step": 8645 }, { - "epoch": 0.3, - "learning_rate": 4.987521409071578e-05, - "loss": 0.3462, + "epoch": 0.3117454139186218, + "grad_norm": 0.185885488986969, + "learning_rate": 4.9863092118007185e-05, + "loss": 0.4692, "step": 8650 }, { - "epoch": 0.3, - "learning_rate": 4.9874929656547054e-05, - "loss": 0.3155, + "epoch": 0.3119256135798465, + "grad_norm": 0.1990884244441986, + "learning_rate": 4.98627869702883e-05, + "loss": 0.4578, "step": 8655 }, { - "epoch": 0.3, - "learning_rate": 4.987464489939375e-05, - "loss": 0.3135, + "epoch": 0.3121058132410711, + "grad_norm": 0.22869646549224854, + "learning_rate": 4.9862481483818755e-05, + "loss": 0.4932, "step": 8660 }, { - "epoch": 0.3, - "learning_rate": 4.987435981925958e-05, - "loss": 0.3295, + "epoch": 0.31228601290229574, + "grad_norm": 0.1766314059495926, + "learning_rate": 4.986217565860272e-05, + "loss": 0.439, "step": 8665 }, { - "epoch": 0.31, - "learning_rate": 4.987407441614824e-05, - "loss": 0.3148, + "epoch": 0.31246621256352036, + "grad_norm": 0.14868828654289246, + "learning_rate": 4.986186949464435e-05, + "loss": 0.4357, "step": 8670 }, { - "epoch": 0.31, - "learning_rate": 4.987378869006344e-05, - "loss": 0.3619, + "epoch": 0.31264641222474504, + "grad_norm": 0.17688503861427307, + "learning_rate": 4.986156299194783e-05, + "loss": 0.4194, "step": 8675 }, { - "epoch": 0.31, - "learning_rate": 4.987350264100889e-05, - "loss": 0.3541, + "epoch": 0.31282661188596966, + "grad_norm": 0.215143084526062, + "learning_rate": 4.9861256150517324e-05, + "loss": 0.4795, "step": 8680 }, { - "epoch": 0.31, - "learning_rate": 4.987321626898829e-05, - "loss": 0.3266, + "epoch": 0.3130068115471943, + "grad_norm": 0.18795055150985718, + "learning_rate": 4.9860948970357014e-05, + "loss": 0.4408, "step": 8685 }, { - "epoch": 0.31, - "learning_rate": 4.987292957400537e-05, - "loss": 0.3196, + "epoch": 0.3131870112084189, + "grad_norm": 0.14594529569149017, + "learning_rate": 4.986064145147108e-05, + "loss": 0.4296, "step": 8690 }, { - "epoch": 0.31, - "learning_rate": 4.987264255606385e-05, - "loss": 0.3261, + "epoch": 0.3133672108696436, + "grad_norm": 0.19025376439094543, + "learning_rate": 4.986033359386373e-05, + "loss": 0.4736, "step": 8695 }, { - "epoch": 0.31, - "learning_rate": 4.987235521516747e-05, - "loss": 0.3328, + "epoch": 0.3135474105308682, + "grad_norm": 0.18264278769493103, + "learning_rate": 4.986002539753915e-05, + "loss": 0.4456, "step": 8700 }, { - "epoch": 0.31, - "learning_rate": 4.987206755131993e-05, - "loss": 0.3345, + "epoch": 0.31372761019209283, + "grad_norm": 0.14919918775558472, + "learning_rate": 4.985971686250153e-05, + "loss": 0.4644, "step": 8705 }, { - "epoch": 0.31, - "learning_rate": 4.9871779564525e-05, - "loss": 0.3279, + 
"epoch": 0.31390780985331745, + "grad_norm": 0.2042977660894394, + "learning_rate": 4.985940798875508e-05, + "loss": 0.4554, "step": 8710 }, { - "epoch": 0.31, - "learning_rate": 4.987149125478639e-05, - "loss": 0.3352, + "epoch": 0.31408800951454213, + "grad_norm": 0.1687832921743393, + "learning_rate": 4.9859098776304015e-05, + "loss": 0.4685, "step": 8715 }, { - "epoch": 0.31, - "learning_rate": 4.987120262210787e-05, - "loss": 0.3469, + "epoch": 0.31426820917576676, + "grad_norm": 0.19032564759254456, + "learning_rate": 4.985878922515253e-05, + "loss": 0.451, "step": 8720 }, { - "epoch": 0.31, - "learning_rate": 4.9870913666493166e-05, - "loss": 0.3329, + "epoch": 0.3144484088369914, + "grad_norm": 0.15537263453006744, + "learning_rate": 4.985847933530486e-05, + "loss": 0.4373, "step": 8725 }, { - "epoch": 0.31, - "learning_rate": 4.9870624387946046e-05, - "loss": 0.3516, + "epoch": 0.314628608498216, + "grad_norm": 0.24856919050216675, + "learning_rate": 4.985816910676523e-05, + "loss": 0.5004, "step": 8730 }, { - "epoch": 0.31, - "learning_rate": 4.9870334786470256e-05, - "loss": 0.3184, + "epoch": 0.3148088081594407, + "grad_norm": 0.18003836274147034, + "learning_rate": 4.985785853953786e-05, + "loss": 0.4703, "step": 8735 }, { - "epoch": 0.31, - "learning_rate": 4.987004486206956e-05, - "loss": 0.3499, + "epoch": 0.3149890078206653, + "grad_norm": 0.1739412099123001, + "learning_rate": 4.9857547633626964e-05, + "loss": 0.4538, "step": 8740 }, { - "epoch": 0.31, - "learning_rate": 4.986975461474772e-05, - "loss": 0.2988, + "epoch": 0.3151692074818899, + "grad_norm": 0.19691774249076843, + "learning_rate": 4.985723638903681e-05, + "loss": 0.4671, "step": 8745 }, { - "epoch": 0.31, - "learning_rate": 4.986946404450851e-05, - "loss": 0.349, + "epoch": 0.31534940714311455, + "grad_norm": 0.2076934427022934, + "learning_rate": 4.9856924805771614e-05, + "loss": 0.4796, "step": 8750 }, { - "epoch": 0.31, - "learning_rate": 4.986917315135569e-05, - "loss": 0.3331, + "epoch": 0.3155296068043392, + "grad_norm": 0.16024084389209747, + "learning_rate": 4.9856612883835633e-05, + "loss": 0.4567, "step": 8755 }, { - "epoch": 0.31, - "learning_rate": 4.986888193529305e-05, - "loss": 0.3303, + "epoch": 0.31570980646556385, + "grad_norm": 0.23982550203800201, + "learning_rate": 4.985630062323311e-05, + "loss": 0.4944, "step": 8760 }, { - "epoch": 0.31, - "learning_rate": 4.986859039632436e-05, - "loss": 0.3464, + "epoch": 0.31589000612678847, + "grad_norm": 0.16979862749576569, + "learning_rate": 4.9855988023968314e-05, + "loss": 0.481, "step": 8765 }, { - "epoch": 0.31, - "learning_rate": 4.986829853445343e-05, - "loss": 0.3315, + "epoch": 0.3160702057880131, + "grad_norm": 0.19428572058677673, + "learning_rate": 4.9855675086045486e-05, + "loss": 0.4769, "step": 8770 }, { - "epoch": 0.31, - "learning_rate": 4.986800634968402e-05, - "loss": 0.3099, + "epoch": 0.31625040544923777, + "grad_norm": 0.21432949602603912, + "learning_rate": 4.985536180946889e-05, + "loss": 0.4535, "step": 8775 }, { - "epoch": 0.31, - "learning_rate": 4.9867713842019934e-05, - "loss": 0.3392, + "epoch": 0.3164306051104624, + "grad_norm": 0.1915997415781021, + "learning_rate": 4.9855048194242816e-05, + "loss": 0.4451, "step": 8780 }, { - "epoch": 0.31, - "learning_rate": 4.9867421011464984e-05, - "loss": 0.3358, + "epoch": 0.316610804771687, + "grad_norm": 0.18941251933574677, + "learning_rate": 4.985473424037151e-05, + "loss": 0.4671, "step": 8785 }, { - "epoch": 0.31, - "learning_rate": 4.986712785802295e-05, - "loss": 
0.3171, + "epoch": 0.3167910044329117, + "grad_norm": 0.17575441300868988, + "learning_rate": 4.985441994785927e-05, + "loss": 0.4611, "step": 8790 }, { - "epoch": 0.31, - "learning_rate": 4.986683438169766e-05, - "loss": 0.3225, + "epoch": 0.3169712040941363, + "grad_norm": 0.1792481243610382, + "learning_rate": 4.9854105316710364e-05, + "loss": 0.4631, "step": 8795 }, { - "epoch": 0.31, - "learning_rate": 4.9866540582492907e-05, - "loss": 0.2957, + "epoch": 0.31715140375536094, + "grad_norm": 0.20034508407115936, + "learning_rate": 4.9853790346929096e-05, + "loss": 0.4751, "step": 8800 }, { - "epoch": 0.31, - "learning_rate": 4.98662464604125e-05, - "loss": 0.3151, + "epoch": 0.31733160341658556, + "grad_norm": 0.21609006822109222, + "learning_rate": 4.9853475038519736e-05, + "loss": 0.516, "step": 8805 }, { - "epoch": 0.31, - "learning_rate": 4.9865952015460285e-05, - "loss": 0.3267, + "epoch": 0.31751180307781024, + "grad_norm": 0.17463666200637817, + "learning_rate": 4.9853159391486594e-05, + "loss": 0.4715, "step": 8810 }, { - "epoch": 0.31, - "learning_rate": 4.9865657247640065e-05, - "loss": 0.327, + "epoch": 0.31769200273903486, + "grad_norm": 0.1856069266796112, + "learning_rate": 4.9852843405833965e-05, + "loss": 0.4599, "step": 8815 }, { - "epoch": 0.31, - "learning_rate": 4.9865362156955674e-05, - "loss": 0.3355, + "epoch": 0.3178722024002595, + "grad_norm": 0.2578841745853424, + "learning_rate": 4.985252708156616e-05, + "loss": 0.4756, "step": 8820 }, { - "epoch": 0.31, - "learning_rate": 4.9865066743410946e-05, - "loss": 0.3326, + "epoch": 0.3180524020614841, + "grad_norm": 0.19595186412334442, + "learning_rate": 4.985221041868748e-05, + "loss": 0.4905, "step": 8825 }, { - "epoch": 0.31, - "learning_rate": 4.986477100700972e-05, - "loss": 0.3034, + "epoch": 0.3182326017227088, + "grad_norm": 0.1569208949804306, + "learning_rate": 4.9851893417202247e-05, + "loss": 0.4481, "step": 8830 }, { - "epoch": 0.31, - "learning_rate": 4.9864474947755816e-05, - "loss": 0.3149, + "epoch": 0.3184128013839334, + "grad_norm": 0.1994239240884781, + "learning_rate": 4.9851576077114784e-05, + "loss": 0.4628, "step": 8835 }, { - "epoch": 0.31, - "learning_rate": 4.9864178565653084e-05, - "loss": 0.3197, + "epoch": 0.31859300104515803, + "grad_norm": 0.15360099077224731, + "learning_rate": 4.98512583984294e-05, + "loss": 0.4508, "step": 8840 }, { - "epoch": 0.31, - "learning_rate": 4.98638818607054e-05, - "loss": 0.3333, + "epoch": 0.31877320070638265, + "grad_norm": 0.16948221623897552, + "learning_rate": 4.985094038115043e-05, + "loss": 0.4559, "step": 8845 }, { - "epoch": 0.31, - "learning_rate": 4.986358483291658e-05, - "loss": 0.3281, + "epoch": 0.31895340036760733, + "grad_norm": 0.1563452035188675, + "learning_rate": 4.985062202528221e-05, + "loss": 0.4163, "step": 8850 }, { - "epoch": 0.31, - "learning_rate": 4.98632874822905e-05, - "loss": 0.3363, + "epoch": 0.31913360002883195, + "grad_norm": 0.20190612971782684, + "learning_rate": 4.985030333082908e-05, + "loss": 0.4037, "step": 8855 }, { - "epoch": 0.31, - "learning_rate": 4.986298980883102e-05, - "loss": 0.3327, + "epoch": 0.3193137996900566, + "grad_norm": 0.17092286050319672, + "learning_rate": 4.984998429779538e-05, + "loss": 0.4339, "step": 8860 }, { - "epoch": 0.31, - "learning_rate": 4.9862691812542e-05, - "loss": 0.3381, + "epoch": 0.3194939993512812, + "grad_norm": 0.1695345640182495, + "learning_rate": 4.9849664926185445e-05, + "loss": 0.4485, "step": 8865 }, { - "epoch": 0.31, - "learning_rate": 4.986239349342732e-05, - 
"loss": 0.3479, + "epoch": 0.3196741990125059, + "grad_norm": 0.17635978758335114, + "learning_rate": 4.9849345216003654e-05, + "loss": 0.4437, "step": 8870 }, { - "epoch": 0.31, - "learning_rate": 4.9862094851490836e-05, - "loss": 0.3281, + "epoch": 0.3198543986737305, + "grad_norm": 0.17225243151187897, + "learning_rate": 4.9849025167254324e-05, + "loss": 0.4684, "step": 8875 }, { - "epoch": 0.31, - "learning_rate": 4.986179588673644e-05, - "loss": 0.3317, + "epoch": 0.3200345983349551, + "grad_norm": 0.21078833937644958, + "learning_rate": 4.984870477994186e-05, + "loss": 0.473, "step": 8880 }, { - "epoch": 0.31, - "learning_rate": 4.9861496599168006e-05, - "loss": 0.3445, + "epoch": 0.32021479799617975, + "grad_norm": 0.16367143392562866, + "learning_rate": 4.9848384054070584e-05, + "loss": 0.4629, "step": 8885 }, { - "epoch": 0.31, - "learning_rate": 4.9861196988789425e-05, - "loss": 0.3412, + "epoch": 0.3203949976574044, + "grad_norm": 0.1702301949262619, + "learning_rate": 4.9848062989644894e-05, + "loss": 0.445, "step": 8890 }, { - "epoch": 0.31, - "learning_rate": 4.986089705560458e-05, - "loss": 0.3177, + "epoch": 0.32057519731862905, + "grad_norm": 0.1748560667037964, + "learning_rate": 4.984774158666916e-05, + "loss": 0.4815, "step": 8895 }, { - "epoch": 0.31, - "learning_rate": 4.9860596799617375e-05, - "loss": 0.3395, + "epoch": 0.32075539697985367, + "grad_norm": 0.20928102731704712, + "learning_rate": 4.9847419845147755e-05, + "loss": 0.4564, "step": 8900 }, { - "epoch": 0.31, - "learning_rate": 4.986029622083171e-05, - "loss": 0.3146, + "epoch": 0.3209355966410783, + "grad_norm": 0.28013554215431213, + "learning_rate": 4.984709776508506e-05, + "loss": 0.473, "step": 8905 }, { - "epoch": 0.31, - "learning_rate": 4.9859995319251475e-05, - "loss": 0.3429, + "epoch": 0.32111579630230297, + "grad_norm": 0.14769336581230164, + "learning_rate": 4.984677534648548e-05, + "loss": 0.4582, "step": 8910 }, { - "epoch": 0.31, - "learning_rate": 4.985969409488059e-05, - "loss": 0.3353, + "epoch": 0.3212959959635276, + "grad_norm": 0.2364906519651413, + "learning_rate": 4.984645258935339e-05, + "loss": 0.468, "step": 8915 }, { - "epoch": 0.31, - "learning_rate": 4.985939254772295e-05, - "loss": 0.325, + "epoch": 0.3214761956247522, + "grad_norm": 0.13622739911079407, + "learning_rate": 4.9846129493693183e-05, + "loss": 0.4214, "step": 8920 }, { - "epoch": 0.31, - "learning_rate": 4.9859090677782494e-05, - "loss": 0.3591, + "epoch": 0.32165639528597684, + "grad_norm": 0.2001640349626541, + "learning_rate": 4.984580605950929e-05, + "loss": 0.4832, "step": 8925 }, { - "epoch": 0.31, - "learning_rate": 4.985878848506312e-05, - "loss": 0.3047, + "epoch": 0.3218365949472015, + "grad_norm": 0.18895497918128967, + "learning_rate": 4.984548228680609e-05, + "loss": 0.498, "step": 8930 }, { - "epoch": 0.31, - "learning_rate": 4.9858485969568755e-05, - "loss": 0.3302, + "epoch": 0.32201679460842614, + "grad_norm": 0.18323996663093567, + "learning_rate": 4.9845158175588006e-05, + "loss": 0.4827, "step": 8935 }, { - "epoch": 0.31, - "learning_rate": 4.9858183131303344e-05, - "loss": 0.2988, + "epoch": 0.32219699426965076, + "grad_norm": 0.16754454374313354, + "learning_rate": 4.9844833725859454e-05, + "loss": 0.4472, "step": 8940 }, { - "epoch": 0.31, - "learning_rate": 4.9857879970270805e-05, - "loss": 0.3423, + "epoch": 0.3223771939308754, + "grad_norm": 0.1655125617980957, + "learning_rate": 4.9844508937624844e-05, + "loss": 0.4637, "step": 8945 }, { - "epoch": 0.31, - "learning_rate": 
4.985757648647507e-05, - "loss": 0.3119, + "epoch": 0.32255739359210006, + "grad_norm": 0.20413149893283844, + "learning_rate": 4.984418381088862e-05, + "loss": 0.444, "step": 8950 }, { - "epoch": 0.32, - "learning_rate": 4.985727267992009e-05, - "loss": 0.3219, + "epoch": 0.3227375932533247, + "grad_norm": 0.16138513386249542, + "learning_rate": 4.98438583456552e-05, + "loss": 0.4388, "step": 8955 }, { - "epoch": 0.32, - "learning_rate": 4.98569685506098e-05, - "loss": 0.3093, + "epoch": 0.3229177929145493, + "grad_norm": 0.194508358836174, + "learning_rate": 4.9843532541929016e-05, + "loss": 0.4823, "step": 8960 }, { - "epoch": 0.32, - "learning_rate": 4.9856664098548165e-05, - "loss": 0.3414, + "epoch": 0.323097992575774, + "grad_norm": 0.19578173756599426, + "learning_rate": 4.9843206399714516e-05, + "loss": 0.4623, "step": 8965 }, { - "epoch": 0.32, - "learning_rate": 4.9856359323739124e-05, - "loss": 0.3321, + "epoch": 0.3232781922369986, + "grad_norm": 0.19344592094421387, + "learning_rate": 4.984287991901613e-05, + "loss": 0.4333, "step": 8970 }, { - "epoch": 0.32, - "learning_rate": 4.985605422618663e-05, - "loss": 0.3158, + "epoch": 0.32345839189822323, + "grad_norm": 0.20126448571681976, + "learning_rate": 4.9842553099838324e-05, + "loss": 0.4837, "step": 8975 }, { - "epoch": 0.32, - "learning_rate": 4.985574880589467e-05, - "loss": 0.3373, + "epoch": 0.32363859155944785, + "grad_norm": 0.1657496839761734, + "learning_rate": 4.9842225942185536e-05, + "loss": 0.4983, "step": 8980 }, { - "epoch": 0.32, - "learning_rate": 4.985544306286717e-05, - "loss": 0.341, + "epoch": 0.32381879122067253, + "grad_norm": 0.14652882516384125, + "learning_rate": 4.984189844606223e-05, + "loss": 0.4724, "step": 8985 }, { - "epoch": 0.32, - "learning_rate": 4.9855136997108146e-05, - "loss": 0.334, + "epoch": 0.32399899088189715, + "grad_norm": 0.15776591002941132, + "learning_rate": 4.984157061147287e-05, + "loss": 0.4545, "step": 8990 }, { - "epoch": 0.32, - "learning_rate": 4.985483060862153e-05, - "loss": 0.3127, + "epoch": 0.3241791905431218, + "grad_norm": 0.1731184720993042, + "learning_rate": 4.984124243842192e-05, + "loss": 0.4698, "step": 8995 }, { - "epoch": 0.32, - "learning_rate": 4.985452389741132e-05, - "loss": 0.3007, + "epoch": 0.3243593902043464, + "grad_norm": 0.16583359241485596, + "learning_rate": 4.984091392691385e-05, + "loss": 0.4673, "step": 9000 }, { - "epoch": 0.32, - "eval_loss": 0.32597243785858154, - "eval_runtime": 10.5417, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 0.3243593902043464, + "eval_loss": 0.48290419578552246, + "eval_runtime": 3.5289, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 7.084, "step": 9000 }, { - "epoch": 0.32, - "learning_rate": 4.9854216863481505e-05, - "loss": 0.357, + "epoch": 0.3245395898655711, + "grad_norm": 0.22497062385082245, + "learning_rate": 4.984058507695314e-05, + "loss": 0.4875, "step": 9005 }, { - "epoch": 0.32, - "learning_rate": 4.985390950683606e-05, - "loss": 0.3185, + "epoch": 0.3247197895267957, + "grad_norm": 0.1878102570772171, + "learning_rate": 4.9840255888544265e-05, + "loss": 0.4555, "step": 9010 }, { - "epoch": 0.32, - "learning_rate": 4.985360182747898e-05, - "loss": 0.3264, + "epoch": 0.3248999891880203, + "grad_norm": 0.2143104523420334, + "learning_rate": 4.983992636169171e-05, + "loss": 0.4425, "step": 9015 }, { - "epoch": 0.32, - "learning_rate": 4.985329382541426e-05, - "loss": 0.333, + "epoch": 0.32508018884924494, + "grad_norm": 0.18537423014640808, + 
"learning_rate": 4.9839596496399964e-05, + "loss": 0.4809, "step": 9020 }, { - "epoch": 0.32, - "learning_rate": 4.98529855006459e-05, - "loss": 0.3089, + "epoch": 0.3252603885104696, + "grad_norm": 0.1459311693906784, + "learning_rate": 4.983926629267353e-05, + "loss": 0.4394, "step": 9025 }, { - "epoch": 0.32, - "learning_rate": 4.985267685317789e-05, - "loss": 0.3267, + "epoch": 0.32544058817169425, + "grad_norm": 0.16225488483905792, + "learning_rate": 4.98389357505169e-05, + "loss": 0.4756, "step": 9030 }, { - "epoch": 0.32, - "learning_rate": 4.985236788301427e-05, - "loss": 0.3334, + "epoch": 0.32562078783291887, + "grad_norm": 0.21932868659496307, + "learning_rate": 4.983860486993458e-05, + "loss": 0.4869, "step": 9035 }, { - "epoch": 0.32, - "learning_rate": 4.985205859015902e-05, - "loss": 0.3135, + "epoch": 0.3258009874941435, + "grad_norm": 0.17709140479564667, + "learning_rate": 4.983827365093109e-05, + "loss": 0.4538, "step": 9040 }, { - "epoch": 0.32, - "learning_rate": 4.985174897461616e-05, - "loss": 0.338, + "epoch": 0.32598118715536817, + "grad_norm": 0.17748679220676422, + "learning_rate": 4.9837942093510914e-05, + "loss": 0.4759, "step": 9045 }, { - "epoch": 0.32, - "learning_rate": 4.985143903638973e-05, - "loss": 0.3489, + "epoch": 0.3261613868165928, + "grad_norm": 0.17787578701972961, + "learning_rate": 4.983761019767859e-05, + "loss": 0.4355, "step": 9050 }, { - "epoch": 0.32, - "learning_rate": 4.985112877548373e-05, - "loss": 0.3145, + "epoch": 0.3263415864778174, + "grad_norm": 0.21222223341464996, + "learning_rate": 4.983727796343864e-05, + "loss": 0.4351, "step": 9055 }, { - "epoch": 0.32, - "learning_rate": 4.9850818191902216e-05, - "loss": 0.3318, + "epoch": 0.32652178613904204, + "grad_norm": 0.18843764066696167, + "learning_rate": 4.983694539079558e-05, + "loss": 0.4608, "step": 9060 }, { - "epoch": 0.32, - "learning_rate": 4.9850507285649196e-05, - "loss": 0.316, + "epoch": 0.3267019858002667, + "grad_norm": 0.2049497812986374, + "learning_rate": 4.9836612479753955e-05, + "loss": 0.468, "step": 9065 }, { - "epoch": 0.32, - "learning_rate": 4.985019605672871e-05, - "loss": 0.3234, + "epoch": 0.32688218546149134, + "grad_norm": 0.1821064054965973, + "learning_rate": 4.9836279230318286e-05, + "loss": 0.4842, "step": 9070 }, { - "epoch": 0.32, - "learning_rate": 4.984988450514482e-05, - "loss": 0.3117, + "epoch": 0.32706238512271596, + "grad_norm": 0.18487945199012756, + "learning_rate": 4.983594564249312e-05, + "loss": 0.4394, "step": 9075 }, { - "epoch": 0.32, - "learning_rate": 4.984957263090154e-05, - "loss": 0.3166, + "epoch": 0.3272425847839406, + "grad_norm": 0.19593387842178345, + "learning_rate": 4.9835611716283015e-05, + "loss": 0.4812, "step": 9080 }, { - "epoch": 0.32, - "learning_rate": 4.984926043400295e-05, - "loss": 0.3375, + "epoch": 0.32742278444516526, + "grad_norm": 0.1479068547487259, + "learning_rate": 4.98352774516925e-05, + "loss": 0.4608, "step": 9085 }, { - "epoch": 0.32, - "learning_rate": 4.984894791445308e-05, - "loss": 0.32, + "epoch": 0.3276029841063899, + "grad_norm": 0.21857762336730957, + "learning_rate": 4.983494284872614e-05, + "loss": 0.4715, "step": 9090 }, { - "epoch": 0.32, - "learning_rate": 4.984863507225601e-05, - "loss": 0.3319, + "epoch": 0.3277831837676145, + "grad_norm": 0.18269813060760498, + "learning_rate": 4.983460790738849e-05, + "loss": 0.4397, "step": 9095 }, { - "epoch": 0.32, - "learning_rate": 4.984832190741577e-05, - "loss": 0.3137, + "epoch": 0.32796338342883913, + "grad_norm": 0.1688721776008606, + 
"learning_rate": 4.983427262768411e-05, + "loss": 0.4727, "step": 9100 }, { - "epoch": 0.32, - "learning_rate": 4.984800841993647e-05, - "loss": 0.3157, + "epoch": 0.3281435830900638, + "grad_norm": 0.2021578699350357, + "learning_rate": 4.983393700961758e-05, + "loss": 0.4429, "step": 9105 }, { - "epoch": 0.32, - "learning_rate": 4.9847694609822133e-05, - "loss": 0.3358, + "epoch": 0.32832378275128843, + "grad_norm": 0.16843554377555847, + "learning_rate": 4.9833601053193465e-05, + "loss": 0.4566, "step": 9110 }, { - "epoch": 0.32, - "learning_rate": 4.9847380477076866e-05, - "loss": 0.3316, + "epoch": 0.32850398241251305, + "grad_norm": 0.19692714512348175, + "learning_rate": 4.983326475841635e-05, + "loss": 0.4815, "step": 9115 }, { - "epoch": 0.32, - "learning_rate": 4.984706602170474e-05, - "loss": 0.3314, + "epoch": 0.3286841820737377, + "grad_norm": 0.20175185799598694, + "learning_rate": 4.983292812529081e-05, + "loss": 0.4429, "step": 9120 }, { - "epoch": 0.32, - "learning_rate": 4.9846751243709835e-05, - "loss": 0.3301, + "epoch": 0.32886438173496235, + "grad_norm": 0.16880235075950623, + "learning_rate": 4.9832591153821424e-05, + "loss": 0.4417, "step": 9125 }, { - "epoch": 0.32, - "learning_rate": 4.9846436143096245e-05, - "loss": 0.3047, + "epoch": 0.329044581396187, + "grad_norm": 0.1737411469221115, + "learning_rate": 4.983225384401279e-05, + "loss": 0.4304, "step": 9130 }, { - "epoch": 0.32, - "learning_rate": 4.9846120719868045e-05, - "loss": 0.324, + "epoch": 0.3292247810574116, + "grad_norm": 0.20997703075408936, + "learning_rate": 4.983191619586951e-05, + "loss": 0.4775, "step": 9135 }, { - "epoch": 0.32, - "learning_rate": 4.984580497402935e-05, - "loss": 0.3417, + "epoch": 0.3294049807186363, + "grad_norm": 0.20304125547409058, + "learning_rate": 4.9831578209396186e-05, + "loss": 0.4416, "step": 9140 }, { - "epoch": 0.32, - "learning_rate": 4.9845488905584245e-05, - "loss": 0.3142, + "epoch": 0.3295851803798609, + "grad_norm": 0.22506339848041534, + "learning_rate": 4.9831239884597407e-05, + "loss": 0.531, "step": 9145 }, { - "epoch": 0.32, - "learning_rate": 4.9845172514536844e-05, - "loss": 0.3444, + "epoch": 0.3297653800410855, + "grad_norm": 0.16725526750087738, + "learning_rate": 4.983090122147779e-05, + "loss": 0.4536, "step": 9150 }, { - "epoch": 0.32, - "learning_rate": 4.984485580089125e-05, - "loss": 0.3562, + "epoch": 0.32994557970231014, + "grad_norm": 0.1725703775882721, + "learning_rate": 4.983056222004196e-05, + "loss": 0.4525, "step": 9155 }, { - "epoch": 0.32, - "learning_rate": 4.9844538764651584e-05, - "loss": 0.3122, + "epoch": 0.3301257793635348, + "grad_norm": 0.13673150539398193, + "learning_rate": 4.9830222880294525e-05, + "loss": 0.4515, "step": 9160 }, { - "epoch": 0.32, - "learning_rate": 4.9844221405821946e-05, - "loss": 0.3444, + "epoch": 0.33030597902475944, + "grad_norm": 0.19130565226078033, + "learning_rate": 4.982988320224011e-05, + "loss": 0.4683, "step": 9165 }, { - "epoch": 0.32, - "learning_rate": 4.984390372440647e-05, - "loss": 0.2972, + "epoch": 0.33048617868598407, + "grad_norm": 0.2034883350133896, + "learning_rate": 4.9829543185883344e-05, + "loss": 0.4899, "step": 9170 }, { - "epoch": 0.32, - "learning_rate": 4.984358572040928e-05, - "loss": 0.3356, + "epoch": 0.3306663783472087, + "grad_norm": 0.22771266102790833, + "learning_rate": 4.982920283122885e-05, + "loss": 0.4736, "step": 9175 }, { - "epoch": 0.32, - "learning_rate": 4.9843267393834495e-05, - "loss": 0.3507, + "epoch": 0.33084657800843337, + "grad_norm": 
0.18065966665744781, + "learning_rate": 4.982886213828128e-05, + "loss": 0.4542, "step": 9180 }, { - "epoch": 0.32, - "learning_rate": 4.9842948744686256e-05, - "loss": 0.3003, + "epoch": 0.331026777669658, + "grad_norm": 0.1368507593870163, + "learning_rate": 4.9828521107045276e-05, + "loss": 0.4624, "step": 9185 }, { - "epoch": 0.32, - "learning_rate": 4.984262977296871e-05, - "loss": 0.3545, + "epoch": 0.3312069773308826, + "grad_norm": 0.16015149652957916, + "learning_rate": 4.982817973752548e-05, + "loss": 0.4361, "step": 9190 }, { - "epoch": 0.32, - "learning_rate": 4.9842310478685975e-05, - "loss": 0.3167, + "epoch": 0.33138717699210724, + "grad_norm": 0.19293028116226196, + "learning_rate": 4.9827838029726535e-05, + "loss": 0.4403, "step": 9195 }, { - "epoch": 0.32, - "learning_rate": 4.984199086184222e-05, - "loss": 0.3063, + "epoch": 0.3315673766533319, + "grad_norm": 0.18895958364009857, + "learning_rate": 4.9827495983653104e-05, + "loss": 0.4999, "step": 9200 }, { - "epoch": 0.32, - "learning_rate": 4.984167092244158e-05, - "loss": 0.3186, + "epoch": 0.33174757631455654, + "grad_norm": 0.16563822329044342, + "learning_rate": 4.982715359930985e-05, + "loss": 0.436, "step": 9205 }, { - "epoch": 0.32, - "learning_rate": 4.9841350660488215e-05, - "loss": 0.3231, + "epoch": 0.33192777597578116, + "grad_norm": 0.21397612988948822, + "learning_rate": 4.982681087670144e-05, + "loss": 0.4835, "step": 9210 }, { - "epoch": 0.32, - "learning_rate": 4.9841030075986284e-05, - "loss": 0.3199, + "epoch": 0.3321079756370058, + "grad_norm": 0.18013893067836761, + "learning_rate": 4.982646781583252e-05, + "loss": 0.4622, "step": 9215 }, { - "epoch": 0.32, - "learning_rate": 4.9840709168939945e-05, - "loss": 0.3169, + "epoch": 0.33228817529823046, + "grad_norm": 0.19024062156677246, + "learning_rate": 4.98261244167078e-05, + "loss": 0.4419, "step": 9220 }, { - "epoch": 0.32, - "learning_rate": 4.9840387939353375e-05, - "loss": 0.3304, + "epoch": 0.3324683749594551, + "grad_norm": 0.18336628377437592, + "learning_rate": 4.9825780679331935e-05, + "loss": 0.4525, "step": 9225 }, { - "epoch": 0.32, - "learning_rate": 4.984006638723073e-05, - "loss": 0.3147, + "epoch": 0.3326485746206797, + "grad_norm": 0.17188718914985657, + "learning_rate": 4.982543660370962e-05, + "loss": 0.4737, "step": 9230 }, { - "epoch": 0.32, - "learning_rate": 4.9839744512576205e-05, - "loss": 0.329, + "epoch": 0.3328287742819043, + "grad_norm": 0.1877148300409317, + "learning_rate": 4.982509218984553e-05, + "loss": 0.4557, "step": 9235 }, { - "epoch": 0.33, - "learning_rate": 4.9839422315393954e-05, - "loss": 0.3072, + "epoch": 0.333008973943129, + "grad_norm": 0.2199414074420929, + "learning_rate": 4.982474743774437e-05, + "loss": 0.4892, "step": 9240 }, { - "epoch": 0.33, - "learning_rate": 4.983909979568818e-05, - "loss": 0.3553, + "epoch": 0.33318917360435363, + "grad_norm": 0.17190028727054596, + "learning_rate": 4.982440234741082e-05, + "loss": 0.487, "step": 9245 }, { - "epoch": 0.33, - "learning_rate": 4.983877695346306e-05, - "loss": 0.332, + "epoch": 0.33336937326557825, + "grad_norm": 0.2042684108018875, + "learning_rate": 4.9824056918849614e-05, + "loss": 0.4648, "step": 9250 }, { - "epoch": 0.33, - "learning_rate": 4.983845378872279e-05, - "loss": 0.3178, + "epoch": 0.3335495729268029, + "grad_norm": 0.15054477751255035, + "learning_rate": 4.9823711152065425e-05, + "loss": 0.4622, "step": 9255 }, { - "epoch": 0.33, - "learning_rate": 4.983813030147158e-05, - "loss": 0.328, + "epoch": 0.33372977258802755, + 
"grad_norm": 0.20531079173088074, + "learning_rate": 4.9823365047062986e-05, + "loss": 0.466, "step": 9260 }, { - "epoch": 0.33, - "learning_rate": 4.9837806491713606e-05, - "loss": 0.3063, + "epoch": 0.3339099722492522, + "grad_norm": 0.17524632811546326, + "learning_rate": 4.9823018603847e-05, + "loss": 0.4271, "step": 9265 }, { - "epoch": 0.33, - "learning_rate": 4.983748235945309e-05, - "loss": 0.3263, + "epoch": 0.3340901719104768, + "grad_norm": 0.21310240030288696, + "learning_rate": 4.9822671822422195e-05, + "loss": 0.4699, "step": 9270 }, { - "epoch": 0.33, - "learning_rate": 4.9837157904694234e-05, - "loss": 0.3139, + "epoch": 0.3342703715717014, + "grad_norm": 0.19141936302185059, + "learning_rate": 4.982232470279329e-05, + "loss": 0.4508, "step": 9275 }, { - "epoch": 0.33, - "learning_rate": 4.983683312744124e-05, - "loss": 0.3351, + "epoch": 0.3344505712329261, + "grad_norm": 0.21620051562786102, + "learning_rate": 4.9821977244965014e-05, + "loss": 0.502, "step": 9280 }, { - "epoch": 0.33, - "learning_rate": 4.983650802769834e-05, - "loss": 0.3058, + "epoch": 0.3346307708941507, + "grad_norm": 0.16024553775787354, + "learning_rate": 4.98216294489421e-05, + "loss": 0.442, "step": 9285 }, { - "epoch": 0.33, - "learning_rate": 4.983618260546976e-05, - "loss": 0.3224, + "epoch": 0.33481097055537534, + "grad_norm": 0.21437714993953705, + "learning_rate": 4.982128131472929e-05, + "loss": 0.4775, "step": 9290 }, { - "epoch": 0.33, - "learning_rate": 4.9835856860759714e-05, - "loss": 0.3122, + "epoch": 0.3349911702166, + "grad_norm": 0.18475022912025452, + "learning_rate": 4.982093284233134e-05, + "loss": 0.4903, "step": 9295 }, { - "epoch": 0.33, - "learning_rate": 4.9835530793572425e-05, - "loss": 0.2956, + "epoch": 0.33517136987782464, + "grad_norm": 0.13329213857650757, + "learning_rate": 4.982058403175298e-05, + "loss": 0.443, "step": 9300 }, { - "epoch": 0.33, - "learning_rate": 4.983520440391214e-05, - "loss": 0.3209, + "epoch": 0.33535156953904927, + "grad_norm": 0.16213421523571014, + "learning_rate": 4.982023488299897e-05, + "loss": 0.4502, "step": 9305 }, { - "epoch": 0.33, - "learning_rate": 4.9834877691783094e-05, - "loss": 0.3261, + "epoch": 0.3355317692002739, + "grad_norm": 0.19696162641048431, + "learning_rate": 4.981988539607406e-05, + "loss": 0.4939, "step": 9310 }, { - "epoch": 0.33, - "learning_rate": 4.983455065718953e-05, - "loss": 0.3342, + "epoch": 0.33571196886149857, + "grad_norm": 0.153734490275383, + "learning_rate": 4.981953557098302e-05, + "loss": 0.4557, "step": 9315 }, { - "epoch": 0.33, - "learning_rate": 4.983422330013569e-05, - "loss": 0.3251, + "epoch": 0.3358921685227232, + "grad_norm": 0.16626779735088348, + "learning_rate": 4.981918540773061e-05, + "loss": 0.4774, "step": 9320 }, { - "epoch": 0.33, - "learning_rate": 4.983389562062583e-05, - "loss": 0.3228, + "epoch": 0.3360723681839478, + "grad_norm": 0.2404673844575882, + "learning_rate": 4.981883490632161e-05, + "loss": 0.4717, "step": 9325 }, { - "epoch": 0.33, - "learning_rate": 4.98335676186642e-05, - "loss": 0.3524, + "epoch": 0.33625256784517243, + "grad_norm": 0.19677095115184784, + "learning_rate": 4.9818484066760786e-05, + "loss": 0.4442, "step": 9330 }, { - "epoch": 0.33, - "learning_rate": 4.9833239294255054e-05, - "loss": 0.342, + "epoch": 0.3364327675063971, + "grad_norm": 0.17953583598136902, + "learning_rate": 4.9818132889052914e-05, + "loss": 0.4613, "step": 9335 }, { - "epoch": 0.33, - "learning_rate": 4.9832910647402666e-05, - "loss": 0.3058, + "epoch": 0.33661296716762173, + 
"grad_norm": 0.19360299408435822, + "learning_rate": 4.98177813732028e-05, + "loss": 0.4416, "step": 9340 }, { - "epoch": 0.33, - "learning_rate": 4.9832581678111304e-05, - "loss": 0.3527, + "epoch": 0.33679316682884636, + "grad_norm": 0.16744250059127808, + "learning_rate": 4.9817429519215206e-05, + "loss": 0.454, "step": 9345 }, { - "epoch": 0.33, - "learning_rate": 4.983225238638523e-05, - "loss": 0.3186, + "epoch": 0.336973366490071, + "grad_norm": 0.15772564709186554, + "learning_rate": 4.981707732709495e-05, + "loss": 0.4651, "step": 9350 }, { - "epoch": 0.33, - "learning_rate": 4.983192277222872e-05, - "loss": 0.3057, + "epoch": 0.33715356615129566, + "grad_norm": 0.16711197793483734, + "learning_rate": 4.9816724796846814e-05, + "loss": 0.4269, "step": 9355 }, { - "epoch": 0.33, - "learning_rate": 4.9831592835646064e-05, - "loss": 0.3024, + "epoch": 0.3373337658125203, + "grad_norm": 0.14946970343589783, + "learning_rate": 4.981637192847561e-05, + "loss": 0.4646, "step": 9360 }, { - "epoch": 0.33, - "learning_rate": 4.983126257664153e-05, - "loss": 0.3142, + "epoch": 0.3375139654737449, + "grad_norm": 0.20291027426719666, + "learning_rate": 4.9816018721986145e-05, + "loss": 0.4598, "step": 9365 }, { - "epoch": 0.33, - "learning_rate": 4.9830931995219426e-05, - "loss": 0.3291, + "epoch": 0.3376941651349695, + "grad_norm": 0.1772489696741104, + "learning_rate": 4.981566517738323e-05, + "loss": 0.4794, "step": 9370 }, { - "epoch": 0.33, - "learning_rate": 4.983060109138403e-05, - "loss": 0.3053, + "epoch": 0.3378743647961942, + "grad_norm": 0.1733308732509613, + "learning_rate": 4.981531129467168e-05, + "loss": 0.4581, "step": 9375 }, { - "epoch": 0.33, - "learning_rate": 4.9830269865139643e-05, - "loss": 0.34, + "epoch": 0.3380545644574188, + "grad_norm": 0.13439474999904633, + "learning_rate": 4.981495707385632e-05, + "loss": 0.4519, "step": 9380 }, { - "epoch": 0.33, - "learning_rate": 4.9829938316490574e-05, - "loss": 0.3537, + "epoch": 0.33823476411864345, + "grad_norm": 0.15566691756248474, + "learning_rate": 4.9814602514941965e-05, + "loss": 0.452, "step": 9385 }, { - "epoch": 0.33, - "learning_rate": 4.9829606445441106e-05, - "loss": 0.3217, + "epoch": 0.33841496377986807, + "grad_norm": 0.1827496588230133, + "learning_rate": 4.981424761793346e-05, + "loss": 0.4634, "step": 9390 }, { - "epoch": 0.33, - "learning_rate": 4.982927425199557e-05, - "loss": 0.2932, + "epoch": 0.33859516344109275, + "grad_norm": 0.16756999492645264, + "learning_rate": 4.9813892382835635e-05, + "loss": 0.485, "step": 9395 }, { - "epoch": 0.33, - "learning_rate": 4.982894173615827e-05, - "loss": 0.3026, + "epoch": 0.3387753631023174, + "grad_norm": 0.17952053248882294, + "learning_rate": 4.981353680965334e-05, + "loss": 0.4769, "step": 9400 }, { - "epoch": 0.33, - "learning_rate": 4.982860889793352e-05, - "loss": 0.3515, + "epoch": 0.338955562763542, + "grad_norm": 0.14793772995471954, + "learning_rate": 4.98131808983914e-05, + "loss": 0.4435, "step": 9405 }, { - "epoch": 0.33, - "learning_rate": 4.982827573732565e-05, - "loss": 0.3247, + "epoch": 0.3391357624247666, + "grad_norm": 0.19632504880428314, + "learning_rate": 4.9812824649054674e-05, + "loss": 0.3992, "step": 9410 }, { - "epoch": 0.33, - "learning_rate": 4.9827942254338975e-05, - "loss": 0.3391, + "epoch": 0.3393159620859913, + "grad_norm": 0.19574934244155884, + "learning_rate": 4.9812468061648024e-05, + "loss": 0.489, "step": 9415 }, { - "epoch": 0.33, - "learning_rate": 4.982760844897785e-05, - "loss": 0.3342, + "epoch": 
0.3394961617472159, + "grad_norm": 0.22474491596221924, + "learning_rate": 4.981211113617629e-05, + "loss": 0.4526, "step": 9420 }, { - "epoch": 0.33, - "learning_rate": 4.9827274321246575e-05, - "loss": 0.3341, + "epoch": 0.33967636140844054, + "grad_norm": 0.18208986520767212, + "learning_rate": 4.981175387264435e-05, + "loss": 0.4648, "step": 9425 }, { - "epoch": 0.33, - "learning_rate": 4.98269398711495e-05, - "loss": 0.3457, + "epoch": 0.33985656106966516, + "grad_norm": 0.16022849082946777, + "learning_rate": 4.9811396271057067e-05, + "loss": 0.4527, "step": 9430 }, { - "epoch": 0.33, - "learning_rate": 4.982660509869099e-05, - "loss": 0.3443, + "epoch": 0.34003676073088984, + "grad_norm": 0.21129754185676575, + "learning_rate": 4.981103833141931e-05, + "loss": 0.4382, "step": 9435 }, { - "epoch": 0.33, - "learning_rate": 4.9826270003875364e-05, - "loss": 0.3341, + "epoch": 0.34021696039211446, + "grad_norm": 0.20417428016662598, + "learning_rate": 4.981068005373597e-05, + "loss": 0.4474, "step": 9440 }, { - "epoch": 0.33, - "learning_rate": 4.9825934586706986e-05, - "loss": 0.309, + "epoch": 0.3403971600533391, + "grad_norm": 0.21465253829956055, + "learning_rate": 4.981032143801191e-05, + "loss": 0.4521, "step": 9445 }, { - "epoch": 0.33, - "learning_rate": 4.9825598847190205e-05, - "loss": 0.3391, + "epoch": 0.3405773597145637, + "grad_norm": 0.19207169115543365, + "learning_rate": 4.980996248425202e-05, + "loss": 0.456, "step": 9450 }, { - "epoch": 0.33, - "learning_rate": 4.9825262785329384e-05, - "loss": 0.3325, + "epoch": 0.3407575593757884, + "grad_norm": 0.1812460720539093, + "learning_rate": 4.98096031924612e-05, + "loss": 0.469, "step": 9455 }, { - "epoch": 0.33, - "learning_rate": 4.9824926401128894e-05, - "loss": 0.2993, + "epoch": 0.340937759037013, + "grad_norm": 0.21918563544750214, + "learning_rate": 4.9809243562644334e-05, + "loss": 0.4829, "step": 9460 }, { - "epoch": 0.33, - "learning_rate": 4.9824589694593086e-05, - "loss": 0.3034, + "epoch": 0.34111795869823763, + "grad_norm": 0.18760140240192413, + "learning_rate": 4.980888359480634e-05, + "loss": 0.4243, "step": 9465 }, { - "epoch": 0.33, - "learning_rate": 4.982425266572634e-05, - "loss": 0.3118, + "epoch": 0.3412981583594623, + "grad_norm": 0.1885528415441513, + "learning_rate": 4.98085232889521e-05, + "loss": 0.4846, "step": 9470 }, { - "epoch": 0.33, - "learning_rate": 4.982391531453304e-05, - "loss": 0.3312, + "epoch": 0.34147835802068693, + "grad_norm": 0.16368678212165833, + "learning_rate": 4.980816264508654e-05, + "loss": 0.4488, "step": 9475 }, { - "epoch": 0.33, - "learning_rate": 4.982357764101755e-05, - "loss": 0.3397, + "epoch": 0.34165855768191156, + "grad_norm": 0.21680162847042084, + "learning_rate": 4.980780166321456e-05, + "loss": 0.4932, "step": 9480 }, { - "epoch": 0.33, - "learning_rate": 4.982323964518427e-05, - "loss": 0.3235, + "epoch": 0.3418387573431362, + "grad_norm": 0.19433487951755524, + "learning_rate": 4.9807440343341095e-05, + "loss": 0.4472, "step": 9485 }, { - "epoch": 0.33, - "learning_rate": 4.982290132703759e-05, - "loss": 0.3259, + "epoch": 0.34201895700436086, + "grad_norm": 0.16287468373775482, + "learning_rate": 4.980707868547105e-05, + "loss": 0.4767, "step": 9490 }, { - "epoch": 0.33, - "learning_rate": 4.9822562686581884e-05, - "loss": 0.336, + "epoch": 0.3421991566655855, + "grad_norm": 0.15700995922088623, + "learning_rate": 4.9806716689609356e-05, + "loss": 0.4508, "step": 9495 }, { - "epoch": 0.33, - "learning_rate": 4.982222372382156e-05, - "loss": 0.3457, + 
"epoch": 0.3423793563268101, + "grad_norm": 0.2554052472114563, + "learning_rate": 4.980635435576096e-05, + "loss": 0.4737, "step": 9500 }, { - "epoch": 0.33, - "eval_loss": 0.32349956035614014, - "eval_runtime": 10.5421, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 0.3423793563268101, + "eval_loss": 0.4815308451652527, + "eval_runtime": 3.5226, + "eval_samples_per_second": 28.388, + "eval_steps_per_second": 7.097, "step": 9500 }, { - "epoch": 0.33, - "learning_rate": 4.982188443876102e-05, - "loss": 0.3081, + "epoch": 0.3425595559880347, + "grad_norm": 0.21748638153076172, + "learning_rate": 4.980599168393079e-05, + "loss": 0.4881, "step": 9505 }, { - "epoch": 0.33, - "learning_rate": 4.982154483140468e-05, - "loss": 0.3298, + "epoch": 0.3427397556492594, + "grad_norm": 0.18842586874961853, + "learning_rate": 4.9805628674123774e-05, + "loss": 0.4897, "step": 9510 }, { - "epoch": 0.33, - "learning_rate": 4.982120490175692e-05, - "loss": 0.3431, + "epoch": 0.342919955310484, + "grad_norm": 0.1802000105381012, + "learning_rate": 4.9805265326344874e-05, + "loss": 0.4584, "step": 9515 }, { - "epoch": 0.33, - "learning_rate": 4.982086464982219e-05, - "loss": 0.303, + "epoch": 0.34310015497170865, + "grad_norm": 0.1921650469303131, + "learning_rate": 4.980490164059904e-05, + "loss": 0.4693, "step": 9520 }, { - "epoch": 0.34, - "learning_rate": 4.982052407560488e-05, - "loss": 0.3386, + "epoch": 0.34328035463293327, + "grad_norm": 0.1768704205751419, + "learning_rate": 4.9804537616891224e-05, + "loss": 0.4649, "step": 9525 }, { - "epoch": 0.34, - "learning_rate": 4.982018317910943e-05, - "loss": 0.3359, + "epoch": 0.34346055429415795, + "grad_norm": 0.1596553921699524, + "learning_rate": 4.980417325522638e-05, + "loss": 0.4141, "step": 9530 }, { - "epoch": 0.34, - "learning_rate": 4.9819841960340246e-05, - "loss": 0.3161, + "epoch": 0.34364075395538257, + "grad_norm": 0.18743056058883667, + "learning_rate": 4.980380855560949e-05, + "loss": 0.4305, "step": 9535 }, { - "epoch": 0.34, - "learning_rate": 4.981950041930178e-05, - "loss": 0.3158, + "epoch": 0.3438209536166072, + "grad_norm": 0.16719353199005127, + "learning_rate": 4.98034435180455e-05, + "loss": 0.4731, "step": 9540 }, { - "epoch": 0.34, - "learning_rate": 4.9819158555998455e-05, - "loss": 0.3248, + "epoch": 0.3440011532778318, + "grad_norm": 0.17522072792053223, + "learning_rate": 4.980307814253939e-05, + "loss": 0.4487, "step": 9545 }, { - "epoch": 0.34, - "learning_rate": 4.981881637043471e-05, - "loss": 0.3429, + "epoch": 0.3441813529390565, + "grad_norm": 0.15065240859985352, + "learning_rate": 4.9802712429096154e-05, + "loss": 0.4602, "step": 9550 }, { - "epoch": 0.34, - "learning_rate": 4.9818473862614996e-05, - "loss": 0.3222, + "epoch": 0.3443615526002811, + "grad_norm": 0.19820521771907806, + "learning_rate": 4.980234637772075e-05, + "loss": 0.4411, "step": 9555 }, { - "epoch": 0.34, - "learning_rate": 4.9818131032543746e-05, - "loss": 0.3426, + "epoch": 0.34454175226150574, + "grad_norm": 0.20624442398548126, + "learning_rate": 4.980197998841819e-05, + "loss": 0.4873, "step": 9560 }, { - "epoch": 0.34, - "learning_rate": 4.981778788022543e-05, - "loss": 0.3337, + "epoch": 0.34472195192273036, + "grad_norm": 0.18093755841255188, + "learning_rate": 4.9801613261193455e-05, + "loss": 0.46, "step": 9565 }, { - "epoch": 0.34, - "learning_rate": 4.981744440566449e-05, - "loss": 0.2884, + "epoch": 0.34490215158395504, + "grad_norm": 0.17363637685775757, + "learning_rate": 4.9801246196051535e-05, + 
"loss": 0.4306, "step": 9570 }, { - "epoch": 0.34, - "learning_rate": 4.981710060886539e-05, - "loss": 0.319, + "epoch": 0.34508235124517966, + "grad_norm": 0.1665421426296234, + "learning_rate": 4.980087879299744e-05, + "loss": 0.4478, "step": 9575 }, { - "epoch": 0.34, - "learning_rate": 4.9816756489832594e-05, - "loss": 0.3413, + "epoch": 0.3452625509064043, + "grad_norm": 0.18756218254566193, + "learning_rate": 4.980051105203617e-05, + "loss": 0.5001, "step": 9580 }, { - "epoch": 0.34, - "learning_rate": 4.981641204857057e-05, - "loss": 0.3181, + "epoch": 0.3454427505676289, + "grad_norm": 0.16964589059352875, + "learning_rate": 4.980014297317274e-05, + "loss": 0.4586, "step": 9585 }, { - "epoch": 0.34, - "learning_rate": 4.981606728508379e-05, - "loss": 0.3521, + "epoch": 0.3456229502288536, + "grad_norm": 0.24527285993099213, + "learning_rate": 4.979977455641217e-05, + "loss": 0.456, "step": 9590 }, { - "epoch": 0.34, - "learning_rate": 4.9815722199376734e-05, - "loss": 0.3322, + "epoch": 0.3458031498900782, + "grad_norm": 0.22246739268302917, + "learning_rate": 4.9799405801759466e-05, + "loss": 0.4951, "step": 9595 }, { - "epoch": 0.34, - "learning_rate": 4.9815376791453876e-05, - "loss": 0.3417, + "epoch": 0.34598334955130283, + "grad_norm": 0.18261441588401794, + "learning_rate": 4.979903670921966e-05, + "loss": 0.4364, "step": 9600 }, { - "epoch": 0.34, - "learning_rate": 4.9815031061319705e-05, - "loss": 0.3309, + "epoch": 0.34616354921252745, + "grad_norm": 0.15440581738948822, + "learning_rate": 4.9798667278797775e-05, + "loss": 0.4383, "step": 9605 }, { - "epoch": 0.34, - "learning_rate": 4.9814685008978713e-05, - "loss": 0.3283, + "epoch": 0.34634374887375213, + "grad_norm": 0.15516167879104614, + "learning_rate": 4.979829751049886e-05, + "loss": 0.4524, "step": 9610 }, { - "epoch": 0.34, - "learning_rate": 4.981433863443538e-05, - "loss": 0.3128, + "epoch": 0.34652394853497676, + "grad_norm": 0.13671459257602692, + "learning_rate": 4.979792740432794e-05, + "loss": 0.4862, "step": 9615 }, { - "epoch": 0.34, - "learning_rate": 4.981399193769422e-05, - "loss": 0.3348, + "epoch": 0.3467041481962014, + "grad_norm": 0.155983105301857, + "learning_rate": 4.9797556960290047e-05, + "loss": 0.4566, "step": 9620 }, { - "epoch": 0.34, - "learning_rate": 4.9813644918759725e-05, - "loss": 0.3077, + "epoch": 0.346884347857426, + "grad_norm": 0.14357610046863556, + "learning_rate": 4.9797186178390255e-05, + "loss": 0.4731, "step": 9625 }, { - "epoch": 0.34, - "learning_rate": 4.981329757763641e-05, - "loss": 0.3369, + "epoch": 0.3470645475186507, + "grad_norm": 0.19745934009552002, + "learning_rate": 4.97968150586336e-05, + "loss": 0.5, "step": 9630 }, { - "epoch": 0.34, - "learning_rate": 4.9812949914328766e-05, - "loss": 0.3688, + "epoch": 0.3472447471798753, + "grad_norm": 0.19887088239192963, + "learning_rate": 4.9796443601025144e-05, + "loss": 0.43, "step": 9635 }, { - "epoch": 0.34, - "learning_rate": 4.981260192884133e-05, - "loss": 0.3196, + "epoch": 0.3474249468410999, + "grad_norm": 0.17473819851875305, + "learning_rate": 4.9796071805569936e-05, + "loss": 0.4736, "step": 9640 }, { - "epoch": 0.34, - "learning_rate": 4.981225362117861e-05, - "loss": 0.3187, + "epoch": 0.3476051465023246, + "grad_norm": 0.17900219559669495, + "learning_rate": 4.9795699672273054e-05, + "loss": 0.4633, "step": 9645 }, { - "epoch": 0.34, - "learning_rate": 4.981190499134513e-05, - "loss": 0.3324, + "epoch": 0.3477853461635492, + "grad_norm": 0.21898692846298218, + "learning_rate": 
4.979532720113956e-05, + "loss": 0.4815, "step": 9650 }, { - "epoch": 0.34, - "learning_rate": 4.981155603934541e-05, - "loss": 0.3113, + "epoch": 0.34796554582477385, + "grad_norm": 0.18762876093387604, + "learning_rate": 4.979495439217454e-05, + "loss": 0.4904, "step": 9655 }, { - "epoch": 0.34, - "learning_rate": 4.981120676518398e-05, - "loss": 0.3361, + "epoch": 0.34814574548599847, + "grad_norm": 0.2149989753961563, + "learning_rate": 4.9794581245383074e-05, + "loss": 0.468, "step": 9660 }, { - "epoch": 0.34, - "learning_rate": 4.9810857168865386e-05, - "loss": 0.3341, + "epoch": 0.34832594514722315, + "grad_norm": 0.1882586032152176, + "learning_rate": 4.979420776077023e-05, + "loss": 0.4934, "step": 9665 }, { - "epoch": 0.34, - "learning_rate": 4.981050725039417e-05, - "loss": 0.3221, + "epoch": 0.34850614480844777, + "grad_norm": 0.19263622164726257, + "learning_rate": 4.979383393834111e-05, + "loss": 0.4851, "step": 9670 }, { - "epoch": 0.34, - "learning_rate": 4.981015700977486e-05, - "loss": 0.3095, + "epoch": 0.3486863444696724, + "grad_norm": 0.190067321062088, + "learning_rate": 4.9793459778100794e-05, + "loss": 0.4344, "step": 9675 }, { - "epoch": 0.34, - "learning_rate": 4.980980644701202e-05, - "loss": 0.3289, + "epoch": 0.348866544130897, + "grad_norm": 0.16212069988250732, + "learning_rate": 4.979308528005439e-05, + "loss": 0.4459, "step": 9680 }, { - "epoch": 0.34, - "learning_rate": 4.980945556211019e-05, - "loss": 0.2993, + "epoch": 0.3490467437921217, + "grad_norm": 0.20930610597133636, + "learning_rate": 4.9792710444207004e-05, + "loss": 0.4726, "step": 9685 }, { - "epoch": 0.34, - "learning_rate": 4.980910435507393e-05, - "loss": 0.3396, + "epoch": 0.3492269434533463, + "grad_norm": 0.20407001674175262, + "learning_rate": 4.979233527056374e-05, + "loss": 0.47, "step": 9690 }, { - "epoch": 0.34, - "learning_rate": 4.9808752825907795e-05, - "loss": 0.3295, + "epoch": 0.34940714311457094, + "grad_norm": 0.16390258073806763, + "learning_rate": 4.9791959759129706e-05, + "loss": 0.4299, "step": 9695 }, { - "epoch": 0.34, - "learning_rate": 4.980840097461635e-05, - "loss": 0.3072, + "epoch": 0.34958734277579556, + "grad_norm": 0.18050611019134521, + "learning_rate": 4.979158390991002e-05, + "loss": 0.4971, "step": 9700 }, { - "epoch": 0.34, - "learning_rate": 4.980804880120418e-05, - "loss": 0.3135, + "epoch": 0.34976754243702024, + "grad_norm": 0.172181636095047, + "learning_rate": 4.9791207722909794e-05, + "loss": 0.4547, "step": 9705 }, { - "epoch": 0.34, - "learning_rate": 4.9807696305675844e-05, - "loss": 0.335, + "epoch": 0.34994774209824486, + "grad_norm": 0.12145093083381653, + "learning_rate": 4.9790831198134175e-05, + "loss": 0.471, "step": 9710 }, { - "epoch": 0.34, - "learning_rate": 4.980734348803592e-05, - "loss": 0.3319, + "epoch": 0.3501279417594695, + "grad_norm": 0.17209351062774658, + "learning_rate": 4.979045433558828e-05, + "loss": 0.4392, "step": 9715 }, { - "epoch": 0.34, - "learning_rate": 4.980699034828898e-05, - "loss": 0.3338, + "epoch": 0.3503081414206941, + "grad_norm": 0.21002522110939026, + "learning_rate": 4.979007713527723e-05, + "loss": 0.4976, "step": 9720 }, { - "epoch": 0.34, - "learning_rate": 4.980663688643963e-05, - "loss": 0.3719, + "epoch": 0.3504883410819188, + "grad_norm": 0.16209115087985992, + "learning_rate": 4.9789699597206196e-05, + "loss": 0.4635, "step": 9725 }, { - "epoch": 0.34, - "learning_rate": 4.980628310249244e-05, - "loss": 0.3326, + "epoch": 0.3506685407431434, + "grad_norm": 0.14901262521743774, + 
"learning_rate": 4.97893217213803e-05, + "loss": 0.4632, "step": 9730 }, { - "epoch": 0.34, - "learning_rate": 4.980592899645201e-05, - "loss": 0.3386, + "epoch": 0.35084874040436803, + "grad_norm": 0.16062456369400024, + "learning_rate": 4.9788943507804686e-05, + "loss": 0.4482, "step": 9735 }, { - "epoch": 0.34, - "learning_rate": 4.980557456832295e-05, - "loss": 0.3241, + "epoch": 0.35102894006559265, + "grad_norm": 0.14877210557460785, + "learning_rate": 4.9788564956484527e-05, + "loss": 0.4823, "step": 9740 }, { - "epoch": 0.34, - "learning_rate": 4.980521981810985e-05, - "loss": 0.3372, + "epoch": 0.35120913972681733, + "grad_norm": 0.1947886049747467, + "learning_rate": 4.978818606742496e-05, + "loss": 0.4606, "step": 9745 }, { - "epoch": 0.34, - "learning_rate": 4.980486474581732e-05, - "loss": 0.3224, + "epoch": 0.35138933938804195, + "grad_norm": 0.18804532289505005, + "learning_rate": 4.978780684063116e-05, + "loss": 0.4878, "step": 9750 }, { - "epoch": 0.34, - "learning_rate": 4.980450935144996e-05, - "loss": 0.2923, + "epoch": 0.3515695390492666, + "grad_norm": 0.179021954536438, + "learning_rate": 4.97874272761083e-05, + "loss": 0.4619, "step": 9755 }, { - "epoch": 0.34, - "learning_rate": 4.980415363501239e-05, - "loss": 0.3383, + "epoch": 0.3517497387104912, + "grad_norm": 0.1584351509809494, + "learning_rate": 4.978704737386153e-05, + "loss": 0.4884, "step": 9760 }, { - "epoch": 0.34, - "learning_rate": 4.9803797596509235e-05, - "loss": 0.3293, + "epoch": 0.3519299383717159, + "grad_norm": 0.18059109151363373, + "learning_rate": 4.9786667133896046e-05, + "loss": 0.4296, "step": 9765 }, { - "epoch": 0.34, - "learning_rate": 4.980344123594512e-05, - "loss": 0.3373, + "epoch": 0.3521101380329405, + "grad_norm": 0.16413117945194244, + "learning_rate": 4.978628655621702e-05, + "loss": 0.4785, "step": 9770 }, { - "epoch": 0.34, - "learning_rate": 4.9803084553324666e-05, - "loss": 0.3358, + "epoch": 0.3522903376941651, + "grad_norm": 0.17627866566181183, + "learning_rate": 4.9785905640829635e-05, + "loss": 0.4539, "step": 9775 }, { - "epoch": 0.34, - "learning_rate": 4.98027275486525e-05, - "loss": 0.3051, + "epoch": 0.35247053735538975, + "grad_norm": 0.19507911801338196, + "learning_rate": 4.978552438773909e-05, + "loss": 0.4753, "step": 9780 }, { - "epoch": 0.34, - "learning_rate": 4.980237022193326e-05, - "loss": 0.3373, + "epoch": 0.3526507370166144, + "grad_norm": 0.23072102665901184, + "learning_rate": 4.9785142796950566e-05, + "loss": 0.4645, "step": 9785 }, { - "epoch": 0.34, - "learning_rate": 4.98020125731716e-05, - "loss": 0.3485, + "epoch": 0.35283093667783905, + "grad_norm": 0.16471746563911438, + "learning_rate": 4.978476086846928e-05, + "loss": 0.4756, "step": 9790 }, { - "epoch": 0.34, - "learning_rate": 4.980165460237214e-05, - "loss": 0.3667, + "epoch": 0.35301113633906367, + "grad_norm": 0.17672182619571686, + "learning_rate": 4.978437860230042e-05, + "loss": 0.459, "step": 9795 }, { - "epoch": 0.34, - "learning_rate": 4.980129630953955e-05, - "loss": 0.3064, + "epoch": 0.3531913360002883, + "grad_norm": 0.17026053369045258, + "learning_rate": 4.97839959984492e-05, + "loss": 0.4961, "step": 9800 }, { - "epoch": 0.34, - "learning_rate": 4.980093769467846e-05, - "loss": 0.326, + "epoch": 0.35337153566151297, + "grad_norm": 0.19481323659420013, + "learning_rate": 4.978361305692083e-05, + "loss": 0.4534, "step": 9805 }, { - "epoch": 0.35, - "learning_rate": 4.9800578757793546e-05, - "loss": 0.304, + "epoch": 0.3535517353227376, + "grad_norm": 0.17143213748931885, 
+ "learning_rate": 4.978322977772053e-05, + "loss": 0.4726, "step": 9810 }, { - "epoch": 0.35, - "learning_rate": 4.980021949888946e-05, - "loss": 0.3163, + "epoch": 0.3537319349839622, + "grad_norm": 0.20486843585968018, + "learning_rate": 4.978284616085352e-05, + "loss": 0.5067, "step": 9815 }, { - "epoch": 0.35, - "learning_rate": 4.979985991797087e-05, - "loss": 0.3347, + "epoch": 0.3539121346451869, + "grad_norm": 0.15131095051765442, + "learning_rate": 4.9782462206325045e-05, + "loss": 0.4922, "step": 9820 }, { - "epoch": 0.35, - "learning_rate": 4.979950001504244e-05, - "loss": 0.3157, + "epoch": 0.3540923343064115, + "grad_norm": 0.20476940274238586, + "learning_rate": 4.978207791414031e-05, + "loss": 0.47, "step": 9825 }, { - "epoch": 0.35, - "learning_rate": 4.9799139790108846e-05, - "loss": 0.2742, + "epoch": 0.35427253396763614, + "grad_norm": 0.16714629530906677, + "learning_rate": 4.978169328430456e-05, + "loss": 0.454, "step": 9830 }, { - "epoch": 0.35, - "learning_rate": 4.979877924317477e-05, - "loss": 0.3241, + "epoch": 0.35445273362886076, + "grad_norm": 0.18224307894706726, + "learning_rate": 4.978130831682304e-05, + "loss": 0.4527, "step": 9835 }, { - "epoch": 0.35, - "learning_rate": 4.979841837424488e-05, - "loss": 0.3275, + "epoch": 0.35463293329008544, + "grad_norm": 0.1600581556558609, + "learning_rate": 4.978092301170099e-05, + "loss": 0.4877, "step": 9840 }, { - "epoch": 0.35, - "learning_rate": 4.979805718332388e-05, - "loss": 0.331, + "epoch": 0.35481313295131006, + "grad_norm": 0.21684806048870087, + "learning_rate": 4.9780537368943655e-05, + "loss": 0.4997, "step": 9845 }, { - "epoch": 0.35, - "learning_rate": 4.979769567041643e-05, - "loss": 0.327, + "epoch": 0.3549933326125347, + "grad_norm": 0.15424644947052002, + "learning_rate": 4.978015138855631e-05, + "loss": 0.4398, "step": 9850 }, { - "epoch": 0.35, - "learning_rate": 4.979733383552727e-05, - "loss": 0.3175, + "epoch": 0.3551735322737593, + "grad_norm": 0.16304361820220947, + "learning_rate": 4.9779765070544195e-05, + "loss": 0.4756, "step": 9855 }, { - "epoch": 0.35, - "learning_rate": 4.979697167866105e-05, - "loss": 0.318, + "epoch": 0.355353731934984, + "grad_norm": 0.1486636996269226, + "learning_rate": 4.977937841491257e-05, + "loss": 0.4152, "step": 9860 }, { - "epoch": 0.35, - "learning_rate": 4.979660919982251e-05, - "loss": 0.3552, + "epoch": 0.3555339315962086, + "grad_norm": 0.1545153707265854, + "learning_rate": 4.9778991421666724e-05, + "loss": 0.447, "step": 9865 }, { - "epoch": 0.35, - "learning_rate": 4.979624639901633e-05, - "loss": 0.316, + "epoch": 0.35571413125743323, + "grad_norm": 0.14595122635364532, + "learning_rate": 4.977860409081191e-05, + "loss": 0.4589, "step": 9870 }, { - "epoch": 0.35, - "learning_rate": 4.979588327624725e-05, - "loss": 0.3402, + "epoch": 0.35589433091865785, + "grad_norm": 0.17412835359573364, + "learning_rate": 4.977821642235341e-05, + "loss": 0.4367, "step": 9875 }, { - "epoch": 0.35, - "learning_rate": 4.979551983151995e-05, - "loss": 0.3271, + "epoch": 0.35607453057988253, + "grad_norm": 0.20736642181873322, + "learning_rate": 4.9777828416296513e-05, + "loss": 0.4574, "step": 9880 }, { - "epoch": 0.35, - "learning_rate": 4.9795156064839166e-05, - "loss": 0.3719, + "epoch": 0.35625473024110715, + "grad_norm": 0.20509152114391327, + "learning_rate": 4.9777440072646504e-05, + "loss": 0.4694, "step": 9885 }, { - "epoch": 0.35, - "learning_rate": 4.9794791976209634e-05, - "loss": 0.3154, + "epoch": 0.3564349299023318, + "grad_norm": 
0.1846247762441635, + "learning_rate": 4.977705139140867e-05, + "loss": 0.4674, "step": 9890 }, { - "epoch": 0.35, - "learning_rate": 4.979442756563605e-05, - "loss": 0.3374, + "epoch": 0.3566151295635564, + "grad_norm": 0.2067510038614273, + "learning_rate": 4.97766623725883e-05, + "loss": 0.4276, "step": 9895 }, { - "epoch": 0.35, - "learning_rate": 4.9794062833123176e-05, - "loss": 0.328, + "epoch": 0.3567953292247811, + "grad_norm": 0.2130732536315918, + "learning_rate": 4.977627301619071e-05, + "loss": 0.4933, "step": 9900 }, { - "epoch": 0.35, - "learning_rate": 4.979369777867574e-05, - "loss": 0.3276, + "epoch": 0.3569755288860057, + "grad_norm": 0.2266158163547516, + "learning_rate": 4.97758833222212e-05, + "loss": 0.4332, "step": 9905 }, { - "epoch": 0.35, - "learning_rate": 4.979333240229847e-05, - "loss": 0.3205, + "epoch": 0.3571557285472303, + "grad_norm": 0.18190445005893707, + "learning_rate": 4.977549329068506e-05, + "loss": 0.4467, "step": 9910 }, { - "epoch": 0.35, - "learning_rate": 4.9792966703996116e-05, - "loss": 0.309, + "epoch": 0.35733592820845494, + "grad_norm": 0.14068225026130676, + "learning_rate": 4.977510292158764e-05, + "loss": 0.4436, "step": 9915 }, { - "epoch": 0.35, - "learning_rate": 4.979260068377344e-05, - "loss": 0.3277, + "epoch": 0.3575161278696796, + "grad_norm": 0.1720120906829834, + "learning_rate": 4.977471221493423e-05, + "loss": 0.4622, "step": 9920 }, { - "epoch": 0.35, - "learning_rate": 4.979223434163517e-05, - "loss": 0.3036, + "epoch": 0.35769632753090425, + "grad_norm": 0.1588047444820404, + "learning_rate": 4.977432117073016e-05, + "loss": 0.4633, "step": 9925 }, { - "epoch": 0.35, - "learning_rate": 4.979186767758608e-05, - "loss": 0.3299, + "epoch": 0.35787652719212887, + "grad_norm": 0.168239563703537, + "learning_rate": 4.9773929788980766e-05, + "loss": 0.4476, "step": 9930 }, { - "epoch": 0.35, - "learning_rate": 4.9791500691630936e-05, - "loss": 0.335, + "epoch": 0.3580567268533535, + "grad_norm": 0.16691339015960693, + "learning_rate": 4.9773538069691375e-05, + "loss": 0.4693, "step": 9935 }, { - "epoch": 0.35, - "learning_rate": 4.979113338377448e-05, - "loss": 0.3341, + "epoch": 0.35823692651457817, + "grad_norm": 0.15054184198379517, + "learning_rate": 4.977314601286732e-05, + "loss": 0.4347, "step": 9940 }, { - "epoch": 0.35, - "learning_rate": 4.979076575402151e-05, - "loss": 0.3399, + "epoch": 0.3584171261758028, + "grad_norm": 0.147248312830925, + "learning_rate": 4.9772753618513945e-05, + "loss": 0.444, "step": 9945 }, { - "epoch": 0.35, - "learning_rate": 4.979039780237677e-05, - "loss": 0.3285, + "epoch": 0.3585973258370274, + "grad_norm": 0.20914539694786072, + "learning_rate": 4.9772360886636605e-05, + "loss": 0.4573, "step": 9950 }, { - "epoch": 0.35, - "learning_rate": 4.979002952884506e-05, - "loss": 0.3308, + "epoch": 0.35877752549825204, + "grad_norm": 0.16650797426700592, + "learning_rate": 4.977196781724064e-05, + "loss": 0.4585, "step": 9955 }, { - "epoch": 0.35, - "learning_rate": 4.978966093343115e-05, - "loss": 0.3288, + "epoch": 0.3589577251594767, + "grad_norm": 0.2121131271123886, + "learning_rate": 4.9771574410331415e-05, + "loss": 0.4204, "step": 9960 }, { - "epoch": 0.35, - "learning_rate": 4.978929201613983e-05, - "loss": 0.341, + "epoch": 0.35913792482070134, + "grad_norm": 0.1681807041168213, + "learning_rate": 4.977118066591427e-05, + "loss": 0.5008, "step": 9965 }, { - "epoch": 0.35, - "learning_rate": 4.978892277697589e-05, - "loss": 0.3037, + "epoch": 0.35931812448192596, + "grad_norm": 
0.18898646533489227, + "learning_rate": 4.97707865839946e-05, + "loss": 0.5016, "step": 9970 }, { - "epoch": 0.35, - "learning_rate": 4.978855321594413e-05, - "loss": 0.315, + "epoch": 0.35949832414315064, + "grad_norm": 0.1509980857372284, + "learning_rate": 4.977039216457775e-05, + "loss": 0.4251, "step": 9975 }, { - "epoch": 0.35, - "learning_rate": 4.978818333304933e-05, - "loss": 0.3041, + "epoch": 0.35967852380437526, + "grad_norm": 0.15001198649406433, + "learning_rate": 4.976999740766911e-05, + "loss": 0.4745, "step": 9980 }, { - "epoch": 0.35, - "learning_rate": 4.978781312829633e-05, - "loss": 0.32, + "epoch": 0.3598587234655999, + "grad_norm": 0.19364404678344727, + "learning_rate": 4.976960231327404e-05, + "loss": 0.4624, "step": 9985 }, { - "epoch": 0.35, - "learning_rate": 4.978744260168988e-05, - "loss": 0.3322, + "epoch": 0.3600389231268245, + "grad_norm": 0.1906237006187439, + "learning_rate": 4.976920688139794e-05, + "loss": 0.4492, "step": 9990 }, { - "epoch": 0.35, - "learning_rate": 4.978707175323485e-05, - "loss": 0.336, + "epoch": 0.3602191227880492, + "grad_norm": 0.1953149288892746, + "learning_rate": 4.9768811112046196e-05, + "loss": 0.4655, "step": 9995 }, { - "epoch": 0.35, - "learning_rate": 4.978670058293601e-05, - "loss": 0.2962, + "epoch": 0.3603993224492738, + "grad_norm": 0.1708211600780487, + "learning_rate": 4.976841500522419e-05, + "loss": 0.4667, "step": 10000 }, { - "epoch": 0.35, - "eval_loss": 0.3222596049308777, - "eval_runtime": 10.5408, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 0.3603993224492738, + "eval_loss": 0.4796702265739441, + "eval_runtime": 3.5164, + "eval_samples_per_second": 28.438, + "eval_steps_per_second": 7.11, "step": 10000 }, { - "epoch": 0.35, - "learning_rate": 4.978632909079821e-05, - "loss": 0.3262, + "epoch": 0.36057952211049843, + "grad_norm": 0.20447692275047302, + "learning_rate": 4.976801856093732e-05, + "loss": 0.4598, "step": 10005 }, { - "epoch": 0.35, - "learning_rate": 4.9785957276826254e-05, - "loss": 0.3352, + "epoch": 0.36075972177172305, + "grad_norm": 0.16265089809894562, + "learning_rate": 4.976762177919101e-05, + "loss": 0.4668, "step": 10010 }, { - "epoch": 0.35, - "learning_rate": 4.978558514102498e-05, - "loss": 0.3371, + "epoch": 0.36093992143294773, + "grad_norm": 0.1456652581691742, + "learning_rate": 4.976722465999063e-05, + "loss": 0.4301, "step": 10015 }, { - "epoch": 0.35, - "learning_rate": 4.978521268339922e-05, - "loss": 0.3208, + "epoch": 0.36112012109417235, + "grad_norm": 0.17092099785804749, + "learning_rate": 4.976682720334162e-05, + "loss": 0.4681, "step": 10020 }, { - "epoch": 0.35, - "learning_rate": 4.97848399039538e-05, - "loss": 0.3262, + "epoch": 0.361300320755397, + "grad_norm": 0.1890965700149536, + "learning_rate": 4.976642940924938e-05, + "loss": 0.477, "step": 10025 }, { - "epoch": 0.35, - "learning_rate": 4.978446680269358e-05, - "loss": 0.3099, + "epoch": 0.3614805204166216, + "grad_norm": 0.16893772780895233, + "learning_rate": 4.976603127771934e-05, + "loss": 0.4796, "step": 10030 }, { - "epoch": 0.35, - "learning_rate": 4.9784093379623386e-05, - "loss": 0.3112, + "epoch": 0.3616607200778463, + "grad_norm": 0.16961359977722168, + "learning_rate": 4.9765632808756915e-05, + "loss": 0.4314, "step": 10035 }, { - "epoch": 0.35, - "learning_rate": 4.9783719634748074e-05, - "loss": 0.3207, + "epoch": 0.3618409197390709, + "grad_norm": 0.1630767434835434, + "learning_rate": 4.9765234002367534e-05, + "loss": 0.4594, "step": 10040 }, { - "epoch": 
0.35, - "learning_rate": 4.978334556807249e-05, - "loss": 0.3204, + "epoch": 0.3620211194002955, + "grad_norm": 0.16475652158260345, + "learning_rate": 4.9764834858556635e-05, + "loss": 0.4489, "step": 10045 }, { - "epoch": 0.35, - "learning_rate": 4.97829711796015e-05, - "loss": 0.327, + "epoch": 0.36220131906152014, + "grad_norm": 0.1670771986246109, + "learning_rate": 4.9764435377329654e-05, + "loss": 0.4278, "step": 10050 }, { - "epoch": 0.35, - "learning_rate": 4.978259646933996e-05, - "loss": 0.34, + "epoch": 0.3623815187227448, + "grad_norm": 0.1597401350736618, + "learning_rate": 4.9764035558692045e-05, + "loss": 0.4035, "step": 10055 }, { - "epoch": 0.35, - "learning_rate": 4.978222143729274e-05, - "loss": 0.338, + "epoch": 0.36256171838396944, + "grad_norm": 0.2012094110250473, + "learning_rate": 4.9763635402649236e-05, + "loss": 0.4649, "step": 10060 }, { - "epoch": 0.35, - "learning_rate": 4.978184608346471e-05, - "loss": 0.3158, + "epoch": 0.36274191804519407, + "grad_norm": 0.16659463942050934, + "learning_rate": 4.9763234909206695e-05, + "loss": 0.447, "step": 10065 }, { - "epoch": 0.35, - "learning_rate": 4.978147040786073e-05, - "loss": 0.3068, + "epoch": 0.3629221177064187, + "grad_norm": 0.15925155580043793, + "learning_rate": 4.976283407836987e-05, + "loss": 0.4575, "step": 10070 }, { - "epoch": 0.35, - "learning_rate": 4.97810944104857e-05, - "loss": 0.3244, + "epoch": 0.36310231736764337, + "grad_norm": 0.15115337073802948, + "learning_rate": 4.976243291014423e-05, + "loss": 0.4517, "step": 10075 }, { - "epoch": 0.35, - "learning_rate": 4.978071809134448e-05, - "loss": 0.3069, + "epoch": 0.363282517028868, + "grad_norm": 0.1577247679233551, + "learning_rate": 4.976203140453523e-05, + "loss": 0.4539, "step": 10080 }, { - "epoch": 0.35, - "learning_rate": 4.978034145044198e-05, - "loss": 0.3123, + "epoch": 0.3634627166900926, + "grad_norm": 0.16622968018054962, + "learning_rate": 4.9761629561548354e-05, + "loss": 0.4357, "step": 10085 }, { - "epoch": 0.35, - "learning_rate": 4.977996448778307e-05, - "loss": 0.3258, + "epoch": 0.36364291635131724, + "grad_norm": 0.1571248173713684, + "learning_rate": 4.976122738118906e-05, + "loss": 0.4368, "step": 10090 }, { - "epoch": 0.36, - "learning_rate": 4.977958720337265e-05, - "loss": 0.3306, + "epoch": 0.3638231160125419, + "grad_norm": 0.17340044677257538, + "learning_rate": 4.976082486346284e-05, + "loss": 0.4284, "step": 10095 }, { - "epoch": 0.36, - "learning_rate": 4.977920959721561e-05, - "loss": 0.3117, + "epoch": 0.36400331567376654, + "grad_norm": 0.20559607446193695, + "learning_rate": 4.9760422008375176e-05, + "loss": 0.4247, "step": 10100 }, { - "epoch": 0.36, - "learning_rate": 4.977883166931688e-05, - "loss": 0.2943, + "epoch": 0.36418351533499116, + "grad_norm": 0.159358948469162, + "learning_rate": 4.976001881593155e-05, + "loss": 0.4389, "step": 10105 }, { - "epoch": 0.36, - "learning_rate": 4.977845341968135e-05, - "loss": 0.3441, + "epoch": 0.3643637149962158, + "grad_norm": 0.19919893145561218, + "learning_rate": 4.9759615286137465e-05, + "loss": 0.4724, "step": 10110 }, { - "epoch": 0.36, - "learning_rate": 4.9778074848313925e-05, - "loss": 0.3405, + "epoch": 0.36454391465744046, + "grad_norm": 0.1773001104593277, + "learning_rate": 4.975921141899842e-05, + "loss": 0.4847, "step": 10115 }, { - "epoch": 0.36, - "learning_rate": 4.977769595521953e-05, - "loss": 0.3054, + "epoch": 0.3647241143186651, + "grad_norm": 0.17267437279224396, + "learning_rate": 4.975880721451991e-05, + "loss": 0.4398, "step": 10120 
}, { - "epoch": 0.36, - "learning_rate": 4.977731674040308e-05, - "loss": 0.3339, + "epoch": 0.3649043139798897, + "grad_norm": 0.19149023294448853, + "learning_rate": 4.975840267270744e-05, + "loss": 0.4703, "step": 10125 }, { - "epoch": 0.36, - "learning_rate": 4.97769372038695e-05, - "loss": 0.3445, + "epoch": 0.3650845136411143, + "grad_norm": 0.1706707626581192, + "learning_rate": 4.975799779356653e-05, + "loss": 0.5034, "step": 10130 }, { - "epoch": 0.36, - "learning_rate": 4.977655734562373e-05, - "loss": 0.3327, + "epoch": 0.365264713302339, + "grad_norm": 0.18125681579113007, + "learning_rate": 4.97575925771027e-05, + "loss": 0.473, "step": 10135 }, { - "epoch": 0.36, - "learning_rate": 4.9776177165670676e-05, - "loss": 0.3147, + "epoch": 0.36544491296356363, + "grad_norm": 0.17481379210948944, + "learning_rate": 4.975718702332146e-05, + "loss": 0.4341, "step": 10140 }, { - "epoch": 0.36, - "learning_rate": 4.97757966640153e-05, - "loss": 0.3163, + "epoch": 0.36562511262478825, + "grad_norm": 0.19114582240581512, + "learning_rate": 4.9756781132228334e-05, + "loss": 0.4629, "step": 10145 }, { - "epoch": 0.36, - "learning_rate": 4.977541584066253e-05, - "loss": 0.3185, + "epoch": 0.36580531228601293, + "grad_norm": 0.1782168298959732, + "learning_rate": 4.975637490382887e-05, + "loss": 0.4375, "step": 10150 }, { - "epoch": 0.36, - "learning_rate": 4.9775034695617314e-05, - "loss": 0.3235, + "epoch": 0.36598551194723755, + "grad_norm": 0.21205340325832367, + "learning_rate": 4.975596833812858e-05, + "loss": 0.4514, "step": 10155 }, { - "epoch": 0.36, - "learning_rate": 4.9774653228884594e-05, - "loss": 0.3276, + "epoch": 0.3661657116084622, + "grad_norm": 0.19303537905216217, + "learning_rate": 4.9755561435133024e-05, + "loss": 0.4596, "step": 10160 }, { - "epoch": 0.36, - "learning_rate": 4.977427144046933e-05, - "loss": 0.3118, + "epoch": 0.3663459112696868, + "grad_norm": 0.16598139703273773, + "learning_rate": 4.9755154194847734e-05, + "loss": 0.4418, "step": 10165 }, { - "epoch": 0.36, - "learning_rate": 4.977388933037649e-05, - "loss": 0.3274, + "epoch": 0.3665261109309115, + "grad_norm": 0.19510379433631897, + "learning_rate": 4.9754746617278254e-05, + "loss": 0.4699, "step": 10170 }, { - "epoch": 0.36, - "learning_rate": 4.977350689861101e-05, - "loss": 0.3286, + "epoch": 0.3667063105921361, + "grad_norm": 0.16283252835273743, + "learning_rate": 4.975433870243015e-05, + "loss": 0.4337, "step": 10175 }, { - "epoch": 0.36, - "learning_rate": 4.9773124145177875e-05, - "loss": 0.308, + "epoch": 0.3668865102533607, + "grad_norm": 0.20761866867542267, + "learning_rate": 4.975393045030897e-05, + "loss": 0.4831, "step": 10180 }, { - "epoch": 0.36, - "learning_rate": 4.977274107008205e-05, - "loss": 0.3173, + "epoch": 0.36706670991458534, + "grad_norm": 0.15632273256778717, + "learning_rate": 4.9753521860920284e-05, + "loss": 0.4706, "step": 10185 }, { - "epoch": 0.36, - "learning_rate": 4.9772357673328506e-05, - "loss": 0.3216, + "epoch": 0.36724690957581, + "grad_norm": 0.19553816318511963, + "learning_rate": 4.975311293426965e-05, + "loss": 0.4547, "step": 10190 }, { - "epoch": 0.36, - "learning_rate": 4.977197395492223e-05, - "loss": 0.3111, + "epoch": 0.36742710923703464, + "grad_norm": 0.15499721467494965, + "learning_rate": 4.975270367036264e-05, + "loss": 0.4329, "step": 10195 }, { - "epoch": 0.36, - "learning_rate": 4.977158991486818e-05, - "loss": 0.315, + "epoch": 0.36760730889825927, + "grad_norm": 0.19358859956264496, + "learning_rate": 4.975229406920485e-05, + "loss": 
0.4449, "step": 10200 }, { - "epoch": 0.36, - "learning_rate": 4.9771205553171376e-05, - "loss": 0.283, + "epoch": 0.3677875085594839, + "grad_norm": 0.19693566858768463, + "learning_rate": 4.975188413080184e-05, + "loss": 0.4487, "step": 10205 }, { - "epoch": 0.36, - "learning_rate": 4.9770820869836784e-05, - "loss": 0.3386, + "epoch": 0.36796770822070857, + "grad_norm": 0.16378581523895264, + "learning_rate": 4.97514738551592e-05, + "loss": 0.4419, "step": 10210 }, { - "epoch": 0.36, - "learning_rate": 4.977043586486941e-05, - "loss": 0.3396, + "epoch": 0.3681479078819332, + "grad_norm": 0.22160831093788147, + "learning_rate": 4.975106324228252e-05, + "loss": 0.4503, "step": 10215 }, { - "epoch": 0.36, - "learning_rate": 4.977005053827425e-05, - "loss": 0.3357, + "epoch": 0.3683281075431578, + "grad_norm": 0.18501290678977966, + "learning_rate": 4.975065229217739e-05, + "loss": 0.4692, "step": 10220 }, { - "epoch": 0.36, - "learning_rate": 4.976966489005631e-05, - "loss": 0.3087, + "epoch": 0.36850830720438243, + "grad_norm": 0.17586058378219604, + "learning_rate": 4.9750241004849415e-05, + "loss": 0.4593, "step": 10225 }, { - "epoch": 0.36, - "learning_rate": 4.9769278920220596e-05, - "loss": 0.3442, + "epoch": 0.3686885068656071, + "grad_norm": 0.17529945075511932, + "learning_rate": 4.974982938030421e-05, + "loss": 0.4924, "step": 10230 }, { - "epoch": 0.36, - "learning_rate": 4.9768892628772115e-05, - "loss": 0.343, + "epoch": 0.36886870652683174, + "grad_norm": 0.22143632173538208, + "learning_rate": 4.974941741854736e-05, + "loss": 0.4816, "step": 10235 }, { - "epoch": 0.36, - "learning_rate": 4.9768506015715896e-05, - "loss": 0.3256, + "epoch": 0.36904890618805636, + "grad_norm": 0.16022737324237823, + "learning_rate": 4.974900511958449e-05, + "loss": 0.469, "step": 10240 }, { - "epoch": 0.36, - "learning_rate": 4.976811908105694e-05, - "loss": 0.3094, + "epoch": 0.369229105849281, + "grad_norm": 0.1857171654701233, + "learning_rate": 4.974859248342122e-05, + "loss": 0.4561, "step": 10245 }, { - "epoch": 0.36, - "learning_rate": 4.9767731824800277e-05, - "loss": 0.3192, + "epoch": 0.36940930551050566, + "grad_norm": 0.15506871044635773, + "learning_rate": 4.974817951006318e-05, + "loss": 0.484, "step": 10250 }, { - "epoch": 0.36, - "learning_rate": 4.976734424695095e-05, - "loss": 0.3161, + "epoch": 0.3695895051717303, + "grad_norm": 0.16695240139961243, + "learning_rate": 4.9747766199515967e-05, + "loss": 0.4345, "step": 10255 }, { - "epoch": 0.36, - "learning_rate": 4.976695634751397e-05, - "loss": 0.3199, + "epoch": 0.3697697048329549, + "grad_norm": 0.1858297437429428, + "learning_rate": 4.9747352551785234e-05, + "loss": 0.4694, "step": 10260 }, { - "epoch": 0.36, - "learning_rate": 4.9766568126494394e-05, - "loss": 0.2714, + "epoch": 0.3699499044941795, + "grad_norm": 0.18636928498744965, + "learning_rate": 4.9746938566876624e-05, + "loss": 0.4836, "step": 10265 }, { - "epoch": 0.36, - "learning_rate": 4.976617958389724e-05, - "loss": 0.2952, + "epoch": 0.3701301041554042, + "grad_norm": 0.14627771079540253, + "learning_rate": 4.9746524244795755e-05, + "loss": 0.4755, "step": 10270 }, { - "epoch": 0.36, - "learning_rate": 4.976579071972758e-05, - "loss": 0.3468, + "epoch": 0.3703103038166288, + "grad_norm": 0.2531489431858063, + "learning_rate": 4.974610958554829e-05, + "loss": 0.4655, "step": 10275 }, { - "epoch": 0.36, - "learning_rate": 4.976540153399044e-05, - "loss": 0.3307, + "epoch": 0.37049050347785345, + "grad_norm": 0.14432822167873383, + "learning_rate": 
4.974569458913988e-05, + "loss": 0.4514, "step": 10280 }, { - "epoch": 0.36, - "learning_rate": 4.976501202669089e-05, - "loss": 0.3136, + "epoch": 0.3706707031390781, + "grad_norm": 0.200893372297287, + "learning_rate": 4.974527925557616e-05, + "loss": 0.4709, "step": 10285 }, { - "epoch": 0.36, - "learning_rate": 4.976462219783397e-05, - "loss": 0.3317, + "epoch": 0.37085090280030275, + "grad_norm": 0.1737281084060669, + "learning_rate": 4.974486358486281e-05, + "loss": 0.4521, "step": 10290 }, { - "epoch": 0.36, - "learning_rate": 4.976423204742475e-05, - "loss": 0.2882, + "epoch": 0.3710311024615274, + "grad_norm": 0.18836097419261932, + "learning_rate": 4.9744447577005484e-05, + "loss": 0.4333, "step": 10295 }, { - "epoch": 0.36, - "learning_rate": 4.976384157546831e-05, - "loss": 0.3635, + "epoch": 0.371211302122752, + "grad_norm": 0.16161343455314636, + "learning_rate": 4.974403123200984e-05, + "loss": 0.4779, "step": 10300 }, { - "epoch": 0.36, - "learning_rate": 4.97634507819697e-05, - "loss": 0.307, + "epoch": 0.3713915017839766, + "grad_norm": 0.1600758135318756, + "learning_rate": 4.9743614549881566e-05, + "loss": 0.4377, "step": 10305 }, { - "epoch": 0.36, - "learning_rate": 4.9763059666933996e-05, - "loss": 0.2976, + "epoch": 0.3715717014452013, + "grad_norm": 0.2167365998029709, + "learning_rate": 4.974319753062634e-05, + "loss": 0.4531, "step": 10310 }, { - "epoch": 0.36, - "learning_rate": 4.9762668230366286e-05, - "loss": 0.3068, + "epoch": 0.3717519011064259, + "grad_norm": 0.20788373053073883, + "learning_rate": 4.9742780174249835e-05, + "loss": 0.4682, "step": 10315 }, { - "epoch": 0.36, - "learning_rate": 4.9762276472271653e-05, - "loss": 0.3155, + "epoch": 0.37193210076765054, + "grad_norm": 0.16195015609264374, + "learning_rate": 4.974236248075774e-05, + "loss": 0.4617, "step": 10320 }, { - "epoch": 0.36, - "learning_rate": 4.976188439265517e-05, - "loss": 0.3258, + "epoch": 0.3721123004288752, + "grad_norm": 0.21307627856731415, + "learning_rate": 4.974194445015574e-05, + "loss": 0.474, "step": 10325 }, { - "epoch": 0.36, - "learning_rate": 4.9761491991521945e-05, - "loss": 0.3216, + "epoch": 0.37229250009009984, + "grad_norm": 0.3052423894405365, + "learning_rate": 4.974152608244955e-05, + "loss": 0.4861, "step": 10330 }, { - "epoch": 0.36, - "learning_rate": 4.976109926887706e-05, - "loss": 0.3007, + "epoch": 0.37247269975132447, + "grad_norm": 0.16430044174194336, + "learning_rate": 4.9741107377644845e-05, + "loss": 0.4265, "step": 10335 }, { - "epoch": 0.36, - "learning_rate": 4.9760706224725616e-05, - "loss": 0.3354, + "epoch": 0.3726528994125491, + "grad_norm": 0.19519920647144318, + "learning_rate": 4.974068833574736e-05, + "loss": 0.447, "step": 10340 }, { - "epoch": 0.36, - "learning_rate": 4.9760312859072726e-05, - "loss": 0.3335, + "epoch": 0.37283309907377377, + "grad_norm": 0.22070425748825073, + "learning_rate": 4.974026895676277e-05, + "loss": 0.51, "step": 10345 }, { - "epoch": 0.36, - "learning_rate": 4.975991917192349e-05, - "loss": 0.3095, + "epoch": 0.3730132987349984, + "grad_norm": 0.17484411597251892, + "learning_rate": 4.973984924069681e-05, + "loss": 0.4552, "step": 10350 }, { - "epoch": 0.36, - "learning_rate": 4.975952516328302e-05, - "loss": 0.3143, + "epoch": 0.373193498396223, + "grad_norm": 0.15123674273490906, + "learning_rate": 4.9739429187555185e-05, + "loss": 0.4425, "step": 10355 }, { - "epoch": 0.36, - "learning_rate": 4.975913083315643e-05, - "loss": 0.3287, + "epoch": 0.37337369805744763, + "grad_norm": 0.17658279836177826, + 
"learning_rate": 4.973900879734364e-05, + "loss": 0.4305, "step": 10360 }, { - "epoch": 0.36, - "learning_rate": 4.975873618154885e-05, - "loss": 0.3102, + "epoch": 0.3735538977186723, + "grad_norm": 0.1411803662776947, + "learning_rate": 4.973858807006788e-05, + "loss": 0.4365, "step": 10365 }, { - "epoch": 0.36, - "learning_rate": 4.975834120846539e-05, - "loss": 0.3033, + "epoch": 0.37373409737989693, + "grad_norm": 0.1503876894712448, + "learning_rate": 4.973816700573366e-05, + "loss": 0.4826, "step": 10370 }, { - "epoch": 0.37, - "learning_rate": 4.975794591391119e-05, - "loss": 0.3203, + "epoch": 0.37391429704112156, + "grad_norm": 0.14822295308113098, + "learning_rate": 4.97377456043467e-05, + "loss": 0.4837, "step": 10375 }, { - "epoch": 0.37, - "learning_rate": 4.975755029789138e-05, - "loss": 0.342, + "epoch": 0.3740944967023462, + "grad_norm": 0.1543811559677124, + "learning_rate": 4.9737323865912734e-05, + "loss": 0.4731, "step": 10380 }, { - "epoch": 0.37, - "learning_rate": 4.9757154360411095e-05, - "loss": 0.2972, + "epoch": 0.37427469636357086, + "grad_norm": 0.18722733855247498, + "learning_rate": 4.973690179043753e-05, + "loss": 0.4697, "step": 10385 }, { - "epoch": 0.37, - "learning_rate": 4.975675810147547e-05, - "loss": 0.3124, + "epoch": 0.3744548960247955, + "grad_norm": 0.20646195113658905, + "learning_rate": 4.9736479377926826e-05, + "loss": 0.4538, "step": 10390 }, { - "epoch": 0.37, - "learning_rate": 4.975636152108966e-05, - "loss": 0.3097, + "epoch": 0.3746350956860201, + "grad_norm": 0.19029046595096588, + "learning_rate": 4.9736056628386374e-05, + "loss": 0.4714, "step": 10395 }, { - "epoch": 0.37, - "learning_rate": 4.975596461925881e-05, - "loss": 0.3229, + "epoch": 0.3748152953472447, + "grad_norm": 0.18136341869831085, + "learning_rate": 4.973563354182195e-05, + "loss": 0.4703, "step": 10400 }, { - "epoch": 0.37, - "learning_rate": 4.975556739598808e-05, - "loss": 0.3471, + "epoch": 0.3749954950084694, + "grad_norm": 0.20808909833431244, + "learning_rate": 4.97352101182393e-05, + "loss": 0.5067, "step": 10405 }, { - "epoch": 0.37, - "learning_rate": 4.975516985128261e-05, - "loss": 0.3405, + "epoch": 0.375175694669694, + "grad_norm": 0.18707624077796936, + "learning_rate": 4.9734786357644204e-05, + "loss": 0.4373, "step": 10410 }, { - "epoch": 0.37, - "learning_rate": 4.9754771985147586e-05, - "loss": 0.316, + "epoch": 0.37535589433091865, + "grad_norm": 0.19006437063217163, + "learning_rate": 4.9734362260042434e-05, + "loss": 0.4335, "step": 10415 }, { - "epoch": 0.37, - "learning_rate": 4.975437379758816e-05, - "loss": 0.3192, + "epoch": 0.37553609399214327, + "grad_norm": 0.22536343336105347, + "learning_rate": 4.973393782543976e-05, + "loss": 0.4656, "step": 10420 }, { - "epoch": 0.37, - "learning_rate": 4.97539752886095e-05, - "loss": 0.305, + "epoch": 0.37571629365336795, + "grad_norm": 0.20031480491161346, + "learning_rate": 4.9733513053841984e-05, + "loss": 0.4998, "step": 10425 }, { - "epoch": 0.37, - "learning_rate": 4.975357645821678e-05, - "loss": 0.3185, + "epoch": 0.3758964933145926, + "grad_norm": 0.17395085096359253, + "learning_rate": 4.973308794525487e-05, + "loss": 0.4831, "step": 10430 }, { - "epoch": 0.37, - "learning_rate": 4.9753177306415185e-05, - "loss": 0.3341, + "epoch": 0.3760766929758172, + "grad_norm": 0.20083628594875336, + "learning_rate": 4.973266249968423e-05, + "loss": 0.4584, "step": 10435 }, { - "epoch": 0.37, - "learning_rate": 4.9752777833209904e-05, - "loss": 0.3518, + "epoch": 0.3762568926370418, + "grad_norm": 
0.1963411122560501, + "learning_rate": 4.973223671713585e-05, + "loss": 0.4865, "step": 10440 }, { - "epoch": 0.37, - "learning_rate": 4.975237803860611e-05, - "loss": 0.3262, + "epoch": 0.3764370922982665, + "grad_norm": 0.20614828169345856, + "learning_rate": 4.973181059761552e-05, + "loss": 0.4628, "step": 10445 }, { - "epoch": 0.37, - "learning_rate": 4.9751977922609e-05, - "loss": 0.3294, + "epoch": 0.3766172919594911, + "grad_norm": 0.2048252522945404, + "learning_rate": 4.973138414112908e-05, + "loss": 0.4685, "step": 10450 }, { - "epoch": 0.37, - "learning_rate": 4.975157748522376e-05, - "loss": 0.3283, + "epoch": 0.37679749162071574, + "grad_norm": 0.1850869357585907, + "learning_rate": 4.97309573476823e-05, + "loss": 0.4674, "step": 10455 }, { - "epoch": 0.37, - "learning_rate": 4.975117672645562e-05, - "loss": 0.369, + "epoch": 0.37697769128194036, + "grad_norm": 0.19031380116939545, + "learning_rate": 4.9730530217281023e-05, + "loss": 0.4621, "step": 10460 }, { - "epoch": 0.37, - "learning_rate": 4.9750775646309744e-05, - "loss": 0.3316, + "epoch": 0.37715789094316504, + "grad_norm": 0.15056711435317993, + "learning_rate": 4.973010274993106e-05, + "loss": 0.4718, "step": 10465 }, { - "epoch": 0.37, - "learning_rate": 4.975037424479136e-05, - "loss": 0.3258, + "epoch": 0.37733809060438966, + "grad_norm": 0.19555266201496124, + "learning_rate": 4.9729674945638236e-05, + "loss": 0.4505, "step": 10470 }, { - "epoch": 0.37, - "learning_rate": 4.974997252190568e-05, - "loss": 0.3497, + "epoch": 0.3775182902656143, + "grad_norm": 0.18490497767925262, + "learning_rate": 4.972924680440838e-05, + "loss": 0.5174, "step": 10475 }, { - "epoch": 0.37, - "learning_rate": 4.974957047765792e-05, - "loss": 0.2926, + "epoch": 0.37769848992683897, + "grad_norm": 0.19573722779750824, + "learning_rate": 4.9728818326247316e-05, + "loss": 0.4594, "step": 10480 }, { - "epoch": 0.37, - "learning_rate": 4.974916811205329e-05, - "loss": 0.3198, + "epoch": 0.3778786895880636, + "grad_norm": 0.17318245768547058, + "learning_rate": 4.97283895111609e-05, + "loss": 0.4886, "step": 10485 }, { - "epoch": 0.37, - "learning_rate": 4.9748765425097035e-05, - "loss": 0.3363, + "epoch": 0.3780588892492882, + "grad_norm": 0.17160874605178833, + "learning_rate": 4.972796035915496e-05, + "loss": 0.4565, "step": 10490 }, { - "epoch": 0.37, - "learning_rate": 4.974836241679436e-05, - "loss": 0.3114, + "epoch": 0.37823908891051283, + "grad_norm": 0.16455820202827454, + "learning_rate": 4.9727530870235345e-05, + "loss": 0.4022, "step": 10495 }, { - "epoch": 0.37, - "learning_rate": 4.974795908715051e-05, - "loss": 0.3215, + "epoch": 0.3784192885717375, + "grad_norm": 0.21172361075878143, + "learning_rate": 4.972710104440791e-05, + "loss": 0.449, "step": 10500 }, { - "epoch": 0.37, - "eval_loss": 0.31911852955818176, - "eval_runtime": 10.5355, - "eval_samples_per_second": 9.492, - "eval_steps_per_second": 9.492, + "epoch": 0.3784192885717375, + "eval_loss": 0.47920936346054077, + "eval_runtime": 3.5282, + "eval_samples_per_second": 28.343, + "eval_steps_per_second": 7.086, "step": 10500 }, { - "epoch": 0.37, - "learning_rate": 4.974755543617072e-05, - "loss": 0.312, + "epoch": 0.37859948823296213, + "grad_norm": 0.15352605283260345, + "learning_rate": 4.9726670881678517e-05, + "loss": 0.446, "step": 10505 }, { - "epoch": 0.37, - "learning_rate": 4.974715146386024e-05, - "loss": 0.328, + "epoch": 0.37877968789418676, + "grad_norm": 0.18436992168426514, + "learning_rate": 4.972624038205301e-05, + "loss": 0.4641, "step": 10510 
}, { - "epoch": 0.37, - "learning_rate": 4.9746747170224306e-05, - "loss": 0.3036, + "epoch": 0.3789598875554114, + "grad_norm": 0.1690305769443512, + "learning_rate": 4.972580954553727e-05, + "loss": 0.4275, "step": 10515 }, { - "epoch": 0.37, - "learning_rate": 4.974634255526817e-05, - "loss": 0.3429, + "epoch": 0.37914008721663606, + "grad_norm": 0.20340761542320251, + "learning_rate": 4.9725378372137166e-05, + "loss": 0.4284, "step": 10520 }, { - "epoch": 0.37, - "learning_rate": 4.974593761899708e-05, - "loss": 0.31, + "epoch": 0.3793202868778607, + "grad_norm": 0.17345954477787018, + "learning_rate": 4.9724946861858566e-05, + "loss": 0.4662, "step": 10525 }, { - "epoch": 0.37, - "learning_rate": 4.9745532361416295e-05, - "loss": 0.3408, + "epoch": 0.3795004865390853, + "grad_norm": 0.16937977075576782, + "learning_rate": 4.9724515014707354e-05, + "loss": 0.4486, "step": 10530 }, { - "epoch": 0.37, - "learning_rate": 4.974512678253108e-05, - "loss": 0.3312, + "epoch": 0.3796806862003099, + "grad_norm": 0.1675933450460434, + "learning_rate": 4.9724082830689404e-05, + "loss": 0.466, "step": 10535 }, { - "epoch": 0.37, - "learning_rate": 4.97447208823467e-05, - "loss": 0.3257, + "epoch": 0.3798608858615346, + "grad_norm": 0.14321444928646088, + "learning_rate": 4.972365030981062e-05, + "loss": 0.4187, "step": 10540 }, { - "epoch": 0.37, - "learning_rate": 4.974431466086843e-05, - "loss": 0.3016, + "epoch": 0.3800410855227592, + "grad_norm": 0.19194680452346802, + "learning_rate": 4.972321745207688e-05, + "loss": 0.4069, "step": 10545 }, { - "epoch": 0.37, - "learning_rate": 4.9743908118101545e-05, - "loss": 0.3356, + "epoch": 0.38022128518398385, + "grad_norm": 0.16965104639530182, + "learning_rate": 4.972278425749409e-05, + "loss": 0.4158, "step": 10550 }, { - "epoch": 0.37, - "learning_rate": 4.9743501254051315e-05, - "loss": 0.3142, + "epoch": 0.38040148484520847, + "grad_norm": 0.18481513857841492, + "learning_rate": 4.972235072606816e-05, + "loss": 0.472, "step": 10555 }, { - "epoch": 0.37, - "learning_rate": 4.974309406872303e-05, - "loss": 0.3145, + "epoch": 0.38058168450643315, + "grad_norm": 0.17054641246795654, + "learning_rate": 4.972191685780498e-05, + "loss": 0.4005, "step": 10560 }, { - "epoch": 0.37, - "learning_rate": 4.974268656212198e-05, - "loss": 0.3287, + "epoch": 0.38076188416765777, + "grad_norm": 0.2037338763475418, + "learning_rate": 4.972148265271047e-05, + "loss": 0.4741, "step": 10565 }, { - "epoch": 0.37, - "learning_rate": 4.9742278734253444e-05, - "loss": 0.3307, + "epoch": 0.3809420838288824, + "grad_norm": 0.17392988502979279, + "learning_rate": 4.9721048110790546e-05, + "loss": 0.4823, "step": 10570 }, { - "epoch": 0.37, - "learning_rate": 4.974187058512273e-05, - "loss": 0.3203, + "epoch": 0.381122283490107, + "grad_norm": 0.1915632039308548, + "learning_rate": 4.972061323205113e-05, + "loss": 0.4084, "step": 10575 }, { - "epoch": 0.37, - "learning_rate": 4.9741462114735126e-05, - "loss": 0.3111, + "epoch": 0.3813024831513317, + "grad_norm": 0.16601555049419403, + "learning_rate": 4.972017801649814e-05, + "loss": 0.4264, "step": 10580 }, { - "epoch": 0.37, - "learning_rate": 4.974105332309594e-05, - "loss": 0.3086, + "epoch": 0.3814826828125563, + "grad_norm": 0.15968888998031616, + "learning_rate": 4.971974246413752e-05, + "loss": 0.4563, "step": 10585 }, { - "epoch": 0.37, - "learning_rate": 4.974064421021049e-05, - "loss": 0.2876, + "epoch": 0.38166288247378094, + "grad_norm": 0.17778979241847992, + "learning_rate": 4.971930657497518e-05, + "loss": 
0.451, "step": 10590 }, { - "epoch": 0.37, - "learning_rate": 4.974023477608407e-05, - "loss": 0.3169, + "epoch": 0.38184308213500556, + "grad_norm": 0.16911092400550842, + "learning_rate": 4.971887034901708e-05, + "loss": 0.4147, "step": 10595 }, { - "epoch": 0.37, - "learning_rate": 4.973982502072201e-05, - "loss": 0.3043, + "epoch": 0.38202328179623024, + "grad_norm": 0.24606285989284515, + "learning_rate": 4.971843378626916e-05, + "loss": 0.4428, "step": 10600 }, { - "epoch": 0.37, - "learning_rate": 4.9739414944129633e-05, - "loss": 0.3481, + "epoch": 0.38220348145745486, + "grad_norm": 0.1590912640094757, + "learning_rate": 4.971799688673737e-05, + "loss": 0.441, "step": 10605 }, { - "epoch": 0.37, - "learning_rate": 4.973900454631225e-05, - "loss": 0.3462, + "epoch": 0.3823836811186795, + "grad_norm": 0.1825900822877884, + "learning_rate": 4.971755965042765e-05, + "loss": 0.4634, "step": 10610 }, { - "epoch": 0.37, - "learning_rate": 4.973859382727519e-05, - "loss": 0.3047, + "epoch": 0.3825638807799041, + "grad_norm": 0.14857780933380127, + "learning_rate": 4.9717122077345965e-05, + "loss": 0.4584, "step": 10615 }, { - "epoch": 0.37, - "learning_rate": 4.97381827870238e-05, - "loss": 0.3369, + "epoch": 0.3827440804411288, + "grad_norm": 0.18574172258377075, + "learning_rate": 4.971668416749828e-05, + "loss": 0.4721, "step": 10620 }, { - "epoch": 0.37, - "learning_rate": 4.973777142556342e-05, - "loss": 0.3028, + "epoch": 0.3829242801023534, + "grad_norm": 0.21157976984977722, + "learning_rate": 4.971624592089056e-05, + "loss": 0.464, "step": 10625 }, { - "epoch": 0.37, - "learning_rate": 4.973735974289937e-05, - "loss": 0.325, + "epoch": 0.38310447976357803, + "grad_norm": 0.15750320255756378, + "learning_rate": 4.971580733752877e-05, + "loss": 0.4572, "step": 10630 }, { - "epoch": 0.37, - "learning_rate": 4.9736947739037e-05, - "loss": 0.3325, + "epoch": 0.38328467942480265, + "grad_norm": 0.188838928937912, + "learning_rate": 4.9715368417418894e-05, + "loss": 0.4838, "step": 10635 }, { - "epoch": 0.37, - "learning_rate": 4.973653541398168e-05, - "loss": 0.3084, + "epoch": 0.38346487908602733, + "grad_norm": 0.19151824712753296, + "learning_rate": 4.9714929160566906e-05, + "loss": 0.4321, "step": 10640 }, { - "epoch": 0.37, - "learning_rate": 4.973612276773874e-05, - "loss": 0.3269, + "epoch": 0.38364507874725196, + "grad_norm": 0.17638704180717468, + "learning_rate": 4.971448956697879e-05, + "loss": 0.4894, "step": 10645 }, { - "epoch": 0.37, - "learning_rate": 4.973570980031355e-05, - "loss": 0.3139, + "epoch": 0.3838252784084766, + "grad_norm": 0.191755011677742, + "learning_rate": 4.9714049636660544e-05, + "loss": 0.4614, "step": 10650 }, { - "epoch": 0.37, - "learning_rate": 4.973529651171147e-05, - "loss": 0.3211, + "epoch": 0.38400547806970126, + "grad_norm": 0.1991795152425766, + "learning_rate": 4.971360936961815e-05, + "loss": 0.4343, "step": 10655 }, { - "epoch": 0.38, - "learning_rate": 4.973488290193786e-05, - "loss": 0.3487, + "epoch": 0.3841856777309259, + "grad_norm": 0.1604321449995041, + "learning_rate": 4.971316876585762e-05, + "loss": 0.4477, "step": 10660 }, { - "epoch": 0.38, - "learning_rate": 4.9734468970998105e-05, - "loss": 0.3103, + "epoch": 0.3843658773921505, + "grad_norm": 0.19753023982048035, + "learning_rate": 4.971272782538495e-05, + "loss": 0.4803, "step": 10665 }, { - "epoch": 0.38, - "learning_rate": 4.9734054718897574e-05, - "loss": 0.3237, + "epoch": 0.3845460770533751, + "grad_norm": 0.1573752909898758, + "learning_rate": 
4.971228654820615e-05, + "loss": 0.4378, "step": 10670 }, { - "epoch": 0.38, - "learning_rate": 4.973364014564163e-05, - "loss": 0.2995, + "epoch": 0.3847262767145998, + "grad_norm": 0.19612035155296326, + "learning_rate": 4.971184493432722e-05, + "loss": 0.4939, "step": 10675 }, { - "epoch": 0.38, - "learning_rate": 4.973322525123568e-05, - "loss": 0.3166, + "epoch": 0.3849064763758244, + "grad_norm": 0.1873956173658371, + "learning_rate": 4.9711402983754194e-05, + "loss": 0.4378, "step": 10680 }, { - "epoch": 0.38, - "learning_rate": 4.973281003568509e-05, - "loss": 0.2974, + "epoch": 0.38508667603704905, + "grad_norm": 0.1653887778520584, + "learning_rate": 4.971096069649309e-05, + "loss": 0.4403, "step": 10685 }, { - "epoch": 0.38, - "learning_rate": 4.973239449899527e-05, - "loss": 0.3526, + "epoch": 0.38526687569827367, + "grad_norm": 0.1948213279247284, + "learning_rate": 4.971051807254993e-05, + "loss": 0.3959, "step": 10690 }, { - "epoch": 0.38, - "learning_rate": 4.97319786411716e-05, - "loss": 0.3314, + "epoch": 0.38544707535949835, + "grad_norm": 0.17649932205677032, + "learning_rate": 4.9710075111930744e-05, + "loss": 0.4329, "step": 10695 }, { - "epoch": 0.38, - "learning_rate": 4.9731562462219495e-05, - "loss": 0.3269, + "epoch": 0.38562727502072297, + "grad_norm": 0.16458600759506226, + "learning_rate": 4.970963181464157e-05, + "loss": 0.4233, "step": 10700 }, { - "epoch": 0.38, - "learning_rate": 4.9731145962144344e-05, - "loss": 0.3073, + "epoch": 0.3858074746819476, + "grad_norm": 0.1625463217496872, + "learning_rate": 4.970918818068844e-05, + "loss": 0.4972, "step": 10705 }, { - "epoch": 0.38, - "learning_rate": 4.973072914095156e-05, - "loss": 0.3118, + "epoch": 0.3859876743431722, + "grad_norm": 0.12830519676208496, + "learning_rate": 4.9708744210077406e-05, + "loss": 0.4458, "step": 10710 }, { - "epoch": 0.38, - "learning_rate": 4.973031199864656e-05, - "loss": 0.3227, + "epoch": 0.3861678740043969, + "grad_norm": 0.20030345022678375, + "learning_rate": 4.9708299902814516e-05, + "loss": 0.4751, "step": 10715 }, { - "epoch": 0.38, - "learning_rate": 4.9729894535234756e-05, - "loss": 0.3423, + "epoch": 0.3863480736656215, + "grad_norm": 0.15342171490192413, + "learning_rate": 4.970785525890582e-05, + "loss": 0.4467, "step": 10720 }, { - "epoch": 0.38, - "learning_rate": 4.972947675072157e-05, - "loss": 0.3177, + "epoch": 0.38652827332684614, + "grad_norm": 0.19483308494091034, + "learning_rate": 4.9707410278357393e-05, + "loss": 0.4594, "step": 10725 }, { - "epoch": 0.38, - "learning_rate": 4.972905864511241e-05, - "loss": 0.3132, + "epoch": 0.38670847298807076, + "grad_norm": 0.22155672311782837, + "learning_rate": 4.970696496117527e-05, + "loss": 0.4764, "step": 10730 }, { - "epoch": 0.38, - "learning_rate": 4.9728640218412744e-05, - "loss": 0.3445, + "epoch": 0.38688867264929544, + "grad_norm": 0.22407083213329315, + "learning_rate": 4.970651930736554e-05, + "loss": 0.4337, "step": 10735 }, { - "epoch": 0.38, - "learning_rate": 4.9728221470627965e-05, - "loss": 0.3371, + "epoch": 0.38706887231052006, + "grad_norm": 0.14635224640369415, + "learning_rate": 4.970607331693427e-05, + "loss": 0.4752, "step": 10740 }, { - "epoch": 0.38, - "learning_rate": 4.9727802401763537e-05, - "loss": 0.3068, + "epoch": 0.3872490719717447, + "grad_norm": 0.18128085136413574, + "learning_rate": 4.970562698988753e-05, + "loss": 0.4927, "step": 10745 }, { - "epoch": 0.38, - "learning_rate": 4.972738301182489e-05, - "loss": 0.308, + "epoch": 0.3874292716329693, + "grad_norm": 
0.1800365000963211, + "learning_rate": 4.970518032623141e-05, + "loss": 0.4195, "step": 10750 }, { - "epoch": 0.38, - "learning_rate": 4.972696330081746e-05, - "loss": 0.3264, + "epoch": 0.387609471294194, + "grad_norm": 0.18861335515975952, + "learning_rate": 4.9704733325971986e-05, + "loss": 0.4563, "step": 10755 }, { - "epoch": 0.38, - "learning_rate": 4.9726543268746725e-05, - "loss": 0.3176, + "epoch": 0.3877896709554186, + "grad_norm": 0.18732388317584991, + "learning_rate": 4.9704285989115355e-05, + "loss": 0.4643, "step": 10760 }, { - "epoch": 0.38, - "learning_rate": 4.972612291561811e-05, - "loss": 0.3149, + "epoch": 0.38796987061664323, + "grad_norm": 0.17132775485515594, + "learning_rate": 4.970383831566762e-05, + "loss": 0.4932, "step": 10765 }, { - "epoch": 0.38, - "learning_rate": 4.972570224143708e-05, - "loss": 0.3294, + "epoch": 0.38815007027786785, + "grad_norm": 0.17382220923900604, + "learning_rate": 4.970339030563485e-05, + "loss": 0.4402, "step": 10770 }, { - "epoch": 0.38, - "learning_rate": 4.972528124620911e-05, - "loss": 0.341, + "epoch": 0.38833026993909253, + "grad_norm": 0.15054017305374146, + "learning_rate": 4.9702941959023185e-05, + "loss": 0.4593, "step": 10775 }, { - "epoch": 0.38, - "learning_rate": 4.9724859929939655e-05, - "loss": 0.331, + "epoch": 0.38851046960031715, + "grad_norm": 0.19927044212818146, + "learning_rate": 4.9702493275838713e-05, + "loss": 0.4606, "step": 10780 }, { - "epoch": 0.38, - "learning_rate": 4.972443829263419e-05, - "loss": 0.316, + "epoch": 0.3886906692615418, + "grad_norm": 0.16357219219207764, + "learning_rate": 4.970204425608756e-05, + "loss": 0.4324, "step": 10785 }, { - "epoch": 0.38, - "learning_rate": 4.9724016334298186e-05, - "loss": 0.3089, + "epoch": 0.3888708689227664, + "grad_norm": 0.1704314649105072, + "learning_rate": 4.970159489977583e-05, + "loss": 0.4257, "step": 10790 }, { - "epoch": 0.38, - "learning_rate": 4.972359405493712e-05, - "loss": 0.3033, + "epoch": 0.3890510685839911, + "grad_norm": 0.17188839614391327, + "learning_rate": 4.970114520690965e-05, + "loss": 0.4469, "step": 10795 }, { - "epoch": 0.38, - "learning_rate": 4.972317145455648e-05, - "loss": 0.3289, + "epoch": 0.3892312682452157, + "grad_norm": 0.17841334640979767, + "learning_rate": 4.9700695177495154e-05, + "loss": 0.3993, "step": 10800 }, { - "epoch": 0.38, - "learning_rate": 4.9722748533161753e-05, - "loss": 0.3353, + "epoch": 0.3894114679064403, + "grad_norm": 0.21081610023975372, + "learning_rate": 4.970024481153847e-05, + "loss": 0.4844, "step": 10805 }, { - "epoch": 0.38, - "learning_rate": 4.972232529075843e-05, - "loss": 0.3247, + "epoch": 0.38959166756766495, + "grad_norm": 0.18982069194316864, + "learning_rate": 4.9699794109045726e-05, + "loss": 0.4578, "step": 10810 }, { - "epoch": 0.38, - "learning_rate": 4.972190172735201e-05, - "loss": 0.3195, + "epoch": 0.3897718672288896, + "grad_norm": 0.18108953535556793, + "learning_rate": 4.969934307002307e-05, + "loss": 0.4661, "step": 10815 }, { - "epoch": 0.38, - "learning_rate": 4.9721477842947984e-05, - "loss": 0.3657, + "epoch": 0.38995206689011425, + "grad_norm": 0.1565956175327301, + "learning_rate": 4.969889169447664e-05, + "loss": 0.4347, "step": 10820 }, { - "epoch": 0.38, - "learning_rate": 4.972105363755186e-05, - "loss": 0.3068, + "epoch": 0.39013226655133887, + "grad_norm": 0.16742676496505737, + "learning_rate": 4.9698439982412616e-05, + "loss": 0.459, "step": 10825 }, { - "epoch": 0.38, - "learning_rate": 4.972062911116914e-05, - "loss": 0.3295, + "epoch": 
0.39031246621256355, + "grad_norm": 0.15715833008289337, + "learning_rate": 4.969798793383711e-05, + "loss": 0.4821, "step": 10830 }, { - "epoch": 0.38, - "learning_rate": 4.9720204263805345e-05, - "loss": 0.321, + "epoch": 0.39049266587378817, + "grad_norm": 0.21100378036499023, + "learning_rate": 4.9697535548756304e-05, + "loss": 0.4679, "step": 10835 }, { - "epoch": 0.38, - "learning_rate": 4.9719779095466e-05, - "loss": 0.3057, + "epoch": 0.3906728655350128, + "grad_norm": 0.17519085109233856, + "learning_rate": 4.969708282717635e-05, + "loss": 0.4637, "step": 10840 }, { - "epoch": 0.38, - "learning_rate": 4.9719353606156606e-05, - "loss": 0.3415, + "epoch": 0.3908530651962374, + "grad_norm": 0.15852008759975433, + "learning_rate": 4.969662976910344e-05, + "loss": 0.4755, "step": 10845 }, { - "epoch": 0.38, - "learning_rate": 4.97189277958827e-05, - "loss": 0.3129, + "epoch": 0.3910332648574621, + "grad_norm": 0.22311712801456451, + "learning_rate": 4.969617637454373e-05, + "loss": 0.4252, "step": 10850 }, { - "epoch": 0.38, - "learning_rate": 4.97185016646498e-05, - "loss": 0.3154, + "epoch": 0.3912134645186867, + "grad_norm": 0.17925986647605896, + "learning_rate": 4.9695722643503384e-05, + "loss": 0.4975, "step": 10855 }, { - "epoch": 0.38, - "learning_rate": 4.971807521246345e-05, - "loss": 0.3251, + "epoch": 0.39139366417991134, + "grad_norm": 0.17491289973258972, + "learning_rate": 4.969526857598861e-05, + "loss": 0.4618, "step": 10860 }, { - "epoch": 0.38, - "learning_rate": 4.971764843932919e-05, - "loss": 0.3323, + "epoch": 0.39157386384113596, + "grad_norm": 0.2179514467716217, + "learning_rate": 4.969481417200558e-05, + "loss": 0.4999, "step": 10865 }, { - "epoch": 0.38, - "learning_rate": 4.971722134525254e-05, - "loss": 0.3168, + "epoch": 0.39175406350236064, + "grad_norm": 0.21115978062152863, + "learning_rate": 4.969435943156048e-05, + "loss": 0.4691, "step": 10870 }, { - "epoch": 0.38, - "learning_rate": 4.9716793930239066e-05, - "loss": 0.3006, + "epoch": 0.39193426316358526, + "grad_norm": 0.16908550262451172, + "learning_rate": 4.969390435465952e-05, + "loss": 0.4589, "step": 10875 }, { - "epoch": 0.38, - "learning_rate": 4.971636619429432e-05, - "loss": 0.3111, + "epoch": 0.3921144628248099, + "grad_norm": 0.16399134695529938, + "learning_rate": 4.969344894130889e-05, + "loss": 0.4356, "step": 10880 }, { - "epoch": 0.38, - "learning_rate": 4.971593813742383e-05, - "loss": 0.3157, + "epoch": 0.3922946624860345, + "grad_norm": 0.15767602622509003, + "learning_rate": 4.96929931915148e-05, + "loss": 0.4667, "step": 10885 }, { - "epoch": 0.38, - "learning_rate": 4.9715509759633194e-05, - "loss": 0.3366, + "epoch": 0.3924748621472592, + "grad_norm": 0.20866945385932922, + "learning_rate": 4.9692537105283465e-05, + "loss": 0.4452, "step": 10890 }, { - "epoch": 0.38, - "learning_rate": 4.9715081060927936e-05, - "loss": 0.3153, + "epoch": 0.3926550618084838, + "grad_norm": 0.15087029337882996, + "learning_rate": 4.969208068262109e-05, + "loss": 0.442, "step": 10895 }, { - "epoch": 0.38, - "learning_rate": 4.971465204131365e-05, - "loss": 0.3141, + "epoch": 0.39283526146970843, + "grad_norm": 0.1792287528514862, + "learning_rate": 4.969162392353389e-05, + "loss": 0.4511, "step": 10900 }, { - "epoch": 0.38, - "learning_rate": 4.971422270079588e-05, - "loss": 0.342, + "epoch": 0.39301546113093305, + "grad_norm": 0.16663479804992676, + "learning_rate": 4.96911668280281e-05, + "loss": 0.4726, "step": 10905 }, { - "epoch": 0.38, - "learning_rate": 4.971379303938023e-05, - "loss": 
0.3119, + "epoch": 0.39319566079215773, + "grad_norm": 0.16242770850658417, + "learning_rate": 4.969070939610995e-05, + "loss": 0.4885, "step": 10910 }, { - "epoch": 0.38, - "learning_rate": 4.9713363057072256e-05, - "loss": 0.2966, + "epoch": 0.39337586045338235, + "grad_norm": 0.14637549221515656, + "learning_rate": 4.969025162778566e-05, + "loss": 0.4273, "step": 10915 }, { - "epoch": 0.38, - "learning_rate": 4.971293275387755e-05, - "loss": 0.3179, + "epoch": 0.393556060114607, + "grad_norm": 0.1730150431394577, + "learning_rate": 4.968979352306146e-05, + "loss": 0.4698, "step": 10920 }, { - "epoch": 0.38, - "learning_rate": 4.9712502129801695e-05, - "loss": 0.315, + "epoch": 0.3937362597758316, + "grad_norm": 0.15149009227752686, + "learning_rate": 4.968933508194361e-05, + "loss": 0.4338, "step": 10925 }, { - "epoch": 0.38, - "learning_rate": 4.971207118485029e-05, - "loss": 0.3437, + "epoch": 0.3939164594370563, + "grad_norm": 0.16374550759792328, + "learning_rate": 4.968887630443836e-05, + "loss": 0.4735, "step": 10930 }, { - "epoch": 0.38, - "learning_rate": 4.971163991902893e-05, - "loss": 0.3267, + "epoch": 0.3940966590982809, + "grad_norm": 0.16097819805145264, + "learning_rate": 4.968841719055194e-05, + "loss": 0.434, "step": 10935 }, { - "epoch": 0.38, - "learning_rate": 4.971120833234321e-05, - "loss": 0.3061, + "epoch": 0.3942768587595055, + "grad_norm": 0.18901294469833374, + "learning_rate": 4.968795774029061e-05, + "loss": 0.4419, "step": 10940 }, { - "epoch": 0.39, - "learning_rate": 4.971077642479873e-05, - "loss": 0.3452, + "epoch": 0.39445705842073014, + "grad_norm": 0.20963457226753235, + "learning_rate": 4.9687497953660646e-05, + "loss": 0.455, "step": 10945 }, { - "epoch": 0.39, - "learning_rate": 4.9710344196401114e-05, - "loss": 0.2993, + "epoch": 0.3946372580819548, + "grad_norm": 0.1951969563961029, + "learning_rate": 4.9687037830668306e-05, + "loss": 0.4736, "step": 10950 }, { - "epoch": 0.39, - "learning_rate": 4.9709911647155953e-05, - "loss": 0.3241, + "epoch": 0.39481745774317945, + "grad_norm": 0.17209535837173462, + "learning_rate": 4.968657737131984e-05, + "loss": 0.4568, "step": 10955 }, { - "epoch": 0.39, - "learning_rate": 4.9709478777068875e-05, - "loss": 0.3116, + "epoch": 0.39499765740440407, + "grad_norm": 0.20058324933052063, + "learning_rate": 4.968611657562154e-05, + "loss": 0.4182, "step": 10960 }, { - "epoch": 0.39, - "learning_rate": 4.97090455861455e-05, - "loss": 0.3161, + "epoch": 0.3951778570656287, + "grad_norm": 0.15861481428146362, + "learning_rate": 4.968565544357969e-05, + "loss": 0.4993, "step": 10965 }, { - "epoch": 0.39, - "learning_rate": 4.970861207439146e-05, - "loss": 0.3234, + "epoch": 0.39535805672685337, + "grad_norm": 0.18526114523410797, + "learning_rate": 4.968519397520056e-05, + "loss": 0.4578, "step": 10970 }, { - "epoch": 0.39, - "learning_rate": 4.970817824181236e-05, - "loss": 0.2984, + "epoch": 0.395538256388078, + "grad_norm": 0.14980530738830566, + "learning_rate": 4.968473217049044e-05, + "loss": 0.4595, "step": 10975 }, { - "epoch": 0.39, - "learning_rate": 4.970774408841387e-05, - "loss": 0.3408, + "epoch": 0.3957184560493026, + "grad_norm": 0.186975359916687, + "learning_rate": 4.9684270029455624e-05, + "loss": 0.45, "step": 10980 }, { - "epoch": 0.39, - "learning_rate": 4.970730961420159e-05, - "loss": 0.29, + "epoch": 0.3958986557105273, + "grad_norm": 0.15820086002349854, + "learning_rate": 4.968380755210241e-05, + "loss": 0.4645, "step": 10985 }, { - "epoch": 0.39, - "learning_rate": 
4.970687481918118e-05, - "loss": 0.3175, + "epoch": 0.3960788553717519, + "grad_norm": 0.14868688583374023, + "learning_rate": 4.9683344738437096e-05, + "loss": 0.4714, "step": 10990 }, { - "epoch": 0.39, - "learning_rate": 4.970643970335828e-05, - "loss": 0.2969, + "epoch": 0.39625905503297654, + "grad_norm": 0.18641044199466705, + "learning_rate": 4.968288158846599e-05, + "loss": 0.4926, "step": 10995 }, { - "epoch": 0.39, - "learning_rate": 4.970600426673855e-05, - "loss": 0.3276, + "epoch": 0.39643925469420116, + "grad_norm": 0.17056040465831757, + "learning_rate": 4.968241810219539e-05, + "loss": 0.4738, "step": 11000 }, { - "epoch": 0.39, - "eval_loss": 0.31932923197746277, - "eval_runtime": 10.5374, - "eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 0.39643925469420116, + "eval_loss": 0.4763174057006836, + "eval_runtime": 3.5834, + "eval_samples_per_second": 27.907, + "eval_steps_per_second": 6.977, "step": 11000 }, { - "epoch": 0.39, - "learning_rate": 4.970556850932763e-05, - "loss": 0.3115, + "epoch": 0.39661945435542584, + "grad_norm": 0.16105006635189056, + "learning_rate": 4.9681954279631635e-05, + "loss": 0.4637, "step": 11005 }, { - "epoch": 0.39, - "learning_rate": 4.970513243113119e-05, - "loss": 0.3083, + "epoch": 0.39679965401665046, + "grad_norm": 0.17040333151817322, + "learning_rate": 4.968149012078103e-05, + "loss": 0.4618, "step": 11010 }, { - "epoch": 0.39, - "learning_rate": 4.970469603215488e-05, - "loss": 0.3105, + "epoch": 0.3969798536778751, + "grad_norm": 0.16445614397525787, + "learning_rate": 4.9681025625649905e-05, + "loss": 0.4516, "step": 11015 }, { - "epoch": 0.39, - "learning_rate": 4.9704259312404385e-05, - "loss": 0.3308, + "epoch": 0.3971600533390997, + "grad_norm": 0.16184879839420319, + "learning_rate": 4.968056079424457e-05, + "loss": 0.4915, "step": 11020 }, { - "epoch": 0.39, - "learning_rate": 4.9703822271885355e-05, - "loss": 0.332, + "epoch": 0.3973402530003244, + "grad_norm": 0.15629048645496368, + "learning_rate": 4.9680095626571384e-05, + "loss": 0.4608, "step": 11025 }, { - "epoch": 0.39, - "learning_rate": 4.9703384910603473e-05, - "loss": 0.3089, + "epoch": 0.397520452661549, + "grad_norm": 0.20805750787258148, + "learning_rate": 4.967963012263667e-05, + "loss": 0.4356, "step": 11030 }, { - "epoch": 0.39, - "learning_rate": 4.970294722856442e-05, - "loss": 0.3084, + "epoch": 0.39770065232277363, + "grad_norm": 0.16487817466259003, + "learning_rate": 4.967916428244677e-05, + "loss": 0.4734, "step": 11035 }, { - "epoch": 0.39, - "learning_rate": 4.970250922577387e-05, - "loss": 0.3292, + "epoch": 0.39788085198399825, + "grad_norm": 0.1743331402540207, + "learning_rate": 4.9678698106008034e-05, + "loss": 0.4458, "step": 11040 }, { - "epoch": 0.39, - "learning_rate": 4.970207090223753e-05, - "loss": 0.3208, + "epoch": 0.39806105164522293, + "grad_norm": 0.17199203372001648, + "learning_rate": 4.967823159332682e-05, + "loss": 0.4683, "step": 11045 }, { - "epoch": 0.39, - "learning_rate": 4.970163225796107e-05, - "loss": 0.3034, + "epoch": 0.39824125130644755, + "grad_norm": 0.17917218804359436, + "learning_rate": 4.967776474440948e-05, + "loss": 0.4884, "step": 11050 }, { - "epoch": 0.39, - "learning_rate": 4.97011932929502e-05, - "loss": 0.3415, + "epoch": 0.3984214509676722, + "grad_norm": 0.16735875606536865, + "learning_rate": 4.967729755926237e-05, + "loss": 0.4558, "step": 11055 }, { - "epoch": 0.39, - "learning_rate": 4.9700754007210606e-05, - "loss": 0.3013, + "epoch": 0.3986016506288968, + "grad_norm": 
0.23970329761505127, + "learning_rate": 4.967683003789185e-05, + "loss": 0.4259, "step": 11060 }, { - "epoch": 0.39, - "learning_rate": 4.9700314400748005e-05, - "loss": 0.339, + "epoch": 0.3987818502901215, + "grad_norm": 0.17850400507450104, + "learning_rate": 4.967636218030431e-05, + "loss": 0.4225, "step": 11065 }, { - "epoch": 0.39, - "learning_rate": 4.9699874473568095e-05, - "loss": 0.3431, + "epoch": 0.3989620499513461, + "grad_norm": 0.14817573130130768, + "learning_rate": 4.967589398650611e-05, + "loss": 0.4513, "step": 11070 }, { - "epoch": 0.39, - "learning_rate": 4.96994342256766e-05, - "loss": 0.2995, + "epoch": 0.3991422496125707, + "grad_norm": 0.15426768362522125, + "learning_rate": 4.9675425456503634e-05, + "loss": 0.4577, "step": 11075 }, { - "epoch": 0.39, - "learning_rate": 4.969899365707922e-05, - "loss": 0.3354, + "epoch": 0.39932244927379534, + "grad_norm": 0.18102167546749115, + "learning_rate": 4.967495659030326e-05, + "loss": 0.4538, "step": 11080 }, { - "epoch": 0.39, - "learning_rate": 4.969855276778169e-05, - "loss": 0.3413, + "epoch": 0.39950264893502, + "grad_norm": 0.21321766078472137, + "learning_rate": 4.9674487387911374e-05, + "loss": 0.4502, "step": 11085 }, { - "epoch": 0.39, - "learning_rate": 4.969811155778974e-05, - "loss": 0.3404, + "epoch": 0.39968284859624464, + "grad_norm": 0.17416805028915405, + "learning_rate": 4.967401784933439e-05, + "loss": 0.4346, "step": 11090 }, { - "epoch": 0.39, - "learning_rate": 4.969767002710907e-05, - "loss": 0.3439, + "epoch": 0.39986304825746927, + "grad_norm": 0.20586751401424408, + "learning_rate": 4.9673547974578674e-05, + "loss": 0.4706, "step": 11095 }, { - "epoch": 0.39, - "learning_rate": 4.969722817574544e-05, - "loss": 0.3299, + "epoch": 0.4000432479186939, + "grad_norm": 0.1607694774866104, + "learning_rate": 4.967307776365065e-05, + "loss": 0.4478, "step": 11100 }, { - "epoch": 0.39, - "learning_rate": 4.969678600370457e-05, - "loss": 0.3264, + "epoch": 0.40022344757991857, + "grad_norm": 0.2245350033044815, + "learning_rate": 4.967260721655672e-05, + "loss": 0.4874, "step": 11105 }, { - "epoch": 0.39, - "learning_rate": 4.969634351099222e-05, - "loss": 0.3363, + "epoch": 0.4004036472411432, + "grad_norm": 0.18260863423347473, + "learning_rate": 4.967213633330329e-05, + "loss": 0.4176, "step": 11110 }, { - "epoch": 0.39, - "learning_rate": 4.9695900697614126e-05, - "loss": 0.3401, + "epoch": 0.4005838469023678, + "grad_norm": 0.1575452834367752, + "learning_rate": 4.967166511389678e-05, + "loss": 0.4249, "step": 11115 }, { - "epoch": 0.39, - "learning_rate": 4.969545756357603e-05, - "loss": 0.3148, + "epoch": 0.40076404656359244, + "grad_norm": 0.18237917125225067, + "learning_rate": 4.967119355834361e-05, + "loss": 0.4794, "step": 11120 }, { - "epoch": 0.39, - "learning_rate": 4.969501410888369e-05, - "loss": 0.2961, + "epoch": 0.4009442462248171, + "grad_norm": 0.2220340073108673, + "learning_rate": 4.96707216666502e-05, + "loss": 0.4514, "step": 11125 }, { - "epoch": 0.39, - "learning_rate": 4.9694570333542874e-05, - "loss": 0.2927, + "epoch": 0.40112444588604174, + "grad_norm": 0.17675887048244476, + "learning_rate": 4.9670249438822994e-05, + "loss": 0.4648, "step": 11130 }, { - "epoch": 0.39, - "learning_rate": 4.9694126237559335e-05, - "loss": 0.3554, + "epoch": 0.40130464554726636, + "grad_norm": 0.19447670876979828, + "learning_rate": 4.966977687486841e-05, + "loss": 0.4771, "step": 11135 }, { - "epoch": 0.39, - "learning_rate": 4.969368182093883e-05, - "loss": 0.3212, + "epoch": 
0.401484845208491, + "grad_norm": 0.1338052898645401, + "learning_rate": 4.966930397479289e-05, + "loss": 0.416, "step": 11140 }, { - "epoch": 0.39, - "learning_rate": 4.9693237083687155e-05, - "loss": 0.3346, + "epoch": 0.40166504486971566, + "grad_norm": 0.19171032309532166, + "learning_rate": 4.966883073860288e-05, + "loss": 0.4714, "step": 11145 }, { - "epoch": 0.39, - "learning_rate": 4.969279202581007e-05, - "loss": 0.341, + "epoch": 0.4018452445309403, + "grad_norm": 0.17477290332317352, + "learning_rate": 4.966835716630483e-05, + "loss": 0.4862, "step": 11150 }, { - "epoch": 0.39, - "learning_rate": 4.9692346647313343e-05, - "loss": 0.3208, + "epoch": 0.4020254441921649, + "grad_norm": 0.22172503173351288, + "learning_rate": 4.966788325790519e-05, + "loss": 0.4264, "step": 11155 }, { - "epoch": 0.39, - "learning_rate": 4.969190094820278e-05, - "loss": 0.3564, + "epoch": 0.4022056438533896, + "grad_norm": 0.1821010559797287, + "learning_rate": 4.966740901341042e-05, + "loss": 0.4472, "step": 11160 }, { - "epoch": 0.39, - "learning_rate": 4.969145492848414e-05, - "loss": 0.3175, + "epoch": 0.4023858435146142, + "grad_norm": 0.21025283634662628, + "learning_rate": 4.9666934432826975e-05, + "loss": 0.446, "step": 11165 }, { - "epoch": 0.39, - "learning_rate": 4.969100858816324e-05, - "loss": 0.3286, + "epoch": 0.40256604317583883, + "grad_norm": 0.16647890210151672, + "learning_rate": 4.9666459516161316e-05, + "loss": 0.4754, "step": 11170 }, { - "epoch": 0.39, - "learning_rate": 4.969056192724586e-05, - "loss": 0.3217, + "epoch": 0.40274624283706345, + "grad_norm": 0.15028271079063416, + "learning_rate": 4.9665984263419926e-05, + "loss": 0.4176, "step": 11175 }, { - "epoch": 0.39, - "learning_rate": 4.969011494573781e-05, - "loss": 0.3129, + "epoch": 0.40292644249828813, + "grad_norm": 0.1495635062456131, + "learning_rate": 4.9665508674609277e-05, + "loss": 0.4659, "step": 11180 }, { - "epoch": 0.39, - "learning_rate": 4.968966764364489e-05, - "loss": 0.3072, + "epoch": 0.40310664215951275, + "grad_norm": 0.17869426310062408, + "learning_rate": 4.966503274973585e-05, + "loss": 0.453, "step": 11185 }, { - "epoch": 0.39, - "learning_rate": 4.96892200209729e-05, - "loss": 0.3381, + "epoch": 0.4032868418207374, + "grad_norm": 0.1512574553489685, + "learning_rate": 4.9664556488806124e-05, + "loss": 0.4645, "step": 11190 }, { - "epoch": 0.39, - "learning_rate": 4.968877207772766e-05, - "loss": 0.297, + "epoch": 0.403467041481962, + "grad_norm": 0.18554282188415527, + "learning_rate": 4.96640798918266e-05, + "loss": 0.4688, "step": 11195 }, { - "epoch": 0.39, - "learning_rate": 4.9688323813914995e-05, - "loss": 0.3213, + "epoch": 0.4036472411431867, + "grad_norm": 0.16681110858917236, + "learning_rate": 4.966360295880375e-05, + "loss": 0.4333, "step": 11200 }, { - "epoch": 0.39, - "learning_rate": 4.968787522954071e-05, - "loss": 0.3401, + "epoch": 0.4038274408044113, + "grad_norm": 0.2254079133272171, + "learning_rate": 4.966312568974409e-05, + "loss": 0.4501, "step": 11205 }, { - "epoch": 0.39, - "learning_rate": 4.968742632461063e-05, - "loss": 0.3256, + "epoch": 0.4040076404656359, + "grad_norm": 0.14422693848609924, + "learning_rate": 4.966264808465412e-05, + "loss": 0.4039, "step": 11210 }, { - "epoch": 0.39, - "learning_rate": 4.9686977099130594e-05, - "loss": 0.315, + "epoch": 0.40418784012686054, + "grad_norm": 0.17305003106594086, + "learning_rate": 4.9662170143540336e-05, + "loss": 0.4803, "step": 11215 }, { - "epoch": 0.39, - "learning_rate": 4.968652755310643e-05, - "loss": 
0.3294, + "epoch": 0.4043680397880852, + "grad_norm": 0.18754348158836365, + "learning_rate": 4.966169186640927e-05, + "loss": 0.4577, "step": 11220 }, { - "epoch": 0.39, - "learning_rate": 4.968607768654398e-05, - "loss": 0.3661, + "epoch": 0.40454823944930984, + "grad_norm": 0.15006259083747864, + "learning_rate": 4.966121325326742e-05, + "loss": 0.4578, "step": 11225 }, { - "epoch": 0.4, - "learning_rate": 4.968562749944906e-05, - "loss": 0.3357, + "epoch": 0.40472843911053447, + "grad_norm": 0.1722080409526825, + "learning_rate": 4.9660734304121315e-05, + "loss": 0.4343, "step": 11230 }, { - "epoch": 0.4, - "learning_rate": 4.9685176991827555e-05, - "loss": 0.3292, + "epoch": 0.4049086387717591, + "grad_norm": 0.13911651074886322, + "learning_rate": 4.9660255018977475e-05, + "loss": 0.4435, "step": 11235 }, { - "epoch": 0.4, - "learning_rate": 4.968472616368529e-05, - "loss": 0.3061, + "epoch": 0.40508883843298377, + "grad_norm": 0.20192518830299377, + "learning_rate": 4.9659775397842444e-05, + "loss": 0.4632, "step": 11240 }, { - "epoch": 0.4, - "learning_rate": 4.968427501502812e-05, - "loss": 0.3282, + "epoch": 0.4052690380942084, + "grad_norm": 0.17392924427986145, + "learning_rate": 4.965929544072274e-05, + "loss": 0.4377, "step": 11245 }, { - "epoch": 0.4, - "learning_rate": 4.968382354586191e-05, - "loss": 0.324, + "epoch": 0.405449237755433, + "grad_norm": 0.17670120298862457, + "learning_rate": 4.9658815147624914e-05, + "loss": 0.4451, "step": 11250 }, { - "epoch": 0.4, - "learning_rate": 4.968337175619251e-05, - "loss": 0.3114, + "epoch": 0.40562943741665763, + "grad_norm": 0.22466880083084106, + "learning_rate": 4.9658334518555507e-05, + "loss": 0.5001, "step": 11255 }, { - "epoch": 0.4, - "learning_rate": 4.9682919646025805e-05, - "loss": 0.3184, + "epoch": 0.4058096370778823, + "grad_norm": 0.18523770570755005, + "learning_rate": 4.965785355352106e-05, + "loss": 0.4358, "step": 11260 }, { - "epoch": 0.4, - "learning_rate": 4.9682467215367655e-05, - "loss": 0.3407, + "epoch": 0.40598983673910694, + "grad_norm": 0.20358721911907196, + "learning_rate": 4.965737225252814e-05, + "loss": 0.4626, "step": 11265 }, { - "epoch": 0.4, - "learning_rate": 4.968201446422393e-05, - "loss": 0.2974, + "epoch": 0.40617003640033156, + "grad_norm": 0.2111283838748932, + "learning_rate": 4.9656890615583297e-05, + "loss": 0.4708, "step": 11270 }, { - "epoch": 0.4, - "learning_rate": 4.968156139260052e-05, - "loss": 0.3225, + "epoch": 0.4063502360615562, + "grad_norm": 0.200625941157341, + "learning_rate": 4.965640864269309e-05, + "loss": 0.4916, "step": 11275 }, { - "epoch": 0.4, - "learning_rate": 4.968110800050329e-05, - "loss": 0.328, + "epoch": 0.40653043572278086, + "grad_norm": 0.17772261798381805, + "learning_rate": 4.965592633386408e-05, + "loss": 0.4205, "step": 11280 }, { - "epoch": 0.4, - "learning_rate": 4.9680654287938145e-05, - "loss": 0.3285, + "epoch": 0.4067106353840055, + "grad_norm": 0.16529619693756104, + "learning_rate": 4.965544368910285e-05, + "loss": 0.4354, "step": 11285 }, { - "epoch": 0.4, - "learning_rate": 4.9680200254910966e-05, - "loss": 0.3313, + "epoch": 0.4068908350452301, + "grad_norm": 0.12482757121324539, + "learning_rate": 4.965496070841599e-05, + "loss": 0.4189, "step": 11290 }, { - "epoch": 0.4, - "learning_rate": 4.9679745901427656e-05, - "loss": 0.3458, + "epoch": 0.4070710347064547, + "grad_norm": 0.16134874522686005, + "learning_rate": 4.965447739181005e-05, + "loss": 0.4405, "step": 11295 }, { - "epoch": 0.4, - "learning_rate": 4.967929122749411e-05, 
- "loss": 0.3087, + "epoch": 0.4072512343676794, + "grad_norm": 0.1545429825782776, + "learning_rate": 4.965399373929163e-05, + "loss": 0.4762, "step": 11300 }, { - "epoch": 0.4, - "learning_rate": 4.967883623311622e-05, - "loss": 0.3174, + "epoch": 0.407431434028904, + "grad_norm": 0.16678392887115479, + "learning_rate": 4.9653509750867324e-05, + "loss": 0.4442, "step": 11305 }, { - "epoch": 0.4, - "learning_rate": 4.9678380918299915e-05, - "loss": 0.3439, + "epoch": 0.40761163369012865, + "grad_norm": 0.19490495324134827, + "learning_rate": 4.965302542654371e-05, + "loss": 0.436, "step": 11310 }, { - "epoch": 0.4, - "learning_rate": 4.9677925283051105e-05, - "loss": 0.3138, + "epoch": 0.4077918333513533, + "grad_norm": 0.19900862872600555, + "learning_rate": 4.9652540766327406e-05, + "loss": 0.5119, "step": 11315 }, { - "epoch": 0.4, - "learning_rate": 4.967746932737569e-05, - "loss": 0.3158, + "epoch": 0.40797203301257795, + "grad_norm": 0.1822587251663208, + "learning_rate": 4.9652055770225005e-05, + "loss": 0.4696, "step": 11320 }, { - "epoch": 0.4, - "learning_rate": 4.96770130512796e-05, - "loss": 0.3368, + "epoch": 0.4081522326738026, + "grad_norm": 0.1810266673564911, + "learning_rate": 4.965157043824311e-05, + "loss": 0.4568, "step": 11325 }, { - "epoch": 0.4, - "learning_rate": 4.9676556454768755e-05, - "loss": 0.3406, + "epoch": 0.4083324323350272, + "grad_norm": 0.19422537088394165, + "learning_rate": 4.965108477038835e-05, + "loss": 0.4705, "step": 11330 }, { - "epoch": 0.4, - "learning_rate": 4.9676099537849083e-05, - "loss": 0.3389, + "epoch": 0.4085126319962519, + "grad_norm": 0.15788643062114716, + "learning_rate": 4.965059876666733e-05, + "loss": 0.4606, "step": 11335 }, { - "epoch": 0.4, - "learning_rate": 4.967564230052653e-05, - "loss": 0.31, + "epoch": 0.4086928316574765, + "grad_norm": 0.15164151787757874, + "learning_rate": 4.965011242708667e-05, + "loss": 0.465, "step": 11340 }, { - "epoch": 0.4, - "learning_rate": 4.9675184742807025e-05, - "loss": 0.3074, + "epoch": 0.4088730313187011, + "grad_norm": 0.19079901278018951, + "learning_rate": 4.964962575165301e-05, + "loss": 0.4909, "step": 11345 }, { - "epoch": 0.4, - "learning_rate": 4.96747268646965e-05, - "loss": 0.3408, + "epoch": 0.40905323097992574, + "grad_norm": 0.1349342167377472, + "learning_rate": 4.964913874037296e-05, + "loss": 0.4571, "step": 11350 }, { - "epoch": 0.4, - "learning_rate": 4.967426866620091e-05, - "loss": 0.3086, + "epoch": 0.4092334306411504, + "grad_norm": 0.1663023829460144, + "learning_rate": 4.9648651393253176e-05, + "loss": 0.461, "step": 11355 }, { - "epoch": 0.4, - "learning_rate": 4.967381014732621e-05, - "loss": 0.329, + "epoch": 0.40941363030237504, + "grad_norm": 0.16813883185386658, + "learning_rate": 4.964816371030029e-05, + "loss": 0.408, "step": 11360 }, { - "epoch": 0.4, - "learning_rate": 4.9673351308078335e-05, - "loss": 0.3225, + "epoch": 0.40959382996359966, + "grad_norm": 0.15600088238716125, + "learning_rate": 4.964767569152093e-05, + "loss": 0.4358, "step": 11365 }, { - "epoch": 0.4, - "learning_rate": 4.9672892148463254e-05, - "loss": 0.3171, + "epoch": 0.4097740296248243, + "grad_norm": 0.14908957481384277, + "learning_rate": 4.964718733692178e-05, + "loss": 0.4046, "step": 11370 }, { - "epoch": 0.4, - "learning_rate": 4.967243266848694e-05, - "loss": 0.2968, + "epoch": 0.40995422928604897, + "grad_norm": 0.1925799548625946, + "learning_rate": 4.9646698646509465e-05, + "loss": 0.4426, "step": 11375 }, { - "epoch": 0.4, - "learning_rate": 4.967197286815534e-05, 
- "loss": 0.3255, + "epoch": 0.4101344289472736, + "grad_norm": 0.15564826130867004, + "learning_rate": 4.9646209620290654e-05, + "loss": 0.4438, "step": 11380 }, { - "epoch": 0.4, - "learning_rate": 4.9671512747474436e-05, - "loss": 0.313, + "epoch": 0.4103146286084982, + "grad_norm": 0.19330483675003052, + "learning_rate": 4.9645720258272014e-05, + "loss": 0.4536, "step": 11385 }, { - "epoch": 0.4, - "learning_rate": 4.967105230645019e-05, - "loss": 0.3233, + "epoch": 0.41049482826972283, + "grad_norm": 0.18074195086956024, + "learning_rate": 4.96452305604602e-05, + "loss": 0.4674, "step": 11390 }, { - "epoch": 0.4, - "learning_rate": 4.96705915450886e-05, - "loss": 0.3151, + "epoch": 0.4106750279309475, + "grad_norm": 0.20103107392787933, + "learning_rate": 4.964474052686189e-05, + "loss": 0.4257, "step": 11395 }, { - "epoch": 0.4, - "learning_rate": 4.9670130463395624e-05, - "loss": 0.3142, + "epoch": 0.41085522759217213, + "grad_norm": 0.14884856343269348, + "learning_rate": 4.9644250157483765e-05, + "loss": 0.4443, "step": 11400 }, { - "epoch": 0.4, - "learning_rate": 4.9669669061377266e-05, - "loss": 0.3363, + "epoch": 0.41103542725339676, + "grad_norm": 0.1957724243402481, + "learning_rate": 4.96437594523325e-05, + "loss": 0.4462, "step": 11405 }, { - "epoch": 0.4, - "learning_rate": 4.9669207339039514e-05, - "loss": 0.3107, + "epoch": 0.4112156269146214, + "grad_norm": 0.15725165605545044, + "learning_rate": 4.964326841141479e-05, + "loss": 0.4184, "step": 11410 }, { - "epoch": 0.4, - "learning_rate": 4.9668745296388366e-05, - "loss": 0.3451, + "epoch": 0.41139582657584606, + "grad_norm": 0.16227923333644867, + "learning_rate": 4.964277703473731e-05, + "loss": 0.4157, "step": 11415 }, { - "epoch": 0.4, - "learning_rate": 4.966828293342982e-05, - "loss": 0.3532, + "epoch": 0.4115760262370707, + "grad_norm": 0.1838981658220291, + "learning_rate": 4.9642285322306766e-05, + "loss": 0.4706, "step": 11420 }, { - "epoch": 0.4, - "learning_rate": 4.9667820250169874e-05, - "loss": 0.33, + "epoch": 0.4117562258982953, + "grad_norm": 0.19162507355213165, + "learning_rate": 4.9641793274129864e-05, + "loss": 0.4535, "step": 11425 }, { - "epoch": 0.4, - "learning_rate": 4.966735724661453e-05, - "loss": 0.3177, + "epoch": 0.4119364255595199, + "grad_norm": 0.14456330239772797, + "learning_rate": 4.964130089021329e-05, + "loss": 0.4752, "step": 11430 }, { - "epoch": 0.4, - "learning_rate": 4.966689392276982e-05, - "loss": 0.3225, + "epoch": 0.4121166252207446, + "grad_norm": 0.17111894488334656, + "learning_rate": 4.964080817056377e-05, + "loss": 0.4575, "step": 11435 }, { - "epoch": 0.4, - "learning_rate": 4.966643027864174e-05, - "loss": 0.3229, + "epoch": 0.4122968248819692, + "grad_norm": 0.16951219737529755, + "learning_rate": 4.9640315115188004e-05, + "loss": 0.4254, "step": 11440 }, { - "epoch": 0.4, - "learning_rate": 4.9665966314236324e-05, - "loss": 0.3148, + "epoch": 0.41247702454319385, + "grad_norm": 0.14794260263442993, + "learning_rate": 4.963982172409272e-05, + "loss": 0.4429, "step": 11445 }, { - "epoch": 0.4, - "learning_rate": 4.966550202955959e-05, - "loss": 0.3068, + "epoch": 0.41265722420441847, + "grad_norm": 0.21253348886966705, + "learning_rate": 4.963932799728462e-05, + "loss": 0.4842, "step": 11450 }, { - "epoch": 0.4, - "learning_rate": 4.966503742461756e-05, - "loss": 0.3421, + "epoch": 0.41283742386564315, + "grad_norm": 0.19845059514045715, + "learning_rate": 4.963883393477046e-05, + "loss": 0.4506, "step": 11455 }, { - "epoch": 0.4, - "learning_rate": 
4.966457249941629e-05, - "loss": 0.3407, + "epoch": 0.41301762352686777, + "grad_norm": 0.16377714276313782, + "learning_rate": 4.963833953655696e-05, + "loss": 0.43, "step": 11460 }, { - "epoch": 0.4, - "learning_rate": 4.9664107253961786e-05, - "loss": 0.3071, + "epoch": 0.4131978231880924, + "grad_norm": 0.13513842225074768, + "learning_rate": 4.963784480265085e-05, + "loss": 0.4142, "step": 11465 }, { - "epoch": 0.4, - "learning_rate": 4.966364168826011e-05, - "loss": 0.3285, + "epoch": 0.413378022849317, + "grad_norm": 0.20631147921085358, + "learning_rate": 4.963734973305887e-05, + "loss": 0.4484, "step": 11470 }, { - "epoch": 0.4, - "learning_rate": 4.9663175802317306e-05, - "loss": 0.3165, + "epoch": 0.4135582225105417, + "grad_norm": 0.14362797141075134, + "learning_rate": 4.963685432778777e-05, + "loss": 0.4563, "step": 11475 }, { - "epoch": 0.4, - "learning_rate": 4.966270959613941e-05, - "loss": 0.3275, + "epoch": 0.4137384221717663, + "grad_norm": 0.17292509973049164, + "learning_rate": 4.963635858684431e-05, + "loss": 0.4537, "step": 11480 }, { - "epoch": 0.4, - "learning_rate": 4.966224306973249e-05, - "loss": 0.3074, + "epoch": 0.41391862183299094, + "grad_norm": 0.19188092648983002, + "learning_rate": 4.963586251023523e-05, + "loss": 0.469, "step": 11485 }, { - "epoch": 0.4, - "learning_rate": 4.9661776223102583e-05, - "loss": 0.3402, + "epoch": 0.41409882149421556, + "grad_norm": 0.17507772147655487, + "learning_rate": 4.963536609796729e-05, + "loss": 0.424, "step": 11490 }, { - "epoch": 0.4, - "learning_rate": 4.966130905625578e-05, - "loss": 0.3486, + "epoch": 0.41427902115544024, + "grad_norm": 0.13874538242816925, + "learning_rate": 4.963486935004725e-05, + "loss": 0.457, "step": 11495 }, { - "epoch": 0.4, - "learning_rate": 4.966084156919813e-05, - "loss": 0.3687, + "epoch": 0.41445922081666486, + "grad_norm": 0.1583535075187683, + "learning_rate": 4.96343722664819e-05, + "loss": 0.4558, "step": 11500 }, { - "epoch": 0.4, - "eval_loss": 0.31692856550216675, - "eval_runtime": 10.5329, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 0.41445922081666486, + "eval_loss": 0.4750808775424957, + "eval_runtime": 3.597, + "eval_samples_per_second": 27.801, + "eval_steps_per_second": 6.95, "step": 11500 }, { - "epoch": 0.4, - "learning_rate": 4.966037376193569e-05, - "loss": 0.2997, + "epoch": 0.4146394204778895, + "grad_norm": 0.1701897829771042, + "learning_rate": 4.9633874847277985e-05, + "loss": 0.509, "step": 11505 }, { - "epoch": 0.4, - "learning_rate": 4.965990563447456e-05, - "loss": 0.3563, + "epoch": 0.41481962013911416, + "grad_norm": 0.20449738204479218, + "learning_rate": 4.9633377092442305e-05, + "loss": 0.4207, "step": 11510 }, { - "epoch": 0.41, - "learning_rate": 4.96594371868208e-05, - "loss": 0.2938, + "epoch": 0.4149998198003388, + "grad_norm": 0.21852335333824158, + "learning_rate": 4.9632879001981616e-05, + "loss": 0.4482, "step": 11515 }, { - "epoch": 0.41, - "learning_rate": 4.96589684189805e-05, - "loss": 0.309, + "epoch": 0.4151800194615634, + "grad_norm": 0.1580123007297516, + "learning_rate": 4.963238057590273e-05, + "loss": 0.4244, "step": 11520 }, { - "epoch": 0.41, - "learning_rate": 4.965849933095976e-05, - "loss": 0.3217, + "epoch": 0.41536021912278803, + "grad_norm": 0.18125052750110626, + "learning_rate": 4.963188181421243e-05, + "loss": 0.4599, "step": 11525 }, { - "epoch": 0.41, - "learning_rate": 4.965802992276464e-05, - "loss": 0.3492, + "epoch": 0.4155404187840127, + "grad_norm": 0.18217013776302338, + 
"learning_rate": 4.9631382716917504e-05, + "loss": 0.4347, "step": 11530 }, { - "epoch": 0.41, - "learning_rate": 4.9657560194401266e-05, - "loss": 0.3112, + "epoch": 0.41572061844523733, + "grad_norm": 0.1741897016763687, + "learning_rate": 4.9630883284024756e-05, + "loss": 0.428, "step": 11535 }, { - "epoch": 0.41, - "learning_rate": 4.9657090145875704e-05, - "loss": 0.3295, + "epoch": 0.41590081810646196, + "grad_norm": 0.20398588478565216, + "learning_rate": 4.9630383515541e-05, + "loss": 0.4587, "step": 11540 }, { - "epoch": 0.41, - "learning_rate": 4.965661977719409e-05, - "loss": 0.3108, + "epoch": 0.4160810177676866, + "grad_norm": 0.1447395384311676, + "learning_rate": 4.9629883411473025e-05, + "loss": 0.4406, "step": 11545 }, { - "epoch": 0.41, - "learning_rate": 4.965614908836251e-05, - "loss": 0.3004, + "epoch": 0.41626121742891126, + "grad_norm": 0.21012569963932037, + "learning_rate": 4.962938297182767e-05, + "loss": 0.4901, "step": 11550 }, { - "epoch": 0.41, - "learning_rate": 4.965567807938709e-05, - "loss": 0.3019, + "epoch": 0.4164414170901359, + "grad_norm": 0.17350612580776215, + "learning_rate": 4.962888219661173e-05, + "loss": 0.4447, "step": 11555 }, { - "epoch": 0.41, - "learning_rate": 4.965520675027394e-05, - "loss": 0.319, + "epoch": 0.4166216167513605, + "grad_norm": 0.167585551738739, + "learning_rate": 4.9628381085832046e-05, + "loss": 0.4391, "step": 11560 }, { - "epoch": 0.41, - "learning_rate": 4.965473510102918e-05, - "loss": 0.3118, + "epoch": 0.4168018164125851, + "grad_norm": 0.15864285826683044, + "learning_rate": 4.962787963949543e-05, + "loss": 0.4621, "step": 11565 }, { - "epoch": 0.41, - "learning_rate": 4.965426313165892e-05, - "loss": 0.3351, + "epoch": 0.4169820160738098, + "grad_norm": 0.20318114757537842, + "learning_rate": 4.9627377857608725e-05, + "loss": 0.4257, "step": 11570 }, { - "epoch": 0.41, - "learning_rate": 4.965379084216931e-05, - "loss": 0.3064, + "epoch": 0.4171622157350344, + "grad_norm": 0.1912168711423874, + "learning_rate": 4.962687574017877e-05, + "loss": 0.43, "step": 11575 }, { - "epoch": 0.41, - "learning_rate": 4.965331823256648e-05, - "loss": 0.3145, + "epoch": 0.41734241539625905, + "grad_norm": 0.1616230010986328, + "learning_rate": 4.962637328721239e-05, + "loss": 0.446, "step": 11580 }, { - "epoch": 0.41, - "learning_rate": 4.9652845302856545e-05, - "loss": 0.3336, + "epoch": 0.41752261505748367, + "grad_norm": 0.21484871208667755, + "learning_rate": 4.962587049871645e-05, + "loss": 0.4384, "step": 11585 }, { - "epoch": 0.41, - "learning_rate": 4.965237205304567e-05, - "loss": 0.298, + "epoch": 0.41770281471870835, + "grad_norm": 0.1611786037683487, + "learning_rate": 4.9625367374697795e-05, + "loss": 0.4294, "step": 11590 }, { - "epoch": 0.41, - "learning_rate": 4.965189848313999e-05, - "loss": 0.2922, + "epoch": 0.41788301437993297, + "grad_norm": 0.17779512703418732, + "learning_rate": 4.9624863915163275e-05, + "loss": 0.487, "step": 11595 }, { - "epoch": 0.41, - "learning_rate": 4.965142459314565e-05, - "loss": 0.2873, + "epoch": 0.4180632140411576, + "grad_norm": 0.16837961971759796, + "learning_rate": 4.962436012011975e-05, + "loss": 0.4363, "step": 11600 }, { - "epoch": 0.41, - "learning_rate": 4.9650950383068813e-05, - "loss": 0.3168, + "epoch": 0.4182434137023822, + "grad_norm": 0.13357023894786835, + "learning_rate": 4.9623855989574086e-05, + "loss": 0.4507, "step": 11605 }, { - "epoch": 0.41, - "learning_rate": 4.9650475852915625e-05, - "loss": 0.3136, + "epoch": 0.4184236133636069, + "grad_norm": 
0.13329064846038818, + "learning_rate": 4.9623351523533144e-05, + "loss": 0.4462, "step": 11610 }, { - "epoch": 0.41, - "learning_rate": 4.965000100269226e-05, - "loss": 0.3312, + "epoch": 0.4186038130248315, + "grad_norm": 0.16751320660114288, + "learning_rate": 4.962284672200381e-05, + "loss": 0.4459, "step": 11615 }, { - "epoch": 0.41, - "learning_rate": 4.964952583240487e-05, - "loss": 0.3107, + "epoch": 0.41878401268605614, + "grad_norm": 0.14165711402893066, + "learning_rate": 4.962234158499296e-05, + "loss": 0.4635, "step": 11620 }, { - "epoch": 0.41, - "learning_rate": 4.9649050342059635e-05, - "loss": 0.324, + "epoch": 0.41896421234728076, + "grad_norm": 0.20206506550312042, + "learning_rate": 4.9621836112507475e-05, + "loss": 0.487, "step": 11625 }, { - "epoch": 0.41, - "learning_rate": 4.9648574531662726e-05, - "loss": 0.3329, + "epoch": 0.41914441200850544, + "grad_norm": 0.20387201011180878, + "learning_rate": 4.9621330304554234e-05, + "loss": 0.4851, "step": 11630 }, { - "epoch": 0.41, - "learning_rate": 4.964809840122032e-05, - "loss": 0.3211, + "epoch": 0.41932461166973006, + "grad_norm": 0.154767245054245, + "learning_rate": 4.962082416114014e-05, + "loss": 0.4584, "step": 11635 }, { - "epoch": 0.41, - "learning_rate": 4.96476219507386e-05, - "loss": 0.3055, + "epoch": 0.4195048113309547, + "grad_norm": 0.1658494919538498, + "learning_rate": 4.962031768227208e-05, + "loss": 0.4511, "step": 11640 }, { - "epoch": 0.41, - "learning_rate": 4.964714518022375e-05, - "loss": 0.3302, + "epoch": 0.4196850109921793, + "grad_norm": 0.16036735475063324, + "learning_rate": 4.9619810867956954e-05, + "loss": 0.4439, "step": 11645 }, { - "epoch": 0.41, - "learning_rate": 4.964666808968197e-05, - "loss": 0.3407, + "epoch": 0.419865210653404, + "grad_norm": 0.18504783511161804, + "learning_rate": 4.9619303718201685e-05, + "loss": 0.4651, "step": 11650 }, { - "epoch": 0.41, - "learning_rate": 4.9646190679119445e-05, - "loss": 0.3278, + "epoch": 0.4200454103146286, + "grad_norm": 0.213784322142601, + "learning_rate": 4.9618796233013155e-05, + "loss": 0.4465, "step": 11655 }, { - "epoch": 0.41, - "learning_rate": 4.964571294854237e-05, - "loss": 0.3406, + "epoch": 0.42022560997585323, + "grad_norm": 0.15321993827819824, + "learning_rate": 4.96182884123983e-05, + "loss": 0.4583, "step": 11660 }, { - "epoch": 0.41, - "learning_rate": 4.964523489795696e-05, - "loss": 0.3247, + "epoch": 0.4204058096370779, + "grad_norm": 0.23537690937519073, + "learning_rate": 4.961778025636402e-05, + "loss": 0.5085, "step": 11665 }, { - "epoch": 0.41, - "learning_rate": 4.964475652736942e-05, - "loss": 0.3085, + "epoch": 0.42058600929830253, + "grad_norm": 0.17639757692813873, + "learning_rate": 4.961727176491726e-05, + "loss": 0.449, "step": 11670 }, { - "epoch": 0.41, - "learning_rate": 4.964427783678595e-05, - "loss": 0.3015, + "epoch": 0.42076620895952715, + "grad_norm": 0.14851510524749756, + "learning_rate": 4.9616762938064945e-05, + "loss": 0.4373, "step": 11675 }, { - "epoch": 0.41, - "learning_rate": 4.964379882621278e-05, - "loss": 0.3264, + "epoch": 0.4209464086207518, + "grad_norm": 0.19301354885101318, + "learning_rate": 4.961625377581399e-05, + "loss": 0.4564, "step": 11680 }, { - "epoch": 0.41, - "learning_rate": 4.964331949565611e-05, - "loss": 0.3442, + "epoch": 0.42112660828197646, + "grad_norm": 0.1511390209197998, + "learning_rate": 4.961574427817135e-05, + "loss": 0.4522, "step": 11685 }, { - "epoch": 0.41, - "learning_rate": 4.9642839845122195e-05, - "loss": 0.3321, + "epoch": 
0.4213068079432011, + "grad_norm": 0.16378375887870789, + "learning_rate": 4.9615234445143954e-05, + "loss": 0.4472, "step": 11690 }, { - "epoch": 0.41, - "learning_rate": 4.964235987461724e-05, - "loss": 0.3283, + "epoch": 0.4214870076044257, + "grad_norm": 0.14959801733493805, + "learning_rate": 4.961472427673875e-05, + "loss": 0.4596, "step": 11695 }, { - "epoch": 0.41, - "learning_rate": 4.9641879584147476e-05, - "loss": 0.2993, + "epoch": 0.4216672072656503, + "grad_norm": 0.16105423867702484, + "learning_rate": 4.961421377296271e-05, + "loss": 0.4472, "step": 11700 }, { - "epoch": 0.41, - "learning_rate": 4.964139897371915e-05, - "loss": 0.3119, + "epoch": 0.421847406926875, + "grad_norm": 0.18409523367881775, + "learning_rate": 4.9613702933822756e-05, + "loss": 0.4599, "step": 11705 }, { - "epoch": 0.41, - "learning_rate": 4.964091804333849e-05, - "loss": 0.3216, + "epoch": 0.4220276065880996, + "grad_norm": 0.215390145778656, + "learning_rate": 4.961319175932588e-05, + "loss": 0.4891, "step": 11710 }, { - "epoch": 0.41, - "learning_rate": 4.964043679301176e-05, - "loss": 0.313, + "epoch": 0.42220780624932425, + "grad_norm": 0.22909221053123474, + "learning_rate": 4.961268024947902e-05, + "loss": 0.4779, "step": 11715 }, { - "epoch": 0.41, - "learning_rate": 4.963995522274519e-05, - "loss": 0.333, + "epoch": 0.42238800591054887, + "grad_norm": 0.1938232034444809, + "learning_rate": 4.961216840428916e-05, + "loss": 0.4564, "step": 11720 }, { - "epoch": 0.41, - "learning_rate": 4.963947333254504e-05, - "loss": 0.3288, + "epoch": 0.42256820557177355, + "grad_norm": 0.18986928462982178, + "learning_rate": 4.961165622376327e-05, + "loss": 0.485, "step": 11725 }, { - "epoch": 0.41, - "learning_rate": 4.9638991122417564e-05, - "loss": 0.3167, + "epoch": 0.42274840523299817, + "grad_norm": 0.1635420173406601, + "learning_rate": 4.9611143707908336e-05, + "loss": 0.4641, "step": 11730 }, { - "epoch": 0.41, - "learning_rate": 4.963850859236904e-05, - "loss": 0.2891, + "epoch": 0.4229286048942228, + "grad_norm": 0.2149038463830948, + "learning_rate": 4.961063085673132e-05, + "loss": 0.5116, "step": 11735 }, { - "epoch": 0.41, - "learning_rate": 4.96380257424057e-05, - "loss": 0.3347, + "epoch": 0.4231088045554474, + "grad_norm": 0.15662051737308502, + "learning_rate": 4.9610117670239235e-05, + "loss": 0.4396, "step": 11740 }, { - "epoch": 0.41, - "learning_rate": 4.9637542572533844e-05, - "loss": 0.2952, + "epoch": 0.4232890042166721, + "grad_norm": 0.1819683313369751, + "learning_rate": 4.960960414843906e-05, + "loss": 0.4357, "step": 11745 }, { - "epoch": 0.41, - "learning_rate": 4.963705908275973e-05, - "loss": 0.2992, + "epoch": 0.4234692038778967, + "grad_norm": 0.17882205545902252, + "learning_rate": 4.960909029133779e-05, + "loss": 0.4215, "step": 11750 }, { - "epoch": 0.41, - "learning_rate": 4.9636575273089646e-05, - "loss": 0.3329, + "epoch": 0.42364940353912134, + "grad_norm": 0.1997404247522354, + "learning_rate": 4.9608576098942426e-05, + "loss": 0.4522, "step": 11755 }, { - "epoch": 0.41, - "learning_rate": 4.963609114352986e-05, - "loss": 0.2935, + "epoch": 0.42382960320034596, + "grad_norm": 0.18902002274990082, + "learning_rate": 4.960806157125998e-05, + "loss": 0.4362, "step": 11760 }, { - "epoch": 0.41, - "learning_rate": 4.963560669408668e-05, - "loss": 0.3331, + "epoch": 0.42400980286157064, + "grad_norm": 0.18085581064224243, + "learning_rate": 4.960754670829746e-05, + "loss": 0.4583, "step": 11765 }, { - "epoch": 0.41, - "learning_rate": 4.963512192476637e-05, - "loss": 
0.3127, + "epoch": 0.42419000252279526, + "grad_norm": 0.2524605989456177, + "learning_rate": 4.960703151006189e-05, + "loss": 0.4756, "step": 11770 }, { - "epoch": 0.41, - "learning_rate": 4.963463683557524e-05, - "loss": 0.3097, + "epoch": 0.4243702021840199, + "grad_norm": 0.1703946441411972, + "learning_rate": 4.9606515976560265e-05, + "loss": 0.4533, "step": 11775 }, { - "epoch": 0.41, - "learning_rate": 4.9634151426519584e-05, - "loss": 0.3086, + "epoch": 0.4245504018452445, + "grad_norm": 0.16210578382015228, + "learning_rate": 4.960600010779963e-05, + "loss": 0.4632, "step": 11780 }, { - "epoch": 0.41, - "learning_rate": 4.9633665697605707e-05, - "loss": 0.3225, + "epoch": 0.4247306015064692, + "grad_norm": 0.17518621683120728, + "learning_rate": 4.9605483903787006e-05, + "loss": 0.4919, "step": 11785 }, { - "epoch": 0.41, - "learning_rate": 4.9633179648839916e-05, - "loss": 0.3424, + "epoch": 0.4249108011676938, + "grad_norm": 0.15813525021076202, + "learning_rate": 4.960496736452943e-05, + "loss": 0.4522, "step": 11790 }, { - "epoch": 0.41, - "learning_rate": 4.963269328022853e-05, - "loss": 0.3198, + "epoch": 0.42509100082891843, + "grad_norm": 0.19452042877674103, + "learning_rate": 4.9604450490033936e-05, + "loss": 0.4526, "step": 11795 }, { - "epoch": 0.42, - "learning_rate": 4.963220659177784e-05, - "loss": 0.3151, + "epoch": 0.42527120049014305, + "grad_norm": 0.21348360180854797, + "learning_rate": 4.960393328030757e-05, + "loss": 0.4682, "step": 11800 }, { - "epoch": 0.42, - "learning_rate": 4.9631719583494184e-05, - "loss": 0.3088, + "epoch": 0.42545140015136773, + "grad_norm": 0.1495480090379715, + "learning_rate": 4.9603415735357374e-05, + "loss": 0.4668, "step": 11805 }, { - "epoch": 0.42, - "learning_rate": 4.9631232255383884e-05, - "loss": 0.3407, + "epoch": 0.42563159981259235, + "grad_norm": 0.1405077874660492, + "learning_rate": 4.96028978551904e-05, + "loss": 0.3944, "step": 11810 }, { - "epoch": 0.42, - "learning_rate": 4.963074460745327e-05, - "loss": 0.3167, + "epoch": 0.425811799473817, + "grad_norm": 0.18521526455879211, + "learning_rate": 4.96023796398137e-05, + "loss": 0.4723, "step": 11815 }, { - "epoch": 0.42, - "learning_rate": 4.9630256639708675e-05, - "loss": 0.3313, + "epoch": 0.4259919991350416, + "grad_norm": 0.22655785083770752, + "learning_rate": 4.9601861089234355e-05, + "loss": 0.4484, "step": 11820 }, { - "epoch": 0.42, - "learning_rate": 4.962976835215641e-05, - "loss": 0.3076, + "epoch": 0.4261721987962663, + "grad_norm": 0.1640099287033081, + "learning_rate": 4.9601342203459405e-05, + "loss": 0.4423, "step": 11825 }, { - "epoch": 0.42, - "learning_rate": 4.9629279744802845e-05, - "loss": 0.3118, + "epoch": 0.4263523984574909, + "grad_norm": 0.17364855110645294, + "learning_rate": 4.960082298249593e-05, + "loss": 0.4574, "step": 11830 }, { - "epoch": 0.42, - "learning_rate": 4.962879081765432e-05, - "loss": 0.3026, + "epoch": 0.4265325981187155, + "grad_norm": 0.18359048664569855, + "learning_rate": 4.9600303426351013e-05, + "loss": 0.4274, "step": 11835 }, { - "epoch": 0.42, - "learning_rate": 4.9628301570717176e-05, - "loss": 0.3041, + "epoch": 0.4267127977799402, + "grad_norm": 0.1665671169757843, + "learning_rate": 4.959978353503172e-05, + "loss": 0.4399, "step": 11840 }, { - "epoch": 0.42, - "learning_rate": 4.962781200399776e-05, - "loss": 0.3037, + "epoch": 0.4268929974411648, + "grad_norm": 0.1284245401620865, + "learning_rate": 4.959926330854514e-05, + "loss": 0.4541, "step": 11845 }, { - "epoch": 0.42, - "learning_rate": 
4.962732211750244e-05, - "loss": 0.3158, + "epoch": 0.42707319710238945, + "grad_norm": 0.19221022725105286, + "learning_rate": 4.9598742746898364e-05, + "loss": 0.4692, "step": 11850 }, { - "epoch": 0.42, - "learning_rate": 4.962683191123757e-05, - "loss": 0.307, + "epoch": 0.42725339676361407, + "grad_norm": 0.21203385293483734, + "learning_rate": 4.959822185009847e-05, + "loss": 0.4709, "step": 11855 }, { - "epoch": 0.42, - "learning_rate": 4.962634138520952e-05, - "loss": 0.3302, + "epoch": 0.42743359642483875, + "grad_norm": 0.15997375547885895, + "learning_rate": 4.959770061815258e-05, + "loss": 0.4242, "step": 11860 }, { - "epoch": 0.42, - "learning_rate": 4.9625850539424644e-05, - "loss": 0.3337, + "epoch": 0.42761379608606337, + "grad_norm": 0.19017674028873444, + "learning_rate": 4.959717905106777e-05, + "loss": 0.4596, "step": 11865 }, { - "epoch": 0.42, - "learning_rate": 4.9625359373889344e-05, - "loss": 0.3347, + "epoch": 0.427793995747288, + "grad_norm": 0.19879359006881714, + "learning_rate": 4.9596657148851154e-05, + "loss": 0.4626, "step": 11870 }, { - "epoch": 0.42, - "learning_rate": 4.962486788860997e-05, - "loss": 0.3301, + "epoch": 0.4279741954085126, + "grad_norm": 0.14506494998931885, + "learning_rate": 4.9596134911509865e-05, + "loss": 0.4507, "step": 11875 }, { - "epoch": 0.42, - "learning_rate": 4.962437608359293e-05, - "loss": 0.3024, + "epoch": 0.4281543950697373, + "grad_norm": 0.16767846047878265, + "learning_rate": 4.959561233905098e-05, + "loss": 0.4191, "step": 11880 }, { - "epoch": 0.42, - "learning_rate": 4.9623883958844574e-05, - "loss": 0.3066, + "epoch": 0.4283345947309619, + "grad_norm": 0.1973155289888382, + "learning_rate": 4.9595089431481645e-05, + "loss": 0.4705, "step": 11885 }, { - "epoch": 0.42, - "learning_rate": 4.9623391514371334e-05, - "loss": 0.3476, + "epoch": 0.42851479439218654, + "grad_norm": 0.16755364835262299, + "learning_rate": 4.9594566188808985e-05, + "loss": 0.421, "step": 11890 }, { - "epoch": 0.42, - "learning_rate": 4.962289875017957e-05, - "loss": 0.2938, + "epoch": 0.42869499405341116, + "grad_norm": 0.18635892868041992, + "learning_rate": 4.959404261104012e-05, + "loss": 0.4883, "step": 11895 }, { - "epoch": 0.42, - "learning_rate": 4.962240566627569e-05, - "loss": 0.3341, + "epoch": 0.42887519371463584, + "grad_norm": 0.18180792033672333, + "learning_rate": 4.959351869818218e-05, + "loss": 0.4775, "step": 11900 }, { - "epoch": 0.42, - "learning_rate": 4.96219122626661e-05, - "loss": 0.3252, + "epoch": 0.42905539337586046, + "grad_norm": 0.1798887848854065, + "learning_rate": 4.9592994450242316e-05, + "loss": 0.4618, "step": 11905 }, { - "epoch": 0.42, - "learning_rate": 4.962141853935721e-05, - "loss": 0.3255, + "epoch": 0.4292355930370851, + "grad_norm": 0.23241207003593445, + "learning_rate": 4.9592469867227655e-05, + "loss": 0.4479, "step": 11910 }, { - "epoch": 0.42, - "learning_rate": 4.962092449635543e-05, - "loss": 0.333, + "epoch": 0.4294157926983097, + "grad_norm": 0.18085531890392303, + "learning_rate": 4.959194494914537e-05, + "loss": 0.4617, "step": 11915 }, { - "epoch": 0.42, - "learning_rate": 4.9620430133667165e-05, - "loss": 0.3133, + "epoch": 0.4295959923595344, + "grad_norm": 0.1726156622171402, + "learning_rate": 4.9591419696002575e-05, + "loss": 0.4501, "step": 11920 }, { - "epoch": 0.42, - "learning_rate": 4.961993545129884e-05, - "loss": 0.3345, + "epoch": 0.429776192020759, + "grad_norm": 0.16227740049362183, + "learning_rate": 4.9590894107806454e-05, + "loss": 0.4358, "step": 11925 }, { - 
"epoch": 0.42, - "learning_rate": 4.961944044925688e-05, - "loss": 0.307, + "epoch": 0.42995639168198363, + "grad_norm": 0.15402673184871674, + "learning_rate": 4.959036818456417e-05, + "loss": 0.44, "step": 11930 }, { - "epoch": 0.42, - "learning_rate": 4.9618945127547715e-05, - "loss": 0.3097, + "epoch": 0.43013659134320825, + "grad_norm": 0.1438482403755188, + "learning_rate": 4.958984192628288e-05, + "loss": 0.463, "step": 11935 }, { - "epoch": 0.42, - "learning_rate": 4.961844948617777e-05, - "loss": 0.3506, + "epoch": 0.43031679100443293, + "grad_norm": 0.18575410544872284, + "learning_rate": 4.958931533296975e-05, + "loss": 0.4778, "step": 11940 }, { - "epoch": 0.42, - "learning_rate": 4.9617953525153485e-05, - "loss": 0.3049, + "epoch": 0.43049699066565755, + "grad_norm": 0.17367704212665558, + "learning_rate": 4.958878840463196e-05, + "loss": 0.4159, "step": 11945 }, { - "epoch": 0.42, - "learning_rate": 4.9617457244481294e-05, - "loss": 0.3365, + "epoch": 0.4306771903268822, + "grad_norm": 0.1562519371509552, + "learning_rate": 4.958826114127668e-05, + "loss": 0.4685, "step": 11950 }, { - "epoch": 0.42, - "learning_rate": 4.961696064416764e-05, - "loss": 0.3425, + "epoch": 0.4308573899881068, + "grad_norm": 0.13806965947151184, + "learning_rate": 4.958773354291111e-05, + "loss": 0.4313, "step": 11955 }, { - "epoch": 0.42, - "learning_rate": 4.961646372421898e-05, - "loss": 0.3385, + "epoch": 0.4310375896493315, + "grad_norm": 0.19402550160884857, + "learning_rate": 4.958720560954243e-05, + "loss": 0.4232, "step": 11960 }, { - "epoch": 0.42, - "learning_rate": 4.961596648464176e-05, - "loss": 0.3063, + "epoch": 0.4312177893105561, + "grad_norm": 0.18233174085617065, + "learning_rate": 4.958667734117784e-05, + "loss": 0.4742, "step": 11965 }, { - "epoch": 0.42, - "learning_rate": 4.9615468925442433e-05, - "loss": 0.306, + "epoch": 0.4313979889717807, + "grad_norm": 0.16293558478355408, + "learning_rate": 4.958614873782452e-05, + "loss": 0.4507, "step": 11970 }, { - "epoch": 0.42, - "learning_rate": 4.961497104662747e-05, - "loss": 0.3161, + "epoch": 0.43157818863300534, + "grad_norm": 0.23064178228378296, + "learning_rate": 4.95856197994897e-05, + "loss": 0.4566, "step": 11975 }, { - "epoch": 0.42, - "learning_rate": 4.961447284820333e-05, - "loss": 0.3016, + "epoch": 0.43175838829423, + "grad_norm": 0.14595326781272888, + "learning_rate": 4.958509052618055e-05, + "loss": 0.4589, "step": 11980 }, { - "epoch": 0.42, - "learning_rate": 4.9613974330176484e-05, - "loss": 0.3038, + "epoch": 0.43193858795545464, + "grad_norm": 0.13431671261787415, + "learning_rate": 4.958456091790431e-05, + "loss": 0.4459, "step": 11985 }, { - "epoch": 0.42, - "learning_rate": 4.96134754925534e-05, - "loss": 0.3315, + "epoch": 0.43211878761667927, + "grad_norm": 0.17889688909053802, + "learning_rate": 4.9584030974668195e-05, + "loss": 0.441, "step": 11990 }, { - "epoch": 0.42, - "learning_rate": 4.9612976335340556e-05, - "loss": 0.3475, + "epoch": 0.4322989872779039, + "grad_norm": 0.18637695908546448, + "learning_rate": 4.958350069647941e-05, + "loss": 0.4372, "step": 11995 }, { - "epoch": 0.42, - "learning_rate": 4.961247685854443e-05, - "loss": 0.3028, + "epoch": 0.43247918693912857, + "grad_norm": 0.20668061077594757, + "learning_rate": 4.958297008334519e-05, + "loss": 0.4777, "step": 12000 }, { - "epoch": 0.42, - "eval_loss": 0.31634584069252014, - "eval_runtime": 10.5394, - "eval_samples_per_second": 9.488, - "eval_steps_per_second": 9.488, + "epoch": 0.43247918693912857, + "eval_loss": 
0.474940687417984, + "eval_runtime": 3.5638, + "eval_samples_per_second": 28.06, + "eval_steps_per_second": 7.015, "step": 12000 }, { - "epoch": 0.42, - "learning_rate": 4.961197706217152e-05, - "loss": 0.3324, + "epoch": 0.4326593866003532, + "grad_norm": 0.2014397382736206, + "learning_rate": 4.958243913527276e-05, + "loss": 0.4991, "step": 12005 }, { - "epoch": 0.42, - "learning_rate": 4.961147694622831e-05, - "loss": 0.3144, + "epoch": 0.4328395862615778, + "grad_norm": 0.17615807056427002, + "learning_rate": 4.958190785226936e-05, + "loss": 0.47, "step": 12010 }, { - "epoch": 0.42, - "learning_rate": 4.961097651072128e-05, - "loss": 0.3181, + "epoch": 0.4330197859228025, + "grad_norm": 0.20073723793029785, + "learning_rate": 4.958137623434222e-05, + "loss": 0.4421, "step": 12015 }, { - "epoch": 0.42, - "learning_rate": 4.961047575565695e-05, - "loss": 0.3045, + "epoch": 0.4331999855840271, + "grad_norm": 0.15735125541687012, + "learning_rate": 4.958084428149859e-05, + "loss": 0.4472, "step": 12020 }, { - "epoch": 0.42, - "learning_rate": 4.96099746810418e-05, - "loss": 0.3209, + "epoch": 0.43338018524525174, + "grad_norm": 0.21121670305728912, + "learning_rate": 4.9580311993745715e-05, + "loss": 0.451, "step": 12025 }, { - "epoch": 0.42, - "learning_rate": 4.960947328688236e-05, - "loss": 0.3149, + "epoch": 0.43356038490647636, + "grad_norm": 0.17626595497131348, + "learning_rate": 4.957977937109085e-05, + "loss": 0.4332, "step": 12030 }, { - "epoch": 0.42, - "learning_rate": 4.960897157318512e-05, - "loss": 0.3207, + "epoch": 0.43374058456770104, + "grad_norm": 0.18630550801753998, + "learning_rate": 4.9579246413541245e-05, + "loss": 0.4434, "step": 12035 }, { - "epoch": 0.42, - "learning_rate": 4.96084695399566e-05, - "loss": 0.3323, + "epoch": 0.43392078422892566, + "grad_norm": 0.17722322046756744, + "learning_rate": 4.957871312110417e-05, + "loss": 0.4437, "step": 12040 }, { - "epoch": 0.42, - "learning_rate": 4.960796718720333e-05, - "loss": 0.3058, + "epoch": 0.4341009838901503, + "grad_norm": 0.1969458907842636, + "learning_rate": 4.9578179493786884e-05, + "loss": 0.4555, "step": 12045 }, { - "epoch": 0.42, - "learning_rate": 4.9607464514931824e-05, - "loss": 0.3216, + "epoch": 0.4342811835513749, + "grad_norm": 0.19826200604438782, + "learning_rate": 4.9577645531596666e-05, + "loss": 0.4186, "step": 12050 }, { - "epoch": 0.42, - "learning_rate": 4.9606961523148597e-05, - "loss": 0.3235, + "epoch": 0.4344613832125996, + "grad_norm": 0.2032472789287567, + "learning_rate": 4.957711123454079e-05, + "loss": 0.4377, "step": 12055 }, { - "epoch": 0.42, - "learning_rate": 4.96064582118602e-05, - "loss": 0.3093, + "epoch": 0.4346415828738242, + "grad_norm": 0.20564451813697815, + "learning_rate": 4.957657660262652e-05, + "loss": 0.4529, "step": 12060 }, { - "epoch": 0.42, - "learning_rate": 4.960595458107315e-05, - "loss": 0.3025, + "epoch": 0.43482178253504883, + "grad_norm": 0.2119935005903244, + "learning_rate": 4.957604163586116e-05, + "loss": 0.4644, "step": 12065 }, { - "epoch": 0.42, - "learning_rate": 4.9605450630794e-05, - "loss": 0.3243, + "epoch": 0.43500198219627345, + "grad_norm": 0.15134219825267792, + "learning_rate": 4.9575506334251984e-05, + "loss": 0.4497, "step": 12070 }, { - "epoch": 0.42, - "learning_rate": 4.960494636102929e-05, - "loss": 0.334, + "epoch": 0.43518218185749813, + "grad_norm": 0.19318123161792755, + "learning_rate": 4.95749706978063e-05, + "loss": 0.4525, "step": 12075 }, { - "epoch": 0.43, - "learning_rate": 4.960444177178557e-05, - "loss": 0.336, 
+ "epoch": 0.43536238151872275, + "grad_norm": 0.19543355703353882, + "learning_rate": 4.9574434726531395e-05, + "loss": 0.4887, "step": 12080 }, { - "epoch": 0.43, - "learning_rate": 4.960393686306939e-05, - "loss": 0.3214, + "epoch": 0.4355425811799474, + "grad_norm": 0.17110168933868408, + "learning_rate": 4.957389842043457e-05, + "loss": 0.4431, "step": 12085 }, { - "epoch": 0.43, - "learning_rate": 4.9603431634887296e-05, - "loss": 0.3246, + "epoch": 0.435722780841172, + "grad_norm": 0.18978413939476013, + "learning_rate": 4.957336177952314e-05, + "loss": 0.46, "step": 12090 }, { - "epoch": 0.43, - "learning_rate": 4.960292608724587e-05, - "loss": 0.3268, + "epoch": 0.4359029805023967, + "grad_norm": 0.22278402745723724, + "learning_rate": 4.957282480380442e-05, + "loss": 0.4595, "step": 12095 }, { - "epoch": 0.43, - "learning_rate": 4.960242022015165e-05, - "loss": 0.3246, + "epoch": 0.4360831801636213, + "grad_norm": 0.17783726751804352, + "learning_rate": 4.957228749328571e-05, + "loss": 0.4289, "step": 12100 }, { - "epoch": 0.43, - "learning_rate": 4.960191403361122e-05, - "loss": 0.3349, + "epoch": 0.4362633798248459, + "grad_norm": 0.2311362475156784, + "learning_rate": 4.957174984797434e-05, + "loss": 0.4326, "step": 12105 }, { - "epoch": 0.43, - "learning_rate": 4.960140752763115e-05, - "loss": 0.3367, + "epoch": 0.43644357948607054, + "grad_norm": 0.1933164894580841, + "learning_rate": 4.957121186787764e-05, + "loss": 0.4614, "step": 12110 }, { - "epoch": 0.43, - "learning_rate": 4.9600900702218016e-05, - "loss": 0.3581, + "epoch": 0.4366237791472952, + "grad_norm": 0.1860789954662323, + "learning_rate": 4.957067355300293e-05, + "loss": 0.4762, "step": 12115 }, { - "epoch": 0.43, - "learning_rate": 4.96003935573784e-05, - "loss": 0.3233, + "epoch": 0.43680397880851984, + "grad_norm": 0.18945522606372833, + "learning_rate": 4.9570134903357556e-05, + "loss": 0.4207, "step": 12120 }, { - "epoch": 0.43, - "learning_rate": 4.9599886093118887e-05, - "loss": 0.3331, + "epoch": 0.43698417846974447, + "grad_norm": 0.21275950968265533, + "learning_rate": 4.956959591894885e-05, + "loss": 0.4836, "step": 12125 }, { - "epoch": 0.43, - "learning_rate": 4.9599378309446063e-05, - "loss": 0.3189, + "epoch": 0.4371643781309691, + "grad_norm": 0.18478412926197052, + "learning_rate": 4.956905659978416e-05, + "loss": 0.4406, "step": 12130 }, { - "epoch": 0.43, - "learning_rate": 4.959887020636652e-05, - "loss": 0.3251, + "epoch": 0.43734457779219377, + "grad_norm": 0.16761377453804016, + "learning_rate": 4.9568516945870825e-05, + "loss": 0.4556, "step": 12135 }, { - "epoch": 0.43, - "learning_rate": 4.959836178388687e-05, - "loss": 0.3252, + "epoch": 0.4375247774534184, + "grad_norm": 0.14763802289962769, + "learning_rate": 4.9567976957216204e-05, + "loss": 0.4445, "step": 12140 }, { - "epoch": 0.43, - "learning_rate": 4.9597853042013694e-05, - "loss": 0.3224, + "epoch": 0.437704977114643, + "grad_norm": 0.1819683164358139, + "learning_rate": 4.956743663382766e-05, + "loss": 0.4411, "step": 12145 }, { - "epoch": 0.43, - "learning_rate": 4.959734398075361e-05, - "loss": 0.3192, + "epoch": 0.43788517677586763, + "grad_norm": 0.1549748033285141, + "learning_rate": 4.9566895975712533e-05, + "loss": 0.444, "step": 12150 }, { - "epoch": 0.43, - "learning_rate": 4.959683460011322e-05, - "loss": 0.3215, + "epoch": 0.4380653764370923, + "grad_norm": 0.16789652407169342, + "learning_rate": 4.9566354982878215e-05, + "loss": 0.4636, "step": 12155 }, { - "epoch": 0.43, - "learning_rate": 
4.9596324900099145e-05, - "loss": 0.3254, + "epoch": 0.43824557609831694, + "grad_norm": 0.15570096671581268, + "learning_rate": 4.956581365533207e-05, + "loss": 0.4223, "step": 12160 }, { - "epoch": 0.43, - "learning_rate": 4.9595814880718e-05, - "loss": 0.3287, + "epoch": 0.43842577575954156, + "grad_norm": 0.20014168322086334, + "learning_rate": 4.956527199308146e-05, + "loss": 0.429, "step": 12165 }, { - "epoch": 0.43, - "learning_rate": 4.959530454197641e-05, - "loss": 0.3454, + "epoch": 0.43860597542076624, + "grad_norm": 0.2379717230796814, + "learning_rate": 4.956472999613379e-05, + "loss": 0.4862, "step": 12170 }, { - "epoch": 0.43, - "learning_rate": 4.9594793883881e-05, - "loss": 0.3208, + "epoch": 0.43878617508199086, + "grad_norm": 0.1970072239637375, + "learning_rate": 4.956418766449642e-05, + "loss": 0.4299, "step": 12175 }, { - "epoch": 0.43, - "learning_rate": 4.95942829064384e-05, - "loss": 0.3358, + "epoch": 0.4389663747432155, + "grad_norm": 0.19804427027702332, + "learning_rate": 4.956364499817674e-05, + "loss": 0.4662, "step": 12180 }, { - "epoch": 0.43, - "learning_rate": 4.9593771609655244e-05, - "loss": 0.3224, + "epoch": 0.4391465744044401, + "grad_norm": 0.18238474428653717, + "learning_rate": 4.956310199718217e-05, + "loss": 0.4628, "step": 12185 }, { - "epoch": 0.43, - "learning_rate": 4.9593259993538164e-05, - "loss": 0.3418, + "epoch": 0.4393267740656648, + "grad_norm": 0.1626834273338318, + "learning_rate": 4.956255866152008e-05, + "loss": 0.445, "step": 12190 }, { - "epoch": 0.43, - "learning_rate": 4.9592748058093824e-05, - "loss": 0.329, + "epoch": 0.4395069737268894, + "grad_norm": 0.18369080126285553, + "learning_rate": 4.956201499119788e-05, + "loss": 0.4526, "step": 12195 }, { - "epoch": 0.43, - "learning_rate": 4.959223580332885e-05, - "loss": 0.3275, + "epoch": 0.439687173388114, + "grad_norm": 0.1799646019935608, + "learning_rate": 4.956147098622299e-05, + "loss": 0.4295, "step": 12200 }, { - "epoch": 0.43, - "learning_rate": 4.9591723229249905e-05, - "loss": 0.3173, + "epoch": 0.43986737304933865, + "grad_norm": 0.13832755386829376, + "learning_rate": 4.9560926646602813e-05, + "loss": 0.4452, "step": 12205 }, { - "epoch": 0.43, - "learning_rate": 4.9591210335863635e-05, - "loss": 0.3148, + "epoch": 0.44004757271056333, + "grad_norm": 0.1626981645822525, + "learning_rate": 4.9560381972344765e-05, + "loss": 0.4387, "step": 12210 }, { - "epoch": 0.43, - "learning_rate": 4.959069712317671e-05, - "loss": 0.3237, + "epoch": 0.44022777237178795, + "grad_norm": 0.15181361138820648, + "learning_rate": 4.955983696345626e-05, + "loss": 0.4615, "step": 12215 }, { - "epoch": 0.43, - "learning_rate": 4.9590183591195785e-05, - "loss": 0.3236, + "epoch": 0.4404079720330126, + "grad_norm": 0.1977403461933136, + "learning_rate": 4.955929161994474e-05, + "loss": 0.429, "step": 12220 }, { - "epoch": 0.43, - "learning_rate": 4.9589669739927534e-05, - "loss": 0.3257, + "epoch": 0.4405881716942372, + "grad_norm": 0.1452832669019699, + "learning_rate": 4.955874594181763e-05, + "loss": 0.4529, "step": 12225 }, { - "epoch": 0.43, - "learning_rate": 4.958915556937862e-05, - "loss": 0.314, + "epoch": 0.4407683713554619, + "grad_norm": 0.15825548768043518, + "learning_rate": 4.955819992908235e-05, + "loss": 0.4748, "step": 12230 }, { - "epoch": 0.43, - "learning_rate": 4.9588641079555734e-05, - "loss": 0.3264, + "epoch": 0.4409485710166865, + "grad_norm": 0.17782393097877502, + "learning_rate": 4.9557653581746355e-05, + "loss": 0.4872, "step": 12235 }, { - "epoch": 0.43, - 
"learning_rate": 4.958812627046555e-05, - "loss": 0.341, + "epoch": 0.4411287706779111, + "grad_norm": 0.18894019722938538, + "learning_rate": 4.955710689981708e-05, + "loss": 0.4215, "step": 12240 }, { - "epoch": 0.43, - "learning_rate": 4.9587611142114744e-05, - "loss": 0.3338, + "epoch": 0.44130897033913574, + "grad_norm": 0.18643000721931458, + "learning_rate": 4.955655988330199e-05, + "loss": 0.4314, "step": 12245 }, { - "epoch": 0.43, - "learning_rate": 4.958709569451001e-05, - "loss": 0.3163, + "epoch": 0.4414891700003604, + "grad_norm": 0.17689816653728485, + "learning_rate": 4.955601253220852e-05, + "loss": 0.4323, "step": 12250 }, { - "epoch": 0.43, - "learning_rate": 4.958657992765805e-05, - "loss": 0.317, + "epoch": 0.44166936966158504, + "grad_norm": 0.19838203489780426, + "learning_rate": 4.955546484654413e-05, + "loss": 0.47, "step": 12255 }, { - "epoch": 0.43, - "learning_rate": 4.958606384156554e-05, - "loss": 0.3226, + "epoch": 0.44184956932280967, + "grad_norm": 0.17451122403144836, + "learning_rate": 4.955491682631629e-05, + "loss": 0.4202, "step": 12260 }, { - "epoch": 0.43, - "learning_rate": 4.958554743623921e-05, - "loss": 0.33, + "epoch": 0.4420297689840343, + "grad_norm": 0.16732750833034515, + "learning_rate": 4.955436847153246e-05, + "loss": 0.4592, "step": 12265 }, { - "epoch": 0.43, - "learning_rate": 4.9585030711685745e-05, - "loss": 0.318, + "epoch": 0.44220996864525897, + "grad_norm": 0.17255975306034088, + "learning_rate": 4.955381978220011e-05, + "loss": 0.4607, "step": 12270 }, { - "epoch": 0.43, - "learning_rate": 4.958451366791185e-05, - "loss": 0.3325, + "epoch": 0.4423901683064836, + "grad_norm": 0.20615766942501068, + "learning_rate": 4.955327075832672e-05, + "loss": 0.4488, "step": 12275 }, { - "epoch": 0.43, - "learning_rate": 4.958399630492425e-05, - "loss": 0.3203, + "epoch": 0.4425703679677082, + "grad_norm": 0.16766251623630524, + "learning_rate": 4.955272139991978e-05, + "loss": 0.4407, "step": 12280 }, { - "epoch": 0.43, - "learning_rate": 4.958347862272966e-05, - "loss": 0.3243, + "epoch": 0.44275056762893283, + "grad_norm": 0.16281658411026, + "learning_rate": 4.955217170698675e-05, + "loss": 0.4417, "step": 12285 }, { - "epoch": 0.43, - "learning_rate": 4.9582960621334805e-05, - "loss": 0.355, + "epoch": 0.4429307672901575, + "grad_norm": 0.1925809234380722, + "learning_rate": 4.955162167953514e-05, + "loss": 0.4254, "step": 12290 }, { - "epoch": 0.43, - "learning_rate": 4.95824423007464e-05, - "loss": 0.3308, + "epoch": 0.44311096695138213, + "grad_norm": 0.15888631343841553, + "learning_rate": 4.955107131757244e-05, + "loss": 0.4305, "step": 12295 }, { - "epoch": 0.43, - "learning_rate": 4.9581923660971186e-05, - "loss": 0.3271, + "epoch": 0.44329116661260676, + "grad_norm": 0.153702974319458, + "learning_rate": 4.955052062110615e-05, + "loss": 0.4306, "step": 12300 }, { - "epoch": 0.43, - "learning_rate": 4.9581404702015896e-05, - "loss": 0.3112, + "epoch": 0.4434713662738314, + "grad_norm": 0.190146803855896, + "learning_rate": 4.9549969590143765e-05, + "loss": 0.4711, "step": 12305 }, { - "epoch": 0.43, - "learning_rate": 4.958088542388726e-05, - "loss": 0.3161, + "epoch": 0.44365156593505606, + "grad_norm": 0.150401771068573, + "learning_rate": 4.9549418224692795e-05, + "loss": 0.4323, "step": 12310 }, { - "epoch": 0.43, - "learning_rate": 4.958036582659204e-05, - "loss": 0.3227, + "epoch": 0.4438317655962807, + "grad_norm": 0.17060506343841553, + "learning_rate": 4.954886652476076e-05, + "loss": 0.4777, "step": 12315 }, { - 
"epoch": 0.43, - "learning_rate": 4.9579845910136955e-05, - "loss": 0.3271, + "epoch": 0.4440119652575053, + "grad_norm": 0.16213923692703247, + "learning_rate": 4.9548314490355165e-05, + "loss": 0.4736, "step": 12320 }, { - "epoch": 0.43, - "learning_rate": 4.957932567452876e-05, - "loss": 0.3227, + "epoch": 0.4441921649187299, + "grad_norm": 0.17394715547561646, + "learning_rate": 4.954776212148354e-05, + "loss": 0.4731, "step": 12325 }, { - "epoch": 0.43, - "learning_rate": 4.957880511977424e-05, - "loss": 0.3203, + "epoch": 0.4443723645799546, + "grad_norm": 0.16405607759952545, + "learning_rate": 4.95472094181534e-05, + "loss": 0.42, "step": 12330 }, { - "epoch": 0.43, - "learning_rate": 4.957828424588012e-05, - "loss": 0.3467, + "epoch": 0.4445525642411792, + "grad_norm": 0.13627347350120544, + "learning_rate": 4.9546656380372306e-05, + "loss": 0.4613, "step": 12335 }, { - "epoch": 0.43, - "learning_rate": 4.9577763052853186e-05, - "loss": 0.3006, + "epoch": 0.44473276390240385, + "grad_norm": 0.19392867386341095, + "learning_rate": 4.954610300814776e-05, + "loss": 0.4315, "step": 12340 }, { - "epoch": 0.43, - "learning_rate": 4.957724154070019e-05, - "loss": 0.3177, + "epoch": 0.4449129635636285, + "grad_norm": 0.14533652365207672, + "learning_rate": 4.9545549301487306e-05, + "loss": 0.467, "step": 12345 }, { - "epoch": 0.43, - "learning_rate": 4.957671970942792e-05, - "loss": 0.3384, + "epoch": 0.44509316322485315, + "grad_norm": 0.17351950705051422, + "learning_rate": 4.95449952603985e-05, + "loss": 0.432, "step": 12350 }, { - "epoch": 0.43, - "learning_rate": 4.9576197559043134e-05, - "loss": 0.3048, + "epoch": 0.4452733628860778, + "grad_norm": 0.1487894058227539, + "learning_rate": 4.9544440884888885e-05, + "loss": 0.4856, "step": 12355 }, { - "epoch": 0.43, - "learning_rate": 4.957567508955262e-05, - "loss": 0.3106, + "epoch": 0.4454535625473024, + "grad_norm": 0.16079506278038025, + "learning_rate": 4.954388617496602e-05, + "loss": 0.4793, "step": 12360 }, { - "epoch": 0.44, - "learning_rate": 4.9575152300963154e-05, - "loss": 0.2756, + "epoch": 0.4456337622085271, + "grad_norm": 0.16144710779190063, + "learning_rate": 4.954333113063745e-05, + "loss": 0.4313, "step": 12365 }, { - "epoch": 0.44, - "learning_rate": 4.957462919328154e-05, - "loss": 0.3165, + "epoch": 0.4458139618697517, + "grad_norm": 0.1664080023765564, + "learning_rate": 4.9542775751910755e-05, + "loss": 0.4207, "step": 12370 }, { - "epoch": 0.44, - "learning_rate": 4.9574105766514555e-05, - "loss": 0.3283, + "epoch": 0.4459941615309763, + "grad_norm": 0.16213445365428925, + "learning_rate": 4.954222003879349e-05, + "loss": 0.4896, "step": 12375 }, { - "epoch": 0.44, - "learning_rate": 4.9573582020669007e-05, - "loss": 0.3237, + "epoch": 0.44617436119220094, + "grad_norm": 0.15756379067897797, + "learning_rate": 4.954166399129322e-05, + "loss": 0.4438, "step": 12380 }, { - "epoch": 0.44, - "learning_rate": 4.957305795575169e-05, - "loss": 0.3246, + "epoch": 0.4463545608534256, + "grad_norm": 0.21012753248214722, + "learning_rate": 4.954110760941754e-05, + "loss": 0.4689, "step": 12385 }, { - "epoch": 0.44, - "learning_rate": 4.957253357176941e-05, - "loss": 0.3304, + "epoch": 0.44653476051465024, + "grad_norm": 0.1980627030134201, + "learning_rate": 4.954055089317401e-05, + "loss": 0.4605, "step": 12390 }, { - "epoch": 0.44, - "learning_rate": 4.957200886872897e-05, - "loss": 0.3393, + "epoch": 0.44671496017587486, + "grad_norm": 0.19075866043567657, + "learning_rate": 4.9539993842570226e-05, + "loss": 0.4795, 
"step": 12395 }, { - "epoch": 0.44, - "learning_rate": 4.957148384663719e-05, - "loss": 0.307, + "epoch": 0.4468951598370995, + "grad_norm": 0.19187109172344208, + "learning_rate": 4.953943645761378e-05, + "loss": 0.4509, "step": 12400 }, { - "epoch": 0.44, - "learning_rate": 4.957095850550089e-05, - "loss": 0.3274, + "epoch": 0.44707535949832417, + "grad_norm": 0.16127610206604004, + "learning_rate": 4.9538878738312265e-05, + "loss": 0.4858, "step": 12405 }, { - "epoch": 0.44, - "learning_rate": 4.9570432845326884e-05, - "loss": 0.3076, + "epoch": 0.4472555591595488, + "grad_norm": 0.1766381561756134, + "learning_rate": 4.953832068467328e-05, + "loss": 0.4753, "step": 12410 }, { - "epoch": 0.44, - "learning_rate": 4.9569906866122e-05, - "loss": 0.3041, + "epoch": 0.4474357588207734, + "grad_norm": 0.16287901997566223, + "learning_rate": 4.953776229670442e-05, + "loss": 0.4756, "step": 12415 }, { - "epoch": 0.44, - "learning_rate": 4.956938056789306e-05, - "loss": 0.309, + "epoch": 0.44761595848199803, + "grad_norm": 0.15042343735694885, + "learning_rate": 4.9537203574413305e-05, + "loss": 0.408, "step": 12420 }, { - "epoch": 0.44, - "learning_rate": 4.956885395064692e-05, - "loss": 0.3154, + "epoch": 0.4477961581432227, + "grad_norm": 0.18103763461112976, + "learning_rate": 4.953664451780754e-05, + "loss": 0.4455, "step": 12425 }, { - "epoch": 0.44, - "learning_rate": 4.956832701439039e-05, - "loss": 0.3421, + "epoch": 0.44797635780444733, + "grad_norm": 0.23670852184295654, + "learning_rate": 4.953608512689474e-05, + "loss": 0.4823, "step": 12430 }, { - "epoch": 0.44, - "learning_rate": 4.9567799759130335e-05, - "loss": 0.3171, + "epoch": 0.44815655746567196, + "grad_norm": 0.21633177995681763, + "learning_rate": 4.9535525401682535e-05, + "loss": 0.4432, "step": 12435 }, { - "epoch": 0.44, - "learning_rate": 4.9567272184873586e-05, - "loss": 0.3307, + "epoch": 0.4483367571268966, + "grad_norm": 0.178372323513031, + "learning_rate": 4.9534965342178546e-05, + "loss": 0.4945, "step": 12440 }, { - "epoch": 0.44, - "learning_rate": 4.9566744291627e-05, - "loss": 0.3093, + "epoch": 0.44851695678812126, + "grad_norm": 0.2000657618045807, + "learning_rate": 4.95344049483904e-05, + "loss": 0.5071, "step": 12445 }, { - "epoch": 0.44, - "learning_rate": 4.956621607939742e-05, - "loss": 0.2988, + "epoch": 0.4486971564493459, + "grad_norm": 0.15917164087295532, + "learning_rate": 4.953384422032574e-05, + "loss": 0.4075, "step": 12450 }, { - "epoch": 0.44, - "learning_rate": 4.956568754819173e-05, - "loss": 0.2914, + "epoch": 0.4488773561105705, + "grad_norm": 0.14177782833576202, + "learning_rate": 4.9533283157992206e-05, + "loss": 0.4403, "step": 12455 }, { - "epoch": 0.44, - "learning_rate": 4.956515869801677e-05, - "loss": 0.3039, + "epoch": 0.4490575557717951, + "grad_norm": 0.17211756110191345, + "learning_rate": 4.953272176139744e-05, + "loss": 0.4176, "step": 12460 }, { - "epoch": 0.44, - "learning_rate": 4.9564629528879406e-05, - "loss": 0.3196, + "epoch": 0.4492377554330198, + "grad_norm": 0.14136864244937897, + "learning_rate": 4.953216003054908e-05, + "loss": 0.4492, "step": 12465 }, { - "epoch": 0.44, - "learning_rate": 4.956410004078652e-05, - "loss": 0.2953, + "epoch": 0.4494179550942444, + "grad_norm": 0.19541148841381073, + "learning_rate": 4.95315979654548e-05, + "loss": 0.4768, "step": 12470 }, { - "epoch": 0.44, - "learning_rate": 4.9563570233744995e-05, - "loss": 0.2914, + "epoch": 0.44959815475546905, + "grad_norm": 0.15044446289539337, + "learning_rate": 4.953103556612224e-05, + 
"loss": 0.4469, "step": 12475 }, { - "epoch": 0.44, - "learning_rate": 4.956304010776168e-05, - "loss": 0.302, + "epoch": 0.44977835441669367, + "grad_norm": 0.20979291200637817, + "learning_rate": 4.953047283255907e-05, + "loss": 0.4432, "step": 12480 }, { - "epoch": 0.44, - "learning_rate": 4.9562509662843495e-05, - "loss": 0.3218, + "epoch": 0.44995855407791835, + "grad_norm": 0.20431764423847198, + "learning_rate": 4.9529909764772956e-05, + "loss": 0.4589, "step": 12485 }, { - "epoch": 0.44, - "learning_rate": 4.95619788989973e-05, - "loss": 0.3176, + "epoch": 0.45013875373914297, + "grad_norm": 0.2022426575422287, + "learning_rate": 4.952934636277158e-05, + "loss": 0.4544, "step": 12490 }, { - "epoch": 0.44, - "learning_rate": 4.956144781622999e-05, - "loss": 0.3164, + "epoch": 0.4503189534003676, + "grad_norm": 0.22232398390769958, + "learning_rate": 4.95287826265626e-05, + "loss": 0.4909, "step": 12495 }, { - "epoch": 0.44, - "learning_rate": 4.9560916414548474e-05, - "loss": 0.2955, + "epoch": 0.4504991530615922, + "grad_norm": 0.18882164359092712, + "learning_rate": 4.95282185561537e-05, + "loss": 0.4937, "step": 12500 }, { - "epoch": 0.44, - "eval_loss": 0.31484097242355347, - "eval_runtime": 10.5384, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 0.4504991530615922, + "eval_loss": 0.47478610277175903, + "eval_runtime": 3.606, + "eval_samples_per_second": 27.732, + "eval_steps_per_second": 6.933, "step": 12500 }, { - "epoch": 0.44, - "learning_rate": 4.9560384693959644e-05, - "loss": 0.3188, + "epoch": 0.4506793527228169, + "grad_norm": 0.15758134424686432, + "learning_rate": 4.952765415155258e-05, + "loss": 0.4397, "step": 12505 }, { - "epoch": 0.44, - "learning_rate": 4.955985265447041e-05, - "loss": 0.3233, + "epoch": 0.4508595523840415, + "grad_norm": 0.15819260478019714, + "learning_rate": 4.9527089412766926e-05, + "loss": 0.4245, "step": 12510 }, { - "epoch": 0.44, - "learning_rate": 4.9559320296087664e-05, - "loss": 0.3287, + "epoch": 0.45103975204526614, + "grad_norm": 0.20191067457199097, + "learning_rate": 4.952652433980442e-05, + "loss": 0.4507, "step": 12515 }, { - "epoch": 0.44, - "learning_rate": 4.955878761881834e-05, - "loss": 0.3187, + "epoch": 0.4512199517064908, + "grad_norm": 0.14131148159503937, + "learning_rate": 4.952595893267277e-05, + "loss": 0.4334, "step": 12520 }, { - "epoch": 0.44, - "learning_rate": 4.955825462266933e-05, - "loss": 0.3355, + "epoch": 0.45140015136771544, + "grad_norm": 0.1871136575937271, + "learning_rate": 4.9525393191379674e-05, + "loss": 0.4563, "step": 12525 }, { - "epoch": 0.44, - "learning_rate": 4.9557721307647576e-05, - "loss": 0.3231, + "epoch": 0.45158035102894006, + "grad_norm": 0.1495848298072815, + "learning_rate": 4.952482711593285e-05, + "loss": 0.4611, "step": 12530 }, { - "epoch": 0.44, - "learning_rate": 4.9557187673759996e-05, - "loss": 0.3486, + "epoch": 0.4517605506901647, + "grad_norm": 0.1738033890724182, + "learning_rate": 4.9524260706339996e-05, + "loss": 0.4509, "step": 12535 }, { - "epoch": 0.44, - "learning_rate": 4.955665372101351e-05, - "loss": 0.3138, + "epoch": 0.45194075035138936, + "grad_norm": 0.2945556938648224, + "learning_rate": 4.952369396260884e-05, + "loss": 0.4461, "step": 12540 }, { - "epoch": 0.44, - "learning_rate": 4.955611944941507e-05, - "loss": 0.3292, + "epoch": 0.452120950012614, + "grad_norm": 0.19618237018585205, + "learning_rate": 4.952312688474711e-05, + "loss": 0.4466, "step": 12545 }, { - "epoch": 0.44, - "learning_rate": 4.95555848589716e-05, 
- "loss": 0.3262, + "epoch": 0.4523011496738386, + "grad_norm": 0.1678617298603058, + "learning_rate": 4.952255947276252e-05, + "loss": 0.474, "step": 12550 }, { - "epoch": 0.44, - "learning_rate": 4.9555049949690044e-05, - "loss": 0.3101, + "epoch": 0.45248134933506323, + "grad_norm": 0.1514994353055954, + "learning_rate": 4.95219917266628e-05, + "loss": 0.4271, "step": 12555 }, { - "epoch": 0.44, - "learning_rate": 4.955451472157734e-05, - "loss": 0.3335, + "epoch": 0.4526615489962879, + "grad_norm": 0.14985314011573792, + "learning_rate": 4.95214236464557e-05, + "loss": 0.4505, "step": 12560 }, { - "epoch": 0.44, - "learning_rate": 4.9554086309533434e-05, - "loss": 0.3185, + "epoch": 0.45284174865751253, + "grad_norm": 0.19544857740402222, + "learning_rate": 4.952085523214894e-05, + "loss": 0.4893, "step": 12565 }, { - "epoch": 0.44, - "learning_rate": 4.955355050754219e-05, - "loss": 0.3242, + "epoch": 0.45302194831873716, + "grad_norm": 0.19242407381534576, + "learning_rate": 4.9520286483750277e-05, + "loss": 0.4616, "step": 12570 }, { - "epoch": 0.44, - "learning_rate": 4.955301438673928e-05, - "loss": 0.3294, + "epoch": 0.4532021479799618, + "grad_norm": 0.14265184104442596, + "learning_rate": 4.9519717401267465e-05, + "loss": 0.4539, "step": 12575 }, { - "epoch": 0.44, - "learning_rate": 4.9552477947131656e-05, - "loss": 0.3064, + "epoch": 0.45338234764118646, + "grad_norm": 0.181315615773201, + "learning_rate": 4.9519147984708246e-05, + "loss": 0.4585, "step": 12580 }, { - "epoch": 0.44, - "learning_rate": 4.9551941188726294e-05, - "loss": 0.3415, + "epoch": 0.4535625473024111, + "grad_norm": 0.16452257335186005, + "learning_rate": 4.9518578234080384e-05, + "loss": 0.4469, "step": 12585 }, { - "epoch": 0.44, - "learning_rate": 4.955140411153015e-05, - "loss": 0.3077, + "epoch": 0.4537427469636357, + "grad_norm": 0.14965125918388367, + "learning_rate": 4.951800814939164e-05, + "loss": 0.4432, "step": 12590 }, { - "epoch": 0.44, - "learning_rate": 4.9550866715550205e-05, - "loss": 0.3585, + "epoch": 0.4539229466248603, + "grad_norm": 0.16924121975898743, + "learning_rate": 4.951743773064978e-05, + "loss": 0.4248, "step": 12595 }, { - "epoch": 0.44, - "learning_rate": 4.955032900079343e-05, - "loss": 0.3242, + "epoch": 0.454103146286085, + "grad_norm": 0.21651718020439148, + "learning_rate": 4.951686697786258e-05, + "loss": 0.4606, "step": 12600 }, { - "epoch": 0.44, - "learning_rate": 4.954979096726682e-05, - "loss": 0.3211, + "epoch": 0.4542833459473096, + "grad_norm": 0.1570291370153427, + "learning_rate": 4.951629589103781e-05, + "loss": 0.4161, "step": 12605 }, { - "epoch": 0.44, - "learning_rate": 4.954925261497735e-05, - "loss": 0.2988, + "epoch": 0.45446354560853425, + "grad_norm": 0.18217509984970093, + "learning_rate": 4.951572447018326e-05, + "loss": 0.4096, "step": 12610 }, { - "epoch": 0.44, - "learning_rate": 4.954871394393201e-05, - "loss": 0.3153, + "epoch": 0.45464374526975887, + "grad_norm": 0.1948910504579544, + "learning_rate": 4.951515271530671e-05, + "loss": 0.4347, "step": 12615 }, { - "epoch": 0.44, - "learning_rate": 4.954817495413781e-05, - "loss": 0.3298, + "epoch": 0.45482394493098355, + "grad_norm": 0.1557484269142151, + "learning_rate": 4.951458062641595e-05, + "loss": 0.44, "step": 12620 }, { - "epoch": 0.44, - "learning_rate": 4.954763564560172e-05, - "loss": 0.3225, + "epoch": 0.45500414459220817, + "grad_norm": 0.21191979944705963, + "learning_rate": 4.951400820351877e-05, + "loss": 0.4676, "step": 12625 }, { - "epoch": 0.44, - "learning_rate": 
4.954709601833078e-05, - "loss": 0.3155, + "epoch": 0.4551843442534328, + "grad_norm": 0.20376695692539215, + "learning_rate": 4.951343544662298e-05, + "loss": 0.4121, "step": 12630 }, { - "epoch": 0.44, - "learning_rate": 4.954655607233195e-05, - "loss": 0.3131, + "epoch": 0.4553645439146574, + "grad_norm": 0.18317648768424988, + "learning_rate": 4.9512862355736376e-05, + "loss": 0.4806, "step": 12635 }, { - "epoch": 0.44, - "learning_rate": 4.9546015807612275e-05, - "loss": 0.3121, + "epoch": 0.4555447435758821, + "grad_norm": 0.15849465131759644, + "learning_rate": 4.951228893086677e-05, + "loss": 0.4107, "step": 12640 }, { - "epoch": 0.44, - "learning_rate": 4.954547522417877e-05, - "loss": 0.3246, + "epoch": 0.4557249432371067, + "grad_norm": 0.1734628528356552, + "learning_rate": 4.951171517202197e-05, + "loss": 0.4431, "step": 12645 }, { - "epoch": 0.45, - "learning_rate": 4.954493432203844e-05, - "loss": 0.3338, + "epoch": 0.45590514289833134, + "grad_norm": 0.19202375411987305, + "learning_rate": 4.95111410792098e-05, + "loss": 0.4589, "step": 12650 }, { - "epoch": 0.45, - "learning_rate": 4.954439310119831e-05, - "loss": 0.3075, + "epoch": 0.45608534255955596, + "grad_norm": 0.1690249890089035, + "learning_rate": 4.951056665243807e-05, + "loss": 0.4878, "step": 12655 }, { - "epoch": 0.45, - "learning_rate": 4.954385156166541e-05, - "loss": 0.2985, + "epoch": 0.45626554222078064, + "grad_norm": 0.13854099810123444, + "learning_rate": 4.950999189171463e-05, + "loss": 0.4422, "step": 12660 }, { - "epoch": 0.45, - "learning_rate": 4.9543309703446783e-05, - "loss": 0.3016, + "epoch": 0.45644574188200526, + "grad_norm": 0.16640843451023102, + "learning_rate": 4.9509416797047284e-05, + "loss": 0.4411, "step": 12665 }, { - "epoch": 0.45, - "learning_rate": 4.9542767526549445e-05, - "loss": 0.3416, + "epoch": 0.4566259415432299, + "grad_norm": 0.19313612580299377, + "learning_rate": 4.9508841368443884e-05, + "loss": 0.4788, "step": 12670 }, { - "epoch": 0.45, - "learning_rate": 4.954222503098045e-05, - "loss": 0.3014, + "epoch": 0.45680614120445456, + "grad_norm": 0.19194231927394867, + "learning_rate": 4.9508265605912265e-05, + "loss": 0.4654, "step": 12675 }, { - "epoch": 0.45, - "learning_rate": 4.954168221674683e-05, - "loss": 0.313, + "epoch": 0.4569863408656792, + "grad_norm": 0.2459125965833664, + "learning_rate": 4.950768950946026e-05, + "loss": 0.4259, "step": 12680 }, { - "epoch": 0.45, - "learning_rate": 4.954113908385565e-05, - "loss": 0.3281, + "epoch": 0.4571665405269038, + "grad_norm": 0.17043115198612213, + "learning_rate": 4.950711307909575e-05, + "loss": 0.4381, "step": 12685 }, { - "epoch": 0.45, - "learning_rate": 4.954059563231394e-05, - "loss": 0.3349, + "epoch": 0.45734674018812843, + "grad_norm": 0.1516115516424179, + "learning_rate": 4.950653631482656e-05, + "loss": 0.4153, "step": 12690 }, { - "epoch": 0.45, - "learning_rate": 4.954005186212878e-05, - "loss": 0.3061, + "epoch": 0.4575269398493531, + "grad_norm": 0.18203213810920715, + "learning_rate": 4.9505959216660556e-05, + "loss": 0.4587, "step": 12695 }, { - "epoch": 0.45, - "learning_rate": 4.9539507773307215e-05, - "loss": 0.3097, + "epoch": 0.45770713951057773, + "grad_norm": 0.16077850759029388, + "learning_rate": 4.9505381784605606e-05, + "loss": 0.4684, "step": 12700 }, { - "epoch": 0.45, - "learning_rate": 4.9538963365856305e-05, - "loss": 0.3377, + "epoch": 0.45788733917180235, + "grad_norm": 0.16512863337993622, + "learning_rate": 4.9504804018669574e-05, + "loss": 0.4445, "step": 12705 }, { - 
"epoch": 0.45, - "learning_rate": 4.953841863978314e-05, - "loss": 0.3343, + "epoch": 0.458067538833027, + "grad_norm": 0.16586919128894806, + "learning_rate": 4.9504225918860326e-05, + "loss": 0.4602, "step": 12710 }, { - "epoch": 0.45, - "learning_rate": 4.953787359509477e-05, - "loss": 0.3359, + "epoch": 0.45824773849425166, + "grad_norm": 0.1363137811422348, + "learning_rate": 4.9503647485185744e-05, + "loss": 0.4658, "step": 12715 }, { - "epoch": 0.45, - "learning_rate": 4.953732823179829e-05, - "loss": 0.3121, + "epoch": 0.4584279381554763, + "grad_norm": 0.18424583971500397, + "learning_rate": 4.950306871765371e-05, + "loss": 0.4875, "step": 12720 }, { - "epoch": 0.45, - "learning_rate": 4.9536782549900776e-05, - "loss": 0.3339, + "epoch": 0.4586081378167009, + "grad_norm": 0.21040771901607513, + "learning_rate": 4.9502489616272115e-05, + "loss": 0.492, "step": 12725 }, { - "epoch": 0.45, - "learning_rate": 4.953623654940931e-05, - "loss": 0.3041, + "epoch": 0.4587883374779255, + "grad_norm": 0.15792599320411682, + "learning_rate": 4.9501910181048836e-05, + "loss": 0.4733, "step": 12730 }, { - "epoch": 0.45, - "learning_rate": 4.953569023033098e-05, - "loss": 0.3408, + "epoch": 0.4589685371391502, + "grad_norm": 0.1881541758775711, + "learning_rate": 4.950133041199177e-05, + "loss": 0.4476, "step": 12735 }, { - "epoch": 0.45, - "learning_rate": 4.9535143592672874e-05, - "loss": 0.315, + "epoch": 0.4591487368003748, + "grad_norm": 0.14302214980125427, + "learning_rate": 4.950075030910883e-05, + "loss": 0.457, "step": 12740 }, { - "epoch": 0.45, - "learning_rate": 4.9534596636442106e-05, - "loss": 0.3189, + "epoch": 0.45932893646159945, + "grad_norm": 0.16461120545864105, + "learning_rate": 4.9500169872407906e-05, + "loss": 0.4546, "step": 12745 }, { - "epoch": 0.45, - "learning_rate": 4.953404936164577e-05, - "loss": 0.3107, + "epoch": 0.45950913612282407, + "grad_norm": 0.18678593635559082, + "learning_rate": 4.949958910189692e-05, + "loss": 0.4279, "step": 12750 }, { - "epoch": 0.45, - "learning_rate": 4.953350176829097e-05, - "loss": 0.3081, + "epoch": 0.45968933578404875, + "grad_norm": 0.18721038103103638, + "learning_rate": 4.949900799758377e-05, + "loss": 0.443, "step": 12755 }, { - "epoch": 0.45, - "learning_rate": 4.9532953856384814e-05, - "loss": 0.3367, + "epoch": 0.45986953544527337, + "grad_norm": 0.1702054888010025, + "learning_rate": 4.949842655947637e-05, + "loss": 0.4364, "step": 12760 }, { - "epoch": 0.45, - "learning_rate": 4.9532405625934425e-05, - "loss": 0.3211, + "epoch": 0.460049735106498, + "grad_norm": 0.18055669963359833, + "learning_rate": 4.9497844787582665e-05, + "loss": 0.4579, "step": 12765 }, { - "epoch": 0.45, - "learning_rate": 4.9531857076946914e-05, - "loss": 0.3377, + "epoch": 0.4602299347677226, + "grad_norm": 0.18560905754566193, + "learning_rate": 4.949726268191056e-05, + "loss": 0.4471, "step": 12770 }, { - "epoch": 0.45, - "learning_rate": 4.9531308209429404e-05, - "loss": 0.3257, + "epoch": 0.4604101344289473, + "grad_norm": 0.16065679490566254, + "learning_rate": 4.9496680242467994e-05, + "loss": 0.4403, "step": 12775 }, { - "epoch": 0.45, - "learning_rate": 4.953075902338903e-05, - "loss": 0.3615, + "epoch": 0.4605903340901719, + "grad_norm": 0.1695624440908432, + "learning_rate": 4.94960974692629e-05, + "loss": 0.4245, "step": 12780 }, { - "epoch": 0.45, - "learning_rate": 4.9530209518832915e-05, - "loss": 0.307, + "epoch": 0.46077053375139654, + "grad_norm": 0.1966797411441803, + "learning_rate": 4.949551436230323e-05, + "loss": 0.4664, 
"step": 12785 }, { - "epoch": 0.45, - "learning_rate": 4.952965969576819e-05, - "loss": 0.3345, + "epoch": 0.46095073341262116, + "grad_norm": 0.19263668358325958, + "learning_rate": 4.9494930921596913e-05, + "loss": 0.4823, "step": 12790 }, { - "epoch": 0.45, - "learning_rate": 4.9529109554202e-05, - "loss": 0.3315, + "epoch": 0.46113093307384584, + "grad_norm": 0.16463421285152435, + "learning_rate": 4.949434714715191e-05, + "loss": 0.4511, "step": 12795 }, { - "epoch": 0.45, - "learning_rate": 4.95285590941415e-05, - "loss": 0.3312, + "epoch": 0.46131113273507046, + "grad_norm": 0.18872933089733124, + "learning_rate": 4.949376303897616e-05, + "loss": 0.4424, "step": 12800 }, { - "epoch": 0.45, - "learning_rate": 4.952800831559381e-05, - "loss": 0.3103, + "epoch": 0.4614913323962951, + "grad_norm": 0.14205656945705414, + "learning_rate": 4.949317859707764e-05, + "loss": 0.4761, "step": 12805 }, { - "epoch": 0.45, - "learning_rate": 4.9527457218566104e-05, - "loss": 0.3058, + "epoch": 0.4616715320575197, + "grad_norm": 0.16842302680015564, + "learning_rate": 4.94925938214643e-05, + "loss": 0.4496, "step": 12810 }, { - "epoch": 0.45, - "learning_rate": 4.952690580306553e-05, - "loss": 0.295, + "epoch": 0.4618517317187444, + "grad_norm": 0.15166863799095154, + "learning_rate": 4.9492008712144104e-05, + "loss": 0.4274, "step": 12815 }, { - "epoch": 0.45, - "learning_rate": 4.952635406909925e-05, - "loss": 0.3236, + "epoch": 0.462031931379969, + "grad_norm": 0.17034749686717987, + "learning_rate": 4.9491423269125035e-05, + "loss": 0.4313, "step": 12820 }, { - "epoch": 0.45, - "learning_rate": 4.952580201667442e-05, - "loss": 0.3461, + "epoch": 0.46221213104119363, + "grad_norm": 0.1309286206960678, + "learning_rate": 4.9490837492415074e-05, + "loss": 0.4346, "step": 12825 }, { - "epoch": 0.45, - "learning_rate": 4.952524964579821e-05, - "loss": 0.3296, + "epoch": 0.46239233070241825, + "grad_norm": 0.18976598978042603, + "learning_rate": 4.949025138202218e-05, + "loss": 0.4361, "step": 12830 }, { - "epoch": 0.45, - "learning_rate": 4.952469695647781e-05, - "loss": 0.3343, + "epoch": 0.46257253036364293, + "grad_norm": 0.1648472547531128, + "learning_rate": 4.948966493795437e-05, + "loss": 0.4638, "step": 12835 }, { - "epoch": 0.45, - "learning_rate": 4.952414394872037e-05, - "loss": 0.3293, + "epoch": 0.46275273002486755, + "grad_norm": 0.18952538073062897, + "learning_rate": 4.94890781602196e-05, + "loss": 0.4521, "step": 12840 }, { - "epoch": 0.45, - "learning_rate": 4.952359062253309e-05, - "loss": 0.3196, + "epoch": 0.4629329296860922, + "grad_norm": 0.15713319182395935, + "learning_rate": 4.9488491048825894e-05, + "loss": 0.4122, "step": 12845 }, { - "epoch": 0.45, - "learning_rate": 4.9523036977923145e-05, - "loss": 0.2872, + "epoch": 0.46311312934731685, + "grad_norm": 0.16628234088420868, + "learning_rate": 4.9487903603781225e-05, + "loss": 0.4013, "step": 12850 }, { - "epoch": 0.45, - "learning_rate": 4.952248301489772e-05, - "loss": 0.3446, + "epoch": 0.4632933290085415, + "grad_norm": 0.17581500113010406, + "learning_rate": 4.948731582509362e-05, + "loss": 0.4685, "step": 12855 }, { - "epoch": 0.45, - "learning_rate": 4.952192873346403e-05, - "loss": 0.3047, + "epoch": 0.4634735286697661, + "grad_norm": 0.1562207192182541, + "learning_rate": 4.948672771277107e-05, + "loss": 0.4381, "step": 12860 }, { - "epoch": 0.45, - "learning_rate": 4.952137413362924e-05, - "loss": 0.3364, + "epoch": 0.4636537283309907, + "grad_norm": 0.16530804336071014, + "learning_rate": 
4.9486139266821606e-05, + "loss": 0.4846, "step": 12865 }, { - "epoch": 0.45, - "learning_rate": 4.9520819215400574e-05, - "loss": 0.3326, + "epoch": 0.4638339279922154, + "grad_norm": 0.17569473385810852, + "learning_rate": 4.948555048725323e-05, + "loss": 0.4499, "step": 12870 }, { - "epoch": 0.45, - "learning_rate": 4.952026397878523e-05, - "loss": 0.3088, + "epoch": 0.46401412765344, + "grad_norm": 0.16137835383415222, + "learning_rate": 4.948496137407397e-05, + "loss": 0.4263, "step": 12875 }, { - "epoch": 0.45, - "learning_rate": 4.951970842379042e-05, - "loss": 0.3121, + "epoch": 0.46419432731466465, + "grad_norm": 0.18716813623905182, + "learning_rate": 4.948437192729186e-05, + "loss": 0.4634, "step": 12880 }, { - "epoch": 0.45, - "learning_rate": 4.9519152550423355e-05, - "loss": 0.3348, + "epoch": 0.46437452697588927, + "grad_norm": 0.17155876755714417, + "learning_rate": 4.948378214691491e-05, + "loss": 0.4479, "step": 12885 }, { - "epoch": 0.45, - "learning_rate": 4.9518596358691246e-05, - "loss": 0.2744, + "epoch": 0.46455472663711395, + "grad_norm": 0.16690602898597717, + "learning_rate": 4.948319203295117e-05, + "loss": 0.409, "step": 12890 }, { - "epoch": 0.45, - "learning_rate": 4.951803984860133e-05, - "loss": 0.3146, + "epoch": 0.46473492629833857, + "grad_norm": 0.18938426673412323, + "learning_rate": 4.9482601585408684e-05, + "loss": 0.4384, "step": 12895 }, { - "epoch": 0.45, - "learning_rate": 4.9517483020160816e-05, - "loss": 0.3183, + "epoch": 0.4649151259595632, + "grad_norm": 0.1318012923002243, + "learning_rate": 4.9482010804295485e-05, + "loss": 0.4067, "step": 12900 }, { - "epoch": 0.45, - "learning_rate": 4.9516925873376955e-05, - "loss": 0.3227, + "epoch": 0.4650953256207878, + "grad_norm": 0.15733405947685242, + "learning_rate": 4.9481419689619635e-05, + "loss": 0.4644, "step": 12905 }, { - "epoch": 0.45, - "learning_rate": 4.951636840825695e-05, - "loss": 0.3254, + "epoch": 0.4652755252820125, + "grad_norm": 0.18979695439338684, + "learning_rate": 4.9480828241389176e-05, + "loss": 0.469, "step": 12910 }, { - "epoch": 0.45, - "learning_rate": 4.951581062480807e-05, - "loss": 0.353, + "epoch": 0.4654557249432371, + "grad_norm": 0.23890700936317444, + "learning_rate": 4.948023645961218e-05, + "loss": 0.468, "step": 12915 }, { - "epoch": 0.45, - "learning_rate": 4.9515252523037545e-05, - "loss": 0.3276, + "epoch": 0.46563592460446174, + "grad_norm": 0.13900893926620483, + "learning_rate": 4.9479644344296694e-05, + "loss": 0.4231, "step": 12920 }, { - "epoch": 0.45, - "learning_rate": 4.9514694102952625e-05, - "loss": 0.3128, + "epoch": 0.46581612426568636, + "grad_norm": 0.16857506334781647, + "learning_rate": 4.94790518954508e-05, + "loss": 0.3977, "step": 12925 }, { - "epoch": 0.45, - "learning_rate": 4.951413536456055e-05, - "loss": 0.3235, + "epoch": 0.46599632392691104, + "grad_norm": 0.2040061503648758, + "learning_rate": 4.9478459113082556e-05, + "loss": 0.4529, "step": 12930 }, { - "epoch": 0.46, - "learning_rate": 4.951357630786859e-05, - "loss": 0.3151, + "epoch": 0.46617652358813566, + "grad_norm": 0.1292639821767807, + "learning_rate": 4.9477865997200044e-05, + "loss": 0.4345, "step": 12935 }, { - "epoch": 0.46, - "learning_rate": 4.9513016932883994e-05, - "loss": 0.319, + "epoch": 0.4663567232493603, + "grad_norm": 0.1888824999332428, + "learning_rate": 4.9477272547811354e-05, + "loss": 0.436, "step": 12940 }, { - "epoch": 0.46, - "learning_rate": 4.9512457239614026e-05, - "loss": 0.3616, + "epoch": 0.4665369229105849, + "grad_norm": 
0.18811088800430298, + "learning_rate": 4.947667876492457e-05, + "loss": 0.4403, "step": 12945 }, { - "epoch": 0.46, - "learning_rate": 4.951189722806596e-05, - "loss": 0.3197, + "epoch": 0.4667171225718096, + "grad_norm": 0.17005407810211182, + "learning_rate": 4.947608464854776e-05, + "loss": 0.4412, "step": 12950 }, { - "epoch": 0.46, - "learning_rate": 4.951133689824706e-05, - "loss": 0.304, + "epoch": 0.4668973222330342, + "grad_norm": 0.17521242797374725, + "learning_rate": 4.9475490198689044e-05, + "loss": 0.4371, "step": 12955 }, { - "epoch": 0.46, - "learning_rate": 4.95107762501646e-05, - "loss": 0.3244, + "epoch": 0.46707752189425883, + "grad_norm": 0.21031251549720764, + "learning_rate": 4.947489541535651e-05, + "loss": 0.4775, "step": 12960 }, { - "epoch": 0.46, - "learning_rate": 4.951021528382587e-05, - "loss": 0.2945, + "epoch": 0.46725772155548345, + "grad_norm": 0.19381965696811676, + "learning_rate": 4.947430029855827e-05, + "loss": 0.4625, "step": 12965 }, { - "epoch": 0.46, - "learning_rate": 4.9509653999238143e-05, - "loss": 0.3401, + "epoch": 0.46743792121670813, + "grad_norm": 0.19049398601055145, + "learning_rate": 4.9473704848302424e-05, + "loss": 0.4648, "step": 12970 }, { - "epoch": 0.46, - "learning_rate": 4.950909239640871e-05, - "loss": 0.3175, + "epoch": 0.46761812087793275, + "grad_norm": 0.17350056767463684, + "learning_rate": 4.947310906459709e-05, + "loss": 0.4143, "step": 12975 }, { - "epoch": 0.46, - "learning_rate": 4.950853047534487e-05, - "loss": 0.3314, + "epoch": 0.4677983205391574, + "grad_norm": 0.20652742683887482, + "learning_rate": 4.947251294745038e-05, + "loss": 0.4555, "step": 12980 }, { - "epoch": 0.46, - "learning_rate": 4.950796823605391e-05, - "loss": 0.3377, + "epoch": 0.467978520200382, + "grad_norm": 0.1818598359823227, + "learning_rate": 4.947191649687042e-05, + "loss": 0.4312, "step": 12985 }, { - "epoch": 0.46, - "learning_rate": 4.950740567854313e-05, - "loss": 0.3175, + "epoch": 0.4681587198616067, + "grad_norm": 0.17591311037540436, + "learning_rate": 4.947131971286534e-05, + "loss": 0.4311, "step": 12990 }, { - "epoch": 0.46, - "learning_rate": 4.950684280281985e-05, - "loss": 0.3056, + "epoch": 0.4683389195228313, + "grad_norm": 0.14075951278209686, + "learning_rate": 4.947072259544326e-05, + "loss": 0.4396, "step": 12995 }, { - "epoch": 0.46, - "learning_rate": 4.950627960889136e-05, - "loss": 0.306, + "epoch": 0.4685191191840559, + "grad_norm": 0.20111533999443054, + "learning_rate": 4.947012514461232e-05, + "loss": 0.4697, "step": 13000 }, { - "epoch": 0.46, - "eval_loss": 0.31242769956588745, - "eval_runtime": 10.5492, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 0.4685191191840559, + "eval_loss": 0.4741581976413727, + "eval_runtime": 3.5647, + "eval_samples_per_second": 28.053, + "eval_steps_per_second": 7.013, "step": 13000 }, { - "epoch": 0.46, - "learning_rate": 4.9505716096764976e-05, - "loss": 0.3369, + "epoch": 0.46869931884528054, + "grad_norm": 0.1940506100654602, + "learning_rate": 4.9469527360380676e-05, + "loss": 0.4683, "step": 13005 }, { - "epoch": 0.46, - "learning_rate": 4.950515226644802e-05, - "loss": 0.3456, + "epoch": 0.4688795185065052, + "grad_norm": 0.16118963062763214, + "learning_rate": 4.946892924275645e-05, + "loss": 0.4534, "step": 13010 }, { - "epoch": 0.46, - "learning_rate": 4.9504588117947814e-05, - "loss": 0.3514, + "epoch": 0.46905971816772984, + "grad_norm": 0.1373254507780075, + "learning_rate": 4.9468330791747795e-05, + "loss": 0.4503, "step": 13015 
}, { - "epoch": 0.46, - "learning_rate": 4.950402365127168e-05, - "loss": 0.3113, + "epoch": 0.46923991782895447, + "grad_norm": 0.1803598254919052, + "learning_rate": 4.946773200736288e-05, + "loss": 0.481, "step": 13020 }, { - "epoch": 0.46, - "learning_rate": 4.950345886642694e-05, - "loss": 0.3034, + "epoch": 0.46942011749017915, + "grad_norm": 0.18587857484817505, + "learning_rate": 4.9467132889609845e-05, + "loss": 0.4849, "step": 13025 }, { - "epoch": 0.46, - "learning_rate": 4.950289376342094e-05, - "loss": 0.3103, + "epoch": 0.46960031715140377, + "grad_norm": 0.17999742925167084, + "learning_rate": 4.946653343849686e-05, + "loss": 0.4621, "step": 13030 }, { - "epoch": 0.46, - "learning_rate": 4.9502328342261015e-05, - "loss": 0.3041, + "epoch": 0.4697805168126284, + "grad_norm": 0.17007941007614136, + "learning_rate": 4.9465933654032106e-05, + "loss": 0.441, "step": 13035 }, { - "epoch": 0.46, - "learning_rate": 4.9501762602954505e-05, - "loss": 0.3191, + "epoch": 0.469960716473853, + "grad_norm": 0.1974543035030365, + "learning_rate": 4.9465333536223734e-05, + "loss": 0.4479, "step": 13040 }, { - "epoch": 0.46, - "learning_rate": 4.9501196545508756e-05, - "loss": 0.3235, + "epoch": 0.4701409161350777, + "grad_norm": 0.1567683070898056, + "learning_rate": 4.946473308507993e-05, + "loss": 0.4881, "step": 13045 }, { - "epoch": 0.46, - "learning_rate": 4.950063016993112e-05, - "loss": 0.324, + "epoch": 0.4703211157963023, + "grad_norm": 0.17787274718284607, + "learning_rate": 4.9464132300608876e-05, + "loss": 0.4597, "step": 13050 }, { - "epoch": 0.46, - "learning_rate": 4.9500063476228936e-05, - "loss": 0.3255, + "epoch": 0.47050131545752694, + "grad_norm": 0.2110513597726822, + "learning_rate": 4.9463531182818756e-05, + "loss": 0.4353, "step": 13055 }, { - "epoch": 0.46, - "learning_rate": 4.9499496464409576e-05, - "loss": 0.3304, + "epoch": 0.47068151511875156, + "grad_norm": 0.19464978575706482, + "learning_rate": 4.946292973171777e-05, + "loss": 0.419, "step": 13060 }, { - "epoch": 0.46, - "learning_rate": 4.9498929134480406e-05, - "loss": 0.3112, + "epoch": 0.47086171477997624, + "grad_norm": 0.18189744651317596, + "learning_rate": 4.946232794731408e-05, + "loss": 0.463, "step": 13065 }, { - "epoch": 0.46, - "learning_rate": 4.9498361486448786e-05, - "loss": 0.3035, + "epoch": 0.47104191444120086, + "grad_norm": 0.21045507490634918, + "learning_rate": 4.946172582961593e-05, + "loss": 0.5024, "step": 13070 }, { - "epoch": 0.46, - "learning_rate": 4.949779352032208e-05, - "loss": 0.3178, + "epoch": 0.4712221141024255, + "grad_norm": 0.16427326202392578, + "learning_rate": 4.946112337863148e-05, + "loss": 0.4796, "step": 13075 }, { - "epoch": 0.46, - "learning_rate": 4.949722523610768e-05, - "loss": 0.309, + "epoch": 0.4714023137636501, + "grad_norm": 0.18981902301311493, + "learning_rate": 4.9460520594368975e-05, + "loss": 0.4395, "step": 13080 }, { - "epoch": 0.46, - "learning_rate": 4.949665663381295e-05, - "loss": 0.3165, + "epoch": 0.4715825134248748, + "grad_norm": 0.20156720280647278, + "learning_rate": 4.945991747683661e-05, + "loss": 0.4865, "step": 13085 }, { - "epoch": 0.46, - "learning_rate": 4.949608771344528e-05, - "loss": 0.3333, + "epoch": 0.4717627130860994, + "grad_norm": 0.14787843823432922, + "learning_rate": 4.9459314026042605e-05, + "loss": 0.4583, "step": 13090 }, { - "epoch": 0.46, - "learning_rate": 4.949551847501205e-05, - "loss": 0.3362, + "epoch": 0.47194291274732403, + "grad_norm": 0.1643127053976059, + "learning_rate": 4.9458710241995174e-05, + 
"loss": 0.4685, "step": 13095 }, { - "epoch": 0.46, - "learning_rate": 4.9494948918520655e-05, - "loss": 0.2987, + "epoch": 0.47212311240854865, + "grad_norm": 0.19920234382152557, + "learning_rate": 4.9458106124702565e-05, + "loss": 0.453, "step": 13100 }, { - "epoch": 0.46, - "learning_rate": 4.9494379043978494e-05, - "loss": 0.2885, + "epoch": 0.47230331206977333, + "grad_norm": 0.16437123715877533, + "learning_rate": 4.945750167417299e-05, + "loss": 0.4052, "step": 13105 }, { - "epoch": 0.46, - "learning_rate": 4.9493808851392964e-05, - "loss": 0.3404, + "epoch": 0.47248351173099795, + "grad_norm": 0.1786852478981018, + "learning_rate": 4.945689689041468e-05, + "loss": 0.4609, "step": 13110 }, { - "epoch": 0.46, - "learning_rate": 4.9493238340771465e-05, - "loss": 0.3182, + "epoch": 0.4726637113922226, + "grad_norm": NaN, + "learning_rate": 4.9456412823488897e-05, + "loss": 0.4437, "step": 13115 }, { - "epoch": 0.46, - "learning_rate": 4.949266751212142e-05, - "loss": 0.3212, + "epoch": 0.4728439110534472, + "grad_norm": 0.18749703466892242, + "learning_rate": 4.9455807439939664e-05, + "loss": 0.4415, "step": 13120 }, { - "epoch": 0.46, - "learning_rate": 4.949209636545021e-05, - "loss": 0.3207, + "epoch": 0.4730241107146719, + "grad_norm": 0.15940409898757935, + "learning_rate": 4.9455201723184773e-05, + "loss": 0.4648, "step": 13125 }, { - "epoch": 0.46, - "learning_rate": 4.949152490076528e-05, - "loss": 0.3215, + "epoch": 0.4732043103758965, + "grad_norm": 0.1593160629272461, + "learning_rate": 4.9454595673232505e-05, + "loss": 0.4266, "step": 13130 }, { - "epoch": 0.46, - "learning_rate": 4.949095311807404e-05, - "loss": 0.3127, + "epoch": 0.4733845100371211, + "grad_norm": 0.1828412562608719, + "learning_rate": 4.9453989290091106e-05, + "loss": 0.404, "step": 13135 }, { - "epoch": 0.46, - "learning_rate": 4.9490381017383906e-05, - "loss": 0.3429, + "epoch": 0.47356470969834574, + "grad_norm": 0.16431830823421478, + "learning_rate": 4.945338257376884e-05, + "loss": 0.487, "step": 13140 }, { - "epoch": 0.46, - "learning_rate": 4.9489808598702325e-05, - "loss": 0.3377, + "epoch": 0.4737449093595704, + "grad_norm": 0.17641369998455048, + "learning_rate": 4.9452775524273963e-05, + "loss": 0.4485, "step": 13145 }, { - "epoch": 0.46, - "learning_rate": 4.948923586203671e-05, - "loss": 0.3528, + "epoch": 0.47392510902079504, + "grad_norm": 0.1741415560245514, + "learning_rate": 4.9452168141614754e-05, + "loss": 0.4163, "step": 13150 }, { - "epoch": 0.46, - "learning_rate": 4.948866280739452e-05, - "loss": 0.3036, + "epoch": 0.47410530868201967, + "grad_norm": 0.20057284832000732, + "learning_rate": 4.9451560425799495e-05, + "loss": 0.4494, "step": 13155 }, { - "epoch": 0.46, - "learning_rate": 4.9488089434783165e-05, - "loss": 0.3039, + "epoch": 0.4742855083432443, + "grad_norm": 0.16417837142944336, + "learning_rate": 4.9450952376836454e-05, + "loss": 0.4283, "step": 13160 }, { - "epoch": 0.46, - "learning_rate": 4.948751574421011e-05, - "loss": 0.3316, + "epoch": 0.47446570800446897, + "grad_norm": 0.14967764914035797, + "learning_rate": 4.945034399473392e-05, + "loss": 0.441, "step": 13165 }, { - "epoch": 0.46, - "learning_rate": 4.948694173568281e-05, - "loss": 0.3309, + "epoch": 0.4746459076656936, + "grad_norm": 0.17966294288635254, + "learning_rate": 4.944973527950019e-05, + "loss": 0.4765, "step": 13170 }, { - "epoch": 0.46, - "learning_rate": 4.94863674092087e-05, - "loss": 0.3295, + "epoch": 0.4748261073269182, + "grad_norm": 0.18716269731521606, + "learning_rate": 
4.944912623114354e-05, + "loss": 0.4723, "step": 13175 }, { - "epoch": 0.46, - "learning_rate": 4.9485792764795255e-05, - "loss": 0.3174, + "epoch": 0.47500630698814283, + "grad_norm": 0.16850250959396362, + "learning_rate": 4.9448516849672285e-05, + "loss": 0.4771, "step": 13180 }, { - "epoch": 0.46, - "learning_rate": 4.948521780244992e-05, - "loss": 0.3137, + "epoch": 0.4751865066493675, + "grad_norm": 0.1683826893568039, + "learning_rate": 4.944790713509472e-05, + "loss": 0.4295, "step": 13185 }, { - "epoch": 0.46, - "learning_rate": 4.948464252218017e-05, - "loss": 0.3127, + "epoch": 0.47536670631059214, + "grad_norm": 0.1629522442817688, + "learning_rate": 4.9447297087419155e-05, + "loss": 0.4116, "step": 13190 }, { - "epoch": 0.46, - "learning_rate": 4.9484066923993476e-05, - "loss": 0.3373, + "epoch": 0.47554690597181676, + "grad_norm": 0.16249153017997742, + "learning_rate": 4.9446686706653896e-05, + "loss": 0.431, "step": 13195 }, { - "epoch": 0.46, - "learning_rate": 4.948349100789731e-05, - "loss": 0.3312, + "epoch": 0.47572710563304144, + "grad_norm": 0.16128148138523102, + "learning_rate": 4.944607599280726e-05, + "loss": 0.4552, "step": 13200 }, { - "epoch": 0.46, - "learning_rate": 4.9483030046130576e-05, - "loss": 0.3225, + "epoch": 0.47590730529426606, + "grad_norm": 0.1829308420419693, + "learning_rate": 4.944546494588758e-05, + "loss": 0.4313, "step": 13205 }, { - "epoch": 0.46, - "learning_rate": 4.94824535578162e-05, - "loss": 0.3187, + "epoch": 0.4760875049554907, + "grad_norm": 0.1804686188697815, + "learning_rate": 4.944485356590317e-05, + "loss": 0.4562, "step": 13210 }, { - "epoch": 0.46, - "learning_rate": 4.94818767516133e-05, - "loss": 0.3131, + "epoch": 0.4762677046167153, + "grad_norm": 0.17455480992794037, + "learning_rate": 4.9444241852862366e-05, + "loss": 0.4468, "step": 13215 }, { - "epoch": 0.47, - "learning_rate": 4.948129962752937e-05, - "loss": 0.3162, + "epoch": 0.47644790427794, + "grad_norm": 0.18920566141605377, + "learning_rate": 4.944362980677349e-05, + "loss": 0.4687, "step": 13220 }, { - "epoch": 0.47, - "learning_rate": 4.948072218557189e-05, - "loss": 0.3216, + "epoch": 0.4766281039391646, + "grad_norm": 0.19825154542922974, + "learning_rate": 4.9443017427644906e-05, + "loss": 0.4629, "step": 13225 }, { - "epoch": 0.47, - "learning_rate": 4.948014442574837e-05, - "loss": 0.3128, + "epoch": 0.4768083036003892, + "grad_norm": 0.1756822019815445, + "learning_rate": 4.944240471548493e-05, + "loss": 0.4885, "step": 13230 }, { - "epoch": 0.47, - "learning_rate": 4.947956634806631e-05, - "loss": 0.2955, + "epoch": 0.47698850326161385, + "grad_norm": 0.20122404396533966, + "learning_rate": 4.944179167030193e-05, + "loss": 0.4641, "step": 13235 }, { - "epoch": 0.47, - "learning_rate": 4.947898795253321e-05, - "loss": 0.3057, + "epoch": 0.47716870292283853, + "grad_norm": 0.17638641595840454, + "learning_rate": 4.944117829210424e-05, + "loss": 0.4641, "step": 13240 }, { - "epoch": 0.47, - "learning_rate": 4.947840923915659e-05, - "loss": 0.315, + "epoch": 0.47734890258406315, + "grad_norm": 0.20242132246494293, + "learning_rate": 4.944056458090024e-05, + "loss": 0.4815, "step": 13245 }, { - "epoch": 0.47, - "learning_rate": 4.9477830207943944e-05, - "loss": 0.3163, + "epoch": 0.4775291022452878, + "grad_norm": 0.18638722598552704, + "learning_rate": 4.943995053669827e-05, + "loss": 0.4504, "step": 13250 }, { - "epoch": 0.47, - "learning_rate": 4.947725085890281e-05, - "loss": 0.3074, + "epoch": 0.4777093019065124, + "grad_norm": 0.16893352568149567, 
+ "learning_rate": 4.943933615950671e-05, + "loss": 0.4275, "step": 13255 }, { - "epoch": 0.47, - "learning_rate": 4.947667119204071e-05, - "loss": 0.3027, + "epoch": 0.4778895015677371, + "grad_norm": 0.16085629165172577, + "learning_rate": 4.9438721449333925e-05, + "loss": 0.4648, "step": 13260 }, { - "epoch": 0.47, - "learning_rate": 4.947609120736516e-05, - "loss": 0.3092, + "epoch": 0.4780697012289617, + "grad_norm": 0.1869862973690033, + "learning_rate": 4.943810640618829e-05, + "loss": 0.4415, "step": 13265 }, { - "epoch": 0.47, - "learning_rate": 4.94755109048837e-05, - "loss": 0.3223, + "epoch": 0.4782499008901863, + "grad_norm": 0.16547581553459167, + "learning_rate": 4.9437491030078185e-05, + "loss": 0.5078, "step": 13270 }, { - "epoch": 0.47, - "learning_rate": 4.947493028460386e-05, - "loss": 0.317, + "epoch": 0.47843010055141094, + "grad_norm": 0.19443146884441376, + "learning_rate": 4.9436875321012e-05, + "loss": 0.4237, "step": 13275 }, { - "epoch": 0.47, - "learning_rate": 4.9474349346533175e-05, - "loss": 0.3389, + "epoch": 0.4786103002126356, + "grad_norm": 0.1601037085056305, + "learning_rate": 4.943625927899812e-05, + "loss": 0.4692, "step": 13280 }, { - "epoch": 0.47, - "learning_rate": 4.947376809067921e-05, - "loss": 0.3291, + "epoch": 0.47879049987386024, + "grad_norm": 0.17408448457717896, + "learning_rate": 4.943564290404494e-05, + "loss": 0.4617, "step": 13285 }, { - "epoch": 0.47, - "learning_rate": 4.947318651704948e-05, - "loss": 0.278, + "epoch": 0.47897069953508487, + "grad_norm": 0.17495988309383392, + "learning_rate": 4.943502619616085e-05, + "loss": 0.4349, "step": 13290 }, { - "epoch": 0.47, - "learning_rate": 4.947260462565155e-05, - "loss": 0.3282, + "epoch": 0.4791508991963095, + "grad_norm": 0.18326228857040405, + "learning_rate": 4.9434409155354266e-05, + "loss": 0.4518, "step": 13295 }, { - "epoch": 0.47, - "learning_rate": 4.947202241649298e-05, - "loss": 0.3176, + "epoch": 0.47933109885753417, + "grad_norm": 0.14436647295951843, + "learning_rate": 4.9433791781633584e-05, + "loss": 0.4689, "step": 13300 }, { - "epoch": 0.47, - "learning_rate": 4.9471439889581325e-05, - "loss": 0.3368, + "epoch": 0.4795112985187588, + "grad_norm": 0.1709493100643158, + "learning_rate": 4.9433174075007216e-05, + "loss": 0.4747, "step": 13305 }, { - "epoch": 0.47, - "learning_rate": 4.947085704492416e-05, - "loss": 0.3488, + "epoch": 0.4796914981799834, + "grad_norm": 0.19501015543937683, + "learning_rate": 4.943255603548359e-05, + "loss": 0.4446, "step": 13310 }, { - "epoch": 0.47, - "learning_rate": 4.947027388252903e-05, - "loss": 0.3198, + "epoch": 0.47987169784120803, + "grad_norm": 0.16329129040241241, + "learning_rate": 4.943193766307111e-05, + "loss": 0.4545, "step": 13315 }, { - "epoch": 0.47, - "learning_rate": 4.9469690402403525e-05, - "loss": 0.3155, + "epoch": 0.4800518975024327, + "grad_norm": 0.16876693069934845, + "learning_rate": 4.9431318957778214e-05, + "loss": 0.4874, "step": 13320 }, { - "epoch": 0.47, - "learning_rate": 4.946910660455522e-05, - "loss": 0.3389, + "epoch": 0.48023209716365733, + "grad_norm": 0.18165095150470734, + "learning_rate": 4.943069991961333e-05, + "loss": 0.4349, "step": 13325 }, { - "epoch": 0.47, - "learning_rate": 4.946852248899169e-05, - "loss": 0.3143, + "epoch": 0.48041229682488196, + "grad_norm": 0.17460200190544128, + "learning_rate": 4.9430080548584884e-05, + "loss": 0.4678, "step": 13330 }, { - "epoch": 0.47, - "learning_rate": 4.946793805572052e-05, - "loss": 0.3292, + "epoch": 0.4805924964861066, + 
"grad_norm": 0.14548338949680328, + "learning_rate": 4.9429460844701325e-05, + "loss": 0.4717, "step": 13335 }, { - "epoch": 0.47, - "learning_rate": 4.94673533047493e-05, - "loss": 0.3163, + "epoch": 0.48077269614733126, + "grad_norm": 0.17974305152893066, + "learning_rate": 4.942884080797109e-05, + "loss": 0.4617, "step": 13340 }, { - "epoch": 0.47, - "learning_rate": 4.946676823608562e-05, - "loss": 0.325, + "epoch": 0.4809528958085559, + "grad_norm": 0.194953054189682, + "learning_rate": 4.942822043840262e-05, + "loss": 0.4724, "step": 13345 }, { - "epoch": 0.47, - "learning_rate": 4.946618284973708e-05, - "loss": 0.3158, + "epoch": 0.4811330954697805, + "grad_norm": 0.16658879816532135, + "learning_rate": 4.942759973600439e-05, + "loss": 0.4236, "step": 13350 }, { - "epoch": 0.47, - "learning_rate": 4.946559714571127e-05, - "loss": 0.3372, + "epoch": 0.4813132951310052, + "grad_norm": 0.18458372354507446, + "learning_rate": 4.9426978700784834e-05, + "loss": 0.4801, "step": 13355 }, { - "epoch": 0.47, - "learning_rate": 4.946501112401582e-05, - "loss": 0.3333, + "epoch": 0.4814934947922298, + "grad_norm": 0.18043503165245056, + "learning_rate": 4.942635733275243e-05, + "loss": 0.4592, "step": 13360 }, { - "epoch": 0.47, - "learning_rate": 4.9464424784658314e-05, - "loss": 0.3234, + "epoch": 0.4816736944534544, + "grad_norm": 0.2545734643936157, + "learning_rate": 4.942573563191563e-05, + "loss": 0.4656, "step": 13365 }, { - "epoch": 0.47, - "learning_rate": 4.946383812764638e-05, - "loss": 0.2952, + "epoch": 0.48185389411467905, + "grad_norm": 0.18358397483825684, + "learning_rate": 4.9425113598282916e-05, + "loss": 0.4401, "step": 13370 }, { - "epoch": 0.47, - "learning_rate": 4.946325115298763e-05, - "loss": 0.3085, + "epoch": 0.4820340937759037, + "grad_norm": 0.17365938425064087, + "learning_rate": 4.942449123186274e-05, + "loss": 0.4459, "step": 13375 }, { - "epoch": 0.47, - "learning_rate": 4.946266386068968e-05, - "loss": 0.2821, + "epoch": 0.48221429343712835, + "grad_norm": 0.15696145594120026, + "learning_rate": 4.942386853266362e-05, + "loss": 0.4618, "step": 13380 }, { - "epoch": 0.47, - "learning_rate": 4.946207625076017e-05, - "loss": 0.2997, + "epoch": 0.48239449309835297, + "grad_norm": 0.16444018483161926, + "learning_rate": 4.942324550069402e-05, + "loss": 0.4885, "step": 13385 }, { - "epoch": 0.47, - "learning_rate": 4.946148832320672e-05, - "loss": 0.3096, + "epoch": 0.4825746927595776, + "grad_norm": 0.14012210071086884, + "learning_rate": 4.942262213596241e-05, + "loss": 0.4398, "step": 13390 }, { - "epoch": 0.47, - "learning_rate": 4.9460900078036955e-05, - "loss": 0.3095, + "epoch": 0.4827548924208023, + "grad_norm": 0.23050503432750702, + "learning_rate": 4.942199843847732e-05, + "loss": 0.4899, "step": 13395 }, { - "epoch": 0.47, - "learning_rate": 4.9460311515258526e-05, - "loss": 0.3151, + "epoch": 0.4829350920820269, + "grad_norm": 0.15428675711154938, + "learning_rate": 4.942137440824723e-05, + "loss": 0.4707, "step": 13400 }, { - "epoch": 0.47, - "learning_rate": 4.945972263487908e-05, - "loss": 0.341, + "epoch": 0.4831152917432515, + "grad_norm": 0.2107590287923813, + "learning_rate": 4.9420750045280636e-05, + "loss": 0.4507, "step": 13405 }, { - "epoch": 0.47, - "learning_rate": 4.945913343690625e-05, - "loss": 0.3015, + "epoch": 0.48329549140447614, + "grad_norm": 0.1626407951116562, + "learning_rate": 4.942012534958605e-05, + "loss": 0.4398, "step": 13410 }, { - "epoch": 0.47, - "learning_rate": 4.9458543921347686e-05, - "loss": 0.339, + "epoch": 
0.4834756910657008, + "grad_norm": 0.14003750681877136, + "learning_rate": 4.9419500321171987e-05, + "loss": 0.4246, "step": 13415 }, { - "epoch": 0.47, - "learning_rate": 4.9457954088211055e-05, - "loss": 0.3195, + "epoch": 0.48365589072692544, + "grad_norm": 0.16057373583316803, + "learning_rate": 4.9418874960046954e-05, + "loss": 0.4636, "step": 13420 }, { - "epoch": 0.47, - "learning_rate": 4.945736393750401e-05, - "loss": 0.3124, + "epoch": 0.48383609038815006, + "grad_norm": 0.1851695328950882, + "learning_rate": 4.941824926621948e-05, + "loss": 0.4292, "step": 13425 }, { - "epoch": 0.47, - "learning_rate": 4.9456773469234206e-05, - "loss": 0.3075, + "epoch": 0.4840162900493747, + "grad_norm": 0.1717042773962021, + "learning_rate": 4.941762323969809e-05, + "loss": 0.4782, "step": 13430 }, { - "epoch": 0.47, - "learning_rate": 4.9456182683409316e-05, - "loss": 0.3203, + "epoch": 0.48419648971059936, + "grad_norm": 0.16300024092197418, + "learning_rate": 4.9416996880491305e-05, + "loss": 0.4498, "step": 13435 }, { - "epoch": 0.47, - "learning_rate": 4.945559158003702e-05, - "loss": 0.3248, + "epoch": 0.484376689371824, + "grad_norm": 0.1653520166873932, + "learning_rate": 4.941637018860767e-05, + "loss": 0.417, "step": 13440 }, { - "epoch": 0.47, - "learning_rate": 4.945500015912498e-05, - "loss": 0.3142, + "epoch": 0.4845568890330486, + "grad_norm": 0.14384515583515167, + "learning_rate": 4.941574316405572e-05, + "loss": 0.4253, "step": 13445 }, { - "epoch": 0.47, - "learning_rate": 4.945440842068087e-05, - "loss": 0.3309, + "epoch": 0.48473708869427323, + "grad_norm": 0.18884818255901337, + "learning_rate": 4.9415115806843993e-05, + "loss": 0.4521, "step": 13450 }, { - "epoch": 0.47, - "learning_rate": 4.94538163647124e-05, - "loss": 0.3139, + "epoch": 0.4849172883554979, + "grad_norm": 0.14921367168426514, + "learning_rate": 4.941448811698104e-05, + "loss": 0.4797, "step": 13455 }, { - "epoch": 0.47, - "learning_rate": 4.945322399122723e-05, - "loss": 0.2987, + "epoch": 0.48509748801672253, + "grad_norm": 0.16940531134605408, + "learning_rate": 4.9413860094475414e-05, + "loss": 0.4575, "step": 13460 }, { - "epoch": 0.47, - "learning_rate": 4.945263130023306e-05, - "loss": 0.3205, + "epoch": 0.48527768767794716, + "grad_norm": 0.23463140428066254, + "learning_rate": 4.9413231739335664e-05, + "loss": 0.4568, "step": 13465 }, { - "epoch": 0.47, - "learning_rate": 4.945203829173759e-05, - "loss": 0.3159, + "epoch": 0.4854578873391718, + "grad_norm": 0.14103110134601593, + "learning_rate": 4.9412603051570364e-05, + "loss": 0.4384, "step": 13470 }, { - "epoch": 0.47, - "learning_rate": 4.945144496574852e-05, - "loss": 0.3238, + "epoch": 0.48563808700039646, + "grad_norm": 0.1720830202102661, + "learning_rate": 4.941197403118808e-05, + "loss": 0.4834, "step": 13475 }, { - "epoch": 0.47, - "learning_rate": 4.945085132227354e-05, - "loss": 0.3118, + "epoch": 0.4858182866616211, + "grad_norm": 0.1815214455127716, + "learning_rate": 4.9411344678197366e-05, + "loss": 0.4557, "step": 13480 }, { - "epoch": 0.47, - "learning_rate": 4.945025736132038e-05, - "loss": 0.3195, + "epoch": 0.4859984863228457, + "grad_norm": 0.18314550817012787, + "learning_rate": 4.941071499260681e-05, + "loss": 0.4589, "step": 13485 }, { - "epoch": 0.47, - "learning_rate": 4.9449663082896744e-05, - "loss": 0.2916, + "epoch": 0.4861786859840703, + "grad_norm": 0.1780148446559906, + "learning_rate": 4.9410084974424994e-05, + "loss": 0.4338, "step": 13490 }, { - "epoch": 0.47, - "learning_rate": 4.944906848701034e-05, - 
"loss": 0.3271, + "epoch": 0.486358885645295, + "grad_norm": 0.17320701479911804, + "learning_rate": 4.940945462366049e-05, + "loss": 0.4356, "step": 13495 }, { - "epoch": 0.47, - "learning_rate": 4.94484735736689e-05, - "loss": 0.3491, + "epoch": 0.4865390853065196, + "grad_norm": 0.1759449988603592, + "learning_rate": 4.94088239403219e-05, + "loss": 0.4147, "step": 13500 }, { - "epoch": 0.47, - "eval_loss": 0.31191739439964294, - "eval_runtime": 10.5398, - "eval_samples_per_second": 9.488, - "eval_steps_per_second": 9.488, + "epoch": 0.4865390853065196, + "eval_loss": 0.47342750430107117, + "eval_runtime": 3.5547, + "eval_samples_per_second": 28.131, + "eval_steps_per_second": 7.033, "step": 13500 }, { - "epoch": 0.48, - "learning_rate": 4.9447878342880134e-05, - "loss": 0.3164, + "epoch": 0.48671928496774425, + "grad_norm": 0.1460382640361786, + "learning_rate": 4.94081929244178e-05, + "loss": 0.4134, "step": 13505 }, { - "epoch": 0.48, - "learning_rate": 4.944728279465179e-05, - "loss": 0.31, + "epoch": 0.48689948462896887, + "grad_norm": 0.20045733451843262, + "learning_rate": 4.94075615759568e-05, + "loss": 0.4641, "step": 13510 }, { - "epoch": 0.48, - "learning_rate": 4.9446686928991574e-05, - "loss": 0.2992, + "epoch": 0.48707968429019355, + "grad_norm": 0.1686198115348816, + "learning_rate": 4.94069298949475e-05, + "loss": 0.4628, "step": 13515 }, { - "epoch": 0.48, - "learning_rate": 4.944609074590725e-05, - "loss": 0.3281, + "epoch": 0.48725988395141817, + "grad_norm": 0.16252323985099792, + "learning_rate": 4.9406297881398504e-05, + "loss": 0.4763, "step": 13520 }, { - "epoch": 0.48, - "learning_rate": 4.944549424540655e-05, - "loss": 0.3215, + "epoch": 0.4874400836126428, + "grad_norm": 0.2166275978088379, + "learning_rate": 4.940566553531843e-05, + "loss": 0.4497, "step": 13525 }, { - "epoch": 0.48, - "learning_rate": 4.944489742749721e-05, - "loss": 0.3348, + "epoch": 0.48762028327386747, + "grad_norm": 0.22283247113227844, + "learning_rate": 4.940503285671588e-05, + "loss": 0.465, "step": 13530 }, { - "epoch": 0.48, - "learning_rate": 4.9444300292186994e-05, - "loss": 0.323, + "epoch": 0.4878004829350921, + "grad_norm": 0.11601988226175308, + "learning_rate": 4.940439984559949e-05, + "loss": 0.3932, "step": 13535 }, { - "epoch": 0.48, - "learning_rate": 4.944370283948365e-05, - "loss": 0.3194, + "epoch": 0.4879806825963167, + "grad_norm": 0.17983366549015045, + "learning_rate": 4.940376650197787e-05, + "loss": 0.4426, "step": 13540 }, { - "epoch": 0.48, - "learning_rate": 4.9443105069394924e-05, - "loss": 0.34, + "epoch": 0.48816088225754134, + "grad_norm": 0.2088562697172165, + "learning_rate": 4.940313282585967e-05, + "loss": 0.4687, "step": 13545 }, { - "epoch": 0.48, - "learning_rate": 4.944250698192859e-05, - "loss": 0.3085, + "epoch": 0.488341081918766, + "grad_norm": 0.17913992702960968, + "learning_rate": 4.940249881725349e-05, + "loss": 0.4504, "step": 13550 }, { - "epoch": 0.48, - "learning_rate": 4.944190857709242e-05, - "loss": 0.314, + "epoch": 0.48852128157999064, + "grad_norm": 0.18297719955444336, + "learning_rate": 4.9401864476168e-05, + "loss": 0.4756, "step": 13555 }, { - "epoch": 0.48, - "learning_rate": 4.944130985489417e-05, - "loss": 0.3261, + "epoch": 0.48870148124121526, + "grad_norm": 0.18946805596351624, + "learning_rate": 4.9401229802611826e-05, + "loss": 0.4461, "step": 13560 }, { - "epoch": 0.48, - "learning_rate": 4.944071081534161e-05, - "loss": 0.3415, + "epoch": 0.4888816809024399, + "grad_norm": 0.19642286002635956, + "learning_rate": 
4.9400594796593626e-05, + "loss": 0.4813, "step": 13565 }, { - "epoch": 0.48, - "learning_rate": 4.944011145844254e-05, - "loss": 0.3244, + "epoch": 0.48906188056366456, + "grad_norm": 0.21718788146972656, + "learning_rate": 4.940008655241221e-05, + "loss": 0.4587, "step": 13570 }, { - "epoch": 0.48, - "learning_rate": 4.9439511784204715e-05, - "loss": 0.3109, + "epoch": 0.4892420802248892, + "grad_norm": 0.2410500943660736, + "learning_rate": 4.939945094798416e-05, + "loss": 0.4804, "step": 13575 }, { - "epoch": 0.48, - "learning_rate": 4.943891179263595e-05, - "loss": 0.3116, + "epoch": 0.4894222798861138, + "grad_norm": 0.1772763729095459, + "learning_rate": 4.93988150111183e-05, + "loss": 0.4455, "step": 13580 }, { - "epoch": 0.48, - "learning_rate": 4.9438311483744013e-05, - "loss": 0.3322, + "epoch": 0.48960247954733843, + "grad_norm": 0.16895850002765656, + "learning_rate": 4.939817874182333e-05, + "loss": 0.4365, "step": 13585 }, { - "epoch": 0.48, - "learning_rate": 4.9437710857536715e-05, - "loss": 0.3271, + "epoch": 0.4897826792085631, + "grad_norm": 0.2072199136018753, + "learning_rate": 4.939754214010788e-05, + "loss": 0.4403, "step": 13590 }, { - "epoch": 0.48, - "learning_rate": 4.943710991402184e-05, - "loss": 0.3172, + "epoch": 0.48996287886978773, + "grad_norm": 0.16987258195877075, + "learning_rate": 4.939690520598065e-05, + "loss": 0.449, "step": 13595 }, { - "epoch": 0.48, - "learning_rate": 4.94365086532072e-05, - "loss": 0.2977, + "epoch": 0.49014307853101236, + "grad_norm": 0.16706013679504395, + "learning_rate": 4.9396267939450316e-05, + "loss": 0.4429, "step": 13600 }, { - "epoch": 0.48, - "learning_rate": 4.9435907075100595e-05, - "loss": 0.2983, + "epoch": 0.490323278192237, + "grad_norm": 0.15251483023166656, + "learning_rate": 4.939563034052555e-05, + "loss": 0.4386, "step": 13605 }, { - "epoch": 0.48, - "learning_rate": 4.943530517970985e-05, - "loss": 0.3503, + "epoch": 0.49050347785346166, + "grad_norm": 0.17916177213191986, + "learning_rate": 4.9394992409215036e-05, + "loss": 0.4344, "step": 13610 }, { - "epoch": 0.48, - "learning_rate": 4.943470296704276e-05, - "loss": 0.326, + "epoch": 0.4906836775146863, + "grad_norm": 0.13827458024024963, + "learning_rate": 4.939435414552748e-05, + "loss": 0.4667, "step": 13615 }, { - "epoch": 0.48, - "learning_rate": 4.943410043710715e-05, - "loss": 0.3147, + "epoch": 0.4908638771759109, + "grad_norm": 0.1721857637166977, + "learning_rate": 4.939371554947156e-05, + "loss": 0.4614, "step": 13620 }, { - "epoch": 0.48, - "learning_rate": 4.9433497589910864e-05, - "loss": 0.3018, + "epoch": 0.4910440768371355, + "grad_norm": 0.19055204093456268, + "learning_rate": 4.9393076621056e-05, + "loss": 0.4408, "step": 13625 }, { - "epoch": 0.48, - "learning_rate": 4.9432894425461704e-05, - "loss": 0.3277, + "epoch": 0.4912242764983602, + "grad_norm": 0.1874268501996994, + "learning_rate": 4.93924373602895e-05, + "loss": 0.4647, "step": 13630 }, { - "epoch": 0.48, - "learning_rate": 4.943229094376752e-05, - "loss": 0.3264, + "epoch": 0.4914044761595848, + "grad_norm": 0.1848086565732956, + "learning_rate": 4.9391797767180755e-05, + "loss": 0.4801, "step": 13635 }, { - "epoch": 0.48, - "learning_rate": 4.9431687144836136e-05, - "loss": 0.3253, + "epoch": 0.49158467582080945, + "grad_norm": 0.1567952185869217, + "learning_rate": 4.939115784173849e-05, + "loss": 0.4194, "step": 13640 }, { - "epoch": 0.48, - "learning_rate": 4.9431083028675394e-05, - "loss": 0.3422, + "epoch": 0.49176487548203407, + "grad_norm": 0.16139036417007446, + 
"learning_rate": 4.9390517583971416e-05, + "loss": 0.473, "step": 13645 }, { - "epoch": 0.48, - "learning_rate": 4.9430478595293136e-05, - "loss": 0.3188, + "epoch": 0.49194507514325875, + "grad_norm": 0.14114044606685638, + "learning_rate": 4.938987699388827e-05, + "loss": 0.4326, "step": 13650 }, { - "epoch": 0.48, - "learning_rate": 4.942987384469723e-05, - "loss": 0.3083, + "epoch": 0.49212527480448337, + "grad_norm": 0.20750269293785095, + "learning_rate": 4.938923607149777e-05, + "loss": 0.4792, "step": 13655 }, { - "epoch": 0.48, - "learning_rate": 4.942926877689549e-05, - "loss": 0.3319, + "epoch": 0.492305474465708, + "grad_norm": 0.1520867943763733, + "learning_rate": 4.938859481680865e-05, + "loss": 0.453, "step": 13660 }, { - "epoch": 0.48, - "learning_rate": 4.942866339189581e-05, - "loss": 0.2984, + "epoch": 0.4924856741269326, + "grad_norm": 0.17390841245651245, + "learning_rate": 4.9387953229829644e-05, + "loss": 0.4304, "step": 13665 }, { - "epoch": 0.48, - "learning_rate": 4.942805768970603e-05, - "loss": 0.2981, + "epoch": 0.4926658737881573, + "grad_norm": 0.14850212633609772, + "learning_rate": 4.938731131056949e-05, + "loss": 0.4328, "step": 13670 }, { - "epoch": 0.48, - "learning_rate": 4.9427451670334014e-05, - "loss": 0.3058, + "epoch": 0.4928460734493819, + "grad_norm": 0.15505053102970123, + "learning_rate": 4.938666905903696e-05, + "loss": 0.4195, "step": 13675 }, { - "epoch": 0.48, - "learning_rate": 4.942684533378764e-05, - "loss": 0.3059, + "epoch": 0.49302627311060654, + "grad_norm": 0.1582787036895752, + "learning_rate": 4.938602647524077e-05, + "loss": 0.4119, "step": 13680 }, { - "epoch": 0.48, - "learning_rate": 4.942623868007478e-05, - "loss": 0.3228, + "epoch": 0.49320647277183116, + "grad_norm": 0.2123645544052124, + "learning_rate": 4.93853835591897e-05, + "loss": 0.4687, "step": 13685 }, { - "epoch": 0.48, - "learning_rate": 4.9425631709203314e-05, - "loss": 0.3068, + "epoch": 0.49338667243305584, + "grad_norm": 0.19299419224262238, + "learning_rate": 4.93847403108925e-05, + "loss": 0.4948, "step": 13690 }, { - "epoch": 0.48, - "learning_rate": 4.94250244211811e-05, - "loss": 0.3145, + "epoch": 0.49356687209428046, + "grad_norm": 0.19016866385936737, + "learning_rate": 4.938409673035793e-05, + "loss": 0.4477, "step": 13695 }, { - "epoch": 0.48, - "learning_rate": 4.9424416816016055e-05, - "loss": 0.3242, + "epoch": 0.4937470717555051, + "grad_norm": 0.1360589861869812, + "learning_rate": 4.938345281759476e-05, + "loss": 0.4734, "step": 13700 }, { - "epoch": 0.48, - "learning_rate": 4.942380889371605e-05, - "loss": 0.3058, + "epoch": 0.49392727141672976, + "grad_norm": 0.15430140495300293, + "learning_rate": 4.9382808572611775e-05, + "loss": 0.4432, "step": 13705 }, { - "epoch": 0.48, - "learning_rate": 4.942320065428898e-05, - "loss": 0.3208, + "epoch": 0.4941074710779544, + "grad_norm": 0.17401854693889618, + "learning_rate": 4.938216399541773e-05, + "loss": 0.4308, "step": 13710 }, { - "epoch": 0.48, - "learning_rate": 4.9422592097742745e-05, - "loss": 0.332, + "epoch": 0.494287670739179, + "grad_norm": 0.17819242179393768, + "learning_rate": 4.9381519086021434e-05, + "loss": 0.449, "step": 13715 }, { - "epoch": 0.48, - "learning_rate": 4.942198322408526e-05, - "loss": 0.3246, + "epoch": 0.49446787040040363, + "grad_norm": 0.1599666327238083, + "learning_rate": 4.9380873844431654e-05, + "loss": 0.4336, "step": 13720 }, { - "epoch": 0.48, - "learning_rate": 4.94213740333244e-05, - "loss": 0.324, + "epoch": 0.4946480700616283, + "grad_norm": 
0.13881821930408478, + "learning_rate": 4.938022827065719e-05, + "loss": 0.4488, "step": 13725 }, { - "epoch": 0.48, - "learning_rate": 4.94207645254681e-05, - "loss": 0.3244, + "epoch": 0.49482826972285293, + "grad_norm": 0.1781865805387497, + "learning_rate": 4.937958236470684e-05, + "loss": 0.4461, "step": 13730 }, { - "epoch": 0.48, - "learning_rate": 4.942015470052426e-05, - "loss": 0.2947, + "epoch": 0.49500846938407755, + "grad_norm": 0.1789911985397339, + "learning_rate": 4.93789361265894e-05, + "loss": 0.4537, "step": 13735 }, { - "epoch": 0.48, - "learning_rate": 4.9419544558500813e-05, - "loss": 0.331, + "epoch": 0.4951886690453022, + "grad_norm": 0.1587119698524475, + "learning_rate": 4.9378289556313673e-05, + "loss": 0.4544, "step": 13740 }, { - "epoch": 0.48, - "learning_rate": 4.941893409940567e-05, - "loss": 0.3408, + "epoch": 0.49536886870652685, + "grad_norm": 0.1573282778263092, + "learning_rate": 4.9377642653888464e-05, + "loss": 0.396, "step": 13745 }, { - "epoch": 0.48, - "learning_rate": 4.9418323323246764e-05, - "loss": 0.3019, + "epoch": 0.4955490683677515, + "grad_norm": 0.13113892078399658, + "learning_rate": 4.93769954193226e-05, + "loss": 0.4652, "step": 13750 }, { - "epoch": 0.48, - "learning_rate": 4.9417712230032016e-05, - "loss": 0.3216, + "epoch": 0.4957292680289761, + "grad_norm": 0.1477142572402954, + "learning_rate": 4.9376347852624895e-05, + "loss": 0.4209, "step": 13755 }, { - "epoch": 0.48, - "learning_rate": 4.941710081976937e-05, - "loss": 0.3146, + "epoch": 0.4959094676902007, + "grad_norm": 0.1767813116312027, + "learning_rate": 4.937569995380417e-05, + "loss": 0.4656, "step": 13760 }, { - "epoch": 0.48, - "learning_rate": 4.941648909246676e-05, - "loss": 0.2873, + "epoch": 0.4960896673514254, + "grad_norm": 0.14544841647148132, + "learning_rate": 4.937505172286925e-05, + "loss": 0.453, "step": 13765 }, { - "epoch": 0.48, - "learning_rate": 4.941587704813213e-05, - "loss": 0.3238, + "epoch": 0.49626986701265, + "grad_norm": 0.16864298284053802, + "learning_rate": 4.9374403159828965e-05, + "loss": 0.4242, "step": 13770 }, { - "epoch": 0.48, - "learning_rate": 4.941526468677343e-05, - "loss": 0.3233, + "epoch": 0.49645006667387465, + "grad_norm": 0.16903114318847656, + "learning_rate": 4.9373754264692164e-05, + "loss": 0.4519, "step": 13775 }, { - "epoch": 0.48, - "learning_rate": 4.9414652008398605e-05, - "loss": 0.3045, + "epoch": 0.49663026633509927, + "grad_norm": 0.21901170909404755, + "learning_rate": 4.937310503746767e-05, + "loss": 0.4414, "step": 13780 }, { - "epoch": 0.48, - "learning_rate": 4.9414039013015615e-05, - "loss": 0.3186, + "epoch": 0.49681046599632395, + "grad_norm": 0.18153007328510284, + "learning_rate": 4.937245547816435e-05, + "loss": 0.4262, "step": 13785 }, { - "epoch": 0.49, - "learning_rate": 4.9413425700632415e-05, - "loss": 0.3047, + "epoch": 0.49699066565754857, + "grad_norm": 0.16006413102149963, + "learning_rate": 4.937180558679104e-05, + "loss": 0.4462, "step": 13790 }, { - "epoch": 0.49, - "learning_rate": 4.9412812071256974e-05, - "loss": 0.3117, + "epoch": 0.4971708653187732, + "grad_norm": 0.17855258285999298, + "learning_rate": 4.937115536335659e-05, + "loss": 0.4327, "step": 13795 }, { - "epoch": 0.49, - "learning_rate": 4.941219812489726e-05, - "loss": 0.3269, + "epoch": 0.4973510649799978, + "grad_norm": 0.1678069531917572, + "learning_rate": 4.937050480786987e-05, + "loss": 0.4655, "step": 13800 }, { - "epoch": 0.49, - "learning_rate": 4.9411583861561236e-05, - "loss": 0.303, + "epoch": 
0.4975312646412225, + "grad_norm": 0.13157707452774048, + "learning_rate": 4.936985392033975e-05, + "loss": 0.4951, "step": 13805 }, { - "epoch": 0.49, - "learning_rate": 4.941096928125688e-05, - "loss": 0.3261, + "epoch": 0.4977114643024471, + "grad_norm": 0.20059573650360107, + "learning_rate": 4.936920270077508e-05, + "loss": 0.456, "step": 13810 }, { - "epoch": 0.49, - "learning_rate": 4.9410354383992186e-05, - "loss": 0.2925, + "epoch": 0.49789166396367174, + "grad_norm": 0.13859055936336517, + "learning_rate": 4.936855114918474e-05, + "loss": 0.4359, "step": 13815 }, { - "epoch": 0.49, - "learning_rate": 4.940973916977512e-05, - "loss": 0.3165, + "epoch": 0.49807186362489636, + "grad_norm": 0.165902242064476, + "learning_rate": 4.936789926557761e-05, + "loss": 0.4371, "step": 13820 }, { - "epoch": 0.49, - "learning_rate": 4.940912363861368e-05, - "loss": 0.3106, + "epoch": 0.49825206328612104, + "grad_norm": 0.1760404258966446, + "learning_rate": 4.936724704996257e-05, + "loss": 0.4465, "step": 13825 }, { - "epoch": 0.49, - "learning_rate": 4.940850779051586e-05, - "loss": 0.3322, + "epoch": 0.49843226294734566, + "grad_norm": 0.16966862976551056, + "learning_rate": 4.936659450234851e-05, + "loss": 0.4454, "step": 13830 }, { - "epoch": 0.49, - "learning_rate": 4.9407891625489644e-05, - "loss": 0.3183, + "epoch": 0.4986124626085703, + "grad_norm": 0.16548436880111694, + "learning_rate": 4.936594162274431e-05, + "loss": 0.4679, "step": 13835 }, { - "epoch": 0.49, - "learning_rate": 4.9407275143543044e-05, - "loss": 0.3155, + "epoch": 0.4987926622697949, + "grad_norm": 0.19853737950325012, + "learning_rate": 4.936528841115887e-05, + "loss": 0.4545, "step": 13840 }, { - "epoch": 0.49, - "learning_rate": 4.940665834468407e-05, - "loss": 0.3208, + "epoch": 0.4989728619310196, + "grad_norm": 0.19320517778396606, + "learning_rate": 4.936463486760111e-05, + "loss": 0.4758, "step": 13845 }, { - "epoch": 0.49, - "learning_rate": 4.940604122892071e-05, - "loss": 0.306, + "epoch": 0.4991530615922442, + "grad_norm": 0.2133074700832367, + "learning_rate": 4.936398099207991e-05, + "loss": 0.4675, "step": 13850 }, { - "epoch": 0.49, - "learning_rate": 4.940542379626099e-05, - "loss": 0.3462, + "epoch": 0.49933326125346883, + "grad_norm": 0.16722871363162994, + "learning_rate": 4.936332678460417e-05, + "loss": 0.4416, "step": 13855 }, { - "epoch": 0.49, - "learning_rate": 4.9404806046712927e-05, - "loss": 0.3338, + "epoch": 0.4995134609146935, + "grad_norm": 0.1773030161857605, + "learning_rate": 4.936267224518284e-05, + "loss": 0.475, "step": 13860 }, { - "epoch": 0.49, - "learning_rate": 4.940418798028455e-05, - "loss": 0.3077, + "epoch": 0.49969366057591813, + "grad_norm": 0.13764159381389618, + "learning_rate": 4.936201737382481e-05, + "loss": 0.4597, "step": 13865 }, { - "epoch": 0.49, - "learning_rate": 4.940356959698386e-05, - "loss": 0.3194, + "epoch": 0.49987386023714275, + "grad_norm": 0.19111090898513794, + "learning_rate": 4.9361362170539006e-05, + "loss": 0.4601, "step": 13870 }, { - "epoch": 0.49, - "learning_rate": 4.9402950896818906e-05, - "loss": 0.2823, + "epoch": 0.5000540598983674, + "grad_norm": 0.13824209570884705, + "learning_rate": 4.936070663533436e-05, + "loss": 0.4103, "step": 13875 }, { - "epoch": 0.49, - "learning_rate": 4.940233187979772e-05, - "loss": 0.3186, + "epoch": 0.500234259559592, + "grad_norm": 0.17304690182209015, + "learning_rate": 4.936005076821981e-05, + "loss": 0.4446, "step": 13880 }, { - "epoch": 0.49, - "learning_rate": 4.940171254592834e-05, - "loss": 
0.3073, + "epoch": 0.5004144592208166, + "grad_norm": 0.1318497657775879, + "learning_rate": 4.9359394569204274e-05, + "loss": 0.4341, "step": 13885 }, { - "epoch": 0.49, - "learning_rate": 4.940109289521879e-05, - "loss": 0.3308, + "epoch": 0.5005946588820414, + "grad_norm": 0.1591554433107376, + "learning_rate": 4.9358738038296714e-05, + "loss": 0.4414, "step": 13890 }, { - "epoch": 0.49, - "learning_rate": 4.9400472927677144e-05, - "loss": 0.284, + "epoch": 0.500774858543266, + "grad_norm": 0.18022438883781433, + "learning_rate": 4.935808117550605e-05, + "loss": 0.4741, "step": 13895 }, { - "epoch": 0.49, - "learning_rate": 4.939985264331143e-05, - "loss": 0.3212, + "epoch": 0.5009550582044906, + "grad_norm": 0.16793961822986603, + "learning_rate": 4.935742398084127e-05, + "loss": 0.4567, "step": 13900 }, { - "epoch": 0.49, - "learning_rate": 4.939923204212972e-05, - "loss": 0.316, + "epoch": 0.5011352578657152, + "grad_norm": 0.13464850187301636, + "learning_rate": 4.935676645431128e-05, + "loss": 0.447, "step": 13905 }, { - "epoch": 0.49, - "learning_rate": 4.9398611124140046e-05, - "loss": 0.2989, + "epoch": 0.5013154575269398, + "grad_norm": 0.1593722254037857, + "learning_rate": 4.935610859592508e-05, + "loss": 0.425, "step": 13910 }, { - "epoch": 0.49, - "learning_rate": 4.939798988935049e-05, - "loss": 0.3339, + "epoch": 0.5014956571881645, + "grad_norm": 0.1893470287322998, + "learning_rate": 4.93554504056916e-05, + "loss": 0.4611, "step": 13915 }, { - "epoch": 0.49, - "learning_rate": 4.939736833776912e-05, - "loss": 0.3322, + "epoch": 0.5016758568493891, + "grad_norm": 0.17857231199741364, + "learning_rate": 4.935479188361983e-05, + "loss": 0.4438, "step": 13920 }, { - "epoch": 0.49, - "learning_rate": 4.9396746469403996e-05, - "loss": 0.3308, + "epoch": 0.5018560565106137, + "grad_norm": 0.15780913829803467, + "learning_rate": 4.935413302971874e-05, + "loss": 0.4474, "step": 13925 }, { - "epoch": 0.49, - "learning_rate": 4.93961242842632e-05, - "loss": 0.3017, + "epoch": 0.5020362561718384, + "grad_norm": 0.17613886296749115, + "learning_rate": 4.93534738439973e-05, + "loss": 0.4686, "step": 13930 }, { - "epoch": 0.49, - "learning_rate": 4.9395501782354813e-05, - "loss": 0.3068, + "epoch": 0.5022164558330631, + "grad_norm": 0.16873963177204132, + "learning_rate": 4.9352814326464493e-05, + "loss": 0.4534, "step": 13935 }, { - "epoch": 0.49, - "learning_rate": 4.939487896368691e-05, - "loss": 0.3179, + "epoch": 0.5023966554942877, + "grad_norm": 0.1617756485939026, + "learning_rate": 4.93521544771293e-05, + "loss": 0.4431, "step": 13940 }, { - "epoch": 0.49, - "learning_rate": 4.939425582826757e-05, - "loss": 0.3163, + "epoch": 0.5025768551555123, + "grad_norm": 0.1517868936061859, + "learning_rate": 4.9351494296000726e-05, + "loss": 0.4691, "step": 13945 }, { - "epoch": 0.49, - "learning_rate": 4.9393632376104904e-05, - "loss": 0.3401, + "epoch": 0.5027570548167369, + "grad_norm": 0.18268106877803802, + "learning_rate": 4.935083378308776e-05, + "loss": 0.4257, "step": 13950 }, { - "epoch": 0.49, - "learning_rate": 4.9393008607206995e-05, - "loss": 0.32, + "epoch": 0.5029372544779616, + "grad_norm": 0.16596758365631104, + "learning_rate": 4.935017293839939e-05, + "loss": 0.408, "step": 13955 }, { - "epoch": 0.49, - "learning_rate": 4.9392384521581945e-05, - "loss": 0.3022, + "epoch": 0.5031174541391862, + "grad_norm": 0.2062714397907257, + "learning_rate": 4.934951176194462e-05, + "loss": 0.4706, "step": 13960 }, { - "epoch": 0.49, - "learning_rate": 4.939176011923786e-05, - 
"loss": 0.3167, + "epoch": 0.5032976538004108, + "grad_norm": 0.1877586394548416, + "learning_rate": 4.934885025373248e-05, + "loss": 0.4726, "step": 13965 }, { - "epoch": 0.49, - "learning_rate": 4.939113540018284e-05, - "loss": 0.3418, + "epoch": 0.5034778534616355, + "grad_norm": 0.17197106778621674, + "learning_rate": 4.9348188413771966e-05, + "loss": 0.4465, "step": 13970 }, { - "epoch": 0.49, - "learning_rate": 4.9390510364425004e-05, - "loss": 0.3078, + "epoch": 0.5036580531228602, + "grad_norm": 0.16831296682357788, + "learning_rate": 4.93475262420721e-05, + "loss": 0.489, "step": 13975 }, { - "epoch": 0.49, - "learning_rate": 4.9389885011972464e-05, - "loss": 0.3185, + "epoch": 0.5038382527840848, + "grad_norm": 0.18322400748729706, + "learning_rate": 4.93468637386419e-05, + "loss": 0.4343, "step": 13980 }, { - "epoch": 0.49, - "learning_rate": 4.938925934283334e-05, - "loss": 0.3481, + "epoch": 0.5040184524453094, + "grad_norm": 0.21650521457195282, + "learning_rate": 4.93462009034904e-05, + "loss": 0.4735, "step": 13985 }, { - "epoch": 0.49, - "learning_rate": 4.9388633357015755e-05, - "loss": 0.317, + "epoch": 0.504198652106534, + "grad_norm": 0.17821024358272552, + "learning_rate": 4.9345537736626626e-05, + "loss": 0.4375, "step": 13990 }, { - "epoch": 0.49, - "learning_rate": 4.938800705452784e-05, - "loss": 0.3323, + "epoch": 0.5043788517677587, + "grad_norm": 0.15985864400863647, + "learning_rate": 4.934487423805961e-05, + "loss": 0.4419, "step": 13995 }, { - "epoch": 0.49, - "learning_rate": 4.9387380435377726e-05, - "loss": 0.316, + "epoch": 0.5045590514289833, + "grad_norm": 0.1555582582950592, + "learning_rate": 4.93442104077984e-05, + "loss": 0.4606, "step": 14000 }, { - "epoch": 0.49, - "eval_loss": 0.3117991089820862, - "eval_runtime": 10.538, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 0.5045590514289833, + "eval_loss": 0.4720987379550934, + "eval_runtime": 3.5871, + "eval_samples_per_second": 27.878, + "eval_steps_per_second": 6.969, "step": 14000 }, { - "epoch": 0.49, - "learning_rate": 4.938675349957356e-05, - "loss": 0.3099, + "epoch": 0.5047392510902079, + "grad_norm": 0.1948181539773941, + "learning_rate": 4.934354624585202e-05, + "loss": 0.4392, "step": 14005 }, { - "epoch": 0.49, - "learning_rate": 4.938612624712346e-05, - "loss": 0.3418, + "epoch": 0.5049194507514326, + "grad_norm": 0.1451338827610016, + "learning_rate": 4.934288175222955e-05, + "loss": 0.4184, "step": 14010 }, { - "epoch": 0.49, - "learning_rate": 4.938549867803558e-05, - "loss": 0.3279, + "epoch": 0.5050996504126573, + "grad_norm": 0.1611844301223755, + "learning_rate": 4.934221692694003e-05, + "loss": 0.4428, "step": 14015 }, { - "epoch": 0.49, - "learning_rate": 4.9384870792318075e-05, - "loss": 0.3224, + "epoch": 0.5052798500738819, + "grad_norm": 0.19232018291950226, + "learning_rate": 4.934155176999252e-05, + "loss": 0.4512, "step": 14020 }, { - "epoch": 0.49, - "learning_rate": 4.938424258997909e-05, - "loss": 0.3266, + "epoch": 0.5054600497351065, + "grad_norm": 0.24862439930438995, + "learning_rate": 4.934088628139607e-05, + "loss": 0.4715, "step": 14025 }, { - "epoch": 0.49, - "learning_rate": 4.9383614071026787e-05, - "loss": 0.2839, + "epoch": 0.5056402493963311, + "grad_norm": 0.15161451697349548, + "learning_rate": 4.9340220461159757e-05, + "loss": 0.4179, "step": 14030 }, { - "epoch": 0.49, - "learning_rate": 4.938298523546932e-05, - "loss": 0.3114, + "epoch": 0.5058204490575557, + "grad_norm": 0.19004757702350616, + "learning_rate": 
4.933955430929266e-05, + "loss": 0.4474, "step": 14035 }, { - "epoch": 0.49, - "learning_rate": 4.938235608331486e-05, - "loss": 0.3205, + "epoch": 0.5060006487187804, + "grad_norm": 0.23055416345596313, + "learning_rate": 4.933888782580385e-05, + "loss": 0.4602, "step": 14040 }, { - "epoch": 0.49, - "learning_rate": 4.938172661457158e-05, - "loss": 0.3337, + "epoch": 0.5061808483800051, + "grad_norm": 0.18940792977809906, + "learning_rate": 4.93382210107024e-05, + "loss": 0.4346, "step": 14045 }, { - "epoch": 0.49, - "learning_rate": 4.9381096829247644e-05, - "loss": 0.318, + "epoch": 0.5063610480412297, + "grad_norm": 0.1852693110704422, + "learning_rate": 4.9337553863997396e-05, + "loss": 0.4499, "step": 14050 }, { - "epoch": 0.49, - "learning_rate": 4.938046672735123e-05, - "loss": 0.316, + "epoch": 0.5065412477024543, + "grad_norm": 0.1449885368347168, + "learning_rate": 4.933688638569794e-05, + "loss": 0.4657, "step": 14055 }, { - "epoch": 0.49, - "learning_rate": 4.937983630889053e-05, - "loss": 0.3182, + "epoch": 0.506721447363679, + "grad_norm": 0.1560732126235962, + "learning_rate": 4.933621857581312e-05, + "loss": 0.4184, "step": 14060 }, { - "epoch": 0.49, - "learning_rate": 4.937920557387372e-05, - "loss": 0.3189, + "epoch": 0.5069016470249036, + "grad_norm": 0.1511814445257187, + "learning_rate": 4.933555043435203e-05, + "loss": 0.4409, "step": 14065 }, { - "epoch": 0.5, - "learning_rate": 4.937857452230899e-05, - "loss": 0.3122, + "epoch": 0.5070818466861282, + "grad_norm": 0.1766042411327362, + "learning_rate": 4.9334881961323776e-05, + "loss": 0.4559, "step": 14070 }, { - "epoch": 0.5, - "learning_rate": 4.937794315420454e-05, - "loss": 0.2821, + "epoch": 0.5072620463473528, + "grad_norm": 0.14052477478981018, + "learning_rate": 4.9334213156737465e-05, + "loss": 0.4476, "step": 14075 }, { - "epoch": 0.5, - "learning_rate": 4.9377311469568566e-05, - "loss": 0.3035, + "epoch": 0.5074422460085775, + "grad_norm": 0.1610243022441864, + "learning_rate": 4.933354402060221e-05, + "loss": 0.434, "step": 14080 }, { - "epoch": 0.5, - "learning_rate": 4.937667946840926e-05, - "loss": 0.3213, + "epoch": 0.5076224456698022, + "grad_norm": 0.1901833862066269, + "learning_rate": 4.9332874552927135e-05, + "loss": 0.4489, "step": 14085 }, { - "epoch": 0.5, - "learning_rate": 4.937604715073484e-05, - "loss": 0.3141, + "epoch": 0.5078026453310268, + "grad_norm": 0.16567949950695038, + "learning_rate": 4.933220475372136e-05, + "loss": 0.4473, "step": 14090 }, { - "epoch": 0.5, - "learning_rate": 4.937541451655351e-05, - "loss": 0.3298, + "epoch": 0.5079828449922514, + "grad_norm": 0.2085447609424591, + "learning_rate": 4.933153462299399e-05, + "loss": 0.4374, "step": 14095 }, { - "epoch": 0.5, - "learning_rate": 4.937478156587349e-05, - "loss": 0.3311, + "epoch": 0.5081630446534761, + "grad_norm": 0.1837865263223648, + "learning_rate": 4.933086416075418e-05, + "loss": 0.4615, "step": 14100 }, { - "epoch": 0.5, - "learning_rate": 4.9374148298702994e-05, - "loss": 0.3197, + "epoch": 0.5083432443147007, + "grad_norm": 0.1936039924621582, + "learning_rate": 4.933019336701106e-05, + "loss": 0.4533, "step": 14105 }, { - "epoch": 0.5, - "learning_rate": 4.937351471505024e-05, - "loss": 0.3159, + "epoch": 0.5085234439759253, + "grad_norm": 0.38860994577407837, + "learning_rate": 4.932952224177376e-05, + "loss": 0.4412, "step": 14110 }, { - "epoch": 0.5, - "learning_rate": 4.937288081492347e-05, - "loss": 0.3294, + "epoch": 0.5087036436371499, + "grad_norm": 0.18047145009040833, + "learning_rate": 
4.932885078505143e-05, + "loss": 0.4725, "step": 14115 }, { - "epoch": 0.5, - "learning_rate": 4.9372246598330895e-05, - "loss": 0.3011, + "epoch": 0.5088838432983745, + "grad_norm": 0.16559572517871857, + "learning_rate": 4.932817899685323e-05, + "loss": 0.452, "step": 14120 }, { - "epoch": 0.5, - "learning_rate": 4.937161206528077e-05, - "loss": 0.3129, + "epoch": 0.5090640429595993, + "grad_norm": 0.1487295925617218, + "learning_rate": 4.9327506877188284e-05, + "loss": 0.4177, "step": 14125 }, { - "epoch": 0.5, - "learning_rate": 4.9370977215781314e-05, - "loss": 0.3069, + "epoch": 0.5092442426208239, + "grad_norm": 0.14861665666103363, + "learning_rate": 4.9326834426065775e-05, + "loss": 0.4183, "step": 14130 }, { - "epoch": 0.5, - "learning_rate": 4.9370342049840786e-05, - "loss": 0.3171, + "epoch": 0.5094244422820485, + "grad_norm": 0.13988757133483887, + "learning_rate": 4.9326161643494856e-05, + "loss": 0.4216, "step": 14135 }, { - "epoch": 0.5, - "learning_rate": 4.9369706567467425e-05, - "loss": 0.3133, + "epoch": 0.5096046419432732, + "grad_norm": 0.1480373740196228, + "learning_rate": 4.93254885294847e-05, + "loss": 0.4165, "step": 14140 }, { - "epoch": 0.5, - "learning_rate": 4.9369070768669477e-05, - "loss": 0.3231, + "epoch": 0.5097848416044978, + "grad_norm": 0.1930646002292633, + "learning_rate": 4.932481508404446e-05, + "loss": 0.426, "step": 14145 }, { - "epoch": 0.5, - "learning_rate": 4.936843465345522e-05, - "loss": 0.3313, + "epoch": 0.5099650412657224, + "grad_norm": 0.18709403276443481, + "learning_rate": 4.932414130718334e-05, + "loss": 0.4608, "step": 14150 }, { - "epoch": 0.5, - "learning_rate": 4.936779822183289e-05, - "loss": 0.3145, + "epoch": 0.510145240926947, + "grad_norm": 0.17628611624240875, + "learning_rate": 4.932346719891049e-05, + "loss": 0.4455, "step": 14155 }, { - "epoch": 0.5, - "learning_rate": 4.9367161473810766e-05, - "loss": 0.3093, + "epoch": 0.5103254405881716, + "grad_norm": 0.1651298552751541, + "learning_rate": 4.9322792759235115e-05, + "loss": 0.4711, "step": 14160 }, { - "epoch": 0.5, - "learning_rate": 4.93665244093971e-05, - "loss": 0.3168, + "epoch": 0.5105056402493964, + "grad_norm": 0.16614636778831482, + "learning_rate": 4.93221179881664e-05, + "loss": 0.4232, "step": 14165 }, { - "epoch": 0.5, - "learning_rate": 4.9365887028600175e-05, - "loss": 0.3277, + "epoch": 0.510685839910621, + "grad_norm": 0.16470062732696533, + "learning_rate": 4.932144288571353e-05, + "loss": 0.4558, "step": 14170 }, { - "epoch": 0.5, - "learning_rate": 4.9365249331428265e-05, - "loss": 0.3193, + "epoch": 0.5108660395718456, + "grad_norm": 0.20364750921726227, + "learning_rate": 4.932076745188571e-05, + "loss": 0.4421, "step": 14175 }, { - "epoch": 0.5, - "learning_rate": 4.9364611317889644e-05, - "loss": 0.315, + "epoch": 0.5110462392330702, + "grad_norm": 0.19398504495620728, + "learning_rate": 4.9320091686692136e-05, + "loss": 0.4581, "step": 14180 }, { - "epoch": 0.5, - "learning_rate": 4.936397298799261e-05, - "loss": 0.3124, + "epoch": 0.5112264388942949, + "grad_norm": 0.1814093142747879, + "learning_rate": 4.931941559014204e-05, + "loss": 0.4678, "step": 14185 }, { - "epoch": 0.5, - "learning_rate": 4.9363334341745446e-05, - "loss": 0.3186, + "epoch": 0.5114066385555195, + "grad_norm": 0.15303486585617065, + "learning_rate": 4.93187391622446e-05, + "loss": 0.436, "step": 14190 }, { - "epoch": 0.5, - "learning_rate": 4.936269537915643e-05, - "loss": 0.3363, + "epoch": 0.5115868382167441, + "grad_norm": 0.1580374389886856, + "learning_rate": 
4.931806240300905e-05, + "loss": 0.4715, "step": 14195 }, { - "epoch": 0.5, - "learning_rate": 4.936205610023388e-05, - "loss": 0.3318, + "epoch": 0.5117670378779688, + "grad_norm": 0.18431012332439423, + "learning_rate": 4.931738531244461e-05, + "loss": 0.4241, "step": 14200 }, { - "epoch": 0.5, - "learning_rate": 4.936141650498608e-05, - "loss": 0.2952, + "epoch": 0.5119472375391935, + "grad_norm": 0.20473140478134155, + "learning_rate": 4.93167078905605e-05, + "loss": 0.4226, "step": 14205 }, { - "epoch": 0.5, - "learning_rate": 4.9360776593421347e-05, - "loss": 0.3331, + "epoch": 0.5121274372004181, + "grad_norm": 0.1783171445131302, + "learning_rate": 4.931603013736595e-05, + "loss": 0.4515, "step": 14210 }, { - "epoch": 0.5, - "learning_rate": 4.9360136365547985e-05, - "loss": 0.3029, + "epoch": 0.5123076368616427, + "grad_norm": 0.1747739315032959, + "learning_rate": 4.931535205287021e-05, + "loss": 0.4537, "step": 14215 }, { - "epoch": 0.5, - "learning_rate": 4.93594958213743e-05, - "loss": 0.2965, + "epoch": 0.5124878365228673, + "grad_norm": 0.17563074827194214, + "learning_rate": 4.93146736370825e-05, + "loss": 0.459, "step": 14220 }, { - "epoch": 0.5, - "learning_rate": 4.935885496090862e-05, - "loss": 0.3002, + "epoch": 0.512668036184092, + "grad_norm": 0.16567584872245789, + "learning_rate": 4.931399489001206e-05, + "loss": 0.4407, "step": 14225 }, { - "epoch": 0.5, - "learning_rate": 4.9358213784159264e-05, - "loss": 0.3356, + "epoch": 0.5128482358453166, + "grad_norm": 0.20225879549980164, + "learning_rate": 4.931331581166816e-05, + "loss": 0.5078, "step": 14230 }, { - "epoch": 0.5, - "learning_rate": 4.935757229113455e-05, - "loss": 0.3206, + "epoch": 0.5130284355065412, + "grad_norm": 0.17894190549850464, + "learning_rate": 4.931263640206003e-05, + "loss": 0.429, "step": 14235 }, { - "epoch": 0.5, - "learning_rate": 4.9356930481842814e-05, - "loss": 0.3251, + "epoch": 0.5132086351677659, + "grad_norm": 0.2424396276473999, + "learning_rate": 4.9311956661196945e-05, + "loss": 0.4652, "step": 14240 }, { - "epoch": 0.5, - "learning_rate": 4.935628835629238e-05, - "loss": 0.3013, + "epoch": 0.5133888348289906, + "grad_norm": 0.17801520228385925, + "learning_rate": 4.931127658908815e-05, + "loss": 0.4764, "step": 14245 }, { - "epoch": 0.5, - "learning_rate": 4.93556459144916e-05, - "loss": 0.2957, + "epoch": 0.5135690344902152, + "grad_norm": 0.16889409720897675, + "learning_rate": 4.931059618574292e-05, + "loss": 0.4521, "step": 14250 }, { - "epoch": 0.5, - "learning_rate": 4.935500315644881e-05, - "loss": 0.2915, + "epoch": 0.5137492341514398, + "grad_norm": 0.17756950855255127, + "learning_rate": 4.930991545117052e-05, + "loss": 0.4439, "step": 14255 }, { - "epoch": 0.5, - "learning_rate": 4.935436008217236e-05, - "loss": 0.3131, + "epoch": 0.5139294338126644, + "grad_norm": 0.13921865820884705, + "learning_rate": 4.930923438538024e-05, + "loss": 0.4176, "step": 14260 }, { - "epoch": 0.5, - "learning_rate": 4.935371669167058e-05, - "loss": 0.3186, + "epoch": 0.514109633473889, + "grad_norm": 0.21671532094478607, + "learning_rate": 4.930855298838134e-05, + "loss": 0.4944, "step": 14265 }, { - "epoch": 0.5, - "learning_rate": 4.935307298495185e-05, - "loss": 0.2998, + "epoch": 0.5142898331351137, + "grad_norm": 0.1547165960073471, + "learning_rate": 4.930787126018311e-05, + "loss": 0.4394, "step": 14270 }, { - "epoch": 0.5, - "learning_rate": 4.935242896202451e-05, - "loss": 0.3304, + "epoch": 0.5144700327963383, + "grad_norm": 0.16631481051445007, + "learning_rate": 
4.930718920079484e-05, + "loss": 0.4397, "step": 14275 }, { - "epoch": 0.5, - "learning_rate": 4.935178462289694e-05, - "loss": 0.3435, + "epoch": 0.514650232457563, + "grad_norm": 0.14847290515899658, + "learning_rate": 4.9306506810225824e-05, + "loss": 0.4688, "step": 14280 }, { - "epoch": 0.5, - "learning_rate": 4.9351139967577475e-05, - "loss": 0.2953, + "epoch": 0.5148304321187877, + "grad_norm": 0.1879221498966217, + "learning_rate": 4.930582408848536e-05, + "loss": 0.4581, "step": 14285 }, { - "epoch": 0.5, - "learning_rate": 4.935049499607452e-05, - "loss": 0.3056, + "epoch": 0.5150106317800123, + "grad_norm": 0.15472771227359772, + "learning_rate": 4.930514103558275e-05, + "loss": 0.4211, "step": 14290 }, { - "epoch": 0.5, - "learning_rate": 4.9349849708396436e-05, - "loss": 0.302, + "epoch": 0.5151908314412369, + "grad_norm": 0.22856763005256653, + "learning_rate": 4.9304457651527305e-05, + "loss": 0.4399, "step": 14295 }, { - "epoch": 0.5, - "learning_rate": 4.93492041045516e-05, - "loss": 0.3219, + "epoch": 0.5153710311024615, + "grad_norm": 0.13668227195739746, + "learning_rate": 4.930377393632832e-05, + "loss": 0.4319, "step": 14300 }, { - "epoch": 0.5, - "learning_rate": 4.934855818454839e-05, - "loss": 0.3243, + "epoch": 0.5155512307636861, + "grad_norm": 0.1626194417476654, + "learning_rate": 4.9303089889995125e-05, + "loss": 0.4161, "step": 14305 }, { - "epoch": 0.5, - "learning_rate": 4.93479119483952e-05, - "loss": 0.3482, + "epoch": 0.5157314304249108, + "grad_norm": 0.17859403789043427, + "learning_rate": 4.930240551253703e-05, + "loss": 0.4305, "step": 14310 }, { - "epoch": 0.5, - "learning_rate": 4.9347265396100414e-05, - "loss": 0.3026, + "epoch": 0.5159116300861354, + "grad_norm": 0.16735269129276276, + "learning_rate": 4.930172080396337e-05, + "loss": 0.4726, "step": 14315 }, { - "epoch": 0.5, - "learning_rate": 4.934661852767244e-05, - "loss": 0.32, + "epoch": 0.5160918297473601, + "grad_norm": 0.1954079419374466, + "learning_rate": 4.930103576428346e-05, + "loss": 0.4316, "step": 14320 }, { - "epoch": 0.5, - "learning_rate": 4.934597134311967e-05, - "loss": 0.3109, + "epoch": 0.5162720294085847, + "grad_norm": 0.17768363654613495, + "learning_rate": 4.9300350393506655e-05, + "loss": 0.4013, "step": 14325 }, { - "epoch": 0.5, - "learning_rate": 4.93453238424505e-05, - "loss": 0.3353, + "epoch": 0.5164522290698094, + "grad_norm": 0.1929953694343567, + "learning_rate": 4.929966469164228e-05, + "loss": 0.4537, "step": 14330 }, { - "epoch": 0.5, - "learning_rate": 4.934467602567335e-05, - "loss": 0.2842, + "epoch": 0.516632428731034, + "grad_norm": 0.17287170886993408, + "learning_rate": 4.9298978658699674e-05, + "loss": 0.4316, "step": 14335 }, { - "epoch": 0.5, - "learning_rate": 4.934402789279662e-05, - "loss": 0.304, + "epoch": 0.5168126283922586, + "grad_norm": 0.18152733147144318, + "learning_rate": 4.9298292294688183e-05, + "loss": 0.4502, "step": 14340 }, { - "epoch": 0.5, - "learning_rate": 4.934337944382874e-05, - "loss": 0.3147, + "epoch": 0.5169928280534832, + "grad_norm": 0.19768409430980682, + "learning_rate": 4.929760559961717e-05, + "loss": 0.4375, "step": 14345 }, { - "epoch": 0.5, - "learning_rate": 4.934273067877811e-05, - "loss": 0.2964, + "epoch": 0.5171730277147079, + "grad_norm": 0.18449656665325165, + "learning_rate": 4.9296918573495984e-05, + "loss": 0.4628, "step": 14350 }, { - "epoch": 0.51, - "learning_rate": 4.934208159765317e-05, - "loss": 0.3063, + "epoch": 0.5173532273759325, + "grad_norm": 0.13669456541538239, + "learning_rate": 
4.9296231216333986e-05, + "loss": 0.4446, "step": 14355 }, { - "epoch": 0.51, - "learning_rate": 4.9341432200462343e-05, - "loss": 0.3435, + "epoch": 0.5175334270371572, + "grad_norm": 0.17907440662384033, + "learning_rate": 4.929554352814055e-05, + "loss": 0.4351, "step": 14360 }, { - "epoch": 0.51, - "learning_rate": 4.934078248721407e-05, - "loss": 0.306, + "epoch": 0.5177136266983818, + "grad_norm": 0.2036353498697281, + "learning_rate": 4.9294855508925026e-05, + "loss": 0.4293, "step": 14365 }, { - "epoch": 0.51, - "learning_rate": 4.934013245791676e-05, - "loss": 0.3341, + "epoch": 0.5178938263596065, + "grad_norm": 0.14478379487991333, + "learning_rate": 4.92941671586968e-05, + "loss": 0.4088, "step": 14370 }, { - "epoch": 0.51, - "learning_rate": 4.933948211257888e-05, - "loss": 0.3356, + "epoch": 0.5180740260208311, + "grad_norm": 0.19350174069404602, + "learning_rate": 4.9293478477465254e-05, + "loss": 0.4735, "step": 14375 }, { - "epoch": 0.51, - "learning_rate": 4.933883145120886e-05, - "loss": 0.3242, + "epoch": 0.5182542256820557, + "grad_norm": 0.15223993360996246, + "learning_rate": 4.9292789465239765e-05, + "loss": 0.42, "step": 14380 }, { - "epoch": 0.51, - "learning_rate": 4.933818047381516e-05, - "loss": 0.3052, + "epoch": 0.5184344253432803, + "grad_norm": 0.17773163318634033, + "learning_rate": 4.929210012202973e-05, + "loss": 0.4626, "step": 14385 }, { - "epoch": 0.51, - "learning_rate": 4.933752918040622e-05, - "loss": 0.3118, + "epoch": 0.518614625004505, + "grad_norm": 0.19823719561100006, + "learning_rate": 4.929141044784452e-05, + "loss": 0.4291, "step": 14390 }, { - "epoch": 0.51, - "learning_rate": 4.933687757099051e-05, - "loss": 0.3243, + "epoch": 0.5187948246657297, + "grad_norm": 0.163058340549469, + "learning_rate": 4.929072044269356e-05, + "loss": 0.4824, "step": 14395 }, { - "epoch": 0.51, - "learning_rate": 4.933622564557648e-05, - "loss": 0.2979, + "epoch": 0.5189750243269543, + "grad_norm": 0.17107297480106354, + "learning_rate": 4.929003010658623e-05, + "loss": 0.4953, "step": 14400 }, { - "epoch": 0.51, - "learning_rate": 4.9335573404172594e-05, - "loss": 0.3052, + "epoch": 0.5191552239881789, + "grad_norm": 0.17323638498783112, + "learning_rate": 4.928933943953193e-05, + "loss": 0.4444, "step": 14405 }, { - "epoch": 0.51, - "learning_rate": 4.9334920846787324e-05, - "loss": 0.3168, + "epoch": 0.5193354236494035, + "grad_norm": 0.19337867200374603, + "learning_rate": 4.92886484415401e-05, + "loss": 0.4295, "step": 14410 }, { - "epoch": 0.51, - "learning_rate": 4.933426797342915e-05, - "loss": 0.3032, + "epoch": 0.5195156233106282, + "grad_norm": 0.250715434551239, + "learning_rate": 4.9287957112620134e-05, + "loss": 0.5048, "step": 14415 }, { - "epoch": 0.51, - "learning_rate": 4.933361478410654e-05, - "loss": 0.319, + "epoch": 0.5196958229718528, + "grad_norm": 0.19110535085201263, + "learning_rate": 4.928726545278145e-05, + "loss": 0.4508, "step": 14420 }, { - "epoch": 0.51, - "learning_rate": 4.933296127882798e-05, - "loss": 0.3071, + "epoch": 0.5198760226330774, + "grad_norm": 0.2401556819677353, + "learning_rate": 4.9286573462033484e-05, + "loss": 0.52, "step": 14425 }, { - "epoch": 0.51, - "learning_rate": 4.9332307457601955e-05, - "loss": 0.2847, + "epoch": 0.520056222294302, + "grad_norm": 0.145725280046463, + "learning_rate": 4.9285881140385645e-05, + "loss": 0.4753, "step": 14430 }, { - "epoch": 0.51, - "learning_rate": 4.9331653320436954e-05, - "loss": 0.2941, + "epoch": 0.5202364219555268, + "grad_norm": 0.17486672103405, + 
"learning_rate": 4.928518848784739e-05, + "loss": 0.4178, "step": 14435 }, { - "epoch": 0.51, - "learning_rate": 4.933099886734147e-05, - "loss": 0.2967, + "epoch": 0.5204166216167514, + "grad_norm": 0.1457482874393463, + "learning_rate": 4.928449550442814e-05, + "loss": 0.4379, "step": 14440 }, { - "epoch": 0.51, - "learning_rate": 4.9330344098324e-05, - "loss": 0.304, + "epoch": 0.520596821277976, + "grad_norm": 0.21257434785366058, + "learning_rate": 4.928380219013734e-05, + "loss": 0.465, "step": 14445 }, { - "epoch": 0.51, - "learning_rate": 4.932968901339304e-05, - "loss": 0.3298, + "epoch": 0.5207770209392006, + "grad_norm": 0.17879636585712433, + "learning_rate": 4.928310854498444e-05, + "loss": 0.4134, "step": 14450 }, { - "epoch": 0.51, - "learning_rate": 4.932903361255711e-05, - "loss": 0.2917, + "epoch": 0.5209572206004253, + "grad_norm": 0.20684200525283813, + "learning_rate": 4.928241456897887e-05, + "loss": 0.4, "step": 14455 }, { - "epoch": 0.51, - "learning_rate": 4.9328377895824705e-05, - "loss": 0.2729, + "epoch": 0.5211374202616499, + "grad_norm": 0.15960803627967834, + "learning_rate": 4.928172026213012e-05, + "loss": 0.4198, "step": 14460 }, { - "epoch": 0.51, - "learning_rate": 4.9327721863204354e-05, - "loss": 0.316, + "epoch": 0.5213176199228745, + "grad_norm": 0.1935255080461502, + "learning_rate": 4.928102562444763e-05, + "loss": 0.4503, "step": 14465 }, { - "epoch": 0.51, - "learning_rate": 4.932706551470456e-05, - "loss": 0.2947, + "epoch": 0.5214978195840991, + "grad_norm": 0.18149693310260773, + "learning_rate": 4.928033065594086e-05, + "loss": 0.4337, "step": 14470 }, { - "epoch": 0.51, - "learning_rate": 4.932640885033386e-05, - "loss": 0.3385, + "epoch": 0.5216780192453239, + "grad_norm": 0.21554191410541534, + "learning_rate": 4.927963535661929e-05, + "loss": 0.3948, "step": 14475 }, { - "epoch": 0.51, - "learning_rate": 4.9325751870100765e-05, - "loss": 0.2871, + "epoch": 0.5218582189065485, + "grad_norm": 0.14424189925193787, + "learning_rate": 4.927893972649239e-05, + "loss": 0.4235, "step": 14480 }, { - "epoch": 0.51, - "learning_rate": 4.932509457401382e-05, - "loss": 0.3188, + "epoch": 0.5220384185677731, + "grad_norm": 0.1991516500711441, + "learning_rate": 4.927824376556964e-05, + "loss": 0.4594, "step": 14485 }, { - "epoch": 0.51, - "learning_rate": 4.9324436962081546e-05, - "loss": 0.302, + "epoch": 0.5222186182289977, + "grad_norm": 0.12362515181303024, + "learning_rate": 4.927754747386051e-05, + "loss": 0.4268, "step": 14490 }, { - "epoch": 0.51, - "learning_rate": 4.932377903431249e-05, - "loss": 0.3015, + "epoch": 0.5223988178902224, + "grad_norm": 0.219016894698143, + "learning_rate": 4.92768508513745e-05, + "loss": 0.4574, "step": 14495 }, { - "epoch": 0.51, - "learning_rate": 4.9323120790715194e-05, - "loss": 0.3215, + "epoch": 0.522579017551447, + "grad_norm": 0.1444326788187027, + "learning_rate": 4.92761538981211e-05, + "loss": 0.4237, "step": 14500 }, { - "epoch": 0.51, - "eval_loss": 0.3104207515716553, - "eval_runtime": 10.5492, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 0.522579017551447, + "eval_loss": 0.47057196497917175, + "eval_runtime": 3.5636, + "eval_samples_per_second": 28.062, + "eval_steps_per_second": 7.015, "step": 14500 }, { - "epoch": 0.51, - "learning_rate": 4.9322462231298206e-05, - "loss": 0.3215, + "epoch": 0.5227592172126716, + "grad_norm": 0.23088853061199188, + "learning_rate": 4.92754566141098e-05, + "loss": 0.4652, "step": 14505 }, { - "epoch": 0.51, - "learning_rate": 
4.932180335607007e-05, - "loss": 0.3366, + "epoch": 0.5229394168738962, + "grad_norm": 0.20848578214645386, + "learning_rate": 4.9274758999350115e-05, + "loss": 0.4217, "step": 14510 }, { - "epoch": 0.51, - "learning_rate": 4.932114416503936e-05, - "loss": 0.3001, + "epoch": 0.523119616535121, + "grad_norm": 0.1422787457704544, + "learning_rate": 4.9274061053851525e-05, + "loss": 0.438, "step": 14515 }, { - "epoch": 0.51, - "learning_rate": 4.9320484658214605e-05, - "loss": 0.2973, + "epoch": 0.5232998161963456, + "grad_norm": 0.21294035017490387, + "learning_rate": 4.9273362777623555e-05, + "loss": 0.4632, "step": 14520 }, { - "epoch": 0.51, - "learning_rate": 4.9319824835604386e-05, - "loss": 0.3296, + "epoch": 0.5234800158575702, + "grad_norm": 0.1711554378271103, + "learning_rate": 4.927266417067572e-05, + "loss": 0.469, "step": 14525 }, { - "epoch": 0.51, - "learning_rate": 4.9319164697217276e-05, - "loss": 0.2947, + "epoch": 0.5236602155187948, + "grad_norm": 0.17742256820201874, + "learning_rate": 4.9271965233017527e-05, + "loss": 0.4506, "step": 14530 }, { - "epoch": 0.51, - "learning_rate": 4.931850424306183e-05, - "loss": 0.327, + "epoch": 0.5238404151800194, + "grad_norm": 0.17392541468143463, + "learning_rate": 4.9271265964658517e-05, + "loss": 0.4368, "step": 14535 }, { - "epoch": 0.51, - "learning_rate": 4.931784347314665e-05, - "loss": 0.3126, + "epoch": 0.5240206148412441, + "grad_norm": 0.15873579680919647, + "learning_rate": 4.92705663656082e-05, + "loss": 0.4176, "step": 14540 }, { - "epoch": 0.51, - "learning_rate": 4.931718238748029e-05, - "loss": 0.311, + "epoch": 0.5242008145024687, + "grad_norm": 0.19420740008354187, + "learning_rate": 4.926986643587612e-05, + "loss": 0.4758, "step": 14545 }, { - "epoch": 0.51, - "learning_rate": 4.9316520986071335e-05, - "loss": 0.3362, + "epoch": 0.5243810141636934, + "grad_norm": 0.16508692502975464, + "learning_rate": 4.9269166175471806e-05, + "loss": 0.4577, "step": 14550 }, { - "epoch": 0.51, - "learning_rate": 4.931585926892839e-05, - "loss": 0.3107, + "epoch": 0.524561213824918, + "grad_norm": 0.1393464207649231, + "learning_rate": 4.92684655844048e-05, + "loss": 0.4444, "step": 14555 }, { - "epoch": 0.51, - "learning_rate": 4.931519723606003e-05, - "loss": 0.3039, + "epoch": 0.5247414134861427, + "grad_norm": 0.16871263086795807, + "learning_rate": 4.9267764662684654e-05, + "loss": 0.4719, "step": 14560 }, { - "epoch": 0.51, - "learning_rate": 4.9314534887474864e-05, - "loss": 0.3238, + "epoch": 0.5249216131473673, + "grad_norm": 0.16914774477481842, + "learning_rate": 4.9267063410320907e-05, + "loss": 0.4371, "step": 14565 }, { - "epoch": 0.51, - "learning_rate": 4.9313872223181485e-05, - "loss": 0.3258, + "epoch": 0.5251018128085919, + "grad_norm": 0.15868036448955536, + "learning_rate": 4.926636182732313e-05, + "loss": 0.4464, "step": 14570 }, { - "epoch": 0.51, - "learning_rate": 4.93132092431885e-05, - "loss": 0.3171, + "epoch": 0.5252820124698165, + "grad_norm": 0.20540837943553925, + "learning_rate": 4.926565991370086e-05, + "loss": 0.4362, "step": 14575 }, { - "epoch": 0.51, - "learning_rate": 4.931254594750451e-05, - "loss": 0.2988, + "epoch": 0.5254622121310412, + "grad_norm": 0.20324106514453888, + "learning_rate": 4.926495766946368e-05, + "loss": 0.4428, "step": 14580 }, { - "epoch": 0.51, - "learning_rate": 4.9311882336138136e-05, - "loss": 0.3502, + "epoch": 0.5256424117922658, + "grad_norm": 0.190400630235672, + "learning_rate": 4.9264255094621135e-05, + "loss": 0.4453, "step": 14585 }, { - "epoch": 0.51, - 
"learning_rate": 4.931121840909799e-05, - "loss": 0.3244, + "epoch": 0.5258226114534905, + "grad_norm": 0.19358183443546295, + "learning_rate": 4.9263552189182826e-05, + "loss": 0.454, "step": 14590 }, { - "epoch": 0.51, - "learning_rate": 4.93105541663927e-05, - "loss": 0.3312, + "epoch": 0.5260028111147151, + "grad_norm": 0.18700343370437622, + "learning_rate": 4.926284895315831e-05, + "loss": 0.486, "step": 14595 }, { - "epoch": 0.51, - "learning_rate": 4.930988960803088e-05, - "loss": 0.2911, + "epoch": 0.5261830107759398, + "grad_norm": 0.1986667960882187, + "learning_rate": 4.926214538655718e-05, + "loss": 0.4768, "step": 14600 }, { - "epoch": 0.51, - "learning_rate": 4.930922473402117e-05, - "loss": 0.3002, + "epoch": 0.5263632104371644, + "grad_norm": 0.16327090561389923, + "learning_rate": 4.926144148938901e-05, + "loss": 0.4532, "step": 14605 }, { - "epoch": 0.51, - "learning_rate": 4.930855954437219e-05, - "loss": 0.2905, + "epoch": 0.526543410098389, + "grad_norm": 0.18385253846645355, + "learning_rate": 4.92607372616634e-05, + "loss": 0.4345, "step": 14610 }, { - "epoch": 0.51, - "learning_rate": 4.9307894039092596e-05, - "loss": 0.3078, + "epoch": 0.5267236097596136, + "grad_norm": 0.16108223795890808, + "learning_rate": 4.9260032703389936e-05, + "loss": 0.4524, "step": 14615 }, { - "epoch": 0.51, - "learning_rate": 4.9307228218191e-05, - "loss": 0.3168, + "epoch": 0.5269038094208383, + "grad_norm": 0.22687973082065582, + "learning_rate": 4.9259327814578234e-05, + "loss": 0.4706, "step": 14620 }, { - "epoch": 0.51, - "learning_rate": 4.9306562081676076e-05, - "loss": 0.3214, + "epoch": 0.5270840090820629, + "grad_norm": 0.20703905820846558, + "learning_rate": 4.925862259523788e-05, + "loss": 0.4411, "step": 14625 }, { - "epoch": 0.51, - "learning_rate": 4.9305895629556464e-05, - "loss": 0.3257, + "epoch": 0.5272642087432876, + "grad_norm": 0.17245493829250336, + "learning_rate": 4.925791704537849e-05, + "loss": 0.4483, "step": 14630 }, { - "epoch": 0.51, - "learning_rate": 4.930522886184081e-05, - "loss": 0.2989, + "epoch": 0.5274444084045122, + "grad_norm": 0.21393609046936035, + "learning_rate": 4.925721116500968e-05, + "loss": 0.4409, "step": 14635 }, { - "epoch": 0.52, - "learning_rate": 4.9304561778537786e-05, - "loss": 0.3381, + "epoch": 0.5276246080657369, + "grad_norm": 0.16517458856105804, + "learning_rate": 4.9256504954141066e-05, + "loss": 0.4424, "step": 14640 }, { - "epoch": 0.52, - "learning_rate": 4.9303894379656035e-05, - "loss": 0.3171, + "epoch": 0.5278048077269615, + "grad_norm": 0.1912750005722046, + "learning_rate": 4.925579841278226e-05, + "loss": 0.4668, "step": 14645 }, { - "epoch": 0.52, - "learning_rate": 4.930322666520423e-05, - "loss": 0.3156, + "epoch": 0.5279850073881861, + "grad_norm": 0.19278542697429657, + "learning_rate": 4.9255091540942905e-05, + "loss": 0.4311, "step": 14650 }, { - "epoch": 0.52, - "learning_rate": 4.9302558635191054e-05, - "loss": 0.3314, + "epoch": 0.5281652070494107, + "grad_norm": 0.14870208501815796, + "learning_rate": 4.925438433863262e-05, + "loss": 0.4558, "step": 14655 }, { - "epoch": 0.52, - "learning_rate": 4.9301890289625165e-05, - "loss": 0.316, + "epoch": 0.5283454067106353, + "grad_norm": 0.15407264232635498, + "learning_rate": 4.925367680586104e-05, + "loss": 0.4101, "step": 14660 }, { - "epoch": 0.52, - "learning_rate": 4.930122162851524e-05, - "loss": 0.3382, + "epoch": 0.52852560637186, + "grad_norm": 0.183024600148201, + "learning_rate": 4.925296894263782e-05, + "loss": 0.4485, "step": 14665 }, { - 
"epoch": 0.52, - "learning_rate": 4.930055265186997e-05, - "loss": 0.2921, + "epoch": 0.5287058060330847, + "grad_norm": 0.15193411707878113, + "learning_rate": 4.925226074897259e-05, + "loss": 0.4471, "step": 14670 }, { - "epoch": 0.52, - "learning_rate": 4.929988335969805e-05, - "loss": 0.3044, + "epoch": 0.5288860056943093, + "grad_norm": 0.18354560434818268, + "learning_rate": 4.9251552224875e-05, + "loss": 0.4246, "step": 14675 }, { - "epoch": 0.52, - "learning_rate": 4.929921375200814e-05, - "loss": 0.2977, + "epoch": 0.529066205355534, + "grad_norm": 0.16226573288440704, + "learning_rate": 4.9250843370354704e-05, + "loss": 0.4549, "step": 14680 }, { - "epoch": 0.52, - "learning_rate": 4.9298543828808966e-05, - "loss": 0.3022, + "epoch": 0.5292464050167586, + "grad_norm": 0.18447591364383698, + "learning_rate": 4.925013418542136e-05, + "loss": 0.4531, "step": 14685 }, { - "epoch": 0.52, - "learning_rate": 4.9297873590109213e-05, - "loss": 0.3132, + "epoch": 0.5294266046779832, + "grad_norm": 0.12299656867980957, + "learning_rate": 4.9249424670084636e-05, + "loss": 0.4247, "step": 14690 }, { - "epoch": 0.52, - "learning_rate": 4.9297203035917586e-05, - "loss": 0.3095, + "epoch": 0.5296068043392078, + "grad_norm": 0.19318439066410065, + "learning_rate": 4.92487148243542e-05, + "loss": 0.4277, "step": 14695 }, { - "epoch": 0.52, - "learning_rate": 4.9296532166242785e-05, - "loss": 0.3058, + "epoch": 0.5297870040004324, + "grad_norm": 0.22538743913173676, + "learning_rate": 4.924800464823971e-05, + "loss": 0.4561, "step": 14700 }, { - "epoch": 0.52, - "learning_rate": 4.929586098109352e-05, - "loss": 0.2842, + "epoch": 0.5299672036616572, + "grad_norm": 0.15232376754283905, + "learning_rate": 4.9247294141750864e-05, + "loss": 0.4437, "step": 14705 }, { - "epoch": 0.52, - "learning_rate": 4.929518948047852e-05, - "loss": 0.2967, + "epoch": 0.5301474033228818, + "grad_norm": 0.2040311098098755, + "learning_rate": 4.924658330489732e-05, + "loss": 0.451, "step": 14710 }, { - "epoch": 0.52, - "learning_rate": 4.92945176644065e-05, - "loss": 0.3238, + "epoch": 0.5303276029841064, + "grad_norm": 0.1730954647064209, + "learning_rate": 4.9245872137688776e-05, + "loss": 0.4292, "step": 14715 }, { - "epoch": 0.52, - "learning_rate": 4.9293845532886176e-05, - "loss": 0.3106, + "epoch": 0.530507802645331, + "grad_norm": 0.16043753921985626, + "learning_rate": 4.9245160640134916e-05, + "loss": 0.4814, "step": 14720 }, { - "epoch": 0.52, - "learning_rate": 4.929317308592627e-05, - "loss": 0.3143, + "epoch": 0.5306880023065557, + "grad_norm": 0.17977766692638397, + "learning_rate": 4.924444881224544e-05, + "loss": 0.4409, "step": 14725 }, { - "epoch": 0.52, - "learning_rate": 4.9292500323535526e-05, - "loss": 0.3155, + "epoch": 0.5308682019677803, + "grad_norm": 0.21359415352344513, + "learning_rate": 4.924373665403004e-05, + "loss": 0.4522, "step": 14730 }, { - "epoch": 0.52, - "learning_rate": 4.9291827245722674e-05, - "loss": 0.2988, + "epoch": 0.5310484016290049, + "grad_norm": 0.1500881314277649, + "learning_rate": 4.924302416549842e-05, + "loss": 0.4555, "step": 14735 }, { - "epoch": 0.52, - "learning_rate": 4.929115385249645e-05, - "loss": 0.3148, + "epoch": 0.5312286012902295, + "grad_norm": 0.18042995035648346, + "learning_rate": 4.92423113466603e-05, + "loss": 0.467, "step": 14740 }, { - "epoch": 0.52, - "learning_rate": 4.9290480143865616e-05, - "loss": 0.3172, + "epoch": 0.5314088009514543, + "grad_norm": 0.14702163636684418, + "learning_rate": 4.9241598197525374e-05, + "loss": 0.465, 
"step": 14745 }, { - "epoch": 0.52, - "learning_rate": 4.928980611983889e-05, - "loss": 0.3113, + "epoch": 0.5315890006126789, + "grad_norm": 0.14355307817459106, + "learning_rate": 4.9240884718103366e-05, + "loss": 0.4373, "step": 14750 }, { - "epoch": 0.52, - "learning_rate": 4.928913178042505e-05, - "loss": 0.3493, + "epoch": 0.5317692002739035, + "grad_norm": 0.17057915031909943, + "learning_rate": 4.9240170908403996e-05, + "loss": 0.4753, "step": 14755 }, { - "epoch": 0.52, - "learning_rate": 4.928845712563284e-05, - "loss": 0.3253, + "epoch": 0.5319493999351281, + "grad_norm": 0.16943103075027466, + "learning_rate": 4.9239456768436985e-05, + "loss": 0.442, "step": 14760 }, { - "epoch": 0.52, - "learning_rate": 4.928778215547101e-05, - "loss": 0.3245, + "epoch": 0.5321295995963528, + "grad_norm": 0.21389932930469513, + "learning_rate": 4.923874229821208e-05, + "loss": 0.4608, "step": 14765 }, { - "epoch": 0.52, - "learning_rate": 4.928710686994835e-05, - "loss": 0.3032, + "epoch": 0.5323097992575774, + "grad_norm": 0.17261211574077606, + "learning_rate": 4.9238027497738995e-05, + "loss": 0.4512, "step": 14770 }, { - "epoch": 0.52, - "learning_rate": 4.9286431269073596e-05, - "loss": 0.2691, + "epoch": 0.532489998918802, + "grad_norm": 0.1695832759141922, + "learning_rate": 4.9237312367027484e-05, + "loss": 0.4398, "step": 14775 }, { - "epoch": 0.52, - "learning_rate": 4.928575535285555e-05, - "loss": 0.2994, + "epoch": 0.5326701985800266, + "grad_norm": 0.17145270109176636, + "learning_rate": 4.923659690608728e-05, + "loss": 0.4597, "step": 14780 }, { - "epoch": 0.52, - "learning_rate": 4.9285079121302974e-05, - "loss": 0.3101, + "epoch": 0.5328503982412514, + "grad_norm": 0.193446546792984, + "learning_rate": 4.923588111492814e-05, + "loss": 0.4286, "step": 14785 }, { - "epoch": 0.52, - "learning_rate": 4.928440257442465e-05, - "loss": 0.3046, + "epoch": 0.533030597902476, + "grad_norm": 0.19881367683410645, + "learning_rate": 4.923516499355981e-05, + "loss": 0.4713, "step": 14790 }, { - "epoch": 0.52, - "learning_rate": 4.928372571222935e-05, - "loss": 0.3185, + "epoch": 0.5332107975637006, + "grad_norm": 0.16294337809085846, + "learning_rate": 4.9234448541992045e-05, + "loss": 0.3998, "step": 14795 }, { - "epoch": 0.52, - "learning_rate": 4.9283048534725896e-05, - "loss": 0.3414, + "epoch": 0.5333909972249252, + "grad_norm": 0.1420733481645584, + "learning_rate": 4.9233731760234616e-05, + "loss": 0.4301, "step": 14800 }, { - "epoch": 0.52, - "learning_rate": 4.928237104192305e-05, - "loss": 0.3148, + "epoch": 0.5335711968861498, + "grad_norm": 0.17047178745269775, + "learning_rate": 4.923301464829728e-05, + "loss": 0.4181, "step": 14805 }, { - "epoch": 0.52, - "learning_rate": 4.928169323382962e-05, - "loss": 0.3139, + "epoch": 0.5337513965473745, + "grad_norm": 0.13956919312477112, + "learning_rate": 4.923229720618981e-05, + "loss": 0.455, "step": 14810 }, { - "epoch": 0.52, - "learning_rate": 4.92810151104544e-05, - "loss": 0.3235, + "epoch": 0.5339315962085991, + "grad_norm": 0.17539288103580475, + "learning_rate": 4.923157943392199e-05, + "loss": 0.4681, "step": 14815 }, { - "epoch": 0.52, - "learning_rate": 4.9280336671806206e-05, - "loss": 0.3351, + "epoch": 0.5341117958698237, + "grad_norm": 0.19295528531074524, + "learning_rate": 4.9230861331503586e-05, + "loss": 0.4691, "step": 14820 }, { - "epoch": 0.52, - "learning_rate": 4.927965791789384e-05, - "loss": 0.3248, + "epoch": 0.5342919955310484, + "grad_norm": 0.15335458517074585, + "learning_rate": 4.923014289894439e-05, + 
"loss": 0.4094, "step": 14825 }, { - "epoch": 0.52, - "learning_rate": 4.927897884872612e-05, - "loss": 0.3035, + "epoch": 0.5344721951922731, + "grad_norm": 0.17784667015075684, + "learning_rate": 4.922942413625418e-05, + "loss": 0.4263, "step": 14830 }, { - "epoch": 0.52, - "learning_rate": 4.927829946431186e-05, - "loss": 0.2935, + "epoch": 0.5346523948534977, + "grad_norm": 0.14942409098148346, + "learning_rate": 4.922870504344276e-05, + "loss": 0.432, "step": 14835 }, { - "epoch": 0.52, - "learning_rate": 4.927761976465988e-05, - "loss": 0.3099, + "epoch": 0.5348325945147223, + "grad_norm": 0.1628066897392273, + "learning_rate": 4.9227985620519934e-05, + "loss": 0.4321, "step": 14840 }, { - "epoch": 0.52, - "learning_rate": 4.9276939749779005e-05, - "loss": 0.3211, + "epoch": 0.5350127941759469, + "grad_norm": 0.17270931601524353, + "learning_rate": 4.922726586749549e-05, + "loss": 0.4449, "step": 14845 }, { - "epoch": 0.52, - "learning_rate": 4.927625941967807e-05, - "loss": 0.3079, + "epoch": 0.5351929938371716, + "grad_norm": 0.1773119568824768, + "learning_rate": 4.922654578437923e-05, + "loss": 0.4791, "step": 14850 }, { - "epoch": 0.52, - "learning_rate": 4.92755787743659e-05, - "loss": 0.3197, + "epoch": 0.5353731934983962, + "grad_norm": 0.1501081883907318, + "learning_rate": 4.9225825371180985e-05, + "loss": 0.4242, "step": 14855 }, { - "epoch": 0.52, - "learning_rate": 4.927489781385134e-05, - "loss": 0.3212, + "epoch": 0.5355533931596208, + "grad_norm": 0.16684913635253906, + "learning_rate": 4.9225104627910553e-05, + "loss": 0.4395, "step": 14860 }, { - "epoch": 0.52, - "learning_rate": 4.927421653814323e-05, - "loss": 0.3044, + "epoch": 0.5357335928208455, + "grad_norm": 0.2014274299144745, + "learning_rate": 4.922438355457777e-05, + "loss": 0.4403, "step": 14865 }, { - "epoch": 0.52, - "learning_rate": 4.927353494725041e-05, - "loss": 0.3069, + "epoch": 0.5359137924820702, + "grad_norm": 0.14999887347221375, + "learning_rate": 4.922366215119244e-05, + "loss": 0.4354, "step": 14870 }, { - "epoch": 0.52, - "learning_rate": 4.9272853041181744e-05, - "loss": 0.3359, + "epoch": 0.5360939921432948, + "grad_norm": 0.17910043895244598, + "learning_rate": 4.922294041776441e-05, + "loss": 0.4549, "step": 14875 }, { - "epoch": 0.52, - "learning_rate": 4.927217081994607e-05, - "loss": 0.3104, + "epoch": 0.5362741918045194, + "grad_norm": 0.1802271157503128, + "learning_rate": 4.922221835430351e-05, + "loss": 0.4787, "step": 14880 }, { - "epoch": 0.52, - "learning_rate": 4.9271488283552266e-05, - "loss": 0.3353, + "epoch": 0.536454391465744, + "grad_norm": 0.15048733353614807, + "learning_rate": 4.922149596081956e-05, + "loss": 0.4334, "step": 14885 }, { - "epoch": 0.52, - "learning_rate": 4.927080543200917e-05, - "loss": 0.3059, + "epoch": 0.5366345911269687, + "grad_norm": 0.17902320623397827, + "learning_rate": 4.9220773237322424e-05, + "loss": 0.4446, "step": 14890 }, { - "epoch": 0.52, - "learning_rate": 4.9270122265325666e-05, - "loss": 0.3228, + "epoch": 0.5368147907881933, + "grad_norm": 0.2159227430820465, + "learning_rate": 4.922005018382195e-05, + "loss": 0.398, "step": 14895 }, { - "epoch": 0.52, - "learning_rate": 4.926943878351062e-05, - "loss": 0.3114, + "epoch": 0.536994990449418, + "grad_norm": 0.17634055018424988, + "learning_rate": 4.9219326800327967e-05, + "loss": 0.4409, "step": 14900 }, { - "epoch": 0.52, - "learning_rate": 4.926875498657291e-05, - "loss": 0.3156, + "epoch": 0.5371751901106426, + "grad_norm": 0.15450234711170197, + "learning_rate": 
4.921860308685036e-05, + "loss": 0.482, "step": 14905 }, { - "epoch": 0.52, - "learning_rate": 4.92680708745214e-05, - "loss": 0.324, + "epoch": 0.5373553897718673, + "grad_norm": 0.18871042132377625, + "learning_rate": 4.921787904339897e-05, + "loss": 0.4748, "step": 14910 }, { - "epoch": 0.52, - "learning_rate": 4.926738644736498e-05, - "loss": 0.3226, + "epoch": 0.5375355894330919, + "grad_norm": 0.15520833432674408, + "learning_rate": 4.921715466998366e-05, + "loss": 0.4209, "step": 14915 }, { - "epoch": 0.52, - "learning_rate": 4.926670170511255e-05, - "loss": 0.3128, + "epoch": 0.5377157890943165, + "grad_norm": 0.17904292047023773, + "learning_rate": 4.921642996661431e-05, + "loss": 0.4665, "step": 14920 }, { - "epoch": 0.53, - "learning_rate": 4.9266016647772996e-05, - "loss": 0.3139, + "epoch": 0.5378959887555411, + "grad_norm": 0.23425260186195374, + "learning_rate": 4.9215704933300795e-05, + "loss": 0.4156, "step": 14925 }, { - "epoch": 0.53, - "learning_rate": 4.92653312753552e-05, - "loss": 0.2981, + "epoch": 0.5380761884167657, + "grad_norm": 0.18364645540714264, + "learning_rate": 4.921497957005299e-05, + "loss": 0.4648, "step": 14930 }, { - "epoch": 0.53, - "learning_rate": 4.9264645587868064e-05, - "loss": 0.3413, + "epoch": 0.5382563880779904, + "grad_norm": 0.13957159221172333, + "learning_rate": 4.921425387688077e-05, + "loss": 0.4577, "step": 14935 }, { - "epoch": 0.53, - "learning_rate": 4.92639595853205e-05, - "loss": 0.3121, + "epoch": 0.5384365877392151, + "grad_norm": 0.1416681855916977, + "learning_rate": 4.9213527853794025e-05, + "loss": 0.4146, "step": 14940 }, { - "epoch": 0.53, - "learning_rate": 4.926327326772141e-05, - "loss": 0.3223, + "epoch": 0.5386167874004397, + "grad_norm": 0.18616509437561035, + "learning_rate": 4.921280150080266e-05, + "loss": 0.4689, "step": 14945 }, { - "epoch": 0.53, - "learning_rate": 4.926258663507971e-05, - "loss": 0.3568, + "epoch": 0.5387969870616643, + "grad_norm": 0.2297711819410324, + "learning_rate": 4.9212074817916554e-05, + "loss": 0.4326, "step": 14950 }, { - "epoch": 0.53, - "learning_rate": 4.926189968740431e-05, - "loss": 0.3296, + "epoch": 0.538977186722889, + "grad_norm": 0.17574380338191986, + "learning_rate": 4.9211347805145626e-05, + "loss": 0.4521, "step": 14955 }, { - "epoch": 0.53, - "learning_rate": 4.926121242470413e-05, - "loss": 0.3021, + "epoch": 0.5391573863841136, + "grad_norm": 0.15670587122440338, + "learning_rate": 4.921062046249976e-05, + "loss": 0.4515, "step": 14960 }, { - "epoch": 0.53, - "learning_rate": 4.92605248469881e-05, - "loss": 0.3203, + "epoch": 0.5393375860453382, + "grad_norm": 0.17424379289150238, + "learning_rate": 4.9209892789988886e-05, + "loss": 0.4088, "step": 14965 }, { - "epoch": 0.53, - "learning_rate": 4.925983695426514e-05, - "loss": 0.3302, + "epoch": 0.5395177857065628, + "grad_norm": 0.16352404654026031, + "learning_rate": 4.920916478762291e-05, + "loss": 0.4369, "step": 14970 }, { - "epoch": 0.53, - "learning_rate": 4.925914874654418e-05, - "loss": 0.3069, + "epoch": 0.5396979853677875, + "grad_norm": 0.1390729397535324, + "learning_rate": 4.920843645541174e-05, + "loss": 0.442, "step": 14975 }, { - "epoch": 0.53, - "learning_rate": 4.925846022383417e-05, - "loss": 0.2889, + "epoch": 0.5398781850290122, + "grad_norm": 0.20522846281528473, + "learning_rate": 4.9207707793365325e-05, + "loss": 0.4439, "step": 14980 }, { - "epoch": 0.53, - "learning_rate": 4.9257771386144033e-05, - "loss": 0.3106, + "epoch": 0.5400583846902368, + "grad_norm": 0.18843060731887817, + 
"learning_rate": 4.9206978801493574e-05, + "loss": 0.4427, "step": 14985 }, { - "epoch": 0.53, - "learning_rate": 4.925708223348273e-05, - "loss": 0.2993, + "epoch": 0.5402385843514614, + "grad_norm": 0.1719266027212143, + "learning_rate": 4.920624947980642e-05, + "loss": 0.4301, "step": 14990 }, { - "epoch": 0.53, - "learning_rate": 4.9256392765859194e-05, - "loss": 0.3323, + "epoch": 0.5404187840126861, + "grad_norm": 0.21065543591976166, + "learning_rate": 4.9205519828313804e-05, + "loss": 0.4552, "step": 14995 }, { - "epoch": 0.53, - "learning_rate": 4.9255702983282394e-05, - "loss": 0.3239, + "epoch": 0.5405989836739107, + "grad_norm": 0.16618047654628754, + "learning_rate": 4.9204789847025666e-05, + "loss": 0.4583, "step": 15000 }, { - "epoch": 0.53, - "eval_loss": 0.3090079426765442, - "eval_runtime": 10.5523, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 0.5405989836739107, + "eval_loss": 0.4699234068393707, + "eval_runtime": 3.5836, + "eval_samples_per_second": 27.905, + "eval_steps_per_second": 6.976, "step": 15000 }, { - "epoch": 0.53, - "learning_rate": 4.925501288576127e-05, - "loss": 0.3297, + "epoch": 0.5407791833351353, + "grad_norm": 0.16865772008895874, + "learning_rate": 4.920405953595196e-05, + "loss": 0.4513, "step": 15005 }, { - "epoch": 0.53, - "learning_rate": 4.925432247330477e-05, - "loss": 0.3281, + "epoch": 0.5409593829963599, + "grad_norm": 0.18350325524806976, + "learning_rate": 4.9203328895102616e-05, + "loss": 0.4655, "step": 15010 }, { - "epoch": 0.53, - "learning_rate": 4.92536317459219e-05, - "loss": 0.3328, + "epoch": 0.5411395826575846, + "grad_norm": 0.1351919025182724, + "learning_rate": 4.920259792448761e-05, + "loss": 0.4242, "step": 15015 }, { - "epoch": 0.53, - "learning_rate": 4.9252940703621596e-05, - "loss": 0.3205, + "epoch": 0.5413197823188093, + "grad_norm": 0.18003039062023163, + "learning_rate": 4.9201866624116896e-05, + "loss": 0.4616, "step": 15020 }, { - "epoch": 0.53, - "learning_rate": 4.925224934641284e-05, - "loss": 0.3209, + "epoch": 0.5414999819800339, + "grad_norm": 0.19233831763267517, + "learning_rate": 4.920113499400043e-05, + "loss": 0.4467, "step": 15025 }, { - "epoch": 0.53, - "learning_rate": 4.925155767430462e-05, - "loss": 0.3308, + "epoch": 0.5416801816412585, + "grad_norm": 0.16504104435443878, + "learning_rate": 4.9200403034148186e-05, + "loss": 0.4578, "step": 15030 }, { - "epoch": 0.53, - "learning_rate": 4.925086568730589e-05, - "loss": 0.3384, + "epoch": 0.5418603813024832, + "grad_norm": 0.2171589583158493, + "learning_rate": 4.919967074457014e-05, + "loss": 0.4737, "step": 15035 }, { - "epoch": 0.53, - "learning_rate": 4.925017338542566e-05, - "loss": 0.318, + "epoch": 0.5420405809637078, + "grad_norm": 0.17700451612472534, + "learning_rate": 4.919893812527626e-05, + "loss": 0.4803, "step": 15040 }, { - "epoch": 0.53, - "learning_rate": 4.924948076867291e-05, - "loss": 0.3091, + "epoch": 0.5422207806249324, + "grad_norm": 0.15637724101543427, + "learning_rate": 4.919820517627653e-05, + "loss": 0.4535, "step": 15045 }, { - "epoch": 0.53, - "learning_rate": 4.9248787837056633e-05, - "loss": 0.2807, + "epoch": 0.542400980286157, + "grad_norm": 0.18739740550518036, + "learning_rate": 4.9197471897580945e-05, + "loss": 0.4589, "step": 15050 }, { - "epoch": 0.53, - "learning_rate": 4.9248094590585825e-05, - "loss": 0.3351, + "epoch": 0.5425811799473818, + "grad_norm": 0.1649918407201767, + "learning_rate": 4.9196738289199484e-05, + "loss": 0.4825, "step": 15055 }, { - "epoch": 0.53, - 
"learning_rate": 4.924740102926948e-05, - "loss": 0.3397, + "epoch": 0.5427613796086064, + "grad_norm": 0.15141309797763824, + "learning_rate": 4.9196004351142156e-05, + "loss": 0.4802, "step": 15060 }, { - "epoch": 0.53, - "learning_rate": 4.9246707153116624e-05, - "loss": 0.286, + "epoch": 0.542941579269831, + "grad_norm": 0.18119317293167114, + "learning_rate": 4.9195270083418946e-05, + "loss": 0.469, "step": 15065 }, { - "epoch": 0.53, - "learning_rate": 4.9246012962136256e-05, - "loss": 0.3172, + "epoch": 0.5431217789310556, + "grad_norm": 0.145644873380661, + "learning_rate": 4.919453548603987e-05, + "loss": 0.4697, "step": 15070 }, { - "epoch": 0.53, - "learning_rate": 4.9245318456337384e-05, - "loss": 0.3121, + "epoch": 0.5433019785922802, + "grad_norm": 0.19345882534980774, + "learning_rate": 4.9193800559014935e-05, + "loss": 0.4733, "step": 15075 }, { - "epoch": 0.53, - "learning_rate": 4.924462363572903e-05, - "loss": 0.3049, + "epoch": 0.5434821782535049, + "grad_norm": 0.16904519498348236, + "learning_rate": 4.919306530235415e-05, + "loss": 0.4677, "step": 15080 }, { - "epoch": 0.53, - "learning_rate": 4.924392850032021e-05, - "loss": 0.3186, + "epoch": 0.5436623779147295, + "grad_norm": 0.15902064740657806, + "learning_rate": 4.919232971606753e-05, + "loss": 0.4743, "step": 15085 }, { - "epoch": 0.53, - "learning_rate": 4.924323305011996e-05, - "loss": 0.2936, + "epoch": 0.5438425775759541, + "grad_norm": 0.1665191799402237, + "learning_rate": 4.91915938001651e-05, + "loss": 0.4044, "step": 15090 }, { - "epoch": 0.53, - "learning_rate": 4.9242537285137314e-05, - "loss": 0.3289, + "epoch": 0.5440227772371788, + "grad_norm": 0.19838020205497742, + "learning_rate": 4.91908575546569e-05, + "loss": 0.4449, "step": 15095 }, { - "epoch": 0.53, - "learning_rate": 4.924184120538129e-05, - "loss": 0.3353, + "epoch": 0.5442029768984035, + "grad_norm": 0.1867869794368744, + "learning_rate": 4.919012097955294e-05, + "loss": 0.4659, "step": 15100 }, { - "epoch": 0.53, - "learning_rate": 4.924114481086094e-05, - "loss": 0.3201, + "epoch": 0.5443831765596281, + "grad_norm": 0.1696668267250061, + "learning_rate": 4.918938407486326e-05, + "loss": 0.4349, "step": 15105 }, { - "epoch": 0.53, - "learning_rate": 4.924044810158529e-05, - "loss": 0.3327, + "epoch": 0.5445633762208527, + "grad_norm": 0.15441665053367615, + "learning_rate": 4.918864684059792e-05, + "loss": 0.4425, "step": 15110 }, { - "epoch": 0.53, - "learning_rate": 4.9239751077563405e-05, - "loss": 0.3331, + "epoch": 0.5447435758820773, + "grad_norm": 0.1583663374185562, + "learning_rate": 4.918790927676694e-05, + "loss": 0.3933, "step": 15115 }, { - "epoch": 0.53, - "learning_rate": 4.923905373880433e-05, - "loss": 0.3006, + "epoch": 0.544923775543302, + "grad_norm": 0.16343651711940765, + "learning_rate": 4.918717138338038e-05, + "loss": 0.4006, "step": 15120 }, { - "epoch": 0.53, - "learning_rate": 4.923835608531711e-05, - "loss": 0.3307, + "epoch": 0.5451039752045266, + "grad_norm": 0.14582295715808868, + "learning_rate": 4.918643316044829e-05, + "loss": 0.4413, "step": 15125 }, { - "epoch": 0.53, - "learning_rate": 4.9237658117110805e-05, - "loss": 0.3225, + "epoch": 0.5452841748657512, + "grad_norm": 0.15008191764354706, + "learning_rate": 4.9185694607980737e-05, + "loss": 0.4231, "step": 15130 }, { - "epoch": 0.53, - "learning_rate": 4.923695983419449e-05, - "loss": 0.3058, + "epoch": 0.5454643745269759, + "grad_norm": 0.16880400478839874, + "learning_rate": 4.918495572598777e-05, + "loss": 0.4398, "step": 15135 }, { - 
"epoch": 0.53, - "learning_rate": 4.923626123657722e-05, - "loss": 0.307, + "epoch": 0.5456445741882006, + "grad_norm": 0.14454464614391327, + "learning_rate": 4.918421651447948e-05, + "loss": 0.4116, "step": 15140 }, { - "epoch": 0.53, - "learning_rate": 4.923556232426807e-05, - "loss": 0.3158, + "epoch": 0.5458247738494252, + "grad_norm": 0.22429820895195007, + "learning_rate": 4.9183476973465905e-05, + "loss": 0.4625, "step": 15145 }, { - "epoch": 0.53, - "learning_rate": 4.923486309727612e-05, - "loss": 0.3194, + "epoch": 0.5460049735106498, + "grad_norm": 0.17825621366500854, + "learning_rate": 4.9182737102957147e-05, + "loss": 0.4437, "step": 15150 }, { - "epoch": 0.53, - "learning_rate": 4.923416355561044e-05, - "loss": 0.2916, + "epoch": 0.5461851731718744, + "grad_norm": 0.16203615069389343, + "learning_rate": 4.9181996902963265e-05, + "loss": 0.4262, "step": 15155 }, { - "epoch": 0.53, - "learning_rate": 4.923346369928012e-05, - "loss": 0.3148, + "epoch": 0.546365372833099, + "grad_norm": 0.2014383226633072, + "learning_rate": 4.918125637349437e-05, + "loss": 0.479, "step": 15160 }, { - "epoch": 0.53, - "learning_rate": 4.923276352829424e-05, - "loss": 0.3263, + "epoch": 0.5465455724943237, + "grad_norm": 0.16691508889198303, + "learning_rate": 4.918051551456053e-05, + "loss": 0.4402, "step": 15165 }, { - "epoch": 0.53, - "learning_rate": 4.92320630426619e-05, - "loss": 0.3075, + "epoch": 0.5467257721555483, + "grad_norm": 0.18141350150108337, + "learning_rate": 4.917977432617186e-05, + "loss": 0.4648, "step": 15170 }, { - "epoch": 0.53, - "learning_rate": 4.9231362242392187e-05, - "loss": 0.3038, + "epoch": 0.546905971816773, + "grad_norm": 0.15297532081604004, + "learning_rate": 4.9179032808338435e-05, + "loss": 0.4106, "step": 15175 }, { - "epoch": 0.53, - "learning_rate": 4.92306611274942e-05, - "loss": 0.293, + "epoch": 0.5470861714779977, + "grad_norm": 0.14727525413036346, + "learning_rate": 4.917829096107037e-05, + "loss": 0.4584, "step": 15180 }, { - "epoch": 0.53, - "learning_rate": 4.9229959697977065e-05, - "loss": 0.3103, + "epoch": 0.5472663711392223, + "grad_norm": 0.18853066861629486, + "learning_rate": 4.917754878437778e-05, + "loss": 0.482, "step": 15185 }, { - "epoch": 0.53, - "learning_rate": 4.9229257953849855e-05, - "loss": 0.324, + "epoch": 0.5474465708004469, + "grad_norm": 0.196510449051857, + "learning_rate": 4.9176806278270757e-05, + "loss": 0.4466, "step": 15190 }, { - "epoch": 0.53, - "learning_rate": 4.9228555895121706e-05, - "loss": 0.2869, + "epoch": 0.5476267704616715, + "grad_norm": 0.16554872691631317, + "learning_rate": 4.917606344275944e-05, + "loss": 0.4687, "step": 15195 }, { - "epoch": 0.53, - "learning_rate": 4.9227853521801726e-05, - "loss": 0.3081, + "epoch": 0.5478069701228961, + "grad_norm": 0.15905487537384033, + "learning_rate": 4.917532027785394e-05, + "loss": 0.4855, "step": 15200 }, { - "epoch": 0.53, - "learning_rate": 4.922715083389903e-05, - "loss": 0.3291, + "epoch": 0.5479871697841208, + "grad_norm": 0.17025235295295715, + "learning_rate": 4.917457678356437e-05, + "loss": 0.4565, "step": 15205 }, { - "epoch": 0.54, - "learning_rate": 4.922644783142274e-05, - "loss": 0.3184, + "epoch": 0.5481673694453455, + "grad_norm": 0.18130546808242798, + "learning_rate": 4.917383295990088e-05, + "loss": 0.456, "step": 15210 }, { - "epoch": 0.54, - "learning_rate": 4.9225744514382007e-05, - "loss": 0.3145, + "epoch": 0.5483475691065701, + "grad_norm": 0.13437901437282562, + "learning_rate": 4.9173088806873596e-05, + "loss": 0.4207, "step": 
15215 }, { - "epoch": 0.54, - "learning_rate": 4.922504088278593e-05, - "loss": 0.3139, + "epoch": 0.5485277687677947, + "grad_norm": 0.1458578109741211, + "learning_rate": 4.917234432449266e-05, + "loss": 0.4671, "step": 15220 }, { - "epoch": 0.54, - "learning_rate": 4.922433693664368e-05, - "loss": 0.3348, + "epoch": 0.5487079684290194, + "grad_norm": 0.16906622052192688, + "learning_rate": 4.9171599512768206e-05, + "loss": 0.4346, "step": 15225 }, { - "epoch": 0.54, - "learning_rate": 4.922363267596437e-05, - "loss": 0.3065, + "epoch": 0.548888168090244, + "grad_norm": 0.17966735363006592, + "learning_rate": 4.917085437171038e-05, + "loss": 0.4734, "step": 15230 }, { - "epoch": 0.54, - "learning_rate": 4.922292810075715e-05, - "loss": 0.3147, + "epoch": 0.5490683677514686, + "grad_norm": 0.135285422205925, + "learning_rate": 4.917010890132936e-05, + "loss": 0.4293, "step": 15235 }, { - "epoch": 0.54, - "learning_rate": 4.9222223211031185e-05, - "loss": 0.3264, + "epoch": 0.5492485674126932, + "grad_norm": 0.15000662207603455, + "learning_rate": 4.916936310163528e-05, + "loss": 0.4076, "step": 15240 }, { - "epoch": 0.54, - "learning_rate": 4.9221518006795595e-05, - "loss": 0.2945, + "epoch": 0.5494287670739179, + "grad_norm": 0.17892701923847198, + "learning_rate": 4.9168616972638304e-05, + "loss": 0.4461, "step": 15245 }, { - "epoch": 0.54, - "learning_rate": 4.922081248805958e-05, - "loss": 0.2911, + "epoch": 0.5496089667351426, + "grad_norm": 0.16764943301677704, + "learning_rate": 4.91678705143486e-05, + "loss": 0.4662, "step": 15250 }, { - "epoch": 0.54, - "learning_rate": 4.922010665483226e-05, - "loss": 0.3106, + "epoch": 0.5497891663963672, + "grad_norm": 0.19673018157482147, + "learning_rate": 4.916712372677635e-05, + "loss": 0.4367, "step": 15255 }, { - "epoch": 0.54, - "learning_rate": 4.921940050712281e-05, - "loss": 0.3063, + "epoch": 0.5499693660575918, + "grad_norm": 0.13246646523475647, + "learning_rate": 4.916637660993171e-05, + "loss": 0.4468, "step": 15260 }, { - "epoch": 0.54, - "learning_rate": 4.9218694044940424e-05, - "loss": 0.3207, + "epoch": 0.5501495657188165, + "grad_norm": 0.1905840039253235, + "learning_rate": 4.916562916382487e-05, + "loss": 0.4495, "step": 15265 }, { - "epoch": 0.54, - "learning_rate": 4.9217987268294244e-05, - "loss": 0.2964, + "epoch": 0.5503297653800411, + "grad_norm": 0.18271660804748535, + "learning_rate": 4.916488138846601e-05, + "loss": 0.4694, "step": 15270 }, { - "epoch": 0.54, - "learning_rate": 4.921728017719347e-05, - "loss": 0.344, + "epoch": 0.5505099650412657, + "grad_norm": 0.15053506195545197, + "learning_rate": 4.916413328386531e-05, + "loss": 0.4371, "step": 15275 }, { - "epoch": 0.54, - "learning_rate": 4.921657277164726e-05, - "loss": 0.2759, + "epoch": 0.5506901647024903, + "grad_norm": 0.17273497581481934, + "learning_rate": 4.916338485003298e-05, + "loss": 0.4595, "step": 15280 }, { - "epoch": 0.54, - "learning_rate": 4.9215865051664824e-05, - "loss": 0.3429, + "epoch": 0.550870364363715, + "grad_norm": 0.1739252358675003, + "learning_rate": 4.916263608697921e-05, + "loss": 0.4688, "step": 15285 }, { - "epoch": 0.54, - "learning_rate": 4.9215157017255333e-05, - "loss": 0.3455, + "epoch": 0.5510505640249397, + "grad_norm": 0.16885848343372345, + "learning_rate": 4.916188699471421e-05, + "loss": 0.4307, "step": 15290 }, { - "epoch": 0.54, - "learning_rate": 4.921444866842799e-05, - "loss": 0.3217, + "epoch": 0.5512307636861643, + "grad_norm": 0.15223874151706696, + "learning_rate": 4.916113757324817e-05, + "loss": 
0.4597, "step": 15295 }, { - "epoch": 0.54, - "learning_rate": 4.921374000519199e-05, - "loss": 0.3146, + "epoch": 0.5514109633473889, + "grad_norm": 0.2001715898513794, + "learning_rate": 4.9160387822591306e-05, + "loss": 0.4417, "step": 15300 }, { - "epoch": 0.54, - "learning_rate": 4.9213031027556526e-05, - "loss": 0.294, + "epoch": 0.5515911630086135, + "grad_norm": 0.1790451556444168, + "learning_rate": 4.915963774275384e-05, + "loss": 0.447, "step": 15305 }, { - "epoch": 0.54, - "learning_rate": 4.921232173553082e-05, - "loss": 0.3116, + "epoch": 0.5517713626698382, + "grad_norm": 0.16498644649982452, + "learning_rate": 4.915888733374598e-05, + "loss": 0.4523, "step": 15310 }, { - "epoch": 0.54, - "learning_rate": 4.921161212912406e-05, - "loss": 0.332, + "epoch": 0.5519515623310628, + "grad_norm": 0.13701501488685608, + "learning_rate": 4.915813659557796e-05, + "loss": 0.4299, "step": 15315 }, { - "epoch": 0.54, - "learning_rate": 4.9210902208345486e-05, - "loss": 0.3257, + "epoch": 0.5521317619922874, + "grad_norm": 0.16090936958789825, + "learning_rate": 4.9157385528260016e-05, + "loss": 0.4038, "step": 15320 }, { - "epoch": 0.54, - "learning_rate": 4.921019197320429e-05, - "loss": 0.3238, + "epoch": 0.552311961653512, + "grad_norm": 0.156123086810112, + "learning_rate": 4.915663413180236e-05, + "loss": 0.4675, "step": 15325 }, { - "epoch": 0.54, - "learning_rate": 4.920948142370972e-05, - "loss": 0.2883, + "epoch": 0.5524921613147368, + "grad_norm": 0.17523333430290222, + "learning_rate": 4.915588240621524e-05, + "loss": 0.4965, "step": 15330 }, { - "epoch": 0.54, - "learning_rate": 4.9208770559870984e-05, - "loss": 0.3216, + "epoch": 0.5526723609759614, + "grad_norm": 0.1823972910642624, + "learning_rate": 4.9155130351508904e-05, + "loss": 0.4191, "step": 15335 }, { - "epoch": 0.54, - "learning_rate": 4.9208059381697314e-05, - "loss": 0.3331, + "epoch": 0.552852560637186, + "grad_norm": 0.19572138786315918, + "learning_rate": 4.915437796769359e-05, + "loss": 0.4305, "step": 15340 }, { - "epoch": 0.54, - "learning_rate": 4.920734788919794e-05, - "loss": 0.3136, + "epoch": 0.5530327602984106, + "grad_norm": 0.16662746667861938, + "learning_rate": 4.915362525477955e-05, + "loss": 0.4268, "step": 15345 }, { - "epoch": 0.54, - "learning_rate": 4.9206636082382115e-05, - "loss": 0.2992, + "epoch": 0.5532129599596353, + "grad_norm": 0.22275352478027344, + "learning_rate": 4.915287221277706e-05, + "loss": 0.4884, "step": 15350 }, { - "epoch": 0.54, - "learning_rate": 4.920592396125907e-05, - "loss": 0.3019, + "epoch": 0.5533931596208599, + "grad_norm": 0.19144576787948608, + "learning_rate": 4.915211884169635e-05, + "loss": 0.4326, "step": 15355 }, { - "epoch": 0.54, - "learning_rate": 4.920521152583806e-05, - "loss": 0.3563, + "epoch": 0.5535733592820845, + "grad_norm": 0.19240762293338776, + "learning_rate": 4.915136514154769e-05, + "loss": 0.4484, "step": 15360 }, { - "epoch": 0.54, - "learning_rate": 4.920449877612833e-05, - "loss": 0.3187, + "epoch": 0.5537535589433091, + "grad_norm": 0.13195356726646423, + "learning_rate": 4.915061111234136e-05, + "loss": 0.4294, "step": 15365 }, { - "epoch": 0.54, - "learning_rate": 4.9203785712139126e-05, - "loss": 0.3117, + "epoch": 0.5539337586045339, + "grad_norm": 0.1802310198545456, + "learning_rate": 4.914985675408763e-05, + "loss": 0.4616, "step": 15370 }, { - "epoch": 0.54, - "learning_rate": 4.920307233387973e-05, - "loss": 0.3155, + "epoch": 0.5541139582657585, + "grad_norm": 0.12225214391946793, + "learning_rate": 
4.914910206679678e-05, + "loss": 0.3886, "step": 15375 }, { - "epoch": 0.54, - "learning_rate": 4.9202358641359375e-05, - "loss": 0.305, + "epoch": 0.5542941579269831, + "grad_norm": 0.139492928981781, + "learning_rate": 4.914834705047909e-05, + "loss": 0.447, "step": 15380 }, { - "epoch": 0.54, - "learning_rate": 4.9201644634587354e-05, - "loss": 0.29, + "epoch": 0.5544743575882077, + "grad_norm": 0.14158743619918823, + "learning_rate": 4.9147591705144844e-05, + "loss": 0.4547, "step": 15385 }, { - "epoch": 0.54, - "learning_rate": 4.920093031357292e-05, - "loss": 0.3041, + "epoch": 0.5546545572494324, + "grad_norm": 0.164401575922966, + "learning_rate": 4.9146836030804346e-05, + "loss": 0.4442, "step": 15390 }, { - "epoch": 0.54, - "learning_rate": 4.9200215678325354e-05, - "loss": 0.3318, + "epoch": 0.554834756910657, + "grad_norm": 0.17058567702770233, + "learning_rate": 4.914608002746787e-05, + "loss": 0.4267, "step": 15395 }, { - "epoch": 0.54, - "learning_rate": 4.9199500728853944e-05, - "loss": 0.3196, + "epoch": 0.5550149565718816, + "grad_norm": 0.18380822241306305, + "learning_rate": 4.914532369514573e-05, + "loss": 0.4541, "step": 15400 }, { - "epoch": 0.54, - "learning_rate": 4.919878546516795e-05, - "loss": 0.3327, + "epoch": 0.5551951562331063, + "grad_norm": 0.1776258945465088, + "learning_rate": 4.914456703384823e-05, + "loss": 0.4566, "step": 15405 }, { - "epoch": 0.54, - "learning_rate": 4.919806988727669e-05, - "loss": 0.2934, + "epoch": 0.555375355894331, + "grad_norm": 0.13963723182678223, + "learning_rate": 4.914381004358568e-05, + "loss": 0.4172, "step": 15410 }, { - "epoch": 0.54, - "learning_rate": 4.9197353995189434e-05, - "loss": 0.2936, + "epoch": 0.5555555555555556, + "grad_norm": 0.1411467045545578, + "learning_rate": 4.9143052724368396e-05, + "loss": 0.3885, "step": 15415 }, { - "epoch": 0.54, - "learning_rate": 4.9196637788915486e-05, - "loss": 0.3291, + "epoch": 0.5557357552167802, + "grad_norm": 0.1662912219762802, + "learning_rate": 4.914229507620669e-05, + "loss": 0.4943, "step": 15420 }, { - "epoch": 0.54, - "learning_rate": 4.9195921268464143e-05, - "loss": 0.3002, + "epoch": 0.5559159548780048, + "grad_norm": 0.2055153101682663, + "learning_rate": 4.914153709911088e-05, + "loss": 0.4344, "step": 15425 }, { - "epoch": 0.54, - "learning_rate": 4.91952044338447e-05, - "loss": 0.3251, + "epoch": 0.5560961545392294, + "grad_norm": 0.17844471335411072, + "learning_rate": 4.9140778793091316e-05, + "loss": 0.4778, "step": 15430 }, { - "epoch": 0.54, - "learning_rate": 4.9194487285066474e-05, - "loss": 0.3033, + "epoch": 0.5562763542004541, + "grad_norm": 0.17044708132743835, + "learning_rate": 4.9140020158158305e-05, + "loss": 0.4267, "step": 15435 }, { - "epoch": 0.54, - "learning_rate": 4.919376982213878e-05, - "loss": 0.284, + "epoch": 0.5564565538616787, + "grad_norm": 0.1435849368572235, + "learning_rate": 4.91392611943222e-05, + "loss": 0.4465, "step": 15440 }, { - "epoch": 0.54, - "learning_rate": 4.919305204507093e-05, - "loss": 0.3229, + "epoch": 0.5566367535229034, + "grad_norm": 0.19326213002204895, + "learning_rate": 4.913850190159333e-05, + "loss": 0.4709, "step": 15445 }, { - "epoch": 0.54, - "learning_rate": 4.919233395387224e-05, - "loss": 0.3252, + "epoch": 0.556816953184128, + "grad_norm": 0.17136943340301514, + "learning_rate": 4.9137742279982035e-05, + "loss": 0.4668, "step": 15450 }, { - "epoch": 0.54, - "learning_rate": 4.919161554855204e-05, - "loss": 0.3176, + "epoch": 0.5569971528453527, + "grad_norm": 0.16831465065479279, + 
"learning_rate": 4.913698232949868e-05, + "loss": 0.471, "step": 15455 }, { - "epoch": 0.54, - "learning_rate": 4.919089682911965e-05, - "loss": 0.3115, + "epoch": 0.5571773525065773, + "grad_norm": 0.20569349825382233, + "learning_rate": 4.9136222050153626e-05, + "loss": 0.4573, "step": 15460 }, { - "epoch": 0.54, - "learning_rate": 4.919017779558441e-05, - "loss": 0.3387, + "epoch": 0.5573575521678019, + "grad_norm": 0.1882450133562088, + "learning_rate": 4.913546144195721e-05, + "loss": 0.4867, "step": 15465 }, { - "epoch": 0.54, - "learning_rate": 4.918945844795565e-05, - "loss": 0.301, + "epoch": 0.5575377518290265, + "grad_norm": 0.15884269773960114, + "learning_rate": 4.9134700504919805e-05, + "loss": 0.4389, "step": 15470 }, { - "epoch": 0.54, - "learning_rate": 4.918873878624272e-05, - "loss": 0.3064, + "epoch": 0.5577179514902512, + "grad_norm": 0.16877703368663788, + "learning_rate": 4.913393923905178e-05, + "loss": 0.4443, "step": 15475 }, { - "epoch": 0.54, - "learning_rate": 4.918801881045495e-05, - "loss": 0.3127, + "epoch": 0.5578981511514758, + "grad_norm": 0.18422040343284607, + "learning_rate": 4.9133177644363506e-05, + "loss": 0.4407, "step": 15480 }, { - "epoch": 0.54, - "learning_rate": 4.91872985206017e-05, - "loss": 0.3176, + "epoch": 0.5580783508127005, + "grad_norm": 0.1388750970363617, + "learning_rate": 4.9132415720865355e-05, + "loss": 0.4309, "step": 15485 }, { - "epoch": 0.54, - "learning_rate": 4.9186577916692325e-05, - "loss": 0.3175, + "epoch": 0.5582585504739251, + "grad_norm": 0.16202110052108765, + "learning_rate": 4.913165346856772e-05, + "loss": 0.4428, "step": 15490 }, { - "epoch": 0.55, - "learning_rate": 4.918585699873617e-05, - "loss": 0.3359, + "epoch": 0.5584387501351498, + "grad_norm": 0.17716795206069946, + "learning_rate": 4.9130890887480966e-05, + "loss": 0.4904, "step": 15495 }, { - "epoch": 0.55, - "learning_rate": 4.918513576674259e-05, - "loss": 0.3018, + "epoch": 0.5586189497963744, + "grad_norm": 0.14335161447525024, + "learning_rate": 4.91301279776155e-05, + "loss": 0.4346, "step": 15500 }, { - "epoch": 0.55, - "eval_loss": 0.309041827917099, - "eval_runtime": 10.5379, - "eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 0.5586189497963744, + "eval_loss": 0.4687560200691223, + "eval_runtime": 3.5676, + "eval_samples_per_second": 28.03, + "eval_steps_per_second": 7.008, "step": 15500 }, { - "epoch": 0.55, - "learning_rate": 4.918441422072098e-05, - "loss": 0.3475, + "epoch": 0.558799149457599, + "grad_norm": 0.1963035613298416, + "learning_rate": 4.912936473898172e-05, + "loss": 0.4499, "step": 15505 }, { - "epoch": 0.55, - "learning_rate": 4.9183692360680676e-05, - "loss": 0.3085, + "epoch": 0.5589793491188236, + "grad_norm": 0.1802736520767212, + "learning_rate": 4.912860117159001e-05, + "loss": 0.4784, "step": 15510 }, { - "epoch": 0.55, - "learning_rate": 4.918297018663107e-05, - "loss": 0.3378, + "epoch": 0.5591595487800483, + "grad_norm": 0.16385473310947418, + "learning_rate": 4.9127837275450786e-05, + "loss": 0.428, "step": 15515 }, { - "epoch": 0.55, - "learning_rate": 4.918224769858153e-05, - "loss": 0.3255, + "epoch": 0.5593397484412729, + "grad_norm": 0.1578623354434967, + "learning_rate": 4.9127073050574445e-05, + "loss": 0.4125, "step": 15520 }, { - "epoch": 0.55, - "learning_rate": 4.918152489654145e-05, - "loss": 0.3167, + "epoch": 0.5595199481024976, + "grad_norm": 0.2023286670446396, + "learning_rate": 4.912630849697141e-05, + "loss": 0.4491, "step": 15525 }, { - "epoch": 0.55, - 
"learning_rate": 4.918080178052019e-05, - "loss": 0.3193, + "epoch": 0.5597001477637222, + "grad_norm": 0.17793312668800354, + "learning_rate": 4.912554361465208e-05, + "loss": 0.435, "step": 15530 }, { - "epoch": 0.55, - "learning_rate": 4.918007835052717e-05, - "loss": 0.311, + "epoch": 0.5598803474249469, + "grad_norm": 0.18119443953037262, + "learning_rate": 4.91247784036269e-05, + "loss": 0.4516, "step": 15535 }, { - "epoch": 0.55, - "learning_rate": 4.917935460657176e-05, - "loss": 0.3121, + "epoch": 0.5600605470861715, + "grad_norm": 0.15839527547359467, + "learning_rate": 4.912401286390629e-05, + "loss": 0.4549, "step": 15540 }, { - "epoch": 0.55, - "learning_rate": 4.917863054866336e-05, - "loss": 0.3069, + "epoch": 0.5602407467473961, + "grad_norm": 0.19357532262802124, + "learning_rate": 4.912324699550066e-05, + "loss": 0.4639, "step": 15545 }, { - "epoch": 0.55, - "learning_rate": 4.9177906176811384e-05, - "loss": 0.2918, + "epoch": 0.5604209464086207, + "grad_norm": 0.16731002926826477, + "learning_rate": 4.9122480798420474e-05, + "loss": 0.4079, "step": 15550 }, { - "epoch": 0.55, - "learning_rate": 4.917718149102523e-05, - "loss": 0.2929, + "epoch": 0.5606011460698453, + "grad_norm": 0.16960617899894714, + "learning_rate": 4.912171427267615e-05, + "loss": 0.4516, "step": 15555 }, { - "epoch": 0.55, - "learning_rate": 4.9176456491314304e-05, - "loss": 0.3343, + "epoch": 0.5607813457310701, + "grad_norm": 0.16323991119861603, + "learning_rate": 4.912094741827814e-05, + "loss": 0.4396, "step": 15560 }, { - "epoch": 0.55, - "learning_rate": 4.917573117768803e-05, - "loss": 0.2854, + "epoch": 0.5609615453922947, + "grad_norm": 0.17487618327140808, + "learning_rate": 4.9120180235236895e-05, + "loss": 0.4219, "step": 15565 }, { - "epoch": 0.55, - "learning_rate": 4.917500555015582e-05, - "loss": 0.3236, + "epoch": 0.5611417450535193, + "grad_norm": 0.15167950093746185, + "learning_rate": 4.911941272356286e-05, + "loss": 0.4486, "step": 15570 }, { - "epoch": 0.55, - "learning_rate": 4.917427960872708e-05, - "loss": 0.3021, + "epoch": 0.561321944714744, + "grad_norm": 0.17126460373401642, + "learning_rate": 4.91186448832665e-05, + "loss": 0.4338, "step": 15575 }, { - "epoch": 0.55, - "learning_rate": 4.917355335341126e-05, - "loss": 0.3098, + "epoch": 0.5615021443759686, + "grad_norm": 0.18867506086826324, + "learning_rate": 4.911787671435827e-05, + "loss": 0.4422, "step": 15580 }, { - "epoch": 0.55, - "learning_rate": 4.9172826784217786e-05, - "loss": 0.3106, + "epoch": 0.5616823440371932, + "grad_norm": 0.1628337949514389, + "learning_rate": 4.911710821684864e-05, + "loss": 0.4467, "step": 15585 }, { - "epoch": 0.55, - "learning_rate": 4.9172099901156076e-05, - "loss": 0.3122, + "epoch": 0.5618625436984178, + "grad_norm": 0.1885543018579483, + "learning_rate": 4.911633939074808e-05, + "loss": 0.4332, "step": 15590 }, { - "epoch": 0.55, - "learning_rate": 4.9171372704235584e-05, - "loss": 0.2936, + "epoch": 0.5620427433596424, + "grad_norm": 0.20939011871814728, + "learning_rate": 4.9115570236067065e-05, + "loss": 0.4422, "step": 15595 }, { - "epoch": 0.55, - "learning_rate": 4.917064519346575e-05, - "loss": 0.329, + "epoch": 0.5622229430208672, + "grad_norm": 0.15987086296081543, + "learning_rate": 4.911480075281607e-05, + "loss": 0.4408, "step": 15600 }, { - "epoch": 0.55, - "learning_rate": 4.916991736885602e-05, - "loss": 0.285, + "epoch": 0.5624031426820918, + "grad_norm": 0.13029427826404572, + "learning_rate": 4.911403094100559e-05, + "loss": 0.4403, "step": 15605 }, { - 
"epoch": 0.55, - "learning_rate": 4.916918923041584e-05, - "loss": 0.3033, + "epoch": 0.5625833423433164, + "grad_norm": 0.18694214522838593, + "learning_rate": 4.91132608006461e-05, + "loss": 0.4415, "step": 15610 }, { - "epoch": 0.55, - "learning_rate": 4.916846077815466e-05, - "loss": 0.3181, + "epoch": 0.562763542004541, + "grad_norm": 0.1758272796869278, + "learning_rate": 4.9112490331748105e-05, + "loss": 0.4124, "step": 15615 }, { - "epoch": 0.55, - "learning_rate": 4.9167732012081944e-05, - "loss": 0.3313, + "epoch": 0.5629437416657657, + "grad_norm": 0.16514959931373596, + "learning_rate": 4.91117195343221e-05, + "loss": 0.4524, "step": 15620 }, { - "epoch": 0.55, - "learning_rate": 4.9167002932207156e-05, - "loss": 0.2979, + "epoch": 0.5631239413269903, + "grad_norm": 0.14684879779815674, + "learning_rate": 4.911094840837859e-05, + "loss": 0.4564, "step": 15625 }, { - "epoch": 0.55, - "learning_rate": 4.916627353853976e-05, - "loss": 0.3045, + "epoch": 0.5633041409882149, + "grad_norm": 0.17186830937862396, + "learning_rate": 4.911017695392807e-05, + "loss": 0.4371, "step": 15630 }, { - "epoch": 0.55, - "learning_rate": 4.916554383108923e-05, - "loss": 0.3124, + "epoch": 0.5634843406494395, + "grad_norm": 0.1487584263086319, + "learning_rate": 4.910940517098105e-05, + "loss": 0.4323, "step": 15635 }, { - "epoch": 0.55, - "learning_rate": 4.916481380986504e-05, - "loss": 0.3113, + "epoch": 0.5636645403106643, + "grad_norm": 0.1303669810295105, + "learning_rate": 4.910863305954805e-05, + "loss": 0.4372, "step": 15640 }, { - "epoch": 0.55, - "learning_rate": 4.9164083474876675e-05, - "loss": 0.3283, + "epoch": 0.5638447399718889, + "grad_norm": 0.16420136392116547, + "learning_rate": 4.91078606196396e-05, + "loss": 0.4775, "step": 15645 }, { - "epoch": 0.55, - "learning_rate": 4.91633528261336e-05, - "loss": 0.3191, + "epoch": 0.5640249396331135, + "grad_norm": 0.18261446058750153, + "learning_rate": 4.910708785126622e-05, + "loss": 0.4135, "step": 15650 }, { - "epoch": 0.55, - "learning_rate": 4.9162621863645317e-05, - "loss": 0.3268, + "epoch": 0.5642051392943381, + "grad_norm": 0.1571108102798462, + "learning_rate": 4.910631475443843e-05, + "loss": 0.4463, "step": 15655 }, { - "epoch": 0.55, - "learning_rate": 4.916189058742131e-05, - "loss": 0.2921, + "epoch": 0.5643853389555628, + "grad_norm": 0.18948879837989807, + "learning_rate": 4.910554132916677e-05, + "loss": 0.4544, "step": 15660 }, { - "epoch": 0.55, - "learning_rate": 4.9161158997471086e-05, - "loss": 0.3122, + "epoch": 0.5645655386167874, + "grad_norm": 0.19807395339012146, + "learning_rate": 4.910476757546177e-05, + "loss": 0.4835, "step": 15665 }, { - "epoch": 0.55, - "learning_rate": 4.916042709380413e-05, - "loss": 0.3073, + "epoch": 0.564745738278012, + "grad_norm": 0.1863390952348709, + "learning_rate": 4.910399349333399e-05, + "loss": 0.4522, "step": 15670 }, { - "epoch": 0.55, - "learning_rate": 4.915969487642995e-05, - "loss": 0.2936, + "epoch": 0.5649259379392366, + "grad_norm": 0.17297495901584625, + "learning_rate": 4.910321908279396e-05, + "loss": 0.4493, "step": 15675 }, { - "epoch": 0.55, - "learning_rate": 4.9159108876667794e-05, - "loss": 0.3335, + "epoch": 0.5651061376004614, + "grad_norm": 0.1477319598197937, + "learning_rate": 4.910244434385224e-05, + "loss": 0.4326, "step": 15680 }, { - "epoch": 0.55, - "learning_rate": 4.915837609464457e-05, - "loss": 0.3175, + "epoch": 0.565286337261686, + "grad_norm": 0.18947510421276093, + "learning_rate": 4.9101669276519374e-05, + "loss": 0.4266, "step": 
15685 }, { - "epoch": 0.55, - "learning_rate": 4.915764299894076e-05, - "loss": 0.2781, + "epoch": 0.5654665369229106, + "grad_norm": 0.1564638763666153, + "learning_rate": 4.910089388080593e-05, + "loss": 0.4275, "step": 15690 }, { - "epoch": 0.55, - "learning_rate": 4.9156909589565866e-05, - "loss": 0.3107, + "epoch": 0.5656467365841352, + "grad_norm": 0.1595679074525833, + "learning_rate": 4.9100118156722485e-05, + "loss": 0.4342, "step": 15695 }, { - "epoch": 0.55, - "learning_rate": 4.915617586652943e-05, - "loss": 0.3128, + "epoch": 0.5658269362453598, + "grad_norm": 0.16428592801094055, + "learning_rate": 4.909934210427959e-05, + "loss": 0.4652, "step": 15700 }, { - "epoch": 0.55, - "learning_rate": 4.915544182984096e-05, - "loss": 0.3059, + "epoch": 0.5660071359065845, + "grad_norm": 0.15228533744812012, + "learning_rate": 4.909856572348782e-05, + "loss": 0.3994, "step": 15705 }, { - "epoch": 0.55, - "learning_rate": 4.915470747951001e-05, - "loss": 0.3112, + "epoch": 0.5661873355678091, + "grad_norm": 0.18478292226791382, + "learning_rate": 4.9097789014357763e-05, + "loss": 0.4288, "step": 15710 }, { - "epoch": 0.55, - "learning_rate": 4.91539728155461e-05, - "loss": 0.3301, + "epoch": 0.5663675352290337, + "grad_norm": 0.1817883402109146, + "learning_rate": 4.9097011976899996e-05, + "loss": 0.4205, "step": 15715 }, { - "epoch": 0.55, - "learning_rate": 4.915323783795876e-05, - "loss": 0.3488, + "epoch": 0.5665477348902584, + "grad_norm": 0.16065534949302673, + "learning_rate": 4.90962346111251e-05, + "loss": 0.4548, "step": 15720 }, { - "epoch": 0.55, - "learning_rate": 4.915250254675755e-05, - "loss": 0.3322, + "epoch": 0.5667279345514831, + "grad_norm": 0.1838037371635437, + "learning_rate": 4.909545691704368e-05, + "loss": 0.4084, "step": 15725 }, { - "epoch": 0.55, - "learning_rate": 4.9151766941952015e-05, - "loss": 0.3113, + "epoch": 0.5669081342127077, + "grad_norm": 0.21379324793815613, + "learning_rate": 4.909467889466632e-05, + "loss": 0.4492, "step": 15730 }, { - "epoch": 0.55, - "learning_rate": 4.91510310235517e-05, - "loss": 0.3019, + "epoch": 0.5670883338739323, + "grad_norm": 0.18225577473640442, + "learning_rate": 4.9093900544003625e-05, + "loss": 0.4403, "step": 15735 }, { - "epoch": 0.55, - "learning_rate": 4.9150294791566167e-05, - "loss": 0.2977, + "epoch": 0.5672685335351569, + "grad_norm": 0.16789789497852325, + "learning_rate": 4.909327762711516e-05, + "loss": 0.4592, "step": 15740 }, { - "epoch": 0.55, - "learning_rate": 4.9149558246004964e-05, - "loss": 0.329, + "epoch": 0.5674487331963816, + "grad_norm": 0.16580015420913696, + "learning_rate": 4.909249868556559e-05, + "loss": 0.4639, "step": 15745 }, { - "epoch": 0.55, - "learning_rate": 4.914882138687767e-05, - "loss": 0.3104, + "epoch": 0.5676289328576062, + "grad_norm": 0.16387851536273956, + "learning_rate": 4.909171941576039e-05, + "loss": 0.3932, "step": 15750 }, { - "epoch": 0.55, - "learning_rate": 4.914808421419384e-05, - "loss": 0.3201, + "epoch": 0.5678091325188309, + "grad_norm": 0.1637287735939026, + "learning_rate": 4.909093981771018e-05, + "loss": 0.4378, "step": 15755 }, { - "epoch": 0.55, - "learning_rate": 4.914734672796305e-05, - "loss": 0.2997, + "epoch": 0.5679893321800555, + "grad_norm": 0.1793089210987091, + "learning_rate": 4.909015989142557e-05, + "loss": 0.4265, "step": 15760 }, { - "epoch": 0.55, - "learning_rate": 4.914660892819488e-05, - "loss": 0.3156, + "epoch": 0.5681695318412802, + "grad_norm": 0.17664669454097748, + "learning_rate": 4.908937963691721e-05, + "loss": 
0.4682, "step": 15765 }, { - "epoch": 0.55, - "learning_rate": 4.914587081489891e-05, - "loss": 0.3139, + "epoch": 0.5683497315025048, + "grad_norm": 0.17455190420150757, + "learning_rate": 4.90885990541957e-05, + "loss": 0.4658, "step": 15770 }, { - "epoch": 0.56, - "learning_rate": 4.914513238808471e-05, - "loss": 0.3022, + "epoch": 0.5685299311637294, + "grad_norm": 0.17027586698532104, + "learning_rate": 4.9087818143271703e-05, + "loss": 0.4062, "step": 15775 }, { - "epoch": 0.56, - "learning_rate": 4.914439364776189e-05, - "loss": 0.2884, + "epoch": 0.568710130824954, + "grad_norm": 0.23216940462589264, + "learning_rate": 4.9087036904155844e-05, + "loss": 0.4304, "step": 15780 }, { - "epoch": 0.56, - "learning_rate": 4.914365459394002e-05, - "loss": 0.3486, + "epoch": 0.5688903304861787, + "grad_norm": 0.14484082162380219, + "learning_rate": 4.908625533685878e-05, + "loss": 0.4446, "step": 15785 }, { - "epoch": 0.56, - "learning_rate": 4.914291522662871e-05, - "loss": 0.3116, + "epoch": 0.5690705301474033, + "grad_norm": 0.18710222840309143, + "learning_rate": 4.908547344139114e-05, + "loss": 0.4319, "step": 15790 }, { - "epoch": 0.56, - "learning_rate": 4.9142175545837555e-05, - "loss": 0.3308, + "epoch": 0.569250729808628, + "grad_norm": 0.16132237017154694, + "learning_rate": 4.908469121776358e-05, + "loss": 0.4643, "step": 15795 }, { - "epoch": 0.56, - "learning_rate": 4.9141435551576165e-05, - "loss": 0.319, + "epoch": 0.5694309294698526, + "grad_norm": 0.16530324518680573, + "learning_rate": 4.908390866598678e-05, + "loss": 0.4599, "step": 15800 }, { - "epoch": 0.56, - "learning_rate": 4.914069524385414e-05, - "loss": 0.3092, + "epoch": 0.5696111291310773, + "grad_norm": 0.16622117161750793, + "learning_rate": 4.908312578607138e-05, + "loss": 0.4115, "step": 15805 }, { - "epoch": 0.56, - "learning_rate": 4.9139954622681094e-05, - "loss": 0.3191, + "epoch": 0.5697913287923019, + "grad_norm": 0.19069482386112213, + "learning_rate": 4.9082342578028054e-05, + "loss": 0.4046, "step": 15810 }, { - "epoch": 0.56, - "learning_rate": 4.9139213688066656e-05, - "loss": 0.328, + "epoch": 0.5699715284535265, + "grad_norm": 0.15032880008220673, + "learning_rate": 4.908155904186747e-05, + "loss": 0.4386, "step": 15815 }, { - "epoch": 0.56, - "learning_rate": 4.913847244002043e-05, - "loss": 0.3296, + "epoch": 0.5701517281147511, + "grad_norm": 0.18905720114707947, + "learning_rate": 4.9080775177600316e-05, + "loss": 0.4328, "step": 15820 }, { - "epoch": 0.56, - "learning_rate": 4.9137730878552044e-05, - "loss": 0.3267, + "epoch": 0.5703319277759757, + "grad_norm": 0.1644611656665802, + "learning_rate": 4.907999098523726e-05, + "loss": 0.4288, "step": 15825 }, { - "epoch": 0.56, - "learning_rate": 4.913698900367114e-05, - "loss": 0.3038, + "epoch": 0.5705121274372004, + "grad_norm": 0.21615582704544067, + "learning_rate": 4.9079206464788986e-05, + "loss": 0.4575, "step": 15830 }, { - "epoch": 0.56, - "learning_rate": 4.9136246815387334e-05, - "loss": 0.3019, + "epoch": 0.5706923270984251, + "grad_norm": 0.15707986056804657, + "learning_rate": 4.907842161626618e-05, + "loss": 0.4264, "step": 15835 }, { - "epoch": 0.56, - "learning_rate": 4.913550431371027e-05, - "loss": 0.2831, + "epoch": 0.5708725267596497, + "grad_norm": 0.22739115357398987, + "learning_rate": 4.9077636439679554e-05, + "loss": 0.4573, "step": 15840 }, { - "epoch": 0.56, - "learning_rate": 4.9134761498649586e-05, - "loss": 0.3151, + "epoch": 0.5710527264208743, + "grad_norm": 0.16067542135715485, + "learning_rate": 
4.907685093503978e-05, + "loss": 0.4617, "step": 15845 }, { - "epoch": 0.56, - "learning_rate": 4.913401837021493e-05, - "loss": 0.3058, + "epoch": 0.571232926082099, + "grad_norm": 0.17511925101280212, + "learning_rate": 4.907606510235758e-05, + "loss": 0.4553, "step": 15850 }, { - "epoch": 0.56, - "learning_rate": 4.913327492841595e-05, - "loss": 0.2843, + "epoch": 0.5714131257433236, + "grad_norm": 0.15019887685775757, + "learning_rate": 4.9075278941643655e-05, + "loss": 0.4247, "step": 15855 }, { - "epoch": 0.56, - "learning_rate": 4.9132531173262306e-05, - "loss": 0.2963, + "epoch": 0.5715933254045482, + "grad_norm": 0.2024499922990799, + "learning_rate": 4.907449245290872e-05, + "loss": 0.4579, "step": 15860 }, { - "epoch": 0.56, - "learning_rate": 4.913178710476365e-05, - "loss": 0.3378, + "epoch": 0.5717735250657728, + "grad_norm": 0.1599128544330597, + "learning_rate": 4.907370563616347e-05, + "loss": 0.4592, "step": 15865 }, { - "epoch": 0.56, - "learning_rate": 4.913104272292963e-05, - "loss": 0.3084, + "epoch": 0.5719537247269975, + "grad_norm": 0.18762356042861938, + "learning_rate": 4.907291849141865e-05, + "loss": 0.4446, "step": 15870 }, { - "epoch": 0.56, - "learning_rate": 4.913029802776993e-05, - "loss": 0.3009, + "epoch": 0.5721339243882222, + "grad_norm": 0.20976999402046204, + "learning_rate": 4.907213101868498e-05, + "loss": 0.4023, "step": 15875 }, { - "epoch": 0.56, - "learning_rate": 4.9129553019294214e-05, - "loss": 0.3213, + "epoch": 0.5723141240494468, + "grad_norm": 0.1850195974111557, + "learning_rate": 4.9071343217973185e-05, + "loss": 0.3954, "step": 15880 }, { - "epoch": 0.56, - "learning_rate": 4.9128807697512146e-05, - "loss": 0.3199, + "epoch": 0.5724943237106714, + "grad_norm": 0.1581835001707077, + "learning_rate": 4.9070555089294004e-05, + "loss": 0.4613, "step": 15885 }, { - "epoch": 0.56, - "learning_rate": 4.912806206243342e-05, - "loss": 0.3262, + "epoch": 0.5726745233718961, + "grad_norm": 0.13934732973575592, + "learning_rate": 4.906976663265816e-05, + "loss": 0.3904, "step": 15890 }, { - "epoch": 0.56, - "learning_rate": 4.9127316114067715e-05, - "loss": 0.3089, + "epoch": 0.5728547230331207, + "grad_norm": 0.18741364777088165, + "learning_rate": 4.906897784807641e-05, + "loss": 0.4099, "step": 15895 }, { - "epoch": 0.56, - "learning_rate": 4.91265698524247e-05, - "loss": 0.3164, + "epoch": 0.5730349226943453, + "grad_norm": 0.24076466262340546, + "learning_rate": 4.9068188735559494e-05, + "loss": 0.4394, "step": 15900 }, { - "epoch": 0.56, - "learning_rate": 4.9125823277514075e-05, - "loss": 0.322, + "epoch": 0.5732151223555699, + "grad_norm": 0.14442354440689087, + "learning_rate": 4.9067399295118166e-05, + "loss": 0.389, "step": 15905 }, { - "epoch": 0.56, - "learning_rate": 4.9125076389345534e-05, - "loss": 0.3247, + "epoch": 0.5733953220167947, + "grad_norm": 0.15537168085575104, + "learning_rate": 4.906660952676318e-05, + "loss": 0.4864, "step": 15910 }, { - "epoch": 0.56, - "learning_rate": 4.912432918792878e-05, - "loss": 0.2992, + "epoch": 0.5735755216780193, + "grad_norm": 0.1750095635652542, + "learning_rate": 4.906581943050531e-05, + "loss": 0.453, "step": 15915 }, { - "epoch": 0.56, - "learning_rate": 4.912358167327351e-05, - "loss": 0.3167, + "epoch": 0.5737557213392439, + "grad_norm": 0.13674676418304443, + "learning_rate": 4.90650290063553e-05, + "loss": 0.4463, "step": 15920 }, { - "epoch": 0.56, - "learning_rate": 4.9122833845389426e-05, - "loss": 0.3208, + "epoch": 0.5739359210004685, + "grad_norm": 0.21467788517475128, + 
"learning_rate": 4.9064238254323934e-05, + "loss": 0.4174, "step": 15925 }, { - "epoch": 0.56, - "learning_rate": 4.9122085704286246e-05, - "loss": 0.2921, + "epoch": 0.5741161206616932, + "grad_norm": 0.19165554642677307, + "learning_rate": 4.906344717442198e-05, + "loss": 0.4429, "step": 15930 }, { - "epoch": 0.56, - "learning_rate": 4.9121337249973675e-05, - "loss": 0.3092, + "epoch": 0.5742963203229178, + "grad_norm": 0.183024063706398, + "learning_rate": 4.906265576666022e-05, + "loss": 0.4571, "step": 15935 }, { - "epoch": 0.56, - "learning_rate": 4.912058848246144e-05, - "loss": 0.3083, + "epoch": 0.5744765199841424, + "grad_norm": 0.18742796778678894, + "learning_rate": 4.906186403104942e-05, + "loss": 0.4394, "step": 15940 }, { - "epoch": 0.56, - "learning_rate": 4.911983940175926e-05, - "loss": 0.2973, + "epoch": 0.574656719645367, + "grad_norm": 0.17034278810024261, + "learning_rate": 4.9061071967600394e-05, + "loss": 0.4422, "step": 15945 }, { - "epoch": 0.56, - "learning_rate": 4.9119090007876866e-05, - "loss": 0.326, + "epoch": 0.5748369193065918, + "grad_norm": 0.19667524099349976, + "learning_rate": 4.906027957632392e-05, + "loss": 0.4346, "step": 15950 }, { - "epoch": 0.56, - "learning_rate": 4.911834030082397e-05, - "loss": 0.3251, + "epoch": 0.5750171189678164, + "grad_norm": 0.16463510692119598, + "learning_rate": 4.9059486857230785e-05, + "loss": 0.4951, "step": 15955 }, { - "epoch": 0.56, - "learning_rate": 4.911759028061033e-05, - "loss": 0.3269, + "epoch": 0.575197318629041, + "grad_norm": 0.1695079207420349, + "learning_rate": 4.9058693810331815e-05, + "loss": 0.4675, "step": 15960 }, { - "epoch": 0.56, - "learning_rate": 4.9116839947245677e-05, - "loss": 0.3348, + "epoch": 0.5753775182902656, + "grad_norm": 0.13872882723808289, + "learning_rate": 4.905790043563779e-05, + "loss": 0.4046, "step": 15965 }, { - "epoch": 0.56, - "learning_rate": 4.9116089300739745e-05, - "loss": 0.3269, + "epoch": 0.5755577179514902, + "grad_norm": 0.196579709649086, + "learning_rate": 4.905710673315953e-05, + "loss": 0.452, "step": 15970 }, { - "epoch": 0.56, - "learning_rate": 4.9115338341102287e-05, - "loss": 0.306, + "epoch": 0.5757379176127149, + "grad_norm": 0.195785254240036, + "learning_rate": 4.9056312702907844e-05, + "loss": 0.4594, "step": 15975 }, { - "epoch": 0.56, - "learning_rate": 4.911458706834305e-05, - "loss": 0.2972, + "epoch": 0.5759181172739395, + "grad_norm": 0.1784789115190506, + "learning_rate": 4.905551834489356e-05, + "loss": 0.4217, "step": 15980 }, { - "epoch": 0.56, - "learning_rate": 4.9113835482471796e-05, - "loss": 0.3247, + "epoch": 0.5760983169351641, + "grad_norm": 0.1556771695613861, + "learning_rate": 4.9054723659127496e-05, + "loss": 0.4264, "step": 15985 }, { - "epoch": 0.56, - "learning_rate": 4.911308358349828e-05, - "loss": 0.336, + "epoch": 0.5762785165963888, + "grad_norm": 0.1567842811346054, + "learning_rate": 4.9053928645620484e-05, + "loss": 0.449, "step": 15990 }, { - "epoch": 0.56, - "learning_rate": 4.911233137143226e-05, - "loss": 0.3204, + "epoch": 0.5764587162576135, + "grad_norm": 0.19066265225410461, + "learning_rate": 4.9053133304383346e-05, + "loss": 0.4512, "step": 15995 }, { - "epoch": 0.56, - "learning_rate": 4.911157884628352e-05, - "loss": 0.3203, + "epoch": 0.5766389159188381, + "grad_norm": 0.17999373376369476, + "learning_rate": 4.9052337635426925e-05, + "loss": 0.4467, "step": 16000 }, { - "epoch": 0.56, - "eval_loss": 0.30903559923171997, - "eval_runtime": 10.5428, - "eval_samples_per_second": 9.485, - 
"eval_steps_per_second": 9.485, + "epoch": 0.5766389159188381, + "eval_loss": 0.4678036570549011, + "eval_runtime": 3.5673, + "eval_samples_per_second": 28.032, + "eval_steps_per_second": 7.008, "step": 16000 }, { - "epoch": 0.56, - "learning_rate": 4.9110826008061804e-05, - "loss": 0.2991, + "epoch": 0.5768191155800627, + "grad_norm": 0.21432943642139435, + "learning_rate": 4.9051541638762055e-05, + "loss": 0.4813, "step": 16005 }, { - "epoch": 0.56, - "learning_rate": 4.911007285677691e-05, - "loss": 0.3346, + "epoch": 0.5769993152412873, + "grad_norm": 0.1877758800983429, + "learning_rate": 4.905074531439959e-05, + "loss": 0.4132, "step": 16010 }, { - "epoch": 0.56, - "learning_rate": 4.910931939243861e-05, - "loss": 0.3237, + "epoch": 0.577179514902512, + "grad_norm": 0.1863996535539627, + "learning_rate": 4.904994866235038e-05, + "loss": 0.4336, "step": 16015 }, { - "epoch": 0.56, - "learning_rate": 4.910856561505668e-05, - "loss": 0.2991, + "epoch": 0.5773597145637366, + "grad_norm": 0.16214637458324432, + "learning_rate": 4.904915168262527e-05, + "loss": 0.4463, "step": 16020 }, { - "epoch": 0.56, - "learning_rate": 4.9107811524640926e-05, - "loss": 0.3116, + "epoch": 0.5775399142249612, + "grad_norm": 0.17068390548229218, + "learning_rate": 4.9048354375235125e-05, + "loss": 0.4509, "step": 16025 }, { - "epoch": 0.56, - "learning_rate": 4.9107057121201116e-05, - "loss": 0.3334, + "epoch": 0.5777201138861859, + "grad_norm": 0.1428307294845581, + "learning_rate": 4.9047556740190814e-05, + "loss": 0.4417, "step": 16030 }, { - "epoch": 0.56, - "learning_rate": 4.9106302404747064e-05, - "loss": 0.2986, + "epoch": 0.5779003135474106, + "grad_norm": 0.19168958067893982, + "learning_rate": 4.90467587775032e-05, + "loss": 0.4479, "step": 16035 }, { - "epoch": 0.56, - "learning_rate": 4.9105547375288553e-05, - "loss": 0.3229, + "epoch": 0.5780805132086352, + "grad_norm": 0.15157431364059448, + "learning_rate": 4.9045960487183144e-05, + "loss": 0.4559, "step": 16040 }, { - "epoch": 0.56, - "learning_rate": 4.91047920328354e-05, - "loss": 0.3129, + "epoch": 0.5782607128698598, + "grad_norm": 0.17288236320018768, + "learning_rate": 4.904516186924154e-05, + "loss": 0.4378, "step": 16045 }, { - "epoch": 0.56, - "learning_rate": 4.910403637739741e-05, - "loss": 0.337, + "epoch": 0.5784409125310844, + "grad_norm": 0.15723341703414917, + "learning_rate": 4.904436292368925e-05, + "loss": 0.4208, "step": 16050 }, { - "epoch": 0.56, - "learning_rate": 4.910328040898439e-05, - "loss": 0.3069, + "epoch": 0.578621112192309, + "grad_norm": 0.13947317004203796, + "learning_rate": 4.9043563650537185e-05, + "loss": 0.445, "step": 16055 }, { - "epoch": 0.57, - "learning_rate": 4.9102524127606156e-05, - "loss": 0.3133, + "epoch": 0.5788013118535337, + "grad_norm": 0.18408885598182678, + "learning_rate": 4.9042764049796205e-05, + "loss": 0.4146, "step": 16060 }, { - "epoch": 0.57, - "learning_rate": 4.910176753327254e-05, - "loss": 0.3323, + "epoch": 0.5789815115147584, + "grad_norm": 0.17368938028812408, + "learning_rate": 4.904196412147723e-05, + "loss": 0.4689, "step": 16065 }, { - "epoch": 0.57, - "learning_rate": 4.910101062599334e-05, - "loss": 0.3099, + "epoch": 0.579161711175983, + "grad_norm": 0.14086973667144775, + "learning_rate": 4.904116386559115e-05, + "loss": 0.4167, "step": 16070 }, { - "epoch": 0.57, - "learning_rate": 4.910025340577841e-05, - "loss": 0.2945, + "epoch": 0.5793419108372077, + "grad_norm": 0.15786948800086975, + "learning_rate": 4.9040363282148854e-05, + "loss": 0.4391, "step": 
16075 }, { - "epoch": 0.57, - "learning_rate": 4.9099495872637574e-05, - "loss": 0.3001, + "epoch": 0.5795221104984323, + "grad_norm": 0.15535566210746765, + "learning_rate": 4.903956237116127e-05, + "loss": 0.4552, "step": 16080 }, { - "epoch": 0.57, - "learning_rate": 4.909873802658067e-05, - "loss": 0.3157, + "epoch": 0.5797023101596569, + "grad_norm": 0.3356935679912567, + "learning_rate": 4.9038761132639304e-05, + "loss": 0.4424, "step": 16085 }, { - "epoch": 0.57, - "learning_rate": 4.909797986761753e-05, - "loss": 0.3145, + "epoch": 0.5798825098208815, + "grad_norm": 0.1919674128293991, + "learning_rate": 4.903795956659387e-05, + "loss": 0.4388, "step": 16090 }, { - "epoch": 0.57, - "learning_rate": 4.9097221395758e-05, - "loss": 0.3083, + "epoch": 0.5800627094821061, + "grad_norm": 0.18245883285999298, + "learning_rate": 4.9037157673035894e-05, + "loss": 0.452, "step": 16095 }, { - "epoch": 0.57, - "learning_rate": 4.909646261101193e-05, - "loss": 0.3221, + "epoch": 0.5802429091433308, + "grad_norm": 0.18774840235710144, + "learning_rate": 4.903635545197629e-05, + "loss": 0.4434, "step": 16100 }, { - "epoch": 0.57, - "learning_rate": 4.909570351338917e-05, - "loss": 0.3278, + "epoch": 0.5804231088045555, + "grad_norm": 0.1595427542924881, + "learning_rate": 4.9035552903426006e-05, + "loss": 0.4435, "step": 16105 }, { - "epoch": 0.57, - "learning_rate": 4.9094944102899585e-05, - "loss": 0.322, + "epoch": 0.5806033084657801, + "grad_norm": 0.1629880964756012, + "learning_rate": 4.903475002739596e-05, + "loss": 0.4895, "step": 16110 }, { - "epoch": 0.57, - "learning_rate": 4.909418437955303e-05, - "loss": 0.3157, + "epoch": 0.5807835081270047, + "grad_norm": 0.18076345324516296, + "learning_rate": 4.903394682389711e-05, + "loss": 0.4733, "step": 16115 }, { - "epoch": 0.57, - "learning_rate": 4.909342434335937e-05, - "loss": 0.3012, + "epoch": 0.5809637077882294, + "grad_norm": 0.16395287215709686, + "learning_rate": 4.9033143292940376e-05, + "loss": 0.4474, "step": 16120 }, { - "epoch": 0.57, - "learning_rate": 4.909266399432847e-05, - "loss": 0.3021, + "epoch": 0.581143907449454, + "grad_norm": 0.21613916754722595, + "learning_rate": 4.903233943453672e-05, + "loss": 0.4297, "step": 16125 }, { - "epoch": 0.57, - "learning_rate": 4.9091903332470215e-05, - "loss": 0.3079, + "epoch": 0.5813241071106786, + "grad_norm": 0.15141765773296356, + "learning_rate": 4.9031535248697095e-05, + "loss": 0.4635, "step": 16130 }, { - "epoch": 0.57, - "learning_rate": 4.909114235779446e-05, - "loss": 0.3236, + "epoch": 0.5815043067719032, + "grad_norm": 0.18841727077960968, + "learning_rate": 4.9030730735432453e-05, + "loss": 0.4198, "step": 16135 }, { - "epoch": 0.57, - "learning_rate": 4.909038107031111e-05, - "loss": 0.3187, + "epoch": 0.5816845064331279, + "grad_norm": 0.1524806171655655, + "learning_rate": 4.902992589475376e-05, + "loss": 0.431, "step": 16140 }, { - "epoch": 0.57, - "learning_rate": 4.9089619470030025e-05, - "loss": 0.3149, + "epoch": 0.5818647060943526, + "grad_norm": 0.21377435326576233, + "learning_rate": 4.9029120726671974e-05, + "loss": 0.4369, "step": 16145 }, { - "epoch": 0.57, - "learning_rate": 4.908885755696111e-05, - "loss": 0.2911, + "epoch": 0.5820449057555772, + "grad_norm": 0.17817193269729614, + "learning_rate": 4.902831523119808e-05, + "loss": 0.4394, "step": 16150 }, { - "epoch": 0.57, - "learning_rate": 4.908809533111427e-05, - "loss": 0.3365, + "epoch": 0.5822251054168018, + "grad_norm": 0.14097528159618378, + "learning_rate": 4.902750940834303e-05, + "loss": 
0.4648, "step": 16155 }, { - "epoch": 0.57, - "learning_rate": 4.908733279249937e-05, - "loss": 0.272, + "epoch": 0.5824053050780265, + "grad_norm": 0.1693866103887558, + "learning_rate": 4.9026703258117825e-05, + "loss": 0.4576, "step": 16160 }, { - "epoch": 0.57, - "learning_rate": 4.9086569941126335e-05, - "loss": 0.3254, + "epoch": 0.5825855047392511, + "grad_norm": 0.17576813697814941, + "learning_rate": 4.902589678053343e-05, + "loss": 0.4165, "step": 16165 }, { - "epoch": 0.57, - "learning_rate": 4.9085806777005054e-05, - "loss": 0.2899, + "epoch": 0.5827657044004757, + "grad_norm": 0.16277313232421875, + "learning_rate": 4.9025089975600855e-05, + "loss": 0.4695, "step": 16170 }, { - "epoch": 0.57, - "learning_rate": 4.9085043300145453e-05, - "loss": 0.3273, + "epoch": 0.5829459040617003, + "grad_norm": 0.17352886497974396, + "learning_rate": 4.902428284333107e-05, + "loss": 0.4715, "step": 16175 }, { - "epoch": 0.57, - "learning_rate": 4.908427951055744e-05, - "loss": 0.2825, + "epoch": 0.583126103722925, + "grad_norm": 0.16284751892089844, + "learning_rate": 4.902347538373509e-05, + "loss": 0.4683, "step": 16180 }, { - "epoch": 0.57, - "learning_rate": 4.908351540825093e-05, - "loss": 0.304, + "epoch": 0.5833063033841497, + "grad_norm": 0.17194764316082, + "learning_rate": 4.9022667596823904e-05, + "loss": 0.4208, "step": 16185 }, { - "epoch": 0.57, - "learning_rate": 4.9082750993235835e-05, - "loss": 0.294, + "epoch": 0.5834865030453743, + "grad_norm": 0.1685447096824646, + "learning_rate": 4.902185948260853e-05, + "loss": 0.4543, "step": 16190 }, { - "epoch": 0.57, - "learning_rate": 4.90819862655221e-05, - "loss": 0.3118, + "epoch": 0.5836667027065989, + "grad_norm": 0.18297097086906433, + "learning_rate": 4.9021051041099966e-05, + "loss": 0.4427, "step": 16195 }, { - "epoch": 0.57, - "learning_rate": 4.908122122511963e-05, - "loss": 0.338, + "epoch": 0.5838469023678236, + "grad_norm": 0.20284834504127502, + "learning_rate": 4.902024227230924e-05, + "loss": 0.4038, "step": 16200 }, { - "epoch": 0.57, - "learning_rate": 4.9080455872038385e-05, - "loss": 0.3199, + "epoch": 0.5840271020290482, + "grad_norm": 0.1463736742734909, + "learning_rate": 4.9019433176247353e-05, + "loss": 0.4688, "step": 16205 }, { - "epoch": 0.57, - "learning_rate": 4.9079690206288284e-05, - "loss": 0.3157, + "epoch": 0.5842073016902728, + "grad_norm": 0.14948871731758118, + "learning_rate": 4.901862375292534e-05, + "loss": 0.4649, "step": 16210 }, { - "epoch": 0.57, - "learning_rate": 4.9078924227879274e-05, - "loss": 0.2938, + "epoch": 0.5843875013514974, + "grad_norm": 0.16728514432907104, + "learning_rate": 4.901781400235423e-05, + "loss": 0.4369, "step": 16215 }, { - "epoch": 0.57, - "learning_rate": 4.907815793682131e-05, - "loss": 0.3026, + "epoch": 0.584567701012722, + "grad_norm": 0.1717401146888733, + "learning_rate": 4.901700392454506e-05, + "loss": 0.4683, "step": 16220 }, { - "epoch": 0.57, - "learning_rate": 4.9077391333124323e-05, - "loss": 0.2991, + "epoch": 0.5847479006739468, + "grad_norm": 0.18547184765338898, + "learning_rate": 4.9016193519508855e-05, + "loss": 0.4614, "step": 16225 }, { - "epoch": 0.57, - "learning_rate": 4.9076624416798285e-05, - "loss": 0.3261, + "epoch": 0.5849281003351714, + "grad_norm": 0.1905558854341507, + "learning_rate": 4.901538278725666e-05, + "loss": 0.4074, "step": 16230 }, { - "epoch": 0.57, - "learning_rate": 4.907585718785314e-05, - "loss": 0.3206, + "epoch": 0.585108299996396, + "grad_norm": 0.169610396027565, + "learning_rate": 4.901457172779953e-05, 
+ "loss": 0.4232, "step": 16235 }, { - "epoch": 0.57, - "learning_rate": 4.907508964629886e-05, - "loss": 0.3247, + "epoch": 0.5852884996576206, + "grad_norm": 0.14899376034736633, + "learning_rate": 4.9013760341148515e-05, + "loss": 0.4495, "step": 16240 }, { - "epoch": 0.57, - "learning_rate": 4.9074321792145405e-05, - "loss": 0.2934, + "epoch": 0.5854686993188453, + "grad_norm": 0.1737765222787857, + "learning_rate": 4.901294862731466e-05, + "loss": 0.4513, "step": 16245 }, { - "epoch": 0.57, - "learning_rate": 4.907355362540275e-05, - "loss": 0.3117, + "epoch": 0.5856488989800699, + "grad_norm": 0.18790043890476227, + "learning_rate": 4.901213658630902e-05, + "loss": 0.4812, "step": 16250 }, { - "epoch": 0.57, - "learning_rate": 4.907278514608086e-05, - "loss": 0.3057, + "epoch": 0.5858290986412945, + "grad_norm": 0.13040530681610107, + "learning_rate": 4.901132421814267e-05, + "loss": 0.4453, "step": 16255 }, { - "epoch": 0.57, - "learning_rate": 4.9072016354189725e-05, - "loss": 0.3288, + "epoch": 0.5860092983025192, + "grad_norm": 0.1362471878528595, + "learning_rate": 4.901051152282669e-05, + "loss": 0.433, "step": 16260 }, { - "epoch": 0.57, - "learning_rate": 4.9071247249739324e-05, - "loss": 0.3158, + "epoch": 0.5861894979637439, + "grad_norm": 0.1804387867450714, + "learning_rate": 4.9009698500372124e-05, + "loss": 0.4489, "step": 16265 }, { - "epoch": 0.57, - "learning_rate": 4.907047783273963e-05, - "loss": 0.3155, + "epoch": 0.5863696976249685, + "grad_norm": 0.1898290514945984, + "learning_rate": 4.9008885150790076e-05, + "loss": 0.4161, "step": 16270 }, { - "epoch": 0.57, - "learning_rate": 4.9069708103200655e-05, - "loss": 0.3297, + "epoch": 0.5865498972861931, + "grad_norm": 0.16169433295726776, + "learning_rate": 4.900807147409161e-05, + "loss": 0.4563, "step": 16275 }, { - "epoch": 0.57, - "learning_rate": 4.906893806113238e-05, - "loss": 0.3117, + "epoch": 0.5867300969474177, + "grad_norm": 0.13549362123012543, + "learning_rate": 4.900725747028782e-05, + "loss": 0.4627, "step": 16280 }, { - "epoch": 0.57, - "learning_rate": 4.906816770654481e-05, - "loss": 0.3229, + "epoch": 0.5869102966086424, + "grad_norm": 0.15982681512832642, + "learning_rate": 4.900644313938979e-05, + "loss": 0.4483, "step": 16285 }, { - "epoch": 0.57, - "learning_rate": 4.906739703944794e-05, - "loss": 0.2878, + "epoch": 0.587090496269867, + "grad_norm": 0.14519517123699188, + "learning_rate": 4.900562848140863e-05, + "loss": 0.4483, "step": 16290 }, { - "epoch": 0.57, - "learning_rate": 4.9066626059851775e-05, - "loss": 0.3256, + "epoch": 0.5872706959310916, + "grad_norm": 0.21161200106143951, + "learning_rate": 4.900481349635542e-05, + "loss": 0.4173, "step": 16295 }, { - "epoch": 0.57, - "learning_rate": 4.906585476776634e-05, - "loss": 0.31, + "epoch": 0.5874508955923163, + "grad_norm": 0.14361968636512756, + "learning_rate": 4.9003998184241275e-05, + "loss": 0.477, "step": 16300 }, { - "epoch": 0.57, - "learning_rate": 4.906508316320163e-05, - "loss": 0.3165, + "epoch": 0.587631095253541, + "grad_norm": 0.16247104108333588, + "learning_rate": 4.9003182545077305e-05, + "loss": 0.4436, "step": 16305 }, { - "epoch": 0.57, - "learning_rate": 4.9064311246167684e-05, - "loss": 0.3078, + "epoch": 0.5878112949147656, + "grad_norm": 0.21194829046726227, + "learning_rate": 4.9002366578874626e-05, + "loss": 0.4429, "step": 16310 }, { - "epoch": 0.57, - "learning_rate": 4.9063539016674517e-05, - "loss": 0.3042, + "epoch": 0.5879914945759902, + "grad_norm": 0.15204815566539764, + "learning_rate": 
4.9001550285644336e-05, + "loss": 0.4554, "step": 16315 }, { - "epoch": 0.57, - "learning_rate": 4.9062766474732146e-05, - "loss": 0.2994, + "epoch": 0.5881716942372148, + "grad_norm": 0.16609960794448853, + "learning_rate": 4.900073366539758e-05, + "loss": 0.4564, "step": 16320 }, { - "epoch": 0.57, - "learning_rate": 4.906199362035062e-05, - "loss": 0.293, + "epoch": 0.5883518938984394, + "grad_norm": 0.1630159169435501, + "learning_rate": 4.8999916718145475e-05, + "loss": 0.4304, "step": 16325 }, { - "epoch": 0.57, - "learning_rate": 4.9061220453539955e-05, - "loss": 0.3131, + "epoch": 0.5885320935596641, + "grad_norm": 0.1939852088689804, + "learning_rate": 4.899909944389914e-05, + "loss": 0.4236, "step": 16330 }, { - "epoch": 0.57, - "learning_rate": 4.9060446974310206e-05, - "loss": 0.3289, + "epoch": 0.5887122932208887, + "grad_norm": 0.19106720387935638, + "learning_rate": 4.8998281842669734e-05, + "loss": 0.4602, "step": 16335 }, { - "epoch": 0.57, - "learning_rate": 4.90596731826714e-05, - "loss": 0.2934, + "epoch": 0.5888924928821134, + "grad_norm": 0.19278432428836823, + "learning_rate": 4.8997463914468376e-05, + "loss": 0.4576, "step": 16340 }, { - "epoch": 0.58, - "learning_rate": 4.905889907863361e-05, - "loss": 0.302, + "epoch": 0.589072692543338, + "grad_norm": 0.20767638087272644, + "learning_rate": 4.899664565930623e-05, + "loss": 0.484, "step": 16345 }, { - "epoch": 0.58, - "learning_rate": 4.9058124662206864e-05, - "loss": 0.2956, + "epoch": 0.5892528922045627, + "grad_norm": 0.1724158674478531, + "learning_rate": 4.8995827077194425e-05, + "loss": 0.4261, "step": 16350 }, { - "epoch": 0.58, - "learning_rate": 4.9057349933401213e-05, - "loss": 0.3129, + "epoch": 0.5894330918657873, + "grad_norm": 0.1584085077047348, + "learning_rate": 4.899500816814412e-05, + "loss": 0.4627, "step": 16355 }, { - "epoch": 0.58, - "learning_rate": 4.905657489222674e-05, - "loss": 0.3028, + "epoch": 0.5896132915270119, + "grad_norm": 0.159664586186409, + "learning_rate": 4.8994188932166473e-05, + "loss": 0.4483, "step": 16360 }, { - "epoch": 0.58, - "learning_rate": 4.905579953869349e-05, - "loss": 0.2844, + "epoch": 0.5897934911882365, + "grad_norm": 0.14001238346099854, + "learning_rate": 4.8993369369272646e-05, + "loss": 0.4758, "step": 16365 }, { - "epoch": 0.58, - "learning_rate": 4.905502387281154e-05, - "loss": 0.2775, + "epoch": 0.5899736908494612, + "grad_norm": 0.1366659700870514, + "learning_rate": 4.899254947947382e-05, + "loss": 0.441, "step": 16370 }, { - "epoch": 0.58, - "learning_rate": 4.905424789459096e-05, - "loss": 0.3449, + "epoch": 0.5901538905106858, + "grad_norm": 0.13994751870632172, + "learning_rate": 4.899172926278113e-05, + "loss": 0.3618, "step": 16375 }, { - "epoch": 0.58, - "learning_rate": 4.9053471604041815e-05, - "loss": 0.3173, + "epoch": 0.5903340901719105, + "grad_norm": 0.18011939525604248, + "learning_rate": 4.899090871920579e-05, + "loss": 0.444, "step": 16380 }, { - "epoch": 0.58, - "learning_rate": 4.9052695001174196e-05, - "loss": 0.286, + "epoch": 0.5905142898331351, + "grad_norm": 0.18300089240074158, + "learning_rate": 4.899008784875896e-05, + "loss": 0.4418, "step": 16385 }, { - "epoch": 0.58, - "learning_rate": 4.9051918085998184e-05, - "loss": 0.3029, + "epoch": 0.5906944894943598, + "grad_norm": 0.16376063227653503, + "learning_rate": 4.898926665145183e-05, + "loss": 0.4708, "step": 16390 }, { - "epoch": 0.58, - "learning_rate": 4.905114085852387e-05, - "loss": 0.2968, + "epoch": 0.5908746891555844, + "grad_norm": 0.20874595642089844, + 
"learning_rate": 4.898844512729558e-05, + "loss": 0.4593, "step": 16395 }, { - "epoch": 0.58, - "learning_rate": 4.905036331876134e-05, - "loss": 0.3236, + "epoch": 0.591054888816809, + "grad_norm": 0.1489773839712143, + "learning_rate": 4.898762327630142e-05, + "loss": 0.4314, "step": 16400 }, { - "epoch": 0.58, - "learning_rate": 4.9049585466720695e-05, - "loss": 0.2975, + "epoch": 0.5912350884780336, + "grad_norm": 0.1538877636194229, + "learning_rate": 4.898680109848053e-05, + "loss": 0.4327, "step": 16405 }, { - "epoch": 0.58, - "learning_rate": 4.904880730241202e-05, - "loss": 0.3245, + "epoch": 0.5914152881392583, + "grad_norm": 0.16070400178432465, + "learning_rate": 4.8985978593844115e-05, + "loss": 0.4837, "step": 16410 }, { - "epoch": 0.58, - "learning_rate": 4.904802882584545e-05, - "loss": 0.3016, + "epoch": 0.591595487800483, + "grad_norm": 0.15595030784606934, + "learning_rate": 4.898515576240339e-05, + "loss": 0.4543, "step": 16415 }, { - "epoch": 0.58, - "learning_rate": 4.9047250037031064e-05, - "loss": 0.3133, + "epoch": 0.5917756874617076, + "grad_norm": 0.1852567046880722, + "learning_rate": 4.898433260416956e-05, + "loss": 0.4185, "step": 16420 }, { - "epoch": 0.58, - "learning_rate": 4.9046470935978986e-05, - "loss": 0.3239, + "epoch": 0.5919558871229322, + "grad_norm": 0.18064150214195251, + "learning_rate": 4.898350911915385e-05, + "loss": 0.4524, "step": 16425 }, { - "epoch": 0.58, - "learning_rate": 4.904569152269932e-05, - "loss": 0.3015, + "epoch": 0.5921360867841569, + "grad_norm": 0.1406656950712204, + "learning_rate": 4.898268530736746e-05, + "loss": 0.4357, "step": 16430 }, { - "epoch": 0.58, - "learning_rate": 4.904491179720221e-05, - "loss": 0.3449, + "epoch": 0.5923162864453815, + "grad_norm": 0.1857532560825348, + "learning_rate": 4.898186116882163e-05, + "loss": 0.4287, "step": 16435 }, { - "epoch": 0.58, - "learning_rate": 4.9044131759497755e-05, - "loss": 0.2949, + "epoch": 0.5924964861066061, + "grad_norm": 0.2215130776166916, + "learning_rate": 4.8981036703527584e-05, + "loss": 0.4508, "step": 16440 }, { - "epoch": 0.58, - "learning_rate": 4.904335140959611e-05, - "loss": 0.2984, + "epoch": 0.5926766857678307, + "grad_norm": 0.16718199849128723, + "learning_rate": 4.898021191149655e-05, + "loss": 0.4264, "step": 16445 }, { - "epoch": 0.58, - "learning_rate": 4.904257074750738e-05, - "loss": 0.3246, + "epoch": 0.5928568854290553, + "grad_norm": 0.16508464515209198, + "learning_rate": 4.897938679273979e-05, + "loss": 0.4389, "step": 16450 }, { - "epoch": 0.58, - "learning_rate": 4.904178977324171e-05, - "loss": 0.3193, + "epoch": 0.5930370850902801, + "grad_norm": 0.16538691520690918, + "learning_rate": 4.897856134726851e-05, + "loss": 0.4461, "step": 16455 }, { - "epoch": 0.58, - "learning_rate": 4.904100848680925e-05, - "loss": 0.3002, + "epoch": 0.5932172847515047, + "grad_norm": 0.16483493149280548, + "learning_rate": 4.897773557509398e-05, + "loss": 0.4803, "step": 16460 }, { - "epoch": 0.58, - "learning_rate": 4.9040226888220144e-05, - "loss": 0.2971, + "epoch": 0.5933974844127293, + "grad_norm": 0.17354772984981537, + "learning_rate": 4.897690947622745e-05, + "loss": 0.4594, "step": 16465 }, { - "epoch": 0.58, - "learning_rate": 4.903944497748453e-05, - "loss": 0.3081, + "epoch": 0.593577684073954, + "grad_norm": 0.18836447596549988, + "learning_rate": 4.8976083050680164e-05, + "loss": 0.4594, "step": 16470 }, { - "epoch": 0.58, - "learning_rate": 4.9038662754612564e-05, - "loss": 0.3071, + "epoch": 0.5937578837351786, + "grad_norm": 
0.17023785412311554, + "learning_rate": 4.897525629846339e-05, + "loss": 0.4418, "step": 16475 }, { - "epoch": 0.58, - "learning_rate": 4.9037880219614405e-05, - "loss": 0.3106, + "epoch": 0.5939380833964032, + "grad_norm": 0.13566334545612335, + "learning_rate": 4.897442921958839e-05, + "loss": 0.4338, "step": 16480 }, { - "epoch": 0.58, - "learning_rate": 4.903709737250021e-05, - "loss": 0.3069, + "epoch": 0.5941182830576278, + "grad_norm": 0.18862979114055634, + "learning_rate": 4.8973601814066436e-05, + "loss": 0.4653, "step": 16485 }, { - "epoch": 0.58, - "learning_rate": 4.9036314213280153e-05, - "loss": 0.3297, + "epoch": 0.5942984827188524, + "grad_norm": 0.16662077605724335, + "learning_rate": 4.897277408190879e-05, + "loss": 0.4596, "step": 16490 }, { - "epoch": 0.58, - "learning_rate": 4.903553074196439e-05, - "loss": 0.3066, + "epoch": 0.5944786823800772, + "grad_norm": 0.20535314083099365, + "learning_rate": 4.897194602312675e-05, + "loss": 0.4829, "step": 16495 }, { - "epoch": 0.58, - "learning_rate": 4.90347469585631e-05, - "loss": 0.2976, + "epoch": 0.5946588820413018, + "grad_norm": 0.16183775663375854, + "learning_rate": 4.8971117637731576e-05, + "loss": 0.4483, "step": 16500 }, { - "epoch": 0.58, - "eval_loss": 0.30791518092155457, - "eval_runtime": 10.5594, - "eval_samples_per_second": 9.47, - "eval_steps_per_second": 9.47, + "epoch": 0.5946588820413018, + "eval_loss": 0.4669727683067322, + "eval_runtime": 3.6173, + "eval_samples_per_second": 27.645, + "eval_steps_per_second": 6.911, "step": 16500 }, { - "epoch": 0.58, - "learning_rate": 4.9033962863086466e-05, - "loss": 0.304, + "epoch": 0.5948390817025264, + "grad_norm": 0.18323639035224915, + "learning_rate": 4.8970288925734575e-05, + "loss": 0.4502, "step": 16505 }, { - "epoch": 0.58, - "learning_rate": 4.903317845554466e-05, - "loss": 0.3337, + "epoch": 0.595019281363751, + "grad_norm": 0.1653471291065216, + "learning_rate": 4.8969459887147025e-05, + "loss": 0.4789, "step": 16510 }, { - "epoch": 0.58, - "learning_rate": 4.903239373594788e-05, - "loss": 0.3248, + "epoch": 0.5951994810249757, + "grad_norm": 0.14966969192028046, + "learning_rate": 4.896863052198022e-05, + "loss": 0.4468, "step": 16515 }, { - "epoch": 0.58, - "learning_rate": 4.90316087043063e-05, - "loss": 0.2933, + "epoch": 0.5953796806862003, + "grad_norm": 0.16616958379745483, + "learning_rate": 4.896780083024547e-05, + "loss": 0.49, "step": 16520 }, { - "epoch": 0.58, - "learning_rate": 4.9030823360630106e-05, - "loss": 0.3117, + "epoch": 0.5955598803474249, + "grad_norm": 0.17226427793502808, + "learning_rate": 4.896697081195407e-05, + "loss": 0.4843, "step": 16525 }, { - "epoch": 0.58, - "learning_rate": 4.903003770492952e-05, - "loss": 0.3175, + "epoch": 0.5957400800086495, + "grad_norm": 0.21469104290008545, + "learning_rate": 4.896614046711734e-05, + "loss": 0.4526, "step": 16530 }, { - "epoch": 0.58, - "learning_rate": 4.902925173721473e-05, - "loss": 0.3121, + "epoch": 0.5959202796698743, + "grad_norm": 0.17571860551834106, + "learning_rate": 4.896530979574658e-05, + "loss": 0.4803, "step": 16535 }, { - "epoch": 0.58, - "learning_rate": 4.9028465457495936e-05, - "loss": 0.2972, + "epoch": 0.5961004793310989, + "grad_norm": 0.16215255856513977, + "learning_rate": 4.896447879785311e-05, + "loss": 0.4276, "step": 16540 }, { - "epoch": 0.58, - "learning_rate": 4.9027678865783354e-05, - "loss": 0.3076, + "epoch": 0.5962806789923235, + "grad_norm": 0.17751629650592804, + "learning_rate": 4.896364747344827e-05, + "loss": 0.4628, "step": 16545 }, { 
- "epoch": 0.58, - "learning_rate": 4.90268919620872e-05, - "loss": 0.3243, + "epoch": 0.5964608786535481, + "grad_norm": 0.1444987803697586, + "learning_rate": 4.896281582254336e-05, + "loss": 0.4643, "step": 16550 }, { - "epoch": 0.58, - "learning_rate": 4.902610474641769e-05, - "loss": 0.3134, + "epoch": 0.5966410783147728, + "grad_norm": 0.17875704169273376, + "learning_rate": 4.8961983845149725e-05, + "loss": 0.4847, "step": 16555 }, { - "epoch": 0.58, - "learning_rate": 4.902531721878504e-05, - "loss": 0.3299, + "epoch": 0.5968212779759974, + "grad_norm": 0.20280811190605164, + "learning_rate": 4.8961151541278706e-05, + "loss": 0.4574, "step": 16560 }, { - "epoch": 0.58, - "learning_rate": 4.902452937919948e-05, - "loss": 0.3056, + "epoch": 0.597001477637222, + "grad_norm": 0.1767544448375702, + "learning_rate": 4.8960318910941626e-05, + "loss": 0.4164, "step": 16565 }, { - "epoch": 0.58, - "learning_rate": 4.9023741227671236e-05, - "loss": 0.336, + "epoch": 0.5971816772984467, + "grad_norm": 0.17916817963123322, + "learning_rate": 4.8959485954149855e-05, + "loss": 0.4439, "step": 16570 }, { - "epoch": 0.58, - "learning_rate": 4.902295276421055e-05, - "loss": 0.3167, + "epoch": 0.5973618769596714, + "grad_norm": 0.16673676669597626, + "learning_rate": 4.895865267091471e-05, + "loss": 0.4553, "step": 16575 }, { - "epoch": 0.58, - "learning_rate": 4.902216398882766e-05, - "loss": 0.29, + "epoch": 0.597542076620896, + "grad_norm": 0.1650109440088272, + "learning_rate": 4.895781906124757e-05, + "loss": 0.4454, "step": 16580 }, { - "epoch": 0.58, - "learning_rate": 4.902137490153279e-05, - "loss": 0.3181, + "epoch": 0.5977222762821206, + "grad_norm": 0.1822521984577179, + "learning_rate": 4.895698512515978e-05, + "loss": 0.4683, "step": 16585 }, { - "epoch": 0.58, - "learning_rate": 4.9020585502336205e-05, - "loss": 0.2852, + "epoch": 0.5979024759433452, + "grad_norm": 0.1935487687587738, + "learning_rate": 4.895615086266271e-05, + "loss": 0.4437, "step": 16590 }, { - "epoch": 0.58, - "learning_rate": 4.9019795791248144e-05, - "loss": 0.3185, + "epoch": 0.5980826756045698, + "grad_norm": 0.15225858986377716, + "learning_rate": 4.895531627376772e-05, + "loss": 0.4567, "step": 16595 }, { - "epoch": 0.58, - "learning_rate": 4.901900576827886e-05, - "loss": 0.3138, + "epoch": 0.5982628752657945, + "grad_norm": 0.1286819726228714, + "learning_rate": 4.8954481358486185e-05, + "loss": 0.4368, "step": 16600 }, { - "epoch": 0.58, - "learning_rate": 4.9018215433438616e-05, - "loss": 0.2948, + "epoch": 0.5984430749270191, + "grad_norm": 0.1508607119321823, + "learning_rate": 4.8953646116829477e-05, + "loss": 0.4622, "step": 16605 }, { - "epoch": 0.58, - "learning_rate": 4.9017424786737674e-05, - "loss": 0.3113, + "epoch": 0.5986232745882438, + "grad_norm": 0.1493672877550125, + "learning_rate": 4.895281054880898e-05, + "loss": 0.4192, "step": 16610 }, { - "epoch": 0.58, - "learning_rate": 4.9016633828186306e-05, - "loss": 0.2955, + "epoch": 0.5988034742494684, + "grad_norm": 0.17611226439476013, + "learning_rate": 4.895197465443608e-05, + "loss": 0.4706, "step": 16615 }, { - "epoch": 0.58, - "learning_rate": 4.901584255779477e-05, - "loss": 0.3217, + "epoch": 0.5989836739106931, + "grad_norm": 0.1974530965089798, + "learning_rate": 4.8951138433722154e-05, + "loss": 0.4066, "step": 16620 }, { - "epoch": 0.58, - "learning_rate": 4.9015209316963526e-05, - "loss": 0.3111, + "epoch": 0.5991638735719177, + "grad_norm": 0.14464324712753296, + "learning_rate": 4.895030188667861e-05, + "loss": 0.4478, 
"step": 16625 }, { - "epoch": 0.59, - "learning_rate": 4.9014417485285595e-05, - "loss": 0.2994, + "epoch": 0.5993440732331423, + "grad_norm": 0.13125242292881012, + "learning_rate": 4.894946501331684e-05, + "loss": 0.4564, "step": 16630 }, { - "epoch": 0.59, - "learning_rate": 4.9013625341796276e-05, - "loss": 0.3065, + "epoch": 0.5995242728943669, + "grad_norm": 0.20426687598228455, + "learning_rate": 4.894862781364824e-05, + "loss": 0.4608, "step": 16635 }, { - "epoch": 0.59, - "learning_rate": 4.9012832886505855e-05, - "loss": 0.3198, + "epoch": 0.5997044725555916, + "grad_norm": 0.17692770063877106, + "learning_rate": 4.894779028768423e-05, + "loss": 0.469, "step": 16640 }, { - "epoch": 0.59, - "learning_rate": 4.901204011942463e-05, - "loss": 0.3359, + "epoch": 0.5998846722168162, + "grad_norm": 0.18528582155704498, + "learning_rate": 4.894695243543621e-05, + "loss": 0.4498, "step": 16645 }, { - "epoch": 0.59, - "learning_rate": 4.901124704056289e-05, - "loss": 0.3179, + "epoch": 0.6000648718780409, + "grad_norm": 0.19530199468135834, + "learning_rate": 4.8946114256915597e-05, + "loss": 0.4055, "step": 16650 }, { - "epoch": 0.59, - "learning_rate": 4.901045364993092e-05, - "loss": 0.2868, + "epoch": 0.6002450715392655, + "grad_norm": 0.1795613169670105, + "learning_rate": 4.894527575213382e-05, + "loss": 0.4876, "step": 16655 }, { - "epoch": 0.59, - "learning_rate": 4.900965994753904e-05, - "loss": 0.3325, + "epoch": 0.6004252712004902, + "grad_norm": 0.16493387520313263, + "learning_rate": 4.8944436921102296e-05, + "loss": 0.4382, "step": 16660 }, { - "epoch": 0.59, - "learning_rate": 4.9008865933397547e-05, - "loss": 0.3052, + "epoch": 0.6006054708617148, + "grad_norm": 0.1708437204360962, + "learning_rate": 4.894359776383245e-05, + "loss": 0.4134, "step": 16665 }, { - "epoch": 0.59, - "learning_rate": 4.9008071607516746e-05, - "loss": 0.2971, + "epoch": 0.6007856705229394, + "grad_norm": 0.14613789319992065, + "learning_rate": 4.894275828033572e-05, + "loss": 0.469, "step": 16670 }, { - "epoch": 0.59, - "learning_rate": 4.900727696990697e-05, - "loss": 0.3172, + "epoch": 0.600965870184164, + "grad_norm": 0.15820561349391937, + "learning_rate": 4.894191847062355e-05, + "loss": 0.435, "step": 16675 }, { - "epoch": 0.59, - "learning_rate": 4.900648202057851e-05, - "loss": 0.3044, + "epoch": 0.6011460698453887, + "grad_norm": 0.16094128787517548, + "learning_rate": 4.894107833470737e-05, + "loss": 0.4457, "step": 16680 }, { - "epoch": 0.59, - "learning_rate": 4.9005686759541714e-05, - "loss": 0.3029, + "epoch": 0.6013262695066133, + "grad_norm": 0.21351011097431183, + "learning_rate": 4.8940237872598635e-05, + "loss": 0.4868, "step": 16685 }, { - "epoch": 0.59, - "learning_rate": 4.9004891186806894e-05, - "loss": 0.3203, + "epoch": 0.601506469167838, + "grad_norm": 0.1446036994457245, + "learning_rate": 4.8939397084308794e-05, + "loss": 0.4357, "step": 16690 }, { - "epoch": 0.59, - "learning_rate": 4.900409530238438e-05, - "loss": 0.301, + "epoch": 0.6016866688290626, + "grad_norm": 0.16345882415771484, + "learning_rate": 4.8938555969849306e-05, + "loss": 0.4722, "step": 16695 }, { - "epoch": 0.59, - "learning_rate": 4.9003299106284504e-05, - "loss": 0.3246, + "epoch": 0.6018668684902873, + "grad_norm": 0.1572277694940567, + "learning_rate": 4.893771452923162e-05, + "loss": 0.4265, "step": 16700 }, { - "epoch": 0.59, - "learning_rate": 4.900250259851762e-05, - "loss": 0.3039, + "epoch": 0.6020470681515119, + "grad_norm": 0.188238263130188, + "learning_rate": 4.893687276246721e-05, + 
"loss": 0.4051, "step": 16705 }, { - "epoch": 0.59, - "learning_rate": 4.900170577909404e-05, - "loss": 0.2734, + "epoch": 0.6022272678127365, + "grad_norm": 0.15958626568317413, + "learning_rate": 4.893603066956755e-05, + "loss": 0.4454, "step": 16710 }, { - "epoch": 0.59, - "learning_rate": 4.900090864802414e-05, - "loss": 0.2907, + "epoch": 0.6024074674739611, + "grad_norm": 0.18399354815483093, + "learning_rate": 4.89351882505441e-05, + "loss": 0.4459, "step": 16715 }, { - "epoch": 0.59, - "learning_rate": 4.9000111205318254e-05, - "loss": 0.3238, + "epoch": 0.6025876671351857, + "grad_norm": 0.16669590771198273, + "learning_rate": 4.8934345505408344e-05, + "loss": 0.4402, "step": 16720 }, { - "epoch": 0.59, - "learning_rate": 4.899931345098674e-05, - "loss": 0.3159, + "epoch": 0.6027678667964104, + "grad_norm": 0.16864614188671112, + "learning_rate": 4.893350243417177e-05, + "loss": 0.4374, "step": 16725 }, { - "epoch": 0.59, - "learning_rate": 4.899851538503996e-05, - "loss": 0.3073, + "epoch": 0.6029480664576351, + "grad_norm": 0.1995852291584015, + "learning_rate": 4.893265903684585e-05, + "loss": 0.4212, "step": 16730 }, { - "epoch": 0.59, - "learning_rate": 4.899771700748827e-05, - "loss": 0.3185, + "epoch": 0.6031282661188597, + "grad_norm": 0.17713283002376556, + "learning_rate": 4.8931815313442095e-05, + "loss": 0.4731, "step": 16735 }, { - "epoch": 0.59, - "learning_rate": 4.899691831834205e-05, - "loss": 0.2971, + "epoch": 0.6033084657800843, + "grad_norm": 0.16761018335819244, + "learning_rate": 4.893097126397198e-05, + "loss": 0.4195, "step": 16740 }, { - "epoch": 0.59, - "learning_rate": 4.899611931761164e-05, - "loss": 0.31, + "epoch": 0.603488665441309, + "grad_norm": 0.14120902121067047, + "learning_rate": 4.893012688844702e-05, + "loss": 0.4436, "step": 16745 }, { - "epoch": 0.59, - "learning_rate": 4.899532000530746e-05, - "loss": 0.3042, + "epoch": 0.6036688651025336, + "grad_norm": 0.1854945719242096, + "learning_rate": 4.8929282186878714e-05, + "loss": 0.4699, "step": 16750 }, { - "epoch": 0.59, - "learning_rate": 4.899452038143985e-05, - "loss": 0.3021, + "epoch": 0.6038490647637582, + "grad_norm": 0.15078695118427277, + "learning_rate": 4.892843715927857e-05, + "loss": 0.4322, "step": 16755 }, { - "epoch": 0.59, - "learning_rate": 4.89937204460192e-05, - "loss": 0.3235, + "epoch": 0.6040292644249828, + "grad_norm": 0.17069613933563232, + "learning_rate": 4.89275918056581e-05, + "loss": 0.4183, "step": 16760 }, { - "epoch": 0.59, - "learning_rate": 4.899292019905591e-05, - "loss": 0.2933, + "epoch": 0.6042094640862076, + "grad_norm": 0.1491325944662094, + "learning_rate": 4.892674612602882e-05, + "loss": 0.4358, "step": 16765 }, { - "epoch": 0.59, - "learning_rate": 4.899211964056036e-05, - "loss": 0.3161, + "epoch": 0.6043896637474322, + "grad_norm": 0.17146345973014832, + "learning_rate": 4.892590012040227e-05, + "loss": 0.4274, "step": 16770 }, { - "epoch": 0.59, - "learning_rate": 4.899131877054295e-05, - "loss": 0.3139, + "epoch": 0.6045698634086568, + "grad_norm": 0.1850728988647461, + "learning_rate": 4.8925053788789954e-05, + "loss": 0.4698, "step": 16775 }, { - "epoch": 0.59, - "learning_rate": 4.899051758901407e-05, - "loss": 0.3109, + "epoch": 0.6047500630698814, + "grad_norm": 0.1922415941953659, + "learning_rate": 4.892420713120341e-05, + "loss": 0.4253, "step": 16780 }, { - "epoch": 0.59, - "learning_rate": 4.898971609598414e-05, - "loss": 0.3411, + "epoch": 0.6049302627311061, + "grad_norm": 0.1879766434431076, + "learning_rate": 
4.8923360147654184e-05, + "loss": 0.4515, "step": 16785 }, { - "epoch": 0.59, - "learning_rate": 4.898891429146355e-05, - "loss": 0.3293, + "epoch": 0.6051104623923307, + "grad_norm": 0.19672177731990814, + "learning_rate": 4.89225128381538e-05, + "loss": 0.4909, "step": 16790 }, { - "epoch": 0.59, - "learning_rate": 4.898811217546272e-05, - "loss": 0.327, + "epoch": 0.6052906620535553, + "grad_norm": 0.16113631427288055, + "learning_rate": 4.8921665202713816e-05, + "loss": 0.4287, "step": 16795 }, { - "epoch": 0.59, - "learning_rate": 4.8987309747992064e-05, - "loss": 0.3188, + "epoch": 0.6054708617147799, + "grad_norm": 0.1752663552761078, + "learning_rate": 4.8920817241345776e-05, + "loss": 0.4615, "step": 16800 }, { - "epoch": 0.59, - "learning_rate": 4.8986507009061994e-05, - "loss": 0.2961, + "epoch": 0.6056510613760047, + "grad_norm": 0.15758828818798065, + "learning_rate": 4.8919968954061223e-05, + "loss": 0.4314, "step": 16805 }, { - "epoch": 0.59, - "learning_rate": 4.898570395868294e-05, - "loss": 0.3104, + "epoch": 0.6058312610372293, + "grad_norm": 0.22554457187652588, + "learning_rate": 4.891912034087173e-05, + "loss": 0.48, "step": 16810 }, { - "epoch": 0.59, - "learning_rate": 4.898490059686532e-05, - "loss": 0.3094, + "epoch": 0.6060114606984539, + "grad_norm": 0.16206426918506622, + "learning_rate": 4.8918271401788855e-05, + "loss": 0.4626, "step": 16815 }, { - "epoch": 0.59, - "learning_rate": 4.8984096923619586e-05, - "loss": 0.3213, + "epoch": 0.6061916603596785, + "grad_norm": 0.16757212579250336, + "learning_rate": 4.8917422136824167e-05, + "loss": 0.4692, "step": 16820 }, { - "epoch": 0.59, - "learning_rate": 4.898329293895616e-05, - "loss": 0.3141, + "epoch": 0.6063718600209032, + "grad_norm": 0.19474922120571136, + "learning_rate": 4.891657254598922e-05, + "loss": 0.4177, "step": 16825 }, { - "epoch": 0.59, - "learning_rate": 4.898248864288547e-05, - "loss": 0.3198, + "epoch": 0.6065520596821278, + "grad_norm": 0.15436606109142303, + "learning_rate": 4.891572262929561e-05, + "loss": 0.4621, "step": 16830 }, { - "epoch": 0.59, - "learning_rate": 4.898168403541797e-05, - "loss": 0.3185, + "epoch": 0.6067322593433524, + "grad_norm": 0.15269695222377777, + "learning_rate": 4.89148723867549e-05, + "loss": 0.4116, "step": 16835 }, { - "epoch": 0.59, - "learning_rate": 4.8980879116564114e-05, - "loss": 0.3084, + "epoch": 0.606912459004577, + "grad_norm": 0.20142877101898193, + "learning_rate": 4.89140218183787e-05, + "loss": 0.4751, "step": 16840 }, { - "epoch": 0.59, - "learning_rate": 4.898007388633434e-05, - "loss": 0.3077, + "epoch": 0.6070926586658018, + "grad_norm": 0.14995922148227692, + "learning_rate": 4.8913170924178576e-05, + "loss": 0.3919, "step": 16845 }, { - "epoch": 0.59, - "learning_rate": 4.897926834473912e-05, - "loss": 0.3165, + "epoch": 0.6072728583270264, + "grad_norm": 0.179569274187088, + "learning_rate": 4.8912319704166124e-05, + "loss": 0.4542, "step": 16850 }, { - "epoch": 0.59, - "learning_rate": 4.89784624917889e-05, - "loss": 0.3233, + "epoch": 0.607453057988251, + "grad_norm": 0.15274690091609955, + "learning_rate": 4.891146815835295e-05, + "loss": 0.4364, "step": 16855 }, { - "epoch": 0.59, - "learning_rate": 4.8977656327494145e-05, - "loss": 0.3296, + "epoch": 0.6076332576494756, + "grad_norm": 0.13948167860507965, + "learning_rate": 4.8910616286750654e-05, + "loss": 0.4454, "step": 16860 }, { - "epoch": 0.59, - "learning_rate": 4.897684985186533e-05, - "loss": 0.2964, + "epoch": 0.6078134573107002, + "grad_norm": 0.18718601763248444, + 
"learning_rate": 4.890976408937084e-05, + "loss": 0.4327, "step": 16865 }, { - "epoch": 0.59, - "learning_rate": 4.8976043064912916e-05, - "loss": 0.3053, + "epoch": 0.6079936569719249, + "grad_norm": 0.2123367339372635, + "learning_rate": 4.890891156622511e-05, + "loss": 0.419, "step": 16870 }, { - "epoch": 0.59, - "learning_rate": 4.8975235966647376e-05, - "loss": 0.2974, + "epoch": 0.6081738566331495, + "grad_norm": 0.18205073475837708, + "learning_rate": 4.8908058717325097e-05, + "loss": 0.4819, "step": 16875 }, { - "epoch": 0.59, - "learning_rate": 4.8974428557079204e-05, - "loss": 0.3186, + "epoch": 0.6083540562943741, + "grad_norm": 0.1736924946308136, + "learning_rate": 4.8907205542682414e-05, + "loss": 0.4596, "step": 16880 }, { - "epoch": 0.59, - "learning_rate": 4.897362083621888e-05, - "loss": 0.294, + "epoch": 0.6085342559555988, + "grad_norm": 0.18796013295650482, + "learning_rate": 4.890635204230868e-05, + "loss": 0.4273, "step": 16885 }, { - "epoch": 0.59, - "learning_rate": 4.8972812804076884e-05, - "loss": 0.3269, + "epoch": 0.6087144556168235, + "grad_norm": 0.19030854105949402, + "learning_rate": 4.890549821621553e-05, + "loss": 0.4642, "step": 16890 }, { - "epoch": 0.59, - "learning_rate": 4.897200446066371e-05, - "loss": 0.3195, + "epoch": 0.6088946552780481, + "grad_norm": 0.16654013097286224, + "learning_rate": 4.8904644064414585e-05, + "loss": 0.4615, "step": 16895 }, { - "epoch": 0.59, - "learning_rate": 4.897119580598987e-05, - "loss": 0.3059, + "epoch": 0.6090748549392727, + "grad_norm": 0.20467707514762878, + "learning_rate": 4.8903789586917505e-05, + "loss": 0.4292, "step": 16900 }, { - "epoch": 0.59, - "learning_rate": 4.897038684006583e-05, - "loss": 0.2914, + "epoch": 0.6092550546004973, + "grad_norm": 0.14566195011138916, + "learning_rate": 4.890293478373592e-05, + "loss": 0.483, "step": 16905 }, { - "epoch": 0.59, - "learning_rate": 4.896957756290213e-05, - "loss": 0.3063, + "epoch": 0.609435254261722, + "grad_norm": 0.19296708703041077, + "learning_rate": 4.8902079654881466e-05, + "loss": 0.4536, "step": 16910 }, { - "epoch": 0.6, - "learning_rate": 4.896876797450926e-05, - "loss": 0.3383, + "epoch": 0.6096154539229466, + "grad_norm": 0.15972061455249786, + "learning_rate": 4.890122420036581e-05, + "loss": 0.4392, "step": 16915 }, { - "epoch": 0.6, - "learning_rate": 4.8967958074897724e-05, - "loss": 0.3196, + "epoch": 0.6097956535841713, + "grad_norm": 0.18985244631767273, + "learning_rate": 4.890036842020061e-05, + "loss": 0.4542, "step": 16920 }, { - "epoch": 0.6, - "learning_rate": 4.896714786407806e-05, - "loss": 0.3065, + "epoch": 0.6099758532453959, + "grad_norm": 0.1495533585548401, + "learning_rate": 4.88995123143975e-05, + "loss": 0.4318, "step": 16925 }, { - "epoch": 0.6, - "learning_rate": 4.8966337342060765e-05, - "loss": 0.3096, + "epoch": 0.6101560529066206, + "grad_norm": 0.21450436115264893, + "learning_rate": 4.8898655882968175e-05, + "loss": 0.4514, "step": 16930 }, { - "epoch": 0.6, - "learning_rate": 4.8965526508856387e-05, - "loss": 0.2989, + "epoch": 0.6103362525678452, + "grad_norm": 0.2090601772069931, + "learning_rate": 4.889779912592429e-05, + "loss": 0.4552, "step": 16935 }, { - "epoch": 0.6, - "learning_rate": 4.896471536447543e-05, - "loss": 0.3001, + "epoch": 0.6105164522290698, + "grad_norm": 0.1768369823694229, + "learning_rate": 4.889694204327751e-05, + "loss": 0.4526, "step": 16940 }, { - "epoch": 0.6, - "learning_rate": 4.896390390892844e-05, - "loss": 0.3072, + "epoch": 0.6106966518902944, + "grad_norm": 
0.16384200751781464, + "learning_rate": 4.889608463503953e-05, + "loss": 0.4559, "step": 16945 }, { - "epoch": 0.6, - "learning_rate": 4.896309214222595e-05, - "loss": 0.2968, + "epoch": 0.610876851551519, + "grad_norm": 0.14642253518104553, + "learning_rate": 4.8895226901222026e-05, + "loss": 0.4559, "step": 16950 }, { - "epoch": 0.6, - "learning_rate": 4.8962280064378504e-05, - "loss": 0.2986, + "epoch": 0.6110570512127437, + "grad_norm": 0.14469105005264282, + "learning_rate": 4.889436884183667e-05, + "loss": 0.4116, "step": 16955 }, { - "epoch": 0.6, - "learning_rate": 4.896146767539664e-05, - "loss": 0.2951, + "epoch": 0.6112372508739684, + "grad_norm": 0.15812519192695618, + "learning_rate": 4.889351045689518e-05, + "loss": 0.4722, "step": 16960 }, { - "epoch": 0.6, - "learning_rate": 4.896065497529091e-05, - "loss": 0.3021, + "epoch": 0.611417450535193, + "grad_norm": 0.18630902469158173, + "learning_rate": 4.889265174640922e-05, + "loss": 0.4555, "step": 16965 }, { - "epoch": 0.6, - "learning_rate": 4.8959841964071864e-05, - "loss": 0.3164, + "epoch": 0.6115976501964177, + "grad_norm": 0.1799592524766922, + "learning_rate": 4.889179271039052e-05, + "loss": 0.4385, "step": 16970 }, { - "epoch": 0.6, - "learning_rate": 4.895902864175006e-05, - "loss": 0.329, + "epoch": 0.6117778498576423, + "grad_norm": 0.17888915538787842, + "learning_rate": 4.8890933348850757e-05, + "loss": 0.4545, "step": 16975 }, { - "epoch": 0.6, - "learning_rate": 4.895821500833606e-05, - "loss": 0.3042, + "epoch": 0.6119580495188669, + "grad_norm": 0.16347283124923706, + "learning_rate": 4.8890073661801655e-05, + "loss": 0.3919, "step": 16980 }, { - "epoch": 0.6, - "learning_rate": 4.8957401063840435e-05, - "loss": 0.3218, + "epoch": 0.6121382491800915, + "grad_norm": 0.22050714492797852, + "learning_rate": 4.888921364925493e-05, + "loss": 0.4469, "step": 16985 }, { - "epoch": 0.6, - "learning_rate": 4.895658680827374e-05, - "loss": 0.315, + "epoch": 0.6123184488413161, + "grad_norm": 0.2015305459499359, + "learning_rate": 4.888835331122229e-05, + "loss": 0.4552, "step": 16990 }, { - "epoch": 0.6, - "learning_rate": 4.8955772241646546e-05, - "loss": 0.3186, + "epoch": 0.6124986485025408, + "grad_norm": 0.1845419853925705, + "learning_rate": 4.888749264771546e-05, + "loss": 0.4543, "step": 16995 }, { - "epoch": 0.6, - "learning_rate": 4.8954957363969445e-05, - "loss": 0.3001, + "epoch": 0.6126788481637655, + "grad_norm": 0.2000664323568344, + "learning_rate": 4.8886631658746175e-05, + "loss": 0.4648, "step": 17000 }, { - "epoch": 0.6, - "eval_loss": 0.30690938234329224, - "eval_runtime": 10.5382, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 0.6126788481637655, + "eval_loss": 0.4671698808670044, + "eval_runtime": 3.5317, + "eval_samples_per_second": 28.315, + "eval_steps_per_second": 7.079, "step": 17000 }, { - "epoch": 0.6, - "learning_rate": 4.895414217525301e-05, - "loss": 0.3164, + "epoch": 0.6128590478249901, + "grad_norm": 0.17209337651729584, + "learning_rate": 4.8885770344326154e-05, + "loss": 0.3971, "step": 17005 }, { - "epoch": 0.6, - "learning_rate": 4.895332667550783e-05, - "loss": 0.2993, + "epoch": 0.6130392474862147, + "grad_norm": 0.17581631243228912, + "learning_rate": 4.8884908704467137e-05, + "loss": 0.4352, "step": 17010 }, { - "epoch": 0.6, - "learning_rate": 4.895251086474447e-05, - "loss": 0.3298, + "epoch": 0.6132194471474394, + "grad_norm": 0.19064390659332275, + "learning_rate": 4.888404673918085e-05, + "loss": 0.4705, "step": 17015 }, { - "epoch": 
0.6, - "learning_rate": 4.895169474297356e-05, - "loss": 0.3277, + "epoch": 0.613399646808664, + "grad_norm": 0.1969899833202362, + "learning_rate": 4.8883184448479066e-05, + "loss": 0.4157, "step": 17020 }, { - "epoch": 0.6, - "learning_rate": 4.895087831020567e-05, - "loss": 0.3215, + "epoch": 0.6135798464698886, + "grad_norm": 0.17131738364696503, + "learning_rate": 4.888232183237352e-05, + "loss": 0.436, "step": 17025 }, { - "epoch": 0.6, - "learning_rate": 4.895006156645141e-05, - "loss": 0.332, + "epoch": 0.6137600461311132, + "grad_norm": 0.17256931960582733, + "learning_rate": 4.888145889087595e-05, + "loss": 0.4619, "step": 17030 }, { - "epoch": 0.6, - "learning_rate": 4.8949244511721385e-05, - "loss": 0.3016, + "epoch": 0.6139402457923379, + "grad_norm": 0.17418308556079865, + "learning_rate": 4.888059562399814e-05, + "loss": 0.4318, "step": 17035 }, { - "epoch": 0.6, - "learning_rate": 4.894842714602621e-05, - "loss": 0.3298, + "epoch": 0.6141204454535626, + "grad_norm": 0.1743146926164627, + "learning_rate": 4.887973203175183e-05, + "loss": 0.4002, "step": 17040 }, { - "epoch": 0.6, - "learning_rate": 4.894760946937648e-05, - "loss": 0.3033, + "epoch": 0.6143006451147872, + "grad_norm": 0.168928861618042, + "learning_rate": 4.88788681141488e-05, + "loss": 0.4291, "step": 17045 }, { - "epoch": 0.6, - "learning_rate": 4.8946791481782826e-05, - "loss": 0.3404, + "epoch": 0.6144808447760118, + "grad_norm": 0.1870613992214203, + "learning_rate": 4.8878003871200807e-05, + "loss": 0.4425, "step": 17050 }, { - "epoch": 0.6, - "learning_rate": 4.894597318325587e-05, - "loss": 0.3124, + "epoch": 0.6146610444372365, + "grad_norm": 0.1746305525302887, + "learning_rate": 4.8877139302919636e-05, + "loss": 0.4185, "step": 17055 }, { - "epoch": 0.6, - "learning_rate": 4.8945154573806235e-05, - "loss": 0.3361, + "epoch": 0.6148412440984611, + "grad_norm": 0.16369090974330902, + "learning_rate": 4.887627440931707e-05, + "loss": 0.4495, "step": 17060 }, { - "epoch": 0.6, - "learning_rate": 4.8944335653444544e-05, - "loss": 0.2948, + "epoch": 0.6150214437596857, + "grad_norm": 0.16876505315303802, + "learning_rate": 4.887540919040488e-05, + "loss": 0.4685, "step": 17065 }, { - "epoch": 0.6, - "learning_rate": 4.8943516422181435e-05, - "loss": 0.3212, + "epoch": 0.6152016434209103, + "grad_norm": 0.17735637724399567, + "learning_rate": 4.887454364619487e-05, + "loss": 0.3699, "step": 17070 }, { - "epoch": 0.6, - "learning_rate": 4.894269688002755e-05, - "loss": 0.3018, + "epoch": 0.6153818430821351, + "grad_norm": 0.17825958132743835, + "learning_rate": 4.8873677776698824e-05, + "loss": 0.4413, "step": 17075 }, { - "epoch": 0.6, - "learning_rate": 4.8941877026993514e-05, - "loss": 0.3198, + "epoch": 0.6155620427433597, + "grad_norm": 0.21326866745948792, + "learning_rate": 4.8872811581928536e-05, + "loss": 0.4284, "step": 17080 }, { - "epoch": 0.6, - "learning_rate": 4.894105686308999e-05, - "loss": 0.2923, + "epoch": 0.6157422424045843, + "grad_norm": 0.16894589364528656, + "learning_rate": 4.887194506189581e-05, + "loss": 0.4346, "step": 17085 }, { - "epoch": 0.6, - "learning_rate": 4.894023638832762e-05, - "loss": 0.3264, + "epoch": 0.6159224420658089, + "grad_norm": 0.18406248092651367, + "learning_rate": 4.887107821661247e-05, + "loss": 0.4555, "step": 17090 }, { - "epoch": 0.6, - "learning_rate": 4.8939415602717056e-05, - "loss": 0.3026, + "epoch": 0.6161026417270336, + "grad_norm": 0.17447605729103088, + "learning_rate": 4.887021104609029e-05, + "loss": 0.4917, "step": 17095 }, { - "epoch": 
0.6, - "learning_rate": 4.893859450626896e-05, - "loss": 0.3145, + "epoch": 0.6162828413882582, + "grad_norm": 0.1875600963830948, + "learning_rate": 4.886934355034112e-05, + "loss": 0.4652, "step": 17100 }, { - "epoch": 0.6, - "learning_rate": 4.893777309899399e-05, - "loss": 0.3186, + "epoch": 0.6164630410494828, + "grad_norm": 0.1392199695110321, + "learning_rate": 4.886847572937676e-05, + "loss": 0.4185, "step": 17105 }, { - "epoch": 0.6, - "learning_rate": 4.8936951380902804e-05, - "loss": 0.3439, + "epoch": 0.6166432407107074, + "grad_norm": 0.21595901250839233, + "learning_rate": 4.886760758320904e-05, + "loss": 0.445, "step": 17110 }, { - "epoch": 0.6, - "learning_rate": 4.893612935200609e-05, - "loss": 0.3224, + "epoch": 0.6168234403719322, + "grad_norm": 0.19852277636528015, + "learning_rate": 4.886673911184978e-05, + "loss": 0.4681, "step": 17115 }, { - "epoch": 0.6, - "learning_rate": 4.89353070123145e-05, - "loss": 0.3126, + "epoch": 0.6170036400331568, + "grad_norm": 0.15305131673812866, + "learning_rate": 4.8865870315310834e-05, + "loss": 0.4507, "step": 17120 }, { - "epoch": 0.6, - "learning_rate": 4.893448436183873e-05, - "loss": 0.314, + "epoch": 0.6171838396943814, + "grad_norm": 0.1263405680656433, + "learning_rate": 4.886500119360402e-05, + "loss": 0.4257, "step": 17125 }, { - "epoch": 0.6, - "learning_rate": 4.893366140058946e-05, - "loss": 0.3017, + "epoch": 0.617364039355606, + "grad_norm": 0.17302808165550232, + "learning_rate": 4.886413174674118e-05, + "loss": 0.4358, "step": 17130 }, { - "epoch": 0.6, - "learning_rate": 4.893283812857736e-05, - "loss": 0.3289, + "epoch": 0.6175442390168306, + "grad_norm": 0.157601997256279, + "learning_rate": 4.886326197473417e-05, + "loss": 0.4449, "step": 17135 }, { - "epoch": 0.6, - "learning_rate": 4.8932014545813134e-05, - "loss": 0.3609, + "epoch": 0.6177244386780553, + "grad_norm": 0.20116667449474335, + "learning_rate": 4.8862391877594835e-05, + "loss": 0.4696, "step": 17140 }, { - "epoch": 0.6, - "learning_rate": 4.893119065230747e-05, - "loss": 0.2998, + "epoch": 0.6179046383392799, + "grad_norm": 0.14023269712924957, + "learning_rate": 4.886152145533503e-05, + "loss": 0.4229, "step": 17145 }, { - "epoch": 0.6, - "learning_rate": 4.8930366448071066e-05, - "loss": 0.351, + "epoch": 0.6180848380005045, + "grad_norm": 0.18752652406692505, + "learning_rate": 4.886065070796662e-05, + "loss": 0.4556, "step": 17150 }, { - "epoch": 0.6, - "learning_rate": 4.892954193311462e-05, - "loss": 0.3124, + "epoch": 0.6182650376617292, + "grad_norm": 0.1420794129371643, + "learning_rate": 4.8859779635501456e-05, + "loss": 0.4229, "step": 17155 }, { - "epoch": 0.6, - "learning_rate": 4.892871710744884e-05, - "loss": 0.3311, + "epoch": 0.6184452373229539, + "grad_norm": 0.13428914546966553, + "learning_rate": 4.885890823795142e-05, + "loss": 0.4079, "step": 17160 }, { - "epoch": 0.6, - "learning_rate": 4.892789197108445e-05, - "loss": 0.3135, + "epoch": 0.6186254369841785, + "grad_norm": 0.18215683102607727, + "learning_rate": 4.885803651532838e-05, + "loss": 0.4227, "step": 17165 }, { - "epoch": 0.6, - "learning_rate": 4.892706652403215e-05, - "loss": 0.3331, + "epoch": 0.6188056366454031, + "grad_norm": 0.14137986302375793, + "learning_rate": 4.88571644676442e-05, + "loss": 0.3939, "step": 17170 }, { - "epoch": 0.6, - "learning_rate": 4.892624076630265e-05, - "loss": 0.3063, + "epoch": 0.6189858363066277, + "grad_norm": 0.16938935220241547, + "learning_rate": 4.885629209491078e-05, + "loss": 0.4753, "step": 17175 }, { - "epoch": 0.6, - 
"learning_rate": 4.892541469790669e-05, - "loss": 0.3259, + "epoch": 0.6191660359678524, + "grad_norm": 0.16520722210407257, + "learning_rate": 4.885541939714e-05, + "loss": 0.4466, "step": 17180 }, { - "epoch": 0.6, - "learning_rate": 4.892458831885498e-05, - "loss": 0.321, + "epoch": 0.619346235629077, + "grad_norm": 0.16338035464286804, + "learning_rate": 4.885454637434375e-05, + "loss": 0.4746, "step": 17185 }, { - "epoch": 0.6, - "learning_rate": 4.892376162915826e-05, - "loss": 0.2919, + "epoch": 0.6195264352903016, + "grad_norm": 0.1479274183511734, + "learning_rate": 4.8853673026533926e-05, + "loss": 0.4363, "step": 17190 }, { - "epoch": 0.6, - "learning_rate": 4.8922934628827263e-05, - "loss": 0.314, + "epoch": 0.6197066349515263, + "grad_norm": 0.1792302280664444, + "learning_rate": 4.885279935372242e-05, + "loss": 0.4159, "step": 17195 }, { - "epoch": 0.61, - "learning_rate": 4.892210731787273e-05, - "loss": 0.3051, + "epoch": 0.619886834612751, + "grad_norm": 0.1646362543106079, + "learning_rate": 4.8851925355921144e-05, + "loss": 0.4298, "step": 17200 }, { - "epoch": 0.61, - "learning_rate": 4.89212796963054e-05, - "loss": 0.316, + "epoch": 0.6200670342739756, + "grad_norm": 0.20265613496303558, + "learning_rate": 4.8851051033142004e-05, + "loss": 0.4525, "step": 17205 }, { - "epoch": 0.61, - "learning_rate": 4.892045176413601e-05, - "loss": 0.305, + "epoch": 0.6202472339352002, + "grad_norm": 0.18897970020771027, + "learning_rate": 4.8850176385396904e-05, + "loss": 0.4509, "step": 17210 }, { - "epoch": 0.61, - "learning_rate": 4.8919623521375316e-05, - "loss": 0.307, + "epoch": 0.6204274335964248, + "grad_norm": 0.2247178852558136, + "learning_rate": 4.884930141269778e-05, + "loss": 0.4515, "step": 17215 }, { - "epoch": 0.61, - "learning_rate": 4.8918794968034084e-05, - "loss": 0.3026, + "epoch": 0.6206076332576494, + "grad_norm": 0.22851864993572235, + "learning_rate": 4.884842611505653e-05, + "loss": 0.4337, "step": 17220 }, { - "epoch": 0.61, - "learning_rate": 4.891796610412306e-05, - "loss": 0.2894, + "epoch": 0.6207878329188741, + "grad_norm": 0.17176374793052673, + "learning_rate": 4.8847550492485094e-05, + "loss": 0.4607, "step": 17225 }, { - "epoch": 0.61, - "learning_rate": 4.8917136929653007e-05, - "loss": 0.2895, + "epoch": 0.6209680325800987, + "grad_norm": 0.16532184183597565, + "learning_rate": 4.884667454499541e-05, + "loss": 0.4572, "step": 17230 }, { - "epoch": 0.61, - "learning_rate": 4.891630744463469e-05, - "loss": 0.3233, + "epoch": 0.6211482322413234, + "grad_norm": 0.16950088739395142, + "learning_rate": 4.884579827259939e-05, + "loss": 0.4596, "step": 17235 }, { - "epoch": 0.61, - "learning_rate": 4.891547764907888e-05, - "loss": 0.3337, + "epoch": 0.621328431902548, + "grad_norm": 0.13479334115982056, + "learning_rate": 4.8844921675308985e-05, + "loss": 0.4187, "step": 17240 }, { - "epoch": 0.61, - "learning_rate": 4.891464754299636e-05, - "loss": 0.2812, + "epoch": 0.6215086315637727, + "grad_norm": 0.1946098804473877, + "learning_rate": 4.884404475313614e-05, + "loss": 0.4806, "step": 17245 }, { - "epoch": 0.61, - "learning_rate": 4.89138171263979e-05, - "loss": 0.3134, + "epoch": 0.6216888312249973, + "grad_norm": 0.20148830115795135, + "learning_rate": 4.884316750609281e-05, + "loss": 0.4833, "step": 17250 }, { - "epoch": 0.61, - "learning_rate": 4.8912986399294285e-05, - "loss": 0.3149, + "epoch": 0.6218690308862219, + "grad_norm": 0.18048211932182312, + "learning_rate": 4.884228993419093e-05, + "loss": 0.4797, "step": 17255 }, { - "epoch": 
0.61, - "learning_rate": 4.89121553616963e-05, - "loss": 0.3306, + "epoch": 0.6220492305474465, + "grad_norm": 0.17456752061843872, + "learning_rate": 4.884141203744248e-05, + "loss": 0.4505, "step": 17260 }, { - "epoch": 0.61, - "learning_rate": 4.891132401361473e-05, - "loss": 0.3212, + "epoch": 0.6222294302086712, + "grad_norm": 0.19601954519748688, + "learning_rate": 4.884053381585939e-05, + "loss": 0.4685, "step": 17265 }, { - "epoch": 0.61, - "learning_rate": 4.8910492355060386e-05, - "loss": 0.3005, + "epoch": 0.6224096298698959, + "grad_norm": 0.191552072763443, + "learning_rate": 4.883965526945365e-05, + "loss": 0.4564, "step": 17270 }, { - "epoch": 0.61, - "learning_rate": 4.8909660386044055e-05, - "loss": 0.3292, + "epoch": 0.6225898295311205, + "grad_norm": 0.17397992312908173, + "learning_rate": 4.883877639823722e-05, + "loss": 0.4623, "step": 17275 }, { - "epoch": 0.61, - "learning_rate": 4.890882810657654e-05, - "loss": 0.337, + "epoch": 0.6227700291923451, + "grad_norm": 0.1623370498418808, + "learning_rate": 4.8837897202222076e-05, + "loss": 0.4449, "step": 17280 }, { - "epoch": 0.61, - "learning_rate": 4.890799551666865e-05, - "loss": 0.3018, + "epoch": 0.6229502288535698, + "grad_norm": 0.15014895796775818, + "learning_rate": 4.88370176814202e-05, + "loss": 0.443, "step": 17285 }, { - "epoch": 0.61, - "learning_rate": 4.890716261633119e-05, - "loss": 0.3002, + "epoch": 0.6231304285147944, + "grad_norm": 0.17987573146820068, + "learning_rate": 4.8836137835843574e-05, + "loss": 0.4186, "step": 17290 }, { - "epoch": 0.61, - "learning_rate": 4.890632940557498e-05, - "loss": 0.2851, + "epoch": 0.623310628176019, + "grad_norm": 0.16012683510780334, + "learning_rate": 4.8835257665504184e-05, + "loss": 0.4429, "step": 17295 }, { - "epoch": 0.61, - "learning_rate": 4.890549588441083e-05, - "loss": 0.3158, + "epoch": 0.6234908278372436, + "grad_norm": 0.17883889377117157, + "learning_rate": 4.883437717041403e-05, + "loss": 0.4408, "step": 17300 }, { - "epoch": 0.61, - "learning_rate": 4.8904662052849586e-05, - "loss": 0.3095, + "epoch": 0.6236710274984683, + "grad_norm": 0.17255452275276184, + "learning_rate": 4.883349635058508e-05, + "loss": 0.4326, "step": 17305 }, { - "epoch": 0.61, - "learning_rate": 4.890382791090205e-05, - "loss": 0.3175, + "epoch": 0.623851227159693, + "grad_norm": 0.14830821752548218, + "learning_rate": 4.883261520602937e-05, + "loss": 0.4488, "step": 17310 }, { - "epoch": 0.61, - "learning_rate": 4.890299345857907e-05, - "loss": 0.3159, + "epoch": 0.6240314268209176, + "grad_norm": 0.16953101754188538, + "learning_rate": 4.88317337367589e-05, + "loss": 0.4381, "step": 17315 }, { - "epoch": 0.61, - "learning_rate": 4.8902158695891464e-05, - "loss": 0.3371, + "epoch": 0.6242116264821422, + "grad_norm": 0.16917838156223297, + "learning_rate": 4.883085194278566e-05, + "loss": 0.4023, "step": 17320 }, { - "epoch": 0.61, - "learning_rate": 4.890132362285008e-05, - "loss": 0.3326, + "epoch": 0.6243918261433669, + "grad_norm": 0.14477120339870453, + "learning_rate": 4.882996982412168e-05, + "loss": 0.416, "step": 17325 }, { - "epoch": 0.61, - "learning_rate": 4.890048823946576e-05, - "loss": 0.3218, + "epoch": 0.6245720258045915, + "grad_norm": 0.1996786743402481, + "learning_rate": 4.882908738077898e-05, + "loss": 0.4539, "step": 17330 }, { - "epoch": 0.61, - "learning_rate": 4.889965254574935e-05, - "loss": 0.3009, + "epoch": 0.6247522254658161, + "grad_norm": 0.18695120513439178, + "learning_rate": 4.8828204612769566e-05, + "loss": 0.4737, "step": 17335 }, { - 
"epoch": 0.61, - "learning_rate": 4.8898816541711706e-05, - "loss": 0.3419, + "epoch": 0.6249324251270407, + "grad_norm": 0.17920991778373718, + "learning_rate": 4.882732152010549e-05, + "loss": 0.4147, "step": 17340 }, { - "epoch": 0.61, - "learning_rate": 4.8897980227363684e-05, - "loss": 0.3228, + "epoch": 0.6251126247882653, + "grad_norm": 0.1538057178258896, + "learning_rate": 4.8826438102798766e-05, + "loss": 0.4651, "step": 17345 }, { - "epoch": 0.61, - "learning_rate": 4.889714360271612e-05, - "loss": 0.3057, + "epoch": 0.6252928244494901, + "grad_norm": 0.17022547125816345, + "learning_rate": 4.8825554360861436e-05, + "loss": 0.4641, "step": 17350 }, { - "epoch": 0.61, - "learning_rate": 4.889630666777991e-05, - "loss": 0.2896, + "epoch": 0.6254730241107147, + "grad_norm": 0.17763908207416534, + "learning_rate": 4.882467029430554e-05, + "loss": 0.3932, "step": 17355 }, { - "epoch": 0.61, - "learning_rate": 4.889546942256591e-05, - "loss": 0.3151, + "epoch": 0.6256532237719393, + "grad_norm": 0.1638226956129074, + "learning_rate": 4.8823785903143124e-05, + "loss": 0.4439, "step": 17360 }, { - "epoch": 0.61, - "learning_rate": 4.889463186708498e-05, - "loss": 0.3091, + "epoch": 0.625833423433164, + "grad_norm": 0.1570916473865509, + "learning_rate": 4.882290118738624e-05, + "loss": 0.4014, "step": 17365 }, { - "epoch": 0.61, - "learning_rate": 4.8893794001348e-05, - "loss": 0.3189, + "epoch": 0.6260136230943886, + "grad_norm": 0.16065819561481476, + "learning_rate": 4.882201614704694e-05, + "loss": 0.4374, "step": 17370 }, { - "epoch": 0.61, - "learning_rate": 4.8892955825365856e-05, - "loss": 0.3118, + "epoch": 0.6261938227556132, + "grad_norm": 0.1514330506324768, + "learning_rate": 4.8821130782137275e-05, + "loss": 0.4195, "step": 17375 }, { - "epoch": 0.61, - "learning_rate": 4.889211733914942e-05, - "loss": 0.2891, + "epoch": 0.6263740224168378, + "grad_norm": 0.14630937576293945, + "learning_rate": 4.882024509266932e-05, + "loss": 0.4394, "step": 17380 }, { - "epoch": 0.61, - "learning_rate": 4.88912785427096e-05, - "loss": 0.2917, + "epoch": 0.6265542220780624, + "grad_norm": 0.15218989551067352, + "learning_rate": 4.881935907865514e-05, + "loss": 0.4132, "step": 17385 }, { - "epoch": 0.61, - "learning_rate": 4.889043943605726e-05, - "loss": 0.3112, + "epoch": 0.6267344217392872, + "grad_norm": 0.20841450989246368, + "learning_rate": 4.88184727401068e-05, + "loss": 0.4669, "step": 17390 }, { - "epoch": 0.61, - "learning_rate": 4.888960001920331e-05, - "loss": 0.3095, + "epoch": 0.6269146214005118, + "grad_norm": 0.18255776166915894, + "learning_rate": 4.881758607703638e-05, + "loss": 0.4354, "step": 17395 }, { - "epoch": 0.61, - "learning_rate": 4.8888760292158654e-05, - "loss": 0.3276, + "epoch": 0.6270948210617364, + "grad_norm": 0.12769578397274017, + "learning_rate": 4.881669908945596e-05, + "loss": 0.4514, "step": 17400 }, { - "epoch": 0.61, - "learning_rate": 4.888792025493418e-05, - "loss": 0.3031, + "epoch": 0.627275020722961, + "grad_norm": 0.18235896527767181, + "learning_rate": 4.881581177737763e-05, + "loss": 0.4523, "step": 17405 }, { - "epoch": 0.61, - "learning_rate": 4.888707990754081e-05, - "loss": 0.315, + "epoch": 0.6274552203841857, + "grad_norm": 0.13113798201084137, + "learning_rate": 4.8814924140813466e-05, + "loss": 0.4246, "step": 17410 }, { - "epoch": 0.61, - "learning_rate": 4.8886239249989455e-05, - "loss": 0.3281, + "epoch": 0.6276354200454103, + "grad_norm": 0.18605348467826843, + "learning_rate": 4.881403617977558e-05, + "loss": 0.4335, "step": 
17415 }, { - "epoch": 0.61, - "learning_rate": 4.8885398282291015e-05, - "loss": 0.2979, + "epoch": 0.6278156197066349, + "grad_norm": 0.19018565118312836, + "learning_rate": 4.8813147894276065e-05, + "loss": 0.4198, "step": 17420 }, { - "epoch": 0.61, - "learning_rate": 4.888455700445642e-05, - "loss": 0.3312, + "epoch": 0.6279958193678596, + "grad_norm": 0.22579999268054962, + "learning_rate": 4.8812259284327013e-05, + "loss": 0.4842, "step": 17425 }, { - "epoch": 0.61, - "learning_rate": 4.88837154164966e-05, - "loss": 0.3159, + "epoch": 0.6281760190290843, + "grad_norm": 0.13994662463665009, + "learning_rate": 4.8811370349940545e-05, + "loss": 0.4695, "step": 17430 }, { - "epoch": 0.61, - "learning_rate": 4.888287351842247e-05, - "loss": 0.3123, + "epoch": 0.6283562186903089, + "grad_norm": 0.1737351268529892, + "learning_rate": 4.8810481091128756e-05, + "loss": 0.4521, "step": 17435 }, { - "epoch": 0.61, - "learning_rate": 4.888203131024497e-05, - "loss": 0.3222, + "epoch": 0.6285364183515335, + "grad_norm": 0.1526714414358139, + "learning_rate": 4.880959150790379e-05, + "loss": 0.428, "step": 17440 }, { - "epoch": 0.61, - "learning_rate": 4.888118879197502e-05, - "loss": 0.313, + "epoch": 0.6287166180127581, + "grad_norm": 0.16444048285484314, + "learning_rate": 4.880870160027773e-05, + "loss": 0.4125, "step": 17445 }, { - "epoch": 0.61, - "learning_rate": 4.888034596362359e-05, - "loss": 0.3172, + "epoch": 0.6288968176739828, + "grad_norm": 0.2083633989095688, + "learning_rate": 4.8807811368262726e-05, + "loss": 0.4616, "step": 17450 }, { - "epoch": 0.61, - "learning_rate": 4.887950282520159e-05, - "loss": 0.3118, + "epoch": 0.6290770173352074, + "grad_norm": 0.2831677496433258, + "learning_rate": 4.8806920811870905e-05, + "loss": 0.4404, "step": 17455 }, { - "epoch": 0.61, - "learning_rate": 4.8878659376719995e-05, - "loss": 0.2852, + "epoch": 0.629257216996432, + "grad_norm": 0.17356400191783905, + "learning_rate": 4.88060299311144e-05, + "loss": 0.4132, "step": 17460 }, { - "epoch": 0.61, - "learning_rate": 4.887781561818974e-05, - "loss": 0.3064, + "epoch": 0.6294374166576567, + "grad_norm": 0.1940339356660843, + "learning_rate": 4.880513872600534e-05, + "loss": 0.422, "step": 17465 }, { - "epoch": 0.61, - "learning_rate": 4.8876971549621784e-05, - "loss": 0.3038, + "epoch": 0.6296176163188814, + "grad_norm": 0.15967166423797607, + "learning_rate": 4.880424719655588e-05, + "loss": 0.4363, "step": 17470 }, { - "epoch": 0.61, - "learning_rate": 4.8876127171027084e-05, - "loss": 0.3113, + "epoch": 0.629797815980106, + "grad_norm": 0.16466853022575378, + "learning_rate": 4.8803355342778145e-05, + "loss": 0.4705, "step": 17475 }, { - "epoch": 0.61, - "learning_rate": 4.887528248241662e-05, - "loss": 0.3153, + "epoch": 0.6299780156413306, + "grad_norm": 0.17861898243427277, + "learning_rate": 4.880246316468432e-05, + "loss": 0.4117, "step": 17480 }, { - "epoch": 0.62, - "learning_rate": 4.8874437483801335e-05, - "loss": 0.3126, + "epoch": 0.6301582153025552, + "grad_norm": 0.13602939248085022, + "learning_rate": 4.880157066228653e-05, + "loss": 0.4148, "step": 17485 }, { - "epoch": 0.62, - "learning_rate": 4.887359217519222e-05, - "loss": 0.3134, + "epoch": 0.6303384149637798, + "grad_norm": 0.1881345808506012, + "learning_rate": 4.880067783559695e-05, + "loss": 0.4365, "step": 17490 }, { - "epoch": 0.62, - "learning_rate": 4.887274655660025e-05, - "loss": 0.2875, + "epoch": 0.6305186146250045, + "grad_norm": 0.23560486733913422, + "learning_rate": 4.8799784684627745e-05, + "loss": 
0.459, "step": 17495 }, { - "epoch": 0.62, - "learning_rate": 4.887190062803638e-05, - "loss": 0.3005, + "epoch": 0.6306988142862291, + "grad_norm": 0.1441272646188736, + "learning_rate": 4.8798891209391085e-05, + "loss": 0.4468, "step": 17500 }, { - "epoch": 0.62, - "eval_loss": 0.3056062161922455, - "eval_runtime": 10.5404, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 0.6306988142862291, + "eval_loss": 0.4662560522556305, + "eval_runtime": 3.533, + "eval_samples_per_second": 28.305, + "eval_steps_per_second": 7.076, "step": 17500 }, { - "epoch": 0.62, - "learning_rate": 4.8871054389511626e-05, - "loss": 0.3079, + "epoch": 0.6308790139474538, + "grad_norm": 0.18622781336307526, + "learning_rate": 4.879799740989914e-05, + "loss": 0.4294, "step": 17505 }, { - "epoch": 0.62, - "learning_rate": 4.887020784103696e-05, - "loss": 0.3054, + "epoch": 0.6310592136086784, + "grad_norm": 0.17122696340084076, + "learning_rate": 4.879710328616408e-05, + "loss": 0.463, "step": 17510 }, { - "epoch": 0.62, - "learning_rate": 4.886936098262338e-05, - "loss": 0.3035, + "epoch": 0.6312394132699031, + "grad_norm": 0.18917937576770782, + "learning_rate": 4.87962088381981e-05, + "loss": 0.4626, "step": 17515 }, { - "epoch": 0.62, - "learning_rate": 4.8868513814281866e-05, - "loss": 0.3326, + "epoch": 0.6314196129311277, + "grad_norm": 0.1895674765110016, + "learning_rate": 4.8795314066013386e-05, + "loss": 0.4487, "step": 17520 }, { - "epoch": 0.62, - "learning_rate": 4.8867666336023445e-05, - "loss": 0.3312, + "epoch": 0.6315998125923523, + "grad_norm": 0.17945843935012817, + "learning_rate": 4.879441896962212e-05, + "loss": 0.4573, "step": 17525 }, { - "epoch": 0.62, - "learning_rate": 4.8866818547859095e-05, - "loss": 0.3105, + "epoch": 0.6317800122535769, + "grad_norm": 0.16244640946388245, + "learning_rate": 4.87935235490365e-05, + "loss": 0.415, "step": 17530 }, { - "epoch": 0.62, - "learning_rate": 4.886597044979984e-05, - "loss": 0.3202, + "epoch": 0.6319602119148016, + "grad_norm": 0.16950534284114838, + "learning_rate": 4.879262780426873e-05, + "loss": 0.4423, "step": 17535 }, { - "epoch": 0.62, - "learning_rate": 4.886512204185669e-05, - "loss": 0.3186, + "epoch": 0.6321404115760262, + "grad_norm": 0.15685759484767914, + "learning_rate": 4.879173173533101e-05, + "loss": 0.4512, "step": 17540 }, { - "epoch": 0.62, - "learning_rate": 4.886427332404065e-05, - "loss": 0.3076, + "epoch": 0.6323206112372509, + "grad_norm": 0.18144509196281433, + "learning_rate": 4.879083534223556e-05, + "loss": 0.4618, "step": 17545 }, { - "epoch": 0.62, - "learning_rate": 4.886342429636276e-05, - "loss": 0.3072, + "epoch": 0.6325008108984755, + "grad_norm": 0.14772038161754608, + "learning_rate": 4.878993862499458e-05, + "loss": 0.4673, "step": 17550 }, { - "epoch": 0.62, - "learning_rate": 4.8862574958834016e-05, - "loss": 0.2953, + "epoch": 0.6326810105597002, + "grad_norm": 0.17589062452316284, + "learning_rate": 4.8789041583620285e-05, + "loss": 0.4252, "step": 17555 }, { - "epoch": 0.62, - "learning_rate": 4.8861725311465475e-05, - "loss": 0.313, + "epoch": 0.6328612102209248, + "grad_norm": 0.1934499889612198, + "learning_rate": 4.878814421812492e-05, + "loss": 0.4261, "step": 17560 }, { - "epoch": 0.62, - "learning_rate": 4.886087535426815e-05, - "loss": 0.3168, + "epoch": 0.6330414098821494, + "grad_norm": 0.16541697084903717, + "learning_rate": 4.878724652852068e-05, + "loss": 0.4186, "step": 17565 }, { - "epoch": 0.62, - "learning_rate": 4.8860025087253086e-05, - "loss": 
0.301, + "epoch": 0.633221609543374, + "grad_norm": 0.18114767968654633, + "learning_rate": 4.878634851481982e-05, + "loss": 0.4684, "step": 17570 }, { - "epoch": 0.62, - "learning_rate": 4.8859174510431314e-05, - "loss": 0.3209, + "epoch": 0.6334018092045987, + "grad_norm": 0.17742811143398285, + "learning_rate": 4.878545017703457e-05, + "loss": 0.4497, "step": 17575 }, { - "epoch": 0.62, - "learning_rate": 4.8858323623813896e-05, - "loss": 0.3257, + "epoch": 0.6335820088658234, + "grad_norm": 0.16837285459041595, + "learning_rate": 4.8784551515177154e-05, + "loss": 0.4176, "step": 17580 }, { - "epoch": 0.62, - "learning_rate": 4.8857472427411864e-05, - "loss": 0.3161, + "epoch": 0.633762208527048, + "grad_norm": 0.14889617264270782, + "learning_rate": 4.878365252925984e-05, + "loss": 0.4337, "step": 17585 }, { - "epoch": 0.62, - "learning_rate": 4.8856620921236276e-05, - "loss": 0.2901, + "epoch": 0.6339424081882726, + "grad_norm": 0.1742192506790161, + "learning_rate": 4.878275321929485e-05, + "loss": 0.4535, "step": 17590 }, { - "epoch": 0.62, - "learning_rate": 4.8855769105298185e-05, - "loss": 0.3008, + "epoch": 0.6341226078494973, + "grad_norm": 0.20734204351902008, + "learning_rate": 4.878185358529447e-05, + "loss": 0.4681, "step": 17595 }, { - "epoch": 0.62, - "learning_rate": 4.8854916979608656e-05, - "loss": 0.3401, + "epoch": 0.6343028075107219, + "grad_norm": 0.16586579382419586, + "learning_rate": 4.8780953627270924e-05, + "loss": 0.4205, "step": 17600 }, { - "epoch": 0.62, - "learning_rate": 4.885406454417875e-05, - "loss": 0.3126, + "epoch": 0.6344830071719465, + "grad_norm": 0.1419735550880432, + "learning_rate": 4.8780053345236496e-05, + "loss": 0.4481, "step": 17605 }, { - "epoch": 0.62, - "learning_rate": 4.8853211799019534e-05, - "loss": 0.3105, + "epoch": 0.6346632068331711, + "grad_norm": 0.1975310742855072, + "learning_rate": 4.877915273920345e-05, + "loss": 0.4853, "step": 17610 }, { - "epoch": 0.62, - "learning_rate": 4.8852358744142085e-05, - "loss": 0.3052, + "epoch": 0.6348434064943957, + "grad_norm": 0.19924567639827728, + "learning_rate": 4.8778251809184045e-05, + "loss": 0.4531, "step": 17615 }, { - "epoch": 0.62, - "learning_rate": 4.8851505379557475e-05, - "loss": 0.3248, + "epoch": 0.6350236061556205, + "grad_norm": 0.17732134461402893, + "learning_rate": 4.877735055519056e-05, + "loss": 0.4414, "step": 17620 }, { - "epoch": 0.62, - "learning_rate": 4.885065170527679e-05, - "loss": 0.3126, + "epoch": 0.6352038058168451, + "grad_norm": 0.2488788664340973, + "learning_rate": 4.877644897723528e-05, + "loss": 0.4517, "step": 17625 }, { - "epoch": 0.62, - "learning_rate": 4.8849797721311106e-05, - "loss": 0.3261, + "epoch": 0.6353840054780697, + "grad_norm": 0.1604602336883545, + "learning_rate": 4.877554707533049e-05, + "loss": 0.4141, "step": 17630 }, { - "epoch": 0.62, - "learning_rate": 4.8848943427671526e-05, - "loss": 0.3238, + "epoch": 0.6355642051392943, + "grad_norm": 0.1665060818195343, + "learning_rate": 4.877464484948847e-05, + "loss": 0.4129, "step": 17635 }, { - "epoch": 0.62, - "learning_rate": 4.884808882436912e-05, - "loss": 0.3123, + "epoch": 0.635744404800519, + "grad_norm": 0.17682184278964996, + "learning_rate": 4.877374229972152e-05, + "loss": 0.4501, "step": 17640 }, { - "epoch": 0.62, - "learning_rate": 4.8847233911415e-05, - "loss": 0.3282, + "epoch": 0.6359246044617436, + "grad_norm": 0.17669498920440674, + "learning_rate": 4.8772839426041935e-05, + "loss": 0.4363, "step": 17645 }, { - "epoch": 0.62, - "learning_rate": 
4.884637868882027e-05, - "loss": 0.3058, + "epoch": 0.6361048041229682, + "grad_norm": 0.17283748090267181, + "learning_rate": 4.877193622846201e-05, + "loss": 0.4117, "step": 17650 }, { - "epoch": 0.62, - "learning_rate": 4.884552315659603e-05, - "loss": 0.3324, + "epoch": 0.6362850037841928, + "grad_norm": 0.18358486890792847, + "learning_rate": 4.877103270699406e-05, + "loss": 0.4261, "step": 17655 }, { - "epoch": 0.62, - "learning_rate": 4.884466731475338e-05, - "loss": 0.3031, + "epoch": 0.6364652034454176, + "grad_norm": 0.15538670122623444, + "learning_rate": 4.877012886165039e-05, + "loss": 0.4387, "step": 17660 }, { - "epoch": 0.62, - "learning_rate": 4.884381116330343e-05, - "loss": 0.3154, + "epoch": 0.6366454031066422, + "grad_norm": 0.17753304541110992, + "learning_rate": 4.8769224692443315e-05, + "loss": 0.4024, "step": 17665 }, { - "epoch": 0.62, - "learning_rate": 4.8842954702257324e-05, - "loss": 0.3194, + "epoch": 0.6368256027678668, + "grad_norm": 0.20612354576587677, + "learning_rate": 4.8768320199385166e-05, + "loss": 0.465, "step": 17670 }, { - "epoch": 0.62, - "learning_rate": 4.884209793162615e-05, - "loss": 0.3127, + "epoch": 0.6370058024290914, + "grad_norm": 0.21157251298427582, + "learning_rate": 4.876741538248825e-05, + "loss": 0.4686, "step": 17675 }, { - "epoch": 0.62, - "learning_rate": 4.884124085142106e-05, - "loss": 0.3082, + "epoch": 0.6371860020903161, + "grad_norm": 0.22163233160972595, + "learning_rate": 4.876651024176489e-05, + "loss": 0.4685, "step": 17680 }, { - "epoch": 0.62, - "learning_rate": 4.8840383461653164e-05, - "loss": 0.3087, + "epoch": 0.6373662017515407, + "grad_norm": 0.1729961782693863, + "learning_rate": 4.8765604777227434e-05, + "loss": 0.4498, "step": 17685 }, { - "epoch": 0.62, - "learning_rate": 4.883952576233359e-05, - "loss": 0.3167, + "epoch": 0.6375464014127653, + "grad_norm": 0.16511957347393036, + "learning_rate": 4.8764698988888223e-05, + "loss": 0.4789, "step": 17690 }, { - "epoch": 0.62, - "learning_rate": 4.8838667753473496e-05, - "loss": 0.3426, + "epoch": 0.6377266010739899, + "grad_norm": 0.15577971935272217, + "learning_rate": 4.876379287675958e-05, + "loss": 0.4499, "step": 17695 }, { - "epoch": 0.62, - "learning_rate": 4.8837809435084016e-05, - "loss": 0.3001, + "epoch": 0.6379068007352147, + "grad_norm": 0.23532330989837646, + "learning_rate": 4.8762886440853865e-05, + "loss": 0.458, "step": 17700 }, { - "epoch": 0.62, - "learning_rate": 4.8836950807176276e-05, - "loss": 0.2905, + "epoch": 0.6380870003964393, + "grad_norm": 0.1516086459159851, + "learning_rate": 4.876197968118342e-05, + "loss": 0.4136, "step": 17705 }, { - "epoch": 0.62, - "learning_rate": 4.8836091869761444e-05, - "loss": 0.3193, + "epoch": 0.6382672000576639, + "grad_norm": 0.16963382065296173, + "learning_rate": 4.8761072597760604e-05, + "loss": 0.4761, "step": 17710 }, { - "epoch": 0.62, - "learning_rate": 4.883523262285068e-05, - "loss": 0.3067, + "epoch": 0.6384473997188885, + "grad_norm": 0.17492809891700745, + "learning_rate": 4.876016519059778e-05, + "loss": 0.4299, "step": 17715 }, { - "epoch": 0.62, - "learning_rate": 4.8834373066455116e-05, - "loss": 0.3145, + "epoch": 0.6386275993801132, + "grad_norm": 0.21216866374015808, + "learning_rate": 4.87592574597073e-05, + "loss": 0.4399, "step": 17720 }, { - "epoch": 0.62, - "learning_rate": 4.883351320058592e-05, - "loss": 0.3245, + "epoch": 0.6388077990413378, + "grad_norm": 0.1838618665933609, + "learning_rate": 4.8758349405101535e-05, + "loss": 0.4794, "step": 17725 }, { - "epoch": 
0.62, - "learning_rate": 4.883265302525427e-05, - "loss": 0.3328, + "epoch": 0.6389879987025624, + "grad_norm": 0.20086225867271423, + "learning_rate": 4.875744102679286e-05, + "loss": 0.4475, "step": 17730 }, { - "epoch": 0.62, - "learning_rate": 4.8831792540471334e-05, - "loss": 0.3123, + "epoch": 0.639168198363787, + "grad_norm": 0.18566478788852692, + "learning_rate": 4.8756532324793646e-05, + "loss": 0.4394, "step": 17735 }, { - "epoch": 0.62, - "learning_rate": 4.883093174624826e-05, - "loss": 0.3107, + "epoch": 0.6393483980250118, + "grad_norm": 0.1914960891008377, + "learning_rate": 4.875562329911629e-05, + "loss": 0.4574, "step": 17740 }, { - "epoch": 0.62, - "learning_rate": 4.8830070642596256e-05, - "loss": 0.3077, + "epoch": 0.6395285976862364, + "grad_norm": 0.14925505220890045, + "learning_rate": 4.8754713949773156e-05, + "loss": 0.4712, "step": 17745 }, { - "epoch": 0.62, - "learning_rate": 4.882920922952648e-05, - "loss": 0.3102, + "epoch": 0.639708797347461, + "grad_norm": 0.1491568684577942, + "learning_rate": 4.875380427677665e-05, + "loss": 0.4545, "step": 17750 }, { - "epoch": 0.62, - "learning_rate": 4.882834750705014e-05, - "loss": 0.3418, + "epoch": 0.6398889970086856, + "grad_norm": 0.16431868076324463, + "learning_rate": 4.8752894280139155e-05, + "loss": 0.4363, "step": 17755 }, { - "epoch": 0.62, - "learning_rate": 4.88274854751784e-05, - "loss": 0.2913, + "epoch": 0.6400691966699102, + "grad_norm": 0.16594944894313812, + "learning_rate": 4.8751983959873074e-05, + "loss": 0.443, "step": 17760 }, { - "epoch": 0.63, - "learning_rate": 4.882662313392247e-05, - "loss": 0.3132, + "epoch": 0.6402493963311349, + "grad_norm": 0.17833413183689117, + "learning_rate": 4.875107331599081e-05, + "loss": 0.4451, "step": 17765 }, { - "epoch": 0.63, - "learning_rate": 4.8825760483293533e-05, - "loss": 0.2979, + "epoch": 0.6404295959923595, + "grad_norm": 0.1914624124765396, + "learning_rate": 4.875016234850478e-05, + "loss": 0.4021, "step": 17770 }, { - "epoch": 0.63, - "learning_rate": 4.88248975233028e-05, - "loss": 0.3141, + "epoch": 0.6406097956535842, + "grad_norm": 0.16454005241394043, + "learning_rate": 4.8749251057427374e-05, + "loss": 0.4073, "step": 17775 }, { - "epoch": 0.63, - "learning_rate": 4.882403425396148e-05, - "loss": 0.3054, + "epoch": 0.6407899953148088, + "grad_norm": 0.1951947659254074, + "learning_rate": 4.874833944277103e-05, + "loss": 0.5004, "step": 17780 }, { - "epoch": 0.63, - "learning_rate": 4.882317067528077e-05, - "loss": 0.328, + "epoch": 0.6409701949760335, + "grad_norm": 0.1762334704399109, + "learning_rate": 4.8747427504548146e-05, + "loss": 0.4096, "step": 17785 }, { - "epoch": 0.63, - "learning_rate": 4.88223067872719e-05, - "loss": 0.3238, + "epoch": 0.6411503946372581, + "grad_norm": 0.18907704949378967, + "learning_rate": 4.8746515242771175e-05, + "loss": 0.4241, "step": 17790 }, { - "epoch": 0.63, - "learning_rate": 4.882144258994606e-05, - "loss": 0.3282, + "epoch": 0.6413305942984827, + "grad_norm": 0.20052656531333923, + "learning_rate": 4.874560265745252e-05, + "loss": 0.4246, "step": 17795 }, { - "epoch": 0.63, - "learning_rate": 4.88205780833145e-05, - "loss": 0.3001, + "epoch": 0.6415107939597073, + "grad_norm": 0.19393053650856018, + "learning_rate": 4.874468974860463e-05, + "loss": 0.4644, "step": 17800 }, { - "epoch": 0.63, - "learning_rate": 4.8819713267388425e-05, - "loss": 0.307, + "epoch": 0.641690993620932, + "grad_norm": 0.2143370658159256, + "learning_rate": 4.874377651623994e-05, + "loss": 0.462, "step": 17805 }, { - 
"epoch": 0.63, - "learning_rate": 4.881884814217907e-05, - "loss": 0.3345, + "epoch": 0.6418711932821566, + "grad_norm": 0.20955999195575714, + "learning_rate": 4.8742862960370895e-05, + "loss": 0.4356, "step": 17810 }, { - "epoch": 0.63, - "learning_rate": 4.881798270769767e-05, - "loss": 0.3091, + "epoch": 0.6420513929433813, + "grad_norm": 0.1920044720172882, + "learning_rate": 4.874194908100993e-05, + "loss": 0.4294, "step": 17815 }, { - "epoch": 0.63, - "learning_rate": 4.8817116963955465e-05, - "loss": 0.2985, + "epoch": 0.6422315926046059, + "grad_norm": 0.20551706850528717, + "learning_rate": 4.874103487816951e-05, + "loss": 0.4574, "step": 17820 }, { - "epoch": 0.63, - "learning_rate": 4.881625091096369e-05, - "loss": 0.2963, + "epoch": 0.6424117922658306, + "grad_norm": 0.1954600214958191, + "learning_rate": 4.8740120351862087e-05, + "loss": 0.439, "step": 17825 }, { - "epoch": 0.63, - "learning_rate": 4.881538454873359e-05, - "loss": 0.34, + "epoch": 0.6425919919270552, + "grad_norm": 0.17379389703273773, + "learning_rate": 4.873920550210012e-05, + "loss": 0.4056, "step": 17830 }, { - "epoch": 0.63, - "learning_rate": 4.881451787727642e-05, - "loss": 0.3245, + "epoch": 0.6427721915882798, + "grad_norm": 0.17289118468761444, + "learning_rate": 4.8738290328896076e-05, + "loss": 0.4321, "step": 17835 }, { - "epoch": 0.63, - "learning_rate": 4.881365089660342e-05, - "loss": 0.3092, + "epoch": 0.6429523912495044, + "grad_norm": 0.181446373462677, + "learning_rate": 4.8737374832262415e-05, + "loss": 0.4323, "step": 17840 }, { - "epoch": 0.63, - "learning_rate": 4.881278360672587e-05, - "loss": 0.3129, + "epoch": 0.643132590910729, + "grad_norm": 0.25358590483665466, + "learning_rate": 4.8736459012211624e-05, + "loss": 0.4085, "step": 17845 }, { - "epoch": 0.63, - "learning_rate": 4.881191600765501e-05, - "loss": 0.3273, + "epoch": 0.6433127905719537, + "grad_norm": 0.23851445317268372, + "learning_rate": 4.873554286875618e-05, + "loss": 0.4697, "step": 17850 }, { - "epoch": 0.63, - "learning_rate": 4.8811048099402124e-05, - "loss": 0.305, + "epoch": 0.6434929902331784, + "grad_norm": 0.18585999310016632, + "learning_rate": 4.873462640190855e-05, + "loss": 0.457, "step": 17855 }, { - "epoch": 0.63, - "learning_rate": 4.8810179881978455e-05, - "loss": 0.2934, + "epoch": 0.643673189894403, + "grad_norm": 0.16106140613555908, + "learning_rate": 4.873370961168123e-05, + "loss": 0.449, "step": 17860 }, { - "epoch": 0.63, - "learning_rate": 4.88093113553953e-05, - "loss": 0.3206, + "epoch": 0.6438533895556277, + "grad_norm": 0.18756996095180511, + "learning_rate": 4.873279249808672e-05, + "loss": 0.4517, "step": 17865 }, { - "epoch": 0.63, - "learning_rate": 4.8808442519663936e-05, - "loss": 0.3365, + "epoch": 0.6440335892168523, + "grad_norm": 0.26352357864379883, + "learning_rate": 4.87318750611375e-05, + "loss": 0.4494, "step": 17870 }, { - "epoch": 0.63, - "learning_rate": 4.880757337479562e-05, - "loss": 0.3166, + "epoch": 0.6442137888780769, + "grad_norm": 0.18100088834762573, + "learning_rate": 4.873095730084608e-05, + "loss": 0.442, "step": 17875 }, { - "epoch": 0.63, - "learning_rate": 4.8806703920801666e-05, - "loss": 0.3241, + "epoch": 0.6443939885393015, + "grad_norm": 0.1749761700630188, + "learning_rate": 4.873003921722496e-05, + "loss": 0.4248, "step": 17880 }, { - "epoch": 0.63, - "learning_rate": 4.880583415769334e-05, - "loss": 0.299, + "epoch": 0.6445741882005261, + "grad_norm": 0.16620847582817078, + "learning_rate": 4.8729120810286654e-05, + "loss": 0.4626, "step": 17885 
}, { - "epoch": 0.63, - "learning_rate": 4.880496408548196e-05, - "loss": 0.3023, + "epoch": 0.6447543878617508, + "grad_norm": 0.18539084494113922, + "learning_rate": 4.872820208004367e-05, + "loss": 0.42, "step": 17890 }, { - "epoch": 0.63, - "learning_rate": 4.88040937041788e-05, - "loss": 0.3261, + "epoch": 0.6449345875229755, + "grad_norm": 0.15285968780517578, + "learning_rate": 4.8727283026508516e-05, + "loss": 0.428, "step": 17895 }, { - "epoch": 0.63, - "learning_rate": 4.8803223013795175e-05, - "loss": 0.3378, + "epoch": 0.6451147871842001, + "grad_norm": 0.16171573102474213, + "learning_rate": 4.872636364969373e-05, + "loss": 0.4222, "step": 17900 }, { - "epoch": 0.63, - "learning_rate": 4.8802352014342376e-05, - "loss": 0.2997, + "epoch": 0.6452949868454247, + "grad_norm": 0.2132413238286972, + "learning_rate": 4.872544394961184e-05, + "loss": 0.4037, "step": 17905 }, { - "epoch": 0.63, - "learning_rate": 4.880148070583174e-05, - "loss": 0.3037, + "epoch": 0.6454751865066494, + "grad_norm": 0.16865713894367218, + "learning_rate": 4.872452392627537e-05, + "loss": 0.4025, "step": 17910 }, { - "epoch": 0.63, - "learning_rate": 4.880060908827455e-05, - "loss": 0.3039, + "epoch": 0.645655386167874, + "grad_norm": 0.18928374350070953, + "learning_rate": 4.8723603579696844e-05, + "loss": 0.4316, "step": 17915 }, { - "epoch": 0.63, - "learning_rate": 4.879973716168214e-05, - "loss": 0.2984, + "epoch": 0.6458355858290986, + "grad_norm": 0.14977779984474182, + "learning_rate": 4.872268290988882e-05, + "loss": 0.4344, "step": 17920 }, { - "epoch": 0.63, - "learning_rate": 4.8798864926065823e-05, - "loss": 0.3049, + "epoch": 0.6460157854903232, + "grad_norm": 0.15273813903331757, + "learning_rate": 4.8721761916863826e-05, + "loss": 0.4632, "step": 17925 }, { - "epoch": 0.63, - "learning_rate": 4.879799238143693e-05, - "loss": 0.2784, + "epoch": 0.646195985151548, + "grad_norm": 0.16625703871250153, + "learning_rate": 4.872084060063443e-05, + "loss": 0.4079, "step": 17930 }, { - "epoch": 0.63, - "learning_rate": 4.879711952780679e-05, - "loss": 0.2891, + "epoch": 0.6463761848127726, + "grad_norm": 0.16598527133464813, + "learning_rate": 4.871991896121317e-05, + "loss": 0.4384, "step": 17935 }, { - "epoch": 0.63, - "learning_rate": 4.879624636518674e-05, - "loss": 0.297, + "epoch": 0.6465563844739972, + "grad_norm": 0.14382204413414001, + "learning_rate": 4.8718996998612605e-05, + "loss": 0.4601, "step": 17940 }, { - "epoch": 0.63, - "learning_rate": 4.879537289358811e-05, - "loss": 0.2995, + "epoch": 0.6467365841352218, + "grad_norm": 0.169216588139534, + "learning_rate": 4.8718074712845296e-05, + "loss": 0.4329, "step": 17945 }, { - "epoch": 0.63, - "learning_rate": 4.879449911302225e-05, - "loss": 0.3221, + "epoch": 0.6469167837964465, + "grad_norm": 0.21951742470264435, + "learning_rate": 4.871715210392381e-05, + "loss": 0.4817, "step": 17950 }, { - "epoch": 0.63, - "learning_rate": 4.879362502350049e-05, - "loss": 0.3157, + "epoch": 0.6470969834576711, + "grad_norm": 0.14136020839214325, + "learning_rate": 4.871622917186073e-05, + "loss": 0.4133, "step": 17955 }, { - "epoch": 0.63, - "learning_rate": 4.87927506250342e-05, - "loss": 0.3068, + "epoch": 0.6472771831188957, + "grad_norm": 0.22290566563606262, + "learning_rate": 4.871530591666861e-05, + "loss": 0.4538, "step": 17960 }, { - "epoch": 0.63, - "learning_rate": 4.879187591763472e-05, - "loss": 0.3526, + "epoch": 0.6474573827801203, + "grad_norm": 0.1538752168416977, + "learning_rate": 4.871438233836003e-05, + "loss": 0.4566, 
"step": 17965 }, { - "epoch": 0.63, - "learning_rate": 4.879100090131341e-05, - "loss": 0.2967, + "epoch": 0.6476375824413451, + "grad_norm": 0.13945698738098145, + "learning_rate": 4.871345843694759e-05, + "loss": 0.4536, "step": 17970 }, { - "epoch": 0.63, - "learning_rate": 4.879012557608162e-05, - "loss": 0.2943, + "epoch": 0.6478177821025697, + "grad_norm": 0.19078098237514496, + "learning_rate": 4.871253421244387e-05, + "loss": 0.4743, "step": 17975 }, { - "epoch": 0.63, - "learning_rate": 4.8789249941950746e-05, - "loss": 0.32, + "epoch": 0.6479979817637943, + "grad_norm": 0.17093034088611603, + "learning_rate": 4.871160966486147e-05, + "loss": 0.4625, "step": 17980 }, { - "epoch": 0.63, - "learning_rate": 4.878837399893213e-05, - "loss": 0.334, + "epoch": 0.6481781814250189, + "grad_norm": 0.20973354578018188, + "learning_rate": 4.8710684794212966e-05, + "loss": 0.4308, "step": 17985 }, { - "epoch": 0.63, - "learning_rate": 4.8787497747037156e-05, - "loss": 0.3192, + "epoch": 0.6483583810862436, + "grad_norm": 0.19713053107261658, + "learning_rate": 4.870975960051098e-05, + "loss": 0.4244, "step": 17990 }, { - "epoch": 0.63, - "learning_rate": 4.8786621186277204e-05, - "loss": 0.2938, + "epoch": 0.6485385807474682, + "grad_norm": 0.18588168919086456, + "learning_rate": 4.8708834083768106e-05, + "loss": 0.4199, "step": 17995 }, { - "epoch": 0.63, - "learning_rate": 4.878574431666365e-05, - "loss": 0.314, + "epoch": 0.6487187804086928, + "grad_norm": 0.15173941850662231, + "learning_rate": 4.870790824399696e-05, + "loss": 0.3994, "step": 18000 }, { - "epoch": 0.63, - "eval_loss": 0.3048493266105652, - "eval_runtime": 10.5346, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 0.6487187804086928, + "eval_loss": 0.4651120901107788, + "eval_runtime": 3.5665, + "eval_samples_per_second": 28.038, + "eval_steps_per_second": 7.01, "step": 18000 }, { - "epoch": 0.63, - "learning_rate": 4.8784867138207877e-05, - "loss": 0.2992, + "epoch": 0.6488989800699174, + "grad_norm": 0.1769859790802002, + "learning_rate": 4.870698208121015e-05, + "loss": 0.4507, "step": 18005 }, { - "epoch": 0.63, - "learning_rate": 4.878398965092129e-05, - "loss": 0.3117, + "epoch": 0.6490791797311422, + "grad_norm": 0.19280551373958588, + "learning_rate": 4.870605559542031e-05, + "loss": 0.4139, "step": 18010 }, { - "epoch": 0.63, - "learning_rate": 4.878311185481526e-05, - "loss": 0.3081, + "epoch": 0.6492593793923668, + "grad_norm": 0.1606101095676422, + "learning_rate": 4.870512878664004e-05, + "loss": 0.4447, "step": 18015 }, { - "epoch": 0.63, - "learning_rate": 4.8782233749901205e-05, - "loss": 0.3108, + "epoch": 0.6494395790535914, + "grad_norm": 0.16047056019306183, + "learning_rate": 4.870420165488199e-05, + "loss": 0.4453, "step": 18020 }, { - "epoch": 0.63, - "learning_rate": 4.878135533619051e-05, - "loss": 0.3003, + "epoch": 0.649619778714816, + "grad_norm": 0.1686955988407135, + "learning_rate": 4.8703274200158765e-05, + "loss": 0.4609, "step": 18025 }, { - "epoch": 0.63, - "learning_rate": 4.8780476613694594e-05, - "loss": 0.3065, + "epoch": 0.6497999783760406, + "grad_norm": 0.15635234117507935, + "learning_rate": 4.8702346422483035e-05, + "loss": 0.4296, "step": 18030 }, { - "epoch": 0.63, - "learning_rate": 4.877959758242486e-05, - "loss": 0.307, + "epoch": 0.6499801780372653, + "grad_norm": 0.158394455909729, + "learning_rate": 4.870141832186742e-05, + "loss": 0.447, "step": 18035 }, { - "epoch": 0.63, - "learning_rate": 4.877871824239272e-05, - "loss": 0.3209, + 
"epoch": 0.6501603776984899, + "grad_norm": 0.16858071088790894, + "learning_rate": 4.870048989832456e-05, + "loss": 0.4233, "step": 18040 }, { - "epoch": 0.63, - "learning_rate": 4.877783859360959e-05, - "loss": 0.2935, + "epoch": 0.6503405773597145, + "grad_norm": 0.11584287881851196, + "learning_rate": 4.869956115186712e-05, + "loss": 0.3873, "step": 18045 }, { - "epoch": 0.64, - "learning_rate": 4.87769586360869e-05, - "loss": 0.3417, + "epoch": 0.6505207770209392, + "grad_norm": 0.15428020060062408, + "learning_rate": 4.8698632082507754e-05, + "loss": 0.4426, "step": 18050 }, { - "epoch": 0.64, - "learning_rate": 4.877607836983608e-05, - "loss": 0.3528, + "epoch": 0.6507009766821639, + "grad_norm": 0.2866957187652588, + "learning_rate": 4.869770269025911e-05, + "loss": 0.4639, "step": 18055 }, { - "epoch": 0.64, - "learning_rate": 4.8775197794868544e-05, - "loss": 0.3162, + "epoch": 0.6508811763433885, + "grad_norm": 0.21071985363960266, + "learning_rate": 4.8696772975133854e-05, + "loss": 0.4842, "step": 18060 }, { - "epoch": 0.64, - "learning_rate": 4.8774316911195736e-05, - "loss": 0.302, + "epoch": 0.6510613760046131, + "grad_norm": 0.17984484136104584, + "learning_rate": 4.869584293714465e-05, + "loss": 0.4272, "step": 18065 }, { - "epoch": 0.64, - "learning_rate": 4.877343571882909e-05, - "loss": 0.3097, + "epoch": 0.6512415756658377, + "grad_norm": 0.20826445519924164, + "learning_rate": 4.8694912576304175e-05, + "loss": 0.4561, "step": 18070 }, { - "epoch": 0.64, - "learning_rate": 4.877255421778004e-05, - "loss": 0.3214, + "epoch": 0.6514217753270624, + "grad_norm": 0.14839668571949005, + "learning_rate": 4.8693981892625105e-05, + "loss": 0.445, "step": 18075 }, { - "epoch": 0.64, - "learning_rate": 4.8771672408060056e-05, - "loss": 0.3215, + "epoch": 0.651601974988287, + "grad_norm": 0.15876393020153046, + "learning_rate": 4.869305088612012e-05, + "loss": 0.4377, "step": 18080 }, { - "epoch": 0.64, - "learning_rate": 4.8770790289680555e-05, - "loss": 0.321, + "epoch": 0.6517821746495117, + "grad_norm": 0.20614424347877502, + "learning_rate": 4.8692119556801905e-05, + "loss": 0.4499, "step": 18085 }, { - "epoch": 0.64, - "learning_rate": 4.876990786265302e-05, - "loss": 0.3224, + "epoch": 0.6519623743107363, + "grad_norm": 0.17227613925933838, + "learning_rate": 4.8691187904683145e-05, + "loss": 0.4511, "step": 18090 }, { - "epoch": 0.64, - "learning_rate": 4.8769025126988896e-05, - "loss": 0.3153, + "epoch": 0.652142573971961, + "grad_norm": 0.15382865071296692, + "learning_rate": 4.869025592977654e-05, + "loss": 0.4734, "step": 18095 }, { - "epoch": 0.64, - "learning_rate": 4.876814208269964e-05, - "loss": 0.33, + "epoch": 0.6523227736331856, + "grad_norm": 0.1578940749168396, + "learning_rate": 4.8689323632094784e-05, + "loss": 0.4353, "step": 18100 }, { - "epoch": 0.64, - "learning_rate": 4.8767258729796725e-05, - "loss": 0.3092, + "epoch": 0.6525029732944102, + "grad_norm": 0.17838072776794434, + "learning_rate": 4.868839101165058e-05, + "loss": 0.4476, "step": 18105 }, { - "epoch": 0.64, - "learning_rate": 4.876637506829162e-05, - "loss": 0.3139, + "epoch": 0.6526831729556348, + "grad_norm": 0.16446293890476227, + "learning_rate": 4.868745806845663e-05, + "loss": 0.4486, "step": 18110 }, { - "epoch": 0.64, - "learning_rate": 4.876549109819579e-05, - "loss": 0.3209, + "epoch": 0.6528633726168594, + "grad_norm": 0.16984452307224274, + "learning_rate": 4.8686524802525656e-05, + "loss": 0.4135, "step": 18115 }, { - "epoch": 0.64, - "learning_rate": 4.876460681952073e-05, - 
"loss": 0.2928, + "epoch": 0.6530435722780841, + "grad_norm": 0.18492572009563446, + "learning_rate": 4.868559121387036e-05, + "loss": 0.465, "step": 18120 }, { - "epoch": 0.64, - "learning_rate": 4.876372223227791e-05, - "loss": 0.2933, + "epoch": 0.6532237719393088, + "grad_norm": 0.18226665258407593, + "learning_rate": 4.868465730250348e-05, + "loss": 0.452, "step": 18125 }, { - "epoch": 0.64, - "learning_rate": 4.876283733647881e-05, - "loss": 0.3381, + "epoch": 0.6534039716005334, + "grad_norm": 0.18231280148029327, + "learning_rate": 4.868372306843772e-05, + "loss": 0.4465, "step": 18130 }, { - "epoch": 0.64, - "learning_rate": 4.876195213213494e-05, - "loss": 0.331, + "epoch": 0.653584171261758, + "grad_norm": 0.22582538425922394, + "learning_rate": 4.868278851168582e-05, + "loss": 0.4061, "step": 18135 }, { - "epoch": 0.64, - "learning_rate": 4.876106661925777e-05, - "loss": 0.3399, + "epoch": 0.6537643709229827, + "grad_norm": 0.16542305052280426, + "learning_rate": 4.8681853632260524e-05, + "loss": 0.4386, "step": 18140 }, { - "epoch": 0.64, - "learning_rate": 4.8760180797858815e-05, - "loss": 0.3081, + "epoch": 0.6539445705842073, + "grad_norm": 0.21034276485443115, + "learning_rate": 4.868091843017454e-05, + "loss": 0.4536, "step": 18145 }, { - "epoch": 0.64, - "learning_rate": 4.8759294667949565e-05, - "loss": 0.3229, + "epoch": 0.6541247702454319, + "grad_norm": 0.20631127059459686, + "learning_rate": 4.867998290544064e-05, + "loss": 0.4516, "step": 18150 }, { - "epoch": 0.64, - "learning_rate": 4.875840822954154e-05, - "loss": 0.3046, + "epoch": 0.6543049699066565, + "grad_norm": 0.1725880652666092, + "learning_rate": 4.867904705807155e-05, + "loss": 0.4615, "step": 18155 }, { - "epoch": 0.64, - "learning_rate": 4.875752148264624e-05, - "loss": 0.3098, + "epoch": 0.6544851695678812, + "grad_norm": 0.14972399175167084, + "learning_rate": 4.8678110888080026e-05, + "loss": 0.4562, "step": 18160 }, { - "epoch": 0.64, - "learning_rate": 4.875663442727517e-05, - "loss": 0.3071, + "epoch": 0.6546653692291059, + "grad_norm": 0.1854199767112732, + "learning_rate": 4.8677174395478834e-05, + "loss": 0.4723, "step": 18165 }, { - "epoch": 0.64, - "learning_rate": 4.875574706343987e-05, - "loss": 0.305, + "epoch": 0.6548455688903305, + "grad_norm": 0.16778244078159332, + "learning_rate": 4.867623758028072e-05, + "loss": 0.4472, "step": 18170 }, { - "epoch": 0.64, - "learning_rate": 4.8754859391151844e-05, - "loss": 0.2952, + "epoch": 0.6550257685515551, + "grad_norm": 0.14149196445941925, + "learning_rate": 4.867530044249845e-05, + "loss": 0.4036, "step": 18175 }, { - "epoch": 0.64, - "learning_rate": 4.875397141042263e-05, - "loss": 0.3284, + "epoch": 0.6552059682127798, + "grad_norm": 0.15938983857631683, + "learning_rate": 4.86743629821448e-05, + "loss": 0.431, "step": 18180 }, { - "epoch": 0.64, - "learning_rate": 4.875308312126374e-05, - "loss": 0.3252, + "epoch": 0.6553861678740044, + "grad_norm": 0.16331376135349274, + "learning_rate": 4.867342519923254e-05, + "loss": 0.4418, "step": 18185 }, { - "epoch": 0.64, - "learning_rate": 4.8752194523686726e-05, - "loss": 0.2949, + "epoch": 0.655566367535229, + "grad_norm": 0.15382960438728333, + "learning_rate": 4.867248709377443e-05, + "loss": 0.4471, "step": 18190 }, { - "epoch": 0.64, - "learning_rate": 4.875130561770312e-05, - "loss": 0.3216, + "epoch": 0.6557465671964536, + "grad_norm": 0.1844521015882492, + "learning_rate": 4.8671548665783287e-05, + "loss": 0.4734, "step": 18195 }, { - "epoch": 0.64, - "learning_rate": 
4.875041640332446e-05, - "loss": 0.2878, + "epoch": 0.6559267668576783, + "grad_norm": 0.21769674122333527, + "learning_rate": 4.867060991527186e-05, + "loss": 0.4561, "step": 18200 }, { - "epoch": 0.64, - "learning_rate": 4.87495268805623e-05, - "loss": 0.2957, + "epoch": 0.656106966518903, + "grad_norm": 0.21248985826969147, + "learning_rate": 4.866967084225297e-05, + "loss": 0.4137, "step": 18205 }, { - "epoch": 0.64, - "learning_rate": 4.8748637049428185e-05, - "loss": 0.3343, + "epoch": 0.6562871661801276, + "grad_norm": 0.18385855853557587, + "learning_rate": 4.866873144673939e-05, + "loss": 0.4094, "step": 18210 }, { - "epoch": 0.64, - "learning_rate": 4.874774690993367e-05, - "loss": 0.2956, + "epoch": 0.6564673658413522, + "grad_norm": 0.13242194056510925, + "learning_rate": 4.866779172874392e-05, + "loss": 0.4741, "step": 18215 }, { - "epoch": 0.64, - "learning_rate": 4.8746856462090305e-05, - "loss": 0.3084, + "epoch": 0.6566475655025769, + "grad_norm": 0.17850540578365326, + "learning_rate": 4.866685168827938e-05, + "loss": 0.4539, "step": 18220 }, { - "epoch": 0.64, - "learning_rate": 4.874596570590967e-05, - "loss": 0.3132, + "epoch": 0.6568277651638015, + "grad_norm": 0.1694021075963974, + "learning_rate": 4.8665911325358556e-05, + "loss": 0.4531, "step": 18225 }, { - "epoch": 0.64, - "learning_rate": 4.874507464140331e-05, - "loss": 0.3079, + "epoch": 0.6570079648250261, + "grad_norm": 0.13871118426322937, + "learning_rate": 4.866497063999429e-05, + "loss": 0.4149, "step": 18230 }, { - "epoch": 0.64, - "learning_rate": 4.8744183268582804e-05, - "loss": 0.3015, + "epoch": 0.6571881644862507, + "grad_norm": 0.13802644610404968, + "learning_rate": 4.866402963219937e-05, + "loss": 0.4539, "step": 18235 }, { - "epoch": 0.64, - "learning_rate": 4.874329158745973e-05, - "loss": 0.3592, + "epoch": 0.6573683641474753, + "grad_norm": 0.17566382884979248, + "learning_rate": 4.8663088301986625e-05, + "loss": 0.4297, "step": 18240 }, { - "epoch": 0.64, - "learning_rate": 4.874239959804566e-05, - "loss": 0.3334, + "epoch": 0.6575485638087001, + "grad_norm": 0.21962518990039825, + "learning_rate": 4.866214664936889e-05, + "loss": 0.4451, "step": 18245 }, { - "epoch": 0.64, - "learning_rate": 4.874150730035218e-05, - "loss": 0.3284, + "epoch": 0.6577287634699247, + "grad_norm": 0.12450151145458221, + "learning_rate": 4.8661204674358984e-05, + "loss": 0.4547, "step": 18250 }, { - "epoch": 0.64, - "learning_rate": 4.874061469439087e-05, - "loss": 0.3008, + "epoch": 0.6579089631311493, + "grad_norm": 0.16591274738311768, + "learning_rate": 4.8660262376969745e-05, + "loss": 0.4615, "step": 18255 }, { - "epoch": 0.64, - "learning_rate": 4.873972178017333e-05, - "loss": 0.317, + "epoch": 0.658089162792374, + "grad_norm": 0.1811162382364273, + "learning_rate": 4.8659319757214016e-05, + "loss": 0.443, "step": 18260 }, { - "epoch": 0.64, - "learning_rate": 4.873882855771115e-05, - "loss": 0.3178, + "epoch": 0.6582693624535986, + "grad_norm": 0.17516489326953888, + "learning_rate": 4.865837681510463e-05, + "loss": 0.4069, "step": 18265 }, { - "epoch": 0.64, - "learning_rate": 4.8737935027015915e-05, - "loss": 0.3101, + "epoch": 0.6584495621148232, + "grad_norm": 0.16720084846019745, + "learning_rate": 4.865743355065444e-05, + "loss": 0.4127, "step": 18270 }, { - "epoch": 0.64, - "learning_rate": 4.873704118809924e-05, - "loss": 0.3289, + "epoch": 0.6586297617760478, + "grad_norm": 0.13735070824623108, + "learning_rate": 4.8656489963876305e-05, + "loss": 0.4367, "step": 18275 }, { - "epoch": 0.64, - 
"learning_rate": 4.8736147040972736e-05, - "loss": 0.2987, + "epoch": 0.6588099614372726, + "grad_norm": 0.15320982038974762, + "learning_rate": 4.865554605478308e-05, + "loss": 0.4592, "step": 18280 }, { - "epoch": 0.64, - "learning_rate": 4.8735252585647995e-05, - "loss": 0.3261, + "epoch": 0.6589901610984972, + "grad_norm": 0.2053997814655304, + "learning_rate": 4.8654601823387605e-05, + "loss": 0.4266, "step": 18285 }, { - "epoch": 0.64, - "learning_rate": 4.873435782213665e-05, - "loss": 0.3093, + "epoch": 0.6591703607597218, + "grad_norm": 0.1573738306760788, + "learning_rate": 4.8653657269702765e-05, + "loss": 0.4257, "step": 18290 }, { - "epoch": 0.64, - "learning_rate": 4.8733462750450306e-05, - "loss": 0.284, + "epoch": 0.6593505604209464, + "grad_norm": 0.1637280136346817, + "learning_rate": 4.865271239374142e-05, + "loss": 0.4559, "step": 18295 }, { - "epoch": 0.64, - "learning_rate": 4.873256737060059e-05, - "loss": 0.3229, + "epoch": 0.659530760082171, + "grad_norm": 0.2082606703042984, + "learning_rate": 4.865176719551645e-05, + "loss": 0.4498, "step": 18300 }, { - "epoch": 0.64, - "learning_rate": 4.8731671682599126e-05, - "loss": 0.3229, + "epoch": 0.6597109597433957, + "grad_norm": 0.1640523225069046, + "learning_rate": 4.8650821675040735e-05, + "loss": 0.4573, "step": 18305 }, { - "epoch": 0.64, - "learning_rate": 4.873077568645755e-05, - "loss": 0.3249, + "epoch": 0.6598911594046203, + "grad_norm": 0.130737766623497, + "learning_rate": 4.864987583232715e-05, + "loss": 0.4277, "step": 18310 }, { - "epoch": 0.64, - "learning_rate": 4.8729879382187485e-05, - "loss": 0.3393, + "epoch": 0.6600713590658449, + "grad_norm": 0.17735107243061066, + "learning_rate": 4.8648929667388596e-05, + "loss": 0.4758, "step": 18315 }, { - "epoch": 0.64, - "learning_rate": 4.872898276980058e-05, - "loss": 0.33, + "epoch": 0.6602515587270696, + "grad_norm": 0.180114284157753, + "learning_rate": 4.8647983180237945e-05, + "loss": 0.4557, "step": 18320 }, { - "epoch": 0.64, - "learning_rate": 4.872808584930847e-05, - "loss": 0.3207, + "epoch": 0.6604317583882943, + "grad_norm": 0.19766369462013245, + "learning_rate": 4.8647036370888096e-05, + "loss": 0.4227, "step": 18325 }, { - "epoch": 0.64, - "learning_rate": 4.87271886207228e-05, - "loss": 0.2978, + "epoch": 0.6606119580495189, + "grad_norm": 0.18029169738292694, + "learning_rate": 4.8646089239351966e-05, + "loss": 0.4243, "step": 18330 }, { - "epoch": 0.65, - "learning_rate": 4.872629108405522e-05, - "loss": 0.3273, + "epoch": 0.6607921577107435, + "grad_norm": 0.1884010136127472, + "learning_rate": 4.8645141785642444e-05, + "loss": 0.469, "step": 18335 }, { - "epoch": 0.65, - "learning_rate": 4.872539323931739e-05, - "loss": 0.2903, + "epoch": 0.6609723573719681, + "grad_norm": 0.17342260479927063, + "learning_rate": 4.864419400977244e-05, + "loss": 0.4331, "step": 18340 }, { - "epoch": 0.65, - "learning_rate": 4.8724495086520964e-05, - "loss": 0.3063, + "epoch": 0.6611525570331928, + "grad_norm": 0.15091748535633087, + "learning_rate": 4.864324591175487e-05, + "loss": 0.4205, "step": 18345 }, { - "epoch": 0.65, - "learning_rate": 4.872359662567761e-05, - "loss": 0.3177, + "epoch": 0.6613327566944174, + "grad_norm": 0.20120559632778168, + "learning_rate": 4.8642297491602654e-05, + "loss": 0.441, "step": 18350 }, { - "epoch": 0.65, - "learning_rate": 4.8722697856798985e-05, - "loss": 0.293, + "epoch": 0.661512956355642, + "grad_norm": 0.2035011202096939, + "learning_rate": 4.8641348749328716e-05, + "loss": 0.4648, "step": 18355 }, { - 
"epoch": 0.65, - "learning_rate": 4.8721798779896756e-05, - "loss": 0.3201, + "epoch": 0.6616931560168667, + "grad_norm": 0.1479252576828003, + "learning_rate": 4.864039968494597e-05, + "loss": 0.4378, "step": 18360 }, { - "epoch": 0.65, - "learning_rate": 4.872089939498261e-05, - "loss": 0.3389, + "epoch": 0.6618733556780914, + "grad_norm": 0.15773577988147736, + "learning_rate": 4.863945029846736e-05, + "loss": 0.4031, "step": 18365 }, { - "epoch": 0.65, - "learning_rate": 4.871999970206822e-05, - "loss": 0.297, + "epoch": 0.662053555339316, + "grad_norm": 0.20971988141536713, + "learning_rate": 4.863850058990582e-05, + "loss": 0.4773, "step": 18370 }, { - "epoch": 0.65, - "learning_rate": 4.871909970116526e-05, - "loss": 0.3382, + "epoch": 0.6622337550005406, + "grad_norm": 0.17060470581054688, + "learning_rate": 4.863755055927428e-05, + "loss": 0.4178, "step": 18375 }, { - "epoch": 0.65, - "learning_rate": 4.871819939228542e-05, - "loss": 0.3001, + "epoch": 0.6624139546617652, + "grad_norm": 0.22001157701015472, + "learning_rate": 4.863660020658569e-05, + "loss": 0.4637, "step": 18380 }, { - "epoch": 0.65, - "learning_rate": 4.871729877544039e-05, - "loss": 0.3313, + "epoch": 0.6625941543229898, + "grad_norm": 0.15214625000953674, + "learning_rate": 4.8635649531853e-05, + "loss": 0.4174, "step": 18385 }, { - "epoch": 0.65, - "learning_rate": 4.871639785064187e-05, - "loss": 0.3017, + "epoch": 0.6627743539842145, + "grad_norm": 0.15511523187160492, + "learning_rate": 4.863469853508916e-05, + "loss": 0.4156, "step": 18390 }, { - "epoch": 0.65, - "learning_rate": 4.871549661790156e-05, - "loss": 0.3424, + "epoch": 0.6629545536454391, + "grad_norm": 0.18479692935943604, + "learning_rate": 4.863374721630713e-05, + "loss": 0.4064, "step": 18395 }, { - "epoch": 0.65, - "learning_rate": 4.871459507723115e-05, - "loss": 0.3293, + "epoch": 0.6631347533066638, + "grad_norm": 0.3170281946659088, + "learning_rate": 4.8632795575519876e-05, + "loss": 0.4339, "step": 18400 }, { - "epoch": 0.65, - "learning_rate": 4.8713693228642356e-05, - "loss": 0.3195, + "epoch": 0.6633149529678884, + "grad_norm": 0.1785048097372055, + "learning_rate": 4.863184361274035e-05, + "loss": 0.4805, "step": 18405 }, { - "epoch": 0.65, - "learning_rate": 4.871279107214687e-05, - "loss": 0.3149, + "epoch": 0.6634951526291131, + "grad_norm": 0.1605699211359024, + "learning_rate": 4.863089132798153e-05, + "loss": 0.4641, "step": 18410 }, { - "epoch": 0.65, - "learning_rate": 4.871188860775643e-05, - "loss": 0.3221, + "epoch": 0.6636753522903377, + "grad_norm": 0.20723554491996765, + "learning_rate": 4.86299387212564e-05, + "loss": 0.4315, "step": 18415 }, { - "epoch": 0.65, - "learning_rate": 4.871098583548274e-05, - "loss": 0.2928, + "epoch": 0.6638555519515623, + "grad_norm": 0.14896468818187714, + "learning_rate": 4.862898579257792e-05, + "loss": 0.4462, "step": 18420 }, { - "epoch": 0.65, - "learning_rate": 4.871008275533753e-05, - "loss": 0.2977, + "epoch": 0.6640357516127869, + "grad_norm": 0.16965442895889282, + "learning_rate": 4.8628032541959086e-05, + "loss": 0.4679, "step": 18425 }, { - "epoch": 0.65, - "learning_rate": 4.8709179367332514e-05, - "loss": 0.3271, + "epoch": 0.6642159512740116, + "grad_norm": 0.1498372107744217, + "learning_rate": 4.8627078969412885e-05, + "loss": 0.43, "step": 18430 }, { - "epoch": 0.65, - "learning_rate": 4.8708275671479434e-05, - "loss": 0.3082, + "epoch": 0.6643961509352363, + "grad_norm": 0.1531539112329483, + "learning_rate": 4.86261250749523e-05, + "loss": 0.4028, "step": 18435 
}, { - "epoch": 0.65, - "learning_rate": 4.8707371667790016e-05, - "loss": 0.3163, + "epoch": 0.6645763505964609, + "grad_norm": 0.1347551792860031, + "learning_rate": 4.862517085859034e-05, + "loss": 0.3987, "step": 18440 }, { - "epoch": 0.65, - "learning_rate": 4.8706467356276e-05, - "loss": 0.2826, + "epoch": 0.6647565502576855, + "grad_norm": 0.2017221748828888, + "learning_rate": 4.862421632034e-05, + "loss": 0.4778, "step": 18445 }, { - "epoch": 0.65, - "learning_rate": 4.870556273694913e-05, - "loss": 0.3069, + "epoch": 0.6649367499189102, + "grad_norm": 0.16347235441207886, + "learning_rate": 4.8623261460214284e-05, + "loss": 0.4565, "step": 18450 }, { - "epoch": 0.65, - "learning_rate": 4.8704657809821154e-05, - "loss": 0.3196, + "epoch": 0.6651169495801348, + "grad_norm": 0.18009866774082184, + "learning_rate": 4.862230627822621e-05, + "loss": 0.4521, "step": 18455 }, { - "epoch": 0.65, - "learning_rate": 4.8703752574903814e-05, - "loss": 0.3224, + "epoch": 0.6652971492413594, + "grad_norm": 0.18370333313941956, + "learning_rate": 4.8621350774388784e-05, + "loss": 0.4353, "step": 18460 }, { - "epoch": 0.65, - "learning_rate": 4.870284703220887e-05, - "loss": 0.331, + "epoch": 0.665477348902584, + "grad_norm": 0.17444929480552673, + "learning_rate": 4.8620394948715034e-05, + "loss": 0.4615, "step": 18465 }, { - "epoch": 0.65, - "learning_rate": 4.870194118174807e-05, - "loss": 0.2804, + "epoch": 0.6656575485638087, + "grad_norm": 0.17793411016464233, + "learning_rate": 4.8619438801217966e-05, + "loss": 0.4321, "step": 18470 }, { - "epoch": 0.65, - "learning_rate": 4.8701035023533193e-05, - "loss": 0.3272, + "epoch": 0.6658377482250334, + "grad_norm": 0.1900653839111328, + "learning_rate": 4.861848233191062e-05, + "loss": 0.4412, "step": 18475 }, { - "epoch": 0.65, - "learning_rate": 4.8700128557576e-05, - "loss": 0.2817, + "epoch": 0.666017947886258, + "grad_norm": 0.191162571310997, + "learning_rate": 4.861752554080602e-05, + "loss": 0.433, "step": 18480 }, { - "epoch": 0.65, - "learning_rate": 4.869922178388824e-05, - "loss": 0.301, + "epoch": 0.6661981475474826, + "grad_norm": 0.1730828732252121, + "learning_rate": 4.861656842791722e-05, + "loss": 0.457, "step": 18485 }, { - "epoch": 0.65, - "learning_rate": 4.869831470248172e-05, - "loss": 0.3247, + "epoch": 0.6663783472087073, + "grad_norm": 0.1662280112504959, + "learning_rate": 4.861561099325723e-05, + "loss": 0.4409, "step": 18490 }, { - "epoch": 0.65, - "learning_rate": 4.869740731336819e-05, - "loss": 0.3042, + "epoch": 0.6665585468699319, + "grad_norm": 0.16253672540187836, + "learning_rate": 4.8614653236839125e-05, + "loss": 0.4723, "step": 18495 }, { - "epoch": 0.65, - "learning_rate": 4.869649961655946e-05, - "loss": 0.302, + "epoch": 0.6667387465311565, + "grad_norm": 0.17181946337223053, + "learning_rate": 4.861369515867594e-05, + "loss": 0.4322, "step": 18500 }, { - "epoch": 0.65, - "eval_loss": 0.3054150938987732, - "eval_runtime": 10.5494, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 0.6667387465311565, + "eval_loss": 0.4643363654613495, + "eval_runtime": 3.5289, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 7.084, "step": 18500 }, { - "epoch": 0.65, - "learning_rate": 4.869559161206728e-05, - "loss": 0.3064, + "epoch": 0.6669189461923811, + "grad_norm": 0.2049456685781479, + "learning_rate": 4.861273675878073e-05, + "loss": 0.4311, "step": 18505 }, { - "epoch": 0.65, - "learning_rate": 4.869468329990347e-05, - "loss": 0.3255, + "epoch": 0.6670991458536057, + 
"grad_norm": 0.1761494129896164, + "learning_rate": 4.861177803716655e-05, + "loss": 0.4485, "step": 18510 }, { - "epoch": 0.65, - "learning_rate": 4.8693774680079806e-05, - "loss": 0.3148, + "epoch": 0.6672793455148305, + "grad_norm": 0.17736728489398956, + "learning_rate": 4.861081899384647e-05, + "loss": 0.4739, "step": 18515 }, { - "epoch": 0.65, - "learning_rate": 4.86928657526081e-05, - "loss": 0.302, + "epoch": 0.6674595451760551, + "grad_norm": 0.19387714564800262, + "learning_rate": 4.860985962883355e-05, + "loss": 0.4827, "step": 18520 }, { - "epoch": 0.65, - "learning_rate": 4.869195651750014e-05, - "loss": 0.3206, + "epoch": 0.6676397448372797, + "grad_norm": 0.16782864928245544, + "learning_rate": 4.860889994214086e-05, + "loss": 0.4753, "step": 18525 }, { - "epoch": 0.65, - "learning_rate": 4.869104697476774e-05, - "loss": 0.3158, + "epoch": 0.6678199444985043, + "grad_norm": 0.17974188923835754, + "learning_rate": 4.860793993378149e-05, + "loss": 0.4338, "step": 18530 }, { - "epoch": 0.65, - "learning_rate": 4.869013712442271e-05, - "loss": 0.2796, + "epoch": 0.668000144159729, + "grad_norm": 0.1873011589050293, + "learning_rate": 4.86069796037685e-05, + "loss": 0.4763, "step": 18535 }, { - "epoch": 0.65, - "learning_rate": 4.868922696647685e-05, - "loss": 0.321, + "epoch": 0.6681803438209536, + "grad_norm": 0.1606231927871704, + "learning_rate": 4.8606018952114985e-05, + "loss": 0.4097, "step": 18540 }, { - "epoch": 0.65, - "learning_rate": 4.8688316500942e-05, - "loss": 0.3074, + "epoch": 0.6683605434821782, + "grad_norm": 0.19824917614459991, + "learning_rate": 4.8605057978834034e-05, + "loss": 0.4387, "step": 18545 }, { - "epoch": 0.65, - "learning_rate": 4.868740572782997e-05, - "loss": 0.2912, + "epoch": 0.6685407431434028, + "grad_norm": 0.16622477769851685, + "learning_rate": 4.860409668393873e-05, + "loss": 0.4453, "step": 18550 }, { - "epoch": 0.65, - "learning_rate": 4.868649464715259e-05, - "loss": 0.3335, + "epoch": 0.6687209428046276, + "grad_norm": 0.1473122388124466, + "learning_rate": 4.8603135067442184e-05, + "loss": 0.4171, "step": 18555 }, { - "epoch": 0.65, - "learning_rate": 4.868558325892168e-05, - "loss": 0.3184, + "epoch": 0.6689011424658522, + "grad_norm": 0.15606799721717834, + "learning_rate": 4.8602173129357496e-05, + "loss": 0.4392, "step": 18560 }, { - "epoch": 0.65, - "learning_rate": 4.8684671563149095e-05, - "loss": 0.3027, + "epoch": 0.6690813421270768, + "grad_norm": 0.2004939466714859, + "learning_rate": 4.8601210869697766e-05, + "loss": 0.4659, "step": 18565 }, { - "epoch": 0.65, - "learning_rate": 4.868375955984665e-05, - "loss": 0.3115, + "epoch": 0.6692615417883014, + "grad_norm": 0.17140799760818481, + "learning_rate": 4.8600248288476105e-05, + "loss": 0.4405, "step": 18570 }, { - "epoch": 0.65, - "learning_rate": 4.868284724902619e-05, - "loss": 0.3293, + "epoch": 0.6694417414495261, + "grad_norm": 0.20640818774700165, + "learning_rate": 4.859928538570564e-05, + "loss": 0.4663, "step": 18575 }, { - "epoch": 0.65, - "learning_rate": 4.868193463069958e-05, - "loss": 0.3297, + "epoch": 0.6696219411107507, + "grad_norm": 0.15983538329601288, + "learning_rate": 4.859832216139947e-05, + "loss": 0.4411, "step": 18580 }, { - "epoch": 0.65, - "learning_rate": 4.8681021704878634e-05, - "loss": 0.3219, + "epoch": 0.6698021407719753, + "grad_norm": 0.18739761412143707, + "learning_rate": 4.8597358615570734e-05, + "loss": 0.4453, "step": 18585 }, { - "epoch": 0.65, - "learning_rate": 4.868010847157525e-05, - "loss": 0.3026, + "epoch": 
0.6699823404332, + "grad_norm": 0.2296430617570877, + "learning_rate": 4.8596394748232556e-05, + "loss": 0.4441, "step": 18590 }, { - "epoch": 0.65, - "learning_rate": 4.867919493080125e-05, - "loss": 0.3164, + "epoch": 0.6701625400944247, + "grad_norm": 0.1832958459854126, + "learning_rate": 4.8595430559398056e-05, + "loss": 0.4621, "step": 18595 }, { - "epoch": 0.65, - "learning_rate": 4.867828108256851e-05, - "loss": 0.3117, + "epoch": 0.6703427397556493, + "grad_norm": 0.2158745378255844, + "learning_rate": 4.85944660490804e-05, + "loss": 0.4708, "step": 18600 }, { - "epoch": 0.65, - "learning_rate": 4.86773669268889e-05, - "loss": 0.2914, + "epoch": 0.6705229394168739, + "grad_norm": 0.16886022686958313, + "learning_rate": 4.85935012172927e-05, + "loss": 0.4215, "step": 18605 }, { - "epoch": 0.65, - "learning_rate": 4.867645246377428e-05, - "loss": 0.3312, + "epoch": 0.6707031390780985, + "grad_norm": 0.15665309131145477, + "learning_rate": 4.859253606404812e-05, + "loss": 0.4263, "step": 18610 }, { - "epoch": 0.65, - "learning_rate": 4.867553769323653e-05, - "loss": 0.3089, + "epoch": 0.6708833387393232, + "grad_norm": 0.1781836897134781, + "learning_rate": 4.85915705893598e-05, + "loss": 0.4506, "step": 18615 }, { - "epoch": 0.66, - "learning_rate": 4.867462261528752e-05, - "loss": 0.3158, + "epoch": 0.6710635384005478, + "grad_norm": 0.18428124487400055, + "learning_rate": 4.85906047932409e-05, + "loss": 0.4865, "step": 18620 }, { - "epoch": 0.66, - "learning_rate": 4.867370722993915e-05, - "loss": 0.3053, + "epoch": 0.6712437380617724, + "grad_norm": 0.1660621613264084, + "learning_rate": 4.858963867570458e-05, + "loss": 0.4609, "step": 18625 }, { - "epoch": 0.66, - "learning_rate": 4.8672791537203286e-05, - "loss": 0.3143, + "epoch": 0.6714239377229971, + "grad_norm": 0.1916099488735199, + "learning_rate": 4.8588672236764e-05, + "loss": 0.4376, "step": 18630 }, { - "epoch": 0.66, - "learning_rate": 4.8671875537091824e-05, - "loss": 0.333, + "epoch": 0.6716041373842218, + "grad_norm": 0.1742095947265625, + "learning_rate": 4.858770547643232e-05, + "loss": 0.4813, "step": 18635 }, { - "epoch": 0.66, - "learning_rate": 4.867095922961666e-05, - "loss": 0.2965, + "epoch": 0.6717843370454464, + "grad_norm": 0.1986844837665558, + "learning_rate": 4.858673839472273e-05, + "loss": 0.447, "step": 18640 }, { - "epoch": 0.66, - "learning_rate": 4.867004261478969e-05, - "loss": 0.2995, + "epoch": 0.671964536706671, + "grad_norm": 0.168625608086586, + "learning_rate": 4.858577099164839e-05, + "loss": 0.4426, "step": 18645 }, { - "epoch": 0.66, - "learning_rate": 4.866912569262282e-05, - "loss": 0.3155, + "epoch": 0.6721447363678956, + "grad_norm": 0.18472012877464294, + "learning_rate": 4.858480326722249e-05, + "loss": 0.4643, "step": 18650 }, { - "epoch": 0.66, - "learning_rate": 4.866820846312794e-05, - "loss": 0.3045, + "epoch": 0.6723249360291202, + "grad_norm": 0.1810387521982193, + "learning_rate": 4.858383522145821e-05, + "loss": 0.4517, "step": 18655 }, { - "epoch": 0.66, - "learning_rate": 4.866729092631698e-05, - "loss": 0.2974, + "epoch": 0.6725051356903449, + "grad_norm": 0.18452127277851105, + "learning_rate": 4.858286685436873e-05, + "loss": 0.4554, "step": 18660 }, { - "epoch": 0.66, - "learning_rate": 4.866637308220184e-05, - "loss": 0.3212, + "epoch": 0.6726853353515695, + "grad_norm": 0.15459772944450378, + "learning_rate": 4.8581898165967274e-05, + "loss": 0.4355, "step": 18665 }, { - "epoch": 0.66, - "learning_rate": 4.8665454930794454e-05, - "loss": 0.3561, + "epoch": 
0.6728655350127942, + "grad_norm": 0.1914420872926712, + "learning_rate": 4.858092915626701e-05, + "loss": 0.3937, "step": 18670 }, { - "epoch": 0.66, - "learning_rate": 4.8664536472106715e-05, - "loss": 0.2988, + "epoch": 0.6730457346740188, + "grad_norm": 0.15395872294902802, + "learning_rate": 4.8579959825281155e-05, + "loss": 0.4635, "step": 18675 }, { - "epoch": 0.66, - "learning_rate": 4.866361770615058e-05, - "loss": 0.2998, + "epoch": 0.6732259343352435, + "grad_norm": 0.13936087489128113, + "learning_rate": 4.857899017302291e-05, + "loss": 0.4779, "step": 18680 }, { - "epoch": 0.66, - "learning_rate": 4.8662698632937955e-05, - "loss": 0.3376, + "epoch": 0.6734061339964681, + "grad_norm": 0.14681056141853333, + "learning_rate": 4.8578020199505495e-05, + "loss": 0.4428, "step": 18685 }, { - "epoch": 0.66, - "learning_rate": 4.866177925248078e-05, - "loss": 0.3156, + "epoch": 0.6735863336576927, + "grad_norm": 0.13621117174625397, + "learning_rate": 4.857704990474211e-05, + "loss": 0.4088, "step": 18690 }, { - "epoch": 0.66, - "learning_rate": 4.8660859564791006e-05, - "loss": 0.3197, + "epoch": 0.6737665333189173, + "grad_norm": 0.1684703528881073, + "learning_rate": 4.857607928874599e-05, + "loss": 0.4154, "step": 18695 }, { - "epoch": 0.66, - "learning_rate": 4.865993956988055e-05, - "loss": 0.3498, + "epoch": 0.673946732980142, + "grad_norm": 0.1323196440935135, + "learning_rate": 4.857510835153035e-05, + "loss": 0.4315, "step": 18700 }, { - "epoch": 0.66, - "learning_rate": 4.865901926776139e-05, - "loss": 0.3049, + "epoch": 0.6741269326413666, + "grad_norm": 0.16398736834526062, + "learning_rate": 4.857413709310843e-05, + "loss": 0.4505, "step": 18705 }, { - "epoch": 0.66, - "learning_rate": 4.865809865844544e-05, - "loss": 0.3001, + "epoch": 0.6743071323025913, + "grad_norm": 0.18038709461688995, + "learning_rate": 4.857316551349345e-05, + "loss": 0.4387, "step": 18710 }, { - "epoch": 0.66, - "learning_rate": 4.865717774194467e-05, - "loss": 0.3121, + "epoch": 0.6744873319638159, + "grad_norm": 0.169601172208786, + "learning_rate": 4.8572193612698656e-05, + "loss": 0.4301, "step": 18715 }, { - "epoch": 0.66, - "learning_rate": 4.865625651827105e-05, - "loss": 0.308, + "epoch": 0.6746675316250406, + "grad_norm": 0.19574612379074097, + "learning_rate": 4.857122139073729e-05, + "loss": 0.4735, "step": 18720 }, { - "epoch": 0.66, - "learning_rate": 4.865533498743652e-05, - "loss": 0.3274, + "epoch": 0.6748477312862652, + "grad_norm": 0.19240935146808624, + "learning_rate": 4.857024884762259e-05, + "loss": 0.4467, "step": 18725 }, { - "epoch": 0.66, - "learning_rate": 4.8654413149453056e-05, - "loss": 0.3408, + "epoch": 0.6750279309474898, + "grad_norm": 0.16014917194843292, + "learning_rate": 4.856927598336782e-05, + "loss": 0.4609, "step": 18730 }, { - "epoch": 0.66, - "learning_rate": 4.865349100433263e-05, - "loss": 0.3402, + "epoch": 0.6752081306087144, + "grad_norm": 0.1911192089319229, + "learning_rate": 4.856830279798623e-05, + "loss": 0.5029, "step": 18735 }, { - "epoch": 0.66, - "learning_rate": 4.8652568552087206e-05, - "loss": 0.3118, + "epoch": 0.675388330269939, + "grad_norm": 0.1548725664615631, + "learning_rate": 4.856732929149107e-05, + "loss": 0.4556, "step": 18740 }, { - "epoch": 0.66, - "learning_rate": 4.865164579272877e-05, - "loss": 0.3159, + "epoch": 0.6755685299311637, + "grad_norm": 0.16711898148059845, + "learning_rate": 4.856635546389562e-05, + "loss": 0.4449, "step": 18745 }, { - "epoch": 0.66, - "learning_rate": 4.86507227262693e-05, - "loss": 0.3268, 
+ "epoch": 0.6757487295923884, + "grad_norm": 0.18358518183231354, + "learning_rate": 4.856538131521313e-05, + "loss": 0.4313, "step": 18750 }, { - "epoch": 0.66, - "learning_rate": 4.864979935272078e-05, - "loss": 0.313, + "epoch": 0.675928929253613, + "grad_norm": 0.18038628995418549, + "learning_rate": 4.8564406845456885e-05, + "loss": 0.4427, "step": 18755 }, { - "epoch": 0.66, - "learning_rate": 4.8648875672095205e-05, - "loss": 0.274, + "epoch": 0.6761091289148377, + "grad_norm": 0.16218414902687073, + "learning_rate": 4.8563432054640155e-05, + "loss": 0.4352, "step": 18760 }, { - "epoch": 0.66, - "learning_rate": 4.864795168440456e-05, - "loss": 0.2841, + "epoch": 0.6762893285760623, + "grad_norm": 0.1689649075269699, + "learning_rate": 4.856245694277623e-05, + "loss": 0.4558, "step": 18765 }, { - "epoch": 0.66, - "learning_rate": 4.864702738966085e-05, - "loss": 0.3536, + "epoch": 0.6764695282372869, + "grad_norm": 0.2027473747730255, + "learning_rate": 4.8561481509878395e-05, + "loss": 0.4304, "step": 18770 }, { - "epoch": 0.66, - "learning_rate": 4.864610278787607e-05, - "loss": 0.3068, + "epoch": 0.6766497278985115, + "grad_norm": 0.18545962870121002, + "learning_rate": 4.856050575595993e-05, + "loss": 0.4456, "step": 18775 }, { - "epoch": 0.66, - "learning_rate": 4.8645177879062235e-05, - "loss": 0.3262, + "epoch": 0.6768299275597361, + "grad_norm": 0.17125312983989716, + "learning_rate": 4.8559529681034135e-05, + "loss": 0.4816, "step": 18780 }, { - "epoch": 0.66, - "learning_rate": 4.8644252663231344e-05, - "loss": 0.2929, + "epoch": 0.6770101272209609, + "grad_norm": 0.14758311212062836, + "learning_rate": 4.855855328511432e-05, + "loss": 0.458, "step": 18785 }, { - "epoch": 0.66, - "learning_rate": 4.864332714039541e-05, - "loss": 0.3021, + "epoch": 0.6771903268821855, + "grad_norm": 0.14200688898563385, + "learning_rate": 4.8557576568213755e-05, + "loss": 0.4096, "step": 18790 }, { - "epoch": 0.66, - "learning_rate": 4.864240131056646e-05, - "loss": 0.3201, + "epoch": 0.6773705265434101, + "grad_norm": 0.15297238528728485, + "learning_rate": 4.855659953034579e-05, + "loss": 0.3955, "step": 18795 }, { - "epoch": 0.66, - "learning_rate": 4.8641475173756514e-05, - "loss": 0.307, + "epoch": 0.6775507262046347, + "grad_norm": 0.17643597722053528, + "learning_rate": 4.855562217152371e-05, + "loss": 0.4581, "step": 18800 }, { - "epoch": 0.66, - "learning_rate": 4.864054872997759e-05, - "loss": 0.3242, + "epoch": 0.6777309258658594, + "grad_norm": 0.15438847243785858, + "learning_rate": 4.8554644491760845e-05, + "loss": 0.4511, "step": 18805 }, { - "epoch": 0.66, - "learning_rate": 4.863962197924172e-05, - "loss": 0.3069, + "epoch": 0.677911125527084, + "grad_norm": 0.17449019849300385, + "learning_rate": 4.8553666491070505e-05, + "loss": 0.4128, "step": 18810 }, { - "epoch": 0.66, - "learning_rate": 4.863869492156093e-05, - "loss": 0.3135, + "epoch": 0.6780913251883086, + "grad_norm": 0.180333212018013, + "learning_rate": 4.855268816946601e-05, + "loss": 0.4442, "step": 18815 }, { - "epoch": 0.66, - "learning_rate": 4.863776755694728e-05, - "loss": 0.324, + "epoch": 0.6782715248495332, + "grad_norm": 0.1781253069639206, + "learning_rate": 4.855170952696071e-05, + "loss": 0.4484, "step": 18820 }, { - "epoch": 0.66, - "learning_rate": 4.863683988541279e-05, - "loss": 0.3285, + "epoch": 0.678451724510758, + "grad_norm": 0.15952306985855103, + "learning_rate": 4.855073056356793e-05, + "loss": 0.4409, "step": 18825 }, { - "epoch": 0.66, - "learning_rate": 4.86359119069695e-05, - 
"loss": 0.2937, + "epoch": 0.6786319241719826, + "grad_norm": 0.18420594930648804, + "learning_rate": 4.854975127930099e-05, + "loss": 0.4467, "step": 18830 }, { - "epoch": 0.66, - "learning_rate": 4.863498362162948e-05, - "loss": 0.3272, + "epoch": 0.6788121238332072, + "grad_norm": 0.15397235751152039, + "learning_rate": 4.854877167417327e-05, + "loss": 0.4439, "step": 18835 }, { - "epoch": 0.66, - "learning_rate": 4.863405502940478e-05, - "loss": 0.2852, + "epoch": 0.6789923234944318, + "grad_norm": 0.15009304881095886, + "learning_rate": 4.854779174819807e-05, + "loss": 0.4126, "step": 18840 }, { - "epoch": 0.66, - "learning_rate": 4.863312613030744e-05, - "loss": 0.2904, + "epoch": 0.6791725231556565, + "grad_norm": 0.17719674110412598, + "learning_rate": 4.8546811501388784e-05, + "loss": 0.4565, "step": 18845 }, { - "epoch": 0.66, - "learning_rate": 4.863219692434954e-05, - "loss": 0.3309, + "epoch": 0.6793527228168811, + "grad_norm": 0.1648426502943039, + "learning_rate": 4.854583093375875e-05, + "loss": 0.4347, "step": 18850 }, { - "epoch": 0.66, - "learning_rate": 4.863126741154313e-05, - "loss": 0.3213, + "epoch": 0.6795329224781057, + "grad_norm": 0.14466212689876556, + "learning_rate": 4.8544850045321324e-05, + "loss": 0.3926, "step": 18855 }, { - "epoch": 0.66, - "learning_rate": 4.863033759190029e-05, - "loss": 0.3233, + "epoch": 0.6797131221393303, + "grad_norm": 0.1948980838060379, + "learning_rate": 4.8543868836089865e-05, + "loss": 0.4354, "step": 18860 }, { - "epoch": 0.66, - "learning_rate": 4.862940746543308e-05, - "loss": 0.3006, + "epoch": 0.6798933218005551, + "grad_norm": 0.1797015517950058, + "learning_rate": 4.8542887306077765e-05, + "loss": 0.4419, "step": 18865 }, { - "epoch": 0.66, - "learning_rate": 4.862847703215361e-05, - "loss": 0.3136, + "epoch": 0.6800735214617797, + "grad_norm": 0.12938088178634644, + "learning_rate": 4.8541905455298373e-05, + "loss": 0.4491, "step": 18870 }, { - "epoch": 0.66, - "learning_rate": 4.8627546292073915e-05, - "loss": 0.3159, + "epoch": 0.6802537211230043, + "grad_norm": 0.21112160384655, + "learning_rate": 4.854092328376509e-05, + "loss": 0.4418, "step": 18875 }, { - "epoch": 0.66, - "learning_rate": 4.862661524520611e-05, - "loss": 0.2867, + "epoch": 0.6804339207842289, + "grad_norm": 0.18489904701709747, + "learning_rate": 4.853994079149128e-05, + "loss": 0.4395, "step": 18880 }, { - "epoch": 0.66, - "learning_rate": 4.862568389156228e-05, - "loss": 0.3126, + "epoch": 0.6806141204454536, + "grad_norm": 0.15210841596126556, + "learning_rate": 4.853895797849033e-05, + "loss": 0.4479, "step": 18885 }, { - "epoch": 0.66, - "learning_rate": 4.8624752231154517e-05, - "loss": 0.322, + "epoch": 0.6807943201066782, + "grad_norm": 0.15210148692131042, + "learning_rate": 4.8537974844775636e-05, + "loss": 0.3912, "step": 18890 }, { - "epoch": 0.66, - "learning_rate": 4.86238202639949e-05, - "loss": 0.311, + "epoch": 0.6809745197679028, + "grad_norm": 0.1489754021167755, + "learning_rate": 4.85369913903606e-05, + "loss": 0.465, "step": 18895 }, { - "epoch": 0.66, - "learning_rate": 4.862288799009556e-05, - "loss": 0.3173, + "epoch": 0.6811547194291274, + "grad_norm": 0.20377914607524872, + "learning_rate": 4.85360076152586e-05, + "loss": 0.454, "step": 18900 }, { - "epoch": 0.67, - "learning_rate": 4.862195540946858e-05, - "loss": 0.2893, + "epoch": 0.6813349190903522, + "grad_norm": 0.20608647167682648, + "learning_rate": 4.8535023519483055e-05, + "loss": 0.4495, "step": 18905 }, { - "epoch": 0.67, - "learning_rate": 
4.862102252212608e-05, - "loss": 0.305, + "epoch": 0.6815151187515768, + "grad_norm": 0.1873638778924942, + "learning_rate": 4.853403910304738e-05, + "loss": 0.4436, "step": 18910 }, { - "epoch": 0.67, - "learning_rate": 4.8620089328080176e-05, - "loss": 0.299, + "epoch": 0.6816953184128014, + "grad_norm": 0.16861487925052643, + "learning_rate": 4.853305436596497e-05, + "loss": 0.4342, "step": 18915 }, { - "epoch": 0.67, - "learning_rate": 4.861915582734297e-05, - "loss": 0.2985, + "epoch": 0.681875518074026, + "grad_norm": 0.14870557188987732, + "learning_rate": 4.853206930824925e-05, + "loss": 0.4825, "step": 18920 }, { - "epoch": 0.67, - "learning_rate": 4.861822201992659e-05, - "loss": 0.3384, + "epoch": 0.6820557177352506, + "grad_norm": 0.16530665755271912, + "learning_rate": 4.853108392991366e-05, + "loss": 0.4248, "step": 18925 }, { - "epoch": 0.67, - "learning_rate": 4.861728790584317e-05, - "loss": 0.3381, + "epoch": 0.6822359173964753, + "grad_norm": 0.15075165033340454, + "learning_rate": 4.8530098230971586e-05, + "loss": 0.4616, "step": 18930 }, { - "epoch": 0.67, - "learning_rate": 4.861635348510482e-05, - "loss": 0.3576, + "epoch": 0.6824161170576999, + "grad_norm": 0.16559144854545593, + "learning_rate": 4.852911221143649e-05, + "loss": 0.4233, "step": 18935 }, { - "epoch": 0.67, - "learning_rate": 4.861541875772369e-05, - "loss": 0.2984, + "epoch": 0.6825963167189246, + "grad_norm": 0.17002665996551514, + "learning_rate": 4.852812587132178e-05, + "loss": 0.448, "step": 18940 }, { - "epoch": 0.67, - "learning_rate": 4.861448372371191e-05, - "loss": 0.2902, + "epoch": 0.6827765163801492, + "grad_norm": 0.14457158744335175, + "learning_rate": 4.8527139210640924e-05, + "loss": 0.4475, "step": 18945 }, { - "epoch": 0.67, - "learning_rate": 4.861354838308162e-05, - "loss": 0.2975, + "epoch": 0.6829567160413739, + "grad_norm": 0.18576878309249878, + "learning_rate": 4.852615222940735e-05, + "loss": 0.432, "step": 18950 }, { - "epoch": 0.67, - "learning_rate": 4.861261273584497e-05, - "loss": 0.3032, + "epoch": 0.6831369157025985, + "grad_norm": 0.20716910064220428, + "learning_rate": 4.852516492763451e-05, + "loss": 0.4773, "step": 18955 }, { - "epoch": 0.67, - "learning_rate": 4.8611676782014104e-05, - "loss": 0.3307, + "epoch": 0.6833171153638231, + "grad_norm": 0.1403721570968628, + "learning_rate": 4.852417730533585e-05, + "loss": 0.4248, "step": 18960 }, { - "epoch": 0.67, - "learning_rate": 4.861074052160117e-05, - "loss": 0.3202, + "epoch": 0.6834973150250477, + "grad_norm": 0.1828588992357254, + "learning_rate": 4.852318936252482e-05, + "loss": 0.4335, "step": 18965 }, { - "epoch": 0.67, - "learning_rate": 4.860980395461834e-05, - "loss": 0.3167, + "epoch": 0.6836775146862724, + "grad_norm": 0.18035824596881866, + "learning_rate": 4.8522201099214904e-05, + "loss": 0.4083, "step": 18970 }, { - "epoch": 0.67, - "learning_rate": 4.860886708107776e-05, - "loss": 0.3472, + "epoch": 0.683857714347497, + "grad_norm": 0.16249680519104004, + "learning_rate": 4.852121251541955e-05, + "loss": 0.4238, "step": 18975 }, { - "epoch": 0.67, - "learning_rate": 4.8607929900991594e-05, - "loss": 0.3341, + "epoch": 0.6840379140087217, + "grad_norm": 0.1821296066045761, + "learning_rate": 4.8520223611152215e-05, + "loss": 0.4195, "step": 18980 }, { - "epoch": 0.67, - "learning_rate": 4.860699241437202e-05, - "loss": 0.3142, + "epoch": 0.6842181136699463, + "grad_norm": 0.16044920682907104, + "learning_rate": 4.851923438642639e-05, + "loss": 0.4541, "step": 18985 }, { - "epoch": 0.67, - 
"learning_rate": 4.860605462123121e-05, - "loss": 0.327, + "epoch": 0.684398313331171, + "grad_norm": 0.169897198677063, + "learning_rate": 4.851824484125556e-05, + "loss": 0.4616, "step": 18990 }, { - "epoch": 0.67, - "learning_rate": 4.860511652158134e-05, - "loss": 0.3138, + "epoch": 0.6845785129923956, + "grad_norm": 0.17817699909210205, + "learning_rate": 4.851725497565319e-05, + "loss": 0.4653, "step": 18995 }, { - "epoch": 0.67, - "learning_rate": 4.860417811543458e-05, - "loss": 0.3034, + "epoch": 0.6847587126536202, + "grad_norm": 0.19420120120048523, + "learning_rate": 4.851626478963278e-05, + "loss": 0.4647, "step": 19000 }, { - "epoch": 0.67, - "eval_loss": 0.30461186170578003, - "eval_runtime": 10.5328, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 0.6847587126536202, + "eval_loss": 0.46408936381340027, + "eval_runtime": 3.5332, + "eval_samples_per_second": 28.303, + "eval_steps_per_second": 7.076, "step": 19000 }, { - "epoch": 0.67, - "learning_rate": 4.860323940280313e-05, - "loss": 0.3134, + "epoch": 0.6849389123148448, + "grad_norm": 0.18936601281166077, + "learning_rate": 4.851527428320781e-05, + "loss": 0.4437, "step": 19005 }, { - "epoch": 0.67, - "learning_rate": 4.860230038369917e-05, - "loss": 0.3235, + "epoch": 0.6851191119760695, + "grad_norm": 0.16841618716716766, + "learning_rate": 4.8514283456391785e-05, + "loss": 0.4336, "step": 19010 }, { - "epoch": 0.67, - "learning_rate": 4.860136105813489e-05, - "loss": 0.3028, + "epoch": 0.6852993116372941, + "grad_norm": 0.16009177267551422, + "learning_rate": 4.8513292309198197e-05, + "loss": 0.4376, "step": 19015 }, { - "epoch": 0.67, - "learning_rate": 4.86004214261225e-05, - "loss": 0.3016, + "epoch": 0.6854795112985188, + "grad_norm": 0.16848459839820862, + "learning_rate": 4.851230084164056e-05, + "loss": 0.4548, "step": 19020 }, { - "epoch": 0.67, - "learning_rate": 4.859948148767418e-05, - "loss": 0.2898, + "epoch": 0.6856597109597434, + "grad_norm": 0.1737171709537506, + "learning_rate": 4.851130905373237e-05, + "loss": 0.4012, "step": 19025 }, { - "epoch": 0.67, - "learning_rate": 4.8598541242802155e-05, - "loss": 0.2882, + "epoch": 0.685839910620968, + "grad_norm": 0.1548008769750595, + "learning_rate": 4.851031694548714e-05, + "loss": 0.4344, "step": 19030 }, { - "epoch": 0.67, - "learning_rate": 4.859760069151862e-05, - "loss": 0.2995, + "epoch": 0.6860201102821927, + "grad_norm": 0.17847006022930145, + "learning_rate": 4.85093245169184e-05, + "loss": 0.4582, "step": 19035 }, { - "epoch": 0.67, - "learning_rate": 4.859665983383579e-05, - "loss": 0.309, + "epoch": 0.6862003099434173, + "grad_norm": 0.17521251738071442, + "learning_rate": 4.8508331768039674e-05, + "loss": 0.436, "step": 19040 }, { - "epoch": 0.67, - "learning_rate": 4.8595718669765874e-05, - "loss": 0.3045, + "epoch": 0.6863805096046419, + "grad_norm": 0.1900710016489029, + "learning_rate": 4.850733869886446e-05, + "loss": 0.4429, "step": 19045 }, { - "epoch": 0.67, - "learning_rate": 4.859477719932112e-05, - "loss": 0.2974, + "epoch": 0.6865607092658665, + "grad_norm": 0.22452399134635925, + "learning_rate": 4.850634530940632e-05, + "loss": 0.4393, "step": 19050 }, { - "epoch": 0.67, - "learning_rate": 4.859383542251372e-05, - "loss": 0.322, + "epoch": 0.6867409089270912, + "grad_norm": 0.1778751015663147, + "learning_rate": 4.850535159967877e-05, + "loss": 0.4614, "step": 19055 }, { - "epoch": 0.67, - "learning_rate": 4.8592893339355925e-05, - "loss": 0.3175, + "epoch": 0.6869211085883159, + "grad_norm": 
0.1997576355934143, + "learning_rate": 4.8504357569695365e-05, + "loss": 0.4381, "step": 19060 }, { - "epoch": 0.67, - "learning_rate": 4.859195094985996e-05, - "loss": 0.3287, + "epoch": 0.6871013082495405, + "grad_norm": 0.17467275261878967, + "learning_rate": 4.850336321946963e-05, + "loss": 0.4544, "step": 19065 }, { - "epoch": 0.67, - "learning_rate": 4.859100825403805e-05, - "loss": 0.311, + "epoch": 0.6872815079107651, + "grad_norm": 0.15613585710525513, + "learning_rate": 4.850236854901513e-05, + "loss": 0.42, "step": 19070 }, { - "epoch": 0.67, - "learning_rate": 4.859006525190244e-05, - "loss": 0.3208, + "epoch": 0.6874617075719898, + "grad_norm": 0.18194815516471863, + "learning_rate": 4.85013735583454e-05, + "loss": 0.475, "step": 19075 }, { - "epoch": 0.67, - "learning_rate": 4.858912194346539e-05, - "loss": 0.3133, + "epoch": 0.6876419072332144, + "grad_norm": 0.17985866963863373, + "learning_rate": 4.850037824747401e-05, + "loss": 0.4177, "step": 19080 }, { - "epoch": 0.67, - "learning_rate": 4.858817832873913e-05, - "loss": 0.315, + "epoch": 0.687822106894439, + "grad_norm": 0.19839969277381897, + "learning_rate": 4.8499382616414515e-05, + "loss": 0.4429, "step": 19085 }, { - "epoch": 0.67, - "learning_rate": 4.8587234407735926e-05, - "loss": 0.2925, + "epoch": 0.6880023065556636, + "grad_norm": 0.1918293535709381, + "learning_rate": 4.8498386665180474e-05, + "loss": 0.4412, "step": 19090 }, { - "epoch": 0.67, - "learning_rate": 4.858629018046803e-05, - "loss": 0.3254, + "epoch": 0.6881825062168883, + "grad_norm": 0.20238642394542694, + "learning_rate": 4.8497390393785475e-05, + "loss": 0.4255, "step": 19095 }, { - "epoch": 0.67, - "learning_rate": 4.858534564694769e-05, - "loss": 0.2914, + "epoch": 0.688362705878113, + "grad_norm": 0.15424714982509613, + "learning_rate": 4.849639380224308e-05, + "loss": 0.4449, "step": 19100 }, { - "epoch": 0.67, - "learning_rate": 4.858440080718719e-05, - "loss": 0.3161, + "epoch": 0.6885429055393376, + "grad_norm": 0.15987904369831085, + "learning_rate": 4.8495396890566855e-05, + "loss": 0.4287, "step": 19105 }, { - "epoch": 0.67, - "learning_rate": 4.858345566119879e-05, - "loss": 0.3113, + "epoch": 0.6887231052005622, + "grad_norm": 0.19473698735237122, + "learning_rate": 4.849439965877041e-05, + "loss": 0.4664, "step": 19110 }, { - "epoch": 0.67, - "learning_rate": 4.8582510208994746e-05, - "loss": 0.3259, + "epoch": 0.6889033048617869, + "grad_norm": 0.15366004407405853, + "learning_rate": 4.849340210686732e-05, + "loss": 0.4612, "step": 19115 }, { - "epoch": 0.67, - "learning_rate": 4.858156445058736e-05, - "loss": 0.3197, + "epoch": 0.6890835045230115, + "grad_norm": 0.20622040331363678, + "learning_rate": 4.849240423487117e-05, + "loss": 0.4397, "step": 19120 }, { - "epoch": 0.67, - "learning_rate": 4.85806183859889e-05, - "loss": 0.3311, + "epoch": 0.6892637041842361, + "grad_norm": 0.1534797102212906, + "learning_rate": 4.8491406042795565e-05, + "loss": 0.4492, "step": 19125 }, { - "epoch": 0.67, - "learning_rate": 4.857967201521165e-05, - "loss": 0.3248, + "epoch": 0.6894439038454607, + "grad_norm": 0.16360458731651306, + "learning_rate": 4.849040753065409e-05, + "loss": 0.4591, "step": 19130 }, { - "epoch": 0.67, - "learning_rate": 4.857872533826789e-05, - "loss": 0.3058, + "epoch": 0.6896241035066855, + "grad_norm": 0.15157437324523926, + "learning_rate": 4.848940869846037e-05, + "loss": 0.4604, "step": 19135 }, { - "epoch": 0.67, - "learning_rate": 4.8577778355169935e-05, - "loss": 0.3079, + "epoch": 0.6898043031679101, + 
"grad_norm": 0.2104407548904419, + "learning_rate": 4.8488409546228e-05, + "loss": 0.4153, "step": 19140 }, { - "epoch": 0.67, - "learning_rate": 4.857683106593005e-05, - "loss": 0.3095, + "epoch": 0.6899845028291347, + "grad_norm": 0.2011989951133728, + "learning_rate": 4.8487410073970594e-05, + "loss": 0.4757, "step": 19145 }, { - "epoch": 0.67, - "learning_rate": 4.8575883470560565e-05, - "loss": 0.2988, + "epoch": 0.6901647024903593, + "grad_norm": 0.18023835122585297, + "learning_rate": 4.848641028170178e-05, + "loss": 0.4305, "step": 19150 }, { - "epoch": 0.67, - "learning_rate": 4.857493556907376e-05, - "loss": 0.3, + "epoch": 0.690344902151584, + "grad_norm": 0.1514541208744049, + "learning_rate": 4.848541016943516e-05, + "loss": 0.4403, "step": 19155 }, { - "epoch": 0.67, - "learning_rate": 4.857398736148196e-05, - "loss": 0.3329, + "epoch": 0.6905251018128086, + "grad_norm": 0.1530730277299881, + "learning_rate": 4.8484409737184386e-05, + "loss": 0.4532, "step": 19160 }, { - "epoch": 0.67, - "learning_rate": 4.857303884779747e-05, - "loss": 0.3193, + "epoch": 0.6907053014740332, + "grad_norm": 0.18207421898841858, + "learning_rate": 4.848340898496308e-05, + "loss": 0.4431, "step": 19165 }, { - "epoch": 0.67, - "learning_rate": 4.8572090028032604e-05, - "loss": 0.3163, + "epoch": 0.6908855011352578, + "grad_norm": 0.17817628383636475, + "learning_rate": 4.848240791278486e-05, + "loss": 0.425, "step": 19170 }, { - "epoch": 0.67, - "learning_rate": 4.8571140902199684e-05, - "loss": 0.3055, + "epoch": 0.6910657007964826, + "grad_norm": 0.13877823948860168, + "learning_rate": 4.848140652066339e-05, + "loss": 0.4346, "step": 19175 }, { - "epoch": 0.67, - "learning_rate": 4.857019147031103e-05, - "loss": 0.3248, + "epoch": 0.6912459004577072, + "grad_norm": 0.15629951655864716, + "learning_rate": 4.84804048086123e-05, + "loss": 0.4438, "step": 19180 }, { - "epoch": 0.67, - "learning_rate": 4.8569241732378975e-05, - "loss": 0.3135, + "epoch": 0.6914261001189318, + "grad_norm": 0.15390846133232117, + "learning_rate": 4.8479402776645235e-05, + "loss": 0.4709, "step": 19185 }, { - "epoch": 0.68, - "learning_rate": 4.856829168841585e-05, - "loss": 0.3074, + "epoch": 0.6916062997801564, + "grad_norm": 0.17919474840164185, + "learning_rate": 4.847840042477586e-05, + "loss": 0.4731, "step": 19190 }, { - "epoch": 0.68, - "learning_rate": 4.856734133843399e-05, - "loss": 0.3359, + "epoch": 0.691786499441381, + "grad_norm": 0.170258030295372, + "learning_rate": 4.8477397753017816e-05, + "loss": 0.468, "step": 19195 }, { - "epoch": 0.68, - "learning_rate": 4.8566390682445734e-05, - "loss": 0.3111, + "epoch": 0.6919666991026057, + "grad_norm": 0.19165678322315216, + "learning_rate": 4.847639476138478e-05, + "loss": 0.5018, "step": 19200 }, { - "epoch": 0.68, - "learning_rate": 4.8565439720463424e-05, - "loss": 0.3306, + "epoch": 0.6921468987638303, + "grad_norm": 0.15710307657718658, + "learning_rate": 4.8475391449890405e-05, + "loss": 0.4447, "step": 19205 }, { - "epoch": 0.68, - "learning_rate": 4.856448845249941e-05, - "loss": 0.3012, + "epoch": 0.6923270984250549, + "grad_norm": 0.1523151695728302, + "learning_rate": 4.847438781854837e-05, + "loss": 0.4627, "step": 19210 }, { - "epoch": 0.68, - "learning_rate": 4.856353687856604e-05, - "loss": 0.3008, + "epoch": 0.6925072980862796, + "grad_norm": 0.14756174385547638, + "learning_rate": 4.8473383867372345e-05, + "loss": 0.4621, "step": 19215 }, { - "epoch": 0.68, - "learning_rate": 4.856258499867568e-05, - "loss": 0.3193, + "epoch": 
0.6926874977475043, + "grad_norm": 0.2157120406627655, + "learning_rate": 4.847237959637602e-05, + "loss": 0.4199, "step": 19220 }, { - "epoch": 0.68, - "learning_rate": 4.8561632812840674e-05, - "loss": 0.3251, + "epoch": 0.6928676974087289, + "grad_norm": 0.14184360206127167, + "learning_rate": 4.847137500557305e-05, + "loss": 0.4347, "step": 19225 }, { - "epoch": 0.68, - "learning_rate": 4.8560680321073405e-05, - "loss": 0.2841, + "epoch": 0.6930478970699535, + "grad_norm": 0.13508060574531555, + "learning_rate": 4.847037009497715e-05, + "loss": 0.4535, "step": 19230 }, { - "epoch": 0.68, - "learning_rate": 4.8559727523386214e-05, - "loss": 0.3252, + "epoch": 0.6932280967311781, + "grad_norm": 0.14974738657474518, + "learning_rate": 4.8469364864602e-05, + "loss": 0.4337, "step": 19235 }, { - "epoch": 0.68, - "learning_rate": 4.8558774419791496e-05, - "loss": 0.3157, + "epoch": 0.6934082963924028, + "grad_norm": 0.2109774798154831, + "learning_rate": 4.846835931446129e-05, + "loss": 0.4732, "step": 19240 }, { - "epoch": 0.68, - "learning_rate": 4.855782101030162e-05, - "loss": 0.3063, + "epoch": 0.6935884960536274, + "grad_norm": 0.1753171980381012, + "learning_rate": 4.846735344456873e-05, + "loss": 0.4438, "step": 19245 }, { - "epoch": 0.68, - "learning_rate": 4.855686729492896e-05, - "loss": 0.284, + "epoch": 0.693768695714852, + "grad_norm": 0.20404259860515594, + "learning_rate": 4.8466347254938034e-05, + "loss": 0.4431, "step": 19250 }, { - "epoch": 0.68, - "learning_rate": 4.85559132736859e-05, - "loss": 0.3275, + "epoch": 0.6939488953760767, + "grad_norm": 0.1709374636411667, + "learning_rate": 4.846534074558289e-05, + "loss": 0.4206, "step": 19255 }, { - "epoch": 0.68, - "learning_rate": 4.855495894658484e-05, - "loss": 0.295, + "epoch": 0.6941290950373014, + "grad_norm": 0.13313916325569153, + "learning_rate": 4.8464333916517025e-05, + "loss": 0.4372, "step": 19260 }, { - "epoch": 0.68, - "learning_rate": 4.8554004313638154e-05, - "loss": 0.3287, + "epoch": 0.694309294698526, + "grad_norm": 0.1679733395576477, + "learning_rate": 4.8463326767754145e-05, + "loss": 0.4753, "step": 19265 }, { - "epoch": 0.68, - "learning_rate": 4.855304937485825e-05, - "loss": 0.3224, + "epoch": 0.6944894943597506, + "grad_norm": 0.14599260687828064, + "learning_rate": 4.846231929930799e-05, + "loss": 0.423, "step": 19270 }, { - "epoch": 0.68, - "learning_rate": 4.8552094130257515e-05, - "loss": 0.3019, + "epoch": 0.6946696940209752, + "grad_norm": 0.17458441853523254, + "learning_rate": 4.846131151119228e-05, + "loss": 0.4576, "step": 19275 }, { - "epoch": 0.68, - "learning_rate": 4.855113857984836e-05, - "loss": 0.334, + "epoch": 0.6948498936821998, + "grad_norm": 0.14298151433467865, + "learning_rate": 4.8460303403420735e-05, + "loss": 0.4628, "step": 19280 }, { - "epoch": 0.68, - "learning_rate": 4.85501827236432e-05, - "loss": 0.295, + "epoch": 0.6950300933434245, + "grad_norm": 0.18267078697681427, + "learning_rate": 4.84592949760071e-05, + "loss": 0.4505, "step": 19285 }, { - "epoch": 0.68, - "learning_rate": 4.8549226561654426e-05, - "loss": 0.2808, + "epoch": 0.6952102930046492, + "grad_norm": 0.18791311979293823, + "learning_rate": 4.845828622896511e-05, + "loss": 0.3925, "step": 19290 }, { - "epoch": 0.68, - "learning_rate": 4.8548270093894465e-05, - "loss": 0.3072, + "epoch": 0.6953904926658738, + "grad_norm": 0.1504139006137848, + "learning_rate": 4.8457277162308526e-05, + "loss": 0.4281, "step": 19295 }, { - "epoch": 0.68, - "learning_rate": 4.854731332037574e-05, - "loss": 0.3187, + 
"epoch": 0.6955706923270984, + "grad_norm": 0.1674523800611496, + "learning_rate": 4.8456267776051066e-05, + "loss": 0.4462, "step": 19300 }, { - "epoch": 0.68, - "learning_rate": 4.8546356241110666e-05, - "loss": 0.3158, + "epoch": 0.6957508919883231, + "grad_norm": 0.1913890391588211, + "learning_rate": 4.84552580702065e-05, + "loss": 0.4054, "step": 19305 }, { - "epoch": 0.68, - "learning_rate": 4.8545398856111676e-05, - "loss": 0.3075, + "epoch": 0.6959310916495477, + "grad_norm": 0.18787144124507904, + "learning_rate": 4.8454248044788594e-05, + "loss": 0.4503, "step": 19310 }, { - "epoch": 0.68, - "learning_rate": 4.854444116539121e-05, - "loss": 0.3375, + "epoch": 0.6961112913107723, + "grad_norm": 0.15281574428081512, + "learning_rate": 4.845323769981109e-05, + "loss": 0.4506, "step": 19315 }, { - "epoch": 0.68, - "learning_rate": 4.854348316896168e-05, - "loss": 0.3051, + "epoch": 0.6962914909719969, + "grad_norm": 0.17581430077552795, + "learning_rate": 4.845222703528777e-05, + "loss": 0.4174, "step": 19320 }, { - "epoch": 0.68, - "learning_rate": 4.854252486683554e-05, - "loss": 0.293, + "epoch": 0.6964716906332216, + "grad_norm": 0.14308330416679382, + "learning_rate": 4.845121605123239e-05, + "loss": 0.4106, "step": 19325 }, { - "epoch": 0.68, - "learning_rate": 4.8541566259025226e-05, - "loss": 0.3089, + "epoch": 0.6966518902944463, + "grad_norm": 0.1626398116350174, + "learning_rate": 4.8450204747658734e-05, + "loss": 0.4736, "step": 19330 }, { - "epoch": 0.68, - "learning_rate": 4.8540607345543194e-05, - "loss": 0.312, + "epoch": 0.6968320899556709, + "grad_norm": 0.14919036626815796, + "learning_rate": 4.844919312458058e-05, + "loss": 0.4042, "step": 19335 }, { - "epoch": 0.68, - "learning_rate": 4.853964812640188e-05, - "loss": 0.3218, + "epoch": 0.6970122896168955, + "grad_norm": 0.15542307496070862, + "learning_rate": 4.844818118201171e-05, + "loss": 0.4378, "step": 19340 }, { - "epoch": 0.68, - "learning_rate": 4.853868860161376e-05, - "loss": 0.2937, + "epoch": 0.6971924892781202, + "grad_norm": 0.18490786850452423, + "learning_rate": 4.844716891996591e-05, + "loss": 0.4315, "step": 19345 }, { - "epoch": 0.68, - "learning_rate": 4.8537728771191275e-05, - "loss": 0.3022, + "epoch": 0.6973726889393448, + "grad_norm": 0.20370669662952423, + "learning_rate": 4.8446156338456975e-05, + "loss": 0.4472, "step": 19350 }, { - "epoch": 0.68, - "learning_rate": 4.8536768635146895e-05, - "loss": 0.3084, + "epoch": 0.6975528886005694, + "grad_norm": 0.1456826627254486, + "learning_rate": 4.8445143437498696e-05, + "loss": 0.4498, "step": 19355 }, { - "epoch": 0.68, - "learning_rate": 4.853580819349309e-05, - "loss": 0.3086, + "epoch": 0.697733088261794, + "grad_norm": 0.159530371427536, + "learning_rate": 4.844413021710488e-05, + "loss": 0.4576, "step": 19360 }, { - "epoch": 0.68, - "learning_rate": 4.853484744624232e-05, - "loss": 0.3378, + "epoch": 0.6979132879230187, + "grad_norm": 0.1378137171268463, + "learning_rate": 4.844311667728932e-05, + "loss": 0.4643, "step": 19365 }, { - "epoch": 0.68, - "learning_rate": 4.853388639340707e-05, - "loss": 0.2914, + "epoch": 0.6980934875842434, + "grad_norm": 0.17406761646270752, + "learning_rate": 4.844210281806585e-05, + "loss": 0.4662, "step": 19370 }, { - "epoch": 0.68, - "learning_rate": 4.853292503499981e-05, - "loss": 0.293, + "epoch": 0.698273687245468, + "grad_norm": 0.14517958462238312, + "learning_rate": 4.844108863944826e-05, + "loss": 0.4136, "step": 19375 }, { - "epoch": 0.68, - "learning_rate": 4.8531963371033026e-05, - 
"loss": 0.2818, + "epoch": 0.6984538869066926, + "grad_norm": 0.16255003213882446, + "learning_rate": 4.844007414145037e-05, + "loss": 0.4686, "step": 19380 }, { - "epoch": 0.68, - "learning_rate": 4.8531001401519214e-05, - "loss": 0.3251, + "epoch": 0.6986340865679173, + "grad_norm": 0.14890755712985992, + "learning_rate": 4.843905932408601e-05, + "loss": 0.4148, "step": 19385 }, { - "epoch": 0.68, - "learning_rate": 4.853003912647085e-05, - "loss": 0.2689, + "epoch": 0.6988142862291419, + "grad_norm": 0.1591944694519043, + "learning_rate": 4.843804418736901e-05, + "loss": 0.4243, "step": 19390 }, { - "epoch": 0.68, - "learning_rate": 4.852907654590044e-05, - "loss": 0.3265, + "epoch": 0.6989944858903665, + "grad_norm": 0.15910327434539795, + "learning_rate": 4.84370287313132e-05, + "loss": 0.4091, "step": 19395 }, { - "epoch": 0.68, - "learning_rate": 4.8528113659820474e-05, - "loss": 0.306, + "epoch": 0.6991746855515911, + "grad_norm": 0.19701680541038513, + "learning_rate": 4.84360129559324e-05, + "loss": 0.4521, "step": 19400 }, { - "epoch": 0.68, - "learning_rate": 4.8527150468243454e-05, - "loss": 0.2849, + "epoch": 0.6993548852128157, + "grad_norm": 0.16682995855808258, + "learning_rate": 4.8434996861240457e-05, + "loss": 0.4676, "step": 19405 }, { - "epoch": 0.68, - "learning_rate": 4.8526186971181893e-05, - "loss": 0.3083, + "epoch": 0.6995350848740405, + "grad_norm": 0.14017580449581146, + "learning_rate": 4.843398044725123e-05, + "loss": 0.4273, "step": 19410 }, { - "epoch": 0.68, - "learning_rate": 4.8525223168648296e-05, - "loss": 0.3484, + "epoch": 0.6997152845352651, + "grad_norm": 0.19788876175880432, + "learning_rate": 4.843296371397855e-05, + "loss": 0.4129, "step": 19415 }, { - "epoch": 0.68, - "learning_rate": 4.852425906065519e-05, - "loss": 0.3175, + "epoch": 0.6998954841964897, + "grad_norm": 0.21277104318141937, + "learning_rate": 4.843194666143628e-05, + "loss": 0.5118, "step": 19420 }, { - "epoch": 0.68, - "learning_rate": 4.852329464721507e-05, - "loss": 0.3194, + "epoch": 0.7000756838577143, + "grad_norm": 0.19250738620758057, + "learning_rate": 4.843092928963827e-05, + "loss": 0.4856, "step": 19425 }, { - "epoch": 0.68, - "learning_rate": 4.852232992834048e-05, - "loss": 0.2969, + "epoch": 0.700255883518939, + "grad_norm": 0.15623074769973755, + "learning_rate": 4.8429911598598386e-05, + "loss": 0.4348, "step": 19430 }, { - "epoch": 0.68, - "learning_rate": 4.852136490404393e-05, - "loss": 0.3438, + "epoch": 0.7004360831801636, + "grad_norm": 0.13664281368255615, + "learning_rate": 4.84288935883305e-05, + "loss": 0.4232, "step": 19435 }, { - "epoch": 0.68, - "learning_rate": 4.852039957433797e-05, - "loss": 0.3239, + "epoch": 0.7006162828413882, + "grad_norm": 0.1700459122657776, + "learning_rate": 4.842787525884847e-05, + "loss": 0.4598, "step": 19440 }, { - "epoch": 0.68, - "learning_rate": 4.851943393923512e-05, - "loss": 0.3088, + "epoch": 0.700796482502613, + "grad_norm": 0.15901970863342285, + "learning_rate": 4.842685661016617e-05, + "loss": 0.4267, "step": 19445 }, { - "epoch": 0.68, - "learning_rate": 4.851846799874791e-05, - "loss": 0.3085, + "epoch": 0.7009766821638376, + "grad_norm": 0.20481829345226288, + "learning_rate": 4.842583764229749e-05, + "loss": 0.4309, "step": 19450 }, { - "epoch": 0.68, - "learning_rate": 4.8517501752888895e-05, - "loss": 0.274, + "epoch": 0.7011568818250622, + "grad_norm": 0.19733159244060516, + "learning_rate": 4.8424818355256304e-05, + "loss": 0.4754, "step": 19455 }, { - "epoch": 0.68, - "learning_rate": 
4.851653520167062e-05, - "loss": 0.3079, + "epoch": 0.7013370814862868, + "grad_norm": 0.1566084325313568, + "learning_rate": 4.84237987490565e-05, + "loss": 0.4086, "step": 19460 }, { - "epoch": 0.68, - "learning_rate": 4.851556834510564e-05, - "loss": 0.3116, + "epoch": 0.7015172811475114, + "grad_norm": 0.16943992674350739, + "learning_rate": 4.842277882371198e-05, + "loss": 0.4623, "step": 19465 }, { - "epoch": 0.69, - "learning_rate": 4.851460118320649e-05, - "loss": 0.2865, + "epoch": 0.7016974808087361, + "grad_norm": 0.1365501880645752, + "learning_rate": 4.842175857923663e-05, + "loss": 0.4053, "step": 19470 }, { - "epoch": 0.69, - "learning_rate": 4.851363371598575e-05, - "loss": 0.3095, + "epoch": 0.7018776804699607, + "grad_norm": 0.1714967042207718, + "learning_rate": 4.842073801564436e-05, + "loss": 0.4687, "step": 19475 }, { - "epoch": 0.69, - "learning_rate": 4.851266594345597e-05, - "loss": 0.2859, + "epoch": 0.7020578801311853, + "grad_norm": 0.1788230538368225, + "learning_rate": 4.841971713294906e-05, + "loss": 0.4179, "step": 19480 }, { - "epoch": 0.69, - "learning_rate": 4.8511697865629715e-05, - "loss": 0.2908, + "epoch": 0.70223807979241, + "grad_norm": 0.16317056119441986, + "learning_rate": 4.841869593116466e-05, + "loss": 0.4278, "step": 19485 }, { - "epoch": 0.69, - "learning_rate": 4.851072948251957e-05, - "loss": 0.3073, + "epoch": 0.7024182794536347, + "grad_norm": 0.149271622300148, + "learning_rate": 4.841767441030505e-05, + "loss": 0.4528, "step": 19490 }, { - "epoch": 0.69, - "learning_rate": 4.850976079413808e-05, - "loss": 0.3105, + "epoch": 0.7025984791148593, + "grad_norm": 0.17781810462474823, + "learning_rate": 4.841665257038416e-05, + "loss": 0.421, "step": 19495 }, { - "epoch": 0.69, - "learning_rate": 4.850879180049785e-05, - "loss": 0.3014, + "epoch": 0.7027786787760839, + "grad_norm": 0.16509835422039032, + "learning_rate": 4.841563041141592e-05, + "loss": 0.4252, "step": 19500 }, { - "epoch": 0.69, - "eval_loss": 0.30417418479919434, - "eval_runtime": 10.5331, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 0.7027786787760839, + "eval_loss": 0.4628825783729553, + "eval_runtime": 3.5462, + "eval_samples_per_second": 28.199, + "eval_steps_per_second": 7.05, "step": 19500 }, { - "epoch": 0.69, - "learning_rate": 4.8507822501611444e-05, - "loss": 0.3377, + "epoch": 0.7029588784373085, + "grad_norm": 0.16837020218372345, + "learning_rate": 4.841460793341425e-05, + "loss": 0.457, "step": 19505 }, { - "epoch": 0.69, - "learning_rate": 4.850685289749146e-05, - "loss": 0.2887, + "epoch": 0.7031390780985332, + "grad_norm": 0.18554091453552246, + "learning_rate": 4.8413585136393066e-05, + "loss": 0.421, "step": 19510 }, { - "epoch": 0.69, - "learning_rate": 4.850588298815048e-05, - "loss": 0.3038, + "epoch": 0.7033192777597578, + "grad_norm": 0.20547594130039215, + "learning_rate": 4.841256202036634e-05, + "loss": 0.4686, "step": 19515 }, { - "epoch": 0.69, - "learning_rate": 4.850491277360111e-05, - "loss": 0.2823, + "epoch": 0.7034994774209824, + "grad_norm": 0.18383750319480896, + "learning_rate": 4.841153858534797e-05, + "loss": 0.4405, "step": 19520 }, { - "epoch": 0.69, - "learning_rate": 4.8503942253855927e-05, - "loss": 0.3096, + "epoch": 0.7036796770822071, + "grad_norm": 0.18066047132015228, + "learning_rate": 4.8410514831351926e-05, + "loss": 0.3865, "step": 19525 }, { - "epoch": 0.69, - "learning_rate": 4.8502971428927544e-05, - "loss": 0.295, + "epoch": 0.7038598767434318, + "grad_norm": 0.144275963306427, + 
"learning_rate": 4.840949075839215e-05, + "loss": 0.4067, "step": 19530 }, { - "epoch": 0.69, - "learning_rate": 4.850200029882857e-05, - "loss": 0.2925, + "epoch": 0.7040400764046564, + "grad_norm": 0.19798412919044495, + "learning_rate": 4.84084663664826e-05, + "loss": 0.4514, "step": 19535 }, { - "epoch": 0.69, - "learning_rate": 4.850102886357161e-05, - "loss": 0.3171, + "epoch": 0.704220276065881, + "grad_norm": 0.16639620065689087, + "learning_rate": 4.840744165563721e-05, + "loss": 0.4901, "step": 19540 }, { - "epoch": 0.69, - "learning_rate": 4.850005712316928e-05, - "loss": 0.3163, + "epoch": 0.7044004757271056, + "grad_norm": 0.17380645871162415, + "learning_rate": 4.8406416625869977e-05, + "loss": 0.4401, "step": 19545 }, { - "epoch": 0.69, - "learning_rate": 4.849908507763419e-05, - "loss": 0.3025, + "epoch": 0.7045806753883302, + "grad_norm": 0.16596467792987823, + "learning_rate": 4.840539127719484e-05, + "loss": 0.4576, "step": 19550 }, { - "epoch": 0.69, - "learning_rate": 4.849811272697897e-05, - "loss": 0.3011, + "epoch": 0.7047608750495549, + "grad_norm": 0.14880597591400146, + "learning_rate": 4.8404365609625785e-05, + "loss": 0.4128, "step": 19555 }, { - "epoch": 0.69, - "learning_rate": 4.849714007121624e-05, - "loss": 0.3052, + "epoch": 0.7049410747107795, + "grad_norm": 0.21893320977687836, + "learning_rate": 4.840333962317678e-05, + "loss": 0.4075, "step": 19560 }, { - "epoch": 0.69, - "learning_rate": 4.8496167110358637e-05, - "loss": 0.304, + "epoch": 0.7051212743720042, + "grad_norm": 0.16774235665798187, + "learning_rate": 4.8402313317861797e-05, + "loss": 0.4624, "step": 19565 }, { - "epoch": 0.69, - "learning_rate": 4.8495193844418784e-05, - "loss": 0.3287, + "epoch": 0.7053014740332288, + "grad_norm": 0.1787051409482956, + "learning_rate": 4.840128669369483e-05, + "loss": 0.441, "step": 19570 }, { - "epoch": 0.69, - "learning_rate": 4.849422027340932e-05, - "loss": 0.3088, + "epoch": 0.7054816736944535, + "grad_norm": 0.1392039656639099, + "learning_rate": 4.840025975068986e-05, + "loss": 0.4667, "step": 19575 }, { - "epoch": 0.69, - "learning_rate": 4.8493246397342896e-05, - "loss": 0.3191, + "epoch": 0.7056618733556781, + "grad_norm": 0.17485755681991577, + "learning_rate": 4.839923248886089e-05, + "loss": 0.4378, "step": 19580 }, { - "epoch": 0.69, - "learning_rate": 4.849227221623215e-05, - "loss": 0.2954, + "epoch": 0.7058420730169027, + "grad_norm": 0.14245180785655975, + "learning_rate": 4.8398204908221894e-05, + "loss": 0.445, "step": 19585 }, { - "epoch": 0.69, - "learning_rate": 4.849129773008972e-05, - "loss": 0.2997, + "epoch": 0.7060222726781273, + "grad_norm": 0.19247664511203766, + "learning_rate": 4.839717700878689e-05, + "loss": 0.4163, "step": 19590 }, { - "epoch": 0.69, - "learning_rate": 4.849032293892828e-05, - "loss": 0.3194, + "epoch": 0.706202472339352, + "grad_norm": 0.14661888778209686, + "learning_rate": 4.839614879056989e-05, + "loss": 0.4428, "step": 19595 }, { - "epoch": 0.69, - "learning_rate": 4.848934784276048e-05, - "loss": 0.2893, + "epoch": 0.7063826720005766, + "grad_norm": 0.2213519662618637, + "learning_rate": 4.839512025358488e-05, + "loss": 0.4248, "step": 19600 }, { - "epoch": 0.69, - "learning_rate": 4.848837244159897e-05, - "loss": 0.3139, + "epoch": 0.7065628716618013, + "grad_norm": 0.13426418602466583, + "learning_rate": 4.839409139784589e-05, + "loss": 0.4201, "step": 19605 }, { - "epoch": 0.69, - "learning_rate": 4.848739673545643e-05, - "loss": 0.3236, + "epoch": 0.7067430713230259, + "grad_norm": 
0.17198221385478973, + "learning_rate": 4.839306222336694e-05, + "loss": 0.4788, "step": 19610 }, { - "epoch": 0.69, - "learning_rate": 4.8486420724345516e-05, - "loss": 0.3074, + "epoch": 0.7069232709842506, + "grad_norm": 0.1814018040895462, + "learning_rate": 4.839203273016203e-05, + "loss": 0.4323, "step": 19615 }, { - "epoch": 0.69, - "learning_rate": 4.8485444408278915e-05, - "loss": 0.3413, + "epoch": 0.7071034706454752, + "grad_norm": 0.20377859473228455, + "learning_rate": 4.839100291824522e-05, + "loss": 0.4059, "step": 19620 }, { - "epoch": 0.69, - "learning_rate": 4.8484467787269294e-05, - "loss": 0.3285, + "epoch": 0.7072836703066998, + "grad_norm": 0.16725854575634003, + "learning_rate": 4.838997278763051e-05, + "loss": 0.4669, "step": 19625 }, { - "epoch": 0.69, - "learning_rate": 4.848349086132933e-05, - "loss": 0.3101, + "epoch": 0.7074638699679244, + "grad_norm": 0.15145759284496307, + "learning_rate": 4.838894233833196e-05, + "loss": 0.4384, "step": 19630 }, { - "epoch": 0.69, - "learning_rate": 4.848251363047171e-05, - "loss": 0.3094, + "epoch": 0.707644069629149, + "grad_norm": 0.16167926788330078, + "learning_rate": 4.8387911570363596e-05, + "loss": 0.4536, "step": 19635 }, { - "epoch": 0.69, - "learning_rate": 4.848153609470914e-05, - "loss": 0.3325, + "epoch": 0.7078242692903738, + "grad_norm": 0.15461744368076324, + "learning_rate": 4.838688048373946e-05, + "loss": 0.4833, "step": 19640 }, { - "epoch": 0.69, - "learning_rate": 4.848055825405429e-05, - "loss": 0.2964, + "epoch": 0.7080044689515984, + "grad_norm": 0.18322111666202545, + "learning_rate": 4.838584907847361e-05, + "loss": 0.4641, "step": 19645 }, { - "epoch": 0.69, - "learning_rate": 4.847958010851986e-05, - "loss": 0.3134, + "epoch": 0.708184668612823, + "grad_norm": 0.15840373933315277, + "learning_rate": 4.83848173545801e-05, + "loss": 0.4341, "step": 19650 }, { - "epoch": 0.69, - "learning_rate": 4.847860165811857e-05, - "loss": 0.2776, + "epoch": 0.7083648682740477, + "grad_norm": 0.1756211519241333, + "learning_rate": 4.8383785312072974e-05, + "loss": 0.4358, "step": 19655 }, { - "epoch": 0.69, - "learning_rate": 4.84776229028631e-05, - "loss": 0.3465, + "epoch": 0.7085450679352723, + "grad_norm": 0.17935514450073242, + "learning_rate": 4.83827529509663e-05, + "loss": 0.4682, "step": 19660 }, { - "epoch": 0.69, - "learning_rate": 4.847664384276617e-05, - "loss": 0.3151, + "epoch": 0.7087252675964969, + "grad_norm": 0.20242036879062653, + "learning_rate": 4.8381720271274146e-05, + "loss": 0.4391, "step": 19665 }, { - "epoch": 0.69, - "learning_rate": 4.84756644778405e-05, - "loss": 0.3081, + "epoch": 0.7089054672577215, + "grad_norm": 0.15790332853794098, + "learning_rate": 4.838068727301057e-05, + "loss": 0.4066, "step": 19670 }, { - "epoch": 0.69, - "learning_rate": 4.847468480809879e-05, - "loss": 0.3069, + "epoch": 0.7090856669189461, + "grad_norm": 0.14953632652759552, + "learning_rate": 4.8379653956189675e-05, + "loss": 0.4043, "step": 19675 }, { - "epoch": 0.69, - "learning_rate": 4.847370483355377e-05, - "loss": 0.3152, + "epoch": 0.7092658665801709, + "grad_norm": 0.20323163270950317, + "learning_rate": 4.8378620320825505e-05, + "loss": 0.4517, "step": 19680 }, { - "epoch": 0.69, - "learning_rate": 4.847272455421816e-05, - "loss": 0.2902, + "epoch": 0.7094460662413955, + "grad_norm": 0.17088757455348969, + "learning_rate": 4.837758636693217e-05, + "loss": 0.4426, "step": 19685 }, { - "epoch": 0.69, - "learning_rate": 4.847174397010469e-05, - "loss": 0.3075, + "epoch": 0.7096262659026201, + 
"grad_norm": 0.15249846875667572, + "learning_rate": 4.837655209452374e-05, + "loss": 0.4364, "step": 19690 }, { - "epoch": 0.69, - "learning_rate": 4.84707630812261e-05, - "loss": 0.2934, + "epoch": 0.7098064655638447, + "grad_norm": 0.17006030678749084, + "learning_rate": 4.837551750361432e-05, + "loss": 0.4031, "step": 19695 }, { - "epoch": 0.69, - "learning_rate": 4.8469781887595115e-05, - "loss": 0.3203, + "epoch": 0.7099866652250694, + "grad_norm": 0.17397870123386383, + "learning_rate": 4.837448259421799e-05, + "loss": 0.4184, "step": 19700 }, { - "epoch": 0.69, - "learning_rate": 4.846880038922447e-05, - "loss": 0.3175, + "epoch": 0.710166864886294, + "grad_norm": 0.16801589727401733, + "learning_rate": 4.837344736634887e-05, + "loss": 0.4629, "step": 19705 }, { - "epoch": 0.69, - "learning_rate": 4.846781858612693e-05, - "loss": 0.2932, + "epoch": 0.7103470645475186, + "grad_norm": 0.18342332541942596, + "learning_rate": 4.8372411820021054e-05, + "loss": 0.4433, "step": 19710 }, { - "epoch": 0.69, - "learning_rate": 4.846683647831522e-05, - "loss": 0.311, + "epoch": 0.7105272642087432, + "grad_norm": 0.15994971990585327, + "learning_rate": 4.8371375955248644e-05, + "loss": 0.444, "step": 19715 }, { - "epoch": 0.69, - "learning_rate": 4.846585406580212e-05, - "loss": 0.3067, + "epoch": 0.710707463869968, + "grad_norm": 0.1518271565437317, + "learning_rate": 4.837033977204577e-05, + "loss": 0.4278, "step": 19720 }, { - "epoch": 0.69, - "learning_rate": 4.846487134860036e-05, - "loss": 0.3117, + "epoch": 0.7108876635311926, + "grad_norm": 0.18965473771095276, + "learning_rate": 4.836930327042654e-05, + "loss": 0.4386, "step": 19725 }, { - "epoch": 0.69, - "learning_rate": 4.8463888326722714e-05, - "loss": 0.3373, + "epoch": 0.7110678631924172, + "grad_norm": 0.20504818856716156, + "learning_rate": 4.8368266450405077e-05, + "loss": 0.4627, "step": 19730 }, { - "epoch": 0.69, - "learning_rate": 4.846290500018194e-05, - "loss": 0.3003, + "epoch": 0.7112480628536418, + "grad_norm": 0.1648981124162674, + "learning_rate": 4.83672293119955e-05, + "loss": 0.4572, "step": 19735 }, { - "epoch": 0.69, - "learning_rate": 4.84619213689908e-05, - "loss": 0.3069, + "epoch": 0.7114282625148665, + "grad_norm": 0.2660510241985321, + "learning_rate": 4.836619185521196e-05, + "loss": 0.4748, "step": 19740 }, { - "epoch": 0.69, - "learning_rate": 4.846093743316208e-05, - "loss": 0.3144, + "epoch": 0.7116084621760911, + "grad_norm": 0.14745326340198517, + "learning_rate": 4.836515408006857e-05, + "loss": 0.4311, "step": 19745 }, { - "epoch": 0.69, - "learning_rate": 4.845995319270855e-05, - "loss": 0.3183, + "epoch": 0.7117886618373157, + "grad_norm": 0.13908466696739197, + "learning_rate": 4.8364115986579485e-05, + "loss": 0.436, "step": 19750 }, { - "epoch": 0.7, - "learning_rate": 4.845896864764299e-05, - "loss": 0.3005, + "epoch": 0.7119688614985403, + "grad_norm": 0.16212470829486847, + "learning_rate": 4.8363077574758836e-05, + "loss": 0.4103, "step": 19755 }, { - "epoch": 0.7, - "learning_rate": 4.845798379797818e-05, - "loss": 0.3114, + "epoch": 0.7121490611597651, + "grad_norm": 0.17893613874912262, + "learning_rate": 4.836203884462078e-05, + "loss": 0.4971, "step": 19760 }, { - "epoch": 0.7, - "learning_rate": 4.84569986437269e-05, - "loss": 0.3258, + "epoch": 0.7123292608209897, + "grad_norm": 0.18436241149902344, + "learning_rate": 4.836099979617947e-05, + "loss": 0.4828, "step": 19765 }, { - "epoch": 0.7, - "learning_rate": 4.845601318490196e-05, - "loss": 0.302, + "epoch": 
0.7125094604822143, + "grad_norm": 0.16563914716243744, + "learning_rate": 4.835996042944907e-05, + "loss": 0.4026, "step": 19770 }, { - "epoch": 0.7, - "learning_rate": 4.845502742151615e-05, - "loss": 0.3038, + "epoch": 0.7126896601434389, + "grad_norm": 0.1609710305929184, + "learning_rate": 4.835892074444372e-05, + "loss": 0.4159, "step": 19775 }, { - "epoch": 0.7, - "learning_rate": 4.845404135358226e-05, - "loss": 0.3218, + "epoch": 0.7128698598046636, + "grad_norm": 0.14441636204719543, + "learning_rate": 4.8357880741177605e-05, + "loss": 0.4171, "step": 19780 }, { - "epoch": 0.7, - "learning_rate": 4.8453054981113106e-05, - "loss": 0.3271, + "epoch": 0.7130500594658882, + "grad_norm": 0.14850714802742004, + "learning_rate": 4.835684041966488e-05, + "loss": 0.4341, "step": 19785 }, { - "epoch": 0.7, - "learning_rate": 4.845206830412149e-05, - "loss": 0.3154, + "epoch": 0.7132302591271128, + "grad_norm": 0.17348629236221313, + "learning_rate": 4.835579977991973e-05, + "loss": 0.4328, "step": 19790 }, { - "epoch": 0.7, - "learning_rate": 4.8451081322620215e-05, - "loss": 0.3069, + "epoch": 0.7134104587883375, + "grad_norm": 0.18963472545146942, + "learning_rate": 4.8354758821956325e-05, + "loss": 0.418, "step": 19795 }, { - "epoch": 0.7, - "learning_rate": 4.845009403662211e-05, - "loss": 0.3261, + "epoch": 0.7135906584495622, + "grad_norm": 0.15153972804546356, + "learning_rate": 4.8353717545788855e-05, + "loss": 0.4423, "step": 19800 }, { - "epoch": 0.7, - "learning_rate": 4.844910644613998e-05, - "loss": 0.2997, + "epoch": 0.7137708581107868, + "grad_norm": 0.17662297189235687, + "learning_rate": 4.835267595143151e-05, + "loss": 0.4625, "step": 19805 }, { - "epoch": 0.7, - "learning_rate": 4.8448118551186664e-05, - "loss": 0.2808, + "epoch": 0.7139510577720114, + "grad_norm": 0.1881992071866989, + "learning_rate": 4.8351842446858454e-05, + "loss": 0.4305, "step": 19810 }, { - "epoch": 0.7, - "learning_rate": 4.8447130351774986e-05, - "loss": 0.2954, + "epoch": 0.714131257433236, + "grad_norm": 0.1803097277879715, + "learning_rate": 4.8350800279795086e-05, + "loss": 0.4069, "step": 19815 }, { - "epoch": 0.7, - "learning_rate": 4.844614184791776e-05, - "loss": 0.3093, + "epoch": 0.7143114570944606, + "grad_norm": 0.14879196882247925, + "learning_rate": 4.834975779458158e-05, + "loss": 0.447, "step": 19820 }, { - "epoch": 0.7, - "learning_rate": 4.844515303962784e-05, - "loss": 0.311, + "epoch": 0.7144916567556853, + "grad_norm": 0.1836402863264084, + "learning_rate": 4.834871499123216e-05, + "loss": 0.4518, "step": 19825 }, { - "epoch": 0.7, - "learning_rate": 4.844416392691806e-05, - "loss": 0.2939, + "epoch": 0.7146718564169099, + "grad_norm": 0.16930605471134186, + "learning_rate": 4.8347671869761e-05, + "loss": 0.4471, "step": 19830 }, { - "epoch": 0.7, - "learning_rate": 4.844317450980126e-05, - "loss": 0.3139, + "epoch": 0.7148520560781346, + "grad_norm": 0.14788581430912018, + "learning_rate": 4.8346837143546e-05, + "loss": 0.4639, "step": 19835 }, { - "epoch": 0.7, - "learning_rate": 4.844218478829028e-05, - "loss": 0.2842, + "epoch": 0.7150322557393592, + "grad_norm": 0.16612641513347626, + "learning_rate": 4.834579344949157e-05, + "loss": 0.4454, "step": 19840 }, { - "epoch": 0.7, - "learning_rate": 4.844119476239799e-05, - "loss": 0.2998, + "epoch": 0.7152124554005839, + "grad_norm": 0.17854836583137512, + "learning_rate": 4.834474943735522e-05, + "loss": 0.3823, "step": 19845 }, { - "epoch": 0.7, - "learning_rate": 4.844020443213724e-05, - "loss": 0.3093, + "epoch": 
0.7153926550618085, + "grad_norm": 0.1602153182029724, + "learning_rate": 4.834370510715118e-05, + "loss": 0.4561, "step": 19850 }, { - "epoch": 0.7, - "learning_rate": 4.8439213797520874e-05, - "loss": 0.2712, + "epoch": 0.7155728547230331, + "grad_norm": 0.1520301103591919, + "learning_rate": 4.8342660458893677e-05, + "loss": 0.4599, "step": 19855 }, { - "epoch": 0.7, - "learning_rate": 4.8438222858561764e-05, - "loss": 0.3019, + "epoch": 0.7157530543842577, + "grad_norm": 0.16628147661685944, + "learning_rate": 4.834161549259695e-05, + "loss": 0.4178, "step": 19860 }, { - "epoch": 0.7, - "learning_rate": 4.8437231615272775e-05, - "loss": 0.3105, + "epoch": 0.7159332540454824, + "grad_norm": 0.19706813991069794, + "learning_rate": 4.8340570208275224e-05, + "loss": 0.4677, "step": 19865 }, { - "epoch": 0.7, - "learning_rate": 4.8436240067666784e-05, - "loss": 0.3299, + "epoch": 0.716113453706707, + "grad_norm": 0.1859341561794281, + "learning_rate": 4.833952460594275e-05, + "loss": 0.4665, "step": 19870 }, { - "epoch": 0.7, - "learning_rate": 4.843524821575666e-05, - "loss": 0.308, + "epoch": 0.7162936533679317, + "grad_norm": 0.16585981845855713, + "learning_rate": 4.8338478685613775e-05, + "loss": 0.4378, "step": 19875 }, { - "epoch": 0.7, - "learning_rate": 4.8434256059555284e-05, - "loss": 0.2992, + "epoch": 0.7164738530291563, + "grad_norm": 0.15817083418369293, + "learning_rate": 4.8337432447302544e-05, + "loss": 0.453, "step": 19880 }, { - "epoch": 0.7, - "learning_rate": 4.843326359907553e-05, - "loss": 0.3057, + "epoch": 0.716654052690381, + "grad_norm": 0.1971365064382553, + "learning_rate": 4.833638589102332e-05, + "loss": 0.4405, "step": 19885 }, { - "epoch": 0.7, - "learning_rate": 4.84322708343303e-05, - "loss": 0.2805, + "epoch": 0.7168342523516056, + "grad_norm": 0.16757121682167053, + "learning_rate": 4.8335339016790346e-05, + "loss": 0.4392, "step": 19890 }, { - "epoch": 0.7, - "learning_rate": 4.843127776533247e-05, - "loss": 0.2985, + "epoch": 0.7170144520128302, + "grad_norm": 0.20326150953769684, + "learning_rate": 4.8334291824617905e-05, + "loss": 0.4335, "step": 19895 }, { - "epoch": 0.7, - "learning_rate": 4.843028439209494e-05, - "loss": 0.3125, + "epoch": 0.7171946516740548, + "grad_norm": 0.16170883178710938, + "learning_rate": 4.8333244314520254e-05, + "loss": 0.4238, "step": 19900 }, { - "epoch": 0.7, - "learning_rate": 4.8429290714630605e-05, - "loss": 0.3265, + "epoch": 0.7173748513352795, + "grad_norm": 0.17413537204265594, + "learning_rate": 4.833219648651166e-05, + "loss": 0.479, "step": 19905 }, { - "epoch": 0.7, - "learning_rate": 4.8428296732952376e-05, - "loss": 0.302, + "epoch": 0.7175550509965041, + "grad_norm": 0.17065051198005676, + "learning_rate": 4.8331148340606416e-05, + "loss": 0.4408, "step": 19910 }, { - "epoch": 0.7, - "learning_rate": 4.842730244707315e-05, - "loss": 0.3318, + "epoch": 0.7177352506577288, + "grad_norm": 0.22111043334007263, + "learning_rate": 4.833009987681878e-05, + "loss": 0.4456, "step": 19915 }, { - "epoch": 0.7, - "learning_rate": 4.8426307857005845e-05, - "loss": 0.2854, + "epoch": 0.7179154503189534, + "grad_norm": 0.14916540682315826, + "learning_rate": 4.832905109516306e-05, + "loss": 0.4503, "step": 19920 }, { - "epoch": 0.7, - "learning_rate": 4.842531296276337e-05, - "loss": 0.3413, + "epoch": 0.718095649980178, + "grad_norm": 0.16950784623622894, + "learning_rate": 4.832800199565353e-05, + "loss": 0.4433, "step": 19925 }, { - "epoch": 0.7, - "learning_rate": 4.8424317764358645e-05, - "loss": 0.3107, + 
"epoch": 0.7182758496414027, + "grad_norm": 0.15380850434303284, + "learning_rate": 4.8326952578304496e-05, + "loss": 0.4298, "step": 19930 }, { - "epoch": 0.7, - "learning_rate": 4.842332226180459e-05, - "loss": 0.2836, + "epoch": 0.7184560493026273, + "grad_norm": 0.16172832250595093, + "learning_rate": 4.832590284313024e-05, + "loss": 0.4582, "step": 19935 }, { - "epoch": 0.7, - "learning_rate": 4.842232645511413e-05, - "loss": 0.3137, + "epoch": 0.7186362489638519, + "grad_norm": 0.17244853079319, + "learning_rate": 4.832485279014508e-05, + "loss": 0.471, "step": 19940 }, { - "epoch": 0.7, - "learning_rate": 4.8421330344300196e-05, - "loss": 0.3102, + "epoch": 0.7188164486250765, + "grad_norm": 0.14485011994838715, + "learning_rate": 4.832380241936332e-05, + "loss": 0.4233, "step": 19945 }, { - "epoch": 0.7, - "learning_rate": 4.842033392937573e-05, - "loss": 0.3285, + "epoch": 0.7189966482863013, + "grad_norm": 0.1884828507900238, + "learning_rate": 4.832275173079926e-05, + "loss": 0.4718, "step": 19950 }, { - "epoch": 0.7, - "learning_rate": 4.841933721035365e-05, - "loss": 0.3212, + "epoch": 0.7191768479475259, + "grad_norm": 0.17681987583637238, + "learning_rate": 4.832170072446723e-05, + "loss": 0.4592, "step": 19955 }, { - "epoch": 0.7, - "learning_rate": 4.8418340187246926e-05, - "loss": 0.2831, + "epoch": 0.7193570476087505, + "grad_norm": 0.1624554544687271, + "learning_rate": 4.832064940038154e-05, + "loss": 0.4429, "step": 19960 }, { - "epoch": 0.7, - "learning_rate": 4.8417342860068476e-05, - "loss": 0.3049, + "epoch": 0.7195372472699751, + "grad_norm": 0.19783754646778107, + "learning_rate": 4.831959775855651e-05, + "loss": 0.4366, "step": 19965 }, { - "epoch": 0.7, - "learning_rate": 4.8416345228831266e-05, - "loss": 0.3054, + "epoch": 0.7197174469311998, + "grad_norm": 0.15661893784999847, + "learning_rate": 4.831854579900649e-05, + "loss": 0.4336, "step": 19970 }, { - "epoch": 0.7, - "learning_rate": 4.8415347293548245e-05, - "loss": 0.2954, + "epoch": 0.7198976465924244, + "grad_norm": 0.1397693157196045, + "learning_rate": 4.831749352174578e-05, + "loss": 0.427, "step": 19975 }, { - "epoch": 0.7, - "learning_rate": 4.841434905423237e-05, - "loss": 0.2984, + "epoch": 0.720077846253649, + "grad_norm": 0.175705686211586, + "learning_rate": 4.831644092678875e-05, + "loss": 0.4108, "step": 19980 }, { - "epoch": 0.7, - "learning_rate": 4.8413350510896605e-05, - "loss": 0.3073, + "epoch": 0.7202580459148736, + "grad_norm": 0.1558927595615387, + "learning_rate": 4.831538801414972e-05, + "loss": 0.4495, "step": 19985 }, { - "epoch": 0.7, - "learning_rate": 4.841235166355391e-05, - "loss": 0.3117, + "epoch": 0.7204382455760984, + "grad_norm": 0.17159906029701233, + "learning_rate": 4.831433478384304e-05, + "loss": 0.4014, "step": 19990 }, { - "epoch": 0.7, - "learning_rate": 4.841135251221726e-05, - "loss": 0.317, + "epoch": 0.720618445237323, + "grad_norm": 0.13353735208511353, + "learning_rate": 4.831328123588307e-05, + "loss": 0.3997, "step": 19995 }, { - "epoch": 0.7, - "learning_rate": 4.8410353056899626e-05, - "loss": 0.3122, + "epoch": 0.7207986448985476, + "grad_norm": 0.1744467169046402, + "learning_rate": 4.8312227370284155e-05, + "loss": 0.4517, "step": 20000 }, { - "epoch": 0.7, - "eval_loss": 0.3004976212978363, - "eval_runtime": 10.5473, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 0.7207986448985476, + "eval_loss": 0.46302729845046997, + "eval_runtime": 3.5385, + "eval_samples_per_second": 28.261, + "eval_steps_per_second": 
7.065, "step": 20000 }, { - "epoch": 0.7, - "learning_rate": 4.8409353297613994e-05, - "loss": 0.3367, + "epoch": 0.7209788445597722, + "grad_norm": 0.14821864664554596, + "learning_rate": 4.831117318706065e-05, + "loss": 0.4504, "step": 20005 }, { - "epoch": 0.7, - "learning_rate": 4.840835323437332e-05, - "loss": 0.2945, + "epoch": 0.7211590442209969, + "grad_norm": 0.16435876488685608, + "learning_rate": 4.8310118686226926e-05, + "loss": 0.4398, "step": 20010 }, { - "epoch": 0.7, - "learning_rate": 4.840735286719061e-05, - "loss": 0.2952, + "epoch": 0.7213392438822215, + "grad_norm": 0.17198477685451508, + "learning_rate": 4.830906386779735e-05, + "loss": 0.4684, "step": 20015 }, { - "epoch": 0.7, - "learning_rate": 4.8406352196078864e-05, - "loss": 0.2975, + "epoch": 0.7215194435434461, + "grad_norm": 0.1611550748348236, + "learning_rate": 4.830800873178629e-05, + "loss": 0.4502, "step": 20020 }, { - "epoch": 0.7, - "learning_rate": 4.8405351221051044e-05, - "loss": 0.3112, + "epoch": 0.7216996432046707, + "grad_norm": 0.18277272582054138, + "learning_rate": 4.830695327820812e-05, + "loss": 0.4442, "step": 20025 }, { - "epoch": 0.7, - "learning_rate": 4.840434994212017e-05, - "loss": 0.2915, + "epoch": 0.7218798428658955, + "grad_norm": 0.21301540732383728, + "learning_rate": 4.8305897507077236e-05, + "loss": 0.4286, "step": 20030 }, { - "epoch": 0.7, - "learning_rate": 4.8403348359299236e-05, - "loss": 0.3102, + "epoch": 0.7220600425271201, + "grad_norm": 0.19387184083461761, + "learning_rate": 4.8304841418407995e-05, + "loss": 0.4271, "step": 20035 }, { - "epoch": 0.71, - "learning_rate": 4.840234647260125e-05, - "loss": 0.3047, + "epoch": 0.7222402421883447, + "grad_norm": 0.18802416324615479, + "learning_rate": 4.8303785012214814e-05, + "loss": 0.4276, "step": 20040 }, { - "epoch": 0.71, - "learning_rate": 4.8401344282039204e-05, - "loss": 0.3044, + "epoch": 0.7224204418495693, + "grad_norm": 0.18947114050388336, + "learning_rate": 4.8302728288512064e-05, + "loss": 0.4585, "step": 20045 }, { - "epoch": 0.71, - "learning_rate": 4.840034178762613e-05, - "loss": 0.282, + "epoch": 0.722600641510794, + "grad_norm": 0.18048980832099915, + "learning_rate": 4.8301671247314165e-05, + "loss": 0.4562, "step": 20050 }, { - "epoch": 0.71, - "learning_rate": 4.839933898937505e-05, - "loss": 0.3284, + "epoch": 0.7227808411720186, + "grad_norm": 0.16718988120555878, + "learning_rate": 4.830061388863549e-05, + "loss": 0.4233, "step": 20055 }, { - "epoch": 0.71, - "learning_rate": 4.8398335887298965e-05, - "loss": 0.2998, + "epoch": 0.7229610408332432, + "grad_norm": 0.17260010540485382, + "learning_rate": 4.8299556212490474e-05, + "loss": 0.3985, "step": 20060 }, { - "epoch": 0.71, - "learning_rate": 4.839733248141092e-05, - "loss": 0.3012, + "epoch": 0.7231412404944678, + "grad_norm": 0.1411719173192978, + "learning_rate": 4.829849821889352e-05, + "loss": 0.4143, "step": 20065 }, { - "epoch": 0.71, - "learning_rate": 4.839632877172392e-05, - "loss": 0.291, + "epoch": 0.7233214401556926, + "grad_norm": 0.19284233450889587, + "learning_rate": 4.829743990785903e-05, + "loss": 0.4585, "step": 20070 }, { - "epoch": 0.71, - "learning_rate": 4.839532475825101e-05, - "loss": 0.3247, + "epoch": 0.7235016398169172, + "grad_norm": 0.13794685900211334, + "learning_rate": 4.829638127940143e-05, + "loss": 0.4115, "step": 20075 }, { - "epoch": 0.71, - "learning_rate": 4.839432044100523e-05, - "loss": 0.3074, + "epoch": 0.7236818394781418, + "grad_norm": 0.17087949812412262, + "learning_rate": 
4.8295322333535146e-05, + "loss": 0.4631, "step": 20080 }, { - "epoch": 0.71, - "learning_rate": 4.839331581999962e-05, - "loss": 0.2962, + "epoch": 0.7238620391393664, + "grad_norm": 0.18118272721767426, + "learning_rate": 4.829426307027461e-05, + "loss": 0.4688, "step": 20085 }, { - "epoch": 0.71, - "learning_rate": 4.839231089524721e-05, - "loss": 0.3216, + "epoch": 0.724042238800591, + "grad_norm": 0.1501915603876114, + "learning_rate": 4.829320348963425e-05, + "loss": 0.4322, "step": 20090 }, { - "epoch": 0.71, - "learning_rate": 4.8391305666761065e-05, - "loss": 0.3007, + "epoch": 0.7242224384618157, + "grad_norm": 0.1526506394147873, + "learning_rate": 4.8292143591628494e-05, + "loss": 0.4439, "step": 20095 }, { - "epoch": 0.71, - "learning_rate": 4.8390300134554236e-05, - "loss": 0.2944, + "epoch": 0.7244026381230403, + "grad_norm": 0.16467134654521942, + "learning_rate": 4.82910833762718e-05, + "loss": 0.4476, "step": 20100 }, { - "epoch": 0.71, - "learning_rate": 4.8389294298639766e-05, - "loss": 0.2958, + "epoch": 0.7245828377842649, + "grad_norm": 0.199048712849617, + "learning_rate": 4.82900228435786e-05, + "loss": 0.4174, "step": 20105 }, { - "epoch": 0.71, - "learning_rate": 4.8388288159030727e-05, - "loss": 0.3113, + "epoch": 0.7247630374454896, + "grad_norm": 0.22329410910606384, + "learning_rate": 4.828896199356335e-05, + "loss": 0.4558, "step": 20110 }, { - "epoch": 0.71, - "learning_rate": 4.838728171574018e-05, - "loss": 0.2786, + "epoch": 0.7249432371067143, + "grad_norm": 0.15609164535999298, + "learning_rate": 4.82879008262405e-05, + "loss": 0.4342, "step": 20115 }, { - "epoch": 0.71, - "learning_rate": 4.83862749687812e-05, - "loss": 0.3319, + "epoch": 0.7251234367679389, + "grad_norm": 0.15675166249275208, + "learning_rate": 4.8286839341624515e-05, + "loss": 0.4524, "step": 20120 }, { - "epoch": 0.71, - "learning_rate": 4.8385267918166844e-05, - "loss": 0.2875, + "epoch": 0.7253036364291635, + "grad_norm": 0.16207316517829895, + "learning_rate": 4.828577753972984e-05, + "loss": 0.4452, "step": 20125 }, { - "epoch": 0.71, - "learning_rate": 4.838426056391019e-05, - "loss": 0.32, + "epoch": 0.7254838360903881, + "grad_norm": 0.15296733379364014, + "learning_rate": 4.8284715420570964e-05, + "loss": 0.3956, "step": 20130 }, { - "epoch": 0.71, - "learning_rate": 4.838325290602434e-05, - "loss": 0.3277, + "epoch": 0.7256640357516128, + "grad_norm": 0.16610366106033325, + "learning_rate": 4.8283652984162345e-05, + "loss": 0.4279, "step": 20135 }, { - "epoch": 0.71, - "learning_rate": 4.8382244944522346e-05, - "loss": 0.3221, + "epoch": 0.7258442354128374, + "grad_norm": 0.1927533745765686, + "learning_rate": 4.828259023051847e-05, + "loss": 0.4534, "step": 20140 }, { - "epoch": 0.71, - "learning_rate": 4.838123667941731e-05, - "loss": 0.3474, + "epoch": 0.7260244350740621, + "grad_norm": 0.13150177896022797, + "learning_rate": 4.82815271596538e-05, + "loss": 0.4348, "step": 20145 }, { - "epoch": 0.71, - "learning_rate": 4.8380228110722336e-05, - "loss": 0.2811, + "epoch": 0.7262046347352867, + "grad_norm": 0.15792779624462128, + "learning_rate": 4.8280463771582835e-05, + "loss": 0.4647, "step": 20150 }, { - "epoch": 0.71, - "learning_rate": 4.83792192384505e-05, - "loss": 0.284, + "epoch": 0.7263848343965114, + "grad_norm": 0.16151747107505798, + "learning_rate": 4.8279400066320055e-05, + "loss": 0.4718, "step": 20155 }, { - "epoch": 0.71, - "learning_rate": 4.8378210062614916e-05, - "loss": 0.295, + "epoch": 0.726565034057736, + "grad_norm": 0.14716650545597076, + 
"learning_rate": 4.827833604387996e-05, + "loss": 0.4125, "step": 20160 }, { - "epoch": 0.71, - "learning_rate": 4.837720058322868e-05, - "loss": 0.3091, + "epoch": 0.7267452337189606, + "grad_norm": 0.16372278332710266, + "learning_rate": 4.827727170427704e-05, + "loss": 0.4419, "step": 20165 }, { - "epoch": 0.71, - "learning_rate": 4.8376190800304896e-05, - "loss": 0.3085, + "epoch": 0.7269254333801852, + "grad_norm": 0.15306620299816132, + "learning_rate": 4.82762070475258e-05, + "loss": 0.4368, "step": 20170 }, { - "epoch": 0.71, - "learning_rate": 4.8375180713856684e-05, - "loss": 0.3105, + "epoch": 0.7271056330414098, + "grad_norm": 0.13666532933712006, + "learning_rate": 4.827514207364075e-05, + "loss": 0.4406, "step": 20175 }, { - "epoch": 0.71, - "learning_rate": 4.837417032389715e-05, - "loss": 0.3035, + "epoch": 0.7272858327026345, + "grad_norm": 0.19249454140663147, + "learning_rate": 4.8274076782636393e-05, + "loss": 0.4606, "step": 20180 }, { - "epoch": 0.71, - "learning_rate": 4.837315963043943e-05, - "loss": 0.2796, + "epoch": 0.7274660323638592, + "grad_norm": 0.1536799520254135, + "learning_rate": 4.827301117452725e-05, + "loss": 0.4276, "step": 20185 }, { - "epoch": 0.71, - "learning_rate": 4.8372148633496614e-05, - "loss": 0.299, + "epoch": 0.7276462320250838, + "grad_norm": 0.15559114515781403, + "learning_rate": 4.8271945249327825e-05, + "loss": 0.4438, "step": 20190 }, { - "epoch": 0.71, - "learning_rate": 4.8371137333081866e-05, - "loss": 0.3032, + "epoch": 0.7278264316863085, + "grad_norm": 0.15832901000976562, + "learning_rate": 4.827087900705266e-05, + "loss": 0.4691, "step": 20195 }, { - "epoch": 0.71, - "learning_rate": 4.837012572920831e-05, - "loss": 0.33, + "epoch": 0.7280066313475331, + "grad_norm": 0.20250186324119568, + "learning_rate": 4.826981244771627e-05, + "loss": 0.4298, "step": 20200 }, { - "epoch": 0.71, - "learning_rate": 4.836911382188906e-05, - "loss": 0.2876, + "epoch": 0.7281868310087577, + "grad_norm": 0.17052537202835083, + "learning_rate": 4.826874557133319e-05, + "loss": 0.4224, "step": 20205 }, { - "epoch": 0.71, - "learning_rate": 4.8368101611137265e-05, - "loss": 0.3226, + "epoch": 0.7283670306699823, + "grad_norm": 0.15635935962200165, + "learning_rate": 4.826767837791796e-05, + "loss": 0.4561, "step": 20210 }, { - "epoch": 0.71, - "learning_rate": 4.836708909696608e-05, - "loss": 0.3189, + "epoch": 0.7285472303312069, + "grad_norm": 0.15289193391799927, + "learning_rate": 4.826661086748512e-05, + "loss": 0.4283, "step": 20215 }, { - "epoch": 0.71, - "learning_rate": 4.836607627938864e-05, - "loss": 0.3081, + "epoch": 0.7287274299924316, + "grad_norm": 0.16655376553535461, + "learning_rate": 4.82655430400492e-05, + "loss": 0.4535, "step": 20220 }, { - "epoch": 0.71, - "learning_rate": 4.836506315841809e-05, - "loss": 0.307, + "epoch": 0.7289076296536563, + "grad_norm": 0.18925489485263824, + "learning_rate": 4.826447489562477e-05, + "loss": 0.4096, "step": 20225 }, { - "epoch": 0.71, - "learning_rate": 4.8364049734067606e-05, - "loss": 0.3143, + "epoch": 0.7290878293148809, + "grad_norm": 0.15604831278324127, + "learning_rate": 4.826340643422637e-05, + "loss": 0.4379, "step": 20230 }, { - "epoch": 0.71, - "learning_rate": 4.8363036006350324e-05, - "loss": 0.3044, + "epoch": 0.7292680289761055, + "grad_norm": 0.16162985563278198, + "learning_rate": 4.826233765586856e-05, + "loss": 0.4017, "step": 20235 }, { - "epoch": 0.71, - "learning_rate": 4.836202197527943e-05, - "loss": 0.3096, + "epoch": 0.7294482286373302, + "grad_norm": 
0.17815397679805756, + "learning_rate": 4.826126856056591e-05, + "loss": 0.4229, "step": 20240 }, { - "epoch": 0.71, - "learning_rate": 4.836100764086806e-05, - "loss": 0.2831, + "epoch": 0.7296284282985548, + "grad_norm": 0.17258299887180328, + "learning_rate": 4.826019914833297e-05, + "loss": 0.4471, "step": 20245 }, { - "epoch": 0.71, - "learning_rate": 4.835999300312941e-05, - "loss": 0.288, + "epoch": 0.7298086279597794, + "grad_norm": 0.1543063521385193, + "learning_rate": 4.8259129419184326e-05, + "loss": 0.4246, "step": 20250 }, { - "epoch": 0.71, - "learning_rate": 4.835897806207664e-05, - "loss": 0.3392, + "epoch": 0.729988827621004, + "grad_norm": 0.2001371681690216, + "learning_rate": 4.8258059373134546e-05, + "loss": 0.437, "step": 20255 }, { - "epoch": 0.71, - "learning_rate": 4.835796281772295e-05, - "loss": 0.3071, + "epoch": 0.7301690272822287, + "grad_norm": 0.16070033609867096, + "learning_rate": 4.8256989010198215e-05, + "loss": 0.4449, "step": 20260 }, { - "epoch": 0.71, - "learning_rate": 4.835694727008149e-05, - "loss": 0.3295, + "epoch": 0.7303492269434534, + "grad_norm": 0.16648970544338226, + "learning_rate": 4.8255918330389906e-05, + "loss": 0.4399, "step": 20265 }, { - "epoch": 0.71, - "learning_rate": 4.835593141916546e-05, - "loss": 0.3131, + "epoch": 0.730529426604678, + "grad_norm": 0.2026122361421585, + "learning_rate": 4.8254847333724204e-05, + "loss": 0.446, "step": 20270 }, { - "epoch": 0.71, - "learning_rate": 4.8354915264988065e-05, - "loss": 0.3186, + "epoch": 0.7307096262659026, + "grad_norm": 0.1755245178937912, + "learning_rate": 4.8253776020215725e-05, + "loss": 0.4188, "step": 20275 }, { - "epoch": 0.71, - "learning_rate": 4.835389880756248e-05, - "loss": 0.3357, + "epoch": 0.7308898259271273, + "grad_norm": 0.15573327243328094, + "learning_rate": 4.825270438987904e-05, + "loss": 0.4301, "step": 20280 }, { - "epoch": 0.71, - "learning_rate": 4.8352882046901915e-05, - "loss": 0.291, + "epoch": 0.7310700255883519, + "grad_norm": 0.17587211728096008, + "learning_rate": 4.825163244272876e-05, + "loss": 0.4094, "step": 20285 }, { - "epoch": 0.71, - "learning_rate": 4.8351864983019556e-05, - "loss": 0.3085, + "epoch": 0.7312502252495765, + "grad_norm": 0.15777091681957245, + "learning_rate": 4.825056017877949e-05, + "loss": 0.446, "step": 20290 }, { - "epoch": 0.71, - "learning_rate": 4.835084761592863e-05, - "loss": 0.3044, + "epoch": 0.7314304249108011, + "grad_norm": 0.1694624423980713, + "learning_rate": 4.824948759804584e-05, + "loss": 0.4262, "step": 20295 }, { - "epoch": 0.71, - "learning_rate": 4.8349829945642336e-05, - "loss": 0.3227, + "epoch": 0.7316106245720259, + "grad_norm": 0.15097464621067047, + "learning_rate": 4.824841470054242e-05, + "loss": 0.4455, "step": 20300 }, { - "epoch": 0.71, - "learning_rate": 4.8348811972173877e-05, - "loss": 0.3336, + "epoch": 0.7317908242332505, + "grad_norm": 0.1695692390203476, + "learning_rate": 4.824734148628386e-05, + "loss": 0.4678, "step": 20305 }, { - "epoch": 0.71, - "learning_rate": 4.834779369553649e-05, - "loss": 0.286, + "epoch": 0.7319710238944751, + "grad_norm": 0.18447819352149963, + "learning_rate": 4.824626795528476e-05, + "loss": 0.4814, "step": 20310 }, { - "epoch": 0.71, - "learning_rate": 4.8346775115743394e-05, - "loss": 0.2999, + "epoch": 0.7321512235556997, + "grad_norm": 0.15823550522327423, + "learning_rate": 4.8245194107559774e-05, + "loss": 0.4266, "step": 20315 }, { - "epoch": 0.71, - "learning_rate": 4.8345756232807805e-05, - "loss": 0.3127, + "epoch": 0.7323314232169243, + 
"grad_norm": 0.17037831246852875, + "learning_rate": 4.824411994312351e-05, + "loss": 0.4264, "step": 20320 }, { - "epoch": 0.72, - "learning_rate": 4.834473704674296e-05, - "loss": 0.3193, + "epoch": 0.732511622878149, + "grad_norm": 0.17536459863185883, + "learning_rate": 4.824304546199061e-05, + "loss": 0.4311, "step": 20325 }, { - "epoch": 0.72, - "learning_rate": 4.8343717557562076e-05, - "loss": 0.3123, + "epoch": 0.7326918225393736, + "grad_norm": 0.1618126928806305, + "learning_rate": 4.824197066417572e-05, + "loss": 0.4535, "step": 20330 }, { - "epoch": 0.72, - "learning_rate": 4.8342697765278424e-05, - "loss": 0.2938, + "epoch": 0.7328720222005982, + "grad_norm": 0.19258461892604828, + "learning_rate": 4.824089554969348e-05, + "loss": 0.4267, "step": 20335 }, { - "epoch": 0.72, - "learning_rate": 4.834167766990521e-05, - "loss": 0.3127, + "epoch": 0.733052221861823, + "grad_norm": 0.19731229543685913, + "learning_rate": 4.823982011855854e-05, + "loss": 0.4058, "step": 20340 }, { - "epoch": 0.72, - "learning_rate": 4.8340657271455694e-05, - "loss": 0.3214, + "epoch": 0.7332324215230476, + "grad_norm": 0.1781267374753952, + "learning_rate": 4.8238744370785545e-05, + "loss": 0.4357, "step": 20345 }, { - "epoch": 0.72, - "learning_rate": 4.833963656994313e-05, - "loss": 0.325, + "epoch": 0.7334126211842722, + "grad_norm": 0.11069954186677933, + "learning_rate": 4.823766830638916e-05, + "loss": 0.3899, "step": 20350 }, { - "epoch": 0.72, - "learning_rate": 4.833861556538076e-05, - "loss": 0.3356, + "epoch": 0.7335928208454968, + "grad_norm": 0.19684797525405884, + "learning_rate": 4.823659192538404e-05, + "loss": 0.4537, "step": 20355 }, { - "epoch": 0.72, - "learning_rate": 4.833759425778186e-05, - "loss": 0.3004, + "epoch": 0.7337730205067214, + "grad_norm": 0.13966168463230133, + "learning_rate": 4.8235515227784856e-05, + "loss": 0.407, "step": 20360 }, { - "epoch": 0.72, - "learning_rate": 4.833657264715967e-05, - "loss": 0.3253, + "epoch": 0.7339532201679461, + "grad_norm": 0.17704619467258453, + "learning_rate": 4.823443821360627e-05, + "loss": 0.4499, "step": 20365 }, { - "epoch": 0.72, - "learning_rate": 4.833555073352747e-05, - "loss": 0.3046, + "epoch": 0.7341334198291707, + "grad_norm": 0.16276319324970245, + "learning_rate": 4.8233360882862965e-05, + "loss": 0.4285, "step": 20370 }, { - "epoch": 0.72, - "learning_rate": 4.8334528516898515e-05, - "loss": 0.3052, + "epoch": 0.7343136194903953, + "grad_norm": 0.1888384073972702, + "learning_rate": 4.823228323556962e-05, + "loss": 0.4563, "step": 20375 }, { - "epoch": 0.72, - "learning_rate": 4.833350599728609e-05, - "loss": 0.3069, + "epoch": 0.73449381915162, + "grad_norm": 0.17120252549648285, + "learning_rate": 4.8231205271740916e-05, + "loss": 0.4775, "step": 20380 }, { - "epoch": 0.72, - "learning_rate": 4.8332483174703465e-05, - "loss": 0.2918, + "epoch": 0.7346740188128447, + "grad_norm": 0.1270987093448639, + "learning_rate": 4.8230126991391534e-05, + "loss": 0.4342, "step": 20385 }, { - "epoch": 0.72, - "learning_rate": 4.833146004916392e-05, - "loss": 0.2913, + "epoch": 0.7348542184740693, + "grad_norm": 0.16976775228977203, + "learning_rate": 4.822904839453617e-05, + "loss": 0.4159, "step": 20390 }, { - "epoch": 0.72, - "learning_rate": 4.8330436620680745e-05, - "loss": 0.2978, + "epoch": 0.7350344181352939, + "grad_norm": 0.1889120191335678, + "learning_rate": 4.822796948118952e-05, + "loss": 0.4339, "step": 20395 }, { - "epoch": 0.72, - "learning_rate": 4.832941288926723e-05, - "loss": 0.3143, + "epoch": 
0.7352146177965185, + "grad_norm": 0.18324951827526093, + "learning_rate": 4.822689025136627e-05, + "loss": 0.4625, "step": 20400 }, { - "epoch": 0.72, - "learning_rate": 4.8328388854936656e-05, - "loss": 0.288, + "epoch": 0.7353948174577432, + "grad_norm": 0.19597896933555603, + "learning_rate": 4.822581070508115e-05, + "loss": 0.4828, "step": 20405 }, { - "epoch": 0.72, - "learning_rate": 4.832736451770233e-05, - "loss": 0.302, + "epoch": 0.7355750171189678, + "grad_norm": 0.20112694799900055, + "learning_rate": 4.8224730842348856e-05, + "loss": 0.4774, "step": 20410 }, { - "epoch": 0.72, - "learning_rate": 4.832633987757755e-05, - "loss": 0.3215, + "epoch": 0.7357552167801924, + "grad_norm": 0.16758687794208527, + "learning_rate": 4.8223650663184094e-05, + "loss": 0.4098, "step": 20415 }, { - "epoch": 0.72, - "learning_rate": 4.832531493457561e-05, - "loss": 0.3222, + "epoch": 0.7359354164414171, + "grad_norm": 0.16736017167568207, + "learning_rate": 4.82225701676016e-05, + "loss": 0.452, "step": 20420 }, { - "epoch": 0.72, - "learning_rate": 4.832428968870984e-05, - "loss": 0.3041, + "epoch": 0.7361156161026418, + "grad_norm": 0.20359498262405396, + "learning_rate": 4.822148935561607e-05, + "loss": 0.419, "step": 20425 }, { - "epoch": 0.72, - "learning_rate": 4.832326413999354e-05, - "loss": 0.3167, + "epoch": 0.7362958157638664, + "grad_norm": 0.17675939202308655, + "learning_rate": 4.8220408227242255e-05, + "loss": 0.4267, "step": 20430 }, { - "epoch": 0.72, - "learning_rate": 4.832223828844003e-05, - "loss": 0.288, + "epoch": 0.736476015425091, + "grad_norm": 0.18879984319210052, + "learning_rate": 4.821932678249487e-05, + "loss": 0.4493, "step": 20435 }, { - "epoch": 0.72, - "learning_rate": 4.832121213406261e-05, - "loss": 0.3283, + "epoch": 0.7366562150863156, + "grad_norm": 0.17970100045204163, + "learning_rate": 4.821824502138864e-05, + "loss": 0.4387, "step": 20440 }, { - "epoch": 0.72, - "learning_rate": 4.832018567687463e-05, - "loss": 0.3207, + "epoch": 0.7368364147475402, + "grad_norm": 0.16193163394927979, + "learning_rate": 4.8217162943938333e-05, + "loss": 0.418, "step": 20445 }, { - "epoch": 0.72, - "learning_rate": 4.83191589168894e-05, - "loss": 0.3025, + "epoch": 0.7370166144087649, + "grad_norm": 0.14100541174411774, + "learning_rate": 4.821608055015867e-05, + "loss": 0.4138, "step": 20450 }, { - "epoch": 0.72, - "learning_rate": 4.831813185412026e-05, - "loss": 0.2808, + "epoch": 0.7371968140699896, + "grad_norm": 0.16195495426654816, + "learning_rate": 4.8214997840064404e-05, + "loss": 0.4636, "step": 20455 }, { - "epoch": 0.72, - "learning_rate": 4.8317104488580553e-05, - "loss": 0.3145, + "epoch": 0.7373770137312142, + "grad_norm": 0.15169329941272736, + "learning_rate": 4.821391481367029e-05, + "loss": 0.4275, "step": 20460 }, { - "epoch": 0.72, - "learning_rate": 4.8316076820283604e-05, - "loss": 0.2893, + "epoch": 0.7375572133924388, + "grad_norm": 0.14933131635189056, + "learning_rate": 4.8212831470991076e-05, + "loss": 0.4403, "step": 20465 }, { - "epoch": 0.72, - "learning_rate": 4.8315048849242774e-05, - "loss": 0.3034, + "epoch": 0.7377374130536635, + "grad_norm": 0.19773240387439728, + "learning_rate": 4.821174781204153e-05, + "loss": 0.4501, "step": 20470 }, { - "epoch": 0.72, - "learning_rate": 4.8314020575471386e-05, - "loss": 0.2834, + "epoch": 0.7379176127148881, + "grad_norm": 0.18996679782867432, + "learning_rate": 4.821066383683641e-05, + "loss": 0.4403, "step": 20475 }, { - "epoch": 0.72, - "learning_rate": 4.8312991998982816e-05, - "loss": 
0.3003, + "epoch": 0.7380978123761127, + "grad_norm": 0.17520780861377716, + "learning_rate": 4.82095795453905e-05, + "loss": 0.4348, "step": 20480 }, { - "epoch": 0.72, - "learning_rate": 4.83119631197904e-05, - "loss": 0.3166, + "epoch": 0.7382780120373373, + "grad_norm": 0.22540989518165588, + "learning_rate": 4.820849493771855e-05, + "loss": 0.4144, "step": 20485 }, { - "epoch": 0.72, - "learning_rate": 4.831093393790751e-05, - "loss": 0.2674, + "epoch": 0.738458211698562, + "grad_norm": 0.1550588756799698, + "learning_rate": 4.820741001383536e-05, + "loss": 0.3918, "step": 20490 }, { - "epoch": 0.72, - "learning_rate": 4.8309904453347504e-05, - "loss": 0.2891, + "epoch": 0.7386384113597867, + "grad_norm": 0.16206443309783936, + "learning_rate": 4.82063247737557e-05, + "loss": 0.4357, "step": 20495 }, { - "epoch": 0.72, - "learning_rate": 4.830887466612375e-05, - "loss": 0.2972, + "epoch": 0.7388186110210113, + "grad_norm": 0.20156899094581604, + "learning_rate": 4.820523921749435e-05, + "loss": 0.4141, "step": 20500 }, { - "epoch": 0.72, - "eval_loss": 0.3011039197444916, - "eval_runtime": 10.5518, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 0.7388186110210113, + "eval_loss": 0.4630433917045593, + "eval_runtime": 3.5384, + "eval_samples_per_second": 28.261, + "eval_steps_per_second": 7.065, "step": 20500 }, { - "epoch": 0.72, - "learning_rate": 4.8307844576249625e-05, - "loss": 0.2883, + "epoch": 0.7389988106822359, + "grad_norm": 0.15963950753211975, + "learning_rate": 4.820415334506611e-05, + "loss": 0.433, "step": 20505 }, { - "epoch": 0.72, - "learning_rate": 4.830681418373849e-05, - "loss": 0.2932, + "epoch": 0.7391790103434606, + "grad_norm": 0.14632678031921387, + "learning_rate": 4.8203067156485775e-05, + "loss": 0.401, "step": 20510 }, { - "epoch": 0.72, - "learning_rate": 4.830578348860374e-05, - "loss": 0.303, + "epoch": 0.7393592100046852, + "grad_norm": 0.13777978718280792, + "learning_rate": 4.820198065176814e-05, + "loss": 0.4334, "step": 20515 }, { - "epoch": 0.72, - "learning_rate": 4.830475249085875e-05, - "loss": 0.3148, + "epoch": 0.7395394096659098, + "grad_norm": 0.1814904361963272, + "learning_rate": 4.820089383092802e-05, + "loss": 0.4499, "step": 20520 }, { - "epoch": 0.72, - "learning_rate": 4.83037211905169e-05, - "loss": 0.291, + "epoch": 0.7397196093271344, + "grad_norm": 0.17688485980033875, + "learning_rate": 4.819980669398021e-05, + "loss": 0.4625, "step": 20525 }, { - "epoch": 0.72, - "learning_rate": 4.83026895875916e-05, - "loss": 0.2962, + "epoch": 0.739899808988359, + "grad_norm": 0.17957401275634766, + "learning_rate": 4.819871924093951e-05, + "loss": 0.4324, "step": 20530 }, { - "epoch": 0.72, - "learning_rate": 4.8301657682096226e-05, - "loss": 0.3105, + "epoch": 0.7400800086495838, + "grad_norm": 0.17410768568515778, + "learning_rate": 4.819763147182077e-05, + "loss": 0.4289, "step": 20535 }, { - "epoch": 0.72, - "learning_rate": 4.830062547404418e-05, - "loss": 0.2883, + "epoch": 0.7402602083108084, + "grad_norm": 0.15775148570537567, + "learning_rate": 4.819654338663879e-05, + "loss": 0.423, "step": 20540 }, { - "epoch": 0.72, - "learning_rate": 4.829959296344887e-05, - "loss": 0.2962, + "epoch": 0.740440407972033, + "grad_norm": 0.2120039314031601, + "learning_rate": 4.8195454985408394e-05, + "loss": 0.4377, "step": 20545 }, { - "epoch": 0.72, - "learning_rate": 4.82985601503237e-05, - "loss": 0.3058, + "epoch": 0.7406206076332577, + "grad_norm": 0.15215463936328888, + "learning_rate": 
4.8194366268144415e-05, + "loss": 0.4067, "step": 20550 }, { - "epoch": 0.72, - "learning_rate": 4.8297527034682086e-05, - "loss": 0.3153, + "epoch": 0.7408008072944823, + "grad_norm": 0.14415352046489716, + "learning_rate": 4.8193277234861686e-05, + "loss": 0.4525, "step": 20555 }, { - "epoch": 0.72, - "learning_rate": 4.8296493616537435e-05, - "loss": 0.2977, + "epoch": 0.7409810069557069, + "grad_norm": 0.18399272859096527, + "learning_rate": 4.819218788557505e-05, + "loss": 0.4216, "step": 20560 }, { - "epoch": 0.72, - "learning_rate": 4.8295459895903164e-05, - "loss": 0.3376, + "epoch": 0.7411612066169315, + "grad_norm": 0.20752954483032227, + "learning_rate": 4.819109822029933e-05, + "loss": 0.4236, "step": 20565 }, { - "epoch": 0.72, - "learning_rate": 4.82944258727927e-05, - "loss": 0.3026, + "epoch": 0.7413414062781561, + "grad_norm": 0.1981305330991745, + "learning_rate": 4.81900082390494e-05, + "loss": 0.4087, "step": 20570 }, { - "epoch": 0.72, - "learning_rate": 4.8293391547219465e-05, - "loss": 0.2868, + "epoch": 0.7415216059393809, + "grad_norm": 0.22662696242332458, + "learning_rate": 4.818891794184009e-05, + "loss": 0.4559, "step": 20575 }, { - "epoch": 0.72, - "learning_rate": 4.82923569191969e-05, - "loss": 0.3052, + "epoch": 0.7417018056006055, + "grad_norm": 0.16101478040218353, + "learning_rate": 4.818782732868627e-05, + "loss": 0.4777, "step": 20580 }, { - "epoch": 0.72, - "learning_rate": 4.829132198873842e-05, - "loss": 0.2877, + "epoch": 0.7418820052618301, + "grad_norm": 0.1838495433330536, + "learning_rate": 4.8186736399602784e-05, + "loss": 0.4384, "step": 20585 }, { - "epoch": 0.72, - "learning_rate": 4.829028675585748e-05, - "loss": 0.3021, + "epoch": 0.7420622049230547, + "grad_norm": 0.13098390400409698, + "learning_rate": 4.818564515460451e-05, + "loss": 0.4176, "step": 20590 }, { - "epoch": 0.72, - "learning_rate": 4.828925122056752e-05, - "loss": 0.2839, + "epoch": 0.7422424045842794, + "grad_norm": 0.18415838479995728, + "learning_rate": 4.818455359370631e-05, + "loss": 0.4396, "step": 20595 }, { - "epoch": 0.72, - "learning_rate": 4.8288215382881976e-05, - "loss": 0.3139, + "epoch": 0.742422604245504, + "grad_norm": 0.19082091748714447, + "learning_rate": 4.818346171692305e-05, + "loss": 0.4251, "step": 20600 }, { - "epoch": 0.72, - "learning_rate": 4.8287179242814303e-05, - "loss": 0.3174, + "epoch": 0.7426028039067286, + "grad_norm": 0.1550193876028061, + "learning_rate": 4.8182369524269616e-05, + "loss": 0.4532, "step": 20605 }, { - "epoch": 0.73, - "learning_rate": 4.828614280037795e-05, - "loss": 0.3124, + "epoch": 0.7427830035679532, + "grad_norm": 0.18546633422374725, + "learning_rate": 4.818127701576089e-05, + "loss": 0.4659, "step": 20610 }, { - "epoch": 0.73, - "learning_rate": 4.828510605558638e-05, - "loss": 0.3055, + "epoch": 0.742963203229178, + "grad_norm": 0.1380166858434677, + "learning_rate": 4.818018419141174e-05, + "loss": 0.4328, "step": 20615 }, { - "epoch": 0.73, - "learning_rate": 4.828406900845306e-05, - "loss": 0.3304, + "epoch": 0.7431434028904026, + "grad_norm": 0.15850889682769775, + "learning_rate": 4.817909105123708e-05, + "loss": 0.4383, "step": 20620 }, { - "epoch": 0.73, - "learning_rate": 4.8283031658991446e-05, - "loss": 0.2789, + "epoch": 0.7433236025516272, + "grad_norm": 0.14858169853687286, + "learning_rate": 4.817799759525179e-05, + "loss": 0.4636, "step": 20625 }, { - "epoch": 0.73, - "learning_rate": 4.8281994007215006e-05, - "loss": 0.3249, + "epoch": 0.7435038022128518, + "grad_norm": 0.15085402131080627, + 
"learning_rate": 4.8176903823470765e-05, + "loss": 0.4968, "step": 20630 }, { - "epoch": 0.73, - "learning_rate": 4.828095605313722e-05, - "loss": 0.2882, + "epoch": 0.7436840018740765, + "grad_norm": 0.16185277700424194, + "learning_rate": 4.817580973590892e-05, + "loss": 0.4329, "step": 20635 }, { - "epoch": 0.73, - "learning_rate": 4.827991779677157e-05, - "loss": 0.3259, + "epoch": 0.7438642015353011, + "grad_norm": 0.13464045524597168, + "learning_rate": 4.817471533258114e-05, + "loss": 0.4278, "step": 20640 }, { - "epoch": 0.73, - "learning_rate": 4.8278879238131514e-05, - "loss": 0.3139, + "epoch": 0.7440444011965257, + "grad_norm": 0.18652474880218506, + "learning_rate": 4.817362061350237e-05, + "loss": 0.4305, "step": 20645 }, { - "epoch": 0.73, - "learning_rate": 4.8277840377230565e-05, - "loss": 0.2858, + "epoch": 0.7442246008577504, + "grad_norm": 0.17949911952018738, + "learning_rate": 4.81725255786875e-05, + "loss": 0.4353, "step": 20650 }, { - "epoch": 0.73, - "learning_rate": 4.827680121408219e-05, - "loss": 0.3077, + "epoch": 0.7444048005189751, + "grad_norm": 0.1657877266407013, + "learning_rate": 4.817143022815145e-05, + "loss": 0.4464, "step": 20655 }, { - "epoch": 0.73, - "learning_rate": 4.82757617486999e-05, - "loss": 0.3209, + "epoch": 0.7445850001801997, + "grad_norm": 0.14313843846321106, + "learning_rate": 4.817033456190915e-05, + "loss": 0.4019, "step": 20660 }, { - "epoch": 0.73, - "learning_rate": 4.827472198109717e-05, - "loss": 0.295, + "epoch": 0.7447651998414243, + "grad_norm": 0.2136252373456955, + "learning_rate": 4.816923857997553e-05, + "loss": 0.4717, "step": 20665 }, { - "epoch": 0.73, - "learning_rate": 4.8273681911287525e-05, - "loss": 0.3056, + "epoch": 0.7449453995026489, + "grad_norm": 0.19905641674995422, + "learning_rate": 4.816814228236551e-05, + "loss": 0.4624, "step": 20670 }, { - "epoch": 0.73, - "learning_rate": 4.827264153928446e-05, - "loss": 0.319, + "epoch": 0.7451255991638736, + "grad_norm": 0.18663300573825836, + "learning_rate": 4.8167045669094044e-05, + "loss": 0.4294, "step": 20675 }, { - "epoch": 0.73, - "learning_rate": 4.8271600865101475e-05, - "loss": 0.317, + "epoch": 0.7453057988250982, + "grad_norm": 0.1515270322561264, + "learning_rate": 4.816594874017607e-05, + "loss": 0.4471, "step": 20680 }, { - "epoch": 0.73, - "learning_rate": 4.827055988875209e-05, - "loss": 0.3192, + "epoch": 0.7454859984863228, + "grad_norm": 0.15805648267269135, + "learning_rate": 4.8164851495626526e-05, + "loss": 0.4288, "step": 20685 }, { - "epoch": 0.73, - "learning_rate": 4.8269518610249825e-05, - "loss": 0.3178, + "epoch": 0.7456661981475475, + "grad_norm": 0.15768414735794067, + "learning_rate": 4.816375393546037e-05, + "loss": 0.4443, "step": 20690 }, { - "epoch": 0.73, - "learning_rate": 4.82684770296082e-05, - "loss": 0.3107, + "epoch": 0.7458463978087722, + "grad_norm": 0.15465152263641357, + "learning_rate": 4.8162656059692545e-05, + "loss": 0.4479, "step": 20695 }, { - "epoch": 0.73, - "learning_rate": 4.8267435146840724e-05, - "loss": 0.308, + "epoch": 0.7460265974699968, + "grad_norm": 0.17679835855960846, + "learning_rate": 4.816155786833802e-05, + "loss": 0.4345, "step": 20700 }, { - "epoch": 0.73, - "learning_rate": 4.826639296196094e-05, - "loss": 0.3015, + "epoch": 0.7462067971312214, + "grad_norm": 0.1588921993970871, + "learning_rate": 4.816045936141175e-05, + "loss": 0.4298, "step": 20705 }, { - "epoch": 0.73, - "learning_rate": 4.826535047498238e-05, - "loss": 0.3189, + "epoch": 0.746386996792446, + "grad_norm": 
0.18818655610084534, + "learning_rate": 4.815936053892871e-05, + "loss": 0.4402, "step": 20710 }, { - "epoch": 0.73, - "learning_rate": 4.826430768591858e-05, - "loss": 0.3386, + "epoch": 0.7465671964536706, + "grad_norm": 0.11379419267177582, + "learning_rate": 4.815826140090386e-05, + "loss": 0.4041, "step": 20715 }, { - "epoch": 0.73, - "learning_rate": 4.8263264594783063e-05, - "loss": 0.3106, + "epoch": 0.7467473961148953, + "grad_norm": 0.22116567194461823, + "learning_rate": 4.815716194735218e-05, + "loss": 0.4516, "step": 20720 }, { - "epoch": 0.73, - "learning_rate": 4.8262221201589394e-05, - "loss": 0.2998, + "epoch": 0.7469275957761199, + "grad_norm": 0.20635400712490082, + "learning_rate": 4.8156062178288666e-05, + "loss": 0.4372, "step": 20725 }, { - "epoch": 0.73, - "learning_rate": 4.826117750635112e-05, - "loss": 0.3047, + "epoch": 0.7471077954373446, + "grad_norm": 0.19008758664131165, + "learning_rate": 4.815496209372827e-05, + "loss": 0.4649, "step": 20730 }, { - "epoch": 0.73, - "learning_rate": 4.8260133509081776e-05, - "loss": 0.3295, + "epoch": 0.7472879950985692, + "grad_norm": 0.1746254712343216, + "learning_rate": 4.815386169368601e-05, + "loss": 0.454, "step": 20735 }, { - "epoch": 0.73, - "learning_rate": 4.8259089209794926e-05, - "loss": 0.2965, + "epoch": 0.7474681947597939, + "grad_norm": 0.16542162001132965, + "learning_rate": 4.8152760978176864e-05, + "loss": 0.4185, "step": 20740 }, { - "epoch": 0.73, - "learning_rate": 4.825804460850414e-05, - "loss": 0.3049, + "epoch": 0.7476483944210185, + "grad_norm": 0.17639301717281342, + "learning_rate": 4.815165994721583e-05, + "loss": 0.4053, "step": 20745 }, { - "epoch": 0.73, - "learning_rate": 4.825699970522297e-05, - "loss": 0.3185, + "epoch": 0.7478285940822431, + "grad_norm": 0.13610634207725525, + "learning_rate": 4.8150558600817916e-05, + "loss": 0.4214, "step": 20750 }, { - "epoch": 0.73, - "learning_rate": 4.825595449996498e-05, - "loss": 0.3247, + "epoch": 0.7480087937434677, + "grad_norm": 0.17135217785835266, + "learning_rate": 4.814945693899812e-05, + "loss": 0.4546, "step": 20755 }, { - "epoch": 0.73, - "learning_rate": 4.8254908992743756e-05, - "loss": 0.2992, + "epoch": 0.7481889934046924, + "grad_norm": 0.18286027014255524, + "learning_rate": 4.8148354961771457e-05, + "loss": 0.4598, "step": 20760 }, { - "epoch": 0.73, - "learning_rate": 4.825386318357285e-05, - "loss": 0.2991, + "epoch": 0.748369193065917, + "grad_norm": 0.17390398681163788, + "learning_rate": 4.814725266915294e-05, + "loss": 0.4174, "step": 20765 }, { - "epoch": 0.73, - "learning_rate": 4.8252817072465864e-05, - "loss": 0.3054, + "epoch": 0.7485493927271417, + "grad_norm": 0.18580150604248047, + "learning_rate": 4.814615006115759e-05, + "loss": 0.4566, "step": 20770 }, { - "epoch": 0.73, - "learning_rate": 4.8251770659436375e-05, - "loss": 0.3022, + "epoch": 0.7487295923883663, + "grad_norm": 0.1448039710521698, + "learning_rate": 4.814504713780041e-05, + "loss": 0.4102, "step": 20775 }, { - "epoch": 0.73, - "learning_rate": 4.8250723944497964e-05, - "loss": 0.3082, + "epoch": 0.748909792049591, + "grad_norm": 0.14498092234134674, + "learning_rate": 4.814394389909647e-05, + "loss": 0.4198, "step": 20780 }, { - "epoch": 0.73, - "learning_rate": 4.824988635518195e-05, - "loss": 0.2986, + "epoch": 0.7490899917108156, + "grad_norm": 0.14252233505249023, + "learning_rate": 4.814284034506076e-05, + "loss": 0.4443, "step": 20785 }, { - "epoch": 0.73, - "learning_rate": 4.8248839096841736e-05, - "loss": 0.2889, + "epoch": 
0.7492701913720402, + "grad_norm": 0.15442687273025513, + "learning_rate": 4.8141736475708325e-05, + "loss": 0.4487, "step": 20790 }, { - "epoch": 0.73, - "learning_rate": 4.8247791536630674e-05, - "loss": 0.3032, + "epoch": 0.7494503910332648, + "grad_norm": 0.15849895775318146, + "learning_rate": 4.814063229105422e-05, + "loss": 0.418, "step": 20795 }, { - "epoch": 0.73, - "learning_rate": 4.824674367456235e-05, - "loss": 0.3027, + "epoch": 0.7496305906944895, + "grad_norm": 0.16628096997737885, + "learning_rate": 4.813952779111348e-05, + "loss": 0.4183, "step": 20800 }, { - "epoch": 0.73, - "learning_rate": 4.824569551065039e-05, - "loss": 0.3137, + "epoch": 0.7498107903557142, + "grad_norm": 0.16264590620994568, + "learning_rate": 4.813842297590115e-05, + "loss": 0.4466, "step": 20805 }, { - "epoch": 0.73, - "learning_rate": 4.824464704490838e-05, - "loss": 0.3047, + "epoch": 0.7499909900169388, + "grad_norm": 0.16959542036056519, + "learning_rate": 4.813731784543229e-05, + "loss": 0.4476, "step": 20810 }, { - "epoch": 0.73, - "learning_rate": 4.824359827734997e-05, - "loss": 0.3319, + "epoch": 0.7501711896781634, + "grad_norm": 0.14073093235492706, + "learning_rate": 4.813621239972195e-05, + "loss": 0.4246, "step": 20815 }, { - "epoch": 0.73, - "learning_rate": 4.8242549207988735e-05, - "loss": 0.2975, + "epoch": 0.750351389339388, + "grad_norm": 0.21107293665409088, + "learning_rate": 4.81351066387852e-05, + "loss": 0.4279, "step": 20820 }, { - "epoch": 0.73, - "learning_rate": 4.824149983683832e-05, - "loss": 0.3049, + "epoch": 0.7505315890006127, + "grad_norm": 0.1864260733127594, + "learning_rate": 4.813400056263709e-05, + "loss": 0.4295, "step": 20825 }, { - "epoch": 0.73, - "learning_rate": 4.824045016391235e-05, - "loss": 0.2976, + "epoch": 0.7507117886618373, + "grad_norm": 0.13779747486114502, + "learning_rate": 4.813289417129272e-05, + "loss": 0.4166, "step": 20830 }, { - "epoch": 0.73, - "learning_rate": 4.823940018922445e-05, - "loss": 0.3072, + "epoch": 0.7508919883230619, + "grad_norm": 0.17728812992572784, + "learning_rate": 4.813178746476713e-05, + "loss": 0.4764, "step": 20835 }, { - "epoch": 0.73, - "learning_rate": 4.823834991278825e-05, - "loss": 0.3261, + "epoch": 0.7510721879842865, + "grad_norm": 0.1685779094696045, + "learning_rate": 4.813068044307543e-05, + "loss": 0.4457, "step": 20840 }, { - "epoch": 0.73, - "learning_rate": 4.8237299334617395e-05, - "loss": 0.3124, + "epoch": 0.7512523876455113, + "grad_norm": 0.17559683322906494, + "learning_rate": 4.812957310623267e-05, + "loss": 0.4377, "step": 20845 }, { - "epoch": 0.73, - "learning_rate": 4.823624845472552e-05, - "loss": 0.3198, + "epoch": 0.7514325873067359, + "grad_norm": 0.18856866657733917, + "learning_rate": 4.812846545425396e-05, + "loss": 0.4306, "step": 20850 }, { - "epoch": 0.73, - "learning_rate": 4.823519727312627e-05, - "loss": 0.3064, + "epoch": 0.7516127869679605, + "grad_norm": 0.1679704487323761, + "learning_rate": 4.812735748715439e-05, + "loss": 0.4218, "step": 20855 }, { - "epoch": 0.73, - "learning_rate": 4.8234145789833296e-05, - "loss": 0.2938, + "epoch": 0.7517929866291851, + "grad_norm": 0.20436346530914307, + "learning_rate": 4.812624920494905e-05, + "loss": 0.4475, "step": 20860 }, { - "epoch": 0.73, - "learning_rate": 4.8233094004860256e-05, - "loss": 0.3166, + "epoch": 0.7519731862904098, + "grad_norm": 0.16926033794879913, + "learning_rate": 4.812514060765304e-05, + "loss": 0.4159, "step": 20865 }, { - "epoch": 0.73, - "learning_rate": 4.823225235968135e-05, - "loss": 
0.3031, + "epoch": 0.7521533859516344, + "grad_norm": 0.18585766851902008, + "learning_rate": 4.8124031695281465e-05, + "loss": 0.4935, "step": 20870 }, { - "epoch": 0.73, - "learning_rate": 4.823120003171859e-05, - "loss": 0.2976, + "epoch": 0.752333585612859, + "grad_norm": 0.15015818178653717, + "learning_rate": 4.812292246784944e-05, + "loss": 0.4683, "step": 20875 }, { - "epoch": 0.73, - "learning_rate": 4.8230147402114024e-05, - "loss": 0.3039, + "epoch": 0.7525137852740836, + "grad_norm": 0.18261437118053436, + "learning_rate": 4.8121812925372074e-05, + "loss": 0.4706, "step": 20880 }, { - "epoch": 0.73, - "learning_rate": 4.822909447088129e-05, - "loss": 0.2846, + "epoch": 0.7526939849353084, + "grad_norm": 0.15235644578933716, + "learning_rate": 4.812070306786448e-05, + "loss": 0.493, "step": 20885 }, { - "epoch": 0.73, - "learning_rate": 4.822804123803407e-05, - "loss": 0.3226, + "epoch": 0.752874184596533, + "grad_norm": 0.1405205875635147, + "learning_rate": 4.811959289534178e-05, + "loss": 0.4798, "step": 20890 }, { - "epoch": 0.74, - "learning_rate": 4.8226987703586056e-05, - "loss": 0.2975, + "epoch": 0.7530543842577576, + "grad_norm": 0.16182038187980652, + "learning_rate": 4.811848240781911e-05, + "loss": 0.4287, "step": 20895 }, { - "epoch": 0.74, - "learning_rate": 4.82259338675509e-05, - "loss": 0.3286, + "epoch": 0.7532345839189822, + "grad_norm": 0.17305776476860046, + "learning_rate": 4.811737160531159e-05, + "loss": 0.4401, "step": 20900 }, { - "epoch": 0.74, - "learning_rate": 4.8224879729942307e-05, - "loss": 0.2886, + "epoch": 0.7534147835802069, + "grad_norm": 0.1699703484773636, + "learning_rate": 4.811626048783435e-05, + "loss": 0.4514, "step": 20905 }, { - "epoch": 0.74, - "learning_rate": 4.822382529077396e-05, - "loss": 0.2958, + "epoch": 0.7535949832414315, + "grad_norm": 0.1501130610704422, + "learning_rate": 4.8115149055402545e-05, + "loss": 0.4463, "step": 20910 }, { - "epoch": 0.74, - "learning_rate": 4.8222770550059537e-05, - "loss": 0.3194, + "epoch": 0.7537751829026561, + "grad_norm": 0.18883809447288513, + "learning_rate": 4.811403730803131e-05, + "loss": 0.4432, "step": 20915 }, { - "epoch": 0.74, - "learning_rate": 4.8221715507812746e-05, - "loss": 0.3349, + "epoch": 0.7539553825638807, + "grad_norm": 0.1885019838809967, + "learning_rate": 4.811292524573579e-05, + "loss": 0.4145, "step": 20920 }, { - "epoch": 0.74, - "learning_rate": 4.822066016404729e-05, - "loss": 0.2894, + "epoch": 0.7541355822251055, + "grad_norm": 0.2025119662284851, + "learning_rate": 4.811181286853113e-05, + "loss": 0.4805, "step": 20925 }, { - "epoch": 0.74, - "learning_rate": 4.821960451877686e-05, - "loss": 0.2959, + "epoch": 0.7543157818863301, + "grad_norm": 0.22509577870368958, + "learning_rate": 4.811070017643251e-05, + "loss": 0.4674, "step": 20930 }, { - "epoch": 0.74, - "learning_rate": 4.821854857201516e-05, - "loss": 0.2829, + "epoch": 0.7544959815475547, + "grad_norm": 0.18206484615802765, + "learning_rate": 4.810958716945507e-05, + "loss": 0.4384, "step": 20935 }, { - "epoch": 0.74, - "learning_rate": 4.821749232377592e-05, - "loss": 0.3081, + "epoch": 0.7546761812087793, + "grad_norm": 0.1454673856496811, + "learning_rate": 4.810847384761397e-05, + "loss": 0.4224, "step": 20940 }, { - "epoch": 0.74, - "learning_rate": 4.821643577407284e-05, - "loss": 0.3231, + "epoch": 0.754856380870004, + "grad_norm": 0.15570025146007538, + "learning_rate": 4.810736021092439e-05, + "loss": 0.471, "step": 20945 }, { - "epoch": 0.74, - "learning_rate": 4.821537892291964e-05, 
- "loss": 0.3026, + "epoch": 0.7550365805312286, + "grad_norm": 0.18441855907440186, + "learning_rate": 4.810624625940151e-05, + "loss": 0.4365, "step": 20950 }, { - "epoch": 0.74, - "learning_rate": 4.821432177033004e-05, - "loss": 0.3115, + "epoch": 0.7552167801924532, + "grad_norm": 0.20488105714321136, + "learning_rate": 4.810513199306049e-05, + "loss": 0.458, "step": 20955 }, { - "epoch": 0.74, - "learning_rate": 4.8213264316317776e-05, - "loss": 0.3078, + "epoch": 0.7553969798536779, + "grad_norm": 0.16301538050174713, + "learning_rate": 4.8104017411916526e-05, + "loss": 0.4031, "step": 20960 }, { - "epoch": 0.74, - "learning_rate": 4.821220656089657e-05, - "loss": 0.3144, + "epoch": 0.7555771795149026, + "grad_norm": 0.1542273312807083, + "learning_rate": 4.810290251598479e-05, + "loss": 0.4097, "step": 20965 }, { - "epoch": 0.74, - "learning_rate": 4.8211148504080164e-05, - "loss": 0.3014, + "epoch": 0.7557573791761272, + "grad_norm": 0.1671588271856308, + "learning_rate": 4.8101787305280485e-05, + "loss": 0.3957, "step": 20970 }, { - "epoch": 0.74, - "learning_rate": 4.821009014588229e-05, - "loss": 0.2936, + "epoch": 0.7559375788373518, + "grad_norm": 0.1824352741241455, + "learning_rate": 4.8100671779818795e-05, + "loss": 0.4619, "step": 20975 }, { - "epoch": 0.74, - "learning_rate": 4.820903148631669e-05, - "loss": 0.2923, + "epoch": 0.7561177784985764, + "grad_norm": 0.18826597929000854, + "learning_rate": 4.809955593961493e-05, + "loss": 0.4472, "step": 20980 }, { - "epoch": 0.74, - "learning_rate": 4.820797252539711e-05, - "loss": 0.3053, + "epoch": 0.756297978159801, + "grad_norm": 0.14293283224105835, + "learning_rate": 4.809843978468409e-05, + "loss": 0.4693, "step": 20985 }, { - "epoch": 0.74, - "learning_rate": 4.82069132631373e-05, - "loss": 0.3627, + "epoch": 0.7564781778210257, + "grad_norm": 0.13552211225032806, + "learning_rate": 4.809732331504148e-05, + "loss": 0.39, "step": 20990 }, { - "epoch": 0.74, - "learning_rate": 4.8205853699551016e-05, - "loss": 0.291, + "epoch": 0.7566583774822503, + "grad_norm": 0.1683761477470398, + "learning_rate": 4.8096206530702305e-05, + "loss": 0.4059, "step": 20995 }, { - "epoch": 0.74, - "learning_rate": 4.820479383465202e-05, - "loss": 0.3204, + "epoch": 0.756838577143475, + "grad_norm": 0.17060311138629913, + "learning_rate": 4.809508943168179e-05, + "loss": 0.4352, "step": 21000 }, { - "epoch": 0.74, - "eval_loss": 0.29989683628082275, - "eval_runtime": 10.5493, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 0.756838577143475, + "eval_loss": 0.462345689535141, + "eval_runtime": 3.5382, + "eval_samples_per_second": 28.263, + "eval_steps_per_second": 7.066, "step": 21000 }, { - "epoch": 0.74, - "learning_rate": 4.8203733668454066e-05, - "loss": 0.298, + "epoch": 0.7570187768046996, + "grad_norm": 0.179361492395401, + "learning_rate": 4.8093972017995155e-05, + "loss": 0.4466, "step": 21005 }, { - "epoch": 0.74, - "learning_rate": 4.8202673200970925e-05, - "loss": 0.2888, + "epoch": 0.7571989764659243, + "grad_norm": 0.19304408133029938, + "learning_rate": 4.8092854289657617e-05, + "loss": 0.4786, "step": 21010 }, { - "epoch": 0.74, - "learning_rate": 4.820161243221636e-05, - "loss": 0.2988, + "epoch": 0.7573791761271489, + "grad_norm": 0.16757160425186157, + "learning_rate": 4.809173624668442e-05, + "loss": 0.4614, "step": 21015 }, { - "epoch": 0.74, - "learning_rate": 4.820055136220415e-05, - "loss": 0.3334, + "epoch": 0.7575593757883735, + "grad_norm": 0.16519469022750854, + "learning_rate": 
4.8090617889090786e-05, + "loss": 0.4297, "step": 21020 }, { - "epoch": 0.74, - "learning_rate": 4.819948999094807e-05, - "loss": 0.3501, + "epoch": 0.7577395754495981, + "grad_norm": 0.1686154156923294, + "learning_rate": 4.808949921689194e-05, + "loss": 0.387, "step": 21025 }, { - "epoch": 0.74, - "learning_rate": 4.81984283184619e-05, - "loss": 0.3129, + "epoch": 0.7579197751108228, + "grad_norm": 0.1524842232465744, + "learning_rate": 4.8088380230103145e-05, + "loss": 0.4291, "step": 21030 }, { - "epoch": 0.74, - "learning_rate": 4.819736634475942e-05, - "loss": 0.2927, + "epoch": 0.7580999747720474, + "grad_norm": 0.18697072565555573, + "learning_rate": 4.808726092873964e-05, + "loss": 0.4262, "step": 21035 }, { - "epoch": 0.74, - "learning_rate": 4.819630406985444e-05, - "loss": 0.2925, + "epoch": 0.7582801744332721, + "grad_norm": 0.21223722398281097, + "learning_rate": 4.808614131281668e-05, + "loss": 0.4522, "step": 21040 }, { - "epoch": 0.74, - "learning_rate": 4.819524149376072e-05, - "loss": 0.3117, + "epoch": 0.7584603740944967, + "grad_norm": 0.16958850622177124, + "learning_rate": 4.808502138234951e-05, + "loss": 0.4667, "step": 21045 }, { - "epoch": 0.74, - "learning_rate": 4.8194178616492094e-05, - "loss": 0.2986, + "epoch": 0.7586405737557214, + "grad_norm": 0.1471187025308609, + "learning_rate": 4.808390113735339e-05, + "loss": 0.4847, "step": 21050 }, { - "epoch": 0.74, - "learning_rate": 4.819311543806233e-05, - "loss": 0.3166, + "epoch": 0.758820773416946, + "grad_norm": 0.156825989484787, + "learning_rate": 4.808278057784359e-05, + "loss": 0.4648, "step": 21055 }, { - "epoch": 0.74, - "learning_rate": 4.819205195848526e-05, - "loss": 0.3084, + "epoch": 0.7590009730781706, + "grad_norm": 0.15920226275920868, + "learning_rate": 4.808165970383538e-05, + "loss": 0.4215, "step": 21060 }, { - "epoch": 0.74, - "learning_rate": 4.8190988177774666e-05, - "loss": 0.3021, + "epoch": 0.7591811727393952, + "grad_norm": 0.1420614868402481, + "learning_rate": 4.808053851534401e-05, + "loss": 0.411, "step": 21065 }, { - "epoch": 0.74, - "learning_rate": 4.818992409594438e-05, - "loss": 0.289, + "epoch": 0.7593613724006198, + "grad_norm": 0.21698161959648132, + "learning_rate": 4.8079417012384786e-05, + "loss": 0.4415, "step": 21070 }, { - "epoch": 0.74, - "learning_rate": 4.8188859713008215e-05, - "loss": 0.309, + "epoch": 0.7595415720618445, + "grad_norm": 0.18251734972000122, + "learning_rate": 4.807829519497297e-05, + "loss": 0.4644, "step": 21075 }, { - "epoch": 0.74, - "learning_rate": 4.8187795028979985e-05, - "loss": 0.3262, + "epoch": 0.7597217717230692, + "grad_norm": 0.18656039237976074, + "learning_rate": 4.8077173063123843e-05, + "loss": 0.456, "step": 21080 }, { - "epoch": 0.74, - "learning_rate": 4.818673004387352e-05, - "loss": 0.3161, + "epoch": 0.7599019713842938, + "grad_norm": 0.17298544943332672, + "learning_rate": 4.807605061685271e-05, + "loss": 0.4445, "step": 21085 }, { - "epoch": 0.74, - "learning_rate": 4.8185664757702654e-05, - "loss": 0.2769, + "epoch": 0.7600821710455185, + "grad_norm": 0.1439000368118286, + "learning_rate": 4.807492785617484e-05, + "loss": 0.4311, "step": 21090 }, { - "epoch": 0.74, - "learning_rate": 4.81845991704812e-05, - "loss": 0.3011, + "epoch": 0.7602623707067431, + "grad_norm": 0.14586910605430603, + "learning_rate": 4.807380478110556e-05, + "loss": 0.4383, "step": 21095 }, { - "epoch": 0.74, - "learning_rate": 4.8183533282223e-05, - "loss": 0.3161, + "epoch": 0.7604425703679677, + "grad_norm": 0.18215352296829224, + 
"learning_rate": 4.8072681391660153e-05, + "loss": 0.4222, "step": 21100 }, { - "epoch": 0.74, - "learning_rate": 4.8182467092941915e-05, - "loss": 0.298, + "epoch": 0.7606227700291923, + "grad_norm": 0.1855933964252472, + "learning_rate": 4.807155768785393e-05, + "loss": 0.4237, "step": 21105 }, { - "epoch": 0.74, - "learning_rate": 4.818140060265176e-05, - "loss": 0.3351, + "epoch": 0.7608029696904169, + "grad_norm": 0.22114041447639465, + "learning_rate": 4.80704336697022e-05, + "loss": 0.4643, "step": 21110 }, { - "epoch": 0.74, - "learning_rate": 4.81803338113664e-05, - "loss": 0.3028, + "epoch": 0.7609831693516416, + "grad_norm": 0.1789657175540924, + "learning_rate": 4.806930933722027e-05, + "loss": 0.4599, "step": 21115 }, { - "epoch": 0.74, - "learning_rate": 4.817926671909968e-05, - "loss": 0.3052, + "epoch": 0.7611633690128663, + "grad_norm": 0.17131780087947845, + "learning_rate": 4.806818469042348e-05, + "loss": 0.4171, "step": 21120 }, { - "epoch": 0.74, - "learning_rate": 4.8178199325865457e-05, - "loss": 0.3117, + "epoch": 0.7613435686740909, + "grad_norm": 0.20241133868694305, + "learning_rate": 4.806705972932713e-05, + "loss": 0.4611, "step": 21125 }, { - "epoch": 0.74, - "learning_rate": 4.817713163167759e-05, - "loss": 0.2919, + "epoch": 0.7615237683353155, + "grad_norm": 0.19503600895404816, + "learning_rate": 4.806593445394656e-05, + "loss": 0.4813, "step": 21130 }, { - "epoch": 0.74, - "learning_rate": 4.817606363654995e-05, - "loss": 0.3248, + "epoch": 0.7617039679965402, + "grad_norm": 0.1657884567975998, + "learning_rate": 4.8064808864297094e-05, + "loss": 0.4784, "step": 21135 }, { - "epoch": 0.74, - "learning_rate": 4.81749953404964e-05, - "loss": 0.2991, + "epoch": 0.7618841676577648, + "grad_norm": 0.18670715391635895, + "learning_rate": 4.806368296039409e-05, + "loss": 0.42, "step": 21140 }, { - "epoch": 0.74, - "learning_rate": 4.8173926743530794e-05, - "loss": 0.3388, + "epoch": 0.7620643673189894, + "grad_norm": 0.16101418435573578, + "learning_rate": 4.806255674225285e-05, + "loss": 0.4258, "step": 21145 }, { - "epoch": 0.74, - "learning_rate": 4.817285784566703e-05, - "loss": 0.3201, + "epoch": 0.762244566980214, + "grad_norm": 0.16799142956733704, + "learning_rate": 4.806143020988875e-05, + "loss": 0.4105, "step": 21150 }, { - "epoch": 0.74, - "learning_rate": 4.817178864691898e-05, - "loss": 0.3244, + "epoch": 0.7624247666414388, + "grad_norm": 0.15857410430908203, + "learning_rate": 4.806030336331713e-05, + "loss": 0.4251, "step": 21155 }, { - "epoch": 0.74, - "learning_rate": 4.817071914730052e-05, - "loss": 0.3065, + "epoch": 0.7626049663026634, + "grad_norm": 0.16540922224521637, + "learning_rate": 4.8059176202553337e-05, + "loss": 0.4593, "step": 21160 }, { - "epoch": 0.74, - "learning_rate": 4.816964934682555e-05, - "loss": 0.2745, + "epoch": 0.762785165963888, + "grad_norm": 0.1826573759317398, + "learning_rate": 4.8058048727612724e-05, + "loss": 0.4513, "step": 21165 }, { - "epoch": 0.74, - "learning_rate": 4.816857924550794e-05, - "loss": 0.3042, + "epoch": 0.7629653656251126, + "grad_norm": 0.1935107260942459, + "learning_rate": 4.8056920938510675e-05, + "loss": 0.4293, "step": 21170 }, { - "epoch": 0.74, - "learning_rate": 4.8167508843361606e-05, - "loss": 0.3095, + "epoch": 0.7631455652863373, + "grad_norm": 0.18235011398792267, + "learning_rate": 4.8055792835262536e-05, + "loss": 0.4956, "step": 21175 }, { - "epoch": 0.75, - "learning_rate": 4.816643814040044e-05, - "loss": 0.289, + "epoch": 0.7633257649475619, + "grad_norm": 
0.168792262673378, + "learning_rate": 4.8054664417883685e-05, + "loss": 0.4623, "step": 21180 }, { - "epoch": 0.75, - "learning_rate": 4.8165367136638335e-05, - "loss": 0.3259, + "epoch": 0.7635059646087865, + "grad_norm": 0.17810115218162537, + "learning_rate": 4.8053535686389495e-05, + "loss": 0.4265, "step": 21185 }, { - "epoch": 0.75, - "learning_rate": 4.81642958320892e-05, - "loss": 0.313, + "epoch": 0.7636861642700111, + "grad_norm": 0.20032259821891785, + "learning_rate": 4.805240664079534e-05, + "loss": 0.4372, "step": 21190 }, { - "epoch": 0.75, - "learning_rate": 4.8163224226766955e-05, - "loss": 0.2789, + "epoch": 0.7638663639312359, + "grad_norm": 0.147112175822258, + "learning_rate": 4.805127728111662e-05, + "loss": 0.4288, "step": 21195 }, { - "epoch": 0.75, - "learning_rate": 4.8162152320685515e-05, - "loss": 0.3211, + "epoch": 0.7640465635924605, + "grad_norm": 0.1480092704296112, + "learning_rate": 4.80501476073687e-05, + "loss": 0.4137, "step": 21200 }, { - "epoch": 0.75, - "learning_rate": 4.8161080113858786e-05, - "loss": 0.3257, + "epoch": 0.7642267632536851, + "grad_norm": 0.17761540412902832, + "learning_rate": 4.8049017619566986e-05, + "loss": 0.4727, "step": 21205 }, { - "epoch": 0.75, - "learning_rate": 4.8160007606300694e-05, - "loss": 0.3075, + "epoch": 0.7644069629149097, + "grad_norm": 0.14854562282562256, + "learning_rate": 4.8047887317726865e-05, + "loss": 0.4549, "step": 21210 }, { - "epoch": 0.75, - "learning_rate": 4.815893479802517e-05, - "loss": 0.3072, + "epoch": 0.7645871625761343, + "grad_norm": 0.1848391890525818, + "learning_rate": 4.804675670186374e-05, + "loss": 0.4468, "step": 21215 }, { - "epoch": 0.75, - "learning_rate": 4.815786168904613e-05, - "loss": 0.3119, + "epoch": 0.764767362237359, + "grad_norm": 0.18001249432563782, + "learning_rate": 4.804562577199302e-05, + "loss": 0.4149, "step": 21220 }, { - "epoch": 0.75, - "learning_rate": 4.8156788279377534e-05, - "loss": 0.3135, + "epoch": 0.7649475618985836, + "grad_norm": 0.16468872129917145, + "learning_rate": 4.804449452813011e-05, + "loss": 0.4224, "step": 21225 }, { - "epoch": 0.75, - "learning_rate": 4.8155714569033294e-05, - "loss": 0.2867, + "epoch": 0.7651277615598082, + "grad_norm": 0.17478923499584198, + "learning_rate": 4.804336297029043e-05, + "loss": 0.4528, "step": 21230 }, { - "epoch": 0.75, - "learning_rate": 4.815464055802736e-05, - "loss": 0.2794, + "epoch": 0.765307961221033, + "grad_norm": 0.15894725918769836, + "learning_rate": 4.804223109848939e-05, + "loss": 0.4523, "step": 21235 }, { - "epoch": 0.75, - "learning_rate": 4.815356624637368e-05, - "loss": 0.3002, + "epoch": 0.7654881608822576, + "grad_norm": 0.14751605689525604, + "learning_rate": 4.80410989127424e-05, + "loss": 0.4273, "step": 21240 }, { - "epoch": 0.75, - "learning_rate": 4.815249163408621e-05, - "loss": 0.3174, + "epoch": 0.7656683605434822, + "grad_norm": 0.2025453895330429, + "learning_rate": 4.803996641306491e-05, + "loss": 0.44, "step": 21245 }, { - "epoch": 0.75, - "learning_rate": 4.815141672117888e-05, - "loss": 0.2666, + "epoch": 0.7658485602047068, + "grad_norm": 0.14769452810287476, + "learning_rate": 4.803883359947233e-05, + "loss": 0.4416, "step": 21250 }, { - "epoch": 0.75, - "learning_rate": 4.8150341507665664e-05, - "loss": 0.3275, + "epoch": 0.7660287598659314, + "grad_norm": 0.15946455299854279, + "learning_rate": 4.803770047198011e-05, + "loss": 0.4366, "step": 21255 }, { - "epoch": 0.75, - "learning_rate": 4.814926599356052e-05, - "loss": 0.2986, + "epoch": 0.7662089595271561, + 
"grad_norm": 0.1485140323638916, + "learning_rate": 4.8036567030603676e-05, + "loss": 0.4624, "step": 21260 }, { - "epoch": 0.75, - "learning_rate": 4.8148190178877414e-05, - "loss": 0.3171, + "epoch": 0.7663891591883807, + "grad_norm": 0.15414132177829742, + "learning_rate": 4.803543327535848e-05, + "loss": 0.4321, "step": 21265 }, { - "epoch": 0.75, - "learning_rate": 4.814711406363032e-05, - "loss": 0.3302, + "epoch": 0.7665693588496053, + "grad_norm": 0.17065320909023285, + "learning_rate": 4.803429920625996e-05, + "loss": 0.4396, "step": 21270 }, { - "epoch": 0.75, - "learning_rate": 4.81460376478332e-05, - "loss": 0.3228, + "epoch": 0.76674955851083, + "grad_norm": 0.15544413030147552, + "learning_rate": 4.803316482332358e-05, + "loss": 0.4714, "step": 21275 }, { - "epoch": 0.75, - "learning_rate": 4.8144960931500026e-05, - "loss": 0.3102, + "epoch": 0.7669297581720547, + "grad_norm": 0.22066541016101837, + "learning_rate": 4.803203012656479e-05, + "loss": 0.4651, "step": 21280 }, { - "epoch": 0.75, - "learning_rate": 4.8143883914644796e-05, - "loss": 0.2783, + "epoch": 0.7671099578332793, + "grad_norm": 0.21565388143062592, + "learning_rate": 4.803089511599904e-05, + "loss": 0.4593, "step": 21285 }, { - "epoch": 0.75, - "learning_rate": 4.814280659728149e-05, - "loss": 0.2945, + "epoch": 0.7672901574945039, + "grad_norm": 0.15955856442451477, + "learning_rate": 4.8029759791641804e-05, + "loss": 0.4406, "step": 21290 }, { - "epoch": 0.75, - "learning_rate": 4.8141728979424085e-05, - "loss": 0.2845, + "epoch": 0.7674703571557285, + "grad_norm": 0.19206349551677704, + "learning_rate": 4.8028624153508555e-05, + "loss": 0.4127, "step": 21295 }, { - "epoch": 0.75, - "learning_rate": 4.8140651061086584e-05, - "loss": 0.309, + "epoch": 0.7676505568169532, + "grad_norm": 0.15551474690437317, + "learning_rate": 4.8027488201614754e-05, + "loss": 0.4212, "step": 21300 }, { - "epoch": 0.75, - "learning_rate": 4.8139572842282975e-05, - "loss": 0.3261, + "epoch": 0.7678307564781778, + "grad_norm": 0.16938930749893188, + "learning_rate": 4.802635193597589e-05, + "loss": 0.4515, "step": 21305 }, { - "epoch": 0.75, - "learning_rate": 4.813849432302726e-05, - "loss": 0.3056, + "epoch": 0.7680109561394025, + "grad_norm": 0.16218577325344086, + "learning_rate": 4.802521535660744e-05, + "loss": 0.4228, "step": 21310 }, { - "epoch": 0.75, - "learning_rate": 4.813741550333345e-05, - "loss": 0.2973, + "epoch": 0.7681911558006271, + "grad_norm": 0.14813528954982758, + "learning_rate": 4.802407846352488e-05, + "loss": 0.4586, "step": 21315 }, { - "epoch": 0.75, - "learning_rate": 4.8136336383215554e-05, - "loss": 0.3224, + "epoch": 0.7683713554618518, + "grad_norm": 0.17613081634044647, + "learning_rate": 4.802294125674372e-05, + "loss": 0.4574, "step": 21320 }, { - "epoch": 0.75, - "learning_rate": 4.813525696268757e-05, - "loss": 0.2816, + "epoch": 0.7685515551230764, + "grad_norm": 0.1591581255197525, + "learning_rate": 4.8021803736279435e-05, + "loss": 0.4451, "step": 21325 }, { - "epoch": 0.75, - "learning_rate": 4.813417724176352e-05, - "loss": 0.291, + "epoch": 0.768731754784301, + "grad_norm": 0.16665023565292358, + "learning_rate": 4.8020665902147535e-05, + "loss": 0.4057, "step": 21330 }, { - "epoch": 0.75, - "learning_rate": 4.813309722045742e-05, - "loss": 0.3231, + "epoch": 0.7689119544455256, + "grad_norm": 0.17413721978664398, + "learning_rate": 4.801952775436352e-05, + "loss": 0.4655, "step": 21335 }, { - "epoch": 0.75, - "learning_rate": 4.813201689878331e-05, - "loss": 0.2945, + "epoch": 
0.7690921541067502, + "grad_norm": 0.1840430349111557, + "learning_rate": 4.8018389292942886e-05, + "loss": 0.4568, "step": 21340 }, { - "epoch": 0.75, - "learning_rate": 4.8130936276755204e-05, - "loss": 0.3041, + "epoch": 0.7692723537679749, + "grad_norm": 0.158147931098938, + "learning_rate": 4.801725051790117e-05, + "loss": 0.4459, "step": 21345 }, { - "epoch": 0.75, - "learning_rate": 4.8129855354387125e-05, - "loss": 0.2851, + "epoch": 0.7694525534291996, + "grad_norm": 0.19126906991004944, + "learning_rate": 4.801611142925386e-05, + "loss": 0.4429, "step": 21350 }, { - "epoch": 0.75, - "learning_rate": 4.812877413169312e-05, - "loss": 0.3145, + "epoch": 0.7696327530904242, + "grad_norm": 0.22057095170021057, + "learning_rate": 4.801497202701649e-05, + "loss": 0.4369, "step": 21355 }, { - "epoch": 0.75, - "learning_rate": 4.812769260868723e-05, - "loss": 0.3013, + "epoch": 0.7698129527516488, + "grad_norm": 0.16785407066345215, + "learning_rate": 4.8013832311204586e-05, + "loss": 0.4594, "step": 21360 }, { - "epoch": 0.75, - "learning_rate": 4.812661078538349e-05, - "loss": 0.2727, + "epoch": 0.7699931524128735, + "grad_norm": 0.17427514493465424, + "learning_rate": 4.801269228183367e-05, + "loss": 0.4408, "step": 21365 }, { - "epoch": 0.75, - "learning_rate": 4.812552866179595e-05, - "loss": 0.3139, + "epoch": 0.7701733520740981, + "grad_norm": 0.14857777953147888, + "learning_rate": 4.8011551938919283e-05, + "loss": 0.4515, "step": 21370 }, { - "epoch": 0.75, - "learning_rate": 4.8124446237938665e-05, - "loss": 0.3225, + "epoch": 0.7703535517353227, + "grad_norm": 0.19615520536899567, + "learning_rate": 4.801041128247695e-05, + "loss": 0.4395, "step": 21375 }, { - "epoch": 0.75, - "learning_rate": 4.812336351382568e-05, - "loss": 0.3011, + "epoch": 0.7705337513965473, + "grad_norm": 0.46812736988067627, + "learning_rate": 4.800927031252222e-05, + "loss": 0.3949, "step": 21380 }, { - "epoch": 0.75, - "learning_rate": 4.8122280489471064e-05, - "loss": 0.3018, + "epoch": 0.770713951057772, + "grad_norm": 0.18027018010616302, + "learning_rate": 4.800812902907063e-05, + "loss": 0.446, "step": 21385 }, { - "epoch": 0.75, - "learning_rate": 4.812119716488886e-05, - "loss": 0.2842, + "epoch": 0.7708941507189967, + "grad_norm": 0.1679840236902237, + "learning_rate": 4.800698743213774e-05, + "loss": 0.4554, "step": 21390 }, { - "epoch": 0.75, - "learning_rate": 4.812011354009317e-05, - "loss": 0.2962, + "epoch": 0.7710743503802213, + "grad_norm": 0.19741559028625488, + "learning_rate": 4.80058455217391e-05, + "loss": 0.4695, "step": 21395 }, { - "epoch": 0.75, - "learning_rate": 4.8119029615098024e-05, - "loss": 0.3168, + "epoch": 0.7712545500414459, + "grad_norm": 0.19206707179546356, + "learning_rate": 4.800470329789027e-05, + "loss": 0.4572, "step": 21400 }, { - "epoch": 0.75, - "learning_rate": 4.811794538991752e-05, - "loss": 0.2889, + "epoch": 0.7714347497026706, + "grad_norm": 0.17079663276672363, + "learning_rate": 4.800356076060682e-05, + "loss": 0.4186, "step": 21405 }, { - "epoch": 0.75, - "learning_rate": 4.811686086456574e-05, - "loss": 0.3059, + "epoch": 0.7716149493638952, + "grad_norm": 0.20327655971050262, + "learning_rate": 4.800241790990429e-05, + "loss": 0.4195, "step": 21410 }, { - "epoch": 0.75, - "learning_rate": 4.811577603905674e-05, - "loss": 0.2992, + "epoch": 0.7717951490251198, + "grad_norm": 0.1791381984949112, + "learning_rate": 4.8001274745798286e-05, + "loss": 0.457, "step": 21415 }, { - "epoch": 0.75, - "learning_rate": 4.8114690913404634e-05, - "loss": 
0.3163, + "epoch": 0.7719753486863444, + "grad_norm": 0.1447453647851944, + "learning_rate": 4.800013126830437e-05, + "loss": 0.4155, "step": 21420 }, { - "epoch": 0.75, - "learning_rate": 4.811360548762349e-05, - "loss": 0.2916, + "epoch": 0.772155548347569, + "grad_norm": 0.17693381011486053, + "learning_rate": 4.799898747743811e-05, + "loss": 0.3966, "step": 21425 }, { - "epoch": 0.75, - "learning_rate": 4.811251976172742e-05, - "loss": 0.2905, + "epoch": 0.7723357480087938, + "grad_norm": 0.2063083052635193, + "learning_rate": 4.799784337321509e-05, + "loss": 0.4712, "step": 21430 }, { - "epoch": 0.75, - "learning_rate": 4.811143373573051e-05, - "loss": 0.313, + "epoch": 0.7725159476700184, + "grad_norm": 0.217716246843338, + "learning_rate": 4.799669895565092e-05, + "loss": 0.4261, "step": 21435 }, { - "epoch": 0.75, - "learning_rate": 4.811034740964686e-05, - "loss": 0.3108, + "epoch": 0.772696147331243, + "grad_norm": 0.13754284381866455, + "learning_rate": 4.799555422476117e-05, + "loss": 0.4144, "step": 21440 }, { - "epoch": 0.75, - "learning_rate": 4.810926078349058e-05, - "loss": 0.3084, + "epoch": 0.7728763469924677, + "grad_norm": 0.1717095524072647, + "learning_rate": 4.799440918056145e-05, + "loss": 0.4505, "step": 21445 }, { - "epoch": 0.75, - "learning_rate": 4.8108173857275785e-05, - "loss": 0.3066, + "epoch": 0.7730565466536923, + "grad_norm": 0.16796359419822693, + "learning_rate": 4.7993263823067355e-05, + "loss": 0.4523, "step": 21450 }, { - "epoch": 0.75, - "learning_rate": 4.810708663101658e-05, - "loss": 0.3084, + "epoch": 0.7732367463149169, + "grad_norm": 0.1903134435415268, + "learning_rate": 4.79921181522945e-05, + "loss": 0.4197, "step": 21455 }, { - "epoch": 0.76, - "learning_rate": 4.8105999104727086e-05, - "loss": 0.2944, + "epoch": 0.7734169459761415, + "grad_norm": 0.1834080070257187, + "learning_rate": 4.799097216825847e-05, + "loss": 0.4232, "step": 21460 }, { - "epoch": 0.76, - "learning_rate": 4.810491127842141e-05, - "loss": 0.317, + "epoch": 0.7735971456373663, + "grad_norm": 0.17639869451522827, + "learning_rate": 4.7989825870974904e-05, + "loss": 0.4085, "step": 21465 }, { - "epoch": 0.76, - "learning_rate": 4.8103823152113695e-05, - "loss": 0.3204, + "epoch": 0.7737773452985909, + "grad_norm": 0.18790148198604584, + "learning_rate": 4.798867926045941e-05, + "loss": 0.4522, "step": 21470 }, { - "epoch": 0.76, - "learning_rate": 4.8102734725818056e-05, - "loss": 0.3106, + "epoch": 0.7739575449598155, + "grad_norm": 0.17099972069263458, + "learning_rate": 4.798753233672762e-05, + "loss": 0.4409, "step": 21475 }, { - "epoch": 0.76, - "learning_rate": 4.810164599954864e-05, - "loss": 0.3186, + "epoch": 0.7741377446210401, + "grad_norm": 0.15639808773994446, + "learning_rate": 4.798638509979514e-05, + "loss": 0.4576, "step": 21480 }, { - "epoch": 0.76, - "learning_rate": 4.810055697331957e-05, - "loss": 0.2905, + "epoch": 0.7743179442822647, + "grad_norm": 0.19810642302036285, + "learning_rate": 4.7985237549677624e-05, + "loss": 0.4336, "step": 21485 }, { - "epoch": 0.76, - "learning_rate": 4.809946764714499e-05, - "loss": 0.3014, + "epoch": 0.7744981439434894, + "grad_norm": 0.192165344953537, + "learning_rate": 4.79840896863907e-05, + "loss": 0.4481, "step": 21490 }, { - "epoch": 0.76, - "learning_rate": 4.809837802103905e-05, - "loss": 0.2906, + "epoch": 0.774678343604714, + "grad_norm": 0.17463374137878418, + "learning_rate": 4.798294150994999e-05, + "loss": 0.4444, "step": 21495 }, { - "epoch": 0.76, - "learning_rate": 4.80972880950159e-05, - 
"loss": 0.2881, + "epoch": 0.7748585432659386, + "grad_norm": 0.1958521455526352, + "learning_rate": 4.798179302037116e-05, + "loss": 0.4435, "step": 21500 }, { - "epoch": 0.76, - "eval_loss": 0.30054381489753723, - "eval_runtime": 10.5336, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 0.7748585432659386, + "eval_loss": 0.4604308009147644, + "eval_runtime": 3.5333, + "eval_samples_per_second": 28.302, + "eval_steps_per_second": 7.075, "step": 21500 }, { - "epoch": 0.76, - "learning_rate": 4.809619786908968e-05, - "loss": 0.2831, + "epoch": 0.7750387429271633, + "grad_norm": 0.18340334296226501, + "learning_rate": 4.798064421766985e-05, + "loss": 0.4139, "step": 21505 }, { - "epoch": 0.76, - "learning_rate": 4.809510734327455e-05, - "loss": 0.3034, + "epoch": 0.775218942588388, + "grad_norm": 0.18603767454624176, + "learning_rate": 4.7979495101861705e-05, + "loss": 0.418, "step": 21510 }, { - "epoch": 0.76, - "learning_rate": 4.809401651758466e-05, - "loss": 0.3227, + "epoch": 0.7753991422496126, + "grad_norm": 0.21015454828739166, + "learning_rate": 4.7978345672962395e-05, + "loss": 0.4884, "step": 21515 }, { - "epoch": 0.76, - "learning_rate": 4.809292539203421e-05, - "loss": 0.3268, + "epoch": 0.7755793419108372, + "grad_norm": 0.16696617007255554, + "learning_rate": 4.797719593098757e-05, + "loss": 0.4452, "step": 21520 }, { - "epoch": 0.76, - "learning_rate": 4.809183396663732e-05, - "loss": 0.3147, + "epoch": 0.7757595415720618, + "grad_norm": 0.19136182963848114, + "learning_rate": 4.79760458759529e-05, + "loss": 0.449, "step": 21525 }, { - "epoch": 0.76, - "learning_rate": 4.809074224140819e-05, - "loss": 0.2981, + "epoch": 0.7759397412332865, + "grad_norm": 0.14051967859268188, + "learning_rate": 4.797489550787405e-05, + "loss": 0.4396, "step": 21530 }, { - "epoch": 0.76, - "learning_rate": 4.808965021636099e-05, - "loss": 0.2945, + "epoch": 0.7761199408945111, + "grad_norm": 0.15521185100078583, + "learning_rate": 4.7973744826766706e-05, + "loss": 0.479, "step": 21535 }, { - "epoch": 0.76, - "learning_rate": 4.808855789150991e-05, - "loss": 0.3062, + "epoch": 0.7763001405557357, + "grad_norm": 0.1503034085035324, + "learning_rate": 4.797259383264653e-05, + "loss": 0.4187, "step": 21540 }, { - "epoch": 0.76, - "learning_rate": 4.8087465266869115e-05, - "loss": 0.3048, + "epoch": 0.7764803402169604, + "grad_norm": 0.18658557534217834, + "learning_rate": 4.7971442525529206e-05, + "loss": 0.4604, "step": 21545 }, { - "epoch": 0.76, - "learning_rate": 4.808637234245279e-05, - "loss": 0.3082, + "epoch": 0.7766605398781851, + "grad_norm": 0.19798524677753448, + "learning_rate": 4.797029090543044e-05, + "loss": 0.447, "step": 21550 }, { - "epoch": 0.76, - "learning_rate": 4.808527911827514e-05, - "loss": 0.2964, + "epoch": 0.7768407395394097, + "grad_norm": 0.21234340965747833, + "learning_rate": 4.796913897236589e-05, + "loss": 0.4392, "step": 21555 }, { - "epoch": 0.76, - "learning_rate": 4.808418559435036e-05, - "loss": 0.2923, + "epoch": 0.7770209392006343, + "grad_norm": 0.16527117788791656, + "learning_rate": 4.796798672635128e-05, + "loss": 0.4516, "step": 21560 }, { - "epoch": 0.76, - "learning_rate": 4.8083091770692635e-05, - "loss": 0.3136, + "epoch": 0.7772011388618589, + "grad_norm": 0.20785808563232422, + "learning_rate": 4.7966834167402295e-05, + "loss": 0.4954, "step": 21565 }, { - "epoch": 0.76, - "learning_rate": 4.808199764731618e-05, - "loss": 0.3084, + "epoch": 0.7773813385230836, + "grad_norm": 0.15618237853050232, + "learning_rate": 
4.7965681295534635e-05, + "loss": 0.399, "step": 21570 }, { - "epoch": 0.76, - "learning_rate": 4.80809032242352e-05, - "loss": 0.3067, + "epoch": 0.7775615381843082, + "grad_norm": 0.1682867556810379, + "learning_rate": 4.7964528110764026e-05, + "loss": 0.4513, "step": 21575 }, { - "epoch": 0.76, - "learning_rate": 4.8079808501463896e-05, - "loss": 0.2922, + "epoch": 0.7777417378455328, + "grad_norm": 0.1370386779308319, + "learning_rate": 4.796337461310616e-05, + "loss": 0.4479, "step": 21580 }, { - "epoch": 0.76, - "learning_rate": 4.807871347901649e-05, - "loss": 0.2992, + "epoch": 0.7779219375067575, + "grad_norm": 0.41439640522003174, + "learning_rate": 4.796222080257676e-05, + "loss": 0.4443, "step": 21585 }, { - "epoch": 0.76, - "learning_rate": 4.807761815690721e-05, - "loss": 0.3009, + "epoch": 0.7781021371679822, + "grad_norm": 0.1332816630601883, + "learning_rate": 4.7961066679191544e-05, + "loss": 0.429, "step": 21590 }, { - "epoch": 0.76, - "learning_rate": 4.807652253515025e-05, - "loss": 0.2782, + "epoch": 0.7782823368292068, + "grad_norm": 0.20065148174762726, + "learning_rate": 4.7959912242966245e-05, + "loss": 0.4144, "step": 21595 }, { - "epoch": 0.76, - "learning_rate": 4.8075426613759865e-05, - "loss": 0.2967, + "epoch": 0.7784625364904314, + "grad_norm": 0.18139199912548065, + "learning_rate": 4.795875749391659e-05, + "loss": 0.4245, "step": 21600 }, { - "epoch": 0.76, - "learning_rate": 4.807433039275027e-05, - "loss": 0.3211, + "epoch": 0.778642736151656, + "grad_norm": 0.1368846297264099, + "learning_rate": 4.79576024320583e-05, + "loss": 0.4475, "step": 21605 }, { - "epoch": 0.76, - "learning_rate": 4.80732338721357e-05, - "loss": 0.3165, + "epoch": 0.7788229358128806, + "grad_norm": 0.21975240111351013, + "learning_rate": 4.7956447057407125e-05, + "loss": 0.4563, "step": 21610 }, { - "epoch": 0.76, - "learning_rate": 4.807213705193039e-05, - "loss": 0.3044, + "epoch": 0.7790031354741053, + "grad_norm": 0.16079792380332947, + "learning_rate": 4.795529136997881e-05, + "loss": 0.4176, "step": 21615 }, { - "epoch": 0.76, - "learning_rate": 4.8071039932148585e-05, - "loss": 0.3058, + "epoch": 0.7791833351353299, + "grad_norm": 0.16716989874839783, + "learning_rate": 4.795413536978909e-05, + "loss": 0.4083, "step": 21620 }, { - "epoch": 0.76, - "learning_rate": 4.806994251280454e-05, - "loss": 0.3035, + "epoch": 0.7793635347965546, + "grad_norm": 0.21081286668777466, + "learning_rate": 4.795297905685372e-05, + "loss": 0.4439, "step": 21625 }, { - "epoch": 0.76, - "learning_rate": 4.806884479391249e-05, - "loss": 0.3142, + "epoch": 0.7795437344577792, + "grad_norm": 0.17637106776237488, + "learning_rate": 4.7951822431188455e-05, + "loss": 0.4057, "step": 21630 }, { - "epoch": 0.76, - "learning_rate": 4.8067746775486696e-05, - "loss": 0.3171, + "epoch": 0.7797239341190039, + "grad_norm": 0.1675841063261032, + "learning_rate": 4.7950665492809047e-05, + "loss": 0.467, "step": 21635 }, { - "epoch": 0.76, - "learning_rate": 4.806664845754141e-05, - "loss": 0.3367, + "epoch": 0.7799041337802285, + "grad_norm": 0.1935756951570511, + "learning_rate": 4.794950824173127e-05, + "loss": 0.3958, "step": 21640 }, { - "epoch": 0.76, - "learning_rate": 4.806554984009089e-05, - "loss": 0.3105, + "epoch": 0.7800843334414531, + "grad_norm": 0.18767641484737396, + "learning_rate": 4.794835067797089e-05, + "loss": 0.4528, "step": 21645 }, { - "epoch": 0.76, - "learning_rate": 4.806445092314941e-05, - "loss": 0.3224, + "epoch": 0.7802645331026777, + "grad_norm": 0.1749449372291565, + 
"learning_rate": 4.794719280154367e-05, + "loss": 0.4452, "step": 21650 }, { - "epoch": 0.76, - "learning_rate": 4.806335170673124e-05, - "loss": 0.2948, + "epoch": 0.7804447327639024, + "grad_norm": 0.13821455836296082, + "learning_rate": 4.794603461246539e-05, + "loss": 0.4487, "step": 21655 }, { - "epoch": 0.76, - "learning_rate": 4.806225219085064e-05, - "loss": 0.2976, + "epoch": 0.7806249324251271, + "grad_norm": 0.15566319227218628, + "learning_rate": 4.794487611075184e-05, + "loss": 0.4393, "step": 21660 }, { - "epoch": 0.76, - "learning_rate": 4.80611523755219e-05, - "loss": 0.3046, + "epoch": 0.7808051320863517, + "grad_norm": 0.17537739872932434, + "learning_rate": 4.794371729641878e-05, + "loss": 0.4475, "step": 21665 }, { - "epoch": 0.76, - "learning_rate": 4.806005226075929e-05, - "loss": 0.3117, + "epoch": 0.7809853317475763, + "grad_norm": 0.1799578070640564, + "learning_rate": 4.794255816948202e-05, + "loss": 0.4405, "step": 21670 }, { - "epoch": 0.76, - "learning_rate": 4.80589518465771e-05, - "loss": 0.3219, + "epoch": 0.781165531408801, + "grad_norm": 0.1784065216779709, + "learning_rate": 4.794139872995736e-05, + "loss": 0.4588, "step": 21675 }, { - "epoch": 0.76, - "learning_rate": 4.805785113298962e-05, - "loss": 0.3039, + "epoch": 0.7813457310700256, + "grad_norm": 0.18928910791873932, + "learning_rate": 4.7940238977860563e-05, + "loss": 0.4349, "step": 21680 }, { - "epoch": 0.76, - "learning_rate": 4.805675012001113e-05, - "loss": 0.3166, + "epoch": 0.7815259307312502, + "grad_norm": 0.15001779794692993, + "learning_rate": 4.793907891320746e-05, + "loss": 0.3882, "step": 21685 }, { - "epoch": 0.76, - "learning_rate": 4.805564880765594e-05, - "loss": 0.299, + "epoch": 0.7817061303924748, + "grad_norm": 0.21316440403461456, + "learning_rate": 4.793791853601385e-05, + "loss": 0.4141, "step": 21690 }, { - "epoch": 0.76, - "learning_rate": 4.8054547195938345e-05, - "loss": 0.3175, + "epoch": 0.7818863300536995, + "grad_norm": 0.1485886126756668, + "learning_rate": 4.793675784629554e-05, + "loss": 0.444, "step": 21695 }, { - "epoch": 0.76, - "learning_rate": 4.805344528487266e-05, - "loss": 0.2838, + "epoch": 0.7820665297149242, + "grad_norm": 0.17598822712898254, + "learning_rate": 4.7935596844068343e-05, + "loss": 0.4348, "step": 21700 }, { - "epoch": 0.76, - "learning_rate": 4.805234307447316e-05, - "loss": 0.2707, + "epoch": 0.7822467293761488, + "grad_norm": 0.1468224823474884, + "learning_rate": 4.793443552934808e-05, + "loss": 0.4099, "step": 21705 }, { - "epoch": 0.76, - "learning_rate": 4.8051240564754184e-05, - "loss": 0.3001, + "epoch": 0.7824269290373734, + "grad_norm": 0.21252159774303436, + "learning_rate": 4.793327390215058e-05, + "loss": 0.4587, "step": 21710 }, { - "epoch": 0.76, - "learning_rate": 4.805013775573004e-05, - "loss": 0.3248, + "epoch": 0.782607128698598, + "grad_norm": 0.1317516267299652, + "learning_rate": 4.7932111962491654e-05, + "loss": 0.4466, "step": 21715 }, { - "epoch": 0.76, - "learning_rate": 4.804903464741506e-05, - "loss": 0.2908, + "epoch": 0.7827873283598227, + "grad_norm": 0.17863036692142487, + "learning_rate": 4.7930949710387145e-05, + "loss": 0.436, "step": 21720 }, { - "epoch": 0.76, - "learning_rate": 4.804793123982354e-05, - "loss": 0.3068, + "epoch": 0.7829675280210473, + "grad_norm": 0.1505396068096161, + "learning_rate": 4.792978714585289e-05, + "loss": 0.4427, "step": 21725 }, { - "epoch": 0.76, - "learning_rate": 4.804682753296983e-05, - "loss": 0.3034, + "epoch": 0.7831477276822719, + "grad_norm": 
0.17207299172878265, + "learning_rate": 4.7928624268904724e-05, + "loss": 0.3839, "step": 21730 }, { - "epoch": 0.76, - "learning_rate": 4.8045723526868245e-05, - "loss": 0.3117, + "epoch": 0.7833279273434965, + "grad_norm": 0.1621370017528534, + "learning_rate": 4.7927461079558476e-05, + "loss": 0.4446, "step": 21735 }, { - "epoch": 0.76, - "learning_rate": 4.8044619221533135e-05, - "loss": 0.2958, + "epoch": 0.7835081270047213, + "grad_norm": 0.1606713980436325, + "learning_rate": 4.792629757783003e-05, + "loss": 0.4394, "step": 21740 }, { - "epoch": 0.77, - "learning_rate": 4.804351461697883e-05, - "loss": 0.3404, + "epoch": 0.7836883266659459, + "grad_norm": 0.15808479487895966, + "learning_rate": 4.792513376373521e-05, + "loss": 0.4333, "step": 21745 }, { - "epoch": 0.77, - "learning_rate": 4.804240971321967e-05, - "loss": 0.2699, + "epoch": 0.7838685263271705, + "grad_norm": 0.19757013022899628, + "learning_rate": 4.7923969637289875e-05, + "loss": 0.4498, "step": 21750 }, { - "epoch": 0.77, - "learning_rate": 4.8041304510270016e-05, - "loss": 0.3103, + "epoch": 0.7840487259883951, + "grad_norm": 0.1744808852672577, + "learning_rate": 4.7922805198509905e-05, + "loss": 0.4234, "step": 21755 }, { - "epoch": 0.77, - "learning_rate": 4.804019900814419e-05, - "loss": 0.2829, + "epoch": 0.7842289256496198, + "grad_norm": 0.14034539461135864, + "learning_rate": 4.7921640447411146e-05, + "loss": 0.428, "step": 21760 }, { - "epoch": 0.77, - "learning_rate": 4.803909320685658e-05, - "loss": 0.3489, + "epoch": 0.7844091253108444, + "grad_norm": 0.12732501327991486, + "learning_rate": 4.792047538400947e-05, + "loss": 0.415, "step": 21765 }, { - "epoch": 0.77, - "learning_rate": 4.8037987106421525e-05, - "loss": 0.3017, + "epoch": 0.784589324972069, + "grad_norm": 0.13818371295928955, + "learning_rate": 4.791931000832076e-05, + "loss": 0.4057, "step": 21770 }, { - "epoch": 0.77, - "learning_rate": 4.8036880706853385e-05, - "loss": 0.3193, + "epoch": 0.7847695246332936, + "grad_norm": 0.2030201405286789, + "learning_rate": 4.791814432036088e-05, + "loss": 0.4127, "step": 21775 }, { - "epoch": 0.77, - "learning_rate": 4.803577400816654e-05, - "loss": 0.2851, + "epoch": 0.7849497242945184, + "grad_norm": 0.13716308772563934, + "learning_rate": 4.791697832014573e-05, + "loss": 0.4666, "step": 21780 }, { - "epoch": 0.77, - "learning_rate": 4.803466701037534e-05, - "loss": 0.3184, + "epoch": 0.785129923955743, + "grad_norm": 0.21387408673763275, + "learning_rate": 4.791581200769118e-05, + "loss": 0.4192, "step": 21785 }, { - "epoch": 0.77, - "learning_rate": 4.803355971349417e-05, - "loss": 0.3362, + "epoch": 0.7853101236169676, + "grad_norm": 0.21067731082439423, + "learning_rate": 4.7914645383013134e-05, + "loss": 0.4534, "step": 21790 }, { - "epoch": 0.77, - "learning_rate": 4.803245211753741e-05, - "loss": 0.2961, + "epoch": 0.7854903232781922, + "grad_norm": 0.18593548238277435, + "learning_rate": 4.791347844612748e-05, + "loss": 0.4512, "step": 21795 }, { - "epoch": 0.77, - "learning_rate": 4.8031344222519434e-05, - "loss": 0.2965, + "epoch": 0.7856705229394169, + "grad_norm": 0.18970215320587158, + "learning_rate": 4.7912311197050115e-05, + "loss": 0.4188, "step": 21800 }, { - "epoch": 0.77, - "learning_rate": 4.803023602845465e-05, - "loss": 0.331, + "epoch": 0.7858507226006415, + "grad_norm": 0.15977688133716583, + "learning_rate": 4.791114363579695e-05, + "loss": 0.4136, "step": 21805 }, { - "epoch": 0.77, - "learning_rate": 4.802912753535741e-05, - "loss": 0.3147, + "epoch": 
0.7860309222618661, + "grad_norm": 0.17266367375850677, + "learning_rate": 4.790997576238389e-05, + "loss": 0.4518, "step": 21810 }, { - "epoch": 0.77, - "learning_rate": 4.802801874324213e-05, - "loss": 0.3112, + "epoch": 0.7862111219230908, + "grad_norm": 0.20080499351024628, + "learning_rate": 4.790880757682684e-05, + "loss": 0.427, "step": 21815 }, { - "epoch": 0.77, - "learning_rate": 4.8026909652123196e-05, - "loss": 0.316, + "epoch": 0.7863913215843155, + "grad_norm": 0.18386758863925934, + "learning_rate": 4.790763907914172e-05, + "loss": 0.4013, "step": 21820 }, { - "epoch": 0.77, - "learning_rate": 4.802580026201502e-05, - "loss": 0.3191, + "epoch": 0.7865715212455401, + "grad_norm": 0.17184041440486908, + "learning_rate": 4.790647026934446e-05, + "loss": 0.4037, "step": 21825 }, { - "epoch": 0.77, - "learning_rate": 4.802469057293201e-05, - "loss": 0.3106, + "epoch": 0.7867517209067647, + "grad_norm": 0.15970267355442047, + "learning_rate": 4.790530114745097e-05, + "loss": 0.4333, "step": 21830 }, { - "epoch": 0.77, - "learning_rate": 4.802358058488856e-05, - "loss": 0.3008, + "epoch": 0.7869319205679893, + "grad_norm": 0.17542044818401337, + "learning_rate": 4.7904131713477196e-05, + "loss": 0.4588, "step": 21835 }, { - "epoch": 0.77, - "learning_rate": 4.802247029789909e-05, - "loss": 0.3206, + "epoch": 0.787112120229214, + "grad_norm": 0.16546791791915894, + "learning_rate": 4.790296196743905e-05, + "loss": 0.4421, "step": 21840 }, { - "epoch": 0.77, - "learning_rate": 4.802135971197802e-05, - "loss": 0.3182, + "epoch": 0.7872923198904386, + "grad_norm": 0.1802026927471161, + "learning_rate": 4.790179190935249e-05, + "loss": 0.4335, "step": 21845 }, { - "epoch": 0.77, - "learning_rate": 4.802024882713976e-05, - "loss": 0.3298, + "epoch": 0.7874725195516632, + "grad_norm": 0.19843994081020355, + "learning_rate": 4.790062153923345e-05, + "loss": 0.4394, "step": 21850 }, { - "epoch": 0.77, - "learning_rate": 4.8019137643398736e-05, - "loss": 0.3019, + "epoch": 0.7876527192128879, + "grad_norm": 0.1890823394060135, + "learning_rate": 4.7899450857097875e-05, + "loss": 0.4763, "step": 21855 }, { - "epoch": 0.77, - "learning_rate": 4.801802616076939e-05, - "loss": 0.2883, + "epoch": 0.7878329188741126, + "grad_norm": 0.15386788547039032, + "learning_rate": 4.789827986296172e-05, + "loss": 0.4183, "step": 21860 }, { - "epoch": 0.77, - "learning_rate": 4.801691437926614e-05, - "loss": 0.2949, + "epoch": 0.7880131185353372, + "grad_norm": 0.17523469030857086, + "learning_rate": 4.789710855684092e-05, + "loss": 0.4289, "step": 21865 }, { - "epoch": 0.77, - "learning_rate": 4.801580229890342e-05, - "loss": 0.2882, + "epoch": 0.7881933181965618, + "grad_norm": 0.1903219223022461, + "learning_rate": 4.789593693875146e-05, + "loss": 0.4049, "step": 21870 }, { - "epoch": 0.77, - "learning_rate": 4.8014689919695685e-05, - "loss": 0.301, + "epoch": 0.7883735178577864, + "grad_norm": 0.21866121888160706, + "learning_rate": 4.7894765008709286e-05, + "loss": 0.4868, "step": 21875 }, { - "epoch": 0.77, - "learning_rate": 4.801357724165736e-05, - "loss": 0.3056, + "epoch": 0.788553717519011, + "grad_norm": 0.13972169160842896, + "learning_rate": 4.789359276673038e-05, + "loss": 0.4237, "step": 21880 }, { - "epoch": 0.77, - "learning_rate": 4.8012464264802906e-05, - "loss": 0.3166, + "epoch": 0.7887339171802357, + "grad_norm": 0.18218587338924408, + "learning_rate": 4.789242021283069e-05, + "loss": 0.4031, "step": 21885 }, { - "epoch": 0.77, - "learning_rate": 4.801135098914677e-05, - "loss": 
0.2869, + "epoch": 0.7889141168414603, + "grad_norm": 0.1976306140422821, + "learning_rate": 4.789124734702622e-05, + "loss": 0.4757, "step": 21890 }, { - "epoch": 0.77, - "learning_rate": 4.80102374147034e-05, - "loss": 0.2947, + "epoch": 0.789094316502685, + "grad_norm": 0.15920376777648926, + "learning_rate": 4.789007416933293e-05, + "loss": 0.4281, "step": 21895 }, { - "epoch": 0.77, - "learning_rate": 4.800912354148727e-05, - "loss": 0.2838, + "epoch": 0.7892745161639096, + "grad_norm": 0.1940070241689682, + "learning_rate": 4.788890067976682e-05, + "loss": 0.4296, "step": 21900 }, { - "epoch": 0.77, - "learning_rate": 4.800800936951283e-05, - "loss": 0.308, + "epoch": 0.7894547158251343, + "grad_norm": 0.1800207644701004, + "learning_rate": 4.788772687834386e-05, + "loss": 0.4521, "step": 21905 }, { - "epoch": 0.77, - "learning_rate": 4.8006894898794554e-05, - "loss": 0.3103, + "epoch": 0.7896349154863589, + "grad_norm": 0.166525736451149, + "learning_rate": 4.7886552765080055e-05, + "loss": 0.4653, "step": 21910 }, { - "epoch": 0.77, - "learning_rate": 4.80057801293469e-05, - "loss": 0.3056, + "epoch": 0.7898151151475835, + "grad_norm": 0.19525845348834991, + "learning_rate": 4.78853783399914e-05, + "loss": 0.4835, "step": 21915 }, { - "epoch": 0.77, - "learning_rate": 4.800466506118436e-05, - "loss": 0.3012, + "epoch": 0.7899953148088081, + "grad_norm": 0.19068776071071625, + "learning_rate": 4.78842036030939e-05, + "loss": 0.427, "step": 21920 }, { - "epoch": 0.77, - "learning_rate": 4.800354969432141e-05, - "loss": 0.2974, + "epoch": 0.7901755144700328, + "grad_norm": 0.1702924221754074, + "learning_rate": 4.7883028554403554e-05, + "loss": 0.4455, "step": 21925 }, { - "epoch": 0.77, - "learning_rate": 4.800243402877252e-05, - "loss": 0.2985, + "epoch": 0.7903557141312574, + "grad_norm": 0.14877758920192719, + "learning_rate": 4.788185319393637e-05, + "loss": 0.4352, "step": 21930 }, { - "epoch": 0.77, - "learning_rate": 4.8001318064552184e-05, - "loss": 0.2776, + "epoch": 0.7905359137924821, + "grad_norm": 0.1357695311307907, + "learning_rate": 4.788067752170837e-05, + "loss": 0.4631, "step": 21935 }, { - "epoch": 0.77, - "learning_rate": 4.800020180167489e-05, - "loss": 0.2897, + "epoch": 0.7907161134537067, + "grad_norm": 0.194273442029953, + "learning_rate": 4.787950153773557e-05, + "loss": 0.4193, "step": 21940 }, { - "epoch": 0.77, - "learning_rate": 4.7999085240155137e-05, - "loss": 0.2792, + "epoch": 0.7908963131149314, + "grad_norm": 0.15530972182750702, + "learning_rate": 4.7878325242033987e-05, + "loss": 0.4188, "step": 21945 }, { - "epoch": 0.77, - "learning_rate": 4.7997968380007416e-05, - "loss": 0.3092, + "epoch": 0.791076512776156, + "grad_norm": 0.14824257791042328, + "learning_rate": 4.7877148634619657e-05, + "loss": 0.4546, "step": 21950 }, { - "epoch": 0.77, - "learning_rate": 4.799685122124624e-05, - "loss": 0.3157, + "epoch": 0.7912567124373806, + "grad_norm": 0.19110989570617676, + "learning_rate": 4.7875971715508606e-05, + "loss": 0.426, "step": 21955 }, { - "epoch": 0.77, - "learning_rate": 4.7995733763886095e-05, - "loss": 0.3111, + "epoch": 0.7914369120986052, + "grad_norm": 0.1911250203847885, + "learning_rate": 4.787479448471686e-05, + "loss": 0.4591, "step": 21960 }, { - "epoch": 0.77, - "learning_rate": 4.79946160079415e-05, - "loss": 0.2896, + "epoch": 0.7916171117598299, + "grad_norm": 0.16899849474430084, + "learning_rate": 4.787361694226048e-05, + "loss": 0.4502, "step": 21965 }, { - "epoch": 0.77, - "learning_rate": 4.7993497953426986e-05, - 
"loss": 0.3364, + "epoch": 0.7917973114210546, + "grad_norm": 0.15524151921272278, + "learning_rate": 4.787243908815548e-05, + "loss": 0.4257, "step": 21970 }, { - "epoch": 0.77, - "learning_rate": 4.799237960035704e-05, - "loss": 0.3244, + "epoch": 0.7919775110822792, + "grad_norm": 0.16246455907821655, + "learning_rate": 4.787126092241795e-05, + "loss": 0.4214, "step": 21975 }, { - "epoch": 0.77, - "learning_rate": 4.799126094874621e-05, - "loss": 0.3001, + "epoch": 0.7921577107435038, + "grad_norm": 0.1904502511024475, + "learning_rate": 4.78700824450639e-05, + "loss": 0.4643, "step": 21980 }, { - "epoch": 0.77, - "learning_rate": 4.7990141998609e-05, - "loss": 0.3243, + "epoch": 0.7923379104047285, + "grad_norm": 0.17854608595371246, + "learning_rate": 4.786890365610941e-05, + "loss": 0.4244, "step": 21985 }, { - "epoch": 0.77, - "learning_rate": 4.798902274995995e-05, - "loss": 0.3208, + "epoch": 0.7925181100659531, + "grad_norm": 0.1559765338897705, + "learning_rate": 4.786772455557054e-05, + "loss": 0.4281, "step": 21990 }, { - "epoch": 0.77, - "learning_rate": 4.7987903202813586e-05, - "loss": 0.318, + "epoch": 0.7926983097271777, + "grad_norm": 0.16417820751667023, + "learning_rate": 4.786654514346335e-05, + "loss": 0.4106, "step": 21995 }, { - "epoch": 0.77, - "learning_rate": 4.798678335718445e-05, - "loss": 0.3, + "epoch": 0.7928785093884023, + "grad_norm": 0.20214970409870148, + "learning_rate": 4.7865365419803896e-05, + "loss": 0.4704, "step": 22000 }, { - "epoch": 0.77, - "eval_loss": 0.30042049288749695, - "eval_runtime": 10.5417, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 0.7928785093884023, + "eval_loss": 0.4604956805706024, + "eval_runtime": 3.5345, + "eval_samples_per_second": 28.293, + "eval_steps_per_second": 7.073, "step": 22000 }, { - "epoch": 0.77, - "learning_rate": 4.7985663213087085e-05, - "loss": 0.3042, + "epoch": 0.7930587090496269, + "grad_norm": 0.14078626036643982, + "learning_rate": 4.786418538460828e-05, + "loss": 0.4562, "step": 22005 }, { - "epoch": 0.77, - "learning_rate": 4.798454277053602e-05, - "loss": 0.3175, + "epoch": 0.7932389087108517, + "grad_norm": 0.15193304419517517, + "learning_rate": 4.7863005037892554e-05, + "loss": 0.482, "step": 22010 }, { - "epoch": 0.77, - "learning_rate": 4.798342202954583e-05, - "loss": 0.3416, + "epoch": 0.7934191083720763, + "grad_norm": 0.1598198413848877, + "learning_rate": 4.786182437967282e-05, + "loss": 0.4144, "step": 22015 }, { - "epoch": 0.77, - "learning_rate": 4.798230099013104e-05, - "loss": 0.3022, + "epoch": 0.7935993080333009, + "grad_norm": 0.1988956481218338, + "learning_rate": 4.786064340996515e-05, + "loss": 0.4673, "step": 22020 }, { - "epoch": 0.77, - "learning_rate": 4.7981179652306215e-05, - "loss": 0.302, + "epoch": 0.7937795076945255, + "grad_norm": 0.19150066375732422, + "learning_rate": 4.7859462128785635e-05, + "loss": 0.4278, "step": 22025 }, { - "epoch": 0.78, - "learning_rate": 4.7980058016085925e-05, - "loss": 0.29, + "epoch": 0.7939597073557502, + "grad_norm": 0.18495355546474457, + "learning_rate": 4.785828053615038e-05, + "loss": 0.4411, "step": 22030 }, { - "epoch": 0.78, - "learning_rate": 4.797893608148473e-05, - "loss": 0.3239, + "epoch": 0.7941399070169748, + "grad_norm": 0.1614546775817871, + "learning_rate": 4.785709863207548e-05, + "loss": 0.3946, "step": 22035 }, { - "epoch": 0.78, - "learning_rate": 4.797781384851719e-05, - "loss": 0.3278, + "epoch": 0.7943201066781994, + "grad_norm": 0.1417195051908493, + "learning_rate": 
4.785591641657704e-05, + "loss": 0.4045, "step": 22040 }, { - "epoch": 0.78, - "learning_rate": 4.7976691317197874e-05, - "loss": 0.3006, + "epoch": 0.794500306339424, + "grad_norm": 0.16670504212379456, + "learning_rate": 4.7854733889671154e-05, + "loss": 0.4343, "step": 22045 }, { - "epoch": 0.78, - "learning_rate": 4.797556848754137e-05, - "loss": 0.3105, + "epoch": 0.7946805060006488, + "grad_norm": 0.17932060360908508, + "learning_rate": 4.785355105137395e-05, + "loss": 0.4841, "step": 22050 }, { - "epoch": 0.78, - "learning_rate": 4.7974445359562256e-05, - "loss": 0.3179, + "epoch": 0.7948607056618734, + "grad_norm": 0.1873452365398407, + "learning_rate": 4.785236790170153e-05, + "loss": 0.432, "step": 22055 }, { - "epoch": 0.78, - "learning_rate": 4.797332193327509e-05, - "loss": 0.3234, + "epoch": 0.795040905323098, + "grad_norm": 0.16498543322086334, + "learning_rate": 4.7851184440670026e-05, + "loss": 0.4354, "step": 22060 }, { - "epoch": 0.78, - "learning_rate": 4.7972198208694494e-05, - "loss": 0.3119, + "epoch": 0.7952211049843226, + "grad_norm": 0.15727977454662323, + "learning_rate": 4.785000066829556e-05, + "loss": 0.4322, "step": 22065 }, { - "epoch": 0.78, - "learning_rate": 4.7971074185835045e-05, - "loss": 0.3022, + "epoch": 0.7954013046455473, + "grad_norm": 0.17471718788146973, + "learning_rate": 4.784881658459426e-05, + "loss": 0.422, "step": 22070 }, { - "epoch": 0.78, - "learning_rate": 4.7969949864711335e-05, - "loss": 0.2895, + "epoch": 0.7955815043067719, + "grad_norm": 0.19372014701366425, + "learning_rate": 4.784763218958226e-05, + "loss": 0.4249, "step": 22075 }, { - "epoch": 0.78, - "learning_rate": 4.796882524533796e-05, - "loss": 0.344, + "epoch": 0.7957617039679965, + "grad_norm": 0.1373492181301117, + "learning_rate": 4.784644748327568e-05, + "loss": 0.4094, "step": 22080 }, { - "epoch": 0.78, - "learning_rate": 4.796770032772953e-05, - "loss": 0.3162, + "epoch": 0.7959419036292211, + "grad_norm": 0.16347211599349976, + "learning_rate": 4.7845262465690695e-05, + "loss": 0.4199, "step": 22085 }, { - "epoch": 0.78, - "learning_rate": 4.796657511190064e-05, - "loss": 0.293, + "epoch": 0.7961221032904459, + "grad_norm": 0.16035209596157074, + "learning_rate": 4.7844077136843426e-05, + "loss": 0.442, "step": 22090 }, { - "epoch": 0.78, - "learning_rate": 4.7965449597865914e-05, - "loss": 0.3211, + "epoch": 0.7963023029516705, + "grad_norm": 0.19094820320606232, + "learning_rate": 4.784289149675002e-05, + "loss": 0.4381, "step": 22095 }, { - "epoch": 0.78, - "learning_rate": 4.796432378563996e-05, - "loss": 0.3143, + "epoch": 0.7964825026128951, + "grad_norm": 0.17128083109855652, + "learning_rate": 4.784170554542665e-05, + "loss": 0.4571, "step": 22100 }, { - "epoch": 0.78, - "learning_rate": 4.796319767523739e-05, - "loss": 0.3107, + "epoch": 0.7966627022741197, + "grad_norm": 0.16196797788143158, + "learning_rate": 4.784051928288946e-05, + "loss": 0.4337, "step": 22105 }, { - "epoch": 0.78, - "learning_rate": 4.796207126667284e-05, - "loss": 0.2938, + "epoch": 0.7968429019353443, + "grad_norm": 0.16744011640548706, + "learning_rate": 4.7839332709154613e-05, + "loss": 0.4011, "step": 22110 }, { - "epoch": 0.78, - "learning_rate": 4.7960944559960915e-05, - "loss": 0.3009, + "epoch": 0.797023101596569, + "grad_norm": 0.1657259315252304, + "learning_rate": 4.783814582423829e-05, + "loss": 0.4232, "step": 22115 }, { - "epoch": 0.78, - "learning_rate": 4.795981755511627e-05, - "loss": 0.3159, + "epoch": 0.7972033012577936, + "grad_norm": 0.17152729630470276, + 
"learning_rate": 4.783695862815664e-05, + "loss": 0.4448, "step": 22120 }, { - "epoch": 0.78, - "learning_rate": 4.7958690252153514e-05, - "loss": 0.3194, + "epoch": 0.7973835009190182, + "grad_norm": 0.17332147061824799, + "learning_rate": 4.783577112092585e-05, + "loss": 0.4088, "step": 22125 }, { - "epoch": 0.78, - "learning_rate": 4.7957562651087304e-05, - "loss": 0.3359, + "epoch": 0.797563700580243, + "grad_norm": 0.1492212414741516, + "learning_rate": 4.783458330256211e-05, + "loss": 0.4846, "step": 22130 }, { - "epoch": 0.78, - "learning_rate": 4.795643475193227e-05, - "loss": 0.3258, + "epoch": 0.7977439002414676, + "grad_norm": 0.21426644921302795, + "learning_rate": 4.783339517308159e-05, + "loss": 0.3948, "step": 22135 }, { - "epoch": 0.78, - "learning_rate": 4.795530655470306e-05, - "loss": 0.304, + "epoch": 0.7979240999026922, + "grad_norm": 0.18977008759975433, + "learning_rate": 4.783220673250048e-05, + "loss": 0.4355, "step": 22140 }, { - "epoch": 0.78, - "learning_rate": 4.795417805941432e-05, - "loss": 0.2929, + "epoch": 0.7981042995639168, + "grad_norm": 0.1697508692741394, + "learning_rate": 4.783101798083498e-05, + "loss": 0.3963, "step": 22145 }, { - "epoch": 0.78, - "learning_rate": 4.795304926608071e-05, - "loss": 0.3054, + "epoch": 0.7982844992251414, + "grad_norm": 0.1844054013490677, + "learning_rate": 4.782982891810127e-05, + "loss": 0.4502, "step": 22150 }, { - "epoch": 0.78, - "learning_rate": 4.795192017471688e-05, - "loss": 0.3056, + "epoch": 0.7984646988863661, + "grad_norm": 0.19432871043682098, + "learning_rate": 4.782863954431557e-05, + "loss": 0.4602, "step": 22155 }, { - "epoch": 0.78, - "learning_rate": 4.795079078533749e-05, - "loss": 0.2981, + "epoch": 0.7986448985475907, + "grad_norm": 0.16126291453838348, + "learning_rate": 4.7827449859494065e-05, + "loss": 0.4409, "step": 22160 }, { - "epoch": 0.78, - "learning_rate": 4.7949661097957205e-05, - "loss": 0.2938, + "epoch": 0.7988250982088154, + "grad_norm": 0.17771978676319122, + "learning_rate": 4.7826259863652975e-05, + "loss": 0.4657, "step": 22165 }, { - "epoch": 0.78, - "learning_rate": 4.79485311125907e-05, - "loss": 0.2924, + "epoch": 0.79900529787004, + "grad_norm": 0.18326117098331451, + "learning_rate": 4.7825069556808525e-05, + "loss": 0.4722, "step": 22170 }, { - "epoch": 0.78, - "learning_rate": 4.7947400829252636e-05, - "loss": 0.3178, + "epoch": 0.7991854975312647, + "grad_norm": 0.14180731773376465, + "learning_rate": 4.782387893897692e-05, + "loss": 0.4298, "step": 22175 }, { - "epoch": 0.78, - "learning_rate": 4.7946270247957704e-05, - "loss": 0.3124, + "epoch": 0.7993656971924893, + "grad_norm": 0.1825062483549118, + "learning_rate": 4.7822688010174376e-05, + "loss": 0.4577, "step": 22180 }, { - "epoch": 0.78, - "learning_rate": 4.7945139368720573e-05, - "loss": 0.3451, + "epoch": 0.7995458968537139, + "grad_norm": 0.19557367265224457, + "learning_rate": 4.782149677041713e-05, + "loss": 0.4641, "step": 22185 }, { - "epoch": 0.78, - "learning_rate": 4.794400819155592e-05, - "loss": 0.2928, + "epoch": 0.7997260965149385, + "grad_norm": 0.17543824017047882, + "learning_rate": 4.782030521972141e-05, + "loss": 0.4703, "step": 22190 }, { - "epoch": 0.78, - "learning_rate": 4.7942876716478444e-05, - "loss": 0.3024, + "epoch": 0.7999062961761632, + "grad_norm": 0.15802043676376343, + "learning_rate": 4.781911335810345e-05, + "loss": 0.4311, "step": 22195 }, { - "epoch": 0.78, - "learning_rate": 4.794174494350284e-05, - "loss": 0.3331, + "epoch": 0.8000864958373878, + "grad_norm": 
0.14436450600624084, + "learning_rate": 4.781792118557948e-05, + "loss": 0.4534, "step": 22200 }, { - "epoch": 0.78, - "learning_rate": 4.79406128726438e-05, - "loss": 0.2988, + "epoch": 0.8002666954986125, + "grad_norm": 0.14026832580566406, + "learning_rate": 4.7816728702165765e-05, + "loss": 0.3809, "step": 22205 }, { - "epoch": 0.78, - "learning_rate": 4.793948050391601e-05, - "loss": 0.2974, + "epoch": 0.8004468951598371, + "grad_norm": 0.1877908557653427, + "learning_rate": 4.781553590787853e-05, + "loss": 0.4493, "step": 22210 }, { - "epoch": 0.78, - "learning_rate": 4.7938347837334186e-05, - "loss": 0.2996, + "epoch": 0.8006270948210618, + "grad_norm": 0.18697671592235565, + "learning_rate": 4.7814342802734034e-05, + "loss": 0.4576, "step": 22215 }, { - "epoch": 0.78, - "learning_rate": 4.793721487291303e-05, - "loss": 0.3041, + "epoch": 0.8008072944822864, + "grad_norm": 0.1507420688867569, + "learning_rate": 4.781314938674855e-05, + "loss": 0.472, "step": 22220 }, { - "epoch": 0.78, - "learning_rate": 4.793608161066726e-05, - "loss": 0.2966, + "epoch": 0.800987494143511, + "grad_norm": 0.11861146241426468, + "learning_rate": 4.78119556599383e-05, + "loss": 0.4527, "step": 22225 }, { - "epoch": 0.78, - "learning_rate": 4.7934948050611586e-05, - "loss": 0.3155, + "epoch": 0.8011676938047356, + "grad_norm": 0.24812163412570953, + "learning_rate": 4.781076162231959e-05, + "loss": 0.4276, "step": 22230 }, { - "epoch": 0.78, - "learning_rate": 4.793381419276073e-05, - "loss": 0.3225, + "epoch": 0.8013478934659602, + "grad_norm": 0.13593071699142456, + "learning_rate": 4.7809567273908656e-05, + "loss": 0.4523, "step": 22235 }, { - "epoch": 0.78, - "learning_rate": 4.79326800371294e-05, - "loss": 0.2849, + "epoch": 0.8015280931271849, + "grad_norm": 0.16076035797595978, + "learning_rate": 4.7808372614721786e-05, + "loss": 0.401, "step": 22240 }, { - "epoch": 0.78, - "learning_rate": 4.793154558373234e-05, - "loss": 0.2952, + "epoch": 0.8017082927884096, + "grad_norm": 0.2315647453069687, + "learning_rate": 4.780717764477526e-05, + "loss": 0.3886, "step": 22245 }, { - "epoch": 0.78, - "learning_rate": 4.793041083258427e-05, - "loss": 0.296, + "epoch": 0.8018884924496342, + "grad_norm": 0.15930241346359253, + "learning_rate": 4.780598236408535e-05, + "loss": 0.4124, "step": 22250 }, { - "epoch": 0.78, - "learning_rate": 4.7929275783699935e-05, - "loss": 0.3116, + "epoch": 0.8020686921108588, + "grad_norm": 0.18420235812664032, + "learning_rate": 4.780478677266835e-05, + "loss": 0.4208, "step": 22255 }, { - "epoch": 0.78, - "learning_rate": 4.7928140437094055e-05, - "loss": 0.2974, + "epoch": 0.8022488917720835, + "grad_norm": 0.17045623064041138, + "learning_rate": 4.780359087054054e-05, + "loss": 0.4138, "step": 22260 }, { - "epoch": 0.78, - "learning_rate": 4.792700479278138e-05, - "loss": 0.2694, + "epoch": 0.8024290914333081, + "grad_norm": 0.165228009223938, + "learning_rate": 4.780239465771822e-05, + "loss": 0.4348, "step": 22265 }, { - "epoch": 0.78, - "learning_rate": 4.7925868850776664e-05, - "loss": 0.3148, + "epoch": 0.8026092910945327, + "grad_norm": 0.20021255314350128, + "learning_rate": 4.78011981342177e-05, + "loss": 0.4712, "step": 22270 }, { - "epoch": 0.78, - "learning_rate": 4.792473261109465e-05, - "loss": 0.3181, + "epoch": 0.8027894907557573, + "grad_norm": 0.18207870423793793, + "learning_rate": 4.7800001300055254e-05, + "loss": 0.4717, "step": 22275 }, { - "epoch": 0.78, - "learning_rate": 4.792359607375009e-05, - "loss": 0.2937, + "epoch": 0.802969690416982, + 
"grad_norm": 0.16366621851921082, + "learning_rate": 4.7798804155247205e-05, + "loss": 0.4682, "step": 22280 }, { - "epoch": 0.78, - "learning_rate": 4.7922459238757746e-05, - "loss": 0.294, + "epoch": 0.8031498900782067, + "grad_norm": 0.15080103278160095, + "learning_rate": 4.7797606699809874e-05, + "loss": 0.409, "step": 22285 }, { - "epoch": 0.78, - "learning_rate": 4.7921322106132366e-05, - "loss": 0.3054, + "epoch": 0.8033300897394313, + "grad_norm": 0.1815640777349472, + "learning_rate": 4.779640893375956e-05, + "loss": 0.456, "step": 22290 }, { - "epoch": 0.78, - "learning_rate": 4.792018467588873e-05, - "loss": 0.3309, + "epoch": 0.8035102894006559, + "grad_norm": 0.17141170799732208, + "learning_rate": 4.7795210857112585e-05, + "loss": 0.4179, "step": 22295 }, { - "epoch": 0.78, - "learning_rate": 4.7919046948041605e-05, - "loss": 0.2885, + "epoch": 0.8036904890618806, + "grad_norm": 0.1825985610485077, + "learning_rate": 4.7794012469885276e-05, + "loss": 0.4053, "step": 22300 }, { - "epoch": 0.78, - "learning_rate": 4.791790892260575e-05, - "loss": 0.3064, + "epoch": 0.8038706887231052, + "grad_norm": 0.17858922481536865, + "learning_rate": 4.779281377209396e-05, + "loss": 0.4419, "step": 22305 }, { - "epoch": 0.78, - "learning_rate": 4.791677059959595e-05, - "loss": 0.3486, + "epoch": 0.8040508883843298, + "grad_norm": 0.18266747891902924, + "learning_rate": 4.779161476375497e-05, + "loss": 0.4313, "step": 22310 }, { - "epoch": 0.79, - "learning_rate": 4.7915631979027e-05, - "loss": 0.3017, + "epoch": 0.8042310880455544, + "grad_norm": 0.15619586408138275, + "learning_rate": 4.7790415444884645e-05, + "loss": 0.4456, "step": 22315 }, { - "epoch": 0.79, - "learning_rate": 4.7914493060913654e-05, - "loss": 0.2882, + "epoch": 0.8044112877067792, + "grad_norm": 0.14940191805362701, + "learning_rate": 4.778921581549932e-05, + "loss": 0.4155, "step": 22320 }, { - "epoch": 0.79, - "learning_rate": 4.7913353845270725e-05, - "loss": 0.3062, + "epoch": 0.8045914873680038, + "grad_norm": 0.18177048861980438, + "learning_rate": 4.778801587561535e-05, + "loss": 0.4538, "step": 22325 }, { - "epoch": 0.79, - "learning_rate": 4.7912214332113e-05, - "loss": 0.3241, + "epoch": 0.8047716870292284, + "grad_norm": 0.19796302914619446, + "learning_rate": 4.778681562524906e-05, + "loss": 0.4435, "step": 22330 }, { - "epoch": 0.79, - "learning_rate": 4.7911074521455266e-05, - "loss": 0.2649, + "epoch": 0.804951886690453, + "grad_norm": 0.17838706076145172, + "learning_rate": 4.778561506441682e-05, + "loss": 0.4309, "step": 22335 }, { - "epoch": 0.79, - "learning_rate": 4.7909934413312335e-05, - "loss": 0.32, + "epoch": 0.8051320863516777, + "grad_norm": 0.14537879824638367, + "learning_rate": 4.7784414193135e-05, + "loss": 0.4494, "step": 22340 }, { - "epoch": 0.79, - "learning_rate": 4.7908794007698996e-05, - "loss": 0.2956, + "epoch": 0.8053122860129023, + "grad_norm": 0.2153496891260147, + "learning_rate": 4.778321301141994e-05, + "loss": 0.4376, "step": 22345 }, { - "epoch": 0.79, - "learning_rate": 4.7907653304630074e-05, - "loss": 0.3181, + "epoch": 0.8054924856741269, + "grad_norm": 0.17818701267242432, + "learning_rate": 4.7782011519288e-05, + "loss": 0.4245, "step": 22350 }, { - "epoch": 0.79, - "learning_rate": 4.790651230412037e-05, - "loss": 0.2962, + "epoch": 0.8056726853353515, + "grad_norm": 0.15808773040771484, + "learning_rate": 4.778080971675558e-05, + "loss": 0.42, "step": 22355 }, { - "epoch": 0.79, - "learning_rate": 4.790537100618469e-05, - "loss": 0.3048, + "epoch": 
0.8058528849965763, + "grad_norm": 0.16724058985710144, + "learning_rate": 4.777960760383904e-05, + "loss": 0.4454, "step": 22360 }, { - "epoch": 0.79, - "learning_rate": 4.790422941083786e-05, - "loss": 0.3162, + "epoch": 0.8060330846578009, + "grad_norm": 0.14002850651741028, + "learning_rate": 4.777840518055475e-05, + "loss": 0.4447, "step": 22365 }, { - "epoch": 0.79, - "learning_rate": 4.790308751809471e-05, - "loss": 0.3088, + "epoch": 0.8062132843190255, + "grad_norm": 0.1951669454574585, + "learning_rate": 4.77772024469191e-05, + "loss": 0.4589, "step": 22370 }, { - "epoch": 0.79, - "learning_rate": 4.790194532797007e-05, - "loss": 0.3218, + "epoch": 0.8063934839802501, + "grad_norm": 0.17650310695171356, + "learning_rate": 4.7775999402948476e-05, + "loss": 0.3997, "step": 22375 }, { - "epoch": 0.79, - "learning_rate": 4.790080284047876e-05, - "loss": 0.2781, + "epoch": 0.8065736836414747, + "grad_norm": 0.17684021592140198, + "learning_rate": 4.7774796048659276e-05, + "loss": 0.4246, "step": 22380 }, { - "epoch": 0.79, - "learning_rate": 4.789966005563561e-05, - "loss": 0.2965, + "epoch": 0.8067538833026994, + "grad_norm": 0.18748098611831665, + "learning_rate": 4.7773592384067884e-05, + "loss": 0.4371, "step": 22385 }, { - "epoch": 0.79, - "learning_rate": 4.7898516973455474e-05, - "loss": 0.3253, + "epoch": 0.806934082963924, + "grad_norm": 0.19001294672489166, + "learning_rate": 4.7772388409190704e-05, + "loss": 0.4281, "step": 22390 }, { - "epoch": 0.79, - "learning_rate": 4.789737359395319e-05, - "loss": 0.2921, + "epoch": 0.8071142826251486, + "grad_norm": 0.1857748180627823, + "learning_rate": 4.7771184124044144e-05, + "loss": 0.4078, "step": 22395 }, { - "epoch": 0.79, - "learning_rate": 4.789622991714359e-05, - "loss": 0.2875, + "epoch": 0.8072944822863733, + "grad_norm": 0.1612103283405304, + "learning_rate": 4.776997952864461e-05, + "loss": 0.4418, "step": 22400 }, { - "epoch": 0.79, - "learning_rate": 4.789508594304154e-05, - "loss": 0.2885, + "epoch": 0.807474681947598, + "grad_norm": 0.1528964787721634, + "learning_rate": 4.7768774623008506e-05, + "loss": 0.4683, "step": 22405 }, { - "epoch": 0.79, - "learning_rate": 4.789394167166188e-05, - "loss": 0.3002, + "epoch": 0.8076548816088226, + "grad_norm": 0.15798471868038177, + "learning_rate": 4.776756940715226e-05, + "loss": 0.4182, "step": 22410 }, { - "epoch": 0.79, - "learning_rate": 4.7892797103019476e-05, - "loss": 0.3167, + "epoch": 0.8078350812700472, + "grad_norm": 0.16511400043964386, + "learning_rate": 4.7766363881092294e-05, + "loss": 0.4249, "step": 22415 }, { - "epoch": 0.79, - "learning_rate": 4.7891652237129194e-05, - "loss": 0.3041, + "epoch": 0.8080152809312718, + "grad_norm": 0.17201100289821625, + "learning_rate": 4.776515804484502e-05, + "loss": 0.4506, "step": 22420 }, { - "epoch": 0.79, - "learning_rate": 4.789050707400588e-05, - "loss": 0.3157, + "epoch": 0.8081954805924965, + "grad_norm": 0.2050049602985382, + "learning_rate": 4.776395189842688e-05, + "loss": 0.4595, "step": 22425 }, { - "epoch": 0.79, - "learning_rate": 4.7889361613664436e-05, - "loss": 0.3145, + "epoch": 0.8083756802537211, + "grad_norm": 0.16215232014656067, + "learning_rate": 4.7762745441854296e-05, + "loss": 0.4377, "step": 22430 }, { - "epoch": 0.79, - "learning_rate": 4.7888215856119714e-05, - "loss": 0.2968, + "epoch": 0.8085558799149457, + "grad_norm": 0.1810922622680664, + "learning_rate": 4.776153867514372e-05, + "loss": 0.4346, "step": 22435 }, { - "epoch": 0.79, - "learning_rate": 4.7887069801386585e-05, - "loss": 
0.3106, + "epoch": 0.8087360795761704, + "grad_norm": 0.1510697603225708, + "learning_rate": 4.7760331598311584e-05, + "loss": 0.4279, "step": 22440 }, { - "epoch": 0.79, - "learning_rate": 4.7885923449479936e-05, - "loss": 0.3012, + "epoch": 0.8089162792373951, + "grad_norm": 0.21630224585533142, + "learning_rate": 4.7759124211374335e-05, + "loss": 0.4654, "step": 22445 }, { - "epoch": 0.79, - "learning_rate": 4.788477680041465e-05, - "loss": 0.28, + "epoch": 0.8090964788986197, + "grad_norm": 0.1879904568195343, + "learning_rate": 4.775791651434843e-05, + "loss": 0.3944, "step": 22450 }, { - "epoch": 0.79, - "learning_rate": 4.788362985420564e-05, - "loss": 0.2875, + "epoch": 0.8092766785598443, + "grad_norm": 0.20264166593551636, + "learning_rate": 4.7756708507250314e-05, + "loss": 0.4777, "step": 22455 }, { - "epoch": 0.79, - "learning_rate": 4.788248261086776e-05, - "loss": 0.3307, + "epoch": 0.8094568782210689, + "grad_norm": 0.1782803237438202, + "learning_rate": 4.775550019009645e-05, + "loss": 0.4346, "step": 22460 }, { - "epoch": 0.79, - "learning_rate": 4.788133507041592e-05, - "loss": 0.2875, + "epoch": 0.8096370778822936, + "grad_norm": 0.16567277908325195, + "learning_rate": 4.77542915629033e-05, + "loss": 0.4615, "step": 22465 }, { - "epoch": 0.79, - "learning_rate": 4.788018723286504e-05, - "loss": 0.2809, + "epoch": 0.8098172775435182, + "grad_norm": 0.17552456259727478, + "learning_rate": 4.7753082625687334e-05, + "loss": 0.4801, "step": 22470 }, { - "epoch": 0.79, - "learning_rate": 4.787903909822999e-05, - "loss": 0.2954, + "epoch": 0.8099974772047429, + "grad_norm": 0.21018646657466888, + "learning_rate": 4.7751873378465026e-05, + "loss": 0.4294, "step": 22475 }, { - "epoch": 0.79, - "learning_rate": 4.7877890666525703e-05, - "loss": 0.2881, + "epoch": 0.8101776768659675, + "grad_norm": 0.16765153408050537, + "learning_rate": 4.7750663821252844e-05, + "loss": 0.4349, "step": 22480 }, { - "epoch": 0.79, - "learning_rate": 4.787674193776709e-05, - "loss": 0.2969, + "epoch": 0.8103578765271922, + "grad_norm": 0.18359611928462982, + "learning_rate": 4.7749453954067275e-05, + "loss": 0.4012, "step": 22485 }, { - "epoch": 0.79, - "learning_rate": 4.7875592911969045e-05, - "loss": 0.3009, + "epoch": 0.8105380761884168, + "grad_norm": 0.19209226965904236, + "learning_rate": 4.774824377692479e-05, + "loss": 0.4422, "step": 22490 }, { - "epoch": 0.79, - "learning_rate": 4.787444358914651e-05, - "loss": 0.3125, + "epoch": 0.8107182758496414, + "grad_norm": 0.18595997989177704, + "learning_rate": 4.77470332898419e-05, + "loss": 0.4376, "step": 22495 }, { - "epoch": 0.79, - "learning_rate": 4.78732939693144e-05, - "loss": 0.3025, + "epoch": 0.810898475510866, + "grad_norm": 0.12643958628177643, + "learning_rate": 4.774582249283509e-05, + "loss": 0.3845, "step": 22500 }, { - "epoch": 0.79, - "eval_loss": 0.2996106445789337, - "eval_runtime": 10.5303, - "eval_samples_per_second": 9.496, - "eval_steps_per_second": 9.496, + "epoch": 0.810898475510866, + "eval_loss": 0.46091198921203613, + "eval_runtime": 3.5434, + "eval_samples_per_second": 28.221, + "eval_steps_per_second": 7.055, "step": 22500 }, { - "epoch": 0.79, - "learning_rate": 4.7872144052487636e-05, - "loss": 0.2975, + "epoch": 0.8110786751720906, + "grad_norm": 0.18900783360004425, + "learning_rate": 4.774461138592085e-05, + "loss": 0.4837, "step": 22505 }, { - "epoch": 0.79, - "learning_rate": 4.7870993838681166e-05, - "loss": 0.3188, + "epoch": 0.8112588748333153, + "grad_norm": 0.1918240487575531, + "learning_rate": 
4.774339996911567e-05, + "loss": 0.4452, "step": 22510 }, { - "epoch": 0.79, - "learning_rate": 4.7869843327909904e-05, - "loss": 0.3126, + "epoch": 0.81143907449454, + "grad_norm": 0.18408475816249847, + "learning_rate": 4.7742188242436075e-05, + "loss": 0.4227, "step": 22515 }, { - "epoch": 0.79, - "learning_rate": 4.78686925201888e-05, - "loss": 0.3167, + "epoch": 0.8116192741557646, + "grad_norm": 0.1483888477087021, + "learning_rate": 4.774097620589857e-05, + "loss": 0.465, "step": 22520 }, { - "epoch": 0.79, - "learning_rate": 4.7867541415532796e-05, - "loss": 0.3221, + "epoch": 0.8117994738169892, + "grad_norm": 0.1650637537240982, + "learning_rate": 4.773976385951967e-05, + "loss": 0.4325, "step": 22525 }, { - "epoch": 0.79, - "learning_rate": 4.786639001395684e-05, - "loss": 0.2836, + "epoch": 0.8119796734782139, + "grad_norm": 0.19324971735477448, + "learning_rate": 4.773855120331588e-05, + "loss": 0.4326, "step": 22530 }, { - "epoch": 0.79, - "learning_rate": 4.786523831547587e-05, - "loss": 0.2995, + "epoch": 0.8121598731394385, + "grad_norm": 0.16592490673065186, + "learning_rate": 4.773733823730374e-05, + "loss": 0.4163, "step": 22535 }, { - "epoch": 0.79, - "learning_rate": 4.7864086320104853e-05, - "loss": 0.3091, + "epoch": 0.8123400728006631, + "grad_norm": 0.17151491343975067, + "learning_rate": 4.773612496149977e-05, + "loss": 0.4414, "step": 22540 }, { - "epoch": 0.79, - "learning_rate": 4.786293402785875e-05, - "loss": 0.2919, + "epoch": 0.8125202724618877, + "grad_norm": 0.1465132236480713, + "learning_rate": 4.77349113759205e-05, + "loss": 0.4609, "step": 22545 }, { - "epoch": 0.79, - "learning_rate": 4.786178143875251e-05, - "loss": 0.2988, + "epoch": 0.8127004721231124, + "grad_norm": 0.1558566689491272, + "learning_rate": 4.7733697480582464e-05, + "loss": 0.4365, "step": 22550 }, { - "epoch": 0.79, - "learning_rate": 4.786062855280111e-05, - "loss": 0.3053, + "epoch": 0.8128806717843371, + "grad_norm": 0.16560520231723785, + "learning_rate": 4.7732483275502194e-05, + "loss": 0.438, "step": 22555 }, { - "epoch": 0.79, - "learning_rate": 4.78594753700195e-05, - "loss": 0.3258, + "epoch": 0.8130608714455617, + "grad_norm": 0.14548031985759735, + "learning_rate": 4.773126876069625e-05, + "loss": 0.4234, "step": 22560 }, { - "epoch": 0.79, - "learning_rate": 4.785832189042268e-05, - "loss": 0.303, + "epoch": 0.8132410711067863, + "grad_norm": 0.16290007531642914, + "learning_rate": 4.773005393618116e-05, + "loss": 0.407, "step": 22565 }, { - "epoch": 0.79, - "learning_rate": 4.785716811402561e-05, - "loss": 0.2957, + "epoch": 0.813421270768011, + "grad_norm": 0.16611815989017487, + "learning_rate": 4.7728838801973485e-05, + "loss": 0.3838, "step": 22570 }, { - "epoch": 0.79, - "learning_rate": 4.785601404084328e-05, - "loss": 0.3172, + "epoch": 0.8136014704292356, + "grad_norm": 0.1640740931034088, + "learning_rate": 4.772762335808979e-05, + "loss": 0.4413, "step": 22575 }, { - "epoch": 0.79, - "learning_rate": 4.785485967089066e-05, - "loss": 0.3203, + "epoch": 0.8137816700904602, + "grad_norm": 0.1402037888765335, + "learning_rate": 4.772640760454663e-05, + "loss": 0.4131, "step": 22580 }, { - "epoch": 0.79, - "learning_rate": 4.7853705004182756e-05, - "loss": 0.3185, + "epoch": 0.8139618697516848, + "grad_norm": 0.15909187495708466, + "learning_rate": 4.772519154136056e-05, + "loss": 0.4276, "step": 22585 }, { - "epoch": 0.79, - "learning_rate": 4.785255004073455e-05, - "loss": 0.3293, + "epoch": 0.8141420694129095, + "grad_norm": 0.16949698328971863, + 
"learning_rate": 4.772397516854815e-05, + "loss": 0.4531, "step": 22590 }, { - "epoch": 0.79, - "learning_rate": 4.785139478056105e-05, - "loss": 0.2998, + "epoch": 0.8143222690741342, + "grad_norm": 0.19324524700641632, + "learning_rate": 4.7722758486125986e-05, + "loss": 0.4291, "step": 22595 }, { - "epoch": 0.8, - "learning_rate": 4.785023922367725e-05, - "loss": 0.3287, + "epoch": 0.8145024687353588, + "grad_norm": 0.17862281203269958, + "learning_rate": 4.772154149411063e-05, + "loss": 0.469, "step": 22600 }, { - "epoch": 0.8, - "learning_rate": 4.784908337009815e-05, - "loss": 0.2895, + "epoch": 0.8146826683965834, + "grad_norm": 0.19284473359584808, + "learning_rate": 4.772032419251868e-05, + "loss": 0.4235, "step": 22605 }, { - "epoch": 0.8, - "learning_rate": 4.784792721983875e-05, - "loss": 0.3014, + "epoch": 0.814862868057808, + "grad_norm": 0.14753711223602295, + "learning_rate": 4.77191065813667e-05, + "loss": 0.4528, "step": 22610 }, { - "epoch": 0.8, - "learning_rate": 4.784677077291408e-05, - "loss": 0.3052, + "epoch": 0.8150430677190327, + "grad_norm": 0.1892748773097992, + "learning_rate": 4.7717888660671306e-05, + "loss": 0.4374, "step": 22615 }, { - "epoch": 0.8, - "learning_rate": 4.784561402933915e-05, - "loss": 0.2981, + "epoch": 0.8152232673802573, + "grad_norm": 0.15627089142799377, + "learning_rate": 4.771667043044906e-05, + "loss": 0.4639, "step": 22620 }, { - "epoch": 0.8, - "learning_rate": 4.784445698912898e-05, - "loss": 0.2791, + "epoch": 0.8154034670414819, + "grad_norm": 0.1614178568124771, + "learning_rate": 4.771545189071659e-05, + "loss": 0.449, "step": 22625 }, { - "epoch": 0.8, - "learning_rate": 4.784329965229858e-05, - "loss": 0.3382, + "epoch": 0.8155836667027065, + "grad_norm": 0.18917955458164215, + "learning_rate": 4.771423304149049e-05, + "loss": 0.4695, "step": 22630 }, { - "epoch": 0.8, - "learning_rate": 4.7842142018863e-05, - "loss": 0.3277, + "epoch": 0.8157638663639313, + "grad_norm": 0.17164795100688934, + "learning_rate": 4.771301388278735e-05, + "loss": 0.4405, "step": 22635 }, { - "epoch": 0.8, - "learning_rate": 4.784098408883725e-05, - "loss": 0.2887, + "epoch": 0.8159440660251559, + "grad_norm": 0.18476401269435883, + "learning_rate": 4.7711794414623796e-05, + "loss": 0.4584, "step": 22640 }, { - "epoch": 0.8, - "learning_rate": 4.783982586223638e-05, - "loss": 0.3109, + "epoch": 0.8161242656863805, + "grad_norm": 0.1459241509437561, + "learning_rate": 4.771057463701644e-05, + "loss": 0.4258, "step": 22645 }, { - "epoch": 0.8, - "learning_rate": 4.783866733907543e-05, - "loss": 0.2923, + "epoch": 0.8163044653476051, + "grad_norm": 0.16828380525112152, + "learning_rate": 4.77093545499819e-05, + "loss": 0.4332, "step": 22650 }, { - "epoch": 0.8, - "learning_rate": 4.7837508519369425e-05, - "loss": 0.3152, + "epoch": 0.8164846650088298, + "grad_norm": 0.14585858583450317, + "learning_rate": 4.770813415353681e-05, + "loss": 0.4004, "step": 22655 }, { - "epoch": 0.8, - "learning_rate": 4.783634940313342e-05, - "loss": 0.3044, + "epoch": 0.8166648646700544, + "grad_norm": 0.16096548736095428, + "learning_rate": 4.7706913447697785e-05, + "loss": 0.4693, "step": 22660 }, { - "epoch": 0.8, - "learning_rate": 4.7835189990382475e-05, - "loss": 0.3201, + "epoch": 0.816845064331279, + "grad_norm": 0.19337119162082672, + "learning_rate": 4.7705692432481455e-05, + "loss": 0.4197, "step": 22665 }, { - "epoch": 0.8, - "learning_rate": 4.7834030281131636e-05, - "loss": 0.3069, + "epoch": 0.8170252639925037, + "grad_norm": 0.1468539834022522, + 
"learning_rate": 4.770447110790447e-05, + "loss": 0.4135, "step": 22670 }, { - "epoch": 0.8, - "learning_rate": 4.7832870275395956e-05, - "loss": 0.3071, + "epoch": 0.8172054636537284, + "grad_norm": 0.13654807209968567, + "learning_rate": 4.770324947398346e-05, + "loss": 0.4459, "step": 22675 }, { - "epoch": 0.8, - "learning_rate": 4.783170997319051e-05, - "loss": 0.306, + "epoch": 0.817385663314953, + "grad_norm": 0.16397765278816223, + "learning_rate": 4.770202753073506e-05, + "loss": 0.4188, "step": 22680 }, { - "epoch": 0.8, - "learning_rate": 4.783054937453035e-05, - "loss": 0.3189, + "epoch": 0.8175658629761776, + "grad_norm": 0.20366117358207703, + "learning_rate": 4.770080527817594e-05, + "loss": 0.4387, "step": 22685 }, { - "epoch": 0.8, - "learning_rate": 4.782938847943056e-05, - "loss": 0.3142, + "epoch": 0.8177460626374022, + "grad_norm": 0.1833733767271042, + "learning_rate": 4.7699582716322743e-05, + "loss": 0.4509, "step": 22690 }, { - "epoch": 0.8, - "learning_rate": 4.7828227287906205e-05, - "loss": 0.2958, + "epoch": 0.8179262622986269, + "grad_norm": 0.18486203253269196, + "learning_rate": 4.7698359845192126e-05, + "loss": 0.4042, "step": 22695 }, { - "epoch": 0.8, - "learning_rate": 4.782706579997236e-05, - "loss": 0.3093, + "epoch": 0.8181064619598515, + "grad_norm": 0.17110316455364227, + "learning_rate": 4.769713666480075e-05, + "loss": 0.406, "step": 22700 }, { - "epoch": 0.8, - "learning_rate": 4.7825904015644106e-05, - "loss": 0.3015, + "epoch": 0.8182866616210761, + "grad_norm": 0.15279895067214966, + "learning_rate": 4.769591317516528e-05, + "loss": 0.4436, "step": 22705 }, { - "epoch": 0.8, - "learning_rate": 4.782474193493654e-05, - "loss": 0.3103, + "epoch": 0.8184668612823008, + "grad_norm": 0.15406733751296997, + "learning_rate": 4.769468937630239e-05, + "loss": 0.4464, "step": 22710 }, { - "epoch": 0.8, - "learning_rate": 4.7823579557864725e-05, - "loss": 0.274, + "epoch": 0.8186470609435255, + "grad_norm": 0.16378438472747803, + "learning_rate": 4.769346526822874e-05, + "loss": 0.4166, "step": 22715 }, { - "epoch": 0.8, - "learning_rate": 4.7822416884443786e-05, - "loss": 0.2857, + "epoch": 0.8188272606047501, + "grad_norm": 0.16179268062114716, + "learning_rate": 4.769224085096103e-05, + "loss": 0.3881, "step": 22720 }, { - "epoch": 0.8, - "learning_rate": 4.78212539146888e-05, - "loss": 0.3126, + "epoch": 0.8190074602659747, + "grad_norm": 0.18712985515594482, + "learning_rate": 4.769101612451593e-05, + "loss": 0.4326, "step": 22725 }, { - "epoch": 0.8, - "learning_rate": 4.782009064861488e-05, - "loss": 0.3157, + "epoch": 0.8191876599271993, + "grad_norm": 0.1865232139825821, + "learning_rate": 4.768979108891013e-05, + "loss": 0.4499, "step": 22730 }, { - "epoch": 0.8, - "learning_rate": 4.7818927086237106e-05, - "loss": 0.3064, + "epoch": 0.819367859588424, + "grad_norm": 0.1458199918270111, + "learning_rate": 4.7688565744160315e-05, + "loss": 0.4193, "step": 22735 }, { - "epoch": 0.8, - "learning_rate": 4.7817763227570614e-05, - "loss": 0.3148, + "epoch": 0.8195480592496486, + "grad_norm": 0.17637084424495697, + "learning_rate": 4.768734009028319e-05, + "loss": 0.4239, "step": 22740 }, { - "epoch": 0.8, - "learning_rate": 4.78165990726305e-05, - "loss": 0.2939, + "epoch": 0.8197282589108732, + "grad_norm": 0.14654532074928284, + "learning_rate": 4.768611412729545e-05, + "loss": 0.4284, "step": 22745 }, { - "epoch": 0.8, - "learning_rate": 4.7815434621431884e-05, - "loss": 0.2772, + "epoch": 0.8199084585720979, + "grad_norm": 0.19782017171382904, + 
"learning_rate": 4.768488785521379e-05, + "loss": 0.4591, "step": 22750 }, { - "epoch": 0.8, - "learning_rate": 4.7814269873989895e-05, - "loss": 0.2954, + "epoch": 0.8200886582333226, + "grad_norm": 0.126174658536911, + "learning_rate": 4.768366127405493e-05, + "loss": 0.4302, "step": 22755 }, { - "epoch": 0.8, - "learning_rate": 4.7813104830319634e-05, - "loss": 0.3113, + "epoch": 0.8202688578945472, + "grad_norm": 0.17687900364398956, + "learning_rate": 4.768243438383557e-05, + "loss": 0.446, "step": 22760 }, { - "epoch": 0.8, - "learning_rate": 4.781193949043624e-05, - "loss": 0.3328, + "epoch": 0.8204490575557718, + "grad_norm": 0.17083390057086945, + "learning_rate": 4.768120718457244e-05, + "loss": 0.4627, "step": 22765 }, { - "epoch": 0.8, - "learning_rate": 4.781077385435485e-05, - "loss": 0.3097, + "epoch": 0.8206292572169964, + "grad_norm": 0.16993774473667145, + "learning_rate": 4.767997967628225e-05, + "loss": 0.4264, "step": 22770 }, { - "epoch": 0.8, - "learning_rate": 4.780960792209059e-05, - "loss": 0.3012, + "epoch": 0.820809456878221, + "grad_norm": 0.19042931497097015, + "learning_rate": 4.767875185898173e-05, + "loss": 0.4299, "step": 22775 }, { - "epoch": 0.8, - "learning_rate": 4.780844169365861e-05, - "loss": 0.3298, + "epoch": 0.8209896565394457, + "grad_norm": 0.19879989326000214, + "learning_rate": 4.767752373268761e-05, + "loss": 0.4124, "step": 22780 }, { - "epoch": 0.8, - "learning_rate": 4.780727516907405e-05, - "loss": 0.2949, + "epoch": 0.8211698562006703, + "grad_norm": 0.17709743976593018, + "learning_rate": 4.7676295297416615e-05, + "loss": 0.4415, "step": 22785 }, { - "epoch": 0.8, - "learning_rate": 4.780610834835204e-05, - "loss": 0.321, + "epoch": 0.821350055861895, + "grad_norm": 0.1532343327999115, + "learning_rate": 4.767506655318549e-05, + "loss": 0.3988, "step": 22790 }, { - "epoch": 0.8, - "learning_rate": 4.780494123150775e-05, - "loss": 0.3075, + "epoch": 0.8215302555231196, + "grad_norm": 0.20457062125205994, + "learning_rate": 4.767383750001097e-05, + "loss": 0.4313, "step": 22795 }, { - "epoch": 0.8, - "learning_rate": 4.780377381855633e-05, - "loss": 0.3277, + "epoch": 0.8217104551843443, + "grad_norm": 0.1625969558954239, + "learning_rate": 4.76726081379098e-05, + "loss": 0.4156, "step": 22800 }, { - "epoch": 0.8, - "learning_rate": 4.780260610951293e-05, - "loss": 0.3016, + "epoch": 0.8218906548455689, + "grad_norm": 0.17059491574764252, + "learning_rate": 4.7671378466898735e-05, + "loss": 0.4303, "step": 22805 }, { - "epoch": 0.8, - "learning_rate": 4.7801438104392715e-05, - "loss": 0.3104, + "epoch": 0.8220708545067935, + "grad_norm": 0.18344855308532715, + "learning_rate": 4.767014848699453e-05, + "loss": 0.4356, "step": 22810 }, { - "epoch": 0.8, - "learning_rate": 4.7800269803210856e-05, - "loss": 0.3077, + "epoch": 0.8222510541680181, + "grad_norm": 0.21067775785923004, + "learning_rate": 4.766891819821394e-05, + "loss": 0.4577, "step": 22815 }, { - "epoch": 0.8, - "learning_rate": 4.779910120598252e-05, - "loss": 0.3111, + "epoch": 0.8224312538292428, + "grad_norm": 0.1624084860086441, + "learning_rate": 4.766768760057374e-05, + "loss": 0.441, "step": 22820 }, { - "epoch": 0.8, - "learning_rate": 4.779793231272288e-05, - "loss": 0.2961, + "epoch": 0.8226114534904675, + "grad_norm": 0.16547471284866333, + "learning_rate": 4.766645669409067e-05, + "loss": 0.4524, "step": 22825 }, { - "epoch": 0.8, - "learning_rate": 4.779676312344711e-05, - "loss": 0.3126, + "epoch": 0.8227916531516921, + "grad_norm": 0.15320567786693573, + 
"learning_rate": 4.766522547878152e-05, + "loss": 0.3913, "step": 22830 }, { - "epoch": 0.8, - "learning_rate": 4.77955936381704e-05, - "loss": 0.3014, + "epoch": 0.8229718528129167, + "grad_norm": 0.19396758079528809, + "learning_rate": 4.766399395466307e-05, + "loss": 0.4019, "step": 22835 }, { - "epoch": 0.8, - "learning_rate": 4.779442385690793e-05, - "loss": 0.3251, + "epoch": 0.8231520524741414, + "grad_norm": 0.16974592208862305, + "learning_rate": 4.766276212175207e-05, + "loss": 0.4173, "step": 22840 }, { - "epoch": 0.8, - "learning_rate": 4.779325377967488e-05, - "loss": 0.3128, + "epoch": 0.823332252135366, + "grad_norm": 0.17741717398166656, + "learning_rate": 4.766152998006534e-05, + "loss": 0.4183, "step": 22845 }, { - "epoch": 0.8, - "learning_rate": 4.779208340648646e-05, - "loss": 0.3177, + "epoch": 0.8235124517965906, + "grad_norm": 0.20168620347976685, + "learning_rate": 4.766029752961965e-05, + "loss": 0.4499, "step": 22850 }, { - "epoch": 0.8, - "learning_rate": 4.779091273735785e-05, - "loss": 0.324, + "epoch": 0.8236926514578152, + "grad_norm": 0.1789730191230774, + "learning_rate": 4.765906477043179e-05, + "loss": 0.4358, "step": 22855 }, { - "epoch": 0.8, - "learning_rate": 4.7789741772304265e-05, - "loss": 0.2845, + "epoch": 0.8238728511190399, + "grad_norm": 0.1608288586139679, + "learning_rate": 4.765783170251856e-05, + "loss": 0.4312, "step": 22860 }, { - "epoch": 0.8, - "learning_rate": 4.7788570511340905e-05, - "loss": 0.2829, + "epoch": 0.8240530507802646, + "grad_norm": 0.16104714572429657, + "learning_rate": 4.7656598325896755e-05, + "loss": 0.4676, "step": 22865 }, { - "epoch": 0.8, - "learning_rate": 4.778739895448296e-05, - "loss": 0.2705, + "epoch": 0.8242332504414892, + "grad_norm": 0.17927196621894836, + "learning_rate": 4.765536464058319e-05, + "loss": 0.4367, "step": 22870 }, { - "epoch": 0.8, - "learning_rate": 4.7786227101745666e-05, - "loss": 0.3156, + "epoch": 0.8244134501027138, + "grad_norm": 0.1429985910654068, + "learning_rate": 4.7654130646594666e-05, + "loss": 0.4697, "step": 22875 }, { - "epoch": 0.8, - "learning_rate": 4.778505495314423e-05, - "loss": 0.2754, + "epoch": 0.8245936497639385, + "grad_norm": 0.1574939787387848, + "learning_rate": 4.7652896343948e-05, + "loss": 0.4621, "step": 22880 }, { - "epoch": 0.81, - "learning_rate": 4.7783882508693876e-05, - "loss": 0.2873, + "epoch": 0.8247738494251631, + "grad_norm": 0.15647585690021515, + "learning_rate": 4.765166173266001e-05, + "loss": 0.4169, "step": 22885 }, { - "epoch": 0.81, - "learning_rate": 4.7782709768409815e-05, - "loss": 0.3089, + "epoch": 0.8249540490863877, + "grad_norm": 0.15062108635902405, + "learning_rate": 4.7650426812747505e-05, + "loss": 0.4242, "step": 22890 }, { - "epoch": 0.81, - "learning_rate": 4.778153673230728e-05, - "loss": 0.3223, + "epoch": 0.8251342487476123, + "grad_norm": 0.15938465297222137, + "learning_rate": 4.764919158422733e-05, + "loss": 0.44, "step": 22895 }, { - "epoch": 0.81, - "learning_rate": 4.778036340040152e-05, - "loss": 0.3066, + "epoch": 0.8253144484088369, + "grad_norm": 0.19385161995887756, + "learning_rate": 4.76479560471163e-05, + "loss": 0.4743, "step": 22900 }, { - "epoch": 0.81, - "learning_rate": 4.777918977270773e-05, - "loss": 0.2998, + "epoch": 0.8254946480700617, + "grad_norm": 0.1362161636352539, + "learning_rate": 4.764672020143125e-05, + "loss": 0.417, "step": 22905 }, { - "epoch": 0.81, - "learning_rate": 4.777801584924119e-05, - "loss": 0.289, + "epoch": 0.8256748477312863, + "grad_norm": 0.1929071843624115, + 
"learning_rate": 4.7645484047189025e-05, + "loss": 0.4504, "step": 22910 }, { - "epoch": 0.81, - "learning_rate": 4.777684163001712e-05, - "loss": 0.3045, + "epoch": 0.8258550473925109, + "grad_norm": 0.17340517044067383, + "learning_rate": 4.764424758440647e-05, + "loss": 0.4313, "step": 22915 }, { - "epoch": 0.81, - "learning_rate": 4.777566711505078e-05, - "loss": 0.315, + "epoch": 0.8260352470537355, + "grad_norm": 0.13786520063877106, + "learning_rate": 4.764301081310042e-05, + "loss": 0.4284, "step": 22920 }, { - "epoch": 0.81, - "learning_rate": 4.7774492304357396e-05, - "loss": 0.2783, + "epoch": 0.8262154467149602, + "grad_norm": 0.22200287878513336, + "learning_rate": 4.764177373328773e-05, + "loss": 0.4638, "step": 22925 }, { - "epoch": 0.81, - "learning_rate": 4.7773317197952256e-05, - "loss": 0.3309, + "epoch": 0.8263956463761848, + "grad_norm": 0.1627381592988968, + "learning_rate": 4.764053634498526e-05, + "loss": 0.473, "step": 22930 }, { - "epoch": 0.81, - "learning_rate": 4.777214179585059e-05, - "loss": 0.3209, + "epoch": 0.8265758460374094, + "grad_norm": 0.18759234249591827, + "learning_rate": 4.763929864820986e-05, + "loss": 0.4233, "step": 22935 }, { - "epoch": 0.81, - "learning_rate": 4.777096609806767e-05, - "loss": 0.2981, + "epoch": 0.826756045698634, + "grad_norm": 0.16739268600940704, + "learning_rate": 4.7638060642978405e-05, + "loss": 0.4639, "step": 22940 }, { - "epoch": 0.81, - "learning_rate": 4.7769790104618764e-05, - "loss": 0.3165, + "epoch": 0.8269362453598588, + "grad_norm": 0.13698381185531616, + "learning_rate": 4.763682232930776e-05, + "loss": 0.4376, "step": 22945 }, { - "epoch": 0.81, - "learning_rate": 4.7768613815519145e-05, - "loss": 0.2977, + "epoch": 0.8271164450210834, + "grad_norm": 0.16585668921470642, + "learning_rate": 4.763558370721478e-05, + "loss": 0.3934, "step": 22950 }, { - "epoch": 0.81, - "learning_rate": 4.7767437230784075e-05, - "loss": 0.3124, + "epoch": 0.827296644682308, + "grad_norm": 0.1562475860118866, + "learning_rate": 4.7634344776716364e-05, + "loss": 0.4305, "step": 22955 }, { - "epoch": 0.81, - "learning_rate": 4.776626035042884e-05, - "loss": 0.2972, + "epoch": 0.8274768443435326, + "grad_norm": 0.18153679370880127, + "learning_rate": 4.763310553782938e-05, + "loss": 0.4395, "step": 22960 }, { - "epoch": 0.81, - "learning_rate": 4.776508317446871e-05, - "loss": 0.2934, + "epoch": 0.8276570440047573, + "grad_norm": 0.21862560510635376, + "learning_rate": 4.7631865990570715e-05, + "loss": 0.4454, "step": 22965 }, { - "epoch": 0.81, - "learning_rate": 4.7763905702918984e-05, - "loss": 0.2941, + "epoch": 0.8278372436659819, + "grad_norm": 0.16682131588459015, + "learning_rate": 4.763062613495726e-05, + "loss": 0.4646, "step": 22970 }, { - "epoch": 0.81, - "learning_rate": 4.776272793579494e-05, - "loss": 0.3158, + "epoch": 0.8280174433272065, + "grad_norm": 0.19076871871948242, + "learning_rate": 4.76293859710059e-05, + "loss": 0.4251, "step": 22975 }, { - "epoch": 0.81, - "learning_rate": 4.776154987311188e-05, - "loss": 0.3337, + "epoch": 0.8281976429884311, + "grad_norm": 0.14495553076267242, + "learning_rate": 4.7628145498733543e-05, + "loss": 0.4469, "step": 22980 }, { - "epoch": 0.81, - "learning_rate": 4.7760371514885096e-05, - "loss": 0.3142, + "epoch": 0.8283778426496559, + "grad_norm": 0.15680228173732758, + "learning_rate": 4.762690471815708e-05, + "loss": 0.4153, "step": 22985 }, { - "epoch": 0.81, - "learning_rate": 4.7759192861129886e-05, - "loss": 0.308, + "epoch": 0.8285580423108805, + "grad_norm": 
0.15533293783664703, + "learning_rate": 4.762566362929343e-05, + "loss": 0.426, "step": 22990 }, { - "epoch": 0.81, - "learning_rate": 4.775801391186156e-05, - "loss": 0.3155, + "epoch": 0.8287382419721051, + "grad_norm": 0.16373470425605774, + "learning_rate": 4.762442223215949e-05, + "loss": 0.4247, "step": 22995 }, { - "epoch": 0.81, - "learning_rate": 4.775683466709541e-05, - "loss": 0.3034, + "epoch": 0.8289184416333297, + "grad_norm": 0.15683810412883759, + "learning_rate": 4.762318052677217e-05, + "loss": 0.4058, "step": 23000 }, { - "epoch": 0.81, - "eval_loss": 0.2990756332874298, - "eval_runtime": 10.544, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 0.8289184416333297, + "eval_loss": 0.4584188461303711, + "eval_runtime": 3.5371, + "eval_samples_per_second": 28.271, + "eval_steps_per_second": 7.068, "step": 23000 }, { - "epoch": 0.81, - "learning_rate": 4.7755655126846766e-05, - "loss": 0.3311, + "epoch": 0.8290986412945544, + "grad_norm": 0.20195883512496948, + "learning_rate": 4.76219385131484e-05, + "loss": 0.4855, "step": 23005 }, { - "epoch": 0.81, - "learning_rate": 4.775447529113093e-05, - "loss": 0.3021, + "epoch": 0.829278840955779, + "grad_norm": 0.1833004504442215, + "learning_rate": 4.76206961913051e-05, + "loss": 0.4323, "step": 23010 }, { - "epoch": 0.81, - "learning_rate": 4.775329515996324e-05, - "loss": 0.3034, + "epoch": 0.8294590406170036, + "grad_norm": 0.19199278950691223, + "learning_rate": 4.7619453561259195e-05, + "loss": 0.4518, "step": 23015 }, { - "epoch": 0.81, - "learning_rate": 4.7752114733358996e-05, - "loss": 0.3255, + "epoch": 0.8296392402782283, + "grad_norm": 0.16530929505825043, + "learning_rate": 4.76182106230276e-05, + "loss": 0.4471, "step": 23020 }, { - "epoch": 0.81, - "learning_rate": 4.775093401133355e-05, - "loss": 0.3309, + "epoch": 0.829819439939453, + "grad_norm": 0.1725630909204483, + "learning_rate": 4.761696737662728e-05, + "loss": 0.4395, "step": 23025 }, { - "epoch": 0.81, - "learning_rate": 4.774975299390221e-05, - "loss": 0.3222, + "epoch": 0.8299996396006776, + "grad_norm": 0.17936821281909943, + "learning_rate": 4.761572382207515e-05, + "loss": 0.4192, "step": 23030 }, { - "epoch": 0.81, - "learning_rate": 4.774857168108032e-05, - "loss": 0.3248, + "epoch": 0.8301798392619022, + "grad_norm": 0.18170322477817535, + "learning_rate": 4.761447995938817e-05, + "loss": 0.4352, "step": 23035 }, { - "epoch": 0.81, - "learning_rate": 4.7747390072883217e-05, - "loss": 0.2998, + "epoch": 0.8303600389231268, + "grad_norm": 0.18376685678958893, + "learning_rate": 4.761323578858327e-05, + "loss": 0.4149, "step": 23040 }, { - "epoch": 0.81, - "learning_rate": 4.7746208169326246e-05, - "loss": 0.2918, + "epoch": 0.8305402385843514, + "grad_norm": 0.16382743418216705, + "learning_rate": 4.761199130967742e-05, + "loss": 0.4194, "step": 23045 }, { - "epoch": 0.81, - "learning_rate": 4.7745025970424756e-05, - "loss": 0.3339, + "epoch": 0.8307204382455761, + "grad_norm": 0.2041010707616806, + "learning_rate": 4.761074652268755e-05, + "loss": 0.457, "step": 23050 }, { - "epoch": 0.81, - "learning_rate": 4.7743843476194096e-05, - "loss": 0.2757, + "epoch": 0.8309006379068007, + "grad_norm": 0.15971267223358154, + "learning_rate": 4.7609501427630657e-05, + "loss": 0.41, "step": 23055 }, { - "epoch": 0.81, - "learning_rate": 4.774266068664961e-05, - "loss": 0.3043, + "epoch": 0.8310808375680254, + "grad_norm": 0.16985180974006653, + "learning_rate": 4.7608256024523666e-05, + "loss": 0.4303, "step": 23060 }, { - 
"epoch": 0.81, - "learning_rate": 4.7741477601806664e-05, - "loss": 0.3021, + "epoch": 0.83126103722925, + "grad_norm": 0.1820903718471527, + "learning_rate": 4.760701031338358e-05, + "loss": 0.4765, "step": 23065 }, { - "epoch": 0.81, - "learning_rate": 4.7740294221680616e-05, - "loss": 0.2808, + "epoch": 0.8314412368904747, + "grad_norm": 0.14193592965602875, + "learning_rate": 4.760576429422734e-05, + "loss": 0.4375, "step": 23070 }, { - "epoch": 0.81, - "learning_rate": 4.7739110546286844e-05, - "loss": 0.2895, + "epoch": 0.8316214365516993, + "grad_norm": 0.1694120466709137, + "learning_rate": 4.760451796707195e-05, + "loss": 0.4215, "step": 23075 }, { - "epoch": 0.81, - "learning_rate": 4.7737926575640705e-05, - "loss": 0.3191, + "epoch": 0.8318016362129239, + "grad_norm": 0.172800675034523, + "learning_rate": 4.7603271331934376e-05, + "loss": 0.4102, "step": 23080 }, { - "epoch": 0.81, - "learning_rate": 4.7736742309757565e-05, - "loss": 0.3305, + "epoch": 0.8319818358741485, + "grad_norm": 0.15721376240253448, + "learning_rate": 4.76020243888316e-05, + "loss": 0.4472, "step": 23085 }, { - "epoch": 0.81, - "learning_rate": 4.7735557748652815e-05, - "loss": 0.3037, + "epoch": 0.8321620355353732, + "grad_norm": 0.15566657483577728, + "learning_rate": 4.760077713778062e-05, + "loss": 0.3864, "step": 23090 }, { - "epoch": 0.81, - "learning_rate": 4.773437289234183e-05, - "loss": 0.2879, + "epoch": 0.8323422351965978, + "grad_norm": 0.18366749584674835, + "learning_rate": 4.759952957879843e-05, + "loss": 0.4561, "step": 23095 }, { - "epoch": 0.81, - "learning_rate": 4.7733187740839996e-05, - "loss": 0.3036, + "epoch": 0.8325224348578225, + "grad_norm": 0.15724977850914001, + "learning_rate": 4.759828171190202e-05, + "loss": 0.428, "step": 23100 }, { - "epoch": 0.81, - "learning_rate": 4.7732002294162706e-05, - "loss": 0.3107, + "epoch": 0.8327026345190471, + "grad_norm": 0.2003716081380844, + "learning_rate": 4.7597033537108405e-05, + "loss": 0.4906, "step": 23105 }, { - "epoch": 0.81, - "learning_rate": 4.773081655232534e-05, - "loss": 0.3171, + "epoch": 0.8328828341802718, + "grad_norm": 0.1818516105413437, + "learning_rate": 4.759578505443458e-05, + "loss": 0.4225, "step": 23110 }, { - "epoch": 0.81, - "learning_rate": 4.77296305153433e-05, - "loss": 0.3143, + "epoch": 0.8330630338414964, + "grad_norm": 0.21417291462421417, + "learning_rate": 4.759453626389756e-05, + "loss": 0.4341, "step": 23115 }, { - "epoch": 0.81, - "learning_rate": 4.7728444183232e-05, - "loss": 0.3031, + "epoch": 0.833243233502721, + "grad_norm": 0.16403871774673462, + "learning_rate": 4.759328716551435e-05, + "loss": 0.4445, "step": 23120 }, { - "epoch": 0.81, - "learning_rate": 4.772725755600682e-05, - "loss": 0.3024, + "epoch": 0.8334234331639456, + "grad_norm": 0.16569961607456207, + "learning_rate": 4.759203775930198e-05, + "loss": 0.4387, "step": 23125 }, { - "epoch": 0.81, - "learning_rate": 4.772607063368318e-05, - "loss": 0.2787, + "epoch": 0.8336036328251702, + "grad_norm": 0.16702339053153992, + "learning_rate": 4.7590788045277474e-05, + "loss": 0.3888, "step": 23130 }, { - "epoch": 0.81, - "learning_rate": 4.772488341627649e-05, - "loss": 0.3211, + "epoch": 0.8337838324863949, + "grad_norm": 0.16976197063922882, + "learning_rate": 4.758953802345785e-05, + "loss": 0.4482, "step": 23135 }, { - "epoch": 0.81, - "learning_rate": 4.772369590380217e-05, - "loss": 0.2972, + "epoch": 0.8339640321476196, + "grad_norm": 0.16811056435108185, + "learning_rate": 4.758828769386015e-05, + "loss": 0.4274, "step": 
23140 }, { - "epoch": 0.81, - "learning_rate": 4.7722508096275626e-05, - "loss": 0.2936, + "epoch": 0.8341442318088442, + "grad_norm": 0.13746735453605652, + "learning_rate": 4.758703705650139e-05, + "loss": 0.4351, "step": 23145 }, { - "epoch": 0.81, - "learning_rate": 4.7721319993712286e-05, - "loss": 0.3176, + "epoch": 0.8343244314700689, + "grad_norm": 0.16167333722114563, + "learning_rate": 4.758578611139864e-05, + "loss": 0.4553, "step": 23150 }, { - "epoch": 0.81, - "learning_rate": 4.7720131596127594e-05, - "loss": 0.2945, + "epoch": 0.8345046311312935, + "grad_norm": 0.16944856941699982, + "learning_rate": 4.758453485856892e-05, + "loss": 0.3807, "step": 23155 }, { - "epoch": 0.81, - "learning_rate": 4.771894290353695e-05, - "loss": 0.2996, + "epoch": 0.8346848307925181, + "grad_norm": 0.17565567791461945, + "learning_rate": 4.758328329802928e-05, + "loss": 0.4513, "step": 23160 }, { - "epoch": 0.82, - "learning_rate": 4.771775391595582e-05, - "loss": 0.3121, + "epoch": 0.8348650304537427, + "grad_norm": 0.1966121941804886, + "learning_rate": 4.758203142979678e-05, + "loss": 0.4304, "step": 23165 }, { - "epoch": 0.82, - "learning_rate": 4.771656463339962e-05, - "loss": 0.3092, + "epoch": 0.8350452301149673, + "grad_norm": 0.15772442519664764, + "learning_rate": 4.7580779253888476e-05, + "loss": 0.4289, "step": 23170 }, { - "epoch": 0.82, - "learning_rate": 4.7715375055883804e-05, - "loss": 0.2792, + "epoch": 0.8352254297761921, + "grad_norm": 0.16442222893238068, + "learning_rate": 4.757952677032143e-05, + "loss": 0.4614, "step": 23175 }, { - "epoch": 0.82, - "learning_rate": 4.771418518342381e-05, - "loss": 0.2955, + "epoch": 0.8354056294374167, + "grad_norm": 0.15727081894874573, + "learning_rate": 4.7578273979112696e-05, + "loss": 0.4287, "step": 23180 }, { - "epoch": 0.82, - "learning_rate": 4.771299501603509e-05, - "loss": 0.3136, + "epoch": 0.8355858290986413, + "grad_norm": 0.15362627804279327, + "learning_rate": 4.757702088027935e-05, + "loss": 0.4199, "step": 23185 }, { - "epoch": 0.82, - "learning_rate": 4.7711804553733095e-05, - "loss": 0.3088, + "epoch": 0.8357660287598659, + "grad_norm": 0.1784539520740509, + "learning_rate": 4.757576747383847e-05, + "loss": 0.4247, "step": 23190 }, { - "epoch": 0.82, - "learning_rate": 4.771061379653329e-05, - "loss": 0.2912, + "epoch": 0.8359462284210906, + "grad_norm": 0.1273956149816513, + "learning_rate": 4.757451375980713e-05, + "loss": 0.4282, "step": 23195 }, { - "epoch": 0.82, - "learning_rate": 4.7709422744451126e-05, - "loss": 0.3338, + "epoch": 0.8361264280823152, + "grad_norm": 0.10995710641145706, + "learning_rate": 4.757325973820241e-05, + "loss": 0.393, "step": 23200 }, { - "epoch": 0.82, - "learning_rate": 4.7708231397502076e-05, - "loss": 0.2899, + "epoch": 0.8363066277435398, + "grad_norm": 0.1851365864276886, + "learning_rate": 4.75720054090414e-05, + "loss": 0.4583, "step": 23205 }, { - "epoch": 0.82, - "learning_rate": 4.770703975570161e-05, - "loss": 0.3269, + "epoch": 0.8364868274047644, + "grad_norm": 0.16413268446922302, + "learning_rate": 4.7570750772341174e-05, + "loss": 0.4797, "step": 23210 }, { - "epoch": 0.82, - "learning_rate": 4.7705847819065195e-05, - "loss": 0.3311, + "epoch": 0.8366670270659892, + "grad_norm": 0.1683937907218933, + "learning_rate": 4.756949582811885e-05, + "loss": 0.4372, "step": 23215 }, { - "epoch": 0.82, - "learning_rate": 4.770465558760831e-05, - "loss": 0.3322, + "epoch": 0.8368472267272138, + "grad_norm": 0.1728678047657013, + "learning_rate": 4.7568240576391507e-05, + 
"loss": 0.4376, "step": 23220 }, { - "epoch": 0.82, - "learning_rate": 4.770346306134643e-05, - "loss": 0.3201, + "epoch": 0.8370274263884384, + "grad_norm": 0.20877417922019958, + "learning_rate": 4.7566985017176255e-05, + "loss": 0.4624, "step": 23225 }, { - "epoch": 0.82, - "learning_rate": 4.770227024029506e-05, - "loss": 0.2873, + "epoch": 0.837207626049663, + "grad_norm": 0.18232528865337372, + "learning_rate": 4.756572915049021e-05, + "loss": 0.4714, "step": 23230 }, { - "epoch": 0.82, - "learning_rate": 4.7701077124469655e-05, - "loss": 0.2996, + "epoch": 0.8373878257108877, + "grad_norm": 0.1842896044254303, + "learning_rate": 4.756447297635047e-05, + "loss": 0.4341, "step": 23235 }, { - "epoch": 0.82, - "learning_rate": 4.769988371388573e-05, - "loss": 0.3076, + "epoch": 0.8375680253721123, + "grad_norm": 0.17772360146045685, + "learning_rate": 4.756321649477415e-05, + "loss": 0.4225, "step": 23240 }, { - "epoch": 0.82, - "learning_rate": 4.769869000855878e-05, - "loss": 0.3316, + "epoch": 0.8377482250333369, + "grad_norm": 0.16995792090892792, + "learning_rate": 4.756195970577838e-05, + "loss": 0.4395, "step": 23245 }, { - "epoch": 0.82, - "learning_rate": 4.7697496008504297e-05, - "loss": 0.3065, + "epoch": 0.8379284246945615, + "grad_norm": 0.1724376529455185, + "learning_rate": 4.7560702609380275e-05, + "loss": 0.428, "step": 23250 }, { - "epoch": 0.82, - "learning_rate": 4.769630171373778e-05, - "loss": 0.2948, + "epoch": 0.8381086243557863, + "grad_norm": 0.1635865867137909, + "learning_rate": 4.755944520559697e-05, + "loss": 0.424, "step": 23255 }, { - "epoch": 0.82, - "learning_rate": 4.769510712427475e-05, - "loss": 0.3184, + "epoch": 0.8382888240170109, + "grad_norm": 0.14121320843696594, + "learning_rate": 4.755818749444558e-05, + "loss": 0.4165, "step": 23260 }, { - "epoch": 0.82, - "learning_rate": 4.7693912240130703e-05, - "loss": 0.2942, + "epoch": 0.8384690236782355, + "grad_norm": 0.19829101860523224, + "learning_rate": 4.755692947594326e-05, + "loss": 0.4237, "step": 23265 }, { - "epoch": 0.82, - "learning_rate": 4.7692717061321165e-05, - "loss": 0.3225, + "epoch": 0.8386492233394601, + "grad_norm": 0.17292529344558716, + "learning_rate": 4.755567115010714e-05, + "loss": 0.4166, "step": 23270 }, { - "epoch": 0.82, - "learning_rate": 4.7691521587861655e-05, - "loss": 0.328, + "epoch": 0.8388294230006847, + "grad_norm": 0.174628347158432, + "learning_rate": 4.755441251695437e-05, + "loss": 0.4377, "step": 23275 }, { - "epoch": 0.82, - "learning_rate": 4.769032581976769e-05, - "loss": 0.3131, + "epoch": 0.8390096226619094, + "grad_norm": 0.1492447406053543, + "learning_rate": 4.75531535765021e-05, + "loss": 0.4591, "step": 23280 }, { - "epoch": 0.82, - "learning_rate": 4.7689129757054795e-05, - "loss": 0.3128, + "epoch": 0.839189822323134, + "grad_norm": 0.20081254839897156, + "learning_rate": 4.755189432876747e-05, + "loss": 0.4029, "step": 23285 }, { - "epoch": 0.82, - "learning_rate": 4.76879333997385e-05, - "loss": 0.2992, + "epoch": 0.8393700219843586, + "grad_norm": 0.19274207949638367, + "learning_rate": 4.755063477376766e-05, + "loss": 0.475, "step": 23290 }, { - "epoch": 0.82, - "learning_rate": 4.768673674783435e-05, - "loss": 0.3081, + "epoch": 0.8395502216455833, + "grad_norm": 0.16275402903556824, + "learning_rate": 4.75493749115198e-05, + "loss": 0.456, "step": 23295 }, { - "epoch": 0.82, - "learning_rate": 4.768553980135787e-05, - "loss": 0.2858, + "epoch": 0.839730421306808, + "grad_norm": 0.2030659317970276, + "learning_rate": 
4.7548114742041084e-05, + "loss": 0.4322, "step": 23300 }, { - "epoch": 0.82, - "learning_rate": 4.768434256032461e-05, - "loss": 0.332, + "epoch": 0.8399106209680326, + "grad_norm": 0.15633316338062286, + "learning_rate": 4.754685426534866e-05, + "loss": 0.4611, "step": 23305 }, { - "epoch": 0.82, - "learning_rate": 4.768314502475011e-05, - "loss": 0.3253, + "epoch": 0.8400908206292572, + "grad_norm": 0.17400497198104858, + "learning_rate": 4.754559348145972e-05, + "loss": 0.4066, "step": 23310 }, { - "epoch": 0.82, - "learning_rate": 4.768194719464991e-05, - "loss": 0.3177, + "epoch": 0.8402710202904818, + "grad_norm": 0.157339945435524, + "learning_rate": 4.754433239039143e-05, + "loss": 0.4289, "step": 23315 }, { - "epoch": 0.82, - "learning_rate": 4.768074907003959e-05, - "loss": 0.314, + "epoch": 0.8404512199517065, + "grad_norm": 0.1500500738620758, + "learning_rate": 4.7543070992160984e-05, + "loss": 0.4215, "step": 23320 }, { - "epoch": 0.82, - "learning_rate": 4.7679550650934676e-05, - "loss": 0.2749, + "epoch": 0.8406314196129311, + "grad_norm": 0.21953436732292175, + "learning_rate": 4.754180928678555e-05, + "loss": 0.4825, "step": 23325 }, { - "epoch": 0.82, - "learning_rate": 4.767835193735075e-05, - "loss": 0.2995, + "epoch": 0.8408116192741558, + "grad_norm": 0.1740504950284958, + "learning_rate": 4.754054727428233e-05, + "loss": 0.4395, "step": 23330 }, { - "epoch": 0.82, - "learning_rate": 4.767715292930337e-05, - "loss": 0.3238, + "epoch": 0.8409918189353804, + "grad_norm": 0.17277638614177704, + "learning_rate": 4.753928495466853e-05, + "loss": 0.431, "step": 23335 }, { - "epoch": 0.82, - "learning_rate": 4.76759536268081e-05, - "loss": 0.312, + "epoch": 0.8411720185966051, + "grad_norm": 0.196901336312294, + "learning_rate": 4.7538022327961316e-05, + "loss": 0.4826, "step": 23340 }, { - "epoch": 0.82, - "learning_rate": 4.7674754029880524e-05, - "loss": 0.2929, + "epoch": 0.8413522182578297, + "grad_norm": 0.2405674159526825, + "learning_rate": 4.7536759394177925e-05, + "loss": 0.4524, "step": 23345 }, { - "epoch": 0.82, - "learning_rate": 4.76735541385362e-05, - "loss": 0.3103, + "epoch": 0.8415324179190543, + "grad_norm": 0.1807735562324524, + "learning_rate": 4.7535496153335544e-05, + "loss": 0.4144, "step": 23350 }, { - "epoch": 0.82, - "learning_rate": 4.767235395279073e-05, - "loss": 0.3366, + "epoch": 0.8417126175802789, + "grad_norm": 0.1897483915090561, + "learning_rate": 4.75342326054514e-05, + "loss": 0.431, "step": 23355 }, { - "epoch": 0.82, - "learning_rate": 4.767115347265968e-05, - "loss": 0.3116, + "epoch": 0.8418928172415036, + "grad_norm": 0.17141766846179962, + "learning_rate": 4.7532968750542694e-05, + "loss": 0.4671, "step": 23360 }, { - "epoch": 0.82, - "learning_rate": 4.766995269815864e-05, - "loss": 0.3052, + "epoch": 0.8420730169027282, + "grad_norm": 0.17842607200145721, + "learning_rate": 4.753170458862665e-05, + "loss": 0.471, "step": 23365 }, { - "epoch": 0.82, - "learning_rate": 4.766875162930321e-05, - "loss": 0.3002, + "epoch": 0.8422532165639529, + "grad_norm": 0.14743296802043915, + "learning_rate": 4.75304401197205e-05, + "loss": 0.4735, "step": 23370 }, { - "epoch": 0.82, - "learning_rate": 4.7667550266108974e-05, - "loss": 0.3176, + "epoch": 0.8424334162251775, + "grad_norm": 0.1760016530752182, + "learning_rate": 4.7529175343841455e-05, + "loss": 0.431, "step": 23375 }, { - "epoch": 0.82, - "learning_rate": 4.766634860859154e-05, - "loss": 0.2901, + "epoch": 0.8426136158864022, + "grad_norm": 0.16929085552692413, + 
"learning_rate": 4.7527910261006755e-05, + "loss": 0.4272, "step": 23380 }, { - "epoch": 0.82, - "learning_rate": 4.76651466567665e-05, - "loss": 0.2979, + "epoch": 0.8427938155476268, + "grad_norm": 0.167909175157547, + "learning_rate": 4.752664487123365e-05, + "loss": 0.4715, "step": 23385 }, { - "epoch": 0.82, - "learning_rate": 4.766394441064948e-05, - "loss": 0.3301, + "epoch": 0.8429740152088514, + "grad_norm": 0.17611196637153625, + "learning_rate": 4.752537917453937e-05, + "loss": 0.4361, "step": 23390 }, { - "epoch": 0.82, - "learning_rate": 4.766274187025607e-05, - "loss": 0.3071, + "epoch": 0.843154214870076, + "grad_norm": 0.1789659559726715, + "learning_rate": 4.752411317094115e-05, + "loss": 0.4534, "step": 23395 }, { - "epoch": 0.82, - "learning_rate": 4.766153903560189e-05, - "loss": 0.3096, + "epoch": 0.8433344145313006, + "grad_norm": 0.17830581963062286, + "learning_rate": 4.752284686045626e-05, + "loss": 0.4091, "step": 23400 }, { - "epoch": 0.82, - "learning_rate": 4.766033590670256e-05, - "loss": 0.313, + "epoch": 0.8435146141925253, + "grad_norm": 0.18294012546539307, + "learning_rate": 4.752158024310194e-05, + "loss": 0.4673, "step": 23405 }, { - "epoch": 0.82, - "learning_rate": 4.765913248357371e-05, - "loss": 0.3152, + "epoch": 0.84369481385375, + "grad_norm": 0.15136177837848663, + "learning_rate": 4.752031331889545e-05, + "loss": 0.4319, "step": 23410 }, { - "epoch": 0.82, - "learning_rate": 4.7657928766230944e-05, - "loss": 0.2958, + "epoch": 0.8438750135149746, + "grad_norm": 0.20712289214134216, + "learning_rate": 4.751904608785405e-05, + "loss": 0.4255, "step": 23415 }, { - "epoch": 0.82, - "learning_rate": 4.7656724754689916e-05, - "loss": 0.3091, + "epoch": 0.8440552131761992, + "grad_norm": 0.1756359040737152, + "learning_rate": 4.7517778549994994e-05, + "loss": 0.4445, "step": 23420 }, { - "epoch": 0.82, - "learning_rate": 4.765552044896625e-05, - "loss": 0.3127, + "epoch": 0.8442354128374239, + "grad_norm": 0.15145882964134216, + "learning_rate": 4.751651070533558e-05, + "loss": 0.4436, "step": 23425 }, { - "epoch": 0.82, - "learning_rate": 4.7654315849075574e-05, - "loss": 0.3271, + "epoch": 0.8444156124986485, + "grad_norm": 0.18007196485996246, + "learning_rate": 4.7515242553893056e-05, + "loss": 0.4498, "step": 23430 }, { - "epoch": 0.82, - "learning_rate": 4.7653110955033545e-05, - "loss": 0.2826, + "epoch": 0.8445958121598731, + "grad_norm": 0.17577087879180908, + "learning_rate": 4.751397409568472e-05, + "loss": 0.4107, "step": 23435 }, { - "epoch": 0.82, - "learning_rate": 4.765190576685579e-05, - "loss": 0.3016, + "epoch": 0.8447760118210977, + "grad_norm": 0.1596294343471527, + "learning_rate": 4.7512705330727847e-05, + "loss": 0.4042, "step": 23440 }, { - "epoch": 0.82, - "learning_rate": 4.765070028455797e-05, - "loss": 0.308, + "epoch": 0.8449562114823224, + "grad_norm": 0.14652547240257263, + "learning_rate": 4.751143625903972e-05, + "loss": 0.4424, "step": 23445 }, { - "epoch": 0.83, - "learning_rate": 4.7649494508155735e-05, - "loss": 0.2744, + "epoch": 0.8451364111435471, + "grad_norm": 0.14940057694911957, + "learning_rate": 4.751016688063763e-05, + "loss": 0.4387, "step": 23450 }, { - "epoch": 0.83, - "learning_rate": 4.7648288437664746e-05, - "loss": 0.3216, + "epoch": 0.8453166108047717, + "grad_norm": 0.16962915658950806, + "learning_rate": 4.750889719553888e-05, + "loss": 0.447, "step": 23455 }, { - "epoch": 0.83, - "learning_rate": 4.764708207310065e-05, - "loss": 0.2988, + "epoch": 0.8454968104659963, + "grad_norm": 
0.193365216255188, + "learning_rate": 4.7507627203760754e-05, + "loss": 0.4235, "step": 23460 }, { - "epoch": 0.83, - "learning_rate": 4.7645875414479115e-05, - "loss": 0.3203, + "epoch": 0.845677010127221, + "grad_norm": 0.24096348881721497, + "learning_rate": 4.7506356905320574e-05, + "loss": 0.4828, "step": 23465 }, { - "epoch": 0.83, - "learning_rate": 4.764466846181582e-05, - "loss": 0.3257, + "epoch": 0.8458572097884456, + "grad_norm": 0.17045122385025024, + "learning_rate": 4.7505086300235635e-05, + "loss": 0.3911, "step": 23470 }, { - "epoch": 0.83, - "learning_rate": 4.7643461215126425e-05, - "loss": 0.3117, + "epoch": 0.8460374094496702, + "grad_norm": 0.18029987812042236, + "learning_rate": 4.750381538852325e-05, + "loss": 0.4495, "step": 23475 }, { - "epoch": 0.83, - "learning_rate": 4.764225367442661e-05, - "loss": 0.3221, + "epoch": 0.8462176091108948, + "grad_norm": 0.1832016408443451, + "learning_rate": 4.7502544170200735e-05, + "loss": 0.4109, "step": 23480 }, { - "epoch": 0.83, - "learning_rate": 4.764104583973205e-05, - "loss": 0.3148, + "epoch": 0.8463978087721195, + "grad_norm": 0.14569438993930817, + "learning_rate": 4.750127264528542e-05, + "loss": 0.4221, "step": 23485 }, { - "epoch": 0.83, - "learning_rate": 4.7639837711058435e-05, - "loss": 0.3044, + "epoch": 0.8465780084333442, + "grad_norm": 0.16590553522109985, + "learning_rate": 4.750000081379462e-05, + "loss": 0.4384, "step": 23490 }, { - "epoch": 0.83, - "learning_rate": 4.763862928842144e-05, - "loss": 0.3233, + "epoch": 0.8467582080945688, + "grad_norm": 0.201650008559227, + "learning_rate": 4.749872867574566e-05, + "loss": 0.4363, "step": 23495 }, { - "epoch": 0.83, - "learning_rate": 4.7637420571836775e-05, - "loss": 0.3088, + "epoch": 0.8469384077557934, + "grad_norm": 0.20361419022083282, + "learning_rate": 4.7497456231155884e-05, + "loss": 0.407, "step": 23500 }, { - "epoch": 0.83, - "eval_loss": 0.29781535267829895, - "eval_runtime": 10.5453, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 0.8469384077557934, + "eval_loss": 0.4585105776786804, + "eval_runtime": 3.5434, + "eval_samples_per_second": 28.221, + "eval_steps_per_second": 7.055, "step": 23500 }, { - "epoch": 0.83, - "learning_rate": 4.763621156132011e-05, - "loss": 0.2941, + "epoch": 0.847118607417018, + "grad_norm": 0.1582178771495819, + "learning_rate": 4.749618348004262e-05, + "loss": 0.441, "step": 23505 }, { - "epoch": 0.83, - "learning_rate": 4.763500225688716e-05, - "loss": 0.3113, + "epoch": 0.8472988070782427, + "grad_norm": 0.1565631479024887, + "learning_rate": 4.749491042242321e-05, + "loss": 0.4264, "step": 23510 }, { - "epoch": 0.83, - "learning_rate": 4.763379265855363e-05, - "loss": 0.3271, + "epoch": 0.8474790067394673, + "grad_norm": 0.15611840784549713, + "learning_rate": 4.7493637058314996e-05, + "loss": 0.4848, "step": 23515 }, { - "epoch": 0.83, - "learning_rate": 4.763258276633521e-05, - "loss": 0.3066, + "epoch": 0.8476592064006919, + "grad_norm": 0.17525018751621246, + "learning_rate": 4.749236338773535e-05, + "loss": 0.4368, "step": 23520 }, { - "epoch": 0.83, - "learning_rate": 4.7631372580247624e-05, - "loss": 0.2862, + "epoch": 0.8478394060619167, + "grad_norm": 0.18957500159740448, + "learning_rate": 4.749108941070159e-05, + "loss": 0.4456, "step": 23525 }, { - "epoch": 0.83, - "learning_rate": 4.7630162100306576e-05, - "loss": 0.3283, + "epoch": 0.8480196057231413, + "grad_norm": 0.1911652833223343, + "learning_rate": 4.74898151272311e-05, + "loss": 0.4294, "step": 23530 }, { - 
"epoch": 0.83, - "learning_rate": 4.7628951326527786e-05, - "loss": 0.3109, + "epoch": 0.8481998053843659, + "grad_norm": 0.17784404754638672, + "learning_rate": 4.748854053734122e-05, + "loss": 0.4079, "step": 23535 }, { - "epoch": 0.83, - "learning_rate": 4.762774025892698e-05, - "loss": 0.2908, + "epoch": 0.8483800050455905, + "grad_norm": 0.16650652885437012, + "learning_rate": 4.748726564104935e-05, + "loss": 0.4114, "step": 23540 }, { - "epoch": 0.83, - "learning_rate": 4.762652889751988e-05, - "loss": 0.3238, + "epoch": 0.8485602047068151, + "grad_norm": 0.15656359493732452, + "learning_rate": 4.748599043837282e-05, + "loss": 0.4926, "step": 23545 }, { - "epoch": 0.83, - "learning_rate": 4.762531724232221e-05, - "loss": 0.3091, + "epoch": 0.8487404043680398, + "grad_norm": 0.1699918955564499, + "learning_rate": 4.748471492932903e-05, + "loss": 0.4273, "step": 23550 }, { - "epoch": 0.83, - "learning_rate": 4.7624105293349706e-05, - "loss": 0.3094, + "epoch": 0.8489206040292644, + "grad_norm": 0.15680480003356934, + "learning_rate": 4.748343911393536e-05, + "loss": 0.3945, "step": 23555 }, { - "epoch": 0.83, - "learning_rate": 4.762289305061811e-05, - "loss": 0.316, + "epoch": 0.849100803690489, + "grad_norm": 0.14087727665901184, + "learning_rate": 4.748216299220918e-05, + "loss": 0.4801, "step": 23560 }, { - "epoch": 0.83, - "learning_rate": 4.762168051414315e-05, - "loss": 0.327, + "epoch": 0.8492810033517137, + "grad_norm": 0.1624569147825241, + "learning_rate": 4.7480886564167877e-05, + "loss": 0.3957, "step": 23565 }, { - "epoch": 0.83, - "learning_rate": 4.7620467683940574e-05, - "loss": 0.2885, + "epoch": 0.8494612030129384, + "grad_norm": 0.15580429136753082, + "learning_rate": 4.747960982982885e-05, + "loss": 0.425, "step": 23570 }, { - "epoch": 0.83, - "learning_rate": 4.761925456002614e-05, - "loss": 0.3026, + "epoch": 0.849641402674163, + "grad_norm": 0.15668196976184845, + "learning_rate": 4.747833278920949e-05, + "loss": 0.4421, "step": 23575 }, { - "epoch": 0.83, - "learning_rate": 4.7618041142415596e-05, - "loss": 0.3034, + "epoch": 0.8498216023353876, + "grad_norm": 0.14758959412574768, + "learning_rate": 4.74770554423272e-05, + "loss": 0.4473, "step": 23580 }, { - "epoch": 0.83, - "learning_rate": 4.7616827431124674e-05, - "loss": 0.2888, + "epoch": 0.8500018019966122, + "grad_norm": 0.15379658341407776, + "learning_rate": 4.747577778919938e-05, + "loss": 0.4246, "step": 23585 }, { - "epoch": 0.83, - "learning_rate": 4.761561342616917e-05, - "loss": 0.3184, + "epoch": 0.8501820016578369, + "grad_norm": 0.14333048462867737, + "learning_rate": 4.747449982984344e-05, + "loss": 0.4508, "step": 23590 }, { - "epoch": 0.83, - "learning_rate": 4.761439912756483e-05, - "loss": 0.314, + "epoch": 0.8503622013190615, + "grad_norm": 0.18294481933116913, + "learning_rate": 4.747322156427679e-05, + "loss": 0.4115, "step": 23595 }, { - "epoch": 0.83, - "learning_rate": 4.7613184535327414e-05, - "loss": 0.3091, + "epoch": 0.8505424009802861, + "grad_norm": 0.18898597359657288, + "learning_rate": 4.747194299251685e-05, + "loss": 0.4568, "step": 23600 }, { - "epoch": 0.83, - "learning_rate": 4.76119696494727e-05, - "loss": 0.2994, + "epoch": 0.8507226006415108, + "grad_norm": 0.1791786253452301, + "learning_rate": 4.7470664114581034e-05, + "loss": 0.4312, "step": 23605 }, { - "epoch": 0.83, - "learning_rate": 4.761075447001647e-05, - "loss": 0.3185, + "epoch": 0.8509028003027355, + "grad_norm": 0.2092021107673645, + "learning_rate": 4.746938493048677e-05, + "loss": 0.4392, "step": 
23610 }, { - "epoch": 0.83, - "learning_rate": 4.760953899697449e-05, - "loss": 0.3372, + "epoch": 0.8510829999639601, + "grad_norm": 0.17262564599514008, + "learning_rate": 4.7468105440251494e-05, + "loss": 0.4335, "step": 23615 }, { - "epoch": 0.83, - "learning_rate": 4.760832323036254e-05, - "loss": 0.3087, + "epoch": 0.8512631996251847, + "grad_norm": 0.1536947339773178, + "learning_rate": 4.746682564389262e-05, + "loss": 0.3972, "step": 23620 }, { - "epoch": 0.83, - "learning_rate": 4.7607107170196417e-05, - "loss": 0.2997, + "epoch": 0.8514433992864093, + "grad_norm": 0.16581547260284424, + "learning_rate": 4.7465545541427605e-05, + "loss": 0.4377, "step": 23625 }, { - "epoch": 0.83, - "learning_rate": 4.7605890816491904e-05, - "loss": 0.2975, + "epoch": 0.851623598947634, + "grad_norm": 0.1816360354423523, + "learning_rate": 4.746426513287387e-05, + "loss": 0.4616, "step": 23630 }, { - "epoch": 0.83, - "learning_rate": 4.760467416926481e-05, - "loss": 0.3262, + "epoch": 0.8518037986088586, + "grad_norm": 0.1770544946193695, + "learning_rate": 4.746298441824889e-05, + "loss": 0.4234, "step": 23635 }, { - "epoch": 0.83, - "learning_rate": 4.76034572285309e-05, - "loss": 0.3111, + "epoch": 0.8519839982700832, + "grad_norm": 0.15067480504512787, + "learning_rate": 4.746170339757009e-05, + "loss": 0.4579, "step": 23640 }, { - "epoch": 0.83, - "learning_rate": 4.7602239994306e-05, - "loss": 0.2784, + "epoch": 0.8521641979313079, + "grad_norm": 0.12504181265830994, + "learning_rate": 4.746042207085492e-05, + "loss": 0.4412, "step": 23645 }, { - "epoch": 0.83, - "learning_rate": 4.7601022466605914e-05, - "loss": 0.2921, + "epoch": 0.8523443975925326, + "grad_norm": 0.1552511602640152, + "learning_rate": 4.745914043812085e-05, + "loss": 0.4427, "step": 23650 }, { - "epoch": 0.83, - "learning_rate": 4.759980464544645e-05, - "loss": 0.2982, + "epoch": 0.8525245972537572, + "grad_norm": 0.21108126640319824, + "learning_rate": 4.745785849938535e-05, + "loss": 0.4025, "step": 23655 }, { - "epoch": 0.83, - "learning_rate": 4.7598586530843405e-05, - "loss": 0.294, + "epoch": 0.8527047969149818, + "grad_norm": 0.17185276746749878, + "learning_rate": 4.7456576254665866e-05, + "loss": 0.4341, "step": 23660 }, { - "epoch": 0.83, - "learning_rate": 4.7597368122812607e-05, - "loss": 0.3147, + "epoch": 0.8528849965762064, + "grad_norm": 0.21345165371894836, + "learning_rate": 4.7455293703979884e-05, + "loss": 0.4513, "step": 23665 }, { - "epoch": 0.83, - "learning_rate": 4.7596149421369886e-05, - "loss": 0.3126, + "epoch": 0.853065196237431, + "grad_norm": 0.16104549169540405, + "learning_rate": 4.745401084734487e-05, + "loss": 0.4178, "step": 23670 }, { - "epoch": 0.83, - "learning_rate": 4.7594930426531046e-05, - "loss": 0.3015, + "epoch": 0.8532453958986557, + "grad_norm": 0.1738523244857788, + "learning_rate": 4.7452727684778305e-05, + "loss": 0.4372, "step": 23675 }, { - "epoch": 0.83, - "learning_rate": 4.759371113831193e-05, - "loss": 0.3026, + "epoch": 0.8534255955598804, + "grad_norm": 0.19414255023002625, + "learning_rate": 4.7451444216297674e-05, + "loss": 0.4444, "step": 23680 }, { - "epoch": 0.83, - "learning_rate": 4.759249155672837e-05, - "loss": 0.3211, + "epoch": 0.853605795221105, + "grad_norm": 0.16047124564647675, + "learning_rate": 4.7450160441920466e-05, + "loss": 0.4439, "step": 23685 }, { - "epoch": 0.83, - "learning_rate": 4.759127168179619e-05, - "loss": 0.2899, + "epoch": 0.8537859948823296, + "grad_norm": 0.1373186558485031, + "learning_rate": 4.744887636166416e-05, + "loss": 
0.396, "step": 23690 }, { - "epoch": 0.83, - "learning_rate": 4.759005151353123e-05, - "loss": 0.3028, + "epoch": 0.8539661945435543, + "grad_norm": 0.15237957239151, + "learning_rate": 4.744759197554626e-05, + "loss": 0.4106, "step": 23695 }, { - "epoch": 0.83, - "learning_rate": 4.758883105194934e-05, - "loss": 0.3102, + "epoch": 0.8541463942047789, + "grad_norm": 0.18528686463832855, + "learning_rate": 4.744630728358427e-05, + "loss": 0.4458, "step": 23700 }, { - "epoch": 0.83, - "learning_rate": 4.758761029706637e-05, - "loss": 0.2816, + "epoch": 0.8543265938660035, + "grad_norm": 0.16544073820114136, + "learning_rate": 4.744502228579569e-05, + "loss": 0.4257, "step": 23705 }, { - "epoch": 0.83, - "learning_rate": 4.758638924889817e-05, - "loss": 0.2997, + "epoch": 0.8545067935272281, + "grad_norm": 0.16749577224254608, + "learning_rate": 4.7443736982198026e-05, + "loss": 0.4441, "step": 23710 }, { - "epoch": 0.83, - "learning_rate": 4.758516790746059e-05, - "loss": 0.3077, + "epoch": 0.8546869931884528, + "grad_norm": 0.19285382330417633, + "learning_rate": 4.7442451372808797e-05, + "loss": 0.4527, "step": 23715 }, { - "epoch": 0.83, - "learning_rate": 4.758394627276949e-05, - "loss": 0.3017, + "epoch": 0.8548671928496775, + "grad_norm": 0.17987944185733795, + "learning_rate": 4.74411654576455e-05, + "loss": 0.4205, "step": 23720 }, { - "epoch": 0.83, - "learning_rate": 4.7582724344840726e-05, - "loss": 0.2612, + "epoch": 0.8550473925109021, + "grad_norm": 0.14557349681854248, + "learning_rate": 4.7439879236725674e-05, + "loss": 0.4193, "step": 23725 }, { - "epoch": 0.83, - "learning_rate": 4.7581502123690165e-05, - "loss": 0.3053, + "epoch": 0.8552275921721267, + "grad_norm": 0.184067040681839, + "learning_rate": 4.743859271006684e-05, + "loss": 0.4426, "step": 23730 }, { - "epoch": 0.84, - "learning_rate": 4.758027960933368e-05, - "loss": 0.3, + "epoch": 0.8554077918333514, + "grad_norm": 0.14480531215667725, + "learning_rate": 4.743730587768652e-05, + "loss": 0.4561, "step": 23735 }, { - "epoch": 0.84, - "learning_rate": 4.757905680178715e-05, - "loss": 0.323, + "epoch": 0.855587991494576, + "grad_norm": 0.16141416132450104, + "learning_rate": 4.7436018739602255e-05, + "loss": 0.4452, "step": 23740 }, { - "epoch": 0.84, - "learning_rate": 4.757783370106646e-05, - "loss": 0.3073, + "epoch": 0.8557681911558006, + "grad_norm": 0.1394343227148056, + "learning_rate": 4.743473129583158e-05, + "loss": 0.4205, "step": 23745 }, { - "epoch": 0.84, - "learning_rate": 4.757661030718746e-05, - "loss": 0.3241, + "epoch": 0.8559483908170252, + "grad_norm": 0.18532565236091614, + "learning_rate": 4.743344354639203e-05, + "loss": 0.4093, "step": 23750 }, { - "epoch": 0.84, - "learning_rate": 4.757538662016606e-05, - "loss": 0.2923, + "epoch": 0.8561285904782499, + "grad_norm": 0.17660491168498993, + "learning_rate": 4.743215549130115e-05, + "loss": 0.4328, "step": 23755 }, { - "epoch": 0.84, - "learning_rate": 4.757416264001814e-05, - "loss": 0.3052, + "epoch": 0.8563087901394746, + "grad_norm": 0.20785756409168243, + "learning_rate": 4.743086713057651e-05, + "loss": 0.427, "step": 23760 }, { - "epoch": 0.84, - "learning_rate": 4.75729383667596e-05, - "loss": 0.2748, + "epoch": 0.8564889898006992, + "grad_norm": 0.19306261837482452, + "learning_rate": 4.7429578464235635e-05, + "loss": 0.4162, "step": 23765 }, { - "epoch": 0.84, - "learning_rate": 4.7571713800406324e-05, - "loss": 0.3155, + "epoch": 0.8566691894619238, + "grad_norm": 0.1779967099428177, + "learning_rate": 4.742828949229611e-05, + 
"loss": 0.4251, "step": 23770 }, { - "epoch": 0.84, - "learning_rate": 4.7570488940974236e-05, - "loss": 0.3109, + "epoch": 0.8568493891231485, + "grad_norm": 0.22586210072040558, + "learning_rate": 4.742700021477547e-05, + "loss": 0.4546, "step": 23775 }, { - "epoch": 0.84, - "learning_rate": 4.7569263788479204e-05, - "loss": 0.2879, + "epoch": 0.8570295887843731, + "grad_norm": 0.1746557056903839, + "learning_rate": 4.74257106316913e-05, + "loss": 0.3891, "step": 23780 }, { - "epoch": 0.84, - "learning_rate": 4.756803834293717e-05, - "loss": 0.3166, + "epoch": 0.8572097884455977, + "grad_norm": 0.1889764368534088, + "learning_rate": 4.742442074306116e-05, + "loss": 0.4304, "step": 23785 }, { - "epoch": 0.84, - "learning_rate": 4.756681260436402e-05, - "loss": 0.3123, + "epoch": 0.8573899881068223, + "grad_norm": 0.16799265146255493, + "learning_rate": 4.742313054890263e-05, + "loss": 0.3993, "step": 23790 }, { - "epoch": 0.84, - "learning_rate": 4.7565586572775686e-05, - "loss": 0.2884, + "epoch": 0.8575701877680469, + "grad_norm": 0.17759232223033905, + "learning_rate": 4.742184004923329e-05, + "loss": 0.4243, "step": 23795 }, { - "epoch": 0.84, - "learning_rate": 4.7564360248188086e-05, - "loss": 0.308, + "epoch": 0.8577503874292717, + "grad_norm": 0.16500405967235565, + "learning_rate": 4.742054924407072e-05, + "loss": 0.4649, "step": 23800 }, { - "epoch": 0.84, - "learning_rate": 4.756313363061713e-05, - "loss": 0.3314, + "epoch": 0.8579305870904963, + "grad_norm": 0.20370623469352722, + "learning_rate": 4.7419258133432504e-05, + "loss": 0.4699, "step": 23805 }, { - "epoch": 0.84, - "learning_rate": 4.756190672007876e-05, - "loss": 0.3089, + "epoch": 0.8581107867517209, + "grad_norm": 0.18750417232513428, + "learning_rate": 4.741796671733624e-05, + "loss": 0.4242, "step": 23810 }, { - "epoch": 0.84, - "learning_rate": 4.756067951658889e-05, - "loss": 0.3132, + "epoch": 0.8582909864129455, + "grad_norm": 0.1584675908088684, + "learning_rate": 4.741667499579952e-05, + "loss": 0.4649, "step": 23815 }, { - "epoch": 0.84, - "learning_rate": 4.755945202016348e-05, - "loss": 0.2933, + "epoch": 0.8584711860741702, + "grad_norm": 0.2016519159078598, + "learning_rate": 4.741538296883994e-05, + "loss": 0.4401, "step": 23820 }, { - "epoch": 0.84, - "learning_rate": 4.755822423081844e-05, - "loss": 0.2671, + "epoch": 0.8586513857353948, + "grad_norm": 0.14827871322631836, + "learning_rate": 4.741409063647511e-05, + "loss": 0.4315, "step": 23825 }, { - "epoch": 0.84, - "learning_rate": 4.755699614856973e-05, - "loss": 0.3004, + "epoch": 0.8588315853966194, + "grad_norm": 0.22020429372787476, + "learning_rate": 4.741279799872263e-05, + "loss": 0.4144, "step": 23830 }, { - "epoch": 0.84, - "learning_rate": 4.755576777343329e-05, - "loss": 0.2954, + "epoch": 0.8590117850578441, + "grad_norm": 0.15583039820194244, + "learning_rate": 4.7411505055600115e-05, + "loss": 0.4558, "step": 23835 }, { - "epoch": 0.84, - "learning_rate": 4.7554539105425075e-05, - "loss": 0.3193, + "epoch": 0.8591919847190688, + "grad_norm": 0.17358680069446564, + "learning_rate": 4.741021180712519e-05, + "loss": 0.441, "step": 23840 }, { - "epoch": 0.84, - "learning_rate": 4.7553310144561024e-05, - "loss": 0.3082, + "epoch": 0.8593721843802934, + "grad_norm": 0.2088773101568222, + "learning_rate": 4.7408918253315464e-05, + "loss": 0.4209, "step": 23845 }, { - "epoch": 0.84, - "learning_rate": 4.755208089085711e-05, - "loss": 0.2875, + "epoch": 0.859552384041518, + "grad_norm": 0.1951671689748764, + "learning_rate": 
4.740762439418856e-05, + "loss": 0.4292, "step": 23850 }, { - "epoch": 0.84, - "learning_rate": 4.7550851344329286e-05, - "loss": 0.3087, + "epoch": 0.8597325837027426, + "grad_norm": 0.14672303199768066, + "learning_rate": 4.740633022976213e-05, + "loss": 0.4192, "step": 23855 }, { - "epoch": 0.84, - "learning_rate": 4.754962150499352e-05, - "loss": 0.3027, + "epoch": 0.8599127833639673, + "grad_norm": 0.21486572921276093, + "learning_rate": 4.740503576005377e-05, + "loss": 0.4734, "step": 23860 }, { - "epoch": 0.84, - "learning_rate": 4.7548391372865776e-05, - "loss": 0.3188, + "epoch": 0.8600929830251919, + "grad_norm": 0.15998758375644684, + "learning_rate": 4.740374098508115e-05, + "loss": 0.4318, "step": 23865 }, { - "epoch": 0.84, - "learning_rate": 4.7547160947962034e-05, - "loss": 0.3187, + "epoch": 0.8602731826864165, + "grad_norm": 0.17849770188331604, + "learning_rate": 4.740244590486188e-05, + "loss": 0.4296, "step": 23870 }, { - "epoch": 0.84, - "learning_rate": 4.754593023029827e-05, - "loss": 0.3014, + "epoch": 0.8604533823476412, + "grad_norm": 0.18155914545059204, + "learning_rate": 4.740115051941363e-05, + "loss": 0.4598, "step": 23875 }, { - "epoch": 0.84, - "learning_rate": 4.7544699219890454e-05, - "loss": 0.2919, + "epoch": 0.8606335820088659, + "grad_norm": 0.15962116420269012, + "learning_rate": 4.7399854828754045e-05, + "loss": 0.417, "step": 23880 }, { - "epoch": 0.84, - "learning_rate": 4.754346791675458e-05, - "loss": 0.3071, + "epoch": 0.8608137816700905, + "grad_norm": 0.15650032460689545, + "learning_rate": 4.7398558832900774e-05, + "loss": 0.4187, "step": 23885 }, { - "epoch": 0.84, - "learning_rate": 4.754223632090663e-05, - "loss": 0.3336, + "epoch": 0.8609939813313151, + "grad_norm": 0.1548149287700653, + "learning_rate": 4.7397262531871466e-05, + "loss": 0.4128, "step": 23890 }, { - "epoch": 0.84, - "learning_rate": 4.7541004432362596e-05, - "loss": 0.3384, + "epoch": 0.8611741809925397, + "grad_norm": 0.1666049212217331, + "learning_rate": 4.739596592568381e-05, + "loss": 0.4565, "step": 23895 }, { - "epoch": 0.84, - "learning_rate": 4.753977225113847e-05, - "loss": 0.305, + "epoch": 0.8613543806537644, + "grad_norm": 0.14310970902442932, + "learning_rate": 4.7394669014355444e-05, + "loss": 0.4583, "step": 23900 }, { - "epoch": 0.84, - "learning_rate": 4.7538539777250266e-05, - "loss": 0.3004, + "epoch": 0.861534580314989, + "grad_norm": 0.15406273305416107, + "learning_rate": 4.739337179790404e-05, + "loss": 0.4391, "step": 23905 }, { - "epoch": 0.84, - "learning_rate": 4.753730701071397e-05, - "loss": 0.3281, + "epoch": 0.8617147799762136, + "grad_norm": 0.15325191617012024, + "learning_rate": 4.739207427634729e-05, + "loss": 0.4413, "step": 23910 }, { - "epoch": 0.84, - "learning_rate": 4.75360739515456e-05, - "loss": 0.2932, + "epoch": 0.8618949796374383, + "grad_norm": 0.14947649836540222, + "learning_rate": 4.7390776449702864e-05, + "loss": 0.439, "step": 23915 }, { - "epoch": 0.84, - "learning_rate": 4.7534840599761154e-05, - "loss": 0.3116, + "epoch": 0.862075179298663, + "grad_norm": 0.16361840069293976, + "learning_rate": 4.738947831798844e-05, + "loss": 0.4578, "step": 23920 }, { - "epoch": 0.84, - "learning_rate": 4.7533606955376665e-05, - "loss": 0.3204, + "epoch": 0.8622553789598876, + "grad_norm": 0.20598135888576508, + "learning_rate": 4.738817988122171e-05, + "loss": 0.4678, "step": 23925 }, { - "epoch": 0.84, - "learning_rate": 4.7532373018408125e-05, - "loss": 0.3119, + "epoch": 0.8624355786211122, + "grad_norm": 0.17498494684696198, 
+ "learning_rate": 4.738688113942036e-05, + "loss": 0.4866, "step": 23930 }, { - "epoch": 0.84, - "learning_rate": 4.753113878887158e-05, - "loss": 0.2632, + "epoch": 0.8626157782823368, + "grad_norm": 0.1920240968465805, + "learning_rate": 4.73855820926021e-05, + "loss": 0.4274, "step": 23935 }, { - "epoch": 0.84, - "learning_rate": 4.752990426678304e-05, - "loss": 0.3219, + "epoch": 0.8627959779435614, + "grad_norm": 0.14575999975204468, + "learning_rate": 4.738428274078461e-05, + "loss": 0.4185, "step": 23940 }, { - "epoch": 0.84, - "learning_rate": 4.7528669452158554e-05, - "loss": 0.3126, + "epoch": 0.8629761776047861, + "grad_norm": 0.14960631728172302, + "learning_rate": 4.73829830839856e-05, + "loss": 0.4148, "step": 23945 }, { - "epoch": 0.84, - "learning_rate": 4.752743434501413e-05, - "loss": 0.3073, + "epoch": 0.8631563772660107, + "grad_norm": 0.15745064616203308, + "learning_rate": 4.738168312222278e-05, + "loss": 0.4097, "step": 23950 }, { - "epoch": 0.84, - "learning_rate": 4.752619894536582e-05, - "loss": 0.2941, + "epoch": 0.8633365769272354, + "grad_norm": 0.1760377436876297, + "learning_rate": 4.738038285551386e-05, + "loss": 0.4678, "step": 23955 }, { - "epoch": 0.84, - "learning_rate": 4.752496325322967e-05, - "loss": 0.3093, + "epoch": 0.86351677658846, + "grad_norm": 0.15912814438343048, + "learning_rate": 4.7379082283876566e-05, + "loss": 0.4585, "step": 23960 }, { - "epoch": 0.84, - "learning_rate": 4.752372726862171e-05, - "loss": 0.3013, + "epoch": 0.8636969762496847, + "grad_norm": 0.13206642866134644, + "learning_rate": 4.737778140732859e-05, + "loss": 0.4157, "step": 23965 }, { - "epoch": 0.84, - "learning_rate": 4.752249099155799e-05, - "loss": 0.2963, + "epoch": 0.8638771759109093, + "grad_norm": 0.21865518391132355, + "learning_rate": 4.737648022588769e-05, + "loss": 0.4373, "step": 23970 }, { - "epoch": 0.84, - "learning_rate": 4.752125442205457e-05, - "loss": 0.3125, + "epoch": 0.8640573755721339, + "grad_norm": 0.15692706406116486, + "learning_rate": 4.737517873957158e-05, + "loss": 0.3932, "step": 23975 }, { - "epoch": 0.84, - "learning_rate": 4.752001756012751e-05, - "loss": 0.3002, + "epoch": 0.8642375752333585, + "grad_norm": 0.19991809129714966, + "learning_rate": 4.737387694839798e-05, + "loss": 0.4778, "step": 23980 }, { - "epoch": 0.84, - "learning_rate": 4.751878040579286e-05, - "loss": 0.3147, + "epoch": 0.8644177748945832, + "grad_norm": 0.1880597621202469, + "learning_rate": 4.737257485238465e-05, + "loss": 0.3943, "step": 23985 }, { - "epoch": 0.84, - "learning_rate": 4.7517542959066684e-05, - "loss": 0.3145, + "epoch": 0.8645979745558078, + "grad_norm": 0.21935416758060455, + "learning_rate": 4.737127245154931e-05, + "loss": 0.4207, "step": 23990 }, { - "epoch": 0.84, - "learning_rate": 4.751630521996505e-05, - "loss": 0.2935, + "epoch": 0.8647781742170325, + "grad_norm": 0.16751424968242645, + "learning_rate": 4.736996974590972e-05, + "loss": 0.4191, "step": 23995 }, { - "epoch": 0.84, - "learning_rate": 4.751506718850404e-05, - "loss": 0.3306, + "epoch": 0.8649583738782571, + "grad_norm": 0.18504785001277924, + "learning_rate": 4.736866673548362e-05, + "loss": 0.4088, "step": 24000 }, { - "epoch": 0.84, - "eval_loss": 0.2971259653568268, - "eval_runtime": 10.5298, - "eval_samples_per_second": 9.497, - "eval_steps_per_second": 9.497, + "epoch": 0.8649583738782571, + "eval_loss": 0.45828837156295776, + "eval_runtime": 3.5369, + "eval_samples_per_second": 28.273, + "eval_steps_per_second": 7.068, "step": 24000 }, { - "epoch": 0.84, - 
"learning_rate": 4.751382886469972e-05, - "loss": 0.3354, + "epoch": 0.8651385735394818, + "grad_norm": 0.18134254217147827, + "learning_rate": 4.7367363420288765e-05, + "loss": 0.4821, "step": 24005 }, { - "epoch": 0.84, - "learning_rate": 4.7512590248568163e-05, - "loss": 0.2793, + "epoch": 0.8653187732007064, + "grad_norm": 0.15396596491336823, + "learning_rate": 4.736605980034292e-05, + "loss": 0.433, "step": 24010 }, { - "epoch": 0.84, - "learning_rate": 4.7511351340125456e-05, - "loss": 0.3002, + "epoch": 0.865498972861931, + "grad_norm": 0.1897013932466507, + "learning_rate": 4.7364755875663834e-05, + "loss": 0.4416, "step": 24015 }, { - "epoch": 0.85, - "learning_rate": 4.75101121393877e-05, - "loss": 0.2878, + "epoch": 0.8656791725231556, + "grad_norm": 0.15314961969852448, + "learning_rate": 4.736345164626929e-05, + "loss": 0.4014, "step": 24020 }, { - "epoch": 0.85, - "learning_rate": 4.750887264637096e-05, - "loss": 0.2873, + "epoch": 0.8658593721843802, + "grad_norm": 0.17612630128860474, + "learning_rate": 4.736214711217703e-05, + "loss": 0.4435, "step": 24025 }, { - "epoch": 0.85, - "learning_rate": 4.750763286109134e-05, - "loss": 0.3029, + "epoch": 0.866039571845605, + "grad_norm": 0.15653395652770996, + "learning_rate": 4.736084227340486e-05, + "loss": 0.4473, "step": 24030 }, { - "epoch": 0.85, - "learning_rate": 4.750639278356496e-05, - "loss": 0.317, + "epoch": 0.8662197715068296, + "grad_norm": 0.14282876253128052, + "learning_rate": 4.735953712997053e-05, + "loss": 0.4484, "step": 24035 }, { - "epoch": 0.85, - "learning_rate": 4.7505152413807886e-05, - "loss": 0.2921, + "epoch": 0.8663999711680542, + "grad_norm": 0.184425950050354, + "learning_rate": 4.7358231681891855e-05, + "loss": 0.4453, "step": 24040 }, { - "epoch": 0.85, - "learning_rate": 4.750391175183624e-05, - "loss": 0.3101, + "epoch": 0.8665801708292789, + "grad_norm": 0.1802368015050888, + "learning_rate": 4.735692592918658e-05, + "loss": 0.4524, "step": 24045 }, { - "epoch": 0.85, - "learning_rate": 4.750267079766614e-05, - "loss": 0.315, + "epoch": 0.8667603704905035, + "grad_norm": 0.15805204212665558, + "learning_rate": 4.735561987187253e-05, + "loss": 0.4139, "step": 24050 }, { - "epoch": 0.85, - "learning_rate": 4.750142955131368e-05, - "loss": 0.3072, + "epoch": 0.8669405701517281, + "grad_norm": 0.16449551284313202, + "learning_rate": 4.7354313509967486e-05, + "loss": 0.4236, "step": 24055 }, { - "epoch": 0.85, - "learning_rate": 4.7500188012794985e-05, - "loss": 0.3223, + "epoch": 0.8671207698129527, + "grad_norm": 0.1771911382675171, + "learning_rate": 4.735300684348925e-05, + "loss": 0.4665, "step": 24060 }, { - "epoch": 0.85, - "learning_rate": 4.749894618212618e-05, - "loss": 0.2978, + "epoch": 0.8673009694741773, + "grad_norm": 0.1740954965353012, + "learning_rate": 4.735169987245561e-05, + "loss": 0.4394, "step": 24065 }, { - "epoch": 0.85, - "learning_rate": 4.749770405932339e-05, - "loss": 0.2927, + "epoch": 0.8674811691354021, + "grad_norm": 0.18734583258628845, + "learning_rate": 4.735039259688441e-05, + "loss": 0.4227, "step": 24070 }, { - "epoch": 0.85, - "learning_rate": 4.749646164440273e-05, - "loss": 0.2951, + "epoch": 0.8676613687966267, + "grad_norm": 0.1404682993888855, + "learning_rate": 4.734908501679342e-05, + "loss": 0.4358, "step": 24075 }, { - "epoch": 0.85, - "learning_rate": 4.7495218937380356e-05, - "loss": 0.3082, + "epoch": 0.8678415684578513, + "grad_norm": 0.16821545362472534, + "learning_rate": 4.7347777132200475e-05, + "loss": 0.4389, "step": 24080 }, { - 
"epoch": 0.85, - "learning_rate": 4.7493975938272376e-05, - "loss": 0.3136, + "epoch": 0.8680217681190759, + "grad_norm": 0.17715495824813843, + "learning_rate": 4.73464689431234e-05, + "loss": 0.4148, "step": 24085 }, { - "epoch": 0.85, - "learning_rate": 4.749273264709495e-05, - "loss": 0.2949, + "epoch": 0.8682019677803006, + "grad_norm": 0.16599716246128082, + "learning_rate": 4.734516044958001e-05, + "loss": 0.4243, "step": 24090 }, { - "epoch": 0.85, - "learning_rate": 4.749148906386421e-05, - "loss": 0.2751, + "epoch": 0.8683821674415252, + "grad_norm": 0.15897536277770996, + "learning_rate": 4.7343851651588137e-05, + "loss": 0.431, "step": 24095 }, { - "epoch": 0.85, - "learning_rate": 4.749024518859631e-05, - "loss": 0.3013, + "epoch": 0.8685623671027498, + "grad_norm": 0.14617212116718292, + "learning_rate": 4.73425425491656e-05, + "loss": 0.4395, "step": 24100 }, { - "epoch": 0.85, - "learning_rate": 4.74890010213074e-05, - "loss": 0.3143, + "epoch": 0.8687425667639744, + "grad_norm": 0.1834208071231842, + "learning_rate": 4.734123314233026e-05, + "loss": 0.4274, "step": 24105 }, { - "epoch": 0.85, - "learning_rate": 4.748775656201363e-05, - "loss": 0.3332, + "epoch": 0.8689227664251992, + "grad_norm": 0.22381344437599182, + "learning_rate": 4.733992343109994e-05, + "loss": 0.4279, "step": 24110 }, { - "epoch": 0.85, - "learning_rate": 4.748651181073115e-05, - "loss": 0.2902, + "epoch": 0.8691029660864238, + "grad_norm": 0.14666973054409027, + "learning_rate": 4.7338613415492486e-05, + "loss": 0.4255, "step": 24115 }, { - "epoch": 0.85, - "learning_rate": 4.748526676747614e-05, - "loss": 0.2741, + "epoch": 0.8692831657476484, + "grad_norm": 0.17257143557071686, + "learning_rate": 4.733730309552575e-05, + "loss": 0.4548, "step": 24120 }, { - "epoch": 0.85, - "learning_rate": 4.748402143226477e-05, - "loss": 0.3071, + "epoch": 0.869463365408873, + "grad_norm": 0.17522400617599487, + "learning_rate": 4.733599247121758e-05, + "loss": 0.4354, "step": 24125 }, { - "epoch": 0.85, - "learning_rate": 4.748277580511319e-05, - "loss": 0.3265, + "epoch": 0.8696435650700977, + "grad_norm": 0.19729764759540558, + "learning_rate": 4.733468154258585e-05, + "loss": 0.46, "step": 24130 }, { - "epoch": 0.85, - "learning_rate": 4.748152988603758e-05, - "loss": 0.3057, + "epoch": 0.8698237647313223, + "grad_norm": 0.15954478085041046, + "learning_rate": 4.733337030964839e-05, + "loss": 0.4451, "step": 24135 }, { - "epoch": 0.85, - "learning_rate": 4.748028367505413e-05, - "loss": 0.3049, + "epoch": 0.8700039643925469, + "grad_norm": 0.16600678861141205, + "learning_rate": 4.73320587724231e-05, + "loss": 0.4397, "step": 24140 }, { - "epoch": 0.85, - "learning_rate": 4.747903717217901e-05, - "loss": 0.3015, + "epoch": 0.8701841640537715, + "grad_norm": 0.16135752201080322, + "learning_rate": 4.733074693092783e-05, + "loss": 0.4045, "step": 24145 }, { - "epoch": 0.85, - "learning_rate": 4.74777903774284e-05, - "loss": 0.3069, + "epoch": 0.8703643637149963, + "grad_norm": 0.16556213796138763, + "learning_rate": 4.732943478518045e-05, + "loss": 0.4408, "step": 24150 }, { - "epoch": 0.85, - "learning_rate": 4.74765432908185e-05, - "loss": 0.3236, + "epoch": 0.8705445633762209, + "grad_norm": 0.15862923860549927, + "learning_rate": 4.732812233519884e-05, + "loss": 0.4239, "step": 24155 }, { - "epoch": 0.85, - "learning_rate": 4.747529591236549e-05, - "loss": 0.3162, + "epoch": 0.8707247630374455, + "grad_norm": 0.19938628375530243, + "learning_rate": 4.73268095810009e-05, + "loss": 0.4735, "step": 24160 }, 
{ - "epoch": 0.85, - "learning_rate": 4.7474048242085585e-05, - "loss": 0.2927, + "epoch": 0.8709049626986701, + "grad_norm": 0.19490042328834534, + "learning_rate": 4.732549652260449e-05, + "loss": 0.4184, "step": 24165 }, { - "epoch": 0.85, - "learning_rate": 4.7472800279994964e-05, - "loss": 0.3169, + "epoch": 0.8710851623598947, + "grad_norm": 0.19054916501045227, + "learning_rate": 4.732418316002751e-05, + "loss": 0.4391, "step": 24170 }, { - "epoch": 0.85, - "learning_rate": 4.7471552026109855e-05, - "loss": 0.3284, + "epoch": 0.8712653620211194, + "grad_norm": 0.15895889699459076, + "learning_rate": 4.732286949328787e-05, + "loss": 0.448, "step": 24175 }, { - "epoch": 0.85, - "learning_rate": 4.7470303480446446e-05, - "loss": 0.3145, + "epoch": 0.871445561682344, + "grad_norm": 0.16004562377929688, + "learning_rate": 4.732155552240345e-05, + "loss": 0.3978, "step": 24180 }, { - "epoch": 0.85, - "learning_rate": 4.746905464302095e-05, - "loss": 0.2979, + "epoch": 0.8716257613435687, + "grad_norm": 0.2268204391002655, + "learning_rate": 4.732024124739215e-05, + "loss": 0.4128, "step": 24185 }, { - "epoch": 0.85, - "learning_rate": 4.74678055138496e-05, - "loss": 0.308, + "epoch": 0.8718059610047934, + "grad_norm": 0.18399815261363983, + "learning_rate": 4.731892666827189e-05, + "loss": 0.4377, "step": 24190 }, { - "epoch": 0.85, - "learning_rate": 4.746655609294858e-05, - "loss": 0.3221, + "epoch": 0.871986160666018, + "grad_norm": 0.13044115900993347, + "learning_rate": 4.731761178506058e-05, + "loss": 0.4663, "step": 24195 }, { - "epoch": 0.85, - "learning_rate": 4.746530638033415e-05, - "loss": 0.3295, + "epoch": 0.8721663603272426, + "grad_norm": 0.1856122612953186, + "learning_rate": 4.7316296597776123e-05, + "loss": 0.4289, "step": 24200 }, { - "epoch": 0.85, - "learning_rate": 4.746405637602252e-05, - "loss": 0.3207, + "epoch": 0.8723465599884672, + "grad_norm": 0.17390163242816925, + "learning_rate": 4.731498110643645e-05, + "loss": 0.4633, "step": 24205 }, { - "epoch": 0.85, - "learning_rate": 4.746280608002992e-05, - "loss": 0.3052, + "epoch": 0.8725267596496918, + "grad_norm": 0.15870732069015503, + "learning_rate": 4.731366531105947e-05, + "loss": 0.436, "step": 24210 }, { - "epoch": 0.85, - "learning_rate": 4.746155549237259e-05, - "loss": 0.2973, + "epoch": 0.8727069593109165, + "grad_norm": 0.18578395247459412, + "learning_rate": 4.731234921166313e-05, + "loss": 0.4246, "step": 24215 }, { - "epoch": 0.85, - "learning_rate": 4.746030461306676e-05, - "loss": 0.2646, + "epoch": 0.8728871589721411, + "grad_norm": 0.19250373542308807, + "learning_rate": 4.7311032808265356e-05, + "loss": 0.4511, "step": 24220 }, { - "epoch": 0.85, - "learning_rate": 4.745905344212869e-05, - "loss": 0.3076, + "epoch": 0.8730673586333658, + "grad_norm": 0.18087449669837952, + "learning_rate": 4.730971610088407e-05, + "loss": 0.4365, "step": 24225 }, { - "epoch": 0.85, - "learning_rate": 4.7457801979574604e-05, - "loss": 0.2814, + "epoch": 0.8732475582945904, + "grad_norm": 0.15141281485557556, + "learning_rate": 4.7308399089537224e-05, + "loss": 0.4196, "step": 24230 }, { - "epoch": 0.85, - "learning_rate": 4.745655022542075e-05, - "loss": 0.3061, + "epoch": 0.8734277579558151, + "grad_norm": 0.16130012273788452, + "learning_rate": 4.730708177424276e-05, + "loss": 0.4339, "step": 24235 }, { - "epoch": 0.85, - "learning_rate": 4.74552981796834e-05, - "loss": 0.3201, + "epoch": 0.8736079576170397, + "grad_norm": 0.21587517857551575, + "learning_rate": 4.730576415501863e-05, + "loss": 0.4417, 
"step": 24240 }, { - "epoch": 0.85, - "learning_rate": 4.7454045842378806e-05, - "loss": 0.2877, + "epoch": 0.8737881572782643, + "grad_norm": 0.1740473508834839, + "learning_rate": 4.730444623188278e-05, + "loss": 0.4073, "step": 24245 }, { - "epoch": 0.85, - "learning_rate": 4.745279321352321e-05, - "loss": 0.2999, + "epoch": 0.8739683569394889, + "grad_norm": 0.1535811573266983, + "learning_rate": 4.730312800485316e-05, + "loss": 0.4328, "step": 24250 }, { - "epoch": 0.85, - "learning_rate": 4.74515402931329e-05, - "loss": 0.292, + "epoch": 0.8741485566007136, + "grad_norm": 0.20786762237548828, + "learning_rate": 4.7301809473947744e-05, + "loss": 0.4224, "step": 24255 }, { - "epoch": 0.85, - "learning_rate": 4.745028708122414e-05, - "loss": 0.2876, + "epoch": 0.8743287562619382, + "grad_norm": 0.19472654163837433, + "learning_rate": 4.730049063918449e-05, + "loss": 0.4272, "step": 24260 }, { - "epoch": 0.85, - "learning_rate": 4.744903357781319e-05, - "loss": 0.3125, + "epoch": 0.8745089559231629, + "grad_norm": 0.2173738032579422, + "learning_rate": 4.729917150058137e-05, + "loss": 0.4626, "step": 24265 }, { - "epoch": 0.85, - "learning_rate": 4.7447779782916333e-05, - "loss": 0.3157, + "epoch": 0.8746891555843875, + "grad_norm": 0.2047669142484665, + "learning_rate": 4.729785205815637e-05, + "loss": 0.4195, "step": 24270 }, { - "epoch": 0.85, - "learning_rate": 4.7446525696549855e-05, - "loss": 0.3172, + "epoch": 0.8748693552456122, + "grad_norm": 0.19127419590950012, + "learning_rate": 4.7296532311927436e-05, + "loss": 0.4646, "step": 24275 }, { - "epoch": 0.85, - "learning_rate": 4.744527131873003e-05, - "loss": 0.2945, + "epoch": 0.8750495549068368, + "grad_norm": 0.22150792181491852, + "learning_rate": 4.729521226191257e-05, + "loss": 0.426, "step": 24280 }, { - "epoch": 0.85, - "learning_rate": 4.744401664947315e-05, - "loss": 0.2963, + "epoch": 0.8752297545680614, + "grad_norm": 0.2369442582130432, + "learning_rate": 4.729389190812975e-05, + "loss": 0.4241, "step": 24285 }, { - "epoch": 0.85, - "learning_rate": 4.744276168879551e-05, - "loss": 0.2887, + "epoch": 0.875409954229286, + "grad_norm": 0.17270110547542572, + "learning_rate": 4.729257125059697e-05, + "loss": 0.4575, "step": 24290 }, { - "epoch": 0.85, - "learning_rate": 4.744150643671339e-05, - "loss": 0.2818, + "epoch": 0.8755901538905106, + "grad_norm": 0.16564151644706726, + "learning_rate": 4.729125028933222e-05, + "loss": 0.4041, "step": 24295 }, { - "epoch": 0.85, - "learning_rate": 4.7440250893243104e-05, - "loss": 0.3011, + "epoch": 0.8757703535517353, + "grad_norm": 0.1850225180387497, + "learning_rate": 4.728992902435351e-05, + "loss": 0.4068, "step": 24300 }, { - "epoch": 0.86, - "learning_rate": 4.743899505840095e-05, - "loss": 0.3359, + "epoch": 0.87595055321296, + "grad_norm": 0.17433954775333405, + "learning_rate": 4.728860745567883e-05, + "loss": 0.4035, "step": 24305 }, { - "epoch": 0.86, - "learning_rate": 4.743773893220323e-05, - "loss": 0.2925, + "epoch": 0.8761307528741846, + "grad_norm": 0.1604575216770172, + "learning_rate": 4.728728558332618e-05, + "loss": 0.4102, "step": 24310 }, { - "epoch": 0.86, - "learning_rate": 4.743648251466626e-05, - "loss": 0.2962, + "epoch": 0.8763109525354092, + "grad_norm": 0.18941442668437958, + "learning_rate": 4.7285963407313594e-05, + "loss": 0.4188, "step": 24315 }, { - "epoch": 0.86, - "learning_rate": 4.743522580580634e-05, - "loss": 0.3224, + "epoch": 0.8764911521966339, + "grad_norm": 0.15453235805034637, + "learning_rate": 4.728464092765906e-05, + "loss": 
0.4138, "step": 24320 }, { - "epoch": 0.86, - "learning_rate": 4.743396880563981e-05, - "loss": 0.3107, + "epoch": 0.8766713518578585, + "grad_norm": 0.16290608048439026, + "learning_rate": 4.7283318144380606e-05, + "loss": 0.4676, "step": 24325 }, { - "epoch": 0.86, - "learning_rate": 4.7432711514182976e-05, - "loss": 0.3132, + "epoch": 0.8768515515190831, + "grad_norm": 0.15486080944538116, + "learning_rate": 4.728199505749626e-05, + "loss": 0.4694, "step": 24330 }, { - "epoch": 0.86, - "learning_rate": 4.743145393145217e-05, - "loss": 0.3075, + "epoch": 0.8770317511803077, + "grad_norm": 0.14686860144138336, + "learning_rate": 4.728067166702404e-05, + "loss": 0.4268, "step": 24335 }, { - "epoch": 0.86, - "learning_rate": 4.743019605746372e-05, - "loss": 0.2577, + "epoch": 0.8772119508415325, + "grad_norm": 0.18616507947444916, + "learning_rate": 4.7279347972982e-05, + "loss": 0.4561, "step": 24340 }, { - "epoch": 0.86, - "learning_rate": 4.742893789223394e-05, - "loss": 0.3243, + "epoch": 0.8773921505027571, + "grad_norm": 0.17040805518627167, + "learning_rate": 4.727802397538814e-05, + "loss": 0.4399, "step": 24345 }, { - "epoch": 0.86, - "learning_rate": 4.7427679435779196e-05, - "loss": 0.2945, + "epoch": 0.8775723501639817, + "grad_norm": 0.18918024003505707, + "learning_rate": 4.7276699674260525e-05, + "loss": 0.4592, "step": 24350 }, { - "epoch": 0.86, - "learning_rate": 4.7426420688115816e-05, - "loss": 0.3102, + "epoch": 0.8777525498252063, + "grad_norm": 0.1710633933544159, + "learning_rate": 4.727537506961719e-05, + "loss": 0.4546, "step": 24355 }, { - "epoch": 0.86, - "learning_rate": 4.742516164926014e-05, - "loss": 0.3217, + "epoch": 0.877932749486431, + "grad_norm": 0.16070812940597534, + "learning_rate": 4.727405016147618e-05, + "loss": 0.443, "step": 24360 }, { - "epoch": 0.86, - "learning_rate": 4.742390231922852e-05, - "loss": 0.2823, + "epoch": 0.8781129491476556, + "grad_norm": 0.20280031859874725, + "learning_rate": 4.727272494985554e-05, + "loss": 0.4375, "step": 24365 }, { - "epoch": 0.86, - "learning_rate": 4.74226426980373e-05, - "loss": 0.3064, + "epoch": 0.8782931488088802, + "grad_norm": 0.1605139523744583, + "learning_rate": 4.7271399434773345e-05, + "loss": 0.44, "step": 24370 }, { - "epoch": 0.86, - "learning_rate": 4.742138278570285e-05, - "loss": 0.3351, + "epoch": 0.8784733484701048, + "grad_norm": 0.19736407697200775, + "learning_rate": 4.7270073616247646e-05, + "loss": 0.4579, "step": 24375 }, { - "epoch": 0.86, - "learning_rate": 4.7420122582241514e-05, - "loss": 0.341, + "epoch": 0.8786535481313296, + "grad_norm": 0.19877927005290985, + "learning_rate": 4.72687474942965e-05, + "loss": 0.4602, "step": 24380 }, { - "epoch": 0.86, - "learning_rate": 4.741886208766966e-05, - "loss": 0.3188, + "epoch": 0.8788337477925542, + "grad_norm": 0.16131383180618286, + "learning_rate": 4.7267421068937984e-05, + "loss": 0.4178, "step": 24385 }, { - "epoch": 0.86, - "learning_rate": 4.741760130200367e-05, - "loss": 0.3149, + "epoch": 0.8790139474537788, + "grad_norm": 0.19206862151622772, + "learning_rate": 4.7266094340190165e-05, + "loss": 0.4558, "step": 24390 }, { - "epoch": 0.86, - "learning_rate": 4.7416340225259886e-05, - "loss": 0.3037, + "epoch": 0.8791941471150034, + "grad_norm": 0.20475110411643982, + "learning_rate": 4.7264767308071126e-05, + "loss": 0.417, "step": 24395 }, { - "epoch": 0.86, - "learning_rate": 4.7415078857454704e-05, - "loss": 0.3078, + "epoch": 0.879374346776228, + "grad_norm": 0.21035943925380707, + "learning_rate": 
4.726343997259893e-05, + "loss": 0.4433, "step": 24400 }, { - "epoch": 0.86, - "learning_rate": 4.7413817198604496e-05, - "loss": 0.287, + "epoch": 0.8795545464374527, + "grad_norm": 0.18233457207679749, + "learning_rate": 4.7262112333791685e-05, + "loss": 0.3937, "step": 24405 }, { - "epoch": 0.86, - "learning_rate": 4.7412555248725646e-05, - "loss": 0.295, + "epoch": 0.8797347460986773, + "grad_norm": 0.18523730337619781, + "learning_rate": 4.7260784391667475e-05, + "loss": 0.4456, "step": 24410 }, { - "epoch": 0.86, - "learning_rate": 4.7411293007834525e-05, - "loss": 0.3377, + "epoch": 0.8799149457599019, + "grad_norm": 0.19751974940299988, + "learning_rate": 4.725945614624438e-05, + "loss": 0.4533, "step": 24415 }, { - "epoch": 0.86, - "learning_rate": 4.741003047594754e-05, - "loss": 0.2956, + "epoch": 0.8800951454211267, + "grad_norm": 0.15248170495033264, + "learning_rate": 4.7258127597540505e-05, + "loss": 0.4397, "step": 24420 }, { - "epoch": 0.86, - "learning_rate": 4.740876765308107e-05, - "loss": 0.3259, + "epoch": 0.8802753450823513, + "grad_norm": 0.1679372489452362, + "learning_rate": 4.725679874557395e-05, + "loss": 0.4419, "step": 24425 }, { - "epoch": 0.86, - "learning_rate": 4.740750453925153e-05, - "loss": 0.2787, + "epoch": 0.8804555447435759, + "grad_norm": 0.256967157125473, + "learning_rate": 4.7255469590362825e-05, + "loss": 0.432, "step": 24430 }, { - "epoch": 0.86, - "learning_rate": 4.7406241134475316e-05, - "loss": 0.3286, + "epoch": 0.8806357444048005, + "grad_norm": 0.18858453631401062, + "learning_rate": 4.725414013192523e-05, + "loss": 0.4169, "step": 24435 }, { - "epoch": 0.86, - "learning_rate": 4.740497743876882e-05, - "loss": 0.3364, + "epoch": 0.8808159440660251, + "grad_norm": 0.18355509638786316, + "learning_rate": 4.725281037027929e-05, + "loss": 0.4668, "step": 24440 }, { - "epoch": 0.86, - "learning_rate": 4.7403713452148454e-05, - "loss": 0.2879, + "epoch": 0.8809961437272498, + "grad_norm": 0.16345731914043427, + "learning_rate": 4.725148030544311e-05, + "loss": 0.4466, "step": 24445 }, { - "epoch": 0.86, - "learning_rate": 4.740244917463064e-05, - "loss": 0.3261, + "epoch": 0.8811763433884744, + "grad_norm": 0.19114868342876434, + "learning_rate": 4.7250149937434826e-05, + "loss": 0.3905, "step": 24450 }, { - "epoch": 0.86, - "learning_rate": 4.7401184606231786e-05, - "loss": 0.2984, + "epoch": 0.881356543049699, + "grad_norm": 0.19460555911064148, + "learning_rate": 4.724881926627255e-05, + "loss": 0.437, "step": 24455 }, { - "epoch": 0.86, - "learning_rate": 4.7399919746968316e-05, - "loss": 0.3239, + "epoch": 0.8815367427109237, + "grad_norm": Infinity, + "learning_rate": 4.7247754511084054e-05, + "loss": 0.4409, "step": 24460 }, { - "epoch": 0.86, - "learning_rate": 4.739865459685664e-05, - "loss": 0.2893, + "epoch": 0.8817169423721484, + "grad_norm": 0.14812225103378296, + "learning_rate": 4.72464232942903e-05, + "loss": 0.3929, "step": 24465 }, { - "epoch": 0.86, - "learning_rate": 4.7397389155913197e-05, - "loss": 0.3265, + "epoch": 0.881897142033373, + "grad_norm": 0.1727660894393921, + "learning_rate": 4.724509177439333e-05, + "loss": 0.4262, "step": 24470 }, { - "epoch": 0.86, - "learning_rate": 4.7396123424154426e-05, - "loss": 0.2947, + "epoch": 0.8820773416945976, + "grad_norm": 0.16415224969387054, + "learning_rate": 4.724375995141129e-05, + "loss": 0.4162, "step": 24475 }, { - "epoch": 0.86, - "learning_rate": 4.739485740159674e-05, - "loss": 0.2713, + "epoch": 0.8822575413558222, + "grad_norm": 0.16160134971141815, + 
"learning_rate": 4.724242782536234e-05, + "loss": 0.4698, "step": 24480 }, { - "epoch": 0.86, - "learning_rate": 4.739359108825659e-05, - "loss": 0.318, + "epoch": 0.8824377410170469, + "grad_norm": 0.15631291270256042, + "learning_rate": 4.724109539626461e-05, + "loss": 0.4455, "step": 24485 }, { - "epoch": 0.86, - "learning_rate": 4.739232448415043e-05, - "loss": 0.3234, + "epoch": 0.8826179406782715, + "grad_norm": 0.16617485880851746, + "learning_rate": 4.7239762664136264e-05, + "loss": 0.4638, "step": 24490 }, { - "epoch": 0.86, - "learning_rate": 4.739105758929468e-05, - "loss": 0.3022, + "epoch": 0.8827981403394961, + "grad_norm": 0.1969350427389145, + "learning_rate": 4.7238429628995456e-05, + "loss": 0.4458, "step": 24495 }, { - "epoch": 0.86, - "learning_rate": 4.738979040370582e-05, - "loss": 0.3255, + "epoch": 0.8829783400007208, + "grad_norm": 0.12761616706848145, + "learning_rate": 4.723709629086035e-05, + "loss": 0.4276, "step": 24500 }, { - "epoch": 0.86, - "eval_loss": 0.297638863325119, - "eval_runtime": 10.5374, - "eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 0.8829783400007208, + "eval_loss": 0.4570537507534027, + "eval_runtime": 3.5263, + "eval_samples_per_second": 28.359, + "eval_steps_per_second": 7.09, "step": 24500 }, { - "epoch": 0.86, - "learning_rate": 4.7388522927400276e-05, - "loss": 0.2881, + "epoch": 0.8831585396619455, + "grad_norm": 0.20359250903129578, + "learning_rate": 4.723576264974911e-05, + "loss": 0.4593, "step": 24505 }, { - "epoch": 0.86, - "learning_rate": 4.7387255160394514e-05, - "loss": 0.32, + "epoch": 0.8833387393231701, + "grad_norm": 0.1633104681968689, + "learning_rate": 4.723442870567991e-05, + "loss": 0.4538, "step": 24510 }, { - "epoch": 0.86, - "learning_rate": 4.7385987102705e-05, - "loss": 0.3053, + "epoch": 0.8835189389843947, + "grad_norm": 0.19287440180778503, + "learning_rate": 4.7233094458670926e-05, + "loss": 0.4838, "step": 24515 }, { - "epoch": 0.86, - "learning_rate": 4.73847187543482e-05, - "loss": 0.294, + "epoch": 0.8836991386456193, + "grad_norm": 0.16194848716259003, + "learning_rate": 4.723175990874034e-05, + "loss": 0.4247, "step": 24520 }, { - "epoch": 0.86, - "learning_rate": 4.738345011534058e-05, - "loss": 0.2906, + "epoch": 0.883879338306844, + "grad_norm": 0.15081371366977692, + "learning_rate": 4.723042505590631e-05, + "loss": 0.4416, "step": 24525 }, { - "epoch": 0.86, - "learning_rate": 4.738218118569861e-05, - "loss": 0.2912, + "epoch": 0.8840595379680686, + "grad_norm": 0.15278756618499756, + "learning_rate": 4.7229089900187065e-05, + "loss": 0.4031, "step": 24530 }, { - "epoch": 0.86, - "learning_rate": 4.738091196543876e-05, - "loss": 0.2966, + "epoch": 0.8842397376292933, + "grad_norm": 0.19165587425231934, + "learning_rate": 4.722775444160076e-05, + "loss": 0.4159, "step": 24535 }, { - "epoch": 0.86, - "learning_rate": 4.737964245457752e-05, - "loss": 0.3212, + "epoch": 0.8844199372905179, + "grad_norm": 0.17740246653556824, + "learning_rate": 4.722641868016561e-05, + "loss": 0.4277, "step": 24540 }, { - "epoch": 0.86, - "learning_rate": 4.7378372653131375e-05, - "loss": 0.3061, + "epoch": 0.8846001369517426, + "grad_norm": 0.14334651827812195, + "learning_rate": 4.72250826158998e-05, + "loss": 0.4596, "step": 24545 }, { - "epoch": 0.86, - "learning_rate": 4.7377102561116816e-05, - "loss": 0.3129, + "epoch": 0.8847803366129672, + "grad_norm": 0.1924460381269455, + "learning_rate": 4.722374624882155e-05, + "loss": 0.4452, "step": 24550 }, { - "epoch": 0.86, - 
"learning_rate": 4.737583217855032e-05, - "loss": 0.3163, + "epoch": 0.8849605362741918, + "grad_norm": 0.18541020154953003, + "learning_rate": 4.7222409578949054e-05, + "loss": 0.4299, "step": 24555 }, { - "epoch": 0.86, - "learning_rate": 4.7374561505448384e-05, - "loss": 0.3168, + "epoch": 0.8851407359354164, + "grad_norm": 0.14057400822639465, + "learning_rate": 4.7221072606300543e-05, + "loss": 0.4235, "step": 24560 }, { - "epoch": 0.86, - "learning_rate": 4.737329054182752e-05, - "loss": 0.3179, + "epoch": 0.885320935596641, + "grad_norm": 0.17242777347564697, + "learning_rate": 4.721973533089421e-05, + "loss": 0.4452, "step": 24565 }, { - "epoch": 0.86, - "learning_rate": 4.737201928770422e-05, - "loss": 0.2977, + "epoch": 0.8855011352578657, + "grad_norm": 0.14792689681053162, + "learning_rate": 4.721866529259576e-05, + "loss": 0.4112, "step": 24570 }, { - "epoch": 0.86, - "learning_rate": 4.737074774309499e-05, - "loss": 0.2845, + "epoch": 0.8856813349190904, + "grad_norm": 0.20297929644584656, + "learning_rate": 4.7217327472271283e-05, + "loss": 0.4249, "step": 24575 }, { - "epoch": 0.86, - "learning_rate": 4.736947590801635e-05, - "loss": 0.2973, + "epoch": 0.885861534580315, + "grad_norm": 0.19619841873645782, + "learning_rate": 4.7215989349240026e-05, + "loss": 0.4288, "step": 24580 }, { - "epoch": 0.86, - "learning_rate": 4.73682037824848e-05, - "loss": 0.3149, + "epoch": 0.8860417342415396, + "grad_norm": 0.15129470825195312, + "learning_rate": 4.721465092352021e-05, + "loss": 0.4335, "step": 24585 }, { - "epoch": 0.87, - "learning_rate": 4.7366931366516876e-05, - "loss": 0.3216, + "epoch": 0.8862219339027643, + "grad_norm": 0.18301598727703094, + "learning_rate": 4.7213312195130076e-05, + "loss": 0.4433, "step": 24590 }, { - "epoch": 0.87, - "learning_rate": 4.736565866012908e-05, - "loss": 0.3094, + "epoch": 0.8864021335639889, + "grad_norm": 0.16560180485248566, + "learning_rate": 4.721197316408787e-05, + "loss": 0.4086, "step": 24595 }, { - "epoch": 0.87, - "learning_rate": 4.736438566333795e-05, - "loss": 0.2993, + "epoch": 0.8865823332252135, + "grad_norm": 0.1662699282169342, + "learning_rate": 4.721063383041182e-05, + "loss": 0.4438, "step": 24600 }, { - "epoch": 0.87, - "learning_rate": 4.736311237616001e-05, - "loss": 0.3066, + "epoch": 0.8867625328864381, + "grad_norm": 0.2002149224281311, + "learning_rate": 4.720929419412019e-05, + "loss": 0.4215, "step": 24605 }, { - "epoch": 0.87, - "learning_rate": 4.736183879861179e-05, - "loss": 0.2802, + "epoch": 0.8869427325476628, + "grad_norm": 0.15055419504642487, + "learning_rate": 4.720795425523122e-05, + "loss": 0.4198, "step": 24610 }, { - "epoch": 0.87, - "learning_rate": 4.736056493070983e-05, - "loss": 0.2907, + "epoch": 0.8871229322088875, + "grad_norm": 0.18256331980228424, + "learning_rate": 4.720661401376318e-05, + "loss": 0.4618, "step": 24615 }, { - "epoch": 0.87, - "learning_rate": 4.7359290772470675e-05, - "loss": 0.312, + "epoch": 0.8873031318701121, + "grad_norm": 0.15107060968875885, + "learning_rate": 4.7205273469734325e-05, + "loss": 0.4164, "step": 24620 }, { - "epoch": 0.87, - "learning_rate": 4.735801632391087e-05, - "loss": 0.2899, + "epoch": 0.8874833315313367, + "grad_norm": 0.16118554770946503, + "learning_rate": 4.7203932623162917e-05, + "loss": 0.4657, "step": 24625 }, { - "epoch": 0.87, - "learning_rate": 4.735674158504695e-05, - "loss": 0.3086, + "epoch": 0.8876635311925614, + "grad_norm": 0.1474160999059677, + "learning_rate": 4.720259147406722e-05, + "loss": 0.4628, "step": 24630 }, { - 
"epoch": 0.87, - "learning_rate": 4.735546655589547e-05, - "loss": 0.2949, + "epoch": 0.887843730853786, + "grad_norm": 0.18264853954315186, + "learning_rate": 4.720125002246552e-05, + "loss": 0.4558, "step": 24635 }, { - "epoch": 0.87, - "learning_rate": 4.7354191236472995e-05, - "loss": 0.3053, + "epoch": 0.8880239305150106, + "grad_norm": 0.2043534368276596, + "learning_rate": 4.719990826837608e-05, + "loss": 0.4473, "step": 24640 }, { - "epoch": 0.87, - "learning_rate": 4.735291562679608e-05, - "loss": 0.3149, + "epoch": 0.8882041301762352, + "grad_norm": 0.1613204926252365, + "learning_rate": 4.719856621181719e-05, + "loss": 0.4262, "step": 24645 }, { - "epoch": 0.87, - "learning_rate": 4.735163972688128e-05, - "loss": 0.2791, + "epoch": 0.8883843298374599, + "grad_norm": 0.13441753387451172, + "learning_rate": 4.7197223852807136e-05, + "loss": 0.4354, "step": 24650 }, { - "epoch": 0.87, - "learning_rate": 4.7350363536745174e-05, - "loss": 0.2994, + "epoch": 0.8885645294986846, + "grad_norm": 0.221405029296875, + "learning_rate": 4.71958811913642e-05, + "loss": 0.4739, "step": 24655 }, { - "epoch": 0.87, - "learning_rate": 4.734908705640433e-05, - "loss": 0.3109, + "epoch": 0.8887447291599092, + "grad_norm": 0.18559223413467407, + "learning_rate": 4.719453822750669e-05, + "loss": 0.4838, "step": 24660 }, { - "epoch": 0.87, - "learning_rate": 4.7347810285875315e-05, - "loss": 0.3036, + "epoch": 0.8889249288211338, + "grad_norm": 0.16596554219722748, + "learning_rate": 4.7193194961252885e-05, + "loss": 0.4106, "step": 24665 }, { - "epoch": 0.87, - "learning_rate": 4.7346533225174705e-05, - "loss": 0.2824, + "epoch": 0.8891051284823585, + "grad_norm": 0.18130573630332947, + "learning_rate": 4.7191851392621086e-05, + "loss": 0.4411, "step": 24670 }, { - "epoch": 0.87, - "learning_rate": 4.7345255874319096e-05, - "loss": 0.3066, + "epoch": 0.8892853281435831, + "grad_norm": 0.19347991049289703, + "learning_rate": 4.719050752162962e-05, + "loss": 0.4076, "step": 24675 }, { - "epoch": 0.87, - "learning_rate": 4.7343978233325064e-05, - "loss": 0.3002, + "epoch": 0.8894655278048077, + "grad_norm": 0.18345235288143158, + "learning_rate": 4.7189163348296794e-05, + "loss": 0.3907, "step": 24680 }, { - "epoch": 0.87, - "learning_rate": 4.7342700302209195e-05, - "loss": 0.3059, + "epoch": 0.8896457274660323, + "grad_norm": 0.18901300430297852, + "learning_rate": 4.71878188726409e-05, + "loss": 0.4677, "step": 24685 }, { - "epoch": 0.87, - "learning_rate": 4.7341422080988094e-05, - "loss": 0.2977, + "epoch": 0.889825927127257, + "grad_norm": 0.17101964354515076, + "learning_rate": 4.718647409468028e-05, + "loss": 0.4077, "step": 24690 }, { - "epoch": 0.87, - "learning_rate": 4.734014356967835e-05, - "loss": 0.3186, + "epoch": 0.8900061267884817, + "grad_norm": 0.2040674090385437, + "learning_rate": 4.7185129014433234e-05, + "loss": 0.4288, "step": 24695 }, { - "epoch": 0.87, - "learning_rate": 4.733886476829656e-05, - "loss": 0.2897, + "epoch": 0.8901863264497063, + "grad_norm": 0.13120241463184357, + "learning_rate": 4.71837836319181e-05, + "loss": 0.4134, "step": 24700 }, { - "epoch": 0.87, - "learning_rate": 4.7337585676859344e-05, - "loss": 0.2943, + "epoch": 0.8903665261109309, + "grad_norm": 0.18854455649852753, + "learning_rate": 4.7182437947153216e-05, + "loss": 0.4561, "step": 24705 }, { - "epoch": 0.87, - "learning_rate": 4.7336306295383285e-05, - "loss": 0.3145, + "epoch": 0.8905467257721555, + "grad_norm": 0.14998573064804077, + "learning_rate": 4.718109196015691e-05, + "loss": 0.4087, 
"step": 24710 }, { - "epoch": 0.87, - "learning_rate": 4.7335026623885015e-05, - "loss": 0.3302, + "epoch": 0.8907269254333802, + "grad_norm": 0.1941816210746765, + "learning_rate": 4.717974567094752e-05, + "loss": 0.4604, "step": 24715 }, { - "epoch": 0.87, - "learning_rate": 4.733374666238115e-05, - "loss": 0.2812, + "epoch": 0.8909071250946048, + "grad_norm": 0.16004863381385803, + "learning_rate": 4.7178399079543386e-05, + "loss": 0.4399, "step": 24720 }, { - "epoch": 0.87, - "learning_rate": 4.733246641088829e-05, - "loss": 0.2879, + "epoch": 0.8910873247558294, + "grad_norm": 0.192369744181633, + "learning_rate": 4.717705218596286e-05, + "loss": 0.4439, "step": 24725 }, { - "epoch": 0.87, - "learning_rate": 4.7331185869423075e-05, - "loss": 0.3062, + "epoch": 0.8912675244170541, + "grad_norm": 0.2026439905166626, + "learning_rate": 4.717570499022429e-05, + "loss": 0.4401, "step": 24730 }, { - "epoch": 0.87, - "learning_rate": 4.7329905038002136e-05, - "loss": 0.285, + "epoch": 0.8914477240782788, + "grad_norm": 0.1843568980693817, + "learning_rate": 4.7174357492346035e-05, + "loss": 0.4492, "step": 24735 }, { - "epoch": 0.87, - "learning_rate": 4.732862391664209e-05, - "loss": 0.3035, + "epoch": 0.8916279237395034, + "grad_norm": 0.17085200548171997, + "learning_rate": 4.717300969234645e-05, + "loss": 0.4401, "step": 24740 }, { - "epoch": 0.87, - "learning_rate": 4.732734250535959e-05, - "loss": 0.2817, + "epoch": 0.891808123400728, + "grad_norm": 0.15291930735111237, + "learning_rate": 4.717166159024391e-05, + "loss": 0.3833, "step": 24745 }, { - "epoch": 0.87, - "learning_rate": 4.732606080417124e-05, - "loss": 0.3166, + "epoch": 0.8919883230619526, + "grad_norm": 0.21157534420490265, + "learning_rate": 4.717031318605676e-05, + "loss": 0.4876, "step": 24750 }, { - "epoch": 0.87, - "learning_rate": 4.732477881309372e-05, - "loss": 0.3061, + "epoch": 0.8921685227231773, + "grad_norm": 0.1500784456729889, + "learning_rate": 4.716896447980339e-05, + "loss": 0.4665, "step": 24755 }, { - "epoch": 0.87, - "learning_rate": 4.732349653214365e-05, - "loss": 0.3076, + "epoch": 0.8923487223844019, + "grad_norm": 0.20307470858097076, + "learning_rate": 4.716761547150218e-05, + "loss": 0.4211, "step": 24760 }, { - "epoch": 0.87, - "learning_rate": 4.732221396133769e-05, - "loss": 0.3195, + "epoch": 0.8925289220456265, + "grad_norm": 0.21808555722236633, + "learning_rate": 4.716626616117149e-05, + "loss": 0.4722, "step": 24765 }, { - "epoch": 0.87, - "learning_rate": 4.73209311006925e-05, - "loss": 0.3247, + "epoch": 0.8927091217068512, + "grad_norm": 0.16036725044250488, + "learning_rate": 4.7164916548829716e-05, + "loss": 0.4207, "step": 24770 }, { - "epoch": 0.87, - "learning_rate": 4.731964795022473e-05, - "loss": 0.3167, + "epoch": 0.8928893213680759, + "grad_norm": 0.1857818365097046, + "learning_rate": 4.716356663449525e-05, + "loss": 0.4423, "step": 24775 }, { - "epoch": 0.87, - "learning_rate": 4.731836450995103e-05, - "loss": 0.3, + "epoch": 0.8930695210293005, + "grad_norm": 0.16408823430538177, + "learning_rate": 4.716221641818648e-05, + "loss": 0.3968, "step": 24780 }, { - "epoch": 0.87, - "learning_rate": 4.7317080779888076e-05, - "loss": 0.2758, + "epoch": 0.8932497206905251, + "grad_norm": 0.1698572039604187, + "learning_rate": 4.716086589992179e-05, + "loss": 0.4458, "step": 24785 }, { - "epoch": 0.87, - "learning_rate": 4.7315796760052544e-05, - "loss": 0.3273, + "epoch": 0.8934299203517497, + "grad_norm": 0.17390035092830658, + "learning_rate": 4.7159515079719606e-05, + "loss": 
0.4407, "step": 24790 }, { - "epoch": 0.87, - "learning_rate": 4.73145124504611e-05, - "loss": 0.2699, + "epoch": 0.8936101200129744, + "grad_norm": 0.321679025888443, + "learning_rate": 4.715816395759832e-05, + "loss": 0.4106, "step": 24795 }, { - "epoch": 0.87, - "learning_rate": 4.731322785113041e-05, - "loss": 0.285, + "epoch": 0.893790319674199, + "grad_norm": 0.14596271514892578, + "learning_rate": 4.715681253357633e-05, + "loss": 0.4319, "step": 24800 }, { - "epoch": 0.87, - "learning_rate": 4.7311942962077164e-05, - "loss": 0.3223, + "epoch": 0.8939705193354236, + "grad_norm": 0.1981964260339737, + "learning_rate": 4.715546080767207e-05, + "loss": 0.4163, "step": 24805 }, { - "epoch": 0.87, - "learning_rate": 4.731065778331805e-05, - "loss": 0.3058, + "epoch": 0.8941507189966483, + "grad_norm": 0.13960348069667816, + "learning_rate": 4.715410877990394e-05, + "loss": 0.4762, "step": 24810 }, { - "epoch": 0.87, - "learning_rate": 4.730937231486974e-05, - "loss": 0.2904, + "epoch": 0.894330918657873, + "grad_norm": 0.13228678703308105, + "learning_rate": 4.7152756450290365e-05, + "loss": 0.4419, "step": 24815 }, { - "epoch": 0.87, - "learning_rate": 4.7308086556748926e-05, - "loss": 0.2698, + "epoch": 0.8945111183190976, + "grad_norm": 0.13322246074676514, + "learning_rate": 4.715140381884977e-05, + "loss": 0.4458, "step": 24820 }, { - "epoch": 0.87, - "learning_rate": 4.730680050897231e-05, - "loss": 0.3074, + "epoch": 0.8946913179803222, + "grad_norm": 0.190599724650383, + "learning_rate": 4.715005088560059e-05, + "loss": 0.4297, "step": 24825 }, { - "epoch": 0.87, - "learning_rate": 4.730551417155661e-05, - "loss": 0.2964, + "epoch": 0.8948715176415468, + "grad_norm": 0.2033384144306183, + "learning_rate": 4.714869765056126e-05, + "loss": 0.4869, "step": 24830 }, { - "epoch": 0.87, - "learning_rate": 4.730422754451849e-05, - "loss": 0.2897, + "epoch": 0.8950517173027714, + "grad_norm": 0.1728903204202652, + "learning_rate": 4.714734411375021e-05, + "loss": 0.4397, "step": 24835 }, { - "epoch": 0.87, - "learning_rate": 4.730294062787468e-05, - "loss": 0.3129, + "epoch": 0.8952319169639961, + "grad_norm": 0.1565089374780655, + "learning_rate": 4.714599027518588e-05, + "loss": 0.4149, "step": 24840 }, { - "epoch": 0.87, - "learning_rate": 4.730165342164188e-05, - "loss": 0.3175, + "epoch": 0.8954121166252208, + "grad_norm": 0.14530473947525024, + "learning_rate": 4.7144636134886725e-05, + "loss": 0.4479, "step": 24845 }, { - "epoch": 0.87, - "learning_rate": 4.730036592583682e-05, - "loss": 0.2899, + "epoch": 0.8955923162864454, + "grad_norm": 0.18022358417510986, + "learning_rate": 4.714328169287119e-05, + "loss": 0.4322, "step": 24850 }, { - "epoch": 0.87, - "learning_rate": 4.7299078140476195e-05, - "loss": 0.3284, + "epoch": 0.89577251594767, + "grad_norm": 0.20143041014671326, + "learning_rate": 4.714192694915772e-05, + "loss": 0.4065, "step": 24855 }, { - "epoch": 0.87, - "learning_rate": 4.7297790065576734e-05, - "loss": 0.3055, + "epoch": 0.8959527156088947, + "grad_norm": 0.1598571240901947, + "learning_rate": 4.7140571903764796e-05, + "loss": 0.4355, "step": 24860 }, { - "epoch": 0.87, - "learning_rate": 4.729650170115517e-05, - "loss": 0.3146, + "epoch": 0.8961329152701193, + "grad_norm": 0.14883951842784882, + "learning_rate": 4.713921655671086e-05, + "loss": 0.4303, "step": 24865 }, { - "epoch": 0.87, - "learning_rate": 4.729521304722823e-05, - "loss": 0.307, + "epoch": 0.8963131149313439, + "grad_norm": 0.22746653854846954, + "learning_rate": 4.713786090801438e-05, + 
"loss": 0.414, "step": 24870 }, { - "epoch": 0.88, - "learning_rate": 4.7293924103812626e-05, - "loss": 0.2946, + "epoch": 0.8964933145925685, + "grad_norm": 0.1794048696756363, + "learning_rate": 4.713650495769384e-05, + "loss": 0.441, "step": 24875 }, { - "epoch": 0.88, - "learning_rate": 4.7292634870925126e-05, - "loss": 0.3226, + "epoch": 0.8966735142537932, + "grad_norm": 0.1713542640209198, + "learning_rate": 4.713514870576769e-05, + "loss": 0.4191, "step": 24880 }, { - "epoch": 0.88, - "learning_rate": 4.729134534858244e-05, - "loss": 0.3063, + "epoch": 0.8968537139150179, + "grad_norm": 0.189683198928833, + "learning_rate": 4.713379215225444e-05, + "loss": 0.4236, "step": 24885 }, { - "epoch": 0.88, - "learning_rate": 4.729005553680133e-05, - "loss": 0.315, + "epoch": 0.8970339135762425, + "grad_norm": 0.13822448253631592, + "learning_rate": 4.713243529717256e-05, + "loss": 0.4481, "step": 24890 }, { - "epoch": 0.88, - "learning_rate": 4.7288765435598536e-05, - "loss": 0.3197, + "epoch": 0.8972141132374671, + "grad_norm": 0.1470467448234558, + "learning_rate": 4.713107814054052e-05, + "loss": 0.4471, "step": 24895 }, { - "epoch": 0.88, - "learning_rate": 4.7287475044990815e-05, - "loss": 0.306, + "epoch": 0.8973943128986918, + "grad_norm": 0.23228789865970612, + "learning_rate": 4.7129720682376835e-05, + "loss": 0.4273, "step": 24900 }, { - "epoch": 0.88, - "learning_rate": 4.7286184364994924e-05, - "loss": 0.3047, + "epoch": 0.8975745125599164, + "grad_norm": 0.16581259667873383, + "learning_rate": 4.712836292269999e-05, + "loss": 0.4401, "step": 24905 }, { - "epoch": 0.88, - "learning_rate": 4.728489339562761e-05, - "loss": 0.3094, + "epoch": 0.897754712221141, + "grad_norm": 0.20045721530914307, + "learning_rate": 4.712700486152848e-05, + "loss": 0.4381, "step": 24910 }, { - "epoch": 0.88, - "learning_rate": 4.728360213690564e-05, - "loss": 0.3052, + "epoch": 0.8979349118823656, + "grad_norm": 0.1778937131166458, + "learning_rate": 4.712564649888081e-05, + "loss": 0.4466, "step": 24915 }, { - "epoch": 0.88, - "learning_rate": 4.7282310588845786e-05, - "loss": 0.2992, + "epoch": 0.8981151115435902, + "grad_norm": 0.22372958064079285, + "learning_rate": 4.7124287834775496e-05, + "loss": 0.4296, "step": 24920 }, { - "epoch": 0.88, - "learning_rate": 4.7281018751464814e-05, - "loss": 0.3046, + "epoch": 0.898295311204815, + "grad_norm": 0.15579721331596375, + "learning_rate": 4.7122928869231044e-05, + "loss": 0.4306, "step": 24925 }, { - "epoch": 0.88, - "learning_rate": 4.727972662477949e-05, - "loss": 0.2983, + "epoch": 0.8984755108660396, + "grad_norm": 0.1593790054321289, + "learning_rate": 4.712156960226597e-05, + "loss": 0.4518, "step": 24930 }, { - "epoch": 0.88, - "learning_rate": 4.727843420880661e-05, - "loss": 0.318, + "epoch": 0.8986557105272642, + "grad_norm": 0.18624834716320038, + "learning_rate": 4.7120210033898784e-05, + "loss": 0.4181, "step": 24935 }, { - "epoch": 0.88, - "learning_rate": 4.727714150356294e-05, - "loss": 0.2807, + "epoch": 0.8988359101884889, + "grad_norm": 0.19123771786689758, + "learning_rate": 4.711885016414802e-05, + "loss": 0.4106, "step": 24940 }, { - "epoch": 0.88, - "learning_rate": 4.727584850906526e-05, - "loss": 0.3074, + "epoch": 0.8990161098497135, + "grad_norm": 0.1681036353111267, + "learning_rate": 4.7117489993032216e-05, + "loss": 0.4661, "step": 24945 }, { - "epoch": 0.88, - "learning_rate": 4.7274555225330374e-05, - "loss": 0.2902, + "epoch": 0.8991963095109381, + "grad_norm": 0.16487166285514832, + "learning_rate": 
4.711612952056988e-05, + "loss": 0.4357, "step": 24950 }, { - "epoch": 0.88, - "learning_rate": 4.7273261652375066e-05, - "loss": 0.2953, + "epoch": 0.8993765091721627, + "grad_norm": 0.1486726552248001, + "learning_rate": 4.711476874677957e-05, + "loss": 0.4281, "step": 24955 }, { - "epoch": 0.88, - "learning_rate": 4.727196779021614e-05, - "loss": 0.3285, + "epoch": 0.8995567088333873, + "grad_norm": 0.1914074718952179, + "learning_rate": 4.711340767167982e-05, + "loss": 0.4557, "step": 24960 }, { - "epoch": 0.88, - "learning_rate": 4.7270673638870386e-05, - "loss": 0.3097, + "epoch": 0.8997369084946121, + "grad_norm": 0.17185872793197632, + "learning_rate": 4.711204629528917e-05, + "loss": 0.4334, "step": 24965 }, { - "epoch": 0.88, - "learning_rate": 4.7269379198354606e-05, - "loss": 0.3233, + "epoch": 0.8999171081558367, + "grad_norm": 0.15413062274456024, + "learning_rate": 4.711068461762617e-05, + "loss": 0.4027, "step": 24970 }, { - "epoch": 0.88, - "learning_rate": 4.726808446868562e-05, - "loss": 0.3141, + "epoch": 0.9000973078170613, + "grad_norm": 0.2285022735595703, + "learning_rate": 4.710932263870936e-05, + "loss": 0.423, "step": 24975 }, { - "epoch": 0.88, - "learning_rate": 4.7266789449880235e-05, - "loss": 0.2786, + "epoch": 0.9002775074782859, + "grad_norm": 0.18323256075382233, + "learning_rate": 4.710796035855732e-05, + "loss": 0.4656, "step": 24980 }, { - "epoch": 0.88, - "learning_rate": 4.7265494141955255e-05, - "loss": 0.2944, + "epoch": 0.9004577071395106, + "grad_norm": 0.19098562002182007, + "learning_rate": 4.71065977771886e-05, + "loss": 0.4367, "step": 24985 }, { - "epoch": 0.88, - "learning_rate": 4.7264198544927515e-05, - "loss": 0.313, + "epoch": 0.9006379068007352, + "grad_norm": 0.14779239892959595, + "learning_rate": 4.710523489462177e-05, + "loss": 0.4426, "step": 24990 }, { - "epoch": 0.88, - "learning_rate": 4.726290265881382e-05, - "loss": 0.2895, + "epoch": 0.9008181064619598, + "grad_norm": 0.1570071429014206, + "learning_rate": 4.710387171087539e-05, + "loss": 0.4674, "step": 24995 }, { - "epoch": 0.88, - "learning_rate": 4.7261606483631017e-05, - "loss": 0.3045, + "epoch": 0.9009983061231844, + "grad_norm": 0.1645687371492386, + "learning_rate": 4.7102508225968035e-05, + "loss": 0.4766, "step": 25000 }, { - "epoch": 0.88, - "eval_loss": 0.2971786856651306, - "eval_runtime": 10.5292, - "eval_samples_per_second": 9.497, - "eval_steps_per_second": 9.497, + "epoch": 0.9009983061231844, + "eval_loss": 0.4570350646972656, + "eval_runtime": 3.5365, + "eval_samples_per_second": 28.276, + "eval_steps_per_second": 7.069, "step": 25000 }, { - "epoch": 0.88, - "learning_rate": 4.726031001939592e-05, - "loss": 0.3188, + "epoch": 0.9011785057844092, + "grad_norm": 0.15871667861938477, + "learning_rate": 4.7101144439918287e-05, + "loss": 0.4376, "step": 25005 }, { - "epoch": 0.88, - "learning_rate": 4.725901326612536e-05, - "loss": 0.3274, + "epoch": 0.9013587054456338, + "grad_norm": 0.1973850131034851, + "learning_rate": 4.709978035274473e-05, + "loss": 0.446, "step": 25010 }, { - "epoch": 0.88, - "learning_rate": 4.72577162238362e-05, - "loss": 0.3033, + "epoch": 0.9015389051068584, + "grad_norm": 0.15039071440696716, + "learning_rate": 4.709841596446594e-05, + "loss": 0.4431, "step": 25015 }, { - "epoch": 0.88, - "learning_rate": 4.725641889254525e-05, - "loss": 0.3326, + "epoch": 0.901719104768083, + "grad_norm": 0.16307663917541504, + "learning_rate": 4.70970512751005e-05, + "loss": 0.4178, "step": 25020 }, { - "epoch": 0.88, - "learning_rate": 
4.725512127226937e-05, - "loss": 0.3117, + "epoch": 0.9018993044293077, + "grad_norm": 0.17655940353870392, + "learning_rate": 4.709568628466703e-05, + "loss": 0.4112, "step": 25025 }, { - "epoch": 0.88, - "learning_rate": 4.725382336302541e-05, - "loss": 0.3072, + "epoch": 0.9020795040905323, + "grad_norm": 0.21620622277259827, + "learning_rate": 4.709432099318411e-05, + "loss": 0.4354, "step": 25030 }, { - "epoch": 0.88, - "learning_rate": 4.725252516483021e-05, - "loss": 0.2899, + "epoch": 0.9022597037517569, + "grad_norm": 0.17951133847236633, + "learning_rate": 4.7092955400670336e-05, + "loss": 0.4332, "step": 25035 }, { - "epoch": 0.88, - "learning_rate": 4.725122667770064e-05, - "loss": 0.305, + "epoch": 0.9024399034129816, + "grad_norm": 0.15933279693126678, + "learning_rate": 4.7091589507144326e-05, + "loss": 0.3898, "step": 25040 }, { - "epoch": 0.88, - "learning_rate": 4.7249927901653566e-05, - "loss": 0.2833, + "epoch": 0.9026201030742063, + "grad_norm": 0.1578451693058014, + "learning_rate": 4.7090223312624683e-05, + "loss": 0.4215, "step": 25045 }, { - "epoch": 0.88, - "learning_rate": 4.724862883670583e-05, - "loss": 0.3178, + "epoch": 0.9028003027354309, + "grad_norm": 0.1939324289560318, + "learning_rate": 4.708885681713003e-05, + "loss": 0.4807, "step": 25050 }, { - "epoch": 0.88, - "learning_rate": 4.724732948287432e-05, - "loss": 0.2996, + "epoch": 0.9029805023966555, + "grad_norm": 0.1886306256055832, + "learning_rate": 4.708749002067897e-05, + "loss": 0.4057, "step": 25055 }, { - "epoch": 0.88, - "learning_rate": 4.7246029840175884e-05, - "loss": 0.2997, + "epoch": 0.9031607020578801, + "grad_norm": 0.1769237220287323, + "learning_rate": 4.708612292329015e-05, + "loss": 0.4619, "step": 25060 }, { - "epoch": 0.88, - "learning_rate": 4.7244729908627426e-05, - "loss": 0.3063, + "epoch": 0.9033409017191047, + "grad_norm": 0.17207685112953186, + "learning_rate": 4.7084755524982175e-05, + "loss": 0.4632, "step": 25065 }, { - "epoch": 0.88, - "learning_rate": 4.72434296882458e-05, - "loss": 0.2872, + "epoch": 0.9035211013803294, + "grad_norm": 0.17112012207508087, + "learning_rate": 4.7083387825773676e-05, + "loss": 0.4322, "step": 25070 }, { - "epoch": 0.88, - "learning_rate": 4.7242129179047904e-05, - "loss": 0.2907, + "epoch": 0.903701301041554, + "grad_norm": 0.1867535561323166, + "learning_rate": 4.70820198256833e-05, + "loss": 0.4359, "step": 25075 }, { - "epoch": 0.88, - "learning_rate": 4.724082838105062e-05, - "loss": 0.3003, + "epoch": 0.9038815007027787, + "grad_norm": 0.21555501222610474, + "learning_rate": 4.708065152472967e-05, + "loss": 0.4375, "step": 25080 }, { - "epoch": 0.88, - "learning_rate": 4.723952729427083e-05, - "loss": 0.3175, + "epoch": 0.9040617003640034, + "grad_norm": 0.16034695506095886, + "learning_rate": 4.707928292293144e-05, + "loss": 0.4587, "step": 25085 }, { - "epoch": 0.88, - "learning_rate": 4.723822591872544e-05, - "loss": 0.3222, + "epoch": 0.904241900025228, + "grad_norm": 0.15322132408618927, + "learning_rate": 4.7077914020307266e-05, + "loss": 0.4422, "step": 25090 }, { - "epoch": 0.88, - "learning_rate": 4.7236924254431344e-05, - "loss": 0.3183, + "epoch": 0.9044220996864526, + "grad_norm": 0.16674070060253143, + "learning_rate": 4.707654481687578e-05, + "loss": 0.4216, "step": 25095 }, { - "epoch": 0.88, - "learning_rate": 4.723562230140544e-05, - "loss": 0.2934, + "epoch": 0.9046022993476772, + "grad_norm": 0.16037312150001526, + "learning_rate": 4.707517531265565e-05, + "loss": 0.4055, "step": 25100 }, { - "epoch": 0.88, - 
"learning_rate": 4.7234320059664626e-05, - "loss": 0.3129, + "epoch": 0.9047824990089018, + "grad_norm": 0.17139527201652527, + "learning_rate": 4.707380550766553e-05, + "loss": 0.4565, "step": 25105 }, { - "epoch": 0.88, - "learning_rate": 4.7233017529225826e-05, - "loss": 0.2844, + "epoch": 0.9049626986701265, + "grad_norm": 0.1893669068813324, + "learning_rate": 4.7072435401924075e-05, + "loss": 0.4536, "step": 25110 }, { - "epoch": 0.88, - "learning_rate": 4.723171471010595e-05, - "loss": 0.2877, + "epoch": 0.9051428983313511, + "grad_norm": 0.17662879824638367, + "learning_rate": 4.7071064995449964e-05, + "loss": 0.4653, "step": 25115 }, { - "epoch": 0.88, - "learning_rate": 4.72304116023219e-05, - "loss": 0.2901, + "epoch": 0.9053230979925758, + "grad_norm": 0.16549894213676453, + "learning_rate": 4.7069694288261864e-05, + "loss": 0.402, "step": 25120 }, { - "epoch": 0.88, - "learning_rate": 4.72291082058906e-05, - "loss": 0.2915, + "epoch": 0.9055032976538004, + "grad_norm": 0.1662409007549286, + "learning_rate": 4.706832328037846e-05, + "loss": 0.4282, "step": 25125 }, { - "epoch": 0.88, - "learning_rate": 4.7227804520829e-05, - "loss": 0.3152, + "epoch": 0.9056834973150251, + "grad_norm": 0.21818654239177704, + "learning_rate": 4.706695197181842e-05, + "loss": 0.4141, "step": 25130 }, { - "epoch": 0.88, - "learning_rate": 4.7226500547153996e-05, - "loss": 0.301, + "epoch": 0.9058636969762497, + "grad_norm": 0.14861366152763367, + "learning_rate": 4.706558036260042e-05, + "loss": 0.4126, "step": 25135 }, { - "epoch": 0.88, - "learning_rate": 4.722519628488253e-05, - "loss": 0.3223, + "epoch": 0.9060438966374743, + "grad_norm": 0.17982593178749084, + "learning_rate": 4.7064208452743174e-05, + "loss": 0.4186, "step": 25140 }, { - "epoch": 0.88, - "learning_rate": 4.722389173403153e-05, - "loss": 0.3069, + "epoch": 0.9062240962986989, + "grad_norm": 0.19204726815223694, + "learning_rate": 4.706283624226536e-05, + "loss": 0.4466, "step": 25145 }, { - "epoch": 0.88, - "learning_rate": 4.7222586894617956e-05, - "loss": 0.3253, + "epoch": 0.9064042959599236, + "grad_norm": 0.20537865161895752, + "learning_rate": 4.7061463731185676e-05, + "loss": 0.4208, "step": 25150 }, { - "epoch": 0.89, - "learning_rate": 4.722128176665873e-05, - "loss": 0.2785, + "epoch": 0.9065844956211482, + "grad_norm": 0.12418222427368164, + "learning_rate": 4.7060090919522806e-05, + "loss": 0.4412, "step": 25155 }, { - "epoch": 0.89, - "learning_rate": 4.72199763501708e-05, - "loss": 0.3071, + "epoch": 0.9067646952823729, + "grad_norm": 0.15775543451309204, + "learning_rate": 4.705871780729548e-05, + "loss": 0.4425, "step": 25160 }, { - "epoch": 0.89, - "learning_rate": 4.7218670645171125e-05, - "loss": 0.3289, + "epoch": 0.9069448949435975, + "grad_norm": 0.15570229291915894, + "learning_rate": 4.705734439452239e-05, + "loss": 0.445, "step": 25165 }, { - "epoch": 0.89, - "learning_rate": 4.721736465167666e-05, - "loss": 0.3063, + "epoch": 0.9071250946048222, + "grad_norm": 0.18232616782188416, + "learning_rate": 4.705597068122225e-05, + "loss": 0.4503, "step": 25170 }, { - "epoch": 0.89, - "learning_rate": 4.721605836970435e-05, - "loss": 0.3212, + "epoch": 0.9073052942660468, + "grad_norm": 0.20230647921562195, + "learning_rate": 4.705459666741379e-05, + "loss": 0.4519, "step": 25175 }, { - "epoch": 0.89, - "learning_rate": 4.7214751799271165e-05, - "loss": 0.2908, + "epoch": 0.9074854939272714, + "grad_norm": 0.1548745334148407, + "learning_rate": 4.705322235311571e-05, + "loss": 0.3896, "step": 25180 }, { - 
"epoch": 0.89, - "learning_rate": 4.721344494039407e-05, - "loss": 0.3022, + "epoch": 0.907665693588496, + "grad_norm": 0.19131234288215637, + "learning_rate": 4.705184773834675e-05, + "loss": 0.4694, "step": 25185 }, { - "epoch": 0.89, - "learning_rate": 4.721213779309003e-05, - "loss": 0.3016, + "epoch": 0.9078458932497206, + "grad_norm": 0.17528343200683594, + "learning_rate": 4.705047282312563e-05, + "loss": 0.4463, "step": 25190 }, { - "epoch": 0.89, - "learning_rate": 4.7210830357376015e-05, - "loss": 0.3272, + "epoch": 0.9080260929109454, + "grad_norm": 0.16300806403160095, + "learning_rate": 4.704909760747109e-05, + "loss": 0.4469, "step": 25195 }, { - "epoch": 0.89, - "learning_rate": 4.720952263326901e-05, - "loss": 0.3067, + "epoch": 0.90820629257217, + "grad_norm": 0.14464612305164337, + "learning_rate": 4.704772209140186e-05, + "loss": 0.4079, "step": 25200 }, { - "epoch": 0.89, - "learning_rate": 4.7208214620785986e-05, - "loss": 0.3113, + "epoch": 0.9083864922333946, + "grad_norm": 0.19416652619838715, + "learning_rate": 4.704634627493669e-05, + "loss": 0.4484, "step": 25205 }, { - "epoch": 0.89, - "learning_rate": 4.7206906319943934e-05, - "loss": 0.277, + "epoch": 0.9085666918946192, + "grad_norm": 0.18099582195281982, + "learning_rate": 4.704497015809432e-05, + "loss": 0.4251, "step": 25210 }, { - "epoch": 0.89, - "learning_rate": 4.720559773075984e-05, - "loss": 0.3084, + "epoch": 0.9087468915558439, + "grad_norm": 0.19250603020191193, + "learning_rate": 4.70435937408935e-05, + "loss": 0.434, "step": 25215 }, { - "epoch": 0.89, - "learning_rate": 4.720428885325069e-05, - "loss": 0.2904, + "epoch": 0.9089270912170685, + "grad_norm": 0.17500755190849304, + "learning_rate": 4.704221702335298e-05, + "loss": 0.3891, "step": 25220 }, { - "epoch": 0.89, - "learning_rate": 4.720297968743348e-05, - "loss": 0.3018, + "epoch": 0.9091072908782931, + "grad_norm": 0.1807602047920227, + "learning_rate": 4.7040840005491526e-05, + "loss": 0.4505, "step": 25225 }, { - "epoch": 0.89, - "learning_rate": 4.720167023332522e-05, - "loss": 0.3067, + "epoch": 0.9092874905395177, + "grad_norm": 0.1530625820159912, + "learning_rate": 4.7039462687327885e-05, + "loss": 0.4264, "step": 25230 }, { - "epoch": 0.89, - "learning_rate": 4.72003604909429e-05, - "loss": 0.3084, + "epoch": 0.9094676902007425, + "grad_norm": 0.1661711484193802, + "learning_rate": 4.703808506888084e-05, + "loss": 0.4535, "step": 25235 }, { - "epoch": 0.89, - "learning_rate": 4.7199050460303525e-05, - "loss": 0.2959, + "epoch": 0.9096478898619671, + "grad_norm": 0.14613687992095947, + "learning_rate": 4.7036707150169145e-05, + "loss": 0.4152, "step": 25240 }, { - "epoch": 0.89, - "learning_rate": 4.719774014142411e-05, - "loss": 0.2769, + "epoch": 0.9098280895231917, + "grad_norm": 0.18025970458984375, + "learning_rate": 4.703532893121159e-05, + "loss": 0.4294, "step": 25245 }, { - "epoch": 0.89, - "learning_rate": 4.719642953432167e-05, - "loss": 0.2959, + "epoch": 0.9100082891844163, + "grad_norm": 0.14726756513118744, + "learning_rate": 4.703395041202694e-05, + "loss": 0.4612, "step": 25250 }, { - "epoch": 0.89, - "learning_rate": 4.719511863901322e-05, - "loss": 0.2857, + "epoch": 0.910188488845641, + "grad_norm": 0.19874265789985657, + "learning_rate": 4.703257159263398e-05, + "loss": 0.4415, "step": 25255 }, { - "epoch": 0.89, - "learning_rate": 4.719380745551578e-05, - "loss": 0.2813, + "epoch": 0.9103686885068656, + "grad_norm": 0.17537064850330353, + "learning_rate": 4.70311924730515e-05, + "loss": 0.4454, "step": 
25260 }, { - "epoch": 0.89, - "learning_rate": 4.7192495983846385e-05, - "loss": 0.3073, + "epoch": 0.9105488881680902, + "grad_norm": 0.1721314936876297, + "learning_rate": 4.702981305329829e-05, + "loss": 0.4314, "step": 25265 }, { - "epoch": 0.89, - "learning_rate": 4.719118422402205e-05, - "loss": 0.2863, + "epoch": 0.9107290878293148, + "grad_norm": 0.16811582446098328, + "learning_rate": 4.702843333339314e-05, + "loss": 0.4317, "step": 25270 }, { - "epoch": 0.89, - "learning_rate": 4.718987217605981e-05, - "loss": 0.3157, + "epoch": 0.9109092874905396, + "grad_norm": 0.19957655668258667, + "learning_rate": 4.702705331335485e-05, + "loss": 0.4278, "step": 25275 }, { - "epoch": 0.89, - "learning_rate": 4.71885598399767e-05, - "loss": 0.2739, + "epoch": 0.9110894871517642, + "grad_norm": 0.17485888302326202, + "learning_rate": 4.702567299320223e-05, + "loss": 0.4501, "step": 25280 }, { - "epoch": 0.89, - "learning_rate": 4.718724721578978e-05, - "loss": 0.2985, + "epoch": 0.9112696868129888, + "grad_norm": 0.13707765936851501, + "learning_rate": 4.702429237295407e-05, + "loss": 0.4126, "step": 25285 }, { - "epoch": 0.89, - "learning_rate": 4.7185934303516054e-05, - "loss": 0.2974, + "epoch": 0.9114498864742134, + "grad_norm": 0.182311549782753, + "learning_rate": 4.702291145262919e-05, + "loss": 0.4308, "step": 25290 }, { - "epoch": 0.89, - "learning_rate": 4.718462110317261e-05, - "loss": 0.3091, + "epoch": 0.9116300861354381, + "grad_norm": 0.16993586719036102, + "learning_rate": 4.702153023224641e-05, + "loss": 0.4155, "step": 25295 }, { - "epoch": 0.89, - "learning_rate": 4.718330761477647e-05, - "loss": 0.2901, + "epoch": 0.9118102857966627, + "grad_norm": 0.16655287146568298, + "learning_rate": 4.7020148711824546e-05, + "loss": 0.4481, "step": 25300 }, { - "epoch": 0.89, - "learning_rate": 4.7181993838344704e-05, - "loss": 0.2989, + "epoch": 0.9119904854578873, + "grad_norm": 0.1623249053955078, + "learning_rate": 4.701876689138242e-05, + "loss": 0.4569, "step": 25305 }, { - "epoch": 0.89, - "learning_rate": 4.718067977389437e-05, - "loss": 0.2995, + "epoch": 0.9121706851191119, + "grad_norm": 0.15807580947875977, + "learning_rate": 4.701738477093885e-05, + "loss": 0.4459, "step": 25310 }, { - "epoch": 0.89, - "learning_rate": 4.7179365421442526e-05, - "loss": 0.3258, + "epoch": 0.9123508847803367, + "grad_norm": 0.20741410553455353, + "learning_rate": 4.701600235051268e-05, + "loss": 0.4314, "step": 25315 }, { - "epoch": 0.89, - "learning_rate": 4.717805078100623e-05, - "loss": 0.3229, + "epoch": 0.9125310844415613, + "grad_norm": 0.1682579219341278, + "learning_rate": 4.701461963012274e-05, + "loss": 0.4152, "step": 25320 }, { - "epoch": 0.89, - "learning_rate": 4.717673585260257e-05, - "loss": 0.2945, + "epoch": 0.9127112841027859, + "grad_norm": 0.15219363570213318, + "learning_rate": 4.701323660978787e-05, + "loss": 0.415, "step": 25325 }, { - "epoch": 0.89, - "learning_rate": 4.717542063624861e-05, - "loss": 0.3246, + "epoch": 0.9128914837640105, + "grad_norm": 0.1439961940050125, + "learning_rate": 4.701185328952692e-05, + "loss": 0.4282, "step": 25330 }, { - "epoch": 0.89, - "learning_rate": 4.7174105131961424e-05, - "loss": 0.2893, + "epoch": 0.9130716834252351, + "grad_norm": 0.18314993381500244, + "learning_rate": 4.701046966935872e-05, + "loss": 0.4325, "step": 25335 }, { - "epoch": 0.89, - "learning_rate": 4.7172789339758095e-05, - "loss": 0.3087, + "epoch": 0.9132518830864598, + "grad_norm": 0.19595153629779816, + "learning_rate": 4.700908574930213e-05, + "loss": 
0.4474, "step": 25340 }, { - "epoch": 0.89, - "learning_rate": 4.717147325965571e-05, - "loss": 0.2858, + "epoch": 0.9134320827476844, + "grad_norm": 0.14837300777435303, + "learning_rate": 4.700770152937601e-05, + "loss": 0.4244, "step": 25345 }, { - "epoch": 0.89, - "learning_rate": 4.717015689167136e-05, - "loss": 0.2646, + "epoch": 0.9136122824089091, + "grad_norm": 0.19343934953212738, + "learning_rate": 4.700631700959923e-05, + "loss": 0.4336, "step": 25350 }, { - "epoch": 0.89, - "learning_rate": 4.716884023582213e-05, - "loss": 0.3022, + "epoch": 0.9137924820701337, + "grad_norm": 0.15232184529304504, + "learning_rate": 4.700493218999063e-05, + "loss": 0.4504, "step": 25355 }, { - "epoch": 0.89, - "learning_rate": 4.716752329212512e-05, - "loss": 0.3013, + "epoch": 0.9139726817313584, + "grad_norm": 0.15530715882778168, + "learning_rate": 4.700354707056909e-05, + "loss": 0.4485, "step": 25360 }, { - "epoch": 0.89, - "learning_rate": 4.716620606059742e-05, - "loss": 0.3055, + "epoch": 0.914152881392583, + "grad_norm": 0.19562314450740814, + "learning_rate": 4.7002161651353485e-05, + "loss": 0.46, "step": 25365 }, { - "epoch": 0.89, - "learning_rate": 4.7164888541256154e-05, - "loss": 0.289, + "epoch": 0.9143330810538076, + "grad_norm": 0.17155522108078003, + "learning_rate": 4.7000775932362684e-05, + "loss": 0.4414, "step": 25370 }, { - "epoch": 0.89, - "learning_rate": 4.716357073411841e-05, - "loss": 0.3143, + "epoch": 0.9145132807150322, + "grad_norm": 0.17853830754756927, + "learning_rate": 4.699938991361558e-05, + "loss": 0.4158, "step": 25375 }, { - "epoch": 0.89, - "learning_rate": 4.7162252639201314e-05, - "loss": 0.3089, + "epoch": 0.9146934803762569, + "grad_norm": 0.17986997961997986, + "learning_rate": 4.6998003595131035e-05, + "loss": 0.4345, "step": 25380 }, { - "epoch": 0.89, - "learning_rate": 4.7160934256521974e-05, - "loss": 0.2892, + "epoch": 0.9148736800374815, + "grad_norm": 0.16720250248908997, + "learning_rate": 4.699661697692796e-05, + "loss": 0.4374, "step": 25385 }, { - "epoch": 0.89, - "learning_rate": 4.71596155860975e-05, - "loss": 0.3338, + "epoch": 0.9150538796987062, + "grad_norm": 0.18035994470119476, + "learning_rate": 4.699523005902522e-05, + "loss": 0.423, "step": 25390 }, { - "epoch": 0.89, - "learning_rate": 4.715829662794502e-05, - "loss": 0.291, + "epoch": 0.9152340793599308, + "grad_norm": 0.17745894193649292, + "learning_rate": 4.699384284144174e-05, + "loss": 0.4273, "step": 25395 }, { - "epoch": 0.89, - "learning_rate": 4.715697738208167e-05, - "loss": 0.2916, + "epoch": 0.9154142790211555, + "grad_norm": 0.1389733850955963, + "learning_rate": 4.699245532419642e-05, + "loss": 0.4206, "step": 25400 }, { - "epoch": 0.89, - "learning_rate": 4.715565784852457e-05, - "loss": 0.2713, + "epoch": 0.9155944786823801, + "grad_norm": 0.16255025565624237, + "learning_rate": 4.699106750730814e-05, + "loss": 0.4705, "step": 25405 }, { - "epoch": 0.89, - "learning_rate": 4.715433802729085e-05, - "loss": 0.3044, + "epoch": 0.9157746783436047, + "grad_norm": 0.16465483605861664, + "learning_rate": 4.6989679390795826e-05, + "loss": 0.4509, "step": 25410 }, { - "epoch": 0.89, - "learning_rate": 4.715301791839765e-05, - "loss": 0.3076, + "epoch": 0.9159548780048293, + "grad_norm": 0.16897955536842346, + "learning_rate": 4.6988290974678384e-05, + "loss": 0.4106, "step": 25415 }, { - "epoch": 0.89, - "learning_rate": 4.7151697521862104e-05, - "loss": 0.3235, + "epoch": 0.916135077666054, + "grad_norm": 0.17163941264152527, + "learning_rate": 
4.698690225897474e-05, + "loss": 0.4134, "step": 25420 }, { - "epoch": 0.89, - "learning_rate": 4.715037683770138e-05, - "loss": 0.362, + "epoch": 0.9163152773272786, + "grad_norm": 0.16389816999435425, + "learning_rate": 4.698551324370381e-05, + "loss": 0.4388, "step": 25425 }, { - "epoch": 0.89, - "learning_rate": 4.71490558659326e-05, - "loss": 0.2861, + "epoch": 0.9164954769885033, + "grad_norm": 0.17674943804740906, + "learning_rate": 4.698412392888452e-05, + "loss": 0.4547, "step": 25430 }, { - "epoch": 0.89, - "learning_rate": 4.714773460657293e-05, - "loss": 0.2833, + "epoch": 0.9166756766497279, + "grad_norm": 0.1594432145357132, + "learning_rate": 4.698273431453579e-05, + "loss": 0.4396, "step": 25435 }, { - "epoch": 0.9, - "learning_rate": 4.7146413059639524e-05, - "loss": 0.3223, + "epoch": 0.9168558763109526, + "grad_norm": 0.17538687586784363, + "learning_rate": 4.6981344400676566e-05, + "loss": 0.4724, "step": 25440 }, { - "epoch": 0.9, - "learning_rate": 4.714509122514953e-05, - "loss": 0.2809, + "epoch": 0.9170360759721772, + "grad_norm": 0.1810724139213562, + "learning_rate": 4.697995418732578e-05, + "loss": 0.4075, "step": 25445 }, { - "epoch": 0.9, - "learning_rate": 4.7143769103120125e-05, - "loss": 0.3196, + "epoch": 0.9172162756334018, + "grad_norm": 0.15371228754520416, + "learning_rate": 4.6978563674502375e-05, + "loss": 0.4578, "step": 25450 }, { - "epoch": 0.9, - "learning_rate": 4.714244669356848e-05, - "loss": 0.2941, + "epoch": 0.9173964752946264, + "grad_norm": 0.14616931974887848, + "learning_rate": 4.6977172862225294e-05, + "loss": 0.4127, "step": 25455 }, { - "epoch": 0.9, - "learning_rate": 4.714112399651175e-05, - "loss": 0.3171, + "epoch": 0.917576674955851, + "grad_norm": 0.19712692499160767, + "learning_rate": 4.697578175051348e-05, + "loss": 0.4611, "step": 25460 }, { - "epoch": 0.9, - "learning_rate": 4.713980101196712e-05, - "loss": 0.2729, + "epoch": 0.9177568746170757, + "grad_norm": 0.16714803874492645, + "learning_rate": 4.69743903393859e-05, + "loss": 0.4397, "step": 25465 }, { - "epoch": 0.9, - "learning_rate": 4.713847773995176e-05, - "loss": 0.2862, + "epoch": 0.9179370742783004, + "grad_norm": 0.15617181360721588, + "learning_rate": 4.6972998628861506e-05, + "loss": 0.3992, "step": 25470 }, { - "epoch": 0.9, - "learning_rate": 4.7137154180482854e-05, - "loss": 0.2835, + "epoch": 0.918117273939525, + "grad_norm": 0.17703229188919067, + "learning_rate": 4.697160661895927e-05, + "loss": 0.4722, "step": 25475 }, { - "epoch": 0.9, - "learning_rate": 4.71358303335776e-05, - "loss": 0.31, + "epoch": 0.9182974736007496, + "grad_norm": 0.17935994267463684, + "learning_rate": 4.6970214309698134e-05, + "loss": 0.4434, "step": 25480 }, { - "epoch": 0.9, - "learning_rate": 4.713450619925317e-05, - "loss": 0.2939, + "epoch": 0.9184776732619743, + "grad_norm": 0.22057127952575684, + "learning_rate": 4.6968821701097086e-05, + "loss": 0.4237, "step": 25485 }, { - "epoch": 0.9, - "learning_rate": 4.713318177752677e-05, - "loss": 0.2882, + "epoch": 0.9186578729231989, + "grad_norm": 0.17813293635845184, + "learning_rate": 4.69674287931751e-05, + "loss": 0.4612, "step": 25490 }, { - "epoch": 0.9, - "learning_rate": 4.713185706841559e-05, - "loss": 0.3074, + "epoch": 0.9188380725844235, + "grad_norm": 0.20391541719436646, + "learning_rate": 4.696603558595115e-05, + "loss": 0.4252, "step": 25495 }, { - "epoch": 0.9, - "learning_rate": 4.7130532071936814e-05, - "loss": 0.3055, + "epoch": 0.9190182722456481, + "grad_norm": 0.1561935395002365, + "learning_rate": 
4.696464207944421e-05, + "loss": 0.4011, "step": 25500 }, { - "epoch": 0.9, - "eval_loss": 0.29553505778312683, - "eval_runtime": 10.5452, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 0.9190182722456481, + "eval_loss": 0.4568355083465576, + "eval_runtime": 3.5356, + "eval_samples_per_second": 28.284, + "eval_steps_per_second": 7.071, "step": 25500 }, { - "epoch": 0.9, - "learning_rate": 4.712920678810768e-05, - "loss": 0.2973, + "epoch": 0.9191984719068728, + "grad_norm": 0.18729767203330994, + "learning_rate": 4.696324827367328e-05, + "loss": 0.4199, "step": 25505 }, { - "epoch": 0.9, - "learning_rate": 4.712788121694538e-05, - "loss": 0.2921, + "epoch": 0.9193786715680975, + "grad_norm": 0.19635900855064392, + "learning_rate": 4.696185416865734e-05, + "loss": 0.4135, "step": 25510 }, { - "epoch": 0.9, - "learning_rate": 4.712655535846713e-05, - "loss": 0.2836, + "epoch": 0.9195588712293221, + "grad_norm": 0.1535876840353012, + "learning_rate": 4.6960459764415386e-05, + "loss": 0.4719, "step": 25515 }, { - "epoch": 0.9, - "learning_rate": 4.712522921269012e-05, - "loss": 0.313, + "epoch": 0.9197390708905467, + "grad_norm": 0.16010642051696777, + "learning_rate": 4.695906506096643e-05, + "loss": 0.432, "step": 25520 }, { - "epoch": 0.9, - "learning_rate": 4.712390277963161e-05, - "loss": 0.3145, + "epoch": 0.9199192705517714, + "grad_norm": 0.1595637947320938, + "learning_rate": 4.6957670058329464e-05, + "loss": 0.4195, "step": 25525 }, { - "epoch": 0.9, - "learning_rate": 4.7122576059308796e-05, - "loss": 0.2959, + "epoch": 0.920099470212996, + "grad_norm": 0.15828733146190643, + "learning_rate": 4.6956274756523484e-05, + "loss": 0.4074, "step": 25530 }, { - "epoch": 0.9, - "learning_rate": 4.712124905173891e-05, - "loss": 0.3051, + "epoch": 0.9202796698742206, + "grad_norm": 0.16714178025722504, + "learning_rate": 4.695487915556752e-05, + "loss": 0.4126, "step": 25535 }, { - "epoch": 0.9, - "learning_rate": 4.711992175693918e-05, - "loss": 0.3089, + "epoch": 0.9204598695354452, + "grad_norm": 0.1568571776151657, + "learning_rate": 4.695348325548057e-05, + "loss": 0.451, "step": 25540 }, { - "epoch": 0.9, - "learning_rate": 4.7118594174926856e-05, - "loss": 0.325, + "epoch": 0.92064006919667, + "grad_norm": 0.1514507383108139, + "learning_rate": 4.695208705628167e-05, + "loss": 0.4052, "step": 25545 }, { - "epoch": 0.9, - "learning_rate": 4.711726630571915e-05, - "loss": 0.3128, + "epoch": 0.9208202688578946, + "grad_norm": 0.15713444352149963, + "learning_rate": 4.695069055798983e-05, + "loss": 0.4466, "step": 25550 }, { - "epoch": 0.9, - "learning_rate": 4.711593814933333e-05, - "loss": 0.3181, + "epoch": 0.9210004685191192, + "grad_norm": 0.1931026428937912, + "learning_rate": 4.694929376062408e-05, + "loss": 0.4462, "step": 25555 }, { - "epoch": 0.9, - "learning_rate": 4.7114609705786614e-05, - "loss": 0.2796, + "epoch": 0.9211806681803438, + "grad_norm": 0.1435370147228241, + "learning_rate": 4.694789666420345e-05, + "loss": 0.4197, "step": 25560 }, { - "epoch": 0.9, - "learning_rate": 4.711328097509627e-05, - "loss": 0.3029, + "epoch": 0.9213608678415685, + "grad_norm": 0.15287569165229797, + "learning_rate": 4.694649926874698e-05, + "loss": 0.4207, "step": 25565 }, { - "epoch": 0.9, - "learning_rate": 4.711195195727955e-05, - "loss": 0.2966, + "epoch": 0.9215410675027931, + "grad_norm": 0.17821337282657623, + "learning_rate": 4.694510157427371e-05, + "loss": 0.4436, "step": 25570 }, { - "epoch": 0.9, - "learning_rate": 4.7110622652353696e-05, - 
"loss": 0.3051, + "epoch": 0.9217212671640177, + "grad_norm": 0.1588643342256546, + "learning_rate": 4.694370358080267e-05, + "loss": 0.4427, "step": 25575 }, { - "epoch": 0.9, - "learning_rate": 4.7109293060335994e-05, - "loss": 0.3295, + "epoch": 0.9219014668252423, + "grad_norm": 0.2273692488670349, + "learning_rate": 4.6942305288352926e-05, + "loss": 0.4394, "step": 25580 }, { - "epoch": 0.9, - "learning_rate": 4.710796318124368e-05, - "loss": 0.2901, + "epoch": 0.922081666486467, + "grad_norm": 0.1667856127023697, + "learning_rate": 4.694090669694351e-05, + "loss": 0.4425, "step": 25585 }, { - "epoch": 0.9, - "learning_rate": 4.7106633015094037e-05, - "loss": 0.3316, + "epoch": 0.9222618661476917, + "grad_norm": 0.17408248782157898, + "learning_rate": 4.69395078065935e-05, + "loss": 0.4393, "step": 25590 }, { - "epoch": 0.9, - "learning_rate": 4.710530256190433e-05, - "loss": 0.2916, + "epoch": 0.9224420658089163, + "grad_norm": 0.17394454777240753, + "learning_rate": 4.693810861732194e-05, + "loss": 0.4212, "step": 25595 }, { - "epoch": 0.9, - "learning_rate": 4.7103971821691845e-05, - "loss": 0.2884, + "epoch": 0.9226222654701409, + "grad_norm": 0.21937377750873566, + "learning_rate": 4.69367091291479e-05, + "loss": 0.4295, "step": 25600 }, { - "epoch": 0.9, - "learning_rate": 4.710264079447384e-05, - "loss": 0.2916, + "epoch": 0.9228024651313655, + "grad_norm": 0.17998327314853668, + "learning_rate": 4.693530934209044e-05, + "loss": 0.4028, "step": 25605 }, { - "epoch": 0.9, - "learning_rate": 4.710130948026763e-05, - "loss": 0.2877, + "epoch": 0.9229826647925902, + "grad_norm": 0.14169585704803467, + "learning_rate": 4.693390925616864e-05, + "loss": 0.4253, "step": 25610 }, { - "epoch": 0.9, - "learning_rate": 4.7099977879090464e-05, - "loss": 0.3071, + "epoch": 0.9231628644538148, + "grad_norm": 0.16097652912139893, + "learning_rate": 4.693250887140157e-05, + "loss": 0.4052, "step": 25615 }, { - "epoch": 0.9, - "learning_rate": 4.7098645990959654e-05, - "loss": 0.3166, + "epoch": 0.9233430641150394, + "grad_norm": 0.14775985479354858, + "learning_rate": 4.6931108187808316e-05, + "loss": 0.4093, "step": 25620 }, { - "epoch": 0.9, - "learning_rate": 4.7097313815892495e-05, - "loss": 0.3184, + "epoch": 0.9235232637762641, + "grad_norm": 0.1724456250667572, + "learning_rate": 4.692970720540796e-05, + "loss": 0.4383, "step": 25625 }, { - "epoch": 0.9, - "learning_rate": 4.7095981353906275e-05, - "loss": 0.2892, + "epoch": 0.9237034634374888, + "grad_norm": 0.14798258244991302, + "learning_rate": 4.6928305924219587e-05, + "loss": 0.441, "step": 25630 }, { - "epoch": 0.9, - "learning_rate": 4.70946486050183e-05, - "loss": 0.2976, + "epoch": 0.9238836630987134, + "grad_norm": 0.16991879045963287, + "learning_rate": 4.692690434426229e-05, + "loss": 0.4178, "step": 25635 }, { - "epoch": 0.9, - "learning_rate": 4.7093315569245876e-05, - "loss": 0.3174, + "epoch": 0.924063862759938, + "grad_norm": 0.1819402128458023, + "learning_rate": 4.692550246555517e-05, + "loss": 0.4375, "step": 25640 }, { - "epoch": 0.9, - "learning_rate": 4.7091982246606304e-05, - "loss": 0.3334, + "epoch": 0.9242440624211626, + "grad_norm": 0.17619052529335022, + "learning_rate": 4.692410028811732e-05, + "loss": 0.4423, "step": 25645 }, { - "epoch": 0.9, - "learning_rate": 4.709064863711691e-05, - "loss": 0.3152, + "epoch": 0.9244242620823873, + "grad_norm": 0.17652717232704163, + "learning_rate": 4.692269781196785e-05, + "loss": 0.4498, "step": 25650 }, { - "epoch": 0.9, - "learning_rate": 4.708931474079499e-05, - 
"loss": 0.2769, + "epoch": 0.9246044617436119, + "grad_norm": 0.1433330625295639, + "learning_rate": 4.692129503712587e-05, + "loss": 0.4225, "step": 25655 }, { - "epoch": 0.9, - "learning_rate": 4.708798055765788e-05, - "loss": 0.3005, + "epoch": 0.9247846614048365, + "grad_norm": 0.18287613987922668, + "learning_rate": 4.6919891963610485e-05, + "loss": 0.4516, "step": 25660 }, { - "epoch": 0.9, - "learning_rate": 4.70866460877229e-05, - "loss": 0.3029, + "epoch": 0.9249648610660612, + "grad_norm": 0.17915308475494385, + "learning_rate": 4.691848859144081e-05, + "loss": 0.4276, "step": 25665 }, { - "epoch": 0.9, - "learning_rate": 4.7085311331007375e-05, - "loss": 0.2968, + "epoch": 0.9251450607272859, + "grad_norm": 0.16471678018569946, + "learning_rate": 4.691708492063598e-05, + "loss": 0.4445, "step": 25670 }, { - "epoch": 0.9, - "learning_rate": 4.708397628752864e-05, - "loss": 0.2918, + "epoch": 0.9253252603885105, + "grad_norm": 0.18665508925914764, + "learning_rate": 4.6915680951215114e-05, + "loss": 0.4602, "step": 25675 }, { - "epoch": 0.9, - "learning_rate": 4.7082640957304025e-05, - "loss": 0.3066, + "epoch": 0.9255054600497351, + "grad_norm": 0.1585237830877304, + "learning_rate": 4.6914276683197334e-05, + "loss": 0.4243, "step": 25680 }, { - "epoch": 0.9, - "learning_rate": 4.7081305340350875e-05, - "loss": 0.3111, + "epoch": 0.9256856597109597, + "grad_norm": 0.1394348442554474, + "learning_rate": 4.6912872116601776e-05, + "loss": 0.3881, "step": 25685 }, { - "epoch": 0.9, - "learning_rate": 4.707996943668652e-05, - "loss": 0.2707, + "epoch": 0.9258658593721844, + "grad_norm": 0.14394313097000122, + "learning_rate": 4.6911467251447574e-05, + "loss": 0.424, "step": 25690 }, { - "epoch": 0.9, - "learning_rate": 4.707863324632832e-05, - "loss": 0.3182, + "epoch": 0.926046059033409, + "grad_norm": 0.20186100900173187, + "learning_rate": 4.691006208775388e-05, + "loss": 0.4345, "step": 25695 }, { - "epoch": 0.9, - "learning_rate": 4.707729676929361e-05, - "loss": 0.2845, + "epoch": 0.9262262586946337, + "grad_norm": 0.16559197008609772, + "learning_rate": 4.690865662553983e-05, + "loss": 0.4314, "step": 25700 }, { - "epoch": 0.9, - "learning_rate": 4.7075960005599754e-05, - "loss": 0.289, + "epoch": 0.9264064583558583, + "grad_norm": 0.2052149623632431, + "learning_rate": 4.690725086482457e-05, + "loss": 0.4128, "step": 25705 }, { - "epoch": 0.9, - "learning_rate": 4.707462295526411e-05, - "loss": 0.294, + "epoch": 0.926586658017083, + "grad_norm": 0.1739431619644165, + "learning_rate": 4.690584480562726e-05, + "loss": 0.3859, "step": 25710 }, { - "epoch": 0.9, - "learning_rate": 4.7073285618304034e-05, - "loss": 0.2875, + "epoch": 0.9267668576783076, + "grad_norm": 0.17957325279712677, + "learning_rate": 4.6904438447967064e-05, + "loss": 0.429, "step": 25715 }, { - "epoch": 0.9, - "learning_rate": 4.7071947994736887e-05, - "loss": 0.2998, + "epoch": 0.9269470573395322, + "grad_norm": 0.16392971575260162, + "learning_rate": 4.690303179186313e-05, + "loss": 0.4385, "step": 25720 }, { - "epoch": 0.91, - "learning_rate": 4.707061008458004e-05, - "loss": 0.3073, + "epoch": 0.9271272570007568, + "grad_norm": 0.20912306010723114, + "learning_rate": 4.690162483733462e-05, + "loss": 0.4398, "step": 25725 }, { - "epoch": 0.91, - "learning_rate": 4.706927188785087e-05, - "loss": 0.2987, + "epoch": 0.9273074566619814, + "grad_norm": 0.18170976638793945, + "learning_rate": 4.690021758440072e-05, + "loss": 0.4031, "step": 25730 }, { - "epoch": 0.91, - "learning_rate": 4.706793340456674e-05, - 
"loss": 0.3124, + "epoch": 0.9274876563232061, + "grad_norm": 0.24062508344650269, + "learning_rate": 4.68988100330806e-05, + "loss": 0.4034, "step": 25735 }, { - "epoch": 0.91, - "learning_rate": 4.7066594634745055e-05, - "loss": 0.2986, + "epoch": 0.9276678559844308, + "grad_norm": 0.14883898198604584, + "learning_rate": 4.689740218339342e-05, + "loss": 0.4644, "step": 25740 }, { - "epoch": 0.91, - "learning_rate": 4.7065255578403165e-05, - "loss": 0.3156, + "epoch": 0.9278480556456554, + "grad_norm": 0.19792114198207855, + "learning_rate": 4.689599403535839e-05, + "loss": 0.4129, "step": 25745 }, { - "epoch": 0.91, - "learning_rate": 4.706391623555848e-05, - "loss": 0.3243, + "epoch": 0.92802825530688, + "grad_norm": 0.16648323833942413, + "learning_rate": 4.6894585588994676e-05, + "loss": 0.4379, "step": 25750 }, { - "epoch": 0.91, - "learning_rate": 4.7062576606228385e-05, - "loss": 0.2935, + "epoch": 0.9282084549681047, + "grad_norm": 0.18078407645225525, + "learning_rate": 4.689317684432147e-05, + "loss": 0.4586, "step": 25755 }, { - "epoch": 0.91, - "learning_rate": 4.706123669043027e-05, - "loss": 0.2948, + "epoch": 0.9283886546293293, + "grad_norm": 0.14701730012893677, + "learning_rate": 4.689176780135797e-05, + "loss": 0.4344, "step": 25760 }, { - "epoch": 0.91, - "learning_rate": 4.705989648818154e-05, - "loss": 0.3191, + "epoch": 0.9285688542905539, + "grad_norm": 0.1665094792842865, + "learning_rate": 4.689035846012336e-05, + "loss": 0.4549, "step": 25765 }, { - "epoch": 0.91, - "learning_rate": 4.705855599949959e-05, - "loss": 0.3178, + "epoch": 0.9287490539517785, + "grad_norm": 0.19282084703445435, + "learning_rate": 4.688894882063687e-05, + "loss": 0.4313, "step": 25770 }, { - "epoch": 0.91, - "learning_rate": 4.705721522440182e-05, - "loss": 0.2991, + "epoch": 0.9289292536130032, + "grad_norm": 0.18455705046653748, + "learning_rate": 4.688753888291768e-05, + "loss": 0.4461, "step": 25775 }, { - "epoch": 0.91, - "learning_rate": 4.705587416290565e-05, - "loss": 0.3112, + "epoch": 0.9291094532742279, + "grad_norm": 0.1790475845336914, + "learning_rate": 4.688612864698502e-05, + "loss": 0.4302, "step": 25780 }, { - "epoch": 0.91, - "learning_rate": 4.7054532815028494e-05, - "loss": 0.3057, + "epoch": 0.9292896529354525, + "grad_norm": 0.16020038723945618, + "learning_rate": 4.6884718112858085e-05, + "loss": 0.4316, "step": 25785 }, { - "epoch": 0.91, - "learning_rate": 4.705319118078776e-05, - "loss": 0.3145, + "epoch": 0.9294698525966771, + "grad_norm": 0.22309835255146027, + "learning_rate": 4.688330728055611e-05, + "loss": 0.4452, "step": 25790 }, { - "epoch": 0.91, - "learning_rate": 4.7051849260200874e-05, - "loss": 0.2953, + "epoch": 0.9296500522579018, + "grad_norm": 0.1724500060081482, + "learning_rate": 4.68818961500983e-05, + "loss": 0.4117, "step": 25795 }, { - "epoch": 0.91, - "learning_rate": 4.7050507053285256e-05, - "loss": 0.3187, + "epoch": 0.9298302519191264, + "grad_norm": 0.1488490104675293, + "learning_rate": 4.68804847215039e-05, + "loss": 0.4567, "step": 25800 }, { - "epoch": 0.91, - "learning_rate": 4.704916456005834e-05, - "loss": 0.3019, + "epoch": 0.930010451580351, + "grad_norm": 0.18638138473033905, + "learning_rate": 4.6879072994792126e-05, + "loss": 0.4115, "step": 25805 }, { - "epoch": 0.91, - "learning_rate": 4.704782178053755e-05, - "loss": 0.3052, + "epoch": 0.9301906512415756, + "grad_norm": 0.1752612441778183, + "learning_rate": 4.687766096998223e-05, + "loss": 0.4133, "step": 25810 }, { - "epoch": 0.91, - "learning_rate": 
4.704647871474033e-05, - "loss": 0.3106, + "epoch": 0.9303708509028003, + "grad_norm": 0.17065556347370148, + "learning_rate": 4.6876248647093424e-05, + "loss": 0.4351, "step": 25815 }, { - "epoch": 0.91, - "learning_rate": 4.70451353626841e-05, - "loss": 0.3061, + "epoch": 0.930551050564025, + "grad_norm": 0.15603967010974884, + "learning_rate": 4.687483602614497e-05, + "loss": 0.4381, "step": 25820 }, { - "epoch": 0.91, - "learning_rate": 4.7043791724386324e-05, - "loss": 0.2958, + "epoch": 0.9307312502252496, + "grad_norm": 0.1808996945619583, + "learning_rate": 4.687342310715612e-05, + "loss": 0.4155, "step": 25825 }, { - "epoch": 0.91, - "learning_rate": 4.7042447799864444e-05, - "loss": 0.284, + "epoch": 0.9309114498864742, + "grad_norm": 0.17924173176288605, + "learning_rate": 4.687200989014611e-05, + "loss": 0.4348, "step": 25830 }, { - "epoch": 0.91, - "learning_rate": 4.70411035891359e-05, - "loss": 0.3004, + "epoch": 0.9310916495476989, + "grad_norm": 0.17122434079647064, + "learning_rate": 4.687059637513419e-05, + "loss": 0.4378, "step": 25835 }, { - "epoch": 0.91, - "learning_rate": 4.703975909221815e-05, - "loss": 0.3132, + "epoch": 0.9312718492089235, + "grad_norm": 0.1833307445049286, + "learning_rate": 4.686918256213964e-05, + "loss": 0.4553, "step": 25840 }, { - "epoch": 0.91, - "learning_rate": 4.703841430912865e-05, - "loss": 0.3095, + "epoch": 0.9314520488701481, + "grad_norm": 0.14123891294002533, + "learning_rate": 4.6867768451181706e-05, + "loss": 0.4414, "step": 25845 }, { - "epoch": 0.91, - "learning_rate": 4.703706923988488e-05, - "loss": 0.3004, + "epoch": 0.9316322485313727, + "grad_norm": 0.18901309370994568, + "learning_rate": 4.6866354042279666e-05, + "loss": 0.4157, "step": 25850 }, { - "epoch": 0.91, - "learning_rate": 4.703572388450427e-05, - "loss": 0.3101, + "epoch": 0.9318124481925975, + "grad_norm": 0.1698739230632782, + "learning_rate": 4.686493933545278e-05, + "loss": 0.4427, "step": 25855 }, { - "epoch": 0.91, - "learning_rate": 4.703437824300432e-05, - "loss": 0.3068, + "epoch": 0.9319926478538221, + "grad_norm": 0.14457178115844727, + "learning_rate": 4.686352433072033e-05, + "loss": 0.4243, "step": 25860 }, { - "epoch": 0.91, - "learning_rate": 4.703303231540248e-05, - "loss": 0.2825, + "epoch": 0.9321728475150467, + "grad_norm": 0.18088513612747192, + "learning_rate": 4.6862109028101596e-05, + "loss": 0.4398, "step": 25865 }, { - "epoch": 0.91, - "learning_rate": 4.7031686101716244e-05, - "loss": 0.3267, + "epoch": 0.9323530471762713, + "grad_norm": 0.17031700909137726, + "learning_rate": 4.686069342761585e-05, + "loss": 0.4415, "step": 25870 }, { - "epoch": 0.91, - "learning_rate": 4.703033960196308e-05, - "loss": 0.2971, + "epoch": 0.9325332468374959, + "grad_norm": 0.16327226161956787, + "learning_rate": 4.6859277529282406e-05, + "loss": 0.4316, "step": 25875 }, { - "epoch": 0.91, - "learning_rate": 4.702899281616048e-05, - "loss": 0.3243, + "epoch": 0.9327134464987206, + "grad_norm": 0.15043266117572784, + "learning_rate": 4.6857861333120525e-05, + "loss": 0.4595, "step": 25880 }, { - "epoch": 0.91, - "learning_rate": 4.702764574432592e-05, - "loss": 0.3012, + "epoch": 0.9328936461599452, + "grad_norm": 0.1847275346517563, + "learning_rate": 4.685644483914952e-05, + "loss": 0.4269, "step": 25885 }, { - "epoch": 0.91, - "learning_rate": 4.70262983864769e-05, - "loss": 0.3297, + "epoch": 0.9330738458211698, + "grad_norm": 0.1813531517982483, + "learning_rate": 4.685502804738868e-05, + "loss": 0.4171, "step": 25890 }, { - "epoch": 0.91, - 
"learning_rate": 4.7024950742630906e-05, - "loss": 0.2954, + "epoch": 0.9332540454823945, + "grad_norm": 0.16259461641311646, + "learning_rate": 4.685361095785732e-05, + "loss": 0.4319, "step": 25895 }, { - "epoch": 0.91, - "learning_rate": 4.7023602812805444e-05, - "loss": 0.2899, + "epoch": 0.9334342451436192, + "grad_norm": 0.17456987500190735, + "learning_rate": 4.685219357057474e-05, + "loss": 0.4362, "step": 25900 }, { - "epoch": 0.91, - "learning_rate": 4.702225459701802e-05, - "loss": 0.319, + "epoch": 0.9336144448048438, + "grad_norm": 0.1771409511566162, + "learning_rate": 4.6850775885560255e-05, + "loss": 0.4237, "step": 25905 }, { - "epoch": 0.91, - "learning_rate": 4.702090609528612e-05, - "loss": 0.3112, + "epoch": 0.9337946444660684, + "grad_norm": 0.14823564887046814, + "learning_rate": 4.684935790283318e-05, + "loss": 0.4253, "step": 25910 }, { - "epoch": 0.91, - "learning_rate": 4.7019557307627284e-05, - "loss": 0.2806, + "epoch": 0.933974844127293, + "grad_norm": 0.17045463621616364, + "learning_rate": 4.684793962241283e-05, + "loss": 0.4249, "step": 25915 }, { - "epoch": 0.91, - "learning_rate": 4.701820823405899e-05, - "loss": 0.2969, + "epoch": 0.9341550437885177, + "grad_norm": 0.2057085633277893, + "learning_rate": 4.684652104431852e-05, + "loss": 0.4473, "step": 25920 }, { - "epoch": 0.91, - "learning_rate": 4.7016858874598786e-05, - "loss": 0.2954, + "epoch": 0.9343352434497423, + "grad_norm": 0.1884680688381195, + "learning_rate": 4.684510216856961e-05, + "loss": 0.4832, "step": 25925 }, { - "epoch": 0.91, - "learning_rate": 4.7015509229264174e-05, - "loss": 0.3091, + "epoch": 0.9345154431109669, + "grad_norm": 0.15199509263038635, + "learning_rate": 4.684368299518541e-05, + "loss": 0.4648, "step": 25930 }, { - "epoch": 0.91, - "learning_rate": 4.701415929807268e-05, - "loss": 0.3093, + "epoch": 0.9346956427721916, + "grad_norm": 0.2007426768541336, + "learning_rate": 4.684226352418525e-05, + "loss": 0.465, "step": 25935 }, { - "epoch": 0.91, - "learning_rate": 4.7012809081041845e-05, - "loss": 0.2918, + "epoch": 0.9348758424334163, + "grad_norm": 0.19974127411842346, + "learning_rate": 4.684084375558848e-05, + "loss": 0.4523, "step": 25940 }, { - "epoch": 0.91, - "learning_rate": 4.701145857818918e-05, - "loss": 0.2836, + "epoch": 0.9350560420946409, + "grad_norm": 0.1712539941072464, + "learning_rate": 4.6839423689414455e-05, + "loss": 0.4529, "step": 25945 }, { - "epoch": 0.91, - "learning_rate": 4.7010107789532236e-05, - "loss": 0.2904, + "epoch": 0.9352362417558655, + "grad_norm": 0.19021165370941162, + "learning_rate": 4.68380033256825e-05, + "loss": 0.409, "step": 25950 }, { - "epoch": 0.91, - "learning_rate": 4.7008756715088554e-05, - "loss": 0.3015, + "epoch": 0.9354164414170901, + "grad_norm": 0.1501091569662094, + "learning_rate": 4.6836582664411975e-05, + "loss": 0.4259, "step": 25955 }, { - "epoch": 0.91, - "learning_rate": 4.700740535487566e-05, - "loss": 0.3074, + "epoch": 0.9355966410783148, + "grad_norm": 0.16499082744121552, + "learning_rate": 4.683516170562224e-05, + "loss": 0.4451, "step": 25960 }, { - "epoch": 0.91, - "learning_rate": 4.7006053708911115e-05, - "loss": 0.2716, + "epoch": 0.9357768407395394, + "grad_norm": 0.12097856402397156, + "learning_rate": 4.683374044933266e-05, + "loss": 0.4216, "step": 25965 }, { - "epoch": 0.91, - "learning_rate": 4.7004701777212465e-05, - "loss": 0.3119, + "epoch": 0.935957040400764, + "grad_norm": 0.17843008041381836, + "learning_rate": 4.683231889556259e-05, + "loss": 0.4188, "step": 25970 }, { - 
"epoch": 0.91, - "learning_rate": 4.700334955979726e-05, - "loss": 0.3131, + "epoch": 0.9361372400619887, + "grad_norm": 0.18024946749210358, + "learning_rate": 4.68308970443314e-05, + "loss": 0.4292, "step": 25975 }, { - "epoch": 0.91, - "learning_rate": 4.700199705668307e-05, - "loss": 0.2855, + "epoch": 0.9363174397232134, + "grad_norm": 0.18790417909622192, + "learning_rate": 4.6829474895658464e-05, + "loss": 0.4282, "step": 25980 }, { - "epoch": 0.91, - "learning_rate": 4.700064426788744e-05, - "loss": 0.3164, + "epoch": 0.936497639384438, + "grad_norm": 0.16022486984729767, + "learning_rate": 4.682805244956316e-05, + "loss": 0.4387, "step": 25985 }, { - "epoch": 0.91, - "learning_rate": 4.6999291193427954e-05, - "loss": 0.2818, + "epoch": 0.9366778390456626, + "grad_norm": 0.15988902747631073, + "learning_rate": 4.682662970606487e-05, + "loss": 0.4369, "step": 25990 }, { - "epoch": 0.91, - "learning_rate": 4.699793783332216e-05, - "loss": 0.3179, + "epoch": 0.9368580387068872, + "grad_norm": 0.1928686499595642, + "learning_rate": 4.682520666518297e-05, + "loss": 0.4324, "step": 25995 }, { - "epoch": 0.91, - "learning_rate": 4.699658418758764e-05, - "loss": 0.2999, + "epoch": 0.9370382383681118, + "grad_norm": 0.18302272260189056, + "learning_rate": 4.682378332693686e-05, + "loss": 0.4272, "step": 26000 }, { - "epoch": 0.91, - "eval_loss": 0.2956346273422241, - "eval_runtime": 10.5342, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 0.9370382383681118, + "eval_loss": 0.45559266209602356, + "eval_runtime": 3.5347, + "eval_samples_per_second": 28.291, + "eval_steps_per_second": 7.073, "step": 26000 }, { - "epoch": 0.91, - "learning_rate": 4.6995230256241984e-05, - "loss": 0.3159, + "epoch": 0.9372184380293365, + "grad_norm": 0.15167242288589478, + "learning_rate": 4.682235969134593e-05, + "loss": 0.4238, "step": 26005 }, { - "epoch": 0.92, - "learning_rate": 4.699387603930274e-05, - "loss": 0.3136, + "epoch": 0.9373986376905611, + "grad_norm": 0.16070915758609772, + "learning_rate": 4.682093575842957e-05, + "loss": 0.4412, "step": 26010 }, { - "epoch": 0.92, - "learning_rate": 4.6992521536787525e-05, - "loss": 0.3187, + "epoch": 0.9375788373517858, + "grad_norm": 0.15172840654850006, + "learning_rate": 4.681951152820718e-05, + "loss": 0.4573, "step": 26015 }, { - "epoch": 0.92, - "learning_rate": 4.699116674871391e-05, - "loss": 0.3006, + "epoch": 0.9377590370130104, + "grad_norm": 0.1980069875717163, + "learning_rate": 4.6818087000698175e-05, + "loss": 0.4407, "step": 26020 }, { - "epoch": 0.92, - "learning_rate": 4.698981167509948e-05, - "loss": 0.3087, + "epoch": 0.9379392366742351, + "grad_norm": 0.17154526710510254, + "learning_rate": 4.6816662175921965e-05, + "loss": 0.4435, "step": 26025 }, { - "epoch": 0.92, - "learning_rate": 4.6988456315961835e-05, - "loss": 0.2858, + "epoch": 0.9381194363354597, + "grad_norm": 0.21349559724330902, + "learning_rate": 4.6815237053897946e-05, + "loss": 0.4316, "step": 26030 }, { - "epoch": 0.92, - "learning_rate": 4.698710067131859e-05, - "loss": 0.3003, + "epoch": 0.9382996359966843, + "grad_norm": 0.2062828242778778, + "learning_rate": 4.6813811634645554e-05, + "loss": 0.4443, "step": 26035 }, { - "epoch": 0.92, - "learning_rate": 4.6985744741187324e-05, - "loss": 0.2924, + "epoch": 0.9384798356579089, + "grad_norm": 0.16467832028865814, + "learning_rate": 4.68123859181842e-05, + "loss": 0.4363, "step": 26040 }, { - "epoch": 0.92, - "learning_rate": 4.698438852558564e-05, - "loss": 0.3158, + "epoch": 
0.9386600353191336, + "grad_norm": 0.16752782464027405, + "learning_rate": 4.68109599045333e-05, + "loss": 0.4136, "step": 26045 }, { - "epoch": 0.92, - "learning_rate": 4.6983032024531173e-05, - "loss": 0.3382, + "epoch": 0.9388402349803583, + "grad_norm": 0.2054414600133896, + "learning_rate": 4.6809533593712305e-05, + "loss": 0.4687, "step": 26050 }, { - "epoch": 0.92, - "learning_rate": 4.698167523804152e-05, - "loss": 0.2996, + "epoch": 0.9390204346415829, + "grad_norm": 0.20668792724609375, + "learning_rate": 4.680810698574064e-05, + "loss": 0.4421, "step": 26055 }, { - "epoch": 0.92, - "learning_rate": 4.698031816613431e-05, - "loss": 0.3003, + "epoch": 0.9392006343028075, + "grad_norm": 0.1917664259672165, + "learning_rate": 4.680668008063773e-05, + "loss": 0.4454, "step": 26060 }, { - "epoch": 0.92, - "learning_rate": 4.697896080882714e-05, - "loss": 0.2955, + "epoch": 0.9393808339640322, + "grad_norm": 0.13623978197574615, + "learning_rate": 4.680525287842303e-05, + "loss": 0.4358, "step": 26065 }, { - "epoch": 0.92, - "learning_rate": 4.697760316613765e-05, - "loss": 0.3226, + "epoch": 0.9395610336252568, + "grad_norm": 0.1284467577934265, + "learning_rate": 4.6803825379115985e-05, + "loss": 0.4677, "step": 26070 }, { - "epoch": 0.92, - "learning_rate": 4.6976245238083474e-05, - "loss": 0.3099, + "epoch": 0.9397412332864814, + "grad_norm": 0.16711989045143127, + "learning_rate": 4.680239758273604e-05, + "loss": 0.4238, "step": 26075 }, { - "epoch": 0.92, - "learning_rate": 4.6974887024682236e-05, - "loss": 0.3085, + "epoch": 0.939921432947706, + "grad_norm": 0.1910637617111206, + "learning_rate": 4.6800969489302646e-05, + "loss": 0.4641, "step": 26080 }, { - "epoch": 0.92, - "learning_rate": 4.697352852595156e-05, - "loss": 0.3156, + "epoch": 0.9401016326089306, + "grad_norm": 0.16136784851551056, + "learning_rate": 4.6799541098835264e-05, + "loss": 0.4006, "step": 26085 }, { - "epoch": 0.92, - "learning_rate": 4.697216974190911e-05, - "loss": 0.3053, + "epoch": 0.9402818322701554, + "grad_norm": 0.1435597538948059, + "learning_rate": 4.679811241135335e-05, + "loss": 0.4049, "step": 26090 }, { - "epoch": 0.92, - "learning_rate": 4.6970810672572514e-05, - "loss": 0.2997, + "epoch": 0.94046203193138, + "grad_norm": 0.16800041496753693, + "learning_rate": 4.679668342687638e-05, + "loss": 0.411, "step": 26095 }, { - "epoch": 0.92, - "learning_rate": 4.6969451317959415e-05, - "loss": 0.3295, + "epoch": 0.9406422315926046, + "grad_norm": 0.19089345633983612, + "learning_rate": 4.679525414542382e-05, + "loss": 0.435, "step": 26100 }, { - "epoch": 0.92, - "learning_rate": 4.696809167808748e-05, - "loss": 0.3008, + "epoch": 0.9408224312538292, + "grad_norm": 0.22108210623264313, + "learning_rate": 4.6793824567015135e-05, + "loss": 0.438, "step": 26105 }, { - "epoch": 0.92, - "learning_rate": 4.696673175297433e-05, - "loss": 0.3188, + "epoch": 0.9410026309150539, + "grad_norm": 0.15207885205745697, + "learning_rate": 4.679239469166982e-05, + "loss": 0.4272, "step": 26110 }, { - "epoch": 0.92, - "learning_rate": 4.696537154263767e-05, - "loss": 0.3141, + "epoch": 0.9411828305762785, + "grad_norm": 0.19638319313526154, + "learning_rate": 4.679096451940734e-05, + "loss": 0.4027, "step": 26115 }, { - "epoch": 0.92, - "learning_rate": 4.696401104709512e-05, - "loss": 0.293, + "epoch": 0.9413630302375031, + "grad_norm": 0.22127874195575714, + "learning_rate": 4.678953405024718e-05, + "loss": 0.4261, "step": 26120 }, { - "epoch": 0.92, - "learning_rate": 4.696265026636437e-05, - "loss": 0.2927, 
+ "epoch": 0.9415432298987277, + "grad_norm": 0.17142963409423828, + "learning_rate": 4.678810328420885e-05, + "loss": 0.4106, "step": 26125 }, { - "epoch": 0.92, - "learning_rate": 4.6961289200463077e-05, - "loss": 0.2989, + "epoch": 0.9417234295599525, + "grad_norm": 0.15688422322273254, + "learning_rate": 4.678667222131183e-05, + "loss": 0.4312, "step": 26130 }, { - "epoch": 0.92, - "learning_rate": 4.695992784940892e-05, - "loss": 0.2919, + "epoch": 0.9419036292211771, + "grad_norm": 0.1551523059606552, + "learning_rate": 4.678524086157561e-05, + "loss": 0.4178, "step": 26135 }, { - "epoch": 0.92, - "learning_rate": 4.695856621321957e-05, - "loss": 0.3212, + "epoch": 0.9420838288824017, + "grad_norm": 0.1784355640411377, + "learning_rate": 4.678380920501971e-05, + "loss": 0.4415, "step": 26140 }, { - "epoch": 0.92, - "learning_rate": 4.69572042919127e-05, - "loss": 0.3111, + "epoch": 0.9422640285436263, + "grad_norm": 0.14627361297607422, + "learning_rate": 4.6782377251663624e-05, + "loss": 0.4048, "step": 26145 }, { - "epoch": 0.92, - "learning_rate": 4.695584208550602e-05, - "loss": 0.2652, + "epoch": 0.942444228204851, + "grad_norm": 0.14835497736930847, + "learning_rate": 4.678094500152686e-05, + "loss": 0.4471, "step": 26150 }, { - "epoch": 0.92, - "learning_rate": 4.695447959401719e-05, - "loss": 0.2956, + "epoch": 0.9426244278660756, + "grad_norm": 0.15386229753494263, + "learning_rate": 4.677951245462895e-05, + "loss": 0.4297, "step": 26155 }, { - "epoch": 0.92, - "learning_rate": 4.695311681746391e-05, - "loss": 0.3111, + "epoch": 0.9428046275273002, + "grad_norm": 0.16428400576114655, + "learning_rate": 4.677807961098939e-05, + "loss": 0.4265, "step": 26160 }, { - "epoch": 0.92, - "learning_rate": 4.695175375586388e-05, - "loss": 0.3066, + "epoch": 0.9429848271885248, + "grad_norm": 0.18662281334400177, + "learning_rate": 4.677664647062771e-05, + "loss": 0.4396, "step": 26165 }, { - "epoch": 0.92, - "learning_rate": 4.695039040923479e-05, - "loss": 0.3114, + "epoch": 0.9431650268497496, + "grad_norm": 0.17339280247688293, + "learning_rate": 4.6775213033563445e-05, + "loss": 0.4346, "step": 26170 }, { - "epoch": 0.92, - "learning_rate": 4.694902677759435e-05, - "loss": 0.3438, + "epoch": 0.9433452265109742, + "grad_norm": 0.1801465004682541, + "learning_rate": 4.677377929981611e-05, + "loss": 0.4492, "step": 26175 }, { - "epoch": 0.92, - "learning_rate": 4.6947662860960257e-05, - "loss": 0.3166, + "epoch": 0.9435254261721988, + "grad_norm": 0.14782361686229706, + "learning_rate": 4.6772345269405255e-05, + "loss": 0.4003, "step": 26180 }, { - "epoch": 0.92, - "learning_rate": 4.694629865935023e-05, - "loss": 0.2927, + "epoch": 0.9437056258334234, + "grad_norm": 0.20001257956027985, + "learning_rate": 4.677091094235041e-05, + "loss": 0.4631, "step": 26185 }, { - "epoch": 0.92, - "learning_rate": 4.694493417278197e-05, - "loss": 0.3508, + "epoch": 0.9438858254946481, + "grad_norm": 0.16880610585212708, + "learning_rate": 4.6769476318671116e-05, + "loss": 0.4457, "step": 26190 }, { - "epoch": 0.92, - "learning_rate": 4.694356940127321e-05, - "loss": 0.3214, + "epoch": 0.9440660251558727, + "grad_norm": 0.18512383103370667, + "learning_rate": 4.676804139838692e-05, + "loss": 0.4106, "step": 26195 }, { - "epoch": 0.92, - "learning_rate": 4.6942204344841665e-05, - "loss": 0.3333, + "epoch": 0.9442462248170973, + "grad_norm": 0.19107241928577423, + "learning_rate": 4.6766606181517375e-05, + "loss": 0.417, "step": 26200 }, { - "epoch": 0.92, - "learning_rate": 4.694083900350505e-05, - 
"loss": 0.2991, + "epoch": 0.944426424478322, + "grad_norm": 0.18682074546813965, + "learning_rate": 4.676517066808204e-05, + "loss": 0.4276, "step": 26205 }, { - "epoch": 0.92, - "learning_rate": 4.6939473377281105e-05, - "loss": 0.2859, + "epoch": 0.9446066241395467, + "grad_norm": 0.19449475407600403, + "learning_rate": 4.676373485810046e-05, + "loss": 0.4347, "step": 26210 }, { - "epoch": 0.92, - "learning_rate": 4.6938107466187564e-05, - "loss": 0.3082, + "epoch": 0.9447868238007713, + "grad_norm": 0.16231991350650787, + "learning_rate": 4.6762298751592215e-05, + "loss": 0.4527, "step": 26215 }, { - "epoch": 0.92, - "learning_rate": 4.693674127024215e-05, - "loss": 0.2948, + "epoch": 0.9449670234619959, + "grad_norm": 0.19462081789970398, + "learning_rate": 4.676086234857686e-05, + "loss": 0.4579, "step": 26220 }, { - "epoch": 0.92, - "learning_rate": 4.69353747894626e-05, - "loss": 0.3076, + "epoch": 0.9451472231232205, + "grad_norm": 0.14375783503055573, + "learning_rate": 4.675942564907396e-05, + "loss": 0.4156, "step": 26225 }, { - "epoch": 0.92, - "learning_rate": 4.693400802386667e-05, - "loss": 0.31, + "epoch": 0.9453274227844451, + "grad_norm": 0.19837602972984314, + "learning_rate": 4.675798865310311e-05, + "loss": 0.4444, "step": 26230 }, { - "epoch": 0.92, - "learning_rate": 4.69326409734721e-05, - "loss": 0.287, + "epoch": 0.9455076224456698, + "grad_norm": 0.2307501882314682, + "learning_rate": 4.675655136068387e-05, + "loss": 0.4416, "step": 26235 }, { - "epoch": 0.92, - "learning_rate": 4.6931273638296635e-05, - "loss": 0.2953, + "epoch": 0.9456878221068944, + "grad_norm": 0.19053228199481964, + "learning_rate": 4.675511377183583e-05, + "loss": 0.4442, "step": 26240 }, { - "epoch": 0.92, - "learning_rate": 4.6929906018358046e-05, - "loss": 0.2955, + "epoch": 0.9458680217681191, + "grad_norm": 0.1901927888393402, + "learning_rate": 4.675367588657858e-05, + "loss": 0.4622, "step": 26245 }, { - "epoch": 0.92, - "learning_rate": 4.692853811367407e-05, - "loss": 0.302, + "epoch": 0.9460482214293437, + "grad_norm": 0.14448778331279755, + "learning_rate": 4.675223770493171e-05, + "loss": 0.4176, "step": 26250 }, { - "epoch": 0.92, - "learning_rate": 4.6927169924262486e-05, - "loss": 0.3084, + "epoch": 0.9462284210905684, + "grad_norm": 0.1635839194059372, + "learning_rate": 4.675079922691481e-05, + "loss": 0.4253, "step": 26255 }, { - "epoch": 0.92, - "learning_rate": 4.692580145014105e-05, - "loss": 0.3084, + "epoch": 0.946408620751793, + "grad_norm": 0.18405769765377045, + "learning_rate": 4.6749360452547485e-05, + "loss": 0.4426, "step": 26260 }, { - "epoch": 0.92, - "learning_rate": 4.6924432691327524e-05, - "loss": 0.2952, + "epoch": 0.9465888204130176, + "grad_norm": 0.15434348583221436, + "learning_rate": 4.674792138184933e-05, + "loss": 0.4832, "step": 26265 }, { - "epoch": 0.92, - "learning_rate": 4.6923063647839696e-05, - "loss": 0.2839, + "epoch": 0.9467690200742422, + "grad_norm": 0.14686670899391174, + "learning_rate": 4.674648201483995e-05, + "loss": 0.4396, "step": 26270 }, { - "epoch": 0.92, - "learning_rate": 4.692169431969533e-05, - "loss": 0.3139, + "epoch": 0.9469492197354669, + "grad_norm": 0.16869889199733734, + "learning_rate": 4.674504235153898e-05, + "loss": 0.4205, "step": 26275 }, { - "epoch": 0.92, - "learning_rate": 4.6920324706912215e-05, - "loss": 0.2949, + "epoch": 0.9471294193966915, + "grad_norm": 0.14260682463645935, + "learning_rate": 4.6743602391966004e-05, + "loss": 0.4266, "step": 26280 }, { - "epoch": 0.92, - "learning_rate": 
4.691895480950813e-05, - "loss": 0.2875, + "epoch": 0.9473096190579162, + "grad_norm": 0.19560997188091278, + "learning_rate": 4.674216213614066e-05, + "loss": 0.4319, "step": 26285 }, { - "epoch": 0.92, - "learning_rate": 4.6917584627500855e-05, - "loss": 0.2957, + "epoch": 0.9474898187191408, + "grad_norm": 0.19733212888240814, + "learning_rate": 4.674072158408257e-05, + "loss": 0.4331, "step": 26290 }, { - "epoch": 0.93, - "learning_rate": 4.6916214160908194e-05, - "loss": 0.2871, + "epoch": 0.9476700183803655, + "grad_norm": 0.1900247186422348, + "learning_rate": 4.6739280735811355e-05, + "loss": 0.4258, "step": 26295 }, { - "epoch": 0.93, - "learning_rate": 4.691484340974793e-05, - "loss": 0.2999, + "epoch": 0.9478502180415901, + "grad_norm": 0.2038991004228592, + "learning_rate": 4.6737839591346645e-05, + "loss": 0.446, "step": 26300 }, { - "epoch": 0.93, - "learning_rate": 4.691347237403787e-05, - "loss": 0.3261, + "epoch": 0.9480304177028147, + "grad_norm": 0.19306516647338867, + "learning_rate": 4.6736398150708076e-05, + "loss": 0.4093, "step": 26305 }, { - "epoch": 0.93, - "learning_rate": 4.691210105379582e-05, - "loss": 0.3266, + "epoch": 0.9482106173640393, + "grad_norm": 0.15221892297267914, + "learning_rate": 4.67349564139153e-05, + "loss": 0.403, "step": 26310 }, { - "epoch": 0.93, - "learning_rate": 4.691072944903957e-05, - "loss": 0.2979, + "epoch": 0.948390817025264, + "grad_norm": 0.14861221611499786, + "learning_rate": 4.673351438098794e-05, + "loss": 0.4573, "step": 26315 }, { - "epoch": 0.93, - "learning_rate": 4.6909357559786935e-05, - "loss": 0.3069, + "epoch": 0.9485710166864886, + "grad_norm": 0.15452301502227783, + "learning_rate": 4.673207205194566e-05, + "loss": 0.4291, "step": 26320 }, { - "epoch": 0.93, - "learning_rate": 4.6907985386055735e-05, - "loss": 0.2832, + "epoch": 0.9487512163477133, + "grad_norm": 0.1777305006980896, + "learning_rate": 4.6730629426808114e-05, + "loss": 0.4437, "step": 26325 }, { - "epoch": 0.93, - "learning_rate": 4.6906612927863786e-05, - "loss": 0.2859, + "epoch": 0.9489314160089379, + "grad_norm": 0.17141470313072205, + "learning_rate": 4.6729186505594943e-05, + "loss": 0.4247, "step": 26330 }, { - "epoch": 0.93, - "learning_rate": 4.69052401852289e-05, - "loss": 0.3004, + "epoch": 0.9491116156701626, + "grad_norm": 0.21854913234710693, + "learning_rate": 4.672774328832581e-05, + "loss": 0.4053, "step": 26335 }, { - "epoch": 0.93, - "learning_rate": 4.690386715816891e-05, - "loss": 0.2796, + "epoch": 0.9492918153313872, + "grad_norm": 0.2022477239370346, + "learning_rate": 4.6726299775020385e-05, + "loss": 0.4414, "step": 26340 }, { - "epoch": 0.93, - "learning_rate": 4.6902493846701637e-05, - "loss": 0.2625, + "epoch": 0.9494720149926118, + "grad_norm": 0.12977053225040436, + "learning_rate": 4.672485596569833e-05, + "loss": 0.4409, "step": 26345 }, { - "epoch": 0.93, - "learning_rate": 4.690112025084492e-05, - "loss": 0.2902, + "epoch": 0.9496522146538364, + "grad_norm": 0.15741246938705444, + "learning_rate": 4.672341186037932e-05, + "loss": 0.4438, "step": 26350 }, { - "epoch": 0.93, - "learning_rate": 4.689974637061658e-05, - "loss": 0.294, + "epoch": 0.949832414315061, + "grad_norm": 0.1705687791109085, + "learning_rate": 4.672196745908303e-05, + "loss": 0.407, "step": 26355 }, { - "epoch": 0.93, - "learning_rate": 4.689837220603448e-05, - "loss": 0.297, + "epoch": 0.9500126139762857, + "grad_norm": 0.17730578780174255, + "learning_rate": 4.672052276182913e-05, + "loss": 0.4412, "step": 26360 }, { - "epoch": 0.93, - 
"learning_rate": 4.689699775711644e-05, - "loss": 0.2752, + "epoch": 0.9501928136375104, + "grad_norm": 0.14052340388298035, + "learning_rate": 4.671907776863732e-05, + "loss": 0.408, "step": 26365 }, { - "epoch": 0.93, - "learning_rate": 4.6895623023880313e-05, - "loss": 0.3214, + "epoch": 0.950373013298735, + "grad_norm": 0.15693028271198273, + "learning_rate": 4.671763247952728e-05, + "loss": 0.3939, "step": 26370 }, { - "epoch": 0.93, - "learning_rate": 4.689424800634395e-05, - "loss": 0.2921, + "epoch": 0.9505532129599596, + "grad_norm": 0.19423072040081024, + "learning_rate": 4.67161868945187e-05, + "loss": 0.4108, "step": 26375 }, { - "epoch": 0.93, - "learning_rate": 4.6892872704525213e-05, - "loss": 0.2828, + "epoch": 0.9507334126211843, + "grad_norm": 0.15230786800384521, + "learning_rate": 4.671474101363128e-05, + "loss": 0.4263, "step": 26380 }, { - "epoch": 0.93, - "learning_rate": 4.6891497118441954e-05, - "loss": 0.2992, + "epoch": 0.9509136122824089, + "grad_norm": 0.19921550154685974, + "learning_rate": 4.6713294836884716e-05, + "loss": 0.4178, "step": 26385 }, { - "epoch": 0.93, - "learning_rate": 4.689012124811202e-05, - "loss": 0.3126, + "epoch": 0.9510938119436335, + "grad_norm": 0.16791215538978577, + "learning_rate": 4.671184836429871e-05, + "loss": 0.4573, "step": 26390 }, { - "epoch": 0.93, - "learning_rate": 4.68887450935533e-05, - "loss": 0.3218, + "epoch": 0.9512740116048581, + "grad_norm": 0.17958621680736542, + "learning_rate": 4.6710401595892986e-05, + "loss": 0.4352, "step": 26395 }, { - "epoch": 0.93, - "learning_rate": 4.6887368654783634e-05, - "loss": 0.3072, + "epoch": 0.9514542112660829, + "grad_norm": 0.16500048339366913, + "learning_rate": 4.670895453168724e-05, + "loss": 0.4216, "step": 26400 }, { - "epoch": 0.93, - "learning_rate": 4.6885991931820925e-05, - "loss": 0.296, + "epoch": 0.9516344109273075, + "grad_norm": 0.1619076132774353, + "learning_rate": 4.670750717170119e-05, + "loss": 0.4489, "step": 26405 }, { - "epoch": 0.93, - "learning_rate": 4.6884614924683026e-05, - "loss": 0.2905, + "epoch": 0.9518146105885321, + "grad_norm": 0.14866675436496735, + "learning_rate": 4.6706059515954546e-05, + "loss": 0.4267, "step": 26410 }, { - "epoch": 0.93, - "learning_rate": 4.688323763338783e-05, - "loss": 0.2966, + "epoch": 0.9519948102497567, + "grad_norm": 0.17370857298374176, + "learning_rate": 4.670461156446706e-05, + "loss": 0.4023, "step": 26415 }, { - "epoch": 0.93, - "learning_rate": 4.6881860057953215e-05, - "loss": 0.3035, + "epoch": 0.9521750099109814, + "grad_norm": 0.23588383197784424, + "learning_rate": 4.6703163317258436e-05, + "loss": 0.4328, "step": 26420 }, { - "epoch": 0.93, - "learning_rate": 4.688048219839707e-05, - "loss": 0.3109, + "epoch": 0.952355209572206, + "grad_norm": 0.16146144270896912, + "learning_rate": 4.670171477434841e-05, + "loss": 0.402, "step": 26425 }, { - "epoch": 0.93, - "learning_rate": 4.6879104054737276e-05, - "loss": 0.3058, + "epoch": 0.9525354092334306, + "grad_norm": 0.1492733508348465, + "learning_rate": 4.670026593575673e-05, + "loss": 0.4807, "step": 26430 }, { - "epoch": 0.93, - "learning_rate": 4.687772562699174e-05, - "loss": 0.2764, + "epoch": 0.9527156088946552, + "grad_norm": 0.15443341434001923, + "learning_rate": 4.669881680150312e-05, + "loss": 0.4296, "step": 26435 }, { - "epoch": 0.93, - "learning_rate": 4.687634691517835e-05, - "loss": 0.3059, + "epoch": 0.95289580855588, + "grad_norm": 0.14959372580051422, + "learning_rate": 4.6697367371607334e-05, + "loss": 0.4148, "step": 26440 }, { - 
"epoch": 0.93, - "learning_rate": 4.6874967919315016e-05, - "loss": 0.2983, + "epoch": 0.9530760082171046, + "grad_norm": 0.18775780498981476, + "learning_rate": 4.669591764608913e-05, + "loss": 0.4036, "step": 26445 }, { - "epoch": 0.93, - "learning_rate": 4.687358863941964e-05, - "loss": 0.3109, + "epoch": 0.9532562078783292, + "grad_norm": 0.20374846458435059, + "learning_rate": 4.669446762496823e-05, + "loss": 0.4418, "step": 26450 }, { - "epoch": 0.93, - "learning_rate": 4.6872209075510134e-05, - "loss": 0.2926, + "epoch": 0.9534364075395538, + "grad_norm": 0.17194941639900208, + "learning_rate": 4.669301730826442e-05, + "loss": 0.4616, "step": 26455 }, { - "epoch": 0.93, - "learning_rate": 4.68708292276044e-05, - "loss": 0.3072, + "epoch": 0.9536166072007785, + "grad_norm": Infinity, + "learning_rate": 4.669185684209495e-05, + "loss": 0.4184, "step": 26460 }, { - "epoch": 0.93, - "learning_rate": 4.6869449095720364e-05, - "loss": 0.2999, + "epoch": 0.9537968068620031, + "grad_norm": 0.26384392380714417, + "learning_rate": 4.669040599339167e-05, + "loss": 0.4399, "step": 26465 }, { - "epoch": 0.93, - "learning_rate": 4.686806867987594e-05, - "loss": 0.2932, + "epoch": 0.9539770065232277, + "grad_norm": 0.16087359189987183, + "learning_rate": 4.6688954849160817e-05, + "loss": 0.4367, "step": 26470 }, { - "epoch": 0.93, - "learning_rate": 4.6866687980089064e-05, - "loss": 0.3021, + "epoch": 0.9541572061844523, + "grad_norm": 0.16282179951667786, + "learning_rate": 4.668750340942215e-05, + "loss": 0.4427, "step": 26475 }, { - "epoch": 0.93, - "learning_rate": 4.686530699637765e-05, - "loss": 0.3005, + "epoch": 0.9543374058456771, + "grad_norm": 0.14763139188289642, + "learning_rate": 4.6686051674195454e-05, + "loss": 0.4148, "step": 26480 }, { - "epoch": 0.93, - "learning_rate": 4.686392572875964e-05, - "loss": 0.2977, + "epoch": 0.9545176055069017, + "grad_norm": 0.17169588804244995, + "learning_rate": 4.66845996435005e-05, + "loss": 0.3915, "step": 26485 }, { - "epoch": 0.93, - "learning_rate": 4.686254417725296e-05, - "loss": 0.2818, + "epoch": 0.9546978051681263, + "grad_norm": 0.1894696205854416, + "learning_rate": 4.668314731735707e-05, + "loss": 0.4249, "step": 26490 }, { - "epoch": 0.93, - "learning_rate": 4.6861162341875556e-05, - "loss": 0.3014, + "epoch": 0.9548780048293509, + "grad_norm": 0.14295339584350586, + "learning_rate": 4.668169469578496e-05, + "loss": 0.4194, "step": 26495 }, { - "epoch": 0.93, - "learning_rate": 4.6859780222645355e-05, - "loss": 0.3317, + "epoch": 0.9550582044905755, + "grad_norm": 0.1540336161851883, + "learning_rate": 4.6680241778803955e-05, + "loss": 0.4442, "step": 26500 }, { - "epoch": 0.93, - "eval_loss": 0.29478785395622253, - "eval_runtime": 10.5324, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 0.9550582044905755, + "eval_loss": 0.45503300428390503, + "eval_runtime": 3.5368, + "eval_samples_per_second": 28.274, + "eval_steps_per_second": 7.069, "step": 26500 }, { - "epoch": 0.93, - "learning_rate": 4.685839781958032e-05, - "loss": 0.3181, + "epoch": 0.9552384041518002, + "grad_norm": 0.12401163578033447, + "learning_rate": 4.6678788566433854e-05, + "loss": 0.4263, "step": 26505 }, { - "epoch": 0.93, - "learning_rate": 4.68570151326984e-05, - "loss": 0.3067, + "epoch": 0.9554186038130248, + "grad_norm": 0.1782900094985962, + "learning_rate": 4.6677335058694454e-05, + "loss": 0.4176, "step": 26510 }, { - "epoch": 0.93, - "learning_rate": 4.6855632162017535e-05, - "loss": 0.3111, + "epoch": 
0.9555988034742494, + "grad_norm": 0.18796321749687195, + "learning_rate": 4.667588125560556e-05, + "loss": 0.4209, "step": 26515 }, { - "epoch": 0.93, - "learning_rate": 4.6854248907555695e-05, - "loss": 0.3103, + "epoch": 0.9557790031354741, + "grad_norm": 0.17212247848510742, + "learning_rate": 4.667442715718698e-05, + "loss": 0.4083, "step": 26520 }, { - "epoch": 0.93, - "learning_rate": 4.685286536933083e-05, - "loss": 0.3183, + "epoch": 0.9559592027966988, + "grad_norm": 0.1399925798177719, + "learning_rate": 4.667297276345853e-05, + "loss": 0.4253, "step": 26525 }, { - "epoch": 0.93, - "learning_rate": 4.685148154736091e-05, - "loss": 0.3011, + "epoch": 0.9561394024579234, + "grad_norm": 0.16920849680900574, + "learning_rate": 4.6671518074440025e-05, + "loss": 0.4156, "step": 26530 }, { - "epoch": 0.93, - "learning_rate": 4.685009744166391e-05, - "loss": 0.3188, + "epoch": 0.956319602119148, + "grad_norm": 0.22345153987407684, + "learning_rate": 4.6670063090151286e-05, + "loss": 0.4336, "step": 26535 }, { - "epoch": 0.93, - "learning_rate": 4.6848713052257786e-05, - "loss": 0.303, + "epoch": 0.9564998017803726, + "grad_norm": 0.1682891845703125, + "learning_rate": 4.666860781061212e-05, + "loss": 0.446, "step": 26540 }, { - "epoch": 0.93, - "learning_rate": 4.684732837916053e-05, - "loss": 0.3034, + "epoch": 0.9566800014415973, + "grad_norm": 0.15830697119235992, + "learning_rate": 4.666715223584237e-05, + "loss": 0.4206, "step": 26545 }, { - "epoch": 0.93, - "learning_rate": 4.6845943422390105e-05, - "loss": 0.2979, + "epoch": 0.9568602011028219, + "grad_norm": 0.16852952539920807, + "learning_rate": 4.6665696365861874e-05, + "loss": 0.4387, "step": 26550 }, { - "epoch": 0.93, - "learning_rate": 4.684455818196451e-05, - "loss": 0.2811, + "epoch": 0.9570404007640466, + "grad_norm": 0.13287803530693054, + "learning_rate": 4.666424020069045e-05, + "loss": 0.4461, "step": 26555 }, { - "epoch": 0.93, - "learning_rate": 4.6843172657901714e-05, - "loss": 0.2723, + "epoch": 0.9572206004252712, + "grad_norm": 0.15766523778438568, + "learning_rate": 4.666278374034795e-05, + "loss": 0.422, "step": 26560 }, { - "epoch": 0.93, - "learning_rate": 4.684178685021973e-05, - "loss": 0.3109, + "epoch": 0.9574008000864959, + "grad_norm": 0.1761545091867447, + "learning_rate": 4.6661326984854225e-05, + "loss": 0.4452, "step": 26565 }, { - "epoch": 0.93, - "learning_rate": 4.684040075893652e-05, - "loss": 0.2876, + "epoch": 0.9575809997477205, + "grad_norm": 0.17396709322929382, + "learning_rate": 4.6659869934229106e-05, + "loss": 0.4474, "step": 26570 }, { - "epoch": 0.93, - "learning_rate": 4.683901438407011e-05, - "loss": 0.2866, + "epoch": 0.9577611994089451, + "grad_norm": 0.16844242811203003, + "learning_rate": 4.665841258849245e-05, + "loss": 0.4374, "step": 26575 }, { - "epoch": 0.94, - "learning_rate": 4.683762772563849e-05, - "loss": 0.2905, + "epoch": 0.9579413990701697, + "grad_norm": 0.1774928718805313, + "learning_rate": 4.6656954947664125e-05, + "loss": 0.404, "step": 26580 }, { - "epoch": 0.94, - "learning_rate": 4.683624078365967e-05, - "loss": 0.2961, + "epoch": 0.9581215987313944, + "grad_norm": 0.16202402114868164, + "learning_rate": 4.665549701176397e-05, + "loss": 0.4328, "step": 26585 }, { - "epoch": 0.94, - "learning_rate": 4.683485355815165e-05, - "loss": 0.2906, + "epoch": 0.958301798392619, + "grad_norm": 0.2063705176115036, + "learning_rate": 4.6654038780811866e-05, + "loss": 0.458, "step": 26590 }, { - "epoch": 0.94, - "learning_rate": 4.683346604913244e-05, - "loss": 
0.3036, + "epoch": 0.9584819980538437, + "grad_norm": 0.14139318466186523, + "learning_rate": 4.665258025482767e-05, + "loss": 0.3822, "step": 26595 }, { - "epoch": 0.94, - "learning_rate": 4.6832078256620065e-05, - "loss": 0.2968, + "epoch": 0.9586621977150683, + "grad_norm": 0.1882011443376541, + "learning_rate": 4.665112143383127e-05, + "loss": 0.424, "step": 26600 }, { - "epoch": 0.94, - "learning_rate": 4.683069018063254e-05, - "loss": 0.3245, + "epoch": 0.958842397376293, + "grad_norm": 0.1827850192785263, + "learning_rate": 4.664966231784253e-05, + "loss": 0.4365, "step": 26605 }, { - "epoch": 0.94, - "learning_rate": 4.68293018211879e-05, - "loss": 0.2988, + "epoch": 0.9590225970375176, + "grad_norm": 0.20430363714694977, + "learning_rate": 4.664820290688133e-05, + "loss": 0.4152, "step": 26610 }, { - "epoch": 0.94, - "learning_rate": 4.682791317830416e-05, - "loss": 0.2597, + "epoch": 0.9592027966987422, + "grad_norm": 0.21347634494304657, + "learning_rate": 4.664674320096756e-05, + "loss": 0.4451, "step": 26615 }, { - "epoch": 0.94, - "learning_rate": 4.6826524251999344e-05, - "loss": 0.3083, + "epoch": 0.9593829963599668, + "grad_norm": 0.17805293202400208, + "learning_rate": 4.66452832001211e-05, + "loss": 0.4693, "step": 26620 }, { - "epoch": 0.94, - "learning_rate": 4.6825135042291504e-05, - "loss": 0.2865, + "epoch": 0.9595631960211914, + "grad_norm": 0.17523930966854095, + "learning_rate": 4.664382290436185e-05, + "loss": 0.4101, "step": 26625 }, { - "epoch": 0.94, - "learning_rate": 4.682374554919866e-05, - "loss": 0.2915, + "epoch": 0.9597433956824161, + "grad_norm": 0.17870113253593445, + "learning_rate": 4.6642362313709706e-05, + "loss": 0.4164, "step": 26630 }, { - "epoch": 0.94, - "learning_rate": 4.6822355772738865e-05, - "loss": 0.2914, + "epoch": 0.9599235953436408, + "grad_norm": 0.20278626680374146, + "learning_rate": 4.664090142818456e-05, + "loss": 0.457, "step": 26635 }, { - "epoch": 0.94, - "learning_rate": 4.682124374755895e-05, - "loss": 0.3391, + "epoch": 0.9601037950048654, + "grad_norm": 0.2184595763683319, + "learning_rate": 4.663944024780632e-05, + "loss": 0.4481, "step": 26640 }, { - "epoch": 0.94, - "learning_rate": 4.681985346108412e-05, - "loss": 0.3162, + "epoch": 0.96028399466609, + "grad_norm": 0.174354687333107, + "learning_rate": 4.66379787725949e-05, + "loss": 0.443, "step": 26645 }, { - "epoch": 0.94, - "learning_rate": 4.681846289129287e-05, - "loss": 0.3289, + "epoch": 0.9604641943273147, + "grad_norm": 0.20273347198963165, + "learning_rate": 4.663651700257021e-05, + "loss": 0.4258, "step": 26650 }, { - "epoch": 0.94, - "learning_rate": 4.681707203820325e-05, - "loss": 0.2981, + "epoch": 0.9606443939885393, + "grad_norm": 0.1632155179977417, + "learning_rate": 4.6635054937752166e-05, + "loss": 0.4303, "step": 26655 }, { - "epoch": 0.94, - "learning_rate": 4.6815680901833334e-05, - "loss": 0.2916, + "epoch": 0.9608245936497639, + "grad_norm": 0.16507910192012787, + "learning_rate": 4.6633592578160687e-05, + "loss": 0.4055, "step": 26660 }, { - "epoch": 0.94, - "learning_rate": 4.681428948220118e-05, - "loss": 0.3133, + "epoch": 0.9610047933109885, + "grad_norm": 0.1875597983598709, + "learning_rate": 4.6632129923815694e-05, + "loss": 0.4617, "step": 26665 }, { - "epoch": 0.94, - "learning_rate": 4.681289777932485e-05, - "loss": 0.2867, + "epoch": 0.9611849929722132, + "grad_norm": 0.16827437281608582, + "learning_rate": 4.663066697473711e-05, + "loss": 0.4083, "step": 26670 }, { - "epoch": 0.94, - "learning_rate": 4.6811505793222426e-05, - 
"loss": 0.2839, + "epoch": 0.9613651926334379, + "grad_norm": 0.1666988730430603, + "learning_rate": 4.662920373094489e-05, + "loss": 0.4235, "step": 26675 }, { - "epoch": 0.94, - "learning_rate": 4.681011352391197e-05, - "loss": 0.2843, + "epoch": 0.9615453922946625, + "grad_norm": 0.18340694904327393, + "learning_rate": 4.662774019245896e-05, + "loss": 0.4255, "step": 26680 }, { - "epoch": 0.94, - "learning_rate": 4.6808720971411556e-05, - "loss": 0.299, + "epoch": 0.9617255919558871, + "grad_norm": 0.18287798762321472, + "learning_rate": 4.6626276359299245e-05, + "loss": 0.3855, "step": 26685 }, { - "epoch": 0.94, - "learning_rate": 4.6807328135739284e-05, - "loss": 0.3218, + "epoch": 0.9619057916171118, + "grad_norm": 0.1787051558494568, + "learning_rate": 4.662481223148571e-05, + "loss": 0.4594, "step": 26690 }, { - "epoch": 0.94, - "learning_rate": 4.6805935016913216e-05, - "loss": 0.3144, + "epoch": 0.9620859912783364, + "grad_norm": 0.1771918535232544, + "learning_rate": 4.662334780903829e-05, + "loss": 0.415, "step": 26695 }, { - "epoch": 0.94, - "learning_rate": 4.680454161495146e-05, - "loss": 0.3157, + "epoch": 0.962266190939561, + "grad_norm": 0.15356236696243286, + "learning_rate": 4.6621883091976945e-05, + "loss": 0.4285, "step": 26700 }, { - "epoch": 0.94, - "learning_rate": 4.68031479298721e-05, - "loss": 0.2977, + "epoch": 0.9624463906007856, + "grad_norm": 0.1645512878894806, + "learning_rate": 4.662041808032163e-05, + "loss": 0.465, "step": 26705 }, { - "epoch": 0.94, - "learning_rate": 4.680175396169323e-05, - "loss": 0.2742, + "epoch": 0.9626265902620104, + "grad_norm": 0.20833955705165863, + "learning_rate": 4.661895277409231e-05, + "loss": 0.4383, "step": 26710 }, { - "epoch": 0.94, - "learning_rate": 4.6800359710432955e-05, - "loss": 0.3116, + "epoch": 0.962806789923235, + "grad_norm": 0.15828882157802582, + "learning_rate": 4.661748717330893e-05, + "loss": 0.4052, "step": 26715 }, { - "epoch": 0.94, - "learning_rate": 4.679896517610938e-05, - "loss": 0.2944, + "epoch": 0.9629869895844596, + "grad_norm": 0.15166039764881134, + "learning_rate": 4.6616021277991476e-05, + "loss": 0.4495, "step": 26720 }, { - "epoch": 0.94, - "learning_rate": 4.67975703587406e-05, - "loss": 0.3056, + "epoch": 0.9631671892456842, + "grad_norm": 0.18399517238140106, + "learning_rate": 4.6614555088159924e-05, + "loss": 0.4607, "step": 26725 }, { - "epoch": 0.94, - "learning_rate": 4.679617525834474e-05, - "loss": 0.2851, + "epoch": 0.9633473889069089, + "grad_norm": 0.16991674900054932, + "learning_rate": 4.661308860383424e-05, + "loss": 0.4267, "step": 26730 }, { - "epoch": 0.94, - "learning_rate": 4.67947798749399e-05, - "loss": 0.29, + "epoch": 0.9635275885681335, + "grad_norm": 0.18746553361415863, + "learning_rate": 4.661162182503441e-05, + "loss": 0.456, "step": 26735 }, { - "epoch": 0.94, - "learning_rate": 4.679338420854422e-05, - "loss": 0.2968, + "epoch": 0.9637077882293581, + "grad_norm": 0.18917517364025116, + "learning_rate": 4.661015475178041e-05, + "loss": 0.4372, "step": 26740 }, { - "epoch": 0.94, - "learning_rate": 4.67919882591758e-05, - "loss": 0.3058, + "epoch": 0.9638879878905827, + "grad_norm": 0.16291776299476624, + "learning_rate": 4.6608687384092244e-05, + "loss": 0.4211, "step": 26745 }, { - "epoch": 0.94, - "learning_rate": 4.679059202685278e-05, - "loss": 0.3168, + "epoch": 0.9640681875518075, + "grad_norm": 0.17410635948181152, + "learning_rate": 4.660721972198989e-05, + "loss": 0.4218, "step": 26750 }, { - "epoch": 0.94, - "learning_rate": 
4.678919551159327e-05, - "loss": 0.2959, + "epoch": 0.9642483872130321, + "grad_norm": 0.17787551879882812, + "learning_rate": 4.6605751765493354e-05, + "loss": 0.4457, "step": 26755 }, { - "epoch": 0.94, - "learning_rate": 4.678779871341543e-05, - "loss": 0.2937, + "epoch": 0.9644285868742567, + "grad_norm": 0.17140106856822968, + "learning_rate": 4.660428351462263e-05, + "loss": 0.4006, "step": 26760 }, { - "epoch": 0.94, - "learning_rate": 4.6786401632337376e-05, - "loss": 0.2977, + "epoch": 0.9646087865354813, + "grad_norm": 0.17782960832118988, + "learning_rate": 4.660281496939773e-05, + "loss": 0.4501, "step": 26765 }, { - "epoch": 0.94, - "learning_rate": 4.678500426837725e-05, - "loss": 0.3166, + "epoch": 0.9647889861967059, + "grad_norm": 0.15600888431072235, + "learning_rate": 4.6601346129838655e-05, + "loss": 0.4476, "step": 26770 }, { - "epoch": 0.94, - "learning_rate": 4.678360662155321e-05, - "loss": 0.3044, + "epoch": 0.9649691858579306, + "grad_norm": 0.18162627518177032, + "learning_rate": 4.6599876995965424e-05, + "loss": 0.418, "step": 26775 }, { - "epoch": 0.94, - "learning_rate": 4.678220869188339e-05, - "loss": 0.3031, + "epoch": 0.9651493855191552, + "grad_norm": 0.20125152170658112, + "learning_rate": 4.659840756779805e-05, + "loss": 0.4183, "step": 26780 }, { - "epoch": 0.94, - "learning_rate": 4.6780810479385945e-05, - "loss": 0.303, + "epoch": 0.9653295851803798, + "grad_norm": 0.16319221258163452, + "learning_rate": 4.6596937845356556e-05, + "loss": 0.4437, "step": 26785 }, { - "epoch": 0.94, - "learning_rate": 4.677941198407903e-05, - "loss": 0.307, + "epoch": 0.9655097848416045, + "grad_norm": 0.2335626482963562, + "learning_rate": 4.659546782866096e-05, + "loss": 0.4571, "step": 26790 }, { - "epoch": 0.94, - "learning_rate": 4.677801320598081e-05, - "loss": 0.3096, + "epoch": 0.9656899845028292, + "grad_norm": 0.17783339321613312, + "learning_rate": 4.6593997517731305e-05, + "loss": 0.4296, "step": 26795 }, { - "epoch": 0.94, - "learning_rate": 4.677661414510943e-05, - "loss": 0.3295, + "epoch": 0.9658701841640538, + "grad_norm": 0.16893276572227478, + "learning_rate": 4.65925269125876e-05, + "loss": 0.4179, "step": 26800 }, { - "epoch": 0.94, - "learning_rate": 4.677521480148307e-05, - "loss": 0.3012, + "epoch": 0.9660503838252784, + "grad_norm": 0.160037562251091, + "learning_rate": 4.6591056013249914e-05, + "loss": 0.4114, "step": 26805 }, { - "epoch": 0.94, - "learning_rate": 4.67738151751199e-05, - "loss": 0.2984, + "epoch": 0.966230583486503, + "grad_norm": 0.18261387944221497, + "learning_rate": 4.6589584819738254e-05, + "loss": 0.4149, "step": 26810 }, { - "epoch": 0.94, - "learning_rate": 4.677241526603808e-05, - "loss": 0.3075, + "epoch": 0.9664107831477277, + "grad_norm": 0.18064436316490173, + "learning_rate": 4.658811333207269e-05, + "loss": 0.4456, "step": 26815 }, { - "epoch": 0.94, - "learning_rate": 4.677101507425581e-05, - "loss": 0.2996, + "epoch": 0.9665909828089523, + "grad_norm": 0.16004979610443115, + "learning_rate": 4.658664155027326e-05, + "loss": 0.4362, "step": 26820 }, { - "epoch": 0.94, - "learning_rate": 4.6769614599791245e-05, - "loss": 0.3274, + "epoch": 0.9667711824701769, + "grad_norm": 0.18018919229507446, + "learning_rate": 4.658516947436001e-05, + "loss": 0.4137, "step": 26825 }, { - "epoch": 0.94, - "learning_rate": 4.676821384266259e-05, - "loss": 0.3016, + "epoch": 0.9669513821314016, + "grad_norm": 0.1572667360305786, + "learning_rate": 4.658369710435302e-05, + "loss": 0.4265, "step": 26830 }, { - "epoch": 0.94, - 
"learning_rate": 4.6766812802888016e-05, - "loss": 0.2927, + "epoch": 0.9671315817926263, + "grad_norm": 0.14630809426307678, + "learning_rate": 4.6582224440272325e-05, + "loss": 0.4567, "step": 26835 }, { - "epoch": 0.94, - "learning_rate": 4.676541148048573e-05, - "loss": 0.3203, + "epoch": 0.9673117814538509, + "grad_norm": 0.15697656571865082, + "learning_rate": 4.6580751482138e-05, + "loss": 0.4598, "step": 26840 }, { - "epoch": 0.94, - "learning_rate": 4.6764009875473914e-05, - "loss": 0.2974, + "epoch": 0.9674919811150755, + "grad_norm": 0.1883528083562851, + "learning_rate": 4.657927822997012e-05, + "loss": 0.4144, "step": 26845 }, { - "epoch": 0.94, - "learning_rate": 4.676260798787078e-05, - "loss": 0.3076, + "epoch": 0.9676721807763001, + "grad_norm": 0.1584809124469757, + "learning_rate": 4.657780468378875e-05, + "loss": 0.421, "step": 26850 }, { - "epoch": 0.94, - "learning_rate": 4.6761205817694514e-05, - "loss": 0.3237, + "epoch": 0.9678523804375248, + "grad_norm": 0.1841040551662445, + "learning_rate": 4.657633084361397e-05, + "loss": 0.4483, "step": 26855 }, { - "epoch": 0.95, - "learning_rate": 4.6759803364963336e-05, - "loss": 0.3246, + "epoch": 0.9680325800987494, + "grad_norm": 0.1718447059392929, + "learning_rate": 4.657485670946585e-05, + "loss": 0.4267, "step": 26860 }, { - "epoch": 0.95, - "learning_rate": 4.6758400629695456e-05, - "loss": 0.3098, + "epoch": 0.968212779759974, + "grad_norm": 0.16161082684993744, + "learning_rate": 4.65733822813645e-05, + "loss": 0.4503, "step": 26865 }, { - "epoch": 0.95, - "learning_rate": 4.6756997611909074e-05, - "loss": 0.3105, + "epoch": 0.9683929794211987, + "grad_norm": 0.175176739692688, + "learning_rate": 4.657190755932999e-05, + "loss": 0.4539, "step": 26870 }, { - "epoch": 0.95, - "learning_rate": 4.675559431162242e-05, - "loss": 0.2933, + "epoch": 0.9685731790824234, + "grad_norm": 0.16012953221797943, + "learning_rate": 4.65704325433824e-05, + "loss": 0.4453, "step": 26875 }, { - "epoch": 0.95, - "learning_rate": 4.675419072885372e-05, - "loss": 0.2906, + "epoch": 0.968753378743648, + "grad_norm": 0.16824325919151306, + "learning_rate": 4.6568957233541854e-05, + "loss": 0.4654, "step": 26880 }, { - "epoch": 0.95, - "learning_rate": 4.675278686362118e-05, - "loss": 0.2907, + "epoch": 0.9689335784048726, + "grad_norm": 0.15294811129570007, + "learning_rate": 4.6567481629828443e-05, + "loss": 0.4517, "step": 26885 }, { - "epoch": 0.95, - "learning_rate": 4.675138271594304e-05, - "loss": 0.3234, + "epoch": 0.9691137780660972, + "grad_norm": 0.19897432625293732, + "learning_rate": 4.6566005732262275e-05, + "loss": 0.4646, "step": 26890 }, { - "epoch": 0.95, - "learning_rate": 4.674997828583753e-05, - "loss": 0.3188, + "epoch": 0.9692939777273218, + "grad_norm": 0.16633781790733337, + "learning_rate": 4.6564529540863446e-05, + "loss": 0.3853, "step": 26895 }, { - "epoch": 0.95, - "learning_rate": 4.6748573573322884e-05, - "loss": 0.3043, + "epoch": 0.9694741773885465, + "grad_norm": 0.13114029169082642, + "learning_rate": 4.656305305565208e-05, + "loss": 0.4149, "step": 26900 }, { - "epoch": 0.95, - "learning_rate": 4.674716857841735e-05, - "loss": 0.2734, + "epoch": 0.9696543770497712, + "grad_norm": 0.17593474686145782, + "learning_rate": 4.656157627664829e-05, + "loss": 0.4126, "step": 26905 }, { - "epoch": 0.95, - "learning_rate": 4.6745763301139153e-05, - "loss": 0.2985, + "epoch": 0.9698345767109958, + "grad_norm": 0.1551685929298401, + "learning_rate": 4.6560099203872196e-05, + "loss": 0.4258, "step": 26910 }, { - 
"epoch": 0.95, - "learning_rate": 4.674435774150656e-05, - "loss": 0.3309, + "epoch": 0.9700147763722204, + "grad_norm": 0.18880245089530945, + "learning_rate": 4.655862183734392e-05, + "loss": 0.4148, "step": 26915 }, { - "epoch": 0.95, - "learning_rate": 4.674295189953781e-05, - "loss": 0.3117, + "epoch": 0.9701949760334451, + "grad_norm": 0.1571785807609558, + "learning_rate": 4.6557144177083604e-05, + "loss": 0.4471, "step": 26920 }, { - "epoch": 0.95, - "learning_rate": 4.674154577525116e-05, - "loss": 0.3057, + "epoch": 0.9703751756946697, + "grad_norm": 0.18303348124027252, + "learning_rate": 4.655566622311137e-05, + "loss": 0.4398, "step": 26925 }, { - "epoch": 0.95, - "learning_rate": 4.6740139368664866e-05, - "loss": 0.3029, + "epoch": 0.9705553753558943, + "grad_norm": 0.19541716575622559, + "learning_rate": 4.6554187975447364e-05, + "loss": 0.4099, "step": 26930 }, { - "epoch": 0.95, - "learning_rate": 4.673873267979718e-05, - "loss": 0.3176, + "epoch": 0.9707355750171189, + "grad_norm": 0.17658253014087677, + "learning_rate": 4.655270943411171e-05, + "loss": 0.4512, "step": 26935 }, { - "epoch": 0.95, - "learning_rate": 4.6737325708666394e-05, - "loss": 0.3005, + "epoch": 0.9709157746783436, + "grad_norm": 0.18457411229610443, + "learning_rate": 4.655123059912456e-05, + "loss": 0.4632, "step": 26940 }, { - "epoch": 0.95, - "learning_rate": 4.673591845529074e-05, - "loss": 0.3125, + "epoch": 0.9710959743395683, + "grad_norm": 0.15944646298885345, + "learning_rate": 4.654975147050607e-05, + "loss": 0.4436, "step": 26945 }, { - "epoch": 0.95, - "learning_rate": 4.6734510919688525e-05, - "loss": 0.3098, + "epoch": 0.9712761740007929, + "grad_norm": 0.17047551274299622, + "learning_rate": 4.654827204827639e-05, + "loss": 0.4476, "step": 26950 }, { - "epoch": 0.95, - "learning_rate": 4.6733103101878e-05, - "loss": 0.3302, + "epoch": 0.9714563736620175, + "grad_norm": 0.18987303972244263, + "learning_rate": 4.654679233245568e-05, + "loss": 0.4751, "step": 26955 }, { - "epoch": 0.95, - "learning_rate": 4.673169500187746e-05, - "loss": 0.2737, + "epoch": 0.9716365733232422, + "grad_norm": 0.18949106335639954, + "learning_rate": 4.654531232306409e-05, + "loss": 0.432, "step": 26960 }, { - "epoch": 0.95, - "learning_rate": 4.673028661970518e-05, - "loss": 0.2857, + "epoch": 0.9718167729844668, + "grad_norm": 0.15808527171611786, + "learning_rate": 4.654383202012179e-05, + "loss": 0.3919, "step": 26965 }, { - "epoch": 0.95, - "learning_rate": 4.672887795537945e-05, - "loss": 0.3199, + "epoch": 0.9719969726456914, + "grad_norm": 0.1749500036239624, + "learning_rate": 4.654235142364895e-05, + "loss": 0.4509, "step": 26970 }, { - "epoch": 0.95, - "learning_rate": 4.672746900891856e-05, - "loss": 0.2955, + "epoch": 0.972177172306916, + "grad_norm": 0.1732529252767563, + "learning_rate": 4.654087053366575e-05, + "loss": 0.4012, "step": 26975 }, { - "epoch": 0.95, - "learning_rate": 4.672605978034079e-05, - "loss": 0.2969, + "epoch": 0.9723573719681406, + "grad_norm": 0.17838901281356812, + "learning_rate": 4.653938935019235e-05, + "loss": 0.4284, "step": 26980 }, { - "epoch": 0.95, - "learning_rate": 4.6724650269664465e-05, - "loss": 0.3046, + "epoch": 0.9725375716293654, + "grad_norm": 0.16471531987190247, + "learning_rate": 4.653790787324894e-05, + "loss": 0.4306, "step": 26985 }, { - "epoch": 0.95, - "learning_rate": 4.672324047690787e-05, - "loss": 0.3196, + "epoch": 0.97271777129059, + "grad_norm": 0.19036082923412323, + "learning_rate": 4.6536426102855714e-05, + "loss": 0.4657, "step": 
26990 }, { - "epoch": 0.95, - "learning_rate": 4.672183040208932e-05, - "loss": 0.2903, + "epoch": 0.9728979709518146, + "grad_norm": 0.15139730274677277, + "learning_rate": 4.6534944039032845e-05, + "loss": 0.4456, "step": 26995 }, { - "epoch": 0.95, - "learning_rate": 4.672042004522711e-05, - "loss": 0.3032, + "epoch": 0.9730781706130393, + "grad_norm": 0.17167389392852783, + "learning_rate": 4.6533461681800534e-05, + "loss": 0.4286, "step": 27000 }, { - "epoch": 0.95, - "eval_loss": 0.29417115449905396, - "eval_runtime": 10.5309, - "eval_samples_per_second": 9.496, - "eval_steps_per_second": 9.496, + "epoch": 0.9730781706130393, + "eval_loss": 0.45476090908050537, + "eval_runtime": 3.5374, + "eval_samples_per_second": 28.269, + "eval_steps_per_second": 7.067, "step": 27000 }, { - "epoch": 0.95, - "learning_rate": 4.671900940633956e-05, - "loss": 0.3257, + "epoch": 0.9732583702742639, + "grad_norm": 0.17848706245422363, + "learning_rate": 4.6531979031178975e-05, + "loss": 0.3916, "step": 27005 }, { - "epoch": 0.95, - "learning_rate": 4.671759848544499e-05, - "loss": 0.3167, + "epoch": 0.9734385699354885, + "grad_norm": 0.15405860543251038, + "learning_rate": 4.6530496087188374e-05, + "loss": 0.4158, "step": 27010 }, { - "epoch": 0.95, - "learning_rate": 4.671618728256172e-05, - "loss": 0.2893, + "epoch": 0.9736187695967131, + "grad_norm": 0.17654937505722046, + "learning_rate": 4.652901284984893e-05, + "loss": 0.4398, "step": 27015 }, { - "epoch": 0.95, - "learning_rate": 4.671477579770806e-05, - "loss": 0.2664, + "epoch": 0.9737989692579377, + "grad_norm": 0.17176496982574463, + "learning_rate": 4.652752931918085e-05, + "loss": 0.4457, "step": 27020 }, { - "epoch": 0.95, - "learning_rate": 4.671336403090235e-05, - "loss": 0.3067, + "epoch": 0.9739791689191625, + "grad_norm": 0.1502595841884613, + "learning_rate": 4.652604549520436e-05, + "loss": 0.445, "step": 27025 }, { - "epoch": 0.95, - "learning_rate": 4.671195198216293e-05, - "loss": 0.3055, + "epoch": 0.9741593685803871, + "grad_norm": 0.14152762293815613, + "learning_rate": 4.652456137793966e-05, + "loss": 0.3946, "step": 27030 }, { - "epoch": 0.95, - "learning_rate": 4.67105396515081e-05, - "loss": 0.2689, + "epoch": 0.9743395682416117, + "grad_norm": 0.18833976984024048, + "learning_rate": 4.6523076967406984e-05, + "loss": 0.4336, "step": 27035 }, { - "epoch": 0.95, - "learning_rate": 4.6709127038956235e-05, - "loss": 0.299, + "epoch": 0.9745197679028363, + "grad_norm": 0.14718258380889893, + "learning_rate": 4.652159226362655e-05, + "loss": 0.4079, "step": 27040 }, { - "epoch": 0.95, - "learning_rate": 4.670771414452566e-05, - "loss": 0.3221, + "epoch": 0.974699967564061, + "grad_norm": 0.16922341287136078, + "learning_rate": 4.652010726661858e-05, + "loss": 0.4368, "step": 27045 }, { - "epoch": 0.95, - "learning_rate": 4.670630096823472e-05, - "loss": 0.3033, + "epoch": 0.9748801672252856, + "grad_norm": 0.18971046805381775, + "learning_rate": 4.6518621976403335e-05, + "loss": 0.4514, "step": 27050 }, { - "epoch": 0.95, - "learning_rate": 4.670488751010177e-05, - "loss": 0.2964, + "epoch": 0.9750603668865102, + "grad_norm": 0.15908949077129364, + "learning_rate": 4.6517136393001015e-05, + "loss": 0.4215, "step": 27055 }, { - "epoch": 0.95, - "learning_rate": 4.6703473770145155e-05, - "loss": 0.3276, + "epoch": 0.9752405665477349, + "grad_norm": 0.15814034640789032, + "learning_rate": 4.651565051643188e-05, + "loss": 0.4227, "step": 27060 }, { - "epoch": 0.95, - "learning_rate": 4.6702059748383244e-05, - "loss": 0.2896, + 
"epoch": 0.9754207662089596, + "grad_norm": 0.19080659747123718, + "learning_rate": 4.651416434671617e-05, + "loss": 0.4194, "step": 27065 }, { - "epoch": 0.95, - "learning_rate": 4.670064544483438e-05, - "loss": 0.3221, + "epoch": 0.9756009658701842, + "grad_norm": 0.17519068717956543, + "learning_rate": 4.651267788387415e-05, + "loss": 0.423, "step": 27070 }, { - "epoch": 0.95, - "learning_rate": 4.669923085951694e-05, - "loss": 0.3254, + "epoch": 0.9757811655314088, + "grad_norm": 0.18204259872436523, + "learning_rate": 4.651119112792604e-05, + "loss": 0.4161, "step": 27075 }, { - "epoch": 0.95, - "learning_rate": 4.6697815992449287e-05, - "loss": 0.2988, + "epoch": 0.9759613651926334, + "grad_norm": 0.17226997017860413, + "learning_rate": 4.6509704078892124e-05, + "loss": 0.4583, "step": 27080 }, { - "epoch": 0.95, - "learning_rate": 4.66964008436498e-05, - "loss": 0.2927, + "epoch": 0.9761415648538581, + "grad_norm": 0.15808525681495667, + "learning_rate": 4.650821673679265e-05, + "loss": 0.4533, "step": 27085 }, { - "epoch": 0.95, - "learning_rate": 4.6694985413136835e-05, - "loss": 0.3356, + "epoch": 0.9763217645150827, + "grad_norm": 0.15862226486206055, + "learning_rate": 4.6506729101647897e-05, + "loss": 0.3829, "step": 27090 }, { - "epoch": 0.95, - "learning_rate": 4.6693569700928795e-05, - "loss": 0.3185, + "epoch": 0.9765019641763073, + "grad_norm": 0.18071158230304718, + "learning_rate": 4.650524117347812e-05, + "loss": 0.4365, "step": 27095 }, { - "epoch": 0.95, - "learning_rate": 4.669215370704404e-05, - "loss": 0.2984, + "epoch": 0.976682163837532, + "grad_norm": 0.2250252366065979, + "learning_rate": 4.650375295230359e-05, + "loss": 0.453, "step": 27100 }, { - "epoch": 0.95, - "learning_rate": 4.669073743150096e-05, - "loss": 0.3017, + "epoch": 0.9768623634987567, + "grad_norm": 0.182828888297081, + "learning_rate": 4.6502264438144596e-05, + "loss": 0.4589, "step": 27105 }, { - "epoch": 0.95, - "learning_rate": 4.668932087431797e-05, - "loss": 0.3103, + "epoch": 0.9770425631599813, + "grad_norm": 0.18622487783432007, + "learning_rate": 4.650077563102141e-05, + "loss": 0.4099, "step": 27110 }, { - "epoch": 0.95, - "learning_rate": 4.6687904035513416e-05, - "loss": 0.2788, + "epoch": 0.9772227628212059, + "grad_norm": 0.16842947900295258, + "learning_rate": 4.6499286530954314e-05, + "loss": 0.4102, "step": 27115 }, { - "epoch": 0.95, - "learning_rate": 4.668648691510574e-05, - "loss": 0.2997, + "epoch": 0.9774029624824305, + "grad_norm": 0.17270709574222565, + "learning_rate": 4.649779713796361e-05, + "loss": 0.4312, "step": 27120 }, { - "epoch": 0.95, - "learning_rate": 4.6685069513113315e-05, - "loss": 0.2898, + "epoch": 0.9775831621436551, + "grad_norm": 0.1607503890991211, + "learning_rate": 4.649630745206958e-05, + "loss": 0.4738, "step": 27125 }, { - "epoch": 0.95, - "learning_rate": 4.668365182955455e-05, - "loss": 0.2964, + "epoch": 0.9777633618048798, + "grad_norm": 0.13477887213230133, + "learning_rate": 4.649481747329252e-05, + "loss": 0.4357, "step": 27130 }, { - "epoch": 0.95, - "learning_rate": 4.6682233864447865e-05, - "loss": 0.2835, + "epoch": 0.9779435614661044, + "grad_norm": 0.16214247047901154, + "learning_rate": 4.6493327201652725e-05, + "loss": 0.3851, "step": 27135 }, { - "epoch": 0.95, - "learning_rate": 4.668081561781166e-05, - "loss": 0.3101, + "epoch": 0.9781237611273291, + "grad_norm": 0.19764074683189392, + "learning_rate": 4.649183663717052e-05, + "loss": 0.4212, "step": 27140 }, { - "epoch": 0.96, - "learning_rate": 4.667939708966435e-05, - 
"loss": 0.3146, + "epoch": 0.9783039607885538, + "grad_norm": 0.169803187251091, + "learning_rate": 4.6490345779866197e-05, + "loss": 0.4267, "step": 27145 }, { - "epoch": 0.96, - "learning_rate": 4.667797828002436e-05, - "loss": 0.2676, + "epoch": 0.9784841604497784, + "grad_norm": 0.16970309615135193, + "learning_rate": 4.6488854629760074e-05, + "loss": 0.4518, "step": 27150 }, { - "epoch": 0.96, - "learning_rate": 4.66765591889101e-05, - "loss": 0.2785, + "epoch": 0.978664360111003, + "grad_norm": 0.15592272579669952, + "learning_rate": 4.648736318687247e-05, + "loss": 0.4588, "step": 27155 }, { - "epoch": 0.96, - "learning_rate": 4.667513981634001e-05, - "loss": 0.3098, + "epoch": 0.9788445597722276, + "grad_norm": 0.17874795198440552, + "learning_rate": 4.64858714512237e-05, + "loss": 0.4404, "step": 27160 }, { - "epoch": 0.96, - "learning_rate": 4.667372016233251e-05, - "loss": 0.2952, + "epoch": 0.9790247594334522, + "grad_norm": 0.15960319340229034, + "learning_rate": 4.648437942283409e-05, + "loss": 0.408, "step": 27165 }, { - "epoch": 0.96, - "learning_rate": 4.667230022690605e-05, - "loss": 0.3123, + "epoch": 0.9792049590946769, + "grad_norm": 0.17572569847106934, + "learning_rate": 4.6482887101723974e-05, + "loss": 0.4118, "step": 27170 }, { - "epoch": 0.96, - "learning_rate": 4.667088001007904e-05, - "loss": 0.2965, + "epoch": 0.9793851587559015, + "grad_norm": 0.20097064971923828, + "learning_rate": 4.6481394487913673e-05, + "loss": 0.4195, "step": 27175 }, { - "epoch": 0.96, - "learning_rate": 4.666945951186994e-05, - "loss": 0.3047, + "epoch": 0.9795653584171262, + "grad_norm": 0.18549709022045135, + "learning_rate": 4.647990158142354e-05, + "loss": 0.4298, "step": 27180 }, { - "epoch": 0.96, - "learning_rate": 4.666803873229719e-05, - "loss": 0.2921, + "epoch": 0.9797455580783508, + "grad_norm": 0.17674687504768372, + "learning_rate": 4.6478408382273905e-05, + "loss": 0.441, "step": 27185 }, { - "epoch": 0.96, - "learning_rate": 4.666661767137923e-05, - "loss": 0.3198, + "epoch": 0.9799257577395755, + "grad_norm": 0.14485035836696625, + "learning_rate": 4.6476914890485114e-05, + "loss": 0.4422, "step": 27190 }, { - "epoch": 0.96, - "learning_rate": 4.666519632913453e-05, - "loss": 0.2968, + "epoch": 0.9801059574008001, + "grad_norm": 0.1638946533203125, + "learning_rate": 4.647542110607751e-05, + "loss": 0.4341, "step": 27195 }, { - "epoch": 0.96, - "learning_rate": 4.666377470558152e-05, - "loss": 0.2936, + "epoch": 0.9802861570620247, + "grad_norm": 0.1771542876958847, + "learning_rate": 4.6473927029071454e-05, + "loss": 0.4298, "step": 27200 }, { - "epoch": 0.96, - "learning_rate": 4.6662352800738684e-05, - "loss": 0.281, + "epoch": 0.9804663567232493, + "grad_norm": 0.16256387531757355, + "learning_rate": 4.6472432659487296e-05, + "loss": 0.4271, "step": 27205 }, { - "epoch": 0.96, - "learning_rate": 4.666093061462447e-05, - "loss": 0.2865, + "epoch": 0.980646556384474, + "grad_norm": 0.15982799232006073, + "learning_rate": 4.647093799734541e-05, + "loss": 0.4554, "step": 27210 }, { - "epoch": 0.96, - "learning_rate": 4.665950814725734e-05, - "loss": 0.2759, + "epoch": 0.9808267560456987, + "grad_norm": 0.19146406650543213, + "learning_rate": 4.646944304266615e-05, + "loss": 0.4195, "step": 27215 }, { - "epoch": 0.96, - "learning_rate": 4.6658085398655784e-05, - "loss": 0.2907, + "epoch": 0.9810069557069233, + "grad_norm": 0.17436739802360535, + "learning_rate": 4.646794779546988e-05, + "loss": 0.4558, "step": 27220 }, { - "epoch": 0.96, - "learning_rate": 
4.6656662368838247e-05, - "loss": 0.291, + "epoch": 0.9811871553681479, + "grad_norm": 0.17052249610424042, + "learning_rate": 4.6466452255776976e-05, + "loss": 0.4448, "step": 27225 }, { - "epoch": 0.96, - "learning_rate": 4.665523905782323e-05, - "loss": 0.3372, + "epoch": 0.9813673550293726, + "grad_norm": 0.1632729321718216, + "learning_rate": 4.646495642360782e-05, + "loss": 0.4211, "step": 27230 }, { - "epoch": 0.96, - "learning_rate": 4.665381546562921e-05, - "loss": 0.2823, + "epoch": 0.9815475546905972, + "grad_norm": 0.2103988230228424, + "learning_rate": 4.6463460298982787e-05, + "loss": 0.431, "step": 27235 }, { - "epoch": 0.96, - "learning_rate": 4.665239159227466e-05, - "loss": 0.3084, + "epoch": 0.9817277543518218, + "grad_norm": 0.19348739087581635, + "learning_rate": 4.646196388192226e-05, + "loss": 0.4097, "step": 27240 }, { - "epoch": 0.96, - "learning_rate": 4.665096743777807e-05, - "loss": 0.3115, + "epoch": 0.9819079540130464, + "grad_norm": 0.1999654471874237, + "learning_rate": 4.646046717244663e-05, + "loss": 0.435, "step": 27245 }, { - "epoch": 0.96, - "learning_rate": 4.664954300215795e-05, - "loss": 0.294, + "epoch": 0.982088153674271, + "grad_norm": 0.16381704807281494, + "learning_rate": 4.6458970170576296e-05, + "loss": 0.3906, "step": 27250 }, { - "epoch": 0.96, - "learning_rate": 4.664811828543277e-05, - "loss": 0.3071, + "epoch": 0.9822683533354958, + "grad_norm": 0.15966589748859406, + "learning_rate": 4.6457472876331644e-05, + "loss": 0.4372, "step": 27255 }, { - "epoch": 0.96, - "learning_rate": 4.6646693287621045e-05, - "loss": 0.287, + "epoch": 0.9824485529967204, + "grad_norm": 0.17224369943141937, + "learning_rate": 4.6455975289733077e-05, + "loss": 0.4433, "step": 27260 }, { - "epoch": 0.96, - "learning_rate": 4.664526800874127e-05, - "loss": 0.2981, + "epoch": 0.982628752657945, + "grad_norm": 0.1269349455833435, + "learning_rate": 4.6454477410801e-05, + "loss": 0.3989, "step": 27265 }, { - "epoch": 0.96, - "learning_rate": 4.664384244881196e-05, - "loss": 0.2819, + "epoch": 0.9828089523191696, + "grad_norm": 0.1905910223722458, + "learning_rate": 4.6452979239555825e-05, + "loss": 0.4544, "step": 27270 }, { - "epoch": 0.96, - "learning_rate": 4.6642416607851615e-05, - "loss": 0.3413, + "epoch": 0.9829891519803943, + "grad_norm": 0.17192591726779938, + "learning_rate": 4.645148077601796e-05, + "loss": 0.4112, "step": 27275 }, { - "epoch": 0.96, - "learning_rate": 4.6640990485878755e-05, - "loss": 0.2848, + "epoch": 0.9831693516416189, + "grad_norm": 0.1754426509141922, + "learning_rate": 4.644998202020783e-05, + "loss": 0.471, "step": 27280 }, { - "epoch": 0.96, - "learning_rate": 4.6639564082911896e-05, - "loss": 0.3091, + "epoch": 0.9833495513028435, + "grad_norm": 0.16824094951152802, + "learning_rate": 4.644848297214584e-05, + "loss": 0.4322, "step": 27285 }, { - "epoch": 0.96, - "learning_rate": 4.663813739896956e-05, - "loss": 0.2923, + "epoch": 0.9835297509640681, + "grad_norm": 0.15957225859165192, + "learning_rate": 4.6446983631852424e-05, + "loss": 0.4364, "step": 27290 }, { - "epoch": 0.96, - "learning_rate": 4.663671043407026e-05, - "loss": 0.3156, + "epoch": 0.9837099506252929, + "grad_norm": 0.16426332294940948, + "learning_rate": 4.6445483999348006e-05, + "loss": 0.4243, "step": 27295 }, { - "epoch": 0.96, - "learning_rate": 4.663528318823255e-05, - "loss": 0.2914, + "epoch": 0.9838901502865175, + "grad_norm": 0.1607593595981598, + "learning_rate": 4.6443984074653026e-05, + "loss": 0.4289, "step": 27300 }, { - "epoch": 0.96, - 
"learning_rate": 4.6633855661474935e-05, - "loss": 0.2957, + "epoch": 0.9840703499477421, + "grad_norm": 0.15784458816051483, + "learning_rate": 4.644248385778791e-05, + "loss": 0.396, "step": 27305 }, { - "epoch": 0.96, - "learning_rate": 4.6632427853815965e-05, - "loss": 0.2945, + "epoch": 0.9842505496089667, + "grad_norm": 0.21137282252311707, + "learning_rate": 4.6440983348773105e-05, + "loss": 0.4444, "step": 27310 }, { - "epoch": 0.96, - "learning_rate": 4.6630999765274175e-05, - "loss": 0.294, + "epoch": 0.9844307492701914, + "grad_norm": 0.21864286065101624, + "learning_rate": 4.6439482547629046e-05, + "loss": 0.4633, "step": 27315 }, { - "epoch": 0.96, - "learning_rate": 4.662957139586811e-05, - "loss": 0.2932, + "epoch": 0.984610948931416, + "grad_norm": 0.1778162717819214, + "learning_rate": 4.6437981454376194e-05, + "loss": 0.3835, "step": 27320 }, { - "epoch": 0.96, - "learning_rate": 4.6628142745616313e-05, - "loss": 0.2979, + "epoch": 0.9847911485926406, + "grad_norm": 0.1485341191291809, + "learning_rate": 4.6436480069034995e-05, + "loss": 0.4085, "step": 27325 }, { - "epoch": 0.96, - "learning_rate": 4.6626713814537335e-05, - "loss": 0.2806, + "epoch": 0.9849713482538652, + "grad_norm": 0.14085252583026886, + "learning_rate": 4.6434978391625905e-05, + "loss": 0.4097, "step": 27330 }, { - "epoch": 0.96, - "learning_rate": 4.662528460264973e-05, - "loss": 0.289, + "epoch": 0.98515154791509, + "grad_norm": 0.12692780792713165, + "learning_rate": 4.643347642216939e-05, + "loss": 0.4053, "step": 27335 }, { - "epoch": 0.96, - "learning_rate": 4.662385510997206e-05, - "loss": 0.3088, + "epoch": 0.9853317475763146, + "grad_norm": 0.18438071012496948, + "learning_rate": 4.64319741606859e-05, + "loss": 0.43, "step": 27340 }, { - "epoch": 0.96, - "learning_rate": 4.662242533652288e-05, - "loss": 0.2632, + "epoch": 0.9855119472375392, + "grad_norm": 0.21135936677455902, + "learning_rate": 4.6430471607195917e-05, + "loss": 0.4677, "step": 27345 }, { - "epoch": 0.96, - "learning_rate": 4.6620995282320756e-05, - "loss": 0.2826, + "epoch": 0.9856921468987638, + "grad_norm": 0.16982705891132355, + "learning_rate": 4.64289687617199e-05, + "loss": 0.4531, "step": 27350 }, { - "epoch": 0.96, - "learning_rate": 4.661956494738426e-05, - "loss": 0.2917, + "epoch": 0.9858723465599885, + "grad_norm": 0.16164273023605347, + "learning_rate": 4.642746562427834e-05, + "loss": 0.4122, "step": 27355 }, { - "epoch": 0.96, - "learning_rate": 4.6618134331731956e-05, - "loss": 0.3059, + "epoch": 0.9860525462212131, + "grad_norm": 0.20969519019126892, + "learning_rate": 4.6425962194891705e-05, + "loss": 0.4288, "step": 27360 }, { - "epoch": 0.96, - "learning_rate": 4.661670343538243e-05, - "loss": 0.2964, + "epoch": 0.9862327458824377, + "grad_norm": 0.14550209045410156, + "learning_rate": 4.6424458473580486e-05, + "loss": 0.4714, "step": 27365 }, { - "epoch": 0.96, - "learning_rate": 4.6615272258354255e-05, - "loss": 0.3051, + "epoch": 0.9864129455436623, + "grad_norm": 0.18479301035404205, + "learning_rate": 4.6422954460365165e-05, + "loss": 0.4166, "step": 27370 }, { - "epoch": 0.96, - "learning_rate": 4.661384080066601e-05, - "loss": 0.3155, + "epoch": 0.9865931452048871, + "grad_norm": 0.15923672914505005, + "learning_rate": 4.642145015526624e-05, + "loss": 0.4384, "step": 27375 }, { - "epoch": 0.96, - "learning_rate": 4.661240906233629e-05, - "loss": 0.3184, + "epoch": 0.9867733448661117, + "grad_norm": 0.1702156960964203, + "learning_rate": 4.64199455583042e-05, + "loss": 0.427, "step": 27380 }, { - 
"epoch": 0.96, - "learning_rate": 4.661097704338368e-05, - "loss": 0.2967, + "epoch": 0.9869535445273363, + "grad_norm": 0.19491025805473328, + "learning_rate": 4.641844066949955e-05, + "loss": 0.4257, "step": 27385 }, { - "epoch": 0.96, - "learning_rate": 4.6609544743826775e-05, - "loss": 0.3054, + "epoch": 0.9871337441885609, + "grad_norm": 0.21191620826721191, + "learning_rate": 4.6416935488872806e-05, + "loss": 0.4846, "step": 27390 }, { - "epoch": 0.96, - "learning_rate": 4.6608112163684175e-05, - "loss": 0.3207, + "epoch": 0.9873139438497855, + "grad_norm": 0.1783130019903183, + "learning_rate": 4.6415430016444445e-05, + "loss": 0.4066, "step": 27395 }, { - "epoch": 0.96, - "learning_rate": 4.6606679302974474e-05, - "loss": 0.2907, + "epoch": 0.9874941435110102, + "grad_norm": 0.16882510483264923, + "learning_rate": 4.6413924252235006e-05, + "loss": 0.4118, "step": 27400 }, { - "epoch": 0.96, - "learning_rate": 4.660524616171629e-05, - "loss": 0.3062, + "epoch": 0.9876743431722348, + "grad_norm": 0.18911772966384888, + "learning_rate": 4.641241819626499e-05, + "loss": 0.4378, "step": 27405 }, { - "epoch": 0.96, - "learning_rate": 4.660381273992822e-05, - "loss": 0.2796, + "epoch": 0.9878545428334595, + "grad_norm": 0.15727469325065613, + "learning_rate": 4.641091184855492e-05, + "loss": 0.4031, "step": 27410 }, { - "epoch": 0.96, - "learning_rate": 4.660237903762887e-05, - "loss": 0.3121, + "epoch": 0.9880347424946841, + "grad_norm": 0.1802516132593155, + "learning_rate": 4.640940520912532e-05, + "loss": 0.4541, "step": 27415 }, { - "epoch": 0.96, - "learning_rate": 4.660094505483688e-05, - "loss": 0.2687, + "epoch": 0.9882149421559088, + "grad_norm": 0.18460440635681152, + "learning_rate": 4.640789827799673e-05, + "loss": 0.409, "step": 27420 }, { - "epoch": 0.96, - "learning_rate": 4.659951079157084e-05, - "loss": 0.319, + "epoch": 0.9883951418171334, + "grad_norm": 0.18090972304344177, + "learning_rate": 4.640639105518966e-05, + "loss": 0.4232, "step": 27425 }, { - "epoch": 0.97, - "learning_rate": 4.65980762478494e-05, - "loss": 0.3121, + "epoch": 0.988575341478358, + "grad_norm": 0.15177130699157715, + "learning_rate": 4.6404883540724665e-05, + "loss": 0.3694, "step": 27430 }, { - "epoch": 0.97, - "learning_rate": 4.659664142369116e-05, - "loss": 0.3029, + "epoch": 0.9887555411395826, + "grad_norm": 0.19808749854564667, + "learning_rate": 4.6403375734622265e-05, + "loss": 0.416, "step": 27435 }, { - "epoch": 0.97, - "learning_rate": 4.659520631911477e-05, - "loss": 0.2954, + "epoch": 0.9889357408008073, + "grad_norm": 0.18107527494430542, + "learning_rate": 4.640186763690302e-05, + "loss": 0.4578, "step": 27440 }, { - "epoch": 0.97, - "learning_rate": 4.659377093413886e-05, - "loss": 0.2916, + "epoch": 0.9891159404620319, + "grad_norm": 0.18067267537117004, + "learning_rate": 4.640035924758748e-05, + "loss": 0.4572, "step": 27445 }, { - "epoch": 0.97, - "learning_rate": 4.659233526878206e-05, - "loss": 0.2977, + "epoch": 0.9892961401232566, + "grad_norm": 0.14522598683834076, + "learning_rate": 4.6398850566696176e-05, + "loss": 0.4252, "step": 27450 }, { - "epoch": 0.97, - "learning_rate": 4.659089932306302e-05, - "loss": 0.2922, + "epoch": 0.9894763397844812, + "grad_norm": 0.1613176465034485, + "learning_rate": 4.6397341594249675e-05, + "loss": 0.3907, "step": 27455 }, { - "epoch": 0.97, - "learning_rate": 4.658946309700039e-05, - "loss": 0.3278, + "epoch": 0.9896565394457059, + "grad_norm": 0.1852157860994339, + "learning_rate": 4.639583233026855e-05, + "loss": 0.4324, 
"step": 27460 }, { - "epoch": 0.97, - "learning_rate": 4.658802659061279e-05, - "loss": 0.2658, + "epoch": 0.9898367391069305, + "grad_norm": 0.16514049470424652, + "learning_rate": 4.639432277477335e-05, + "loss": 0.4413, "step": 27465 }, { - "epoch": 0.97, - "learning_rate": 4.6586589803918905e-05, - "loss": 0.2951, + "epoch": 0.9900169387681551, + "grad_norm": 0.14694753289222717, + "learning_rate": 4.639281292778464e-05, + "loss": 0.4236, "step": 27470 }, { - "epoch": 0.97, - "learning_rate": 4.658515273693738e-05, - "loss": 0.3163, + "epoch": 0.9901971384293797, + "grad_norm": 0.18619796633720398, + "learning_rate": 4.639130278932299e-05, + "loss": 0.4442, "step": 27475 }, { - "epoch": 0.97, - "learning_rate": 4.658371538968686e-05, - "loss": 0.2748, + "epoch": 0.9903773380906044, + "grad_norm": 0.15744751691818237, + "learning_rate": 4.638979235940899e-05, + "loss": 0.4074, "step": 27480 }, { - "epoch": 0.97, - "learning_rate": 4.658227776218603e-05, - "loss": 0.3057, + "epoch": 0.990557537751829, + "grad_norm": 0.17762751877307892, + "learning_rate": 4.63882816380632e-05, + "loss": 0.4212, "step": 27485 }, { - "epoch": 0.97, - "learning_rate": 4.658083985445354e-05, - "loss": 0.2993, + "epoch": 0.9907377374130537, + "grad_norm": 0.18236143887043, + "learning_rate": 4.638677062530622e-05, + "loss": 0.4089, "step": 27490 }, { - "epoch": 0.97, - "learning_rate": 4.6579401666508074e-05, - "loss": 0.3059, + "epoch": 0.9909179370742783, + "grad_norm": 0.1682094931602478, + "learning_rate": 4.638525932115863e-05, + "loss": 0.4288, "step": 27495 }, { - "epoch": 0.97, - "learning_rate": 4.657796319836829e-05, - "loss": 0.3165, + "epoch": 0.991098136735503, + "grad_norm": 0.17078998684883118, + "learning_rate": 4.6383747725641027e-05, + "loss": 0.4328, "step": 27500 }, { - "epoch": 0.97, - "eval_loss": 0.293122798204422, - "eval_runtime": 10.5363, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 0.991098136735503, + "eval_loss": 0.45491889119148254, + "eval_runtime": 3.529, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 7.084, "step": 27500 }, { - "epoch": 0.97, - "learning_rate": 4.657652445005288e-05, - "loss": 0.2904, + "epoch": 0.9912783363967276, + "grad_norm": 0.185177743434906, + "learning_rate": 4.6382235838774e-05, + "loss": 0.4294, "step": 27505 }, { - "epoch": 0.97, - "learning_rate": 4.657508542158052e-05, - "loss": 0.2853, + "epoch": 0.9914585360579522, + "grad_norm": 0.1422509402036667, + "learning_rate": 4.6380723660578144e-05, + "loss": 0.3911, "step": 27510 }, { - "epoch": 0.97, - "learning_rate": 4.657364611296989e-05, - "loss": 0.2936, + "epoch": 0.9916387357191768, + "grad_norm": 0.1539120376110077, + "learning_rate": 4.6379211191074066e-05, + "loss": 0.4519, "step": 27515 }, { - "epoch": 0.97, - "learning_rate": 4.6572206524239686e-05, - "loss": 0.2737, + "epoch": 0.9918189353804014, + "grad_norm": 0.19037242233753204, + "learning_rate": 4.637769843028238e-05, + "loss": 0.4822, "step": 27520 }, { - "epoch": 0.97, - "learning_rate": 4.6570766655408594e-05, - "loss": 0.2966, + "epoch": 0.9919991350416261, + "grad_norm": 0.2115742713212967, + "learning_rate": 4.637618537822369e-05, + "loss": 0.4498, "step": 27525 }, { - "epoch": 0.97, - "learning_rate": 4.656932650649532e-05, - "loss": 0.3009, + "epoch": 0.9921793347028508, + "grad_norm": 0.17919567227363586, + "learning_rate": 4.6374672034918606e-05, + "loss": 0.4453, "step": 27530 }, { - "epoch": 0.97, - "learning_rate": 4.656788607751855e-05, - "loss": 0.3059, + "epoch": 
0.9923595343640754, + "grad_norm": 0.13095134496688843, + "learning_rate": 4.6373158400387775e-05, + "loss": 0.4289, "step": 27535 }, { - "epoch": 0.97, - "learning_rate": 4.656644536849699e-05, - "loss": 0.2924, + "epoch": 0.9925397340253, + "grad_norm": 0.18032823503017426, + "learning_rate": 4.6371644474651773e-05, + "loss": 0.4262, "step": 27540 }, { - "epoch": 0.97, - "learning_rate": 4.656500437944936e-05, - "loss": 0.3073, + "epoch": 0.9927199336865247, + "grad_norm": 0.14844289422035217, + "learning_rate": 4.637013025773127e-05, + "loss": 0.4254, "step": 27545 }, { - "epoch": 0.97, - "learning_rate": 4.656356311039435e-05, - "loss": 0.295, + "epoch": 0.9929001333477493, + "grad_norm": 0.182834193110466, + "learning_rate": 4.636861574964687e-05, + "loss": 0.4526, "step": 27550 }, { - "epoch": 0.97, - "learning_rate": 4.6562121561350686e-05, - "loss": 0.2916, + "epoch": 0.9930803330089739, + "grad_norm": 0.18387030065059662, + "learning_rate": 4.6367100950419226e-05, + "loss": 0.4377, "step": 27555 }, { - "epoch": 0.97, - "learning_rate": 4.656067973233709e-05, - "loss": 0.3131, + "epoch": 0.9932605326701985, + "grad_norm": 0.200217604637146, + "learning_rate": 4.636558586006896e-05, + "loss": 0.4289, "step": 27560 }, { - "epoch": 0.97, - "learning_rate": 4.6559237623372265e-05, - "loss": 0.2935, + "epoch": 0.9934407323314233, + "grad_norm": 0.16323430836200714, + "learning_rate": 4.636407047861673e-05, + "loss": 0.41, "step": 27565 }, { - "epoch": 0.97, - "learning_rate": 4.655779523447495e-05, - "loss": 0.3054, + "epoch": 0.9936209319926479, + "grad_norm": 0.14790096879005432, + "learning_rate": 4.6362554806083176e-05, + "loss": 0.4418, "step": 27570 }, { - "epoch": 0.97, - "learning_rate": 4.6556352565663885e-05, - "loss": 0.3058, + "epoch": 0.9938011316538725, + "grad_norm": 0.18364425003528595, + "learning_rate": 4.6361038842488944e-05, + "loss": 0.4132, "step": 27575 }, { - "epoch": 0.97, - "learning_rate": 4.6554909616957774e-05, - "loss": 0.2799, + "epoch": 0.9939813313150971, + "grad_norm": 0.15414080023765564, + "learning_rate": 4.63595225878547e-05, + "loss": 0.4203, "step": 27580 }, { - "epoch": 0.97, - "learning_rate": 4.6553466388375375e-05, - "loss": 0.3288, + "epoch": 0.9941615309763218, + "grad_norm": 0.17159409821033478, + "learning_rate": 4.635800604220109e-05, + "loss": 0.4208, "step": 27585 }, { - "epoch": 0.97, - "learning_rate": 4.655202287993541e-05, - "loss": 0.3051, + "epoch": 0.9943417306375464, + "grad_norm": 0.13989026844501495, + "learning_rate": 4.635648920554878e-05, + "loss": 0.4302, "step": 27590 }, { - "epoch": 0.97, - "learning_rate": 4.6550579091656646e-05, - "loss": 0.2936, + "epoch": 0.994521930298771, + "grad_norm": 0.19220641255378723, + "learning_rate": 4.635497207791845e-05, + "loss": 0.4385, "step": 27595 }, { - "epoch": 0.97, - "learning_rate": 4.65491350235578e-05, - "loss": 0.3018, + "epoch": 0.9947021299599956, + "grad_norm": 0.20542806386947632, + "learning_rate": 4.635345465933075e-05, + "loss": 0.4185, "step": 27600 }, { - "epoch": 0.97, - "learning_rate": 4.654769067565764e-05, - "loss": 0.29, + "epoch": 0.9948823296212204, + "grad_norm": 0.15193575620651245, + "learning_rate": 4.635193694980636e-05, + "loss": 0.4152, "step": 27605 }, { - "epoch": 0.97, - "learning_rate": 4.654624604797492e-05, - "loss": 0.2886, + "epoch": 0.995062529282445, + "grad_norm": 0.174140065908432, + "learning_rate": 4.635041894936598e-05, + "loss": 0.4468, "step": 27610 }, { - "epoch": 0.97, - "learning_rate": 4.65448011405284e-05, - "loss": 0.3219, + 
"epoch": 0.9952427289436696, + "grad_norm": 0.18024513125419617, + "learning_rate": 4.6348900658030263e-05, + "loss": 0.4326, "step": 27615 }, { - "epoch": 0.97, - "learning_rate": 4.654335595333683e-05, - "loss": 0.2825, + "epoch": 0.9954229286048942, + "grad_norm": 0.18026669323444366, + "learning_rate": 4.634738207581991e-05, + "loss": 0.4329, "step": 27620 }, { - "epoch": 0.97, - "learning_rate": 4.654191048641898e-05, - "loss": 0.2895, + "epoch": 0.9956031282661189, + "grad_norm": 0.15884456038475037, + "learning_rate": 4.634586320275561e-05, + "loss": 0.4202, "step": 27625 }, { - "epoch": 0.97, - "learning_rate": 4.6540464739793624e-05, - "loss": 0.3029, + "epoch": 0.9957833279273435, + "grad_norm": 0.1757356971502304, + "learning_rate": 4.634434403885805e-05, + "loss": 0.4291, "step": 27630 }, { - "epoch": 0.97, - "learning_rate": 4.653901871347952e-05, - "loss": 0.3063, + "epoch": 0.9959635275885681, + "grad_norm": 0.16361142694950104, + "learning_rate": 4.634282458414795e-05, + "loss": 0.4216, "step": 27635 }, { - "epoch": 0.97, - "learning_rate": 4.653757240749546e-05, - "loss": 0.3149, + "epoch": 0.9961437272497927, + "grad_norm": 0.21342548727989197, + "learning_rate": 4.634130483864598e-05, + "loss": 0.4669, "step": 27640 }, { - "epoch": 0.97, - "learning_rate": 4.653612582186021e-05, - "loss": 0.3045, + "epoch": 0.9963239269110175, + "grad_norm": 0.2188446968793869, + "learning_rate": 4.6339784802372874e-05, + "loss": 0.4772, "step": 27645 }, { - "epoch": 0.97, - "learning_rate": 4.653467895659256e-05, - "loss": 0.2987, + "epoch": 0.9965041265722421, + "grad_norm": 0.1779574155807495, + "learning_rate": 4.633826447534934e-05, + "loss": 0.4633, "step": 27650 }, { - "epoch": 0.97, - "learning_rate": 4.6533231811711296e-05, - "loss": 0.3002, + "epoch": 0.9966843262334667, + "grad_norm": 0.19599558413028717, + "learning_rate": 4.633674385759607e-05, + "loss": 0.4405, "step": 27655 }, { - "epoch": 0.97, - "learning_rate": 4.653178438723521e-05, - "loss": 0.3108, + "epoch": 0.9968645258946913, + "grad_norm": 0.15029726922512054, + "learning_rate": 4.6335222949133794e-05, + "loss": 0.4461, "step": 27660 }, { - "epoch": 0.97, - "learning_rate": 4.653033668318309e-05, - "loss": 0.2842, + "epoch": 0.997044725555916, + "grad_norm": 0.17829090356826782, + "learning_rate": 4.633370174998324e-05, + "loss": 0.4567, "step": 27665 }, { - "epoch": 0.97, - "learning_rate": 4.652888869957375e-05, - "loss": 0.2661, + "epoch": 0.9972249252171406, + "grad_norm": 0.1724914163351059, + "learning_rate": 4.6332180260165135e-05, + "loss": 0.4121, "step": 27670 }, { - "epoch": 0.97, - "learning_rate": 4.652744043642597e-05, - "loss": 0.312, + "epoch": 0.9974051248783652, + "grad_norm": 0.18331369757652283, + "learning_rate": 4.63306584797002e-05, + "loss": 0.4185, "step": 27675 }, { - "epoch": 0.97, - "learning_rate": 4.652599189375856e-05, - "loss": 0.3114, + "epoch": 0.9975853245395898, + "grad_norm": 0.2127685546875, + "learning_rate": 4.632913640860918e-05, + "loss": 0.465, "step": 27680 }, { - "epoch": 0.97, - "learning_rate": 4.652454307159033e-05, - "loss": 0.3029, + "epoch": 0.9977655242008145, + "grad_norm": 0.18392300605773926, + "learning_rate": 4.6327614046912796e-05, + "loss": 0.4276, "step": 27685 }, { - "epoch": 0.97, - "learning_rate": 4.65230939699401e-05, - "loss": 0.326, + "epoch": 0.9979457238620392, + "grad_norm": 0.18069952726364136, + "learning_rate": 4.63260913946318e-05, + "loss": 0.4089, "step": 27690 }, { - "epoch": 0.97, - "learning_rate": 4.6521644588826675e-05, - "loss": 
0.3124, + "epoch": 0.9981259235232638, + "grad_norm": 0.17359404265880585, + "learning_rate": 4.632456845178694e-05, + "loss": 0.4294, "step": 27695 }, { - "epoch": 0.97, - "learning_rate": 4.652019492826888e-05, - "loss": 0.2903, + "epoch": 0.9983061231844884, + "grad_norm": 0.1843539923429489, + "learning_rate": 4.632304521839896e-05, + "loss": 0.4795, "step": 27700 }, { - "epoch": 0.97, - "learning_rate": 4.6518744988285536e-05, - "loss": 0.3094, + "epoch": 0.998486322845713, + "grad_norm": 0.17044410109519958, + "learning_rate": 4.6321521694488627e-05, + "loss": 0.4345, "step": 27705 }, { - "epoch": 0.97, - "learning_rate": 4.651729476889547e-05, - "loss": 0.2816, + "epoch": 0.9986665225069377, + "grad_norm": 0.24153609573841095, + "learning_rate": 4.631999788007668e-05, + "loss": 0.437, "step": 27710 }, { - "epoch": 0.98, - "learning_rate": 4.651584427011751e-05, - "loss": 0.3151, + "epoch": 0.9988467221681623, + "grad_norm": 0.16202819347381592, + "learning_rate": 4.631847377518389e-05, + "loss": 0.4272, "step": 27715 }, { - "epoch": 0.98, - "learning_rate": 4.6514393491970496e-05, - "loss": 0.291, + "epoch": 0.999026921829387, + "grad_norm": 0.16928540170192719, + "learning_rate": 4.6316949379831025e-05, + "loss": 0.4584, "step": 27720 }, { - "epoch": 0.98, - "learning_rate": 4.651294243447326e-05, - "loss": 0.2991, + "epoch": 0.9992071214906116, + "grad_norm": 0.20613226294517517, + "learning_rate": 4.6315424694038854e-05, + "loss": 0.4415, "step": 27725 }, { - "epoch": 0.98, - "learning_rate": 4.651149109764464e-05, - "loss": 0.301, + "epoch": 0.9993873211518363, + "grad_norm": 0.17505665123462677, + "learning_rate": 4.631389971782815e-05, + "loss": 0.4521, "step": 27730 }, { - "epoch": 0.98, - "learning_rate": 4.651003948150349e-05, - "loss": 0.2971, + "epoch": 0.9995675208130609, + "grad_norm": 0.18743610382080078, + "learning_rate": 4.631237445121968e-05, + "loss": 0.4555, "step": 27735 }, { - "epoch": 0.98, - "learning_rate": 4.6508587586068644e-05, - "loss": 0.2991, + "epoch": 0.9997477204742855, + "grad_norm": 0.17451363801956177, + "learning_rate": 4.631084889423424e-05, + "loss": 0.4042, "step": 27740 }, { - "epoch": 0.98, - "learning_rate": 4.650713541135897e-05, - "loss": 0.3036, + "epoch": 0.9999279201355101, + "grad_norm": 0.17138254642486572, + "learning_rate": 4.630932304689261e-05, + "loss": 0.411, "step": 27745 }, { - "epoch": 0.98, - "learning_rate": 4.650568295739331e-05, - "loss": 0.3455, + "epoch": 1.0001081197967348, + "grad_norm": 0.18573379516601562, + "learning_rate": 4.6307796909215574e-05, + "loss": 0.4382, "step": 27750 }, { - "epoch": 0.98, - "learning_rate": 4.650423022419054e-05, - "loss": 0.3175, + "epoch": 1.0002883194579595, + "grad_norm": 0.18925435841083527, + "learning_rate": 4.630627048122393e-05, + "loss": 0.4086, "step": 27755 }, { - "epoch": 0.98, - "learning_rate": 4.650277721176951e-05, - "loss": 0.3293, + "epoch": 1.000468519119184, + "grad_norm": 0.15313346683979034, + "learning_rate": 4.630474376293849e-05, + "loss": 0.435, "step": 27760 }, { - "epoch": 0.98, - "learning_rate": 4.6501323920149086e-05, - "loss": 0.2894, + "epoch": 1.0006487187804087, + "grad_norm": 0.16638581454753876, + "learning_rate": 4.630321675438002e-05, + "loss": 0.3956, "step": 27765 }, { - "epoch": 0.98, - "learning_rate": 4.649987034934814e-05, - "loss": 0.3213, + "epoch": 1.0008289184416332, + "grad_norm": 0.23420801758766174, + "learning_rate": 4.630168945556937e-05, + "loss": 0.4247, "step": 27770 }, { - "epoch": 0.98, - "learning_rate": 
4.6498416499385546e-05, - "loss": 0.2862, + "epoch": 1.001009118102858, + "grad_norm": 0.19458545744419098, + "learning_rate": 4.63001618665273e-05, + "loss": 0.4022, "step": 27775 }, { - "epoch": 0.98, - "learning_rate": 4.6496962370280183e-05, - "loss": 0.3092, + "epoch": 1.0011893177640827, + "grad_norm": 0.15574969351291656, + "learning_rate": 4.629863398727466e-05, + "loss": 0.4313, "step": 27780 }, { - "epoch": 0.98, - "learning_rate": 4.6495507962050936e-05, - "loss": 0.2987, + "epoch": 1.0013695174253072, + "grad_norm": 0.14644834399223328, + "learning_rate": 4.629710581783226e-05, + "loss": 0.4278, "step": 27785 }, { - "epoch": 0.98, - "learning_rate": 4.649405327471667e-05, - "loss": 0.3139, + "epoch": 1.001549717086532, + "grad_norm": 0.17037297785282135, + "learning_rate": 4.6295577358220914e-05, + "loss": 0.4216, "step": 27790 }, { - "epoch": 0.98, - "learning_rate": 4.64925983082963e-05, - "loss": 0.2881, + "epoch": 1.0017299167477565, + "grad_norm": 0.18147750198841095, + "learning_rate": 4.629404860846145e-05, + "loss": 0.4044, "step": 27795 }, { - "epoch": 0.98, - "learning_rate": 4.6491143062808696e-05, - "loss": 0.3101, + "epoch": 1.0019101164089812, + "grad_norm": 0.17213241755962372, + "learning_rate": 4.629251956857469e-05, + "loss": 0.4361, "step": 27800 }, { - "epoch": 0.98, - "learning_rate": 4.648968753827277e-05, - "loss": 0.291, + "epoch": 1.0020903160702057, + "grad_norm": 0.16381776332855225, + "learning_rate": 4.629099023858148e-05, + "loss": 0.4295, "step": 27805 }, { - "epoch": 0.98, - "learning_rate": 4.648823173470741e-05, - "loss": 0.2974, + "epoch": 1.0022705157314304, + "grad_norm": 0.19637686014175415, + "learning_rate": 4.628946061850265e-05, + "loss": 0.4377, "step": 27810 }, { - "epoch": 0.98, - "learning_rate": 4.648677565213152e-05, - "loss": 0.3195, + "epoch": 1.002450715392655, + "grad_norm": 0.19968008995056152, + "learning_rate": 4.628793070835904e-05, + "loss": 0.406, "step": 27815 }, { - "epoch": 0.98, - "learning_rate": 4.648531929056401e-05, - "loss": 0.2996, + "epoch": 1.0026309150538797, + "grad_norm": 0.19794724881649017, + "learning_rate": 4.628640050817149e-05, + "loss": 0.4257, "step": 27820 }, { - "epoch": 0.98, - "learning_rate": 4.64838626500238e-05, - "loss": 0.2848, + "epoch": 1.0028111147151044, + "grad_norm": 0.14951010048389435, + "learning_rate": 4.628487001796086e-05, + "loss": 0.4467, "step": 27825 }, { - "epoch": 0.98, - "learning_rate": 4.6482405730529773e-05, - "loss": 0.3081, + "epoch": 1.002991314376329, + "grad_norm": 0.169041708111763, + "learning_rate": 4.628333923774799e-05, + "loss": 0.4444, "step": 27830 }, { - "epoch": 0.98, - "learning_rate": 4.6480948532100875e-05, - "loss": 0.3071, + "epoch": 1.0031715140375537, + "grad_norm": 0.1682281494140625, + "learning_rate": 4.628180816755375e-05, + "loss": 0.4206, "step": 27835 }, { - "epoch": 0.98, - "learning_rate": 4.647949105475602e-05, - "loss": 0.3182, + "epoch": 1.0033517136987782, + "grad_norm": 0.18344461917877197, + "learning_rate": 4.6280276807398994e-05, + "loss": 0.4229, "step": 27840 }, { - "epoch": 0.98, - "learning_rate": 4.6478033298514124e-05, - "loss": 0.3004, + "epoch": 1.003531913360003, + "grad_norm": 0.14406928420066833, + "learning_rate": 4.627874515730459e-05, + "loss": 0.4192, "step": 27845 }, { - "epoch": 0.98, - "learning_rate": 4.6476575263394124e-05, - "loss": 0.2944, + "epoch": 1.0037121130212274, + "grad_norm": 0.1358763873577118, + "learning_rate": 4.6277213217291395e-05, + "loss": 0.4126, "step": 27850 }, { - "epoch": 0.98, - 
"learning_rate": 4.647511694941494e-05, - "loss": 0.305, + "epoch": 1.0038923126824522, + "grad_norm": 0.15834762156009674, + "learning_rate": 4.6275680987380296e-05, + "loss": 0.4679, "step": 27855 }, { - "epoch": 0.98, - "learning_rate": 4.6473658356595525e-05, - "loss": 0.2991, + "epoch": 1.004072512343677, + "grad_norm": 0.1732066124677658, + "learning_rate": 4.627414846759216e-05, + "loss": 0.3903, "step": 27860 }, { - "epoch": 0.98, - "learning_rate": 4.6472199484954805e-05, - "loss": 0.2867, + "epoch": 1.0042527120049014, + "grad_norm": 0.17274266481399536, + "learning_rate": 4.627261565794787e-05, + "loss": 0.4179, "step": 27865 }, { - "epoch": 0.98, - "learning_rate": 4.647074033451172e-05, - "loss": 0.3062, + "epoch": 1.0044329116661261, + "grad_norm": 0.1775139421224594, + "learning_rate": 4.6271082558468306e-05, + "loss": 0.4045, "step": 27870 }, { - "epoch": 0.98, - "learning_rate": 4.6469280905285224e-05, - "loss": 0.2917, + "epoch": 1.0046131113273506, + "grad_norm": 0.17623652517795563, + "learning_rate": 4.626954916917436e-05, + "loss": 0.3981, "step": 27875 }, { - "epoch": 0.98, - "learning_rate": 4.646782119729426e-05, - "loss": 0.3011, + "epoch": 1.0047933109885754, + "grad_norm": 0.18118661642074585, + "learning_rate": 4.626801549008693e-05, + "loss": 0.432, "step": 27880 }, { - "epoch": 0.98, - "learning_rate": 4.6466361210557794e-05, - "loss": 0.2834, + "epoch": 1.0049735106498, + "grad_norm": 0.1708979606628418, + "learning_rate": 4.6266481521226904e-05, + "loss": 0.4477, "step": 27885 }, { - "epoch": 0.98, - "learning_rate": 4.646490094509477e-05, - "loss": 0.3241, + "epoch": 1.0051537103110246, + "grad_norm": 0.17798757553100586, + "learning_rate": 4.6264947262615186e-05, + "loss": 0.4128, "step": 27890 }, { - "epoch": 0.98, - "learning_rate": 4.6463440400924156e-05, - "loss": 0.3104, + "epoch": 1.0053339099722494, + "grad_norm": 0.16669988632202148, + "learning_rate": 4.626341271427268e-05, + "loss": 0.4202, "step": 27895 }, { - "epoch": 0.98, - "learning_rate": 4.646197957806491e-05, - "loss": 0.3255, + "epoch": 1.0055141096334739, + "grad_norm": 0.1673901528120041, + "learning_rate": 4.626187787622029e-05, + "loss": 0.4245, "step": 27900 }, { - "epoch": 0.98, - "learning_rate": 4.646051847653601e-05, - "loss": 0.3279, + "epoch": 1.0056943092946986, + "grad_norm": 0.1705399453639984, + "learning_rate": 4.626034274847893e-05, + "loss": 0.4076, "step": 27905 }, { - "epoch": 0.98, - "learning_rate": 4.6459057096356414e-05, - "loss": 0.2838, + "epoch": 1.0058745089559231, + "grad_norm": 0.17363837361335754, + "learning_rate": 4.625880733106951e-05, + "loss": 0.4292, "step": 27910 }, { - "epoch": 0.98, - "learning_rate": 4.64575954375451e-05, - "loss": 0.3108, + "epoch": 1.0060547086171479, + "grad_norm": 0.15469084680080414, + "learning_rate": 4.625727162401296e-05, + "loss": 0.3902, "step": 27915 }, { - "epoch": 0.98, - "learning_rate": 4.645613350012106e-05, - "loss": 0.3067, + "epoch": 1.0062349082783724, + "grad_norm": 0.1932782232761383, + "learning_rate": 4.625573562733021e-05, + "loss": 0.4187, "step": 27920 }, { - "epoch": 0.98, - "learning_rate": 4.6454671284103255e-05, - "loss": 0.3262, + "epoch": 1.006415107939597, + "grad_norm": 0.2232416868209839, + "learning_rate": 4.625419934104217e-05, + "loss": 0.459, "step": 27925 }, { - "epoch": 0.98, - "learning_rate": 4.6453208789510695e-05, - "loss": 0.3154, + "epoch": 1.0065953076008216, + "grad_norm": 0.1562723070383072, + "learning_rate": 4.625266276516978e-05, + "loss": 0.42, "step": 27930 }, { - "epoch": 
0.98, - "learning_rate": 4.645174601636235e-05, - "loss": 0.3076, + "epoch": 1.0067755072620463, + "grad_norm": 0.16613322496414185, + "learning_rate": 4.625112589973397e-05, + "loss": 0.4205, "step": 27935 }, { - "epoch": 0.98, - "learning_rate": 4.645028296467722e-05, - "loss": 0.3014, + "epoch": 1.006955706923271, + "grad_norm": 0.15974131226539612, + "learning_rate": 4.624958874475569e-05, + "loss": 0.4098, "step": 27940 }, { - "epoch": 0.98, - "learning_rate": 4.6448819634474305e-05, - "loss": 0.279, + "epoch": 1.0071359065844956, + "grad_norm": 0.12717698514461517, + "learning_rate": 4.624805130025588e-05, + "loss": 0.4076, "step": 27945 }, { - "epoch": 0.98, - "learning_rate": 4.64473560257726e-05, - "loss": 0.2873, + "epoch": 1.0073161062457203, + "grad_norm": 0.20461279153823853, + "learning_rate": 4.624651356625548e-05, + "loss": 0.438, "step": 27950 }, { - "epoch": 0.98, - "learning_rate": 4.644589213859112e-05, - "loss": 0.2604, + "epoch": 1.0074963059069448, + "grad_norm": 0.16563838720321655, + "learning_rate": 4.624497554277544e-05, + "loss": 0.3795, "step": 27955 }, { - "epoch": 0.98, - "learning_rate": 4.644442797294886e-05, - "loss": 0.3031, + "epoch": 1.0076765055681696, + "grad_norm": 0.1689777672290802, + "learning_rate": 4.6243437229836725e-05, + "loss": 0.425, "step": 27960 }, { - "epoch": 0.98, - "learning_rate": 4.644296352886482e-05, - "loss": 0.3184, + "epoch": 1.007856705229394, + "grad_norm": 0.1565127670764923, + "learning_rate": 4.624189862746029e-05, + "loss": 0.3809, "step": 27965 }, { - "epoch": 0.98, - "learning_rate": 4.644149880635804e-05, - "loss": 0.3232, + "epoch": 1.0080369048906188, + "grad_norm": 0.17497403919696808, + "learning_rate": 4.62403597356671e-05, + "loss": 0.4502, "step": 27970 }, { - "epoch": 0.98, - "learning_rate": 4.644003380544754e-05, - "loss": 0.3124, + "epoch": 1.0082171045518435, + "grad_norm": 0.1647554188966751, + "learning_rate": 4.623882055447813e-05, + "loss": 0.4453, "step": 27975 }, { - "epoch": 0.98, - "learning_rate": 4.6438568526152315e-05, - "loss": 0.281, + "epoch": 1.008397304213068, + "grad_norm": 0.18609805405139923, + "learning_rate": 4.623728108391433e-05, + "loss": 0.4263, "step": 27980 }, { - "epoch": 0.98, - "learning_rate": 4.6437102968491416e-05, - "loss": 0.3167, + "epoch": 1.0085775038742928, + "grad_norm": 0.15984831750392914, + "learning_rate": 4.6235741323996696e-05, + "loss": 0.4564, "step": 27985 }, { - "epoch": 0.98, - "learning_rate": 4.6435637132483855e-05, - "loss": 0.2948, + "epoch": 1.0087577035355173, + "grad_norm": 0.1694299876689911, + "learning_rate": 4.623420127474619e-05, + "loss": 0.4075, "step": 27990 }, { - "epoch": 0.98, - "learning_rate": 4.6434171018148674e-05, - "loss": 0.2974, + "epoch": 1.008937903196742, + "grad_norm": 0.18183918297290802, + "learning_rate": 4.62326609361838e-05, + "loss": 0.3977, "step": 27995 }, { - "epoch": 0.99, - "learning_rate": 4.6432704625504915e-05, - "loss": 0.31, + "epoch": 1.0091181028579665, + "grad_norm": 0.18939772248268127, + "learning_rate": 4.623112030833052e-05, + "loss": 0.4101, "step": 28000 }, { - "epoch": 0.99, - "eval_loss": 0.2925254702568054, - "eval_runtime": 10.5771, - "eval_samples_per_second": 9.454, - "eval_steps_per_second": 9.454, + "epoch": 1.0091181028579665, + "eval_loss": 0.4545483887195587, + "eval_runtime": 3.5395, + "eval_samples_per_second": 28.252, + "eval_steps_per_second": 7.063, "step": 28000 }, { - "epoch": 0.99, - "learning_rate": 4.6431237954571605e-05, - "loss": 0.3, + "epoch": 1.0092983025191913, + "grad_norm": 
0.15189491212368011, + "learning_rate": 4.622957939120734e-05, + "loss": 0.416, "step": 28005 }, { - "epoch": 0.99, - "learning_rate": 4.64297710053678e-05, - "loss": 0.3034, + "epoch": 1.0094785021804158, + "grad_norm": 0.1771238148212433, + "learning_rate": 4.6228038184835256e-05, + "loss": 0.4021, "step": 28010 }, { - "epoch": 0.99, - "learning_rate": 4.642830377791253e-05, - "loss": 0.2771, + "epoch": 1.0096587018416405, + "grad_norm": 0.18271222710609436, + "learning_rate": 4.622649668923525e-05, + "loss": 0.4491, "step": 28015 }, { - "epoch": 0.99, - "learning_rate": 4.642683627222486e-05, - "loss": 0.2768, + "epoch": 1.0098389015028653, + "grad_norm": 0.19635730981826782, + "learning_rate": 4.622495490442834e-05, + "loss": 0.408, "step": 28020 }, { - "epoch": 0.99, - "learning_rate": 4.642536848832384e-05, - "loss": 0.2962, + "epoch": 1.0100191011640898, + "grad_norm": 0.1971067190170288, + "learning_rate": 4.622341283043553e-05, + "loss": 0.4363, "step": 28025 }, { - "epoch": 0.99, - "learning_rate": 4.642390042622854e-05, - "loss": 0.2933, + "epoch": 1.0101993008253145, + "grad_norm": 0.1596841812133789, + "learning_rate": 4.622187046727783e-05, + "loss": 0.4135, "step": 28030 }, { - "epoch": 0.99, - "learning_rate": 4.6422432085958014e-05, - "loss": 0.282, + "epoch": 1.010379500486539, + "grad_norm": 0.20057372748851776, + "learning_rate": 4.622032781497625e-05, + "loss": 0.4142, "step": 28035 }, { - "epoch": 0.99, - "learning_rate": 4.642096346753131e-05, - "loss": 0.3212, + "epoch": 1.0105597001477638, + "grad_norm": 0.21350523829460144, + "learning_rate": 4.6218784873551816e-05, + "loss": 0.4545, "step": 28040 }, { - "epoch": 0.99, - "learning_rate": 4.6419494570967524e-05, - "loss": 0.2929, + "epoch": 1.0107398998089883, + "grad_norm": 0.21572305262088776, + "learning_rate": 4.621724164302554e-05, + "loss": 0.4428, "step": 28045 }, { - "epoch": 0.99, - "learning_rate": 4.6418025396285716e-05, - "loss": 0.291, + "epoch": 1.010920099470213, + "grad_norm": 0.15721938014030457, + "learning_rate": 4.621569812341846e-05, + "loss": 0.4368, "step": 28050 }, { - "epoch": 0.99, - "learning_rate": 4.641655594350496e-05, - "loss": 0.2882, + "epoch": 1.0111002991314377, + "grad_norm": 0.18772733211517334, + "learning_rate": 4.62141543147516e-05, + "loss": 0.4255, "step": 28055 }, { - "epoch": 0.99, - "learning_rate": 4.641508621264434e-05, - "loss": 0.3456, + "epoch": 1.0112804987926622, + "grad_norm": 0.15941715240478516, + "learning_rate": 4.6212610217045985e-05, + "loss": 0.4191, "step": 28060 }, { - "epoch": 0.99, - "learning_rate": 4.641361620372294e-05, - "loss": 0.2915, + "epoch": 1.011460698453887, + "grad_norm": 0.191365584731102, + "learning_rate": 4.6211065830322674e-05, + "loss": 0.4096, "step": 28065 }, { - "epoch": 0.99, - "learning_rate": 4.6412145916759844e-05, - "loss": 0.3201, + "epoch": 1.0116408981151115, + "grad_norm": 0.15204831957817078, + "learning_rate": 4.62095211546027e-05, + "loss": 0.4178, "step": 28070 }, { - "epoch": 0.99, - "learning_rate": 4.641067535177414e-05, - "loss": 0.3373, + "epoch": 1.0118210977763362, + "grad_norm": 0.1380978375673294, + "learning_rate": 4.62079761899071e-05, + "loss": 0.3991, "step": 28075 }, { - "epoch": 0.99, - "learning_rate": 4.640920450878493e-05, - "loss": 0.303, + "epoch": 1.0120012974375607, + "grad_norm": 0.1999700665473938, + "learning_rate": 4.6206430936256925e-05, + "loss": 0.4427, "step": 28080 }, { - "epoch": 0.99, - "learning_rate": 4.6407733387811314e-05, - "loss": 0.278, + "epoch": 1.0121814970987855, + 
"grad_norm": 0.16516053676605225, + "learning_rate": 4.620488539367325e-05, + "loss": 0.4186, "step": 28085 }, { - "epoch": 0.99, - "learning_rate": 4.640626198887238e-05, - "loss": 0.2931, + "epoch": 1.0123616967600102, + "grad_norm": 0.1769554764032364, + "learning_rate": 4.62033395621771e-05, + "loss": 0.3821, "step": 28090 }, { - "epoch": 0.99, - "learning_rate": 4.6404790311987245e-05, - "loss": 0.2942, + "epoch": 1.0125418964212347, + "grad_norm": 0.15776276588439941, + "learning_rate": 4.6201793441789566e-05, + "loss": 0.4336, "step": 28095 }, { - "epoch": 0.99, - "learning_rate": 4.6403318357175016e-05, - "loss": 0.2935, + "epoch": 1.0127220960824594, + "grad_norm": 0.16874942183494568, + "learning_rate": 4.620024703253169e-05, + "loss": 0.4315, "step": 28100 }, { - "epoch": 0.99, - "learning_rate": 4.640184612445481e-05, - "loss": 0.2873, + "epoch": 1.012902295743684, + "grad_norm": 0.18879170715808868, + "learning_rate": 4.619870033442456e-05, + "loss": 0.4295, "step": 28105 }, { - "epoch": 0.99, - "learning_rate": 4.640037361384573e-05, - "loss": 0.3022, + "epoch": 1.0130824954049087, + "grad_norm": 0.17838184535503387, + "learning_rate": 4.619715334748924e-05, + "loss": 0.3818, "step": 28110 }, { - "epoch": 0.99, - "learning_rate": 4.6398900825366895e-05, - "loss": 0.2989, + "epoch": 1.0132626950661332, + "grad_norm": 0.1531950831413269, + "learning_rate": 4.619560607174681e-05, + "loss": 0.4307, "step": 28115 }, { - "epoch": 0.99, - "learning_rate": 4.6397427759037437e-05, - "loss": 0.3209, + "epoch": 1.013442894727358, + "grad_norm": 0.15358266234397888, + "learning_rate": 4.619405850721835e-05, + "loss": 0.3952, "step": 28120 }, { - "epoch": 0.99, - "learning_rate": 4.639595441487649e-05, - "loss": 0.312, + "epoch": 1.0136230943885824, + "grad_norm": 0.16065557301044464, + "learning_rate": 4.6192510653924956e-05, + "loss": 0.4233, "step": 28125 }, { - "epoch": 0.99, - "learning_rate": 4.639448079290317e-05, - "loss": 0.2821, + "epoch": 1.0138032940498072, + "grad_norm": 0.2141285538673401, + "learning_rate": 4.6190962511887694e-05, + "loss": 0.3981, "step": 28130 }, { - "epoch": 0.99, - "learning_rate": 4.6393006893136616e-05, - "loss": 0.2885, + "epoch": 1.013983493711032, + "grad_norm": 0.23794370889663696, + "learning_rate": 4.618941408112768e-05, + "loss": 0.3997, "step": 28135 }, { - "epoch": 0.99, - "learning_rate": 4.6391532715595964e-05, - "loss": 0.2991, + "epoch": 1.0141636933722564, + "grad_norm": 0.17404253780841827, + "learning_rate": 4.6187865361665995e-05, + "loss": 0.4309, "step": 28140 }, { - "epoch": 0.99, - "learning_rate": 4.639005826030035e-05, - "loss": 0.326, + "epoch": 1.0143438930334812, + "grad_norm": 0.1899445503950119, + "learning_rate": 4.618631635352375e-05, + "loss": 0.4306, "step": 28145 }, { - "epoch": 0.99, - "learning_rate": 4.638858352726895e-05, - "loss": 0.2807, + "epoch": 1.0145240926947057, + "grad_norm": 0.14172546565532684, + "learning_rate": 4.6184767056722044e-05, + "loss": 0.4384, "step": 28150 }, { - "epoch": 0.99, - "learning_rate": 4.638710851652087e-05, - "loss": 0.2922, + "epoch": 1.0147042923559304, + "grad_norm": 0.18066340684890747, + "learning_rate": 4.618321747128199e-05, + "loss": 0.4248, "step": 28155 }, { - "epoch": 0.99, - "learning_rate": 4.638563322807528e-05, - "loss": 0.2952, + "epoch": 1.014884492017155, + "grad_norm": 0.18051232397556305, + "learning_rate": 4.61816675972247e-05, + "loss": 0.4058, "step": 28160 }, { - "epoch": 0.99, - "learning_rate": 4.638415766195135e-05, - "loss": 0.2912, + "epoch": 
1.0150646916783796, + "grad_norm": 0.21809528768062592, + "learning_rate": 4.6180117434571285e-05, + "loss": 0.3864, "step": 28165 }, { - "epoch": 0.99, - "learning_rate": 4.638268181816821e-05, - "loss": 0.285, + "epoch": 1.0152448913396044, + "grad_norm": 0.1755502074956894, + "learning_rate": 4.6178566983342875e-05, + "loss": 0.411, "step": 28170 }, { - "epoch": 0.99, - "learning_rate": 4.6381205696745046e-05, - "loss": 0.2991, + "epoch": 1.015425091000829, + "grad_norm": 0.2000172734260559, + "learning_rate": 4.617701624356059e-05, + "loss": 0.4297, "step": 28175 }, { - "epoch": 0.99, - "learning_rate": 4.637972929770101e-05, - "loss": 0.3202, + "epoch": 1.0156052906620536, + "grad_norm": 0.15490785241127014, + "learning_rate": 4.6175465215245556e-05, + "loss": 0.4408, "step": 28180 }, { - "epoch": 0.99, - "learning_rate": 4.637825262105528e-05, - "loss": 0.2775, + "epoch": 1.0157854903232781, + "grad_norm": 0.1819140464067459, + "learning_rate": 4.617391389841891e-05, + "loss": 0.3964, "step": 28185 }, { - "epoch": 0.99, - "learning_rate": 4.6376775666827045e-05, - "loss": 0.3135, + "epoch": 1.0159656899845029, + "grad_norm": 0.1457093507051468, + "learning_rate": 4.617236229310179e-05, + "loss": 0.3747, "step": 28190 }, { - "epoch": 0.99, - "learning_rate": 4.6375298435035455e-05, - "loss": 0.3039, + "epoch": 1.0161458896457274, + "grad_norm": 0.18323257565498352, + "learning_rate": 4.617081039931532e-05, + "loss": 0.4372, "step": 28195 }, { - "epoch": 0.99, - "learning_rate": 4.63738209256997e-05, - "loss": 0.3044, + "epoch": 1.0163260893069521, + "grad_norm": 0.16842052340507507, + "learning_rate": 4.616925821708067e-05, + "loss": 0.4285, "step": 28200 }, { - "epoch": 0.99, - "learning_rate": 4.637234313883896e-05, - "loss": 0.3249, + "epoch": 1.0165062889681766, + "grad_norm": 0.1632702648639679, + "learning_rate": 4.6167705746418974e-05, + "loss": 0.395, "step": 28205 }, { - "epoch": 0.99, - "learning_rate": 4.6370865074472435e-05, - "loss": 0.3282, + "epoch": 1.0166864886294014, + "grad_norm": 0.15500997006893158, + "learning_rate": 4.616615298735138e-05, + "loss": 0.4169, "step": 28210 }, { - "epoch": 0.99, - "learning_rate": 4.636938673261931e-05, - "loss": 0.3199, + "epoch": 1.016866688290626, + "grad_norm": 0.18930770456790924, + "learning_rate": 4.616459993989906e-05, + "loss": 0.4304, "step": 28215 }, { - "epoch": 0.99, - "learning_rate": 4.6367908113298784e-05, - "loss": 0.2861, + "epoch": 1.0170468879518506, + "grad_norm": 0.19972793757915497, + "learning_rate": 4.616304660408315e-05, + "loss": 0.4025, "step": 28220 }, { - "epoch": 0.99, - "learning_rate": 4.636642921653006e-05, - "loss": 0.2998, + "epoch": 1.0172270876130753, + "grad_norm": 0.1533392369747162, + "learning_rate": 4.6161492979924834e-05, + "loss": 0.4091, "step": 28225 }, { - "epoch": 0.99, - "learning_rate": 4.6364950042332326e-05, - "loss": 0.2819, + "epoch": 1.0174072872742999, + "grad_norm": 0.17162777483463287, + "learning_rate": 4.615993906744528e-05, + "loss": 0.4054, "step": 28230 }, { - "epoch": 0.99, - "learning_rate": 4.636347059072479e-05, - "loss": 0.312, + "epoch": 1.0175874869355246, + "grad_norm": 0.2027086317539215, + "learning_rate": 4.615838486666564e-05, + "loss": 0.3837, "step": 28235 }, { - "epoch": 0.99, - "learning_rate": 4.636199086172667e-05, - "loss": 0.2945, + "epoch": 1.017767686596749, + "grad_norm": 0.1900579333305359, + "learning_rate": 4.6156830377607105e-05, + "loss": 0.4096, "step": 28240 }, { - "epoch": 0.99, - "learning_rate": 4.636051085535718e-05, - "loss": 0.2942, + 
"epoch": 1.0179478862579738, + "grad_norm": 0.15131549537181854, + "learning_rate": 4.615527560029086e-05, + "loss": 0.4474, "step": 28245 }, { - "epoch": 0.99, - "learning_rate": 4.635903057163553e-05, - "loss": 0.3138, + "epoch": 1.0181280859191986, + "grad_norm": 0.1787458062171936, + "learning_rate": 4.615372053473808e-05, + "loss": 0.3827, "step": 28250 }, { - "epoch": 0.99, - "learning_rate": 4.6357550010580954e-05, - "loss": 0.2827, + "epoch": 1.018308285580423, + "grad_norm": 0.18287619948387146, + "learning_rate": 4.6152165180969944e-05, + "loss": 0.4215, "step": 28255 }, { - "epoch": 0.99, - "learning_rate": 4.635606917221266e-05, - "loss": 0.2791, + "epoch": 1.0184884852416478, + "grad_norm": 0.15783603489398956, + "learning_rate": 4.6150609539007664e-05, + "loss": 0.3793, "step": 28260 }, { - "epoch": 0.99, - "learning_rate": 4.6354588056549883e-05, - "loss": 0.3206, + "epoch": 1.0186686849028723, + "grad_norm": 0.2039651870727539, + "learning_rate": 4.614905360887241e-05, + "loss": 0.4043, "step": 28265 }, { - "epoch": 0.99, - "learning_rate": 4.6353106663611843e-05, - "loss": 0.3046, + "epoch": 1.018848884564097, + "grad_norm": 0.18400070071220398, + "learning_rate": 4.61474973905854e-05, + "loss": 0.4042, "step": 28270 }, { - "epoch": 0.99, - "learning_rate": 4.6351624993417795e-05, - "loss": 0.3276, + "epoch": 1.0190290842253216, + "grad_norm": 0.19877344369888306, + "learning_rate": 4.614594088416784e-05, + "loss": 0.4057, "step": 28275 }, { - "epoch": 0.99, - "learning_rate": 4.635014304598697e-05, - "loss": 0.3149, + "epoch": 1.0192092838865463, + "grad_norm": 0.19915084540843964, + "learning_rate": 4.614438408964092e-05, + "loss": 0.4553, "step": 28280 }, { - "epoch": 1.0, - "learning_rate": 4.6348660821338605e-05, - "loss": 0.2965, + "epoch": 1.019389483547771, + "grad_norm": 0.19474834203720093, + "learning_rate": 4.614282700702587e-05, + "loss": 0.4374, "step": 28285 }, { - "epoch": 1.0, - "learning_rate": 4.6347178319491946e-05, - "loss": 0.2983, + "epoch": 1.0195696832089955, + "grad_norm": 0.1851280927658081, + "learning_rate": 4.614126963634389e-05, + "loss": 0.4065, "step": 28290 }, { - "epoch": 1.0, - "learning_rate": 4.634569554046625e-05, - "loss": 0.3216, + "epoch": 1.0197498828702203, + "grad_norm": 0.1995769590139389, + "learning_rate": 4.6139711977616207e-05, + "loss": 0.4195, "step": 28295 }, { - "epoch": 1.0, - "learning_rate": 4.6344212484280756e-05, - "loss": 0.3091, + "epoch": 1.0199300825314448, + "grad_norm": 0.17774897813796997, + "learning_rate": 4.6138154030864036e-05, + "loss": 0.4494, "step": 28300 }, { - "epoch": 1.0, - "learning_rate": 4.634272915095475e-05, - "loss": 0.3023, + "epoch": 1.0201102821926695, + "grad_norm": 0.1672256588935852, + "learning_rate": 4.613659579610861e-05, + "loss": 0.413, "step": 28305 }, { - "epoch": 1.0, - "learning_rate": 4.634124554050745e-05, - "loss": 0.3142, + "epoch": 1.020290481853894, + "grad_norm": 0.17119145393371582, + "learning_rate": 4.613503727337116e-05, + "loss": 0.3927, "step": 28310 }, { - "epoch": 1.0, - "learning_rate": 4.633976165295815e-05, - "loss": 0.2853, + "epoch": 1.0204706815151188, + "grad_norm": 0.16596178710460663, + "learning_rate": 4.613347846267292e-05, + "loss": 0.4003, "step": 28315 }, { - "epoch": 1.0, - "learning_rate": 4.6338277488326095e-05, - "loss": 0.3037, + "epoch": 1.0206508811763433, + "grad_norm": 0.1888241469860077, + "learning_rate": 4.6131919364035126e-05, + "loss": 0.4401, "step": 28320 }, { - "epoch": 1.0, - "learning_rate": 4.633679304663059e-05, - "loss": 
0.3117, + "epoch": 1.020831080837568, + "grad_norm": 0.2028852254152298, + "learning_rate": 4.613035997747902e-05, + "loss": 0.4529, "step": 28325 }, { - "epoch": 1.0, - "learning_rate": 4.633530832789088e-05, - "loss": 0.2757, + "epoch": 1.0210112804987928, + "grad_norm": 0.18602454662322998, + "learning_rate": 4.612880030302585e-05, + "loss": 0.4408, "step": 28330 }, { - "epoch": 1.0, - "learning_rate": 4.633382333212625e-05, - "loss": 0.3027, + "epoch": 1.0211914801600173, + "grad_norm": 0.2020941823720932, + "learning_rate": 4.6127240340696876e-05, + "loss": 0.4332, "step": 28335 }, { - "epoch": 1.0, - "learning_rate": 4.633233805935598e-05, - "loss": 0.2905, + "epoch": 1.021371679821242, + "grad_norm": 0.17948292195796967, + "learning_rate": 4.6125680090513334e-05, + "loss": 0.4235, "step": 28340 }, { - "epoch": 1.0, - "learning_rate": 4.6330852509599364e-05, - "loss": 0.3119, + "epoch": 1.0215518794824665, + "grad_norm": 0.17429165542125702, + "learning_rate": 4.612411955249649e-05, + "loss": 0.4238, "step": 28345 }, { - "epoch": 1.0, - "learning_rate": 4.632936668287568e-05, - "loss": 0.2912, + "epoch": 1.0217320791436912, + "grad_norm": 0.18674267828464508, + "learning_rate": 4.612255872666762e-05, + "loss": 0.3849, "step": 28350 }, { - "epoch": 1.0, - "learning_rate": 4.632788057920423e-05, - "loss": 0.3334, + "epoch": 1.0219122788049158, + "grad_norm": 0.13337041437625885, + "learning_rate": 4.612099761304796e-05, + "loss": 0.4294, "step": 28355 }, { - "epoch": 1.0, - "learning_rate": 4.632639419860431e-05, - "loss": 0.3038, + "epoch": 1.0220924784661405, + "grad_norm": 0.15359793603420258, + "learning_rate": 4.6119436211658805e-05, + "loss": 0.4145, "step": 28360 }, { - "epoch": 1.0, - "learning_rate": 4.632490754109521e-05, - "loss": 0.2781, + "epoch": 1.0222726781273652, + "grad_norm": 0.17645424604415894, + "learning_rate": 4.611787452252142e-05, + "loss": 0.4163, "step": 28365 }, { - "epoch": 1.0, - "learning_rate": 4.6323420606696245e-05, - "loss": 0.2914, + "epoch": 1.0224528777885897, + "grad_norm": 0.14870162308216095, + "learning_rate": 4.6116312545657083e-05, + "loss": 0.382, "step": 28370 }, { - "epoch": 1.0, - "learning_rate": 4.632193339542671e-05, - "loss": 0.304, + "epoch": 1.0226330774498145, + "grad_norm": 0.18462905287742615, + "learning_rate": 4.611475028108707e-05, + "loss": 0.4238, "step": 28375 }, { - "epoch": 1.0, - "learning_rate": 4.632044590730593e-05, - "loss": 0.2986, + "epoch": 1.022813277111039, + "grad_norm": 0.1805817037820816, + "learning_rate": 4.611318772883268e-05, + "loss": 0.4203, "step": 28380 }, { - "epoch": 1.0, - "learning_rate": 4.631895814235321e-05, - "loss": 0.3096, + "epoch": 1.0229934767722637, + "grad_norm": 0.18119722604751587, + "learning_rate": 4.6111624888915196e-05, + "loss": 0.4058, "step": 28385 }, { - "epoch": 1.0, - "learning_rate": 4.6317470100587856e-05, - "loss": 0.3176, + "epoch": 1.0231736764334882, + "grad_norm": 0.14671674370765686, + "learning_rate": 4.611006176135591e-05, + "loss": 0.4227, "step": 28390 }, { - "epoch": 1.0, - "learning_rate": 4.631598178202921e-05, - "loss": 0.3122, + "epoch": 1.023353876094713, + "grad_norm": 0.13989455997943878, + "learning_rate": 4.610849834617611e-05, + "loss": 0.3985, "step": 28395 }, { - "epoch": 1.0, - "learning_rate": 4.631449318669659e-05, - "loss": 0.2804, + "epoch": 1.0235340757559377, + "grad_norm": 0.17615246772766113, + "learning_rate": 4.610693464339711e-05, + "loss": 0.3773, "step": 28400 }, { - "epoch": 1.0, - "learning_rate": 4.631300431460932e-05, - "loss": 
0.2944, + "epoch": 1.0237142754171622, + "grad_norm": 0.17603693902492523, + "learning_rate": 4.6105370653040216e-05, + "loss": 0.3988, "step": 28405 }, { - "epoch": 1.0, - "learning_rate": 4.6311515165786736e-05, - "loss": 0.3003, + "epoch": 1.023894475078387, + "grad_norm": 0.15467150509357452, + "learning_rate": 4.6103806375126735e-05, + "loss": 0.4082, "step": 28410 }, { - "epoch": 1.0, - "learning_rate": 4.631002574024818e-05, - "loss": 0.2809, + "epoch": 1.0240746747396114, + "grad_norm": 0.19032159447669983, + "learning_rate": 4.6102241809677974e-05, + "loss": 0.4251, "step": 28415 }, { - "epoch": 1.0, - "learning_rate": 4.630853603801296e-05, - "loss": 0.3108, + "epoch": 1.0242548744008362, + "grad_norm": 0.19951966404914856, + "learning_rate": 4.610067695671525e-05, + "loss": 0.4239, "step": 28420 }, { - "epoch": 1.0, - "learning_rate": 4.6307046059100465e-05, - "loss": 0.2931, + "epoch": 1.0244350740620607, + "grad_norm": 0.1830962896347046, + "learning_rate": 4.609911181625989e-05, + "loss": 0.4016, "step": 28425 }, { - "epoch": 1.0, - "learning_rate": 4.630555580353001e-05, - "loss": 0.2833, + "epoch": 1.0246152737232854, + "grad_norm": 0.18551193177700043, + "learning_rate": 4.609754638833322e-05, + "loss": 0.3889, "step": 28430 }, { - "epoch": 1.0, - "learning_rate": 4.630406527132095e-05, - "loss": 0.2914, + "epoch": 1.02479547338451, + "grad_norm": 0.18302911520004272, + "learning_rate": 4.609598067295656e-05, + "loss": 0.4453, "step": 28435 }, { - "epoch": 1.0, - "learning_rate": 4.6302574462492656e-05, - "loss": 0.2863, + "epoch": 1.0249756730457347, + "grad_norm": 0.15674737095832825, + "learning_rate": 4.6094414670151253e-05, + "loss": 0.4165, "step": 28440 }, { - "epoch": 1.0, - "learning_rate": 4.630108337706446e-05, - "loss": 0.2728, + "epoch": 1.0251558727069594, + "grad_norm": 0.18824201822280884, + "learning_rate": 4.609284837993863e-05, + "loss": 0.4216, "step": 28445 }, { - "epoch": 1.0, - "learning_rate": 4.629959201505574e-05, - "loss": 0.2981, + "epoch": 1.025336072368184, + "grad_norm": 0.13510337471961975, + "learning_rate": 4.609128180234003e-05, + "loss": 0.4233, "step": 28450 }, { - "epoch": 1.0, - "learning_rate": 4.629810037648585e-05, - "loss": 0.2896, + "epoch": 1.0255162720294086, + "grad_norm": 0.17444869875907898, + "learning_rate": 4.60897149373768e-05, + "loss": 0.4555, "step": 28455 }, { - "epoch": 1.0, - "learning_rate": 4.6296608461374166e-05, - "loss": 0.2883, + "epoch": 1.0256964716906332, + "grad_norm": 0.14340052008628845, + "learning_rate": 4.6088147785070284e-05, + "loss": 0.3925, "step": 28460 }, { - "epoch": 1.0, - "learning_rate": 4.6295116269740055e-05, - "loss": 0.2993, + "epoch": 1.025876671351858, + "grad_norm": 0.15294180810451508, + "learning_rate": 4.608658034544184e-05, + "loss": 0.4104, "step": 28465 }, { - "epoch": 1.0, - "learning_rate": 4.6293623801602894e-05, - "loss": 0.2843, + "epoch": 1.0260568710130824, + "grad_norm": 0.21026860177516937, + "learning_rate": 4.608501261851282e-05, + "loss": 0.4463, "step": 28470 }, { - "epoch": 1.0, - "learning_rate": 4.6292131056982056e-05, - "loss": 0.2819, + "epoch": 1.0262370706743071, + "grad_norm": 0.18391993641853333, + "learning_rate": 4.6083444604304584e-05, + "loss": 0.4285, "step": 28475 }, { - "epoch": 1.0, - "learning_rate": 4.6290638035896935e-05, - "loss": 0.2676, + "epoch": 1.0264172703355319, + "grad_norm": 0.16915670037269592, + "learning_rate": 4.60818763028385e-05, + "loss": 0.3984, "step": 28480 }, { - "epoch": 1.0, - "learning_rate": 4.6289144738366905e-05, - 
"loss": 0.2986, + "epoch": 1.0265974699967564, + "grad_norm": 0.15066103637218475, + "learning_rate": 4.608030771413593e-05, + "loss": 0.4118, "step": 28485 }, { - "epoch": 1.0, - "learning_rate": 4.628765116441136e-05, - "loss": 0.2851, + "epoch": 1.0267776696579811, + "grad_norm": 0.1730443239212036, + "learning_rate": 4.607873883821825e-05, + "loss": 0.4169, "step": 28490 }, { - "epoch": 1.0, - "learning_rate": 4.62861573140497e-05, - "loss": 0.3139, + "epoch": 1.0269578693192056, + "grad_norm": 0.2241186797618866, + "learning_rate": 4.6077169675106836e-05, + "loss": 0.4354, "step": 28495 }, { - "epoch": 1.0, - "learning_rate": 4.628466318730131e-05, - "loss": 0.2856, + "epoch": 1.0271380689804304, + "grad_norm": 0.20758524537086487, + "learning_rate": 4.6075600224823066e-05, + "loss": 0.4531, "step": 28500 }, { - "epoch": 1.0, - "eval_loss": 0.2926534414291382, - "eval_runtime": 10.5975, - "eval_samples_per_second": 9.436, - "eval_steps_per_second": 9.436, + "epoch": 1.0271380689804304, + "eval_loss": 0.45461568236351013, + "eval_runtime": 3.545, + "eval_samples_per_second": 28.209, + "eval_steps_per_second": 7.052, "step": 28500 }, { - "epoch": 1.0, - "learning_rate": 4.62831687841856e-05, - "loss": 0.2937, + "epoch": 1.0273182686416549, + "grad_norm": 0.143527090549469, + "learning_rate": 4.607403048738832e-05, + "loss": 0.3849, "step": 28505 }, { - "epoch": 1.0, - "learning_rate": 4.6281674104721966e-05, - "loss": 0.2959, + "epoch": 1.0274984683028796, + "grad_norm": 0.17763735353946686, + "learning_rate": 4.607246046282399e-05, + "loss": 0.4174, "step": 28510 }, { - "epoch": 1.0, - "learning_rate": 4.628017914892982e-05, - "loss": 0.2815, + "epoch": 1.0276786679641041, + "grad_norm": 0.2017643004655838, + "learning_rate": 4.607089015115147e-05, + "loss": 0.4237, "step": 28515 }, { - "epoch": 1.0, - "learning_rate": 4.627868391682857e-05, - "loss": 0.2922, + "epoch": 1.0278588676253289, + "grad_norm": 0.16140924394130707, + "learning_rate": 4.6069319552392145e-05, + "loss": 0.4097, "step": 28520 }, { - "epoch": 1.0, - "learning_rate": 4.627718840843763e-05, - "loss": 0.3035, + "epoch": 1.0280390672865536, + "grad_norm": 0.16550087928771973, + "learning_rate": 4.6067748666567425e-05, + "loss": 0.3789, "step": 28525 }, { - "epoch": 1.0, - "learning_rate": 4.6275692623776434e-05, - "loss": 0.2818, + "epoch": 1.028219266947778, + "grad_norm": 0.1847558319568634, + "learning_rate": 4.60661774936987e-05, + "loss": 0.4227, "step": 28530 }, { - "epoch": 1.0, - "learning_rate": 4.627419656286438e-05, - "loss": 0.3022, + "epoch": 1.0283994666090028, + "grad_norm": 0.18655303120613098, + "learning_rate": 4.606460603380739e-05, + "loss": 0.4278, "step": 28535 }, { - "epoch": 1.0, - "learning_rate": 4.6272700225720906e-05, - "loss": 0.2987, + "epoch": 1.0285796662702273, + "grad_norm": 0.17594356834888458, + "learning_rate": 4.606303428691491e-05, + "loss": 0.4417, "step": 28540 }, { - "epoch": 1.0, - "learning_rate": 4.627120361236544e-05, - "loss": 0.2933, + "epoch": 1.028759865931452, + "grad_norm": 0.18802598118782043, + "learning_rate": 4.606146225304265e-05, + "loss": 0.4576, "step": 28545 }, { - "epoch": 1.0, - "learning_rate": 4.626970672281741e-05, - "loss": 0.2709, + "epoch": 1.0289400655926766, + "grad_norm": 0.190033420920372, + "learning_rate": 4.605988993221206e-05, + "loss": 0.3783, "step": 28550 }, { - "epoch": 1.0, - "learning_rate": 4.6268209557096256e-05, - "loss": 0.2913, + "epoch": 1.0291202652539013, + "grad_norm": 0.16919276118278503, + "learning_rate": 
4.605831732444453e-05, + "loss": 0.4453, "step": 28555 }, { - "epoch": 1.0, - "learning_rate": 4.626671211522142e-05, - "loss": 0.2927, + "epoch": 1.029300464915126, + "grad_norm": 0.17263488471508026, + "learning_rate": 4.605674442976152e-05, + "loss": 0.3981, "step": 28560 }, { - "epoch": 1.0, - "learning_rate": 4.6265214397212344e-05, - "loss": 0.27, + "epoch": 1.0294806645763506, + "grad_norm": 0.18015895783901215, + "learning_rate": 4.6055171248184434e-05, + "loss": 0.3846, "step": 28565 }, { - "epoch": 1.01, - "learning_rate": 4.6263716403088464e-05, - "loss": 0.3177, + "epoch": 1.0296608642375753, + "grad_norm": 0.18624891340732574, + "learning_rate": 4.605359777973472e-05, + "loss": 0.4022, "step": 28570 }, { - "epoch": 1.01, - "learning_rate": 4.626221813286925e-05, - "loss": 0.3077, + "epoch": 1.0298410638987998, + "grad_norm": 0.18329830467700958, + "learning_rate": 4.6052024024433815e-05, + "loss": 0.4198, "step": 28575 }, { - "epoch": 1.01, - "learning_rate": 4.626071958657414e-05, - "loss": 0.2766, + "epoch": 1.0300212635600245, + "grad_norm": 0.17667533457279205, + "learning_rate": 4.605044998230315e-05, + "loss": 0.4046, "step": 28580 }, { - "epoch": 1.01, - "learning_rate": 4.62592207642226e-05, - "loss": 0.3328, + "epoch": 1.030201463221249, + "grad_norm": 0.1662374883890152, + "learning_rate": 4.604887565336419e-05, + "loss": 0.4322, "step": 28585 }, { - "epoch": 1.01, - "learning_rate": 4.625772166583409e-05, - "loss": 0.2749, + "epoch": 1.0303816628824738, + "grad_norm": 0.20289361476898193, + "learning_rate": 4.604730103763837e-05, + "loss": 0.4173, "step": 28590 }, { - "epoch": 1.01, - "learning_rate": 4.625622229142806e-05, - "loss": 0.266, + "epoch": 1.0305618625436985, + "grad_norm": 0.17283061146736145, + "learning_rate": 4.604572613514714e-05, + "loss": 0.4174, "step": 28595 }, { - "epoch": 1.01, - "learning_rate": 4.625472264102401e-05, - "loss": 0.2969, + "epoch": 1.030742062204923, + "grad_norm": 0.21077826619148254, + "learning_rate": 4.6044150945911974e-05, + "loss": 0.4365, "step": 28600 }, { - "epoch": 1.01, - "learning_rate": 4.6253222714641386e-05, - "loss": 0.2833, + "epoch": 1.0309222618661478, + "grad_norm": 0.19256427884101868, + "learning_rate": 4.604257546995433e-05, + "loss": 0.4491, "step": 28605 }, { - "epoch": 1.01, - "learning_rate": 4.625172251229968e-05, - "loss": 0.2772, + "epoch": 1.0311024615273723, + "grad_norm": 0.15783824026584625, + "learning_rate": 4.6040999707295665e-05, + "loss": 0.4072, "step": 28610 }, { - "epoch": 1.01, - "learning_rate": 4.6250222034018354e-05, - "loss": 0.3067, + "epoch": 1.031282661188597, + "grad_norm": 0.15280930697917938, + "learning_rate": 4.603942365795745e-05, + "loss": 0.4217, "step": 28615 }, { - "epoch": 1.01, - "learning_rate": 4.62487212798169e-05, - "loss": 0.2986, + "epoch": 1.0314628608498215, + "grad_norm": 0.16231854259967804, + "learning_rate": 4.603784732196116e-05, + "loss": 0.3995, "step": 28620 }, { - "epoch": 1.01, - "learning_rate": 4.62472202497148e-05, - "loss": 0.3008, + "epoch": 1.0316430605110463, + "grad_norm": 0.1602470427751541, + "learning_rate": 4.603627069932827e-05, + "loss": 0.4217, "step": 28625 }, { - "epoch": 1.01, - "learning_rate": 4.6245718943731556e-05, - "loss": 0.2753, + "epoch": 1.0318232601722708, + "grad_norm": 0.2087296098470688, + "learning_rate": 4.603469379008028e-05, + "loss": 0.4469, "step": 28630 }, { - "epoch": 1.01, - "learning_rate": 4.624421736188664e-05, - "loss": 0.2953, + "epoch": 1.0320034598334955, + "grad_norm": 0.18025125563144684, + 
"learning_rate": 4.603311659423864e-05, + "loss": 0.4198, "step": 28635 }, { - "epoch": 1.01, - "learning_rate": 4.624271550419957e-05, - "loss": 0.2925, + "epoch": 1.0321836594947202, + "grad_norm": 0.19022351503372192, + "learning_rate": 4.603153911182487e-05, + "loss": 0.4439, "step": 28640 }, { - "epoch": 1.01, - "learning_rate": 4.624121337068985e-05, - "loss": 0.302, + "epoch": 1.0323638591559448, + "grad_norm": 0.17020079493522644, + "learning_rate": 4.602996134286045e-05, + "loss": 0.3931, "step": 28645 }, { - "epoch": 1.01, - "learning_rate": 4.623971096137696e-05, - "loss": 0.3007, + "epoch": 1.0325440588171695, + "grad_norm": 0.17506608366966248, + "learning_rate": 4.602838328736688e-05, + "loss": 0.4457, "step": 28650 }, { - "epoch": 1.01, - "learning_rate": 4.6238208276280425e-05, - "loss": 0.2937, + "epoch": 1.032724258478394, + "grad_norm": 0.22321075201034546, + "learning_rate": 4.6026804945365655e-05, + "loss": 0.4347, "step": 28655 }, { - "epoch": 1.01, - "learning_rate": 4.6236705315419746e-05, - "loss": 0.2881, + "epoch": 1.0329044581396187, + "grad_norm": 0.16402897238731384, + "learning_rate": 4.602522631687829e-05, + "loss": 0.4257, "step": 28660 }, { - "epoch": 1.01, - "learning_rate": 4.623520207881445e-05, - "loss": 0.2761, + "epoch": 1.0330846578008432, + "grad_norm": 0.18853814899921417, + "learning_rate": 4.602364740192628e-05, + "loss": 0.4312, "step": 28665 }, { - "epoch": 1.01, - "learning_rate": 4.623369856648405e-05, - "loss": 0.2717, + "epoch": 1.033264857462068, + "grad_norm": 0.1805652529001236, + "learning_rate": 4.602206820053114e-05, + "loss": 0.4294, "step": 28670 }, { - "epoch": 1.01, - "learning_rate": 4.6232194778448064e-05, - "loss": 0.2863, + "epoch": 1.0334450571232927, + "grad_norm": 0.15662455558776855, + "learning_rate": 4.6020488712714404e-05, + "loss": 0.4338, "step": 28675 }, { - "epoch": 1.01, - "learning_rate": 4.623069071472602e-05, - "loss": 0.2899, + "epoch": 1.0336252567845172, + "grad_norm": 0.157413512468338, + "learning_rate": 4.6018908938497573e-05, + "loss": 0.4187, "step": 28680 }, { - "epoch": 1.01, - "learning_rate": 4.622918637533745e-05, - "loss": 0.2708, + "epoch": 1.033805456445742, + "grad_norm": 0.1799081712961197, + "learning_rate": 4.6017328877902176e-05, + "loss": 0.4412, "step": 28685 }, { - "epoch": 1.01, - "learning_rate": 4.622768176030189e-05, - "loss": 0.3391, + "epoch": 1.0339856561069665, + "grad_norm": 0.14759212732315063, + "learning_rate": 4.601574853094974e-05, + "loss": 0.407, "step": 28690 }, { - "epoch": 1.01, - "learning_rate": 4.622617686963887e-05, - "loss": 0.3078, + "epoch": 1.0341658557681912, + "grad_norm": 0.1873723268508911, + "learning_rate": 4.60141678976618e-05, + "loss": 0.4277, "step": 28695 }, { - "epoch": 1.01, - "learning_rate": 4.6224671703367925e-05, - "loss": 0.2943, + "epoch": 1.0343460554294157, + "grad_norm": 0.16125832498073578, + "learning_rate": 4.6012586978059893e-05, + "loss": 0.4602, "step": 28700 }, { - "epoch": 1.01, - "learning_rate": 4.622316626150861e-05, - "loss": 0.3048, + "epoch": 1.0345262550906404, + "grad_norm": 0.1734544336795807, + "learning_rate": 4.601100577216556e-05, + "loss": 0.4233, "step": 28705 }, { - "epoch": 1.01, - "learning_rate": 4.622166054408047e-05, - "loss": 0.2815, + "epoch": 1.034706454751865, + "grad_norm": 0.1755981296300888, + "learning_rate": 4.600942428000033e-05, + "loss": 0.4266, "step": 28710 }, { - "epoch": 1.01, - "learning_rate": 4.622015455110306e-05, - "loss": 0.2984, + "epoch": 1.0348866544130897, + "grad_norm": 
0.16786262392997742, + "learning_rate": 4.600784250158577e-05, + "loss": 0.4204, "step": 28715 }, { - "epoch": 1.01, - "learning_rate": 4.621864828259592e-05, - "loss": 0.2911, + "epoch": 1.0350668540743144, + "grad_norm": 0.14706341922283173, + "learning_rate": 4.600626043694343e-05, + "loss": 0.4241, "step": 28720 }, { - "epoch": 1.01, - "learning_rate": 4.6217141738578605e-05, - "loss": 0.3086, + "epoch": 1.035247053735539, + "grad_norm": 0.1602163165807724, + "learning_rate": 4.600467808609485e-05, + "loss": 0.4105, "step": 28725 }, { - "epoch": 1.01, - "learning_rate": 4.6215634919070706e-05, - "loss": 0.2838, + "epoch": 1.0354272533967637, + "grad_norm": 0.1791827380657196, + "learning_rate": 4.60030954490616e-05, + "loss": 0.4406, "step": 28730 }, { - "epoch": 1.01, - "learning_rate": 4.621412782409176e-05, - "loss": 0.2952, + "epoch": 1.0356074530579882, + "grad_norm": 0.17273133993148804, + "learning_rate": 4.600151252586524e-05, + "loss": 0.4023, "step": 28735 }, { - "epoch": 1.01, - "learning_rate": 4.621262045366134e-05, - "loss": 0.2666, + "epoch": 1.035787652719213, + "grad_norm": 0.16618365049362183, + "learning_rate": 4.5999929316527335e-05, + "loss": 0.4238, "step": 28740 }, { - "epoch": 1.01, - "learning_rate": 4.621111280779903e-05, - "loss": 0.2786, + "epoch": 1.0359678523804374, + "grad_norm": 0.147443950176239, + "learning_rate": 4.599834582106946e-05, + "loss": 0.4491, "step": 28745 }, { - "epoch": 1.01, - "learning_rate": 4.62096048865244e-05, - "loss": 0.284, + "epoch": 1.0361480520416622, + "grad_norm": 0.1795797199010849, + "learning_rate": 4.599676203951319e-05, + "loss": 0.4178, "step": 28750 }, { - "epoch": 1.01, - "learning_rate": 4.620809668985703e-05, - "loss": 0.2934, + "epoch": 1.036328251702887, + "grad_norm": 0.15541589260101318, + "learning_rate": 4.59951779718801e-05, + "loss": 0.4204, "step": 28755 }, { - "epoch": 1.01, - "learning_rate": 4.6206588217816494e-05, - "loss": 0.3039, + "epoch": 1.0365084513641114, + "grad_norm": 0.17285758256912231, + "learning_rate": 4.599359361819178e-05, + "loss": 0.3788, "step": 28760 }, { - "epoch": 1.01, - "learning_rate": 4.6205079470422385e-05, - "loss": 0.2786, + "epoch": 1.0366886510253361, + "grad_norm": 0.13503800332546234, + "learning_rate": 4.5992008978469806e-05, + "loss": 0.3833, "step": 28765 }, { - "epoch": 1.01, - "learning_rate": 4.62035704476943e-05, - "loss": 0.2848, + "epoch": 1.0368688506865607, + "grad_norm": 0.19433645904064178, + "learning_rate": 4.599042405273578e-05, + "loss": 0.4648, "step": 28770 }, { - "epoch": 1.01, - "learning_rate": 4.620206114965182e-05, - "loss": 0.2921, + "epoch": 1.0370490503477854, + "grad_norm": 0.18631264567375183, + "learning_rate": 4.598883884101128e-05, + "loss": 0.4023, "step": 28775 }, { - "epoch": 1.01, - "learning_rate": 4.6200551576314555e-05, - "loss": 0.2966, + "epoch": 1.03722925000901, + "grad_norm": 0.1467059701681137, + "learning_rate": 4.598725334331793e-05, + "loss": 0.4018, "step": 28780 }, { - "epoch": 1.01, - "learning_rate": 4.61990417277021e-05, - "loss": 0.2852, + "epoch": 1.0374094496702346, + "grad_norm": 0.160639688372612, + "learning_rate": 4.5985667559677303e-05, + "loss": 0.3925, "step": 28785 }, { - "epoch": 1.01, - "learning_rate": 4.619753160383405e-05, - "loss": 0.2852, + "epoch": 1.0375896493314594, + "grad_norm": 0.1615612506866455, + "learning_rate": 4.598408149011102e-05, + "loss": 0.4166, "step": 28790 }, { - "epoch": 1.01, - "learning_rate": 4.619602120473003e-05, - "loss": 0.2634, + "epoch": 1.0377698489926839, + 
"grad_norm": 0.24002741277217865, + "learning_rate": 4.598249513464069e-05, + "loss": 0.3697, "step": 28795 }, { - "epoch": 1.01, - "learning_rate": 4.619451053040964e-05, - "loss": 0.2672, + "epoch": 1.0379500486539086, + "grad_norm": 0.18659855425357819, + "learning_rate": 4.5980908493287933e-05, + "loss": 0.4156, "step": 28800 }, { - "epoch": 1.01, - "learning_rate": 4.6192999580892505e-05, - "loss": 0.2912, + "epoch": 1.0381302483151331, + "grad_norm": 0.18351595103740692, + "learning_rate": 4.5979321566074354e-05, + "loss": 0.4355, "step": 28805 }, { - "epoch": 1.01, - "learning_rate": 4.619148835619823e-05, - "loss": 0.2948, + "epoch": 1.0383104479763579, + "grad_norm": 0.20558714866638184, + "learning_rate": 4.5977734353021586e-05, + "loss": 0.4162, "step": 28810 }, { - "epoch": 1.01, - "learning_rate": 4.6189976856346445e-05, - "loss": 0.3218, + "epoch": 1.0384906476375824, + "grad_norm": 0.15841884911060333, + "learning_rate": 4.5976146854151244e-05, + "loss": 0.3912, "step": 28815 }, { - "epoch": 1.01, - "learning_rate": 4.618846508135678e-05, - "loss": 0.2818, + "epoch": 1.038670847298807, + "grad_norm": 0.17524218559265137, + "learning_rate": 4.597455906948496e-05, + "loss": 0.419, "step": 28820 }, { - "epoch": 1.01, - "learning_rate": 4.6186953031248856e-05, - "loss": 0.2722, + "epoch": 1.0388510469600316, + "grad_norm": 0.1817321926355362, + "learning_rate": 4.597297099904437e-05, + "loss": 0.4137, "step": 28825 }, { - "epoch": 1.01, - "learning_rate": 4.6185440706042303e-05, - "loss": 0.2779, + "epoch": 1.0390312466212563, + "grad_norm": 0.18538501858711243, + "learning_rate": 4.597138264285111e-05, + "loss": 0.3848, "step": 28830 }, { - "epoch": 1.01, - "learning_rate": 4.618392810575677e-05, - "loss": 0.3148, + "epoch": 1.039211446282481, + "grad_norm": 0.16245108842849731, + "learning_rate": 4.596979400092683e-05, + "loss": 0.4461, "step": 28835 }, { - "epoch": 1.01, - "learning_rate": 4.6182415230411896e-05, - "loss": 0.2818, + "epoch": 1.0393916459437056, + "grad_norm": 0.2564604878425598, + "learning_rate": 4.5968205073293156e-05, + "loss": 0.4233, "step": 28840 }, { - "epoch": 1.01, - "learning_rate": 4.618090208002731e-05, - "loss": 0.3025, + "epoch": 1.0395718456049303, + "grad_norm": 0.1610010415315628, + "learning_rate": 4.596661585997176e-05, + "loss": 0.4268, "step": 28845 }, { - "epoch": 1.02, - "learning_rate": 4.617938865462268e-05, - "loss": 0.279, + "epoch": 1.0397520452661548, + "grad_norm": 0.19828878343105316, + "learning_rate": 4.596502636098427e-05, + "loss": 0.43, "step": 28850 }, { - "epoch": 1.02, - "learning_rate": 4.617787495421764e-05, - "loss": 0.2818, + "epoch": 1.0399322449273796, + "grad_norm": 0.17699094116687775, + "learning_rate": 4.596343657635236e-05, + "loss": 0.4678, "step": 28855 }, { - "epoch": 1.02, - "learning_rate": 4.6176360978831854e-05, - "loss": 0.2788, + "epoch": 1.040112444588604, + "grad_norm": 0.1853751391172409, + "learning_rate": 4.596184650609768e-05, + "loss": 0.4363, "step": 28860 }, { - "epoch": 1.02, - "learning_rate": 4.617484672848498e-05, - "loss": 0.2834, + "epoch": 1.0402926442498288, + "grad_norm": 0.17727990448474884, + "learning_rate": 4.596025615024191e-05, + "loss": 0.4541, "step": 28865 }, { - "epoch": 1.02, - "learning_rate": 4.617333220319666e-05, - "loss": 0.3027, + "epoch": 1.0404728439110535, + "grad_norm": 0.16852405667304993, + "learning_rate": 4.5958665508806696e-05, + "loss": 0.3916, "step": 28870 }, { - "epoch": 1.02, - "learning_rate": 4.617181740298659e-05, - "loss": 0.2951, + "epoch": 
1.040653043572278, + "grad_norm": 0.1764410436153412, + "learning_rate": 4.595707458181373e-05, + "loss": 0.4453, "step": 28875 }, { - "epoch": 1.02, - "learning_rate": 4.617030232787441e-05, - "loss": 0.3074, + "epoch": 1.0408332432335028, + "grad_norm": 0.19933126866817474, + "learning_rate": 4.595548336928468e-05, + "loss": 0.4561, "step": 28880 }, { - "epoch": 1.02, - "learning_rate": 4.616878697787982e-05, - "loss": 0.2972, + "epoch": 1.0410134428947273, + "grad_norm": 0.2222566306591034, + "learning_rate": 4.5953891871241226e-05, + "loss": 0.418, "step": 28885 }, { - "epoch": 1.02, - "learning_rate": 4.616727135302248e-05, - "loss": 0.3025, + "epoch": 1.041193642555952, + "grad_norm": 0.16706469655036926, + "learning_rate": 4.5952300087705055e-05, + "loss": 0.453, "step": 28890 }, { - "epoch": 1.02, - "learning_rate": 4.616575545332207e-05, - "loss": 0.2992, + "epoch": 1.0413738422171765, + "grad_norm": 0.15461257100105286, + "learning_rate": 4.595070801869784e-05, + "loss": 0.4037, "step": 28895 }, { - "epoch": 1.02, - "learning_rate": 4.616423927879827e-05, - "loss": 0.2954, + "epoch": 1.0415540418784013, + "grad_norm": 0.16241170465946198, + "learning_rate": 4.594911566424129e-05, + "loss": 0.4157, "step": 28900 }, { - "epoch": 1.02, - "learning_rate": 4.6162722829470774e-05, - "loss": 0.2932, + "epoch": 1.041734241539626, + "grad_norm": 0.18162140250205994, + "learning_rate": 4.5947523024357106e-05, + "loss": 0.3942, "step": 28905 }, { - "epoch": 1.02, - "learning_rate": 4.616120610535927e-05, - "loss": 0.3215, + "epoch": 1.0419144412008505, + "grad_norm": 0.21681730449199677, + "learning_rate": 4.5945930099066966e-05, + "loss": 0.4247, "step": 28910 }, { - "epoch": 1.02, - "learning_rate": 4.615968910648345e-05, - "loss": 0.3091, + "epoch": 1.0420946408620753, + "grad_norm": 0.16923941671848297, + "learning_rate": 4.594433688839258e-05, + "loss": 0.4093, "step": 28915 }, { - "epoch": 1.02, - "learning_rate": 4.615817183286301e-05, - "loss": 0.3195, + "epoch": 1.0422748405232998, + "grad_norm": 0.17253339290618896, + "learning_rate": 4.594274339235567e-05, + "loss": 0.4216, "step": 28920 }, { - "epoch": 1.02, - "learning_rate": 4.615665428451765e-05, - "loss": 0.2676, + "epoch": 1.0424550401845245, + "grad_norm": 0.17402751743793488, + "learning_rate": 4.594114961097793e-05, + "loss": 0.421, "step": 28925 }, { - "epoch": 1.02, - "learning_rate": 4.615513646146707e-05, - "loss": 0.2835, + "epoch": 1.042635239845749, + "grad_norm": 0.17757736146450043, + "learning_rate": 4.593955554428108e-05, + "loss": 0.416, "step": 28930 }, { - "epoch": 1.02, - "learning_rate": 4.6153618363731e-05, - "loss": 0.2917, + "epoch": 1.0428154395069738, + "grad_norm": 0.19702796638011932, + "learning_rate": 4.593796119228684e-05, + "loss": 0.4186, "step": 28935 }, { - "epoch": 1.02, - "learning_rate": 4.6152099991329135e-05, - "loss": 0.3195, + "epoch": 1.0429956391681983, + "grad_norm": 0.17790283262729645, + "learning_rate": 4.593636655501693e-05, + "loss": 0.4218, "step": 28940 }, { - "epoch": 1.02, - "learning_rate": 4.615058134428118e-05, - "loss": 0.2848, + "epoch": 1.043175838829423, + "grad_norm": 0.18331415951251984, + "learning_rate": 4.593477163249308e-05, + "loss": 0.4208, "step": 28945 }, { - "epoch": 1.02, - "learning_rate": 4.614906242260687e-05, - "loss": 0.3069, + "epoch": 1.0433560384906477, + "grad_norm": 0.19453075528144836, + "learning_rate": 4.5933176424737025e-05, + "loss": 0.4193, "step": 28950 }, { - "epoch": 1.02, - "learning_rate": 4.614754322632593e-05, - "loss": 0.2777, + 
"epoch": 1.0435362381518722, + "grad_norm": 0.2180250734090805, + "learning_rate": 4.5931580931770494e-05, + "loss": 0.406, "step": 28955 }, { - "epoch": 1.02, - "learning_rate": 4.6146023755458065e-05, - "loss": 0.3258, + "epoch": 1.043716437813097, + "grad_norm": 0.16775591671466827, + "learning_rate": 4.592998515361522e-05, + "loss": 0.4246, "step": 28960 }, { - "epoch": 1.02, - "learning_rate": 4.614450401002303e-05, - "loss": 0.2998, + "epoch": 1.0438966374743215, + "grad_norm": 0.1662595570087433, + "learning_rate": 4.5928389090292955e-05, + "loss": 0.4263, "step": 28965 }, { - "epoch": 1.02, - "learning_rate": 4.6143288015999886e-05, - "loss": 0.2975, + "epoch": 1.0440768371355462, + "grad_norm": 0.17564332485198975, + "learning_rate": 4.592679274182544e-05, + "loss": 0.4001, "step": 28970 }, { - "epoch": 1.02, - "learning_rate": 4.614176777639365e-05, - "loss": 0.2925, + "epoch": 1.0442570367967707, + "grad_norm": 0.18523907661437988, + "learning_rate": 4.592519610823442e-05, + "loss": 0.4231, "step": 28975 }, { - "epoch": 1.02, - "learning_rate": 4.614024726227548e-05, - "loss": 0.3256, + "epoch": 1.0444372364579955, + "grad_norm": 0.17822639644145966, + "learning_rate": 4.592359918954165e-05, + "loss": 0.4239, "step": 28980 }, { - "epoch": 1.02, - "learning_rate": 4.6138726473665144e-05, - "loss": 0.296, + "epoch": 1.0446174361192202, + "grad_norm": 0.19693171977996826, + "learning_rate": 4.59220019857689e-05, + "loss": 0.4082, "step": 28985 }, { - "epoch": 1.02, - "learning_rate": 4.613720541058237e-05, - "loss": 0.2818, + "epoch": 1.0447976357804447, + "grad_norm": 0.16676361858844757, + "learning_rate": 4.592040449693793e-05, + "loss": 0.4109, "step": 28990 }, { - "epoch": 1.02, - "learning_rate": 4.613568407304692e-05, - "loss": 0.2902, + "epoch": 1.0449778354416694, + "grad_norm": 0.16993066668510437, + "learning_rate": 4.59188067230705e-05, + "loss": 0.4093, "step": 28995 }, { - "epoch": 1.02, - "learning_rate": 4.6134162461078535e-05, - "loss": 0.3066, + "epoch": 1.045158035102894, + "grad_norm": 0.20470210909843445, + "learning_rate": 4.591720866418836e-05, + "loss": 0.4041, "step": 29000 }, { - "epoch": 1.02, - "eval_loss": 0.2924649715423584, - "eval_runtime": 10.5475, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 1.045158035102894, + "eval_loss": 0.4535163640975952, + "eval_runtime": 3.5354, + "eval_samples_per_second": 28.285, + "eval_steps_per_second": 7.071, "step": 29000 }, { - "epoch": 1.02, - "learning_rate": 4.613264057469697e-05, - "loss": 0.3196, + "epoch": 1.0453382347641187, + "grad_norm": 0.1358274519443512, + "learning_rate": 4.591561032031332e-05, + "loss": 0.366, "step": 29005 }, { - "epoch": 1.02, - "learning_rate": 4.6131118413922015e-05, - "loss": 0.2835, + "epoch": 1.0455184344253432, + "grad_norm": 0.1652381271123886, + "learning_rate": 4.591401169146713e-05, + "loss": 0.4371, "step": 29010 }, { - "epoch": 1.02, - "learning_rate": 4.61295959787734e-05, - "loss": 0.2965, + "epoch": 1.045698634086568, + "grad_norm": 0.2014123499393463, + "learning_rate": 4.591241277767158e-05, + "loss": 0.4396, "step": 29015 }, { - "epoch": 1.02, - "learning_rate": 4.61280732692709e-05, - "loss": 0.2943, + "epoch": 1.0458788337477924, + "grad_norm": 0.23837004601955414, + "learning_rate": 4.591081357894845e-05, + "loss": 0.4038, "step": 29020 }, { - "epoch": 1.02, - "learning_rate": 4.61265502854343e-05, - "loss": 0.2936, + "epoch": 1.0460590334090172, + "grad_norm": 0.167801171541214, + "learning_rate": 4.590921409531954e-05, + 
"loss": 0.4502, "step": 29025 }, { - "epoch": 1.02, - "learning_rate": 4.6125027027283375e-05, - "loss": 0.2984, + "epoch": 1.046239233070242, + "grad_norm": 0.14784109592437744, + "learning_rate": 4.5907614326806635e-05, + "loss": 0.4006, "step": 29030 }, { - "epoch": 1.02, - "learning_rate": 4.612350349483788e-05, - "loss": 0.2713, + "epoch": 1.0464194327314664, + "grad_norm": 0.17408788204193115, + "learning_rate": 4.5906014273431535e-05, + "loss": 0.3958, "step": 29035 }, { - "epoch": 1.02, - "learning_rate": 4.6121979688117626e-05, - "loss": 0.293, + "epoch": 1.0465996323926912, + "grad_norm": 0.1565713882446289, + "learning_rate": 4.590441393521604e-05, + "loss": 0.4062, "step": 29040 }, { - "epoch": 1.02, - "learning_rate": 4.6120455607142385e-05, - "loss": 0.2994, + "epoch": 1.0467798320539157, + "grad_norm": 0.20593391358852386, + "learning_rate": 4.590281331218195e-05, + "loss": 0.4573, "step": 29045 }, { - "epoch": 1.02, - "learning_rate": 4.611893125193194e-05, - "loss": 0.29, + "epoch": 1.0469600317151404, + "grad_norm": 0.1602008193731308, + "learning_rate": 4.5901212404351065e-05, + "loss": 0.4379, "step": 29050 }, { - "epoch": 1.02, - "learning_rate": 4.61174066225061e-05, - "loss": 0.2924, + "epoch": 1.047140231376365, + "grad_norm": 0.19146746397018433, + "learning_rate": 4.589961121174522e-05, + "loss": 0.3882, "step": 29055 }, { - "epoch": 1.02, - "learning_rate": 4.611588171888464e-05, - "loss": 0.3052, + "epoch": 1.0473204310375896, + "grad_norm": 0.18531233072280884, + "learning_rate": 4.5898009734386216e-05, + "loss": 0.4533, "step": 29060 }, { - "epoch": 1.02, - "learning_rate": 4.611435654108738e-05, - "loss": 0.2863, + "epoch": 1.0475006306988144, + "grad_norm": 0.15970437228679657, + "learning_rate": 4.5896407972295875e-05, + "loss": 0.3773, "step": 29065 }, { - "epoch": 1.02, - "learning_rate": 4.611283108913411e-05, - "loss": 0.2687, + "epoch": 1.047680830360039, + "grad_norm": 0.17048677802085876, + "learning_rate": 4.589480592549602e-05, + "loss": 0.4305, "step": 29070 }, { - "epoch": 1.02, - "learning_rate": 4.6111305363044644e-05, - "loss": 0.2993, + "epoch": 1.0478610300212636, + "grad_norm": 0.1856071949005127, + "learning_rate": 4.5893203594008476e-05, + "loss": 0.4541, "step": 29075 }, { - "epoch": 1.02, - "learning_rate": 4.6109779362838794e-05, - "loss": 0.293, + "epoch": 1.0480412296824881, + "grad_norm": 0.19591088593006134, + "learning_rate": 4.5891600977855085e-05, + "loss": 0.3656, "step": 29080 }, { - "epoch": 1.02, - "learning_rate": 4.610825308853637e-05, - "loss": 0.2927, + "epoch": 1.0482214293437129, + "grad_norm": 0.15485340356826782, + "learning_rate": 4.5889998077057674e-05, + "loss": 0.4005, "step": 29085 }, { - "epoch": 1.02, - "learning_rate": 4.610672654015719e-05, - "loss": 0.3053, + "epoch": 1.0484016290049374, + "grad_norm": 0.17994803190231323, + "learning_rate": 4.588839489163808e-05, + "loss": 0.406, "step": 29090 }, { - "epoch": 1.02, - "learning_rate": 4.610519971772107e-05, - "loss": 0.3047, + "epoch": 1.0485818286661621, + "grad_norm": 0.1988825649023056, + "learning_rate": 4.5886791421618155e-05, + "loss": 0.402, "step": 29095 }, { - "epoch": 1.02, - "learning_rate": 4.610367262124785e-05, - "loss": 0.3033, + "epoch": 1.0487620283273866, + "grad_norm": 0.18159984052181244, + "learning_rate": 4.5885187667019733e-05, + "loss": 0.4012, "step": 29100 }, { - "epoch": 1.02, - "learning_rate": 4.6102145250757345e-05, - "loss": 0.3131, + "epoch": 1.0489422279886114, + "grad_norm": 0.17193853855133057, + "learning_rate": 
4.588358362786468e-05, + "loss": 0.4378, "step": 29105 }, { - "epoch": 1.02, - "learning_rate": 4.6100617606269385e-05, - "loss": 0.3028, + "epoch": 1.049122427649836, + "grad_norm": 0.1589495688676834, + "learning_rate": 4.588197930417484e-05, + "loss": 0.4416, "step": 29110 }, { - "epoch": 1.02, - "learning_rate": 4.609908968780382e-05, - "loss": 0.2893, + "epoch": 1.0493026273110606, + "grad_norm": 0.1768224835395813, + "learning_rate": 4.588037469597207e-05, + "loss": 0.4335, "step": 29115 }, { - "epoch": 1.02, - "learning_rate": 4.6097561495380474e-05, - "loss": 0.275, + "epoch": 1.0494828269722853, + "grad_norm": 0.207745760679245, + "learning_rate": 4.587876980327824e-05, + "loss": 0.4003, "step": 29120 }, { - "epoch": 1.02, - "learning_rate": 4.60960330290192e-05, - "loss": 0.2819, + "epoch": 1.0496630266335099, + "grad_norm": 0.20602521300315857, + "learning_rate": 4.587716462611522e-05, + "loss": 0.453, "step": 29125 }, { - "epoch": 1.02, - "learning_rate": 4.609450428873984e-05, - "loss": 0.2878, + "epoch": 1.0498432262947346, + "grad_norm": 0.1503795087337494, + "learning_rate": 4.587555916450487e-05, + "loss": 0.3976, "step": 29130 }, { - "epoch": 1.03, - "learning_rate": 4.609297527456224e-05, - "loss": 0.3103, + "epoch": 1.050023425955959, + "grad_norm": 0.21797312796115875, + "learning_rate": 4.587395341846907e-05, + "loss": 0.4621, "step": 29135 }, { - "epoch": 1.03, - "learning_rate": 4.609144598650625e-05, - "loss": 0.2843, + "epoch": 1.0502036256171838, + "grad_norm": 0.17475320398807526, + "learning_rate": 4.587234738802969e-05, + "loss": 0.4362, "step": 29140 }, { - "epoch": 1.03, - "learning_rate": 4.608991642459175e-05, - "loss": 0.3022, + "epoch": 1.0503838252784086, + "grad_norm": 0.15241824090480804, + "learning_rate": 4.5870741073208624e-05, + "loss": 0.4161, "step": 29145 }, { - "epoch": 1.03, - "learning_rate": 4.6088386588838574e-05, - "loss": 0.3251, + "epoch": 1.050564024939633, + "grad_norm": 0.17844238877296448, + "learning_rate": 4.5869134474027756e-05, + "loss": 0.4955, "step": 29150 }, { - "epoch": 1.03, - "learning_rate": 4.60868564792666e-05, - "loss": 0.3163, + "epoch": 1.0507442246008578, + "grad_norm": 0.1921539604663849, + "learning_rate": 4.586752759050896e-05, + "loss": 0.4587, "step": 29155 }, { - "epoch": 1.03, - "learning_rate": 4.6085326095895685e-05, - "loss": 0.2952, + "epoch": 1.0509244242620823, + "grad_norm": 0.20518042147159576, + "learning_rate": 4.586592042267415e-05, + "loss": 0.4486, "step": 29160 }, { - "epoch": 1.03, - "learning_rate": 4.608379543874571e-05, - "loss": 0.3182, + "epoch": 1.051104623923307, + "grad_norm": 0.15805669128894806, + "learning_rate": 4.586431297054521e-05, + "loss": 0.4031, "step": 29165 }, { - "epoch": 1.03, - "learning_rate": 4.608226450783655e-05, - "loss": 0.2704, + "epoch": 1.0512848235845316, + "grad_norm": 0.20354914665222168, + "learning_rate": 4.586270523414404e-05, + "loss": 0.4089, "step": 29170 }, { - "epoch": 1.03, - "learning_rate": 4.6080733303188075e-05, - "loss": 0.3, + "epoch": 1.0514650232457563, + "grad_norm": 0.18959026038646698, + "learning_rate": 4.586109721349255e-05, + "loss": 0.4437, "step": 29175 }, { - "epoch": 1.03, - "learning_rate": 4.607920182482017e-05, - "loss": 0.2958, + "epoch": 1.051645222906981, + "grad_norm": 0.2117196023464203, + "learning_rate": 4.585948890861266e-05, + "loss": 0.4373, "step": 29180 }, { - "epoch": 1.03, - "learning_rate": 4.607767007275273e-05, - "loss": 0.2918, + "epoch": 1.0518254225682055, + "grad_norm": 0.18057450652122498, + 
"learning_rate": 4.585788031952627e-05, + "loss": 0.4321, "step": 29185 }, { - "epoch": 1.03, - "learning_rate": 4.607613804700562e-05, - "loss": 0.307, + "epoch": 1.0520056222294303, + "grad_norm": 0.1702229529619217, + "learning_rate": 4.5856271446255294e-05, + "loss": 0.3866, "step": 29190 }, { - "epoch": 1.03, - "learning_rate": 4.607460574759876e-05, - "loss": 0.3087, + "epoch": 1.0521858218906548, + "grad_norm": 0.17912250757217407, + "learning_rate": 4.5854662288821656e-05, + "loss": 0.3849, "step": 29195 }, { - "epoch": 1.03, - "learning_rate": 4.607307317455203e-05, - "loss": 0.2707, + "epoch": 1.0523660215518795, + "grad_norm": 0.19952471554279327, + "learning_rate": 4.5853052847247286e-05, + "loss": 0.4252, "step": 29200 }, { - "epoch": 1.03, - "learning_rate": 4.607154032788533e-05, - "loss": 0.3145, + "epoch": 1.052546221213104, + "grad_norm": 0.17590820789337158, + "learning_rate": 4.5851443121554104e-05, + "loss": 0.4243, "step": 29205 }, { - "epoch": 1.03, - "learning_rate": 4.607000720761856e-05, - "loss": 0.29, + "epoch": 1.0527264208743288, + "grad_norm": 0.18067772686481476, + "learning_rate": 4.584983311176405e-05, + "loss": 0.4142, "step": 29210 }, { - "epoch": 1.03, - "learning_rate": 4.606847381377164e-05, - "loss": 0.2782, + "epoch": 1.0529066205355533, + "grad_norm": 0.17660415172576904, + "learning_rate": 4.584822281789906e-05, + "loss": 0.4109, "step": 29215 }, { - "epoch": 1.03, - "learning_rate": 4.606694014636447e-05, - "loss": 0.3053, + "epoch": 1.053086820196778, + "grad_norm": 0.20561589300632477, + "learning_rate": 4.5846612239981065e-05, + "loss": 0.4339, "step": 29220 }, { - "epoch": 1.03, - "learning_rate": 4.6065406205416966e-05, - "loss": 0.2973, + "epoch": 1.0532670198580028, + "grad_norm": 0.14123573899269104, + "learning_rate": 4.584500137803201e-05, + "loss": 0.4289, "step": 29225 }, { - "epoch": 1.03, - "learning_rate": 4.606387199094905e-05, - "loss": 0.2895, + "epoch": 1.0534472195192273, + "grad_norm": 0.17180036008358002, + "learning_rate": 4.5843390232073856e-05, + "loss": 0.4204, "step": 29230 }, { - "epoch": 1.03, - "learning_rate": 4.606233750298063e-05, - "loss": 0.2861, + "epoch": 1.053627419180452, + "grad_norm": 0.18303899466991425, + "learning_rate": 4.5841778802128544e-05, + "loss": 0.425, "step": 29235 }, { - "epoch": 1.03, - "learning_rate": 4.6060802741531636e-05, - "loss": 0.2772, + "epoch": 1.0538076188416765, + "grad_norm": 0.15086086094379425, + "learning_rate": 4.5840167088218026e-05, + "loss": 0.3899, "step": 29240 }, { - "epoch": 1.03, - "learning_rate": 4.605926770662201e-05, - "loss": 0.3141, + "epoch": 1.0539878185029012, + "grad_norm": 0.15532901883125305, + "learning_rate": 4.583855509036427e-05, + "loss": 0.4025, "step": 29245 }, { - "epoch": 1.03, - "learning_rate": 4.605773239827166e-05, - "loss": 0.2708, + "epoch": 1.0541680181641258, + "grad_norm": 0.15308701992034912, + "learning_rate": 4.5836942808589235e-05, + "loss": 0.4168, "step": 29250 }, { - "epoch": 1.03, - "learning_rate": 4.605619681650054e-05, - "loss": 0.2624, + "epoch": 1.0543482178253505, + "grad_norm": 0.1958785504102707, + "learning_rate": 4.583533024291489e-05, + "loss": 0.4647, "step": 29255 }, { - "epoch": 1.03, - "learning_rate": 4.605466096132858e-05, - "loss": 0.3042, + "epoch": 1.0545284174865752, + "grad_norm": 0.15900172293186188, + "learning_rate": 4.583371739336319e-05, + "loss": 0.4076, "step": 29260 }, { - "epoch": 1.03, - "learning_rate": 4.605312483277571e-05, - "loss": 0.3024, + "epoch": 1.0547086171477997, + "grad_norm": 
0.19627580046653748, + "learning_rate": 4.583210425995614e-05, + "loss": 0.4619, "step": 29265 }, { - "epoch": 1.03, - "learning_rate": 4.60515884308619e-05, - "loss": 0.3008, + "epoch": 1.0548888168090245, + "grad_norm": 0.17516663670539856, + "learning_rate": 4.58304908427157e-05, + "loss": 0.4433, "step": 29270 }, { - "epoch": 1.03, - "learning_rate": 4.605005175560709e-05, - "loss": 0.3072, + "epoch": 1.055069016470249, + "grad_norm": 0.1998327523469925, + "learning_rate": 4.582887714166386e-05, + "loss": 0.4309, "step": 29275 }, { - "epoch": 1.03, - "learning_rate": 4.604851480703123e-05, - "loss": 0.2906, + "epoch": 1.0552492161314737, + "grad_norm": 0.15834058821201324, + "learning_rate": 4.582726315682259e-05, + "loss": 0.4016, "step": 29280 }, { - "epoch": 1.03, - "learning_rate": 4.604697758515427e-05, - "loss": 0.2937, + "epoch": 1.0554294157926982, + "grad_norm": 0.16241154074668884, + "learning_rate": 4.58256488882139e-05, + "loss": 0.4014, "step": 29285 }, { - "epoch": 1.03, - "learning_rate": 4.604544008999618e-05, - "loss": 0.2759, + "epoch": 1.055609615453923, + "grad_norm": 0.18402625620365143, + "learning_rate": 4.582403433585978e-05, + "loss": 0.4168, "step": 29290 }, { - "epoch": 1.03, - "learning_rate": 4.6043902321576926e-05, - "loss": 0.3239, + "epoch": 1.0557898151151477, + "grad_norm": 0.17826689779758453, + "learning_rate": 4.582241949978221e-05, + "loss": 0.4416, "step": 29295 }, { - "epoch": 1.03, - "learning_rate": 4.6042364279916464e-05, - "loss": 0.2735, + "epoch": 1.0559700147763722, + "grad_norm": 0.18548059463500977, + "learning_rate": 4.582080438000321e-05, + "loss": 0.422, "step": 29300 }, { - "epoch": 1.03, - "learning_rate": 4.6040825965034764e-05, - "loss": 0.3082, + "epoch": 1.056150214437597, + "grad_norm": 0.1519528478384018, + "learning_rate": 4.581918897654479e-05, + "loss": 0.4051, "step": 29305 }, { - "epoch": 1.03, - "learning_rate": 4.603928737695181e-05, - "loss": 0.2771, + "epoch": 1.0563304140988214, + "grad_norm": 0.14724068343639374, + "learning_rate": 4.581757328942894e-05, + "loss": 0.3999, "step": 29310 }, { - "epoch": 1.03, - "learning_rate": 4.6037748515687576e-05, - "loss": 0.3013, + "epoch": 1.0565106137600462, + "grad_norm": 0.18171782791614532, + "learning_rate": 4.5815957318677693e-05, + "loss": 0.4232, "step": 29315 }, { - "epoch": 1.03, - "learning_rate": 4.603620938126204e-05, - "loss": 0.2855, + "epoch": 1.0566908134212707, + "grad_norm": 0.20490474998950958, + "learning_rate": 4.5814341064313055e-05, + "loss": 0.4333, "step": 29320 }, { - "epoch": 1.03, - "learning_rate": 4.603466997369519e-05, - "loss": 0.3087, + "epoch": 1.0568710130824954, + "grad_norm": 0.18439970910549164, + "learning_rate": 4.581272452635705e-05, + "loss": 0.3899, "step": 29325 }, { - "epoch": 1.03, - "learning_rate": 4.603313029300701e-05, - "loss": 0.2584, + "epoch": 1.05705121274372, + "grad_norm": 0.14571373164653778, + "learning_rate": 4.58111077048317e-05, + "loss": 0.4089, "step": 29330 }, { - "epoch": 1.03, - "learning_rate": 4.603159033921749e-05, - "loss": 0.294, + "epoch": 1.0572314124049447, + "grad_norm": 0.15669065713882446, + "learning_rate": 4.5809490599759034e-05, + "loss": 0.3999, "step": 29335 }, { - "epoch": 1.03, - "learning_rate": 4.6030050112346643e-05, - "loss": 0.3152, + "epoch": 1.0574116120661694, + "grad_norm": 0.16960056126117706, + "learning_rate": 4.58078732111611e-05, + "loss": 0.4173, "step": 29340 }, { - "epoch": 1.03, - "learning_rate": 4.602850961241445e-05, - "loss": 0.2669, + "epoch": 1.057591811727394, + 
"grad_norm": 0.17213228344917297, + "learning_rate": 4.5806255539059905e-05, + "loss": 0.4012, "step": 29345 }, { - "epoch": 1.03, - "learning_rate": 4.602696883944091e-05, - "loss": 0.2989, + "epoch": 1.0577720113886186, + "grad_norm": 0.15806017816066742, + "learning_rate": 4.5804637583477514e-05, + "loss": 0.4421, "step": 29350 }, { - "epoch": 1.03, - "learning_rate": 4.602542779344605e-05, - "loss": 0.2704, + "epoch": 1.0579522110498432, + "grad_norm": 0.19483284652233124, + "learning_rate": 4.5803019344435974e-05, + "loss": 0.4188, "step": 29355 }, { - "epoch": 1.03, - "learning_rate": 4.602388647444985e-05, - "loss": 0.2651, + "epoch": 1.058132410711068, + "grad_norm": 0.18508678674697876, + "learning_rate": 4.580140082195731e-05, + "loss": 0.4134, "step": 29360 }, { - "epoch": 1.03, - "learning_rate": 4.6022344882472354e-05, - "loss": 0.2857, + "epoch": 1.0583126103722924, + "grad_norm": 0.1570422649383545, + "learning_rate": 4.579978201606359e-05, + "loss": 0.4138, "step": 29365 }, { - "epoch": 1.03, - "learning_rate": 4.6020803017533555e-05, - "loss": 0.319, + "epoch": 1.0584928100335171, + "grad_norm": 0.14425136148929596, + "learning_rate": 4.5798162926776865e-05, + "loss": 0.4375, "step": 29370 }, { - "epoch": 1.03, - "learning_rate": 4.6019260879653486e-05, - "loss": 0.2883, + "epoch": 1.0586730096947419, + "grad_norm": 0.17451147735118866, + "learning_rate": 4.57965435541192e-05, + "loss": 0.4292, "step": 29375 }, { - "epoch": 1.03, - "learning_rate": 4.6017718468852165e-05, - "loss": 0.2926, + "epoch": 1.0588532093559664, + "grad_norm": 0.16206900775432587, + "learning_rate": 4.579492389811266e-05, + "loss": 0.4548, "step": 29380 }, { - "epoch": 1.03, - "learning_rate": 4.601617578514962e-05, - "loss": 0.2898, + "epoch": 1.0590334090171911, + "grad_norm": 0.1823441982269287, + "learning_rate": 4.57933039587793e-05, + "loss": 0.409, "step": 29385 }, { - "epoch": 1.03, - "learning_rate": 4.6014632828565887e-05, - "loss": 0.323, + "epoch": 1.0592136086784156, + "grad_norm": 0.18232353031635284, + "learning_rate": 4.579168373614121e-05, + "loss": 0.3947, "step": 29390 }, { - "epoch": 1.03, - "learning_rate": 4.6013089599121e-05, - "loss": 0.2977, + "epoch": 1.0593938083396404, + "grad_norm": 0.16864867508411407, + "learning_rate": 4.5790063230220444e-05, + "loss": 0.4212, "step": 29395 }, { - "epoch": 1.03, - "learning_rate": 4.601154609683498e-05, - "loss": 0.3198, + "epoch": 1.0595740080008649, + "grad_norm": 0.19608835875988007, + "learning_rate": 4.578844244103909e-05, + "loss": 0.42, "step": 29400 }, { - "epoch": 1.03, - "learning_rate": 4.601000232172789e-05, - "loss": 0.2892, + "epoch": 1.0597542076620896, + "grad_norm": 0.1760438084602356, + "learning_rate": 4.5786821368619236e-05, + "loss": 0.4085, "step": 29405 }, { - "epoch": 1.03, - "learning_rate": 4.600845827381976e-05, - "loss": 0.2732, + "epoch": 1.0599344073233143, + "grad_norm": 0.17386280000209808, + "learning_rate": 4.578520001298297e-05, + "loss": 0.4454, "step": 29410 }, { - "epoch": 1.03, - "learning_rate": 4.600691395313065e-05, - "loss": 0.3062, + "epoch": 1.0601146069845389, + "grad_norm": 0.14320212602615356, + "learning_rate": 4.5783578374152376e-05, + "loss": 0.389, "step": 29415 }, { - "epoch": 1.04, - "learning_rate": 4.600536935968061e-05, - "loss": 0.2863, + "epoch": 1.0602948066457636, + "grad_norm": 0.20935241878032684, + "learning_rate": 4.578195645214955e-05, + "loss": 0.4252, "step": 29420 }, { - "epoch": 1.04, - "learning_rate": 4.6003824493489676e-05, - "loss": 0.2974, + "epoch": 
1.060475006306988, + "grad_norm": 0.17992691695690155, + "learning_rate": 4.578033424699659e-05, + "loss": 0.4362, "step": 29425 }, { - "epoch": 1.04, - "learning_rate": 4.600227935457794e-05, - "loss": 0.2896, + "epoch": 1.0606552059682128, + "grad_norm": 0.18552446365356445, + "learning_rate": 4.57787117587156e-05, + "loss": 0.4269, "step": 29430 }, { - "epoch": 1.04, - "learning_rate": 4.600073394296544e-05, - "loss": 0.3128, + "epoch": 1.0608354056294373, + "grad_norm": 0.18173004686832428, + "learning_rate": 4.577708898732868e-05, + "loss": 0.4358, "step": 29435 }, { - "epoch": 1.04, - "learning_rate": 4.5999188258672246e-05, - "loss": 0.3325, + "epoch": 1.061015605290662, + "grad_norm": 0.19340266287326813, + "learning_rate": 4.577546593285795e-05, + "loss": 0.4268, "step": 29440 }, { - "epoch": 1.04, - "learning_rate": 4.599764230171844e-05, - "loss": 0.296, + "epoch": 1.0611958049518866, + "grad_norm": 0.17074435949325562, + "learning_rate": 4.577384259532552e-05, + "loss": 0.4342, "step": 29445 }, { - "epoch": 1.04, - "learning_rate": 4.599609607212408e-05, - "loss": 0.2799, + "epoch": 1.0613760046131113, + "grad_norm": 0.17281962931156158, + "learning_rate": 4.57722189747535e-05, + "loss": 0.3953, "step": 29450 }, { - "epoch": 1.04, - "learning_rate": 4.599454956990925e-05, - "loss": 0.3323, + "epoch": 1.061556204274336, + "grad_norm": 0.16818857192993164, + "learning_rate": 4.577059507116403e-05, + "loss": 0.3934, "step": 29455 }, { - "epoch": 1.04, - "learning_rate": 4.599300279509403e-05, - "loss": 0.3046, + "epoch": 1.0617364039355606, + "grad_norm": 0.15899808704853058, + "learning_rate": 4.576897088457921e-05, + "loss": 0.4223, "step": 29460 }, { - "epoch": 1.04, - "learning_rate": 4.5991455747698505e-05, - "loss": 0.2976, + "epoch": 1.0619166035967853, + "grad_norm": 0.17966797947883606, + "learning_rate": 4.576734641502119e-05, + "loss": 0.4265, "step": 29465 }, { - "epoch": 1.04, - "learning_rate": 4.598990842774276e-05, - "loss": 0.2796, + "epoch": 1.0620968032580098, + "grad_norm": 0.198250874876976, + "learning_rate": 4.57657216625121e-05, + "loss": 0.4491, "step": 29470 }, { - "epoch": 1.04, - "learning_rate": 4.598836083524689e-05, - "loss": 0.2941, + "epoch": 1.0622770029192345, + "grad_norm": 0.15183819830417633, + "learning_rate": 4.576409662707406e-05, + "loss": 0.4185, "step": 29475 }, { - "epoch": 1.04, - "learning_rate": 4.598681297023099e-05, - "loss": 0.2621, + "epoch": 1.062457202580459, + "grad_norm": 0.1668313890695572, + "learning_rate": 4.576247130872924e-05, + "loss": 0.3849, "step": 29480 }, { - "epoch": 1.04, - "learning_rate": 4.598526483271514e-05, - "loss": 0.313, + "epoch": 1.0626374022416838, + "grad_norm": 0.15694022178649902, + "learning_rate": 4.5760845707499753e-05, + "loss": 0.4084, "step": 29485 }, { - "epoch": 1.04, - "learning_rate": 4.598371642271947e-05, - "loss": 0.3199, + "epoch": 1.0628176019029085, + "grad_norm": 0.17181478440761566, + "learning_rate": 4.575921982340777e-05, + "loss": 0.4529, "step": 29490 }, { - "epoch": 1.04, - "learning_rate": 4.598216774026406e-05, - "loss": 0.3125, + "epoch": 1.062997801564133, + "grad_norm": 0.18744981288909912, + "learning_rate": 4.575759365647543e-05, + "loss": 0.4149, "step": 29495 }, { - "epoch": 1.04, - "learning_rate": 4.598061878536904e-05, - "loss": 0.3023, + "epoch": 1.0631780012253578, + "grad_norm": 0.1948675960302353, + "learning_rate": 4.5755967206724906e-05, + "loss": 0.4541, "step": 29500 }, { - "epoch": 1.04, - "eval_loss": 0.2920066714286804, - "eval_runtime": 10.538, - 
"eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 1.0631780012253578, + "eval_loss": 0.45399028062820435, + "eval_runtime": 3.5401, + "eval_samples_per_second": 28.248, + "eval_steps_per_second": 7.062, "step": 29500 }, { - "epoch": 1.04, - "learning_rate": 4.597906955805451e-05, - "loss": 0.2885, + "epoch": 1.0633582008865823, + "grad_norm": 0.2056550681591034, + "learning_rate": 4.5754340474178334e-05, + "loss": 0.4456, "step": 29505 }, { - "epoch": 1.04, - "learning_rate": 4.5977520058340586e-05, - "loss": 0.2782, + "epoch": 1.063538400547807, + "grad_norm": 0.14791448414325714, + "learning_rate": 4.57527134588579e-05, + "loss": 0.429, "step": 29510 }, { - "epoch": 1.04, - "learning_rate": 4.5975970286247384e-05, - "loss": 0.2928, + "epoch": 1.0637186002090315, + "grad_norm": 0.16541562974452972, + "learning_rate": 4.5751086160785764e-05, + "loss": 0.4213, "step": 29515 }, { - "epoch": 1.04, - "learning_rate": 4.5974420241795034e-05, - "loss": 0.2862, + "epoch": 1.0638987998702563, + "grad_norm": 0.18588918447494507, + "learning_rate": 4.57494585799841e-05, + "loss": 0.4371, "step": 29520 }, { - "epoch": 1.04, - "learning_rate": 4.597286992500366e-05, - "loss": 0.2886, + "epoch": 1.064078999531481, + "grad_norm": 0.20339952409267426, + "learning_rate": 4.574783071647507e-05, + "loss": 0.4157, "step": 29525 }, { - "epoch": 1.04, - "learning_rate": 4.597131933589339e-05, - "loss": 0.2839, + "epoch": 1.0642591991927055, + "grad_norm": 0.1556108593940735, + "learning_rate": 4.574620257028087e-05, + "loss": 0.4185, "step": 29530 }, { - "epoch": 1.04, - "learning_rate": 4.5969768474484364e-05, - "loss": 0.2895, + "epoch": 1.0644393988539302, + "grad_norm": 0.2084941267967224, + "learning_rate": 4.574457414142367e-05, + "loss": 0.4225, "step": 29535 }, { - "epoch": 1.04, - "learning_rate": 4.596821734079672e-05, - "loss": 0.2762, + "epoch": 1.0646195985151548, + "grad_norm": 0.1711416095495224, + "learning_rate": 4.574294542992566e-05, + "loss": 0.4142, "step": 29540 }, { - "epoch": 1.04, - "learning_rate": 4.596666593485058e-05, - "loss": 0.3067, + "epoch": 1.0647997981763795, + "grad_norm": 0.17444539070129395, + "learning_rate": 4.574131643580905e-05, + "loss": 0.4169, "step": 29545 }, { - "epoch": 1.04, - "learning_rate": 4.5965114256666104e-05, - "loss": 0.2999, + "epoch": 1.064979997837604, + "grad_norm": 0.2214493304491043, + "learning_rate": 4.573968715909601e-05, + "loss": 0.4479, "step": 29550 }, { - "epoch": 1.04, - "learning_rate": 4.5963562306263435e-05, - "loss": 0.2703, + "epoch": 1.0651601974988287, + "grad_norm": 0.16091366112232208, + "learning_rate": 4.573805759980875e-05, + "loss": 0.4392, "step": 29555 }, { - "epoch": 1.04, - "learning_rate": 4.5962010083662724e-05, - "loss": 0.2873, + "epoch": 1.0653403971600532, + "grad_norm": 0.1976114958524704, + "learning_rate": 4.573642775796947e-05, + "loss": 0.4436, "step": 29560 }, { - "epoch": 1.04, - "learning_rate": 4.596045758888413e-05, - "loss": 0.3129, + "epoch": 1.065520596821278, + "grad_norm": 0.14705707132816315, + "learning_rate": 4.573479763360038e-05, + "loss": 0.4187, "step": 29565 }, { - "epoch": 1.04, - "learning_rate": 4.5958904821947805e-05, - "loss": 0.303, + "epoch": 1.0657007964825027, + "grad_norm": 0.20177073776721954, + "learning_rate": 4.573316722672368e-05, + "loss": 0.4562, "step": 29570 }, { - "epoch": 1.04, - "learning_rate": 4.5957351782873916e-05, - "loss": 0.2996, + "epoch": 1.0658809961437272, + "grad_norm": 0.16343575716018677, + "learning_rate": 
4.573153653736159e-05, + "loss": 0.4348, "step": 29575 }, { - "epoch": 1.04, - "learning_rate": 4.595579847168263e-05, - "loss": 0.28, + "epoch": 1.066061195804952, + "grad_norm": 0.19736631214618683, + "learning_rate": 4.572990556553634e-05, + "loss": 0.4196, "step": 29580 }, { - "epoch": 1.04, - "learning_rate": 4.59542448883941e-05, - "loss": 0.2625, + "epoch": 1.0662413954661765, + "grad_norm": 0.1972283273935318, + "learning_rate": 4.572827431127012e-05, + "loss": 0.4352, "step": 29585 }, { - "epoch": 1.04, - "learning_rate": 4.595269103302852e-05, - "loss": 0.2995, + "epoch": 1.0664215951274012, + "grad_norm": 0.17068789899349213, + "learning_rate": 4.5726642774585195e-05, + "loss": 0.4277, "step": 29590 }, { - "epoch": 1.04, - "learning_rate": 4.595113690560605e-05, - "loss": 0.2965, + "epoch": 1.0666017947886257, + "grad_norm": 0.22462329268455505, + "learning_rate": 4.5725010955503764e-05, + "loss": 0.4148, "step": 29595 }, { - "epoch": 1.04, - "learning_rate": 4.594958250614687e-05, - "loss": 0.2867, + "epoch": 1.0667819944498504, + "grad_norm": 0.14778359234333038, + "learning_rate": 4.572337885404807e-05, + "loss": 0.4104, "step": 29600 }, { - "epoch": 1.04, - "learning_rate": 4.594802783467117e-05, - "loss": 0.2938, + "epoch": 1.066962194111075, + "grad_norm": 0.1986793428659439, + "learning_rate": 4.5721746470240355e-05, + "loss": 0.4351, "step": 29605 }, { - "epoch": 1.04, - "learning_rate": 4.594647289119914e-05, - "loss": 0.2893, + "epoch": 1.0671423937722997, + "grad_norm": 0.20412606000900269, + "learning_rate": 4.572011380410286e-05, + "loss": 0.3812, "step": 29610 }, { - "epoch": 1.04, - "learning_rate": 4.594491767575096e-05, - "loss": 0.2957, + "epoch": 1.0673225934335244, + "grad_norm": 0.17064638435840607, + "learning_rate": 4.571848085565783e-05, + "loss": 0.4169, "step": 29615 }, { - "epoch": 1.04, - "learning_rate": 4.594336218834683e-05, - "loss": 0.26, + "epoch": 1.067502793094749, + "grad_norm": 0.20599807798862457, + "learning_rate": 4.5716847624927496e-05, + "loss": 0.4217, "step": 29620 }, { - "epoch": 1.04, - "learning_rate": 4.594180642900694e-05, - "loss": 0.2826, + "epoch": 1.0676829927559737, + "grad_norm": 0.1645248532295227, + "learning_rate": 4.5715214111934134e-05, + "loss": 0.3981, "step": 29625 }, { - "epoch": 1.04, - "learning_rate": 4.5940250397751494e-05, - "loss": 0.2641, + "epoch": 1.0678631924171982, + "grad_norm": 0.16410000622272491, + "learning_rate": 4.571358031669999e-05, + "loss": 0.4021, "step": 29630 }, { - "epoch": 1.04, - "learning_rate": 4.5938694094600697e-05, - "loss": 0.3104, + "epoch": 1.068043392078423, + "grad_norm": 0.19438453018665314, + "learning_rate": 4.571194623924731e-05, + "loss": 0.4029, "step": 29635 }, { - "epoch": 1.04, - "learning_rate": 4.593713751957476e-05, - "loss": 0.2913, + "epoch": 1.0682235917396474, + "grad_norm": 0.18021252751350403, + "learning_rate": 4.571031187959839e-05, + "loss": 0.3966, "step": 29640 }, { - "epoch": 1.04, - "learning_rate": 4.593558067269389e-05, - "loss": 0.3053, + "epoch": 1.0684037914008722, + "grad_norm": 0.17498481273651123, + "learning_rate": 4.570867723777548e-05, + "loss": 0.3946, "step": 29645 }, { - "epoch": 1.04, - "learning_rate": 4.59340235539783e-05, - "loss": 0.301, + "epoch": 1.068583991062097, + "grad_norm": 0.16093195974826813, + "learning_rate": 4.570704231380084e-05, + "loss": 0.4181, "step": 29650 }, { - "epoch": 1.04, - "learning_rate": 4.593246616344821e-05, - "loss": 0.2842, + "epoch": 1.0687641907233214, + "grad_norm": 0.20035532116889954, + 
"learning_rate": 4.570540710769676e-05, + "loss": 0.4173, "step": 29655 }, { - "epoch": 1.04, - "learning_rate": 4.593090850112384e-05, - "loss": 0.3104, + "epoch": 1.0689443903845461, + "grad_norm": 0.16321523487567902, + "learning_rate": 4.5703771619485523e-05, + "loss": 0.4154, "step": 29660 }, { - "epoch": 1.04, - "learning_rate": 4.592935056702542e-05, - "loss": 0.2777, + "epoch": 1.0691245900457707, + "grad_norm": 0.17585666477680206, + "learning_rate": 4.570213584918941e-05, + "loss": 0.4138, "step": 29665 }, { - "epoch": 1.04, - "learning_rate": 4.592779236117318e-05, - "loss": 0.3036, + "epoch": 1.0693047897069954, + "grad_norm": 0.14717882871627808, + "learning_rate": 4.570049979683069e-05, + "loss": 0.4144, "step": 29670 }, { - "epoch": 1.04, - "learning_rate": 4.592623388358734e-05, - "loss": 0.3212, + "epoch": 1.06948498936822, + "grad_norm": 0.16287022829055786, + "learning_rate": 4.569886346243167e-05, + "loss": 0.4082, "step": 29675 }, { - "epoch": 1.04, - "learning_rate": 4.592467513428815e-05, - "loss": 0.2755, + "epoch": 1.0696651890294446, + "grad_norm": 0.1774132400751114, + "learning_rate": 4.569722684601465e-05, + "loss": 0.443, "step": 29680 }, { - "epoch": 1.04, - "learning_rate": 4.592311611329584e-05, - "loss": 0.2793, + "epoch": 1.0698453886906694, + "grad_norm": 0.17134885489940643, + "learning_rate": 4.569558994760192e-05, + "loss": 0.423, "step": 29685 }, { - "epoch": 1.04, - "learning_rate": 4.5921556820630656e-05, - "loss": 0.2717, + "epoch": 1.0700255883518939, + "grad_norm": 0.20845064520835876, + "learning_rate": 4.569395276721579e-05, + "loss": 0.4343, "step": 29690 }, { - "epoch": 1.04, - "learning_rate": 4.591999725631285e-05, - "loss": 0.2906, + "epoch": 1.0702057880131186, + "grad_norm": 0.19249121844768524, + "learning_rate": 4.569231530487855e-05, + "loss": 0.4168, "step": 29695 }, { - "epoch": 1.04, - "learning_rate": 4.591843742036266e-05, - "loss": 0.3059, + "epoch": 1.0703859876743431, + "grad_norm": 0.21180908381938934, + "learning_rate": 4.569067756061253e-05, + "loss": 0.4245, "step": 29700 }, { - "epoch": 1.05, - "learning_rate": 4.591687731280034e-05, - "loss": 0.3108, + "epoch": 1.0705661873355679, + "grad_norm": 0.199473574757576, + "learning_rate": 4.568903953444003e-05, + "loss": 0.4199, "step": 29705 }, { - "epoch": 1.05, - "learning_rate": 4.591531693364617e-05, - "loss": 0.2996, + "epoch": 1.0707463869967924, + "grad_norm": 0.19441378116607666, + "learning_rate": 4.568740122638337e-05, + "loss": 0.4198, "step": 29710 }, { - "epoch": 1.05, - "learning_rate": 4.591375628292038e-05, - "loss": 0.315, + "epoch": 1.070926586658017, + "grad_norm": 0.17675237357616425, + "learning_rate": 4.568576263646487e-05, + "loss": 0.3893, "step": 29715 }, { - "epoch": 1.05, - "learning_rate": 4.591219536064325e-05, - "loss": 0.2997, + "epoch": 1.0711067863192416, + "grad_norm": 0.2046016901731491, + "learning_rate": 4.5684123764706855e-05, + "loss": 0.4285, "step": 29720 }, { - "epoch": 1.05, - "learning_rate": 4.5910634166835045e-05, - "loss": 0.2891, + "epoch": 1.0712869859804663, + "grad_norm": 0.1835736781358719, + "learning_rate": 4.568248461113167e-05, + "loss": 0.4359, "step": 29725 }, { - "epoch": 1.05, - "learning_rate": 4.5909072701516035e-05, - "loss": 0.2988, + "epoch": 1.071467185641691, + "grad_norm": 0.1686883121728897, + "learning_rate": 4.5680845175761635e-05, + "loss": 0.4405, "step": 29730 }, { - "epoch": 1.05, - "learning_rate": 4.59075109647065e-05, - "loss": 0.2868, + "epoch": 1.0716473853029156, + "grad_norm": 
0.15105289220809937, + "learning_rate": 4.567920545861908e-05, + "loss": 0.4067, "step": 29735 }, { - "epoch": 1.05, - "learning_rate": 4.590594895642671e-05, - "loss": 0.2998, + "epoch": 1.0718275849641403, + "grad_norm": 0.21068331599235535, + "learning_rate": 4.5677565459726355e-05, + "loss": 0.4408, "step": 29740 }, { - "epoch": 1.05, - "learning_rate": 4.5904386676696955e-05, - "loss": 0.2672, + "epoch": 1.0720077846253648, + "grad_norm": 0.16096103191375732, + "learning_rate": 4.567592517910582e-05, + "loss": 0.4391, "step": 29745 }, { - "epoch": 1.05, - "learning_rate": 4.5902824125537516e-05, - "loss": 0.3028, + "epoch": 1.0721879842865896, + "grad_norm": 0.18630751967430115, + "learning_rate": 4.567428461677979e-05, + "loss": 0.4219, "step": 29750 }, { - "epoch": 1.05, - "learning_rate": 4.5901261302968676e-05, - "loss": 0.3042, + "epoch": 1.072368183947814, + "grad_norm": 0.18894068896770477, + "learning_rate": 4.567264377277064e-05, + "loss": 0.3977, "step": 29755 }, { - "epoch": 1.05, - "learning_rate": 4.589969820901073e-05, - "loss": 0.2895, + "epoch": 1.0725483836090388, + "grad_norm": 0.2151520699262619, + "learning_rate": 4.5671002647100716e-05, + "loss": 0.4539, "step": 29760 }, { - "epoch": 1.05, - "learning_rate": 4.589813484368398e-05, - "loss": 0.2965, + "epoch": 1.0727285832702635, + "grad_norm": 0.1710183173418045, + "learning_rate": 4.566936123979239e-05, + "loss": 0.4098, "step": 29765 }, { - "epoch": 1.05, - "learning_rate": 4.589657120700872e-05, - "loss": 0.2915, + "epoch": 1.072908782931488, + "grad_norm": 0.17231152951717377, + "learning_rate": 4.566771955086802e-05, + "loss": 0.4076, "step": 29770 }, { - "epoch": 1.05, - "learning_rate": 4.589500729900525e-05, - "loss": 0.2911, + "epoch": 1.0730889825927128, + "grad_norm": 0.18413393199443817, + "learning_rate": 4.566607758034997e-05, + "loss": 0.4398, "step": 29775 }, { - "epoch": 1.05, - "learning_rate": 4.5893443119693895e-05, - "loss": 0.2829, + "epoch": 1.0732691822539373, + "grad_norm": 0.18336208164691925, + "learning_rate": 4.566443532826061e-05, + "loss": 0.4552, "step": 29780 }, { - "epoch": 1.05, - "learning_rate": 4.589187866909495e-05, - "loss": 0.2855, + "epoch": 1.073449381915162, + "grad_norm": 0.18550068140029907, + "learning_rate": 4.566279279462232e-05, + "loss": 0.4303, "step": 29785 }, { - "epoch": 1.05, - "learning_rate": 4.589031394722871e-05, - "loss": 0.2959, + "epoch": 1.0736295815763865, + "grad_norm": 0.1502627283334732, + "learning_rate": 4.5661149979457485e-05, + "loss": 0.4258, "step": 29790 }, { - "epoch": 1.05, - "learning_rate": 4.588874895411552e-05, - "loss": 0.2871, + "epoch": 1.0738097812376113, + "grad_norm": 0.17316554486751556, + "learning_rate": 4.565950688278848e-05, + "loss": 0.414, "step": 29795 }, { - "epoch": 1.05, - "learning_rate": 4.5887183689775694e-05, - "loss": 0.2746, + "epoch": 1.073989980898836, + "grad_norm": 0.19836434721946716, + "learning_rate": 4.5657863504637694e-05, + "loss": 0.3891, "step": 29800 }, { - "epoch": 1.05, - "learning_rate": 4.588561815422955e-05, - "loss": 0.2942, + "epoch": 1.0741701805600605, + "grad_norm": 0.17455318570137024, + "learning_rate": 4.5656219845027516e-05, + "loss": 0.3748, "step": 29805 }, { - "epoch": 1.05, - "learning_rate": 4.588405234749742e-05, - "loss": 0.3044, + "epoch": 1.0743503802212853, + "grad_norm": 0.2048964947462082, + "learning_rate": 4.565457590398034e-05, + "loss": 0.431, "step": 29810 }, { - "epoch": 1.05, - "learning_rate": 4.588248626959963e-05, - "loss": 0.2734, + "epoch": 1.0745305798825098, + 
"grad_norm": 0.1792224943637848, + "learning_rate": 4.5652931681518565e-05, + "loss": 0.4157, "step": 29815 }, { - "epoch": 1.05, - "learning_rate": 4.588091992055653e-05, - "loss": 0.2695, + "epoch": 1.0747107795437345, + "grad_norm": 0.22106477618217468, + "learning_rate": 4.56512871776646e-05, + "loss": 0.4332, "step": 29820 }, { - "epoch": 1.05, - "learning_rate": 4.587935330038843e-05, - "loss": 0.3035, + "epoch": 1.074890979204959, + "grad_norm": 0.19627970457077026, + "learning_rate": 4.564964239244084e-05, + "loss": 0.4105, "step": 29825 }, { - "epoch": 1.05, - "learning_rate": 4.58777864091157e-05, - "loss": 0.3184, + "epoch": 1.0750711788661838, + "grad_norm": 0.1940174698829651, + "learning_rate": 4.564799732586971e-05, + "loss": 0.4369, "step": 29830 }, { - "epoch": 1.05, - "learning_rate": 4.587621924675866e-05, - "loss": 0.3016, + "epoch": 1.0752513785274083, + "grad_norm": 0.19587305188179016, + "learning_rate": 4.56463519779736e-05, + "loss": 0.3994, "step": 29835 }, { - "epoch": 1.05, - "learning_rate": 4.5874651813337674e-05, - "loss": 0.3051, + "epoch": 1.075431578188633, + "grad_norm": 0.14254744350910187, + "learning_rate": 4.564470634877495e-05, + "loss": 0.4357, "step": 29840 }, { - "epoch": 1.05, - "learning_rate": 4.58730841088731e-05, - "loss": 0.2728, + "epoch": 1.0756117778498577, + "grad_norm": 0.2016969919204712, + "learning_rate": 4.564306043829617e-05, + "loss": 0.4505, "step": 29845 }, { - "epoch": 1.05, - "learning_rate": 4.587151613338528e-05, - "loss": 0.2969, + "epoch": 1.0757919775110822, + "grad_norm": 0.17493173480033875, + "learning_rate": 4.564141424655969e-05, + "loss": 0.3733, "step": 29850 }, { - "epoch": 1.05, - "learning_rate": 4.5869947886894577e-05, - "loss": 0.2884, + "epoch": 1.075972177172307, + "grad_norm": 0.16929864883422852, + "learning_rate": 4.563976777358793e-05, + "loss": 0.3971, "step": 29855 }, { - "epoch": 1.05, - "learning_rate": 4.586837936942136e-05, - "loss": 0.2899, + "epoch": 1.0761523768335315, + "grad_norm": 0.18552954494953156, + "learning_rate": 4.563812101940334e-05, + "loss": 0.4258, "step": 29860 }, { - "epoch": 1.05, - "learning_rate": 4.586681058098598e-05, - "loss": 0.2928, + "epoch": 1.0763325764947562, + "grad_norm": 0.1586046814918518, + "learning_rate": 4.563647398402834e-05, + "loss": 0.4375, "step": 29865 }, { - "epoch": 1.05, - "learning_rate": 4.586524152160881e-05, - "loss": 0.2819, + "epoch": 1.0765127761559807, + "grad_norm": 0.1995263397693634, + "learning_rate": 4.5634826667485385e-05, + "loss": 0.4137, "step": 29870 }, { - "epoch": 1.05, - "learning_rate": 4.586367219131025e-05, - "loss": 0.2921, + "epoch": 1.0766929758172055, + "grad_norm": 0.16926749050617218, + "learning_rate": 4.56331790697969e-05, + "loss": 0.4065, "step": 29875 }, { - "epoch": 1.05, - "learning_rate": 4.5862102590110644e-05, - "loss": 0.3056, + "epoch": 1.0768731754784302, + "grad_norm": 0.18349143862724304, + "learning_rate": 4.563153119098535e-05, + "loss": 0.4106, "step": 29880 }, { - "epoch": 1.05, - "learning_rate": 4.586053271803039e-05, - "loss": 0.2808, + "epoch": 1.0770533751396547, + "grad_norm": 0.18791143596172333, + "learning_rate": 4.562988303107319e-05, + "loss": 0.4174, "step": 29885 }, { - "epoch": 1.05, - "learning_rate": 4.585896257508986e-05, - "loss": 0.3111, + "epoch": 1.0772335748008794, + "grad_norm": 0.1980997622013092, + "learning_rate": 4.5628234590082864e-05, + "loss": 0.4179, "step": 29890 }, { - "epoch": 1.05, - "learning_rate": 4.585739216130945e-05, - "loss": 0.3143, + "epoch": 
1.077413774462104, + "grad_norm": 0.19258050620555878, + "learning_rate": 4.562658586803683e-05, + "loss": 0.4205, "step": 29895 }, { - "epoch": 1.05, - "learning_rate": 4.5855821476709546e-05, - "loss": 0.2987, + "epoch": 1.0775939741233287, + "grad_norm": 0.1779131144285202, + "learning_rate": 4.5624936864957556e-05, + "loss": 0.3996, "step": 29900 }, { - "epoch": 1.05, - "learning_rate": 4.5854250521310556e-05, - "loss": 0.2665, + "epoch": 1.0777741737845532, + "grad_norm": 0.14219936728477478, + "learning_rate": 4.562328758086752e-05, + "loss": 0.4039, "step": 29905 }, { - "epoch": 1.05, - "learning_rate": 4.5852679295132853e-05, - "loss": 0.2807, + "epoch": 1.077954373445778, + "grad_norm": 0.18294230103492737, + "learning_rate": 4.562163801578918e-05, + "loss": 0.4341, "step": 29910 }, { - "epoch": 1.05, - "learning_rate": 4.5851107798196854e-05, - "loss": 0.279, + "epoch": 1.0781345731070027, + "grad_norm": 0.18481683731079102, + "learning_rate": 4.561998816974501e-05, + "loss": 0.4289, "step": 29915 }, { - "epoch": 1.05, - "learning_rate": 4.584953603052297e-05, - "loss": 0.3015, + "epoch": 1.0783147727682272, + "grad_norm": 0.17565883696079254, + "learning_rate": 4.561833804275749e-05, + "loss": 0.4318, "step": 29920 }, { - "epoch": 1.05, - "learning_rate": 4.584796399213159e-05, - "loss": 0.3045, + "epoch": 1.078494972429452, + "grad_norm": 0.17877116799354553, + "learning_rate": 4.561668763484911e-05, + "loss": 0.4196, "step": 29925 }, { - "epoch": 1.05, - "learning_rate": 4.584639168304315e-05, - "loss": 0.2951, + "epoch": 1.0786751720906764, + "grad_norm": 0.1558140218257904, + "learning_rate": 4.561503694604236e-05, + "loss": 0.3991, "step": 29930 }, { - "epoch": 1.05, - "learning_rate": 4.584481910327804e-05, - "loss": 0.2686, + "epoch": 1.0788553717519012, + "grad_norm": 0.19004541635513306, + "learning_rate": 4.561338597635972e-05, + "loss": 0.468, "step": 29935 }, { - "epoch": 1.05, - "learning_rate": 4.5843246252856697e-05, - "loss": 0.2661, + "epoch": 1.0790355714131257, + "grad_norm": 0.14542394876480103, + "learning_rate": 4.561173472582368e-05, + "loss": 0.448, "step": 29940 }, { - "epoch": 1.05, - "learning_rate": 4.584167313179954e-05, - "loss": 0.2954, + "epoch": 1.0792157710743504, + "grad_norm": 0.19577011466026306, + "learning_rate": 4.5610083194456754e-05, + "loss": 0.4157, "step": 29945 }, { - "epoch": 1.05, - "learning_rate": 4.584009974012699e-05, - "loss": 0.2725, + "epoch": 1.079395970735575, + "grad_norm": 0.21589916944503784, + "learning_rate": 4.5608431382281426e-05, + "loss": 0.4265, "step": 29950 }, { - "epoch": 1.05, - "learning_rate": 4.583852607785948e-05, - "loss": 0.3023, + "epoch": 1.0795761703967997, + "grad_norm": 0.14693371951580048, + "learning_rate": 4.560677928932021e-05, + "loss": 0.411, "step": 29955 }, { - "epoch": 1.05, - "learning_rate": 4.583695214501743e-05, - "loss": 0.3105, + "epoch": 1.0797563700580244, + "grad_norm": 0.19670480489730835, + "learning_rate": 4.560512691559562e-05, + "loss": 0.4504, "step": 29960 }, { - "epoch": 1.05, - "learning_rate": 4.58353779416213e-05, - "loss": 0.279, + "epoch": 1.079936569719249, + "grad_norm": 0.18953409790992737, + "learning_rate": 4.560347426113017e-05, + "loss": 0.4325, "step": 29965 }, { - "epoch": 1.05, - "learning_rate": 4.5833803467691515e-05, - "loss": 0.3067, + "epoch": 1.0801167693804736, + "grad_norm": 0.18052424490451813, + "learning_rate": 4.560182132594637e-05, + "loss": 0.4092, "step": 29970 }, { - "epoch": 1.05, - "learning_rate": 4.583222872324853e-05, - "loss": 0.2747, + 
"epoch": 1.0802969690416981, + "grad_norm": 0.19230318069458008, + "learning_rate": 4.560016811006673e-05, + "loss": 0.4343, "step": 29975 }, { - "epoch": 1.05, - "learning_rate": 4.5830653708312775e-05, - "loss": 0.32, + "epoch": 1.0804771687029229, + "grad_norm": 0.2135818749666214, + "learning_rate": 4.5598514613513796e-05, + "loss": 0.4111, "step": 29980 }, { - "epoch": 1.05, - "learning_rate": 4.582907842290472e-05, - "loss": 0.3127, + "epoch": 1.0806573683641474, + "grad_norm": 0.1807836890220642, + "learning_rate": 4.5596860836310094e-05, + "loss": 0.435, "step": 29985 }, { - "epoch": 1.06, - "learning_rate": 4.582750286704479e-05, - "loss": 0.2701, + "epoch": 1.0808375680253721, + "grad_norm": 0.18161781132221222, + "learning_rate": 4.559520677847815e-05, + "loss": 0.4094, "step": 29990 }, { - "epoch": 1.06, - "learning_rate": 4.582592704075348e-05, - "loss": 0.2854, + "epoch": 1.0810177676865969, + "grad_norm": 0.19649210572242737, + "learning_rate": 4.55935524400405e-05, + "loss": 0.3864, "step": 29995 }, { - "epoch": 1.06, - "learning_rate": 4.582435094405122e-05, - "loss": 0.3255, + "epoch": 1.0811979673478214, + "grad_norm": 0.15059545636177063, + "learning_rate": 4.559189782101968e-05, + "loss": 0.3919, "step": 30000 }, { - "epoch": 1.06, - "eval_loss": 0.29047587513923645, - "eval_runtime": 10.5326, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 1.0811979673478214, + "eval_loss": 0.4527240991592407, + "eval_runtime": 3.5376, + "eval_samples_per_second": 28.268, + "eval_steps_per_second": 7.067, "step": 30000 }, { - "epoch": 1.06, - "learning_rate": 4.582277457695849e-05, - "loss": 0.3098, + "epoch": 1.081378167009046, + "grad_norm": 0.16200894117355347, + "learning_rate": 4.559024292143824e-05, + "loss": 0.4298, "step": 30005 }, { - "epoch": 1.06, - "learning_rate": 4.582119793949576e-05, - "loss": 0.2848, + "epoch": 1.0815583666702706, + "grad_norm": 0.19894878566265106, + "learning_rate": 4.558858774131873e-05, + "loss": 0.4217, "step": 30010 }, { - "epoch": 1.06, - "learning_rate": 4.581962103168349e-05, - "loss": 0.2573, + "epoch": 1.0817385663314953, + "grad_norm": 0.18201377987861633, + "learning_rate": 4.5586932280683696e-05, + "loss": 0.4006, "step": 30015 }, { - "epoch": 1.06, - "learning_rate": 4.581804385354217e-05, - "loss": 0.3003, + "epoch": 1.0819187659927199, + "grad_norm": 0.22016799449920654, + "learning_rate": 4.55852765395557e-05, + "loss": 0.4411, "step": 30020 }, { - "epoch": 1.06, - "learning_rate": 4.5816466405092265e-05, - "loss": 0.2979, + "epoch": 1.0820989656539446, + "grad_norm": 0.21885946393013, + "learning_rate": 4.5583620517957296e-05, + "loss": 0.4293, "step": 30025 }, { - "epoch": 1.06, - "learning_rate": 4.5814888686354254e-05, - "loss": 0.2854, + "epoch": 1.0822791653151693, + "grad_norm": 0.21789535880088806, + "learning_rate": 4.5581964215911036e-05, + "loss": 0.4038, "step": 30030 }, { - "epoch": 1.06, - "learning_rate": 4.5813310697348644e-05, - "loss": 0.3119, + "epoch": 1.0824593649763938, + "grad_norm": 0.14212708175182343, + "learning_rate": 4.558030763343951e-05, + "loss": 0.4043, "step": 30035 }, { - "epoch": 1.06, - "learning_rate": 4.58117324380959e-05, - "loss": 0.2668, + "epoch": 1.0826395646376186, + "grad_norm": 0.1961928755044937, + "learning_rate": 4.5578650770565274e-05, + "loss": 0.4439, "step": 30040 }, { - "epoch": 1.06, - "learning_rate": 4.5810153908616534e-05, - "loss": 0.3009, + "epoch": 1.082819764298843, + "grad_norm": 0.18222855031490326, + "learning_rate": 
4.55769936273109e-05, + "loss": 0.4137, "step": 30045 }, { - "epoch": 1.06, - "learning_rate": 4.580857510893103e-05, - "loss": 0.2783, + "epoch": 1.0829999639600678, + "grad_norm": 0.18094827234745026, + "learning_rate": 4.5575336203698985e-05, + "loss": 0.3873, "step": 30050 }, { - "epoch": 1.06, - "learning_rate": 4.5806996039059894e-05, - "loss": 0.2799, + "epoch": 1.0831801636212923, + "grad_norm": 0.14984826743602753, + "learning_rate": 4.557367849975208e-05, + "loss": 0.4094, "step": 30055 }, { - "epoch": 1.06, - "learning_rate": 4.580541669902362e-05, - "loss": 0.3114, + "epoch": 1.083360363282517, + "grad_norm": 0.18728286027908325, + "learning_rate": 4.5572020515492805e-05, + "loss": 0.4556, "step": 30060 }, { - "epoch": 1.06, - "learning_rate": 4.5803837088842736e-05, - "loss": 0.2727, + "epoch": 1.0835405629437416, + "grad_norm": 0.1479881852865219, + "learning_rate": 4.5570362250943725e-05, + "loss": 0.3622, "step": 30065 }, { - "epoch": 1.06, - "learning_rate": 4.5802257208537725e-05, - "loss": 0.2785, + "epoch": 1.0837207626049663, + "grad_norm": 0.2368391901254654, + "learning_rate": 4.556870370612744e-05, + "loss": 0.4298, "step": 30070 }, { - "epoch": 1.06, - "learning_rate": 4.5800677058129114e-05, - "loss": 0.2772, + "epoch": 1.083900962266191, + "grad_norm": 0.1753835380077362, + "learning_rate": 4.5567044881066555e-05, + "loss": 0.4145, "step": 30075 }, { - "epoch": 1.06, - "learning_rate": 4.579909663763742e-05, - "loss": 0.2984, + "epoch": 1.0840811619274155, + "grad_norm": 0.17474301159381866, + "learning_rate": 4.556538577578366e-05, + "loss": 0.4013, "step": 30080 }, { - "epoch": 1.06, - "learning_rate": 4.579751594708317e-05, - "loss": 0.3128, + "epoch": 1.0842613615886403, + "grad_norm": 0.2075207531452179, + "learning_rate": 4.5563726390301374e-05, + "loss": 0.3922, "step": 30085 }, { - "epoch": 1.06, - "learning_rate": 4.579593498648688e-05, - "loss": 0.3048, + "epoch": 1.0844415612498648, + "grad_norm": 0.1771383434534073, + "learning_rate": 4.5562066724642286e-05, + "loss": 0.3737, "step": 30090 }, { - "epoch": 1.06, - "learning_rate": 4.5794353755869076e-05, - "loss": 0.2876, + "epoch": 1.0846217609110895, + "grad_norm": 0.1907252073287964, + "learning_rate": 4.556040677882903e-05, + "loss": 0.4369, "step": 30095 }, { - "epoch": 1.06, - "learning_rate": 4.5792772255250285e-05, - "loss": 0.3073, + "epoch": 1.084801960572314, + "grad_norm": 0.1812266856431961, + "learning_rate": 4.55587465528842e-05, + "loss": 0.4365, "step": 30100 }, { - "epoch": 1.06, - "learning_rate": 4.579119048465106e-05, - "loss": 0.3059, + "epoch": 1.0849821602335388, + "grad_norm": 0.17140717804431915, + "learning_rate": 4.5557086046830446e-05, + "loss": 0.4023, "step": 30105 }, { - "epoch": 1.06, - "learning_rate": 4.578960844409192e-05, - "loss": 0.2862, + "epoch": 1.0851623598947633, + "grad_norm": 0.20850670337677002, + "learning_rate": 4.555542526069036e-05, + "loss": 0.4173, "step": 30110 }, { - "epoch": 1.06, - "learning_rate": 4.5788026133593434e-05, - "loss": 0.2938, + "epoch": 1.085342559555988, + "grad_norm": 0.21336951851844788, + "learning_rate": 4.5553764194486583e-05, + "loss": 0.431, "step": 30115 }, { - "epoch": 1.06, - "learning_rate": 4.578644355317611e-05, - "loss": 0.2748, + "epoch": 1.0855227592172128, + "grad_norm": 0.16418558359146118, + "learning_rate": 4.5552102848241764e-05, + "loss": 0.4199, "step": 30120 }, { - "epoch": 1.06, - "learning_rate": 4.578486070286052e-05, - "loss": 0.3168, + "epoch": 1.0857029588784373, + "grad_norm": 0.15260203182697296, + 
"learning_rate": 4.5550441221978505e-05, + "loss": 0.4031, "step": 30125 }, { - "epoch": 1.06, - "learning_rate": 4.578327758266721e-05, - "loss": 0.2866, + "epoch": 1.085883158539662, + "grad_norm": 0.17375853657722473, + "learning_rate": 4.554877931571947e-05, + "loss": 0.4301, "step": 30130 }, { - "epoch": 1.06, - "learning_rate": 4.578169419261674e-05, - "loss": 0.293, + "epoch": 1.0860633582008865, + "grad_norm": 0.1743849366903305, + "learning_rate": 4.5547117129487305e-05, + "loss": 0.4394, "step": 30135 }, { - "epoch": 1.06, - "learning_rate": 4.578011053272967e-05, - "loss": 0.2965, + "epoch": 1.0862435578621112, + "grad_norm": 0.1563817262649536, + "learning_rate": 4.554545466330463e-05, + "loss": 0.423, "step": 30140 }, { - "epoch": 1.06, - "learning_rate": 4.577852660302655e-05, - "loss": 0.2518, + "epoch": 1.0864237575233358, + "grad_norm": 0.22052206099033356, + "learning_rate": 4.554379191719412e-05, + "loss": 0.4591, "step": 30145 }, { - "epoch": 1.06, - "learning_rate": 4.577694240352797e-05, - "loss": 0.2861, + "epoch": 1.0866039571845605, + "grad_norm": 0.23430973291397095, + "learning_rate": 4.554212889117842e-05, + "loss": 0.4231, "step": 30150 }, { - "epoch": 1.06, - "learning_rate": 4.577535793425447e-05, - "loss": 0.2965, + "epoch": 1.0867841568457852, + "grad_norm": 0.1482902467250824, + "learning_rate": 4.55404655852802e-05, + "loss": 0.4042, "step": 30155 }, { - "epoch": 1.06, - "learning_rate": 4.5773773195226646e-05, - "loss": 0.2779, + "epoch": 1.0869643565070097, + "grad_norm": 0.19089116156101227, + "learning_rate": 4.5538801999522115e-05, + "loss": 0.3929, "step": 30160 }, { - "epoch": 1.06, - "learning_rate": 4.577218818646506e-05, - "loss": 0.2751, + "epoch": 1.0871445561682345, + "grad_norm": 0.1974368840456009, + "learning_rate": 4.553713813392682e-05, + "loss": 0.393, "step": 30165 }, { - "epoch": 1.06, - "learning_rate": 4.5770602907990314e-05, - "loss": 0.298, + "epoch": 1.087324755829459, + "grad_norm": 0.20312896370887756, + "learning_rate": 4.5535473988517e-05, + "loss": 0.3933, "step": 30170 }, { - "epoch": 1.06, - "learning_rate": 4.5769017359822965e-05, - "loss": 0.3175, + "epoch": 1.0875049554906837, + "grad_norm": 0.16208785772323608, + "learning_rate": 4.553380956331531e-05, + "loss": 0.4074, "step": 30175 }, { - "epoch": 1.06, - "learning_rate": 4.5767431541983616e-05, - "loss": 0.3072, + "epoch": 1.0876851551519082, + "grad_norm": 0.16057203710079193, + "learning_rate": 4.5532144858344446e-05, + "loss": 0.4167, "step": 30180 }, { - "epoch": 1.06, - "learning_rate": 4.576584545449286e-05, - "loss": 0.2959, + "epoch": 1.087865354813133, + "grad_norm": 0.19308283925056458, + "learning_rate": 4.5530479873627095e-05, + "loss": 0.4052, "step": 30185 }, { - "epoch": 1.06, - "learning_rate": 4.576425909737128e-05, - "loss": 0.2951, + "epoch": 1.0880455544743577, + "grad_norm": 0.15638864040374756, + "learning_rate": 4.552881460918592e-05, + "loss": 0.4148, "step": 30190 }, { - "epoch": 1.06, - "learning_rate": 4.576267247063948e-05, - "loss": 0.2759, + "epoch": 1.0882257541355822, + "grad_norm": 0.1577041745185852, + "learning_rate": 4.552714906504362e-05, + "loss": 0.4295, "step": 30195 }, { - "epoch": 1.06, - "learning_rate": 4.576108557431806e-05, - "loss": 0.2547, + "epoch": 1.088405953796807, + "grad_norm": 0.18273350596427917, + "learning_rate": 4.552548324122289e-05, + "loss": 0.4369, "step": 30200 }, { - "epoch": 1.06, - "learning_rate": 4.5759498408427634e-05, - "loss": 0.3103, + "epoch": 1.0885861534580314, + "grad_norm": 
0.18305066227912903, + "learning_rate": 4.552381713774643e-05, + "loss": 0.3818, "step": 30205 }, { - "epoch": 1.06, - "learning_rate": 4.57579109729888e-05, - "loss": 0.2763, + "epoch": 1.0887663531192562, + "grad_norm": 0.1907605528831482, + "learning_rate": 4.5522150754636926e-05, + "loss": 0.4491, "step": 30210 }, { - "epoch": 1.06, - "learning_rate": 4.575632326802217e-05, - "loss": 0.2865, + "epoch": 1.0889465527804807, + "grad_norm": 0.18072152137756348, + "learning_rate": 4.55204840919171e-05, + "loss": 0.3953, "step": 30215 }, { - "epoch": 1.06, - "learning_rate": 4.5754735293548355e-05, - "loss": 0.2729, + "epoch": 1.0891267524417054, + "grad_norm": 0.15544374287128448, + "learning_rate": 4.551881714960965e-05, + "loss": 0.4226, "step": 30220 }, { - "epoch": 1.06, - "learning_rate": 4.575314704958799e-05, - "loss": 0.2916, + "epoch": 1.08930695210293, + "grad_norm": 0.19354958832263947, + "learning_rate": 4.5517149927737276e-05, + "loss": 0.3948, "step": 30225 }, { - "epoch": 1.06, - "learning_rate": 4.575155853616168e-05, - "loss": 0.3036, + "epoch": 1.0894871517641547, + "grad_norm": 0.21312229335308075, + "learning_rate": 4.551548242632272e-05, + "loss": 0.4241, "step": 30230 }, { - "epoch": 1.06, - "learning_rate": 4.5749969753290065e-05, - "loss": 0.2789, + "epoch": 1.0896673514253794, + "grad_norm": 0.19396165013313293, + "learning_rate": 4.5513814645388686e-05, + "loss": 0.435, "step": 30235 }, { - "epoch": 1.06, - "learning_rate": 4.574838070099376e-05, - "loss": 0.2888, + "epoch": 1.089847551086604, + "grad_norm": 0.18126662075519562, + "learning_rate": 4.55121465849579e-05, + "loss": 0.4313, "step": 30240 }, { - "epoch": 1.06, - "learning_rate": 4.574679137929341e-05, - "loss": 0.3114, + "epoch": 1.0900277507478286, + "grad_norm": 0.1702282428741455, + "learning_rate": 4.551047824505308e-05, + "loss": 0.4408, "step": 30245 }, { - "epoch": 1.06, - "learning_rate": 4.5745201788209646e-05, - "loss": 0.3183, + "epoch": 1.0902079504090532, + "grad_norm": 0.17638468742370605, + "learning_rate": 4.550880962569697e-05, + "loss": 0.4091, "step": 30250 }, { - "epoch": 1.06, - "learning_rate": 4.5743611927763106e-05, - "loss": 0.2891, + "epoch": 1.090388150070278, + "grad_norm": 0.20289641618728638, + "learning_rate": 4.55071407269123e-05, + "loss": 0.4385, "step": 30255 }, { - "epoch": 1.06, - "learning_rate": 4.5742021797974447e-05, - "loss": 0.3024, + "epoch": 1.0905683497315024, + "grad_norm": 0.16934141516685486, + "learning_rate": 4.5505471548721815e-05, + "loss": 0.4698, "step": 30260 }, { - "epoch": 1.06, - "learning_rate": 4.574043139886429e-05, - "loss": 0.3048, + "epoch": 1.0907485493927271, + "grad_norm": 0.1499088704586029, + "learning_rate": 4.550380209114824e-05, + "loss": 0.4283, "step": 30265 }, { - "epoch": 1.06, - "learning_rate": 4.57388407304533e-05, - "loss": 0.3093, + "epoch": 1.0909287490539519, + "grad_norm": 0.1858983188867569, + "learning_rate": 4.550213235421433e-05, + "loss": 0.4332, "step": 30270 }, { - "epoch": 1.07, - "learning_rate": 4.573724979276214e-05, - "loss": 0.2744, + "epoch": 1.0911089487151764, + "grad_norm": 0.1682644486427307, + "learning_rate": 4.550046233794284e-05, + "loss": 0.4435, "step": 30275 }, { - "epoch": 1.07, - "learning_rate": 4.573565858581145e-05, - "loss": 0.3016, + "epoch": 1.0912891483764011, + "grad_norm": 0.21229910850524902, + "learning_rate": 4.5498792042356516e-05, + "loss": 0.4552, "step": 30280 }, { - "epoch": 1.07, - "learning_rate": 4.57340671096219e-05, - "loss": 0.2876, + "epoch": 1.0914693480376256, + 
"grad_norm": 0.18798302114009857, + "learning_rate": 4.549712146747812e-05, + "loss": 0.4752, "step": 30285 }, { - "epoch": 1.07, - "learning_rate": 4.573247536421415e-05, - "loss": 0.2879, + "epoch": 1.0916495476988504, + "grad_norm": 0.192665234208107, + "learning_rate": 4.549545061333042e-05, + "loss": 0.4404, "step": 30290 }, { - "epoch": 1.07, - "learning_rate": 4.5730883349608885e-05, - "loss": 0.3148, + "epoch": 1.0918297473600749, + "grad_norm": 0.18173715472221375, + "learning_rate": 4.549377947993617e-05, + "loss": 0.4051, "step": 30295 }, { - "epoch": 1.07, - "learning_rate": 4.572929106582675e-05, - "loss": 0.3076, + "epoch": 1.0920099470212996, + "grad_norm": 0.17702946066856384, + "learning_rate": 4.549210806731814e-05, + "loss": 0.4092, "step": 30300 }, { - "epoch": 1.07, - "learning_rate": 4.572769851288843e-05, - "loss": 0.2992, + "epoch": 1.0921901466825243, + "grad_norm": 0.19283123314380646, + "learning_rate": 4.54904363754991e-05, + "loss": 0.4396, "step": 30305 }, { - "epoch": 1.07, - "learning_rate": 4.5726105690814616e-05, - "loss": 0.2713, + "epoch": 1.0923703463437489, + "grad_norm": 0.16534939408302307, + "learning_rate": 4.5488764404501836e-05, + "loss": 0.4085, "step": 30310 }, { - "epoch": 1.07, - "learning_rate": 4.5724512599625965e-05, - "loss": 0.311, + "epoch": 1.0925505460049736, + "grad_norm": 0.1978139728307724, + "learning_rate": 4.5487092154349134e-05, + "loss": 0.4261, "step": 30315 }, { - "epoch": 1.07, - "learning_rate": 4.572291923934319e-05, - "loss": 0.276, + "epoch": 1.092730745666198, + "grad_norm": 0.17723916471004486, + "learning_rate": 4.548541962506375e-05, + "loss": 0.4394, "step": 30320 }, { - "epoch": 1.07, - "learning_rate": 4.5721325609986956e-05, - "loss": 0.2811, + "epoch": 1.0929109453274228, + "grad_norm": 0.15324194729328156, + "learning_rate": 4.54837468166685e-05, + "loss": 0.4128, "step": 30325 }, { - "epoch": 1.07, - "learning_rate": 4.571973171157796e-05, - "loss": 0.3065, + "epoch": 1.0930911449886473, + "grad_norm": 0.14871060848236084, + "learning_rate": 4.548207372918617e-05, + "loss": 0.4189, "step": 30330 }, { - "epoch": 1.07, - "learning_rate": 4.5718137544136906e-05, - "loss": 0.2823, + "epoch": 1.093271344649872, + "grad_norm": 0.1577349752187729, + "learning_rate": 4.5480400362639544e-05, + "loss": 0.3968, "step": 30335 }, { - "epoch": 1.07, - "learning_rate": 4.57165431076845e-05, - "loss": 0.2673, + "epoch": 1.0934515443110966, + "grad_norm": 0.18540742993354797, + "learning_rate": 4.5478726717051425e-05, + "loss": 0.4243, "step": 30340 }, { - "epoch": 1.07, - "learning_rate": 4.571494840224143e-05, - "loss": 0.3034, + "epoch": 1.0936317439723213, + "grad_norm": 0.17788349092006683, + "learning_rate": 4.547705279244462e-05, + "loss": 0.4177, "step": 30345 }, { - "epoch": 1.07, - "learning_rate": 4.57133534278284e-05, - "loss": 0.2886, + "epoch": 1.093811943633546, + "grad_norm": 0.17169490456581116, + "learning_rate": 4.5475378588841945e-05, + "loss": 0.4525, "step": 30350 }, { - "epoch": 1.07, - "learning_rate": 4.571175818446613e-05, - "loss": 0.3097, + "epoch": 1.0939921432947706, + "grad_norm": 0.17133040726184845, + "learning_rate": 4.547370410626619e-05, + "loss": 0.4083, "step": 30355 }, { - "epoch": 1.07, - "learning_rate": 4.5710162672175326e-05, - "loss": 0.301, + "epoch": 1.0941723429559953, + "grad_norm": 0.2028420865535736, + "learning_rate": 4.547202934474019e-05, + "loss": 0.448, "step": 30360 }, { - "epoch": 1.07, - "learning_rate": 4.570856689097672e-05, - "loss": 0.3069, + "epoch": 
1.0943525426172198, + "grad_norm": 0.2202211320400238, + "learning_rate": 4.5470354304286746e-05, + "loss": 0.4242, "step": 30365 }, { - "epoch": 1.07, - "learning_rate": 4.570697084089101e-05, - "loss": 0.2981, + "epoch": 1.0945327422784445, + "grad_norm": 0.19225327670574188, + "learning_rate": 4.546867898492869e-05, + "loss": 0.4575, "step": 30370 }, { - "epoch": 1.07, - "learning_rate": 4.570537452193893e-05, - "loss": 0.321, + "epoch": 1.094712941939669, + "grad_norm": 0.18303290009498596, + "learning_rate": 4.546700338668884e-05, + "loss": 0.4284, "step": 30375 }, { - "epoch": 1.07, - "learning_rate": 4.570377793414121e-05, - "loss": 0.3171, + "epoch": 1.0948931416008938, + "grad_norm": 0.19182315468788147, + "learning_rate": 4.546532750959004e-05, + "loss": 0.4241, "step": 30380 }, { - "epoch": 1.07, - "learning_rate": 4.570218107751858e-05, - "loss": 0.3056, + "epoch": 1.0950733412621185, + "grad_norm": 0.15474554896354675, + "learning_rate": 4.546365135365511e-05, + "loss": 0.4373, "step": 30385 }, { - "epoch": 1.07, - "learning_rate": 4.5700583952091766e-05, - "loss": 0.2731, + "epoch": 1.095253540923343, + "grad_norm": 0.17440365254878998, + "learning_rate": 4.54619749189069e-05, + "loss": 0.4173, "step": 30390 }, { - "epoch": 1.07, - "learning_rate": 4.569898655788152e-05, - "loss": 0.2996, + "epoch": 1.0954337405845678, + "grad_norm": 0.18450212478637695, + "learning_rate": 4.546029820536824e-05, + "loss": 0.4307, "step": 30395 }, { - "epoch": 1.07, - "learning_rate": 4.5697388894908565e-05, - "loss": 0.3146, + "epoch": 1.0956139402457923, + "grad_norm": 0.15440581738948822, + "learning_rate": 4.545862121306197e-05, + "loss": 0.4153, "step": 30400 }, { - "epoch": 1.07, - "learning_rate": 4.5695790963193666e-05, - "loss": 0.2897, + "epoch": 1.095794139907017, + "grad_norm": 0.19222205877304077, + "learning_rate": 4.5456943942010954e-05, + "loss": 0.4095, "step": 30405 }, { - "epoch": 1.07, - "learning_rate": 4.569419276275755e-05, - "loss": 0.2906, + "epoch": 1.0959743395682415, + "grad_norm": 0.24602346122264862, + "learning_rate": 4.545526639223804e-05, + "loss": 0.4373, "step": 30410 }, { - "epoch": 1.07, - "learning_rate": 4.569259429362098e-05, - "loss": 0.2822, + "epoch": 1.0961545392294663, + "grad_norm": 0.12822362780570984, + "learning_rate": 4.545358856376608e-05, + "loss": 0.3651, "step": 30415 }, { - "epoch": 1.07, - "learning_rate": 4.569131532486135e-05, - "loss": 0.3105, + "epoch": 1.096334738890691, + "grad_norm": 0.1669246107339859, + "learning_rate": 4.545191045661793e-05, + "loss": 0.383, "step": 30420 }, { - "epoch": 1.07, - "learning_rate": 4.5689716372116266e-05, - "loss": 0.2918, + "epoch": 1.0965149385519155, + "grad_norm": 0.18753668665885925, + "learning_rate": 4.5450232070816455e-05, + "loss": 0.4247, "step": 30425 }, { - "epoch": 1.07, - "learning_rate": 4.5688117150728856e-05, - "loss": 0.2943, + "epoch": 1.0966951382131402, + "grad_norm": 0.19645550847053528, + "learning_rate": 4.544855340638454e-05, + "loss": 0.3919, "step": 30430 }, { - "epoch": 1.07, - "learning_rate": 4.5686517660719875e-05, - "loss": 0.3087, + "epoch": 1.0968753378743648, + "grad_norm": 0.16774851083755493, + "learning_rate": 4.544687446334504e-05, + "loss": 0.4473, "step": 30435 }, { - "epoch": 1.07, - "learning_rate": 4.56849179021101e-05, - "loss": 0.2929, + "epoch": 1.0970555375355895, + "grad_norm": 0.16702622175216675, + "learning_rate": 4.544519524172083e-05, + "loss": 0.4206, "step": 30440 }, { - "epoch": 1.07, - "learning_rate": 4.568331787492029e-05, - "loss": 
0.2754, + "epoch": 1.097235737196814, + "grad_norm": 0.18590982258319855, + "learning_rate": 4.5443515741534805e-05, + "loss": 0.374, "step": 30445 }, { - "epoch": 1.07, - "learning_rate": 4.568171757917123e-05, - "loss": 0.3089, + "epoch": 1.0974159368580387, + "grad_norm": 0.18357446789741516, + "learning_rate": 4.544183596280982e-05, + "loss": 0.4525, "step": 30450 }, { - "epoch": 1.07, - "learning_rate": 4.568011701488371e-05, - "loss": 0.289, + "epoch": 1.0975961365192632, + "grad_norm": 0.196320578455925, + "learning_rate": 4.544015590556879e-05, + "loss": 0.4196, "step": 30455 }, { - "epoch": 1.07, - "learning_rate": 4.567851618207848e-05, - "loss": 0.281, + "epoch": 1.097776336180488, + "grad_norm": 0.21362504363059998, + "learning_rate": 4.5438475569834585e-05, + "loss": 0.4115, "step": 30460 }, { - "epoch": 1.07, - "learning_rate": 4.567691508077635e-05, - "loss": 0.2843, + "epoch": 1.0979565358417127, + "grad_norm": 0.18946920335292816, + "learning_rate": 4.5436794955630115e-05, + "loss": 0.4346, "step": 30465 }, { - "epoch": 1.07, - "learning_rate": 4.5675313710998116e-05, - "loss": 0.2716, + "epoch": 1.0981367355029372, + "grad_norm": 0.1708582490682602, + "learning_rate": 4.5435114062978255e-05, + "loss": 0.4377, "step": 30470 }, { - "epoch": 1.07, - "learning_rate": 4.5673712072764544e-05, - "loss": 0.2786, + "epoch": 1.098316935164162, + "grad_norm": 0.1605166643857956, + "learning_rate": 4.543343289190194e-05, + "loss": 0.4377, "step": 30475 }, { - "epoch": 1.07, - "learning_rate": 4.567211016609645e-05, - "loss": 0.2728, + "epoch": 1.0984971348253865, + "grad_norm": 0.18001148104667664, + "learning_rate": 4.543175144242405e-05, + "loss": 0.4053, "step": 30480 }, { - "epoch": 1.07, - "learning_rate": 4.567050799101463e-05, - "loss": 0.2905, + "epoch": 1.0986773344866112, + "grad_norm": 0.16023088991641998, + "learning_rate": 4.5430069714567503e-05, + "loss": 0.4372, "step": 30485 }, { - "epoch": 1.07, - "learning_rate": 4.566890554753989e-05, - "loss": 0.2941, + "epoch": 1.0988575341478357, + "grad_norm": 0.1586974859237671, + "learning_rate": 4.5428387708355214e-05, + "loss": 0.3954, "step": 30490 }, { - "epoch": 1.07, - "learning_rate": 4.5667302835693034e-05, - "loss": 0.2961, + "epoch": 1.0990377338090604, + "grad_norm": 0.1917564570903778, + "learning_rate": 4.542670542381009e-05, + "loss": 0.4374, "step": 30495 }, { - "epoch": 1.07, - "learning_rate": 4.5665699855494863e-05, - "loss": 0.3006, + "epoch": 1.099217933470285, + "grad_norm": 0.16604667901992798, + "learning_rate": 4.542502286095507e-05, + "loss": 0.4139, "step": 30500 }, { - "epoch": 1.07, - "eval_loss": 0.2904548943042755, - "eval_runtime": 10.5425, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 1.099217933470285, + "eval_loss": 0.4521316885948181, + "eval_runtime": 3.5361, + "eval_samples_per_second": 28.28, + "eval_steps_per_second": 7.07, "step": 30500 }, { - "epoch": 1.07, - "learning_rate": 4.56640966069662e-05, - "loss": 0.2878, + "epoch": 1.0993981331315097, + "grad_norm": 0.16811847686767578, + "learning_rate": 4.542334001981307e-05, + "loss": 0.4095, "step": 30505 }, { - "epoch": 1.07, - "learning_rate": 4.566249309012787e-05, - "loss": 0.3108, + "epoch": 1.0995783327927344, + "grad_norm": 0.1503792107105255, + "learning_rate": 4.5421656900407e-05, + "loss": 0.442, "step": 30510 }, { - "epoch": 1.07, - "learning_rate": 4.566088930500068e-05, - "loss": 0.3057, + "epoch": 1.099758532453959, + "grad_norm": 0.18034709990024567, + "learning_rate": 
4.5419973502759816e-05, + "loss": 0.4725, "step": 30515 }, { - "epoch": 1.07, - "learning_rate": 4.565928525160545e-05, - "loss": 0.3067, + "epoch": 1.0999387321151837, + "grad_norm": 0.16529808938503265, + "learning_rate": 4.541828982689445e-05, + "loss": 0.3991, "step": 30520 }, { - "epoch": 1.07, - "learning_rate": 4.565768092996302e-05, - "loss": 0.2811, + "epoch": 1.1001189317764082, + "grad_norm": 0.17672580480575562, + "learning_rate": 4.541660587283384e-05, + "loss": 0.4159, "step": 30525 }, { - "epoch": 1.07, - "learning_rate": 4.5656076340094224e-05, - "loss": 0.2812, + "epoch": 1.100299131437633, + "grad_norm": 0.17020630836486816, + "learning_rate": 4.541492164060092e-05, + "loss": 0.4095, "step": 30530 }, { - "epoch": 1.07, - "learning_rate": 4.5654471482019876e-05, - "loss": 0.2949, + "epoch": 1.1004793310988576, + "grad_norm": 0.19656726717948914, + "learning_rate": 4.541323713021865e-05, + "loss": 0.4345, "step": 30535 }, { - "epoch": 1.07, - "learning_rate": 4.565286635576084e-05, - "loss": 0.2894, + "epoch": 1.1006595307600822, + "grad_norm": 0.15002837777137756, + "learning_rate": 4.541155234170997e-05, + "loss": 0.4065, "step": 30540 }, { - "epoch": 1.07, - "learning_rate": 4.565126096133794e-05, - "loss": 0.288, + "epoch": 1.100839730421307, + "grad_norm": 0.15486060082912445, + "learning_rate": 4.540986727509785e-05, + "loss": 0.3887, "step": 30545 }, { - "epoch": 1.07, - "learning_rate": 4.564965529877203e-05, - "loss": 0.3034, + "epoch": 1.1010199300825314, + "grad_norm": 0.16535469889640808, + "learning_rate": 4.540818193040523e-05, + "loss": 0.395, "step": 30550 }, { - "epoch": 1.08, - "learning_rate": 4.5648049368083956e-05, - "loss": 0.2972, + "epoch": 1.1012001297437561, + "grad_norm": 0.14959602057933807, + "learning_rate": 4.54064963076551e-05, + "loss": 0.4287, "step": 30555 }, { - "epoch": 1.08, - "learning_rate": 4.564644316929456e-05, - "loss": 0.2935, + "epoch": 1.1013803294049807, + "grad_norm": 0.19328902661800385, + "learning_rate": 4.5404810406870396e-05, + "loss": 0.4119, "step": 30560 }, { - "epoch": 1.08, - "learning_rate": 4.564483670242471e-05, - "loss": 0.2799, + "epoch": 1.1015605290662054, + "grad_norm": 0.18112468719482422, + "learning_rate": 4.54031242280741e-05, + "loss": 0.4082, "step": 30565 }, { - "epoch": 1.08, - "learning_rate": 4.564322996749526e-05, - "loss": 0.2615, + "epoch": 1.10174072872743, + "grad_norm": 0.2009975016117096, + "learning_rate": 4.540143777128919e-05, + "loss": 0.4164, "step": 30570 }, { - "epoch": 1.08, - "learning_rate": 4.5641622964527076e-05, - "loss": 0.3086, + "epoch": 1.1019209283886546, + "grad_norm": 0.15006959438323975, + "learning_rate": 4.539975103653864e-05, + "loss": 0.37, "step": 30575 }, { - "epoch": 1.08, - "learning_rate": 4.564001569354102e-05, - "loss": 0.3051, + "epoch": 1.1021011280498794, + "grad_norm": 0.15584491193294525, + "learning_rate": 4.5398064023845424e-05, + "loss": 0.4106, "step": 30580 }, { - "epoch": 1.08, - "learning_rate": 4.563840815455797e-05, - "loss": 0.2889, + "epoch": 1.1022813277111039, + "grad_norm": 0.17378970980644226, + "learning_rate": 4.539637673323255e-05, + "loss": 0.4463, "step": 30585 }, { - "epoch": 1.08, - "learning_rate": 4.563680034759877e-05, - "loss": 0.2883, + "epoch": 1.1024615273723286, + "grad_norm": 0.21250377595424652, + "learning_rate": 4.539468916472298e-05, + "loss": 0.4063, "step": 30590 }, { - "epoch": 1.08, - "learning_rate": 4.563519227268433e-05, - "loss": 0.2804, + "epoch": 1.1026417270335531, + "grad_norm": 0.19015663862228394, + 
"learning_rate": 4.539300131833972e-05, + "loss": 0.4417, "step": 30595 }, { - "epoch": 1.08, - "learning_rate": 4.563358392983553e-05, - "loss": 0.2779, + "epoch": 1.1028219266947779, + "grad_norm": 0.15822429955005646, + "learning_rate": 4.539131319410577e-05, + "loss": 0.4316, "step": 30600 }, { - "epoch": 1.08, - "learning_rate": 4.563197531907323e-05, - "loss": 0.2925, + "epoch": 1.1030021263560024, + "grad_norm": 0.17139464616775513, + "learning_rate": 4.538962479204412e-05, + "loss": 0.4495, "step": 30605 }, { - "epoch": 1.08, - "learning_rate": 4.563036644041833e-05, - "loss": 0.312, + "epoch": 1.103182326017227, + "grad_norm": 0.14621873199939728, + "learning_rate": 4.538793611217778e-05, + "loss": 0.4219, "step": 30610 }, { - "epoch": 1.08, - "learning_rate": 4.56287572938917e-05, - "loss": 0.3245, + "epoch": 1.1033625256784516, + "grad_norm": 0.15971679985523224, + "learning_rate": 4.538624715452976e-05, + "loss": 0.4193, "step": 30615 }, { - "epoch": 1.08, - "learning_rate": 4.562714787951427e-05, - "loss": 0.303, + "epoch": 1.1035427253396763, + "grad_norm": 0.16001123189926147, + "learning_rate": 4.538455791912307e-05, + "loss": 0.4119, "step": 30620 }, { - "epoch": 1.08, - "learning_rate": 4.5625538197306916e-05, - "loss": 0.2757, + "epoch": 1.103722925000901, + "grad_norm": 0.17780780792236328, + "learning_rate": 4.5382868405980724e-05, + "loss": 0.4474, "step": 30625 }, { - "epoch": 1.08, - "learning_rate": 4.5623928247290536e-05, - "loss": 0.2908, + "epoch": 1.1039031246621256, + "grad_norm": 0.17984482645988464, + "learning_rate": 4.5381178615125746e-05, + "loss": 0.4399, "step": 30630 }, { - "epoch": 1.08, - "learning_rate": 4.5622318029486045e-05, - "loss": 0.2996, + "epoch": 1.1040833243233503, + "grad_norm": 0.1892779916524887, + "learning_rate": 4.537948854658115e-05, + "loss": 0.4361, "step": 30635 }, { - "epoch": 1.08, - "learning_rate": 4.562070754391434e-05, - "loss": 0.2964, + "epoch": 1.1042635239845748, + "grad_norm": 0.13633422553539276, + "learning_rate": 4.537779820036997e-05, + "loss": 0.4032, "step": 30640 }, { - "epoch": 1.08, - "learning_rate": 4.561909679059635e-05, - "loss": 0.2906, + "epoch": 1.1044437236457996, + "grad_norm": 0.1995527446269989, + "learning_rate": 4.5376107576515235e-05, + "loss": 0.4478, "step": 30645 }, { - "epoch": 1.08, - "learning_rate": 4.561748576955297e-05, - "loss": 0.2945, + "epoch": 1.104623923307024, + "grad_norm": 0.21352799236774445, + "learning_rate": 4.537441667503998e-05, + "loss": 0.4282, "step": 30650 }, { - "epoch": 1.08, - "learning_rate": 4.5615874480805114e-05, - "loss": 0.2942, + "epoch": 1.1048041229682488, + "grad_norm": 0.2059534788131714, + "learning_rate": 4.537272549596724e-05, + "loss": 0.4291, "step": 30655 }, { - "epoch": 1.08, - "learning_rate": 4.561426292437372e-05, - "loss": 0.2847, + "epoch": 1.1049843226294735, + "grad_norm": 0.17112842202186584, + "learning_rate": 4.5371034039320065e-05, + "loss": 0.3752, "step": 30660 }, { - "epoch": 1.08, - "learning_rate": 4.561265110027971e-05, - "loss": 0.2865, + "epoch": 1.105164522290698, + "grad_norm": 0.18487350642681122, + "learning_rate": 4.536934230512149e-05, + "loss": 0.4198, "step": 30665 }, { - "epoch": 1.08, - "learning_rate": 4.561103900854401e-05, - "loss": 0.2858, + "epoch": 1.1053447219519228, + "grad_norm": 0.20532426238059998, + "learning_rate": 4.5367650293394574e-05, + "loss": 0.4549, "step": 30670 }, { - "epoch": 1.08, - "learning_rate": 4.560942664918755e-05, - "loss": 0.2842, + "epoch": 1.1055249216131473, + "grad_norm": 
0.1958533376455307, + "learning_rate": 4.536595800416236e-05, + "loss": 0.424, "step": 30675 }, { - "epoch": 1.08, - "learning_rate": 4.5607814022231266e-05, - "loss": 0.3133, + "epoch": 1.105705121274372, + "grad_norm": 0.18324759602546692, + "learning_rate": 4.5364265437447915e-05, + "loss": 0.4424, "step": 30680 }, { - "epoch": 1.08, - "learning_rate": 4.5606201127696105e-05, - "loss": 0.2843, + "epoch": 1.1058853209355965, + "grad_norm": 0.16815263032913208, + "learning_rate": 4.536257259327429e-05, + "loss": 0.3943, "step": 30685 }, { - "epoch": 1.08, - "learning_rate": 4.5604587965602996e-05, - "loss": 0.3128, + "epoch": 1.1060655205968213, + "grad_norm": 0.16847245395183563, + "learning_rate": 4.536087947166456e-05, + "loss": 0.4235, "step": 30690 }, { - "epoch": 1.08, - "learning_rate": 4.560297453597288e-05, - "loss": 0.2892, + "epoch": 1.106245720258046, + "grad_norm": 0.18886929750442505, + "learning_rate": 4.5359186072641796e-05, + "loss": 0.431, "step": 30695 }, { - "epoch": 1.08, - "learning_rate": 4.560136083882674e-05, - "loss": 0.3015, + "epoch": 1.1064259199192705, + "grad_norm": 0.19722402095794678, + "learning_rate": 4.535749239622906e-05, + "loss": 0.4551, "step": 30700 }, { - "epoch": 1.08, - "learning_rate": 4.559974687418549e-05, - "loss": 0.3036, + "epoch": 1.1066061195804953, + "grad_norm": 0.1684960126876831, + "learning_rate": 4.535579844244943e-05, + "loss": 0.4292, "step": 30705 }, { - "epoch": 1.08, - "learning_rate": 4.559813264207011e-05, - "loss": 0.2806, + "epoch": 1.1067863192417198, + "grad_norm": 0.16538777947425842, + "learning_rate": 4.535410421132598e-05, + "loss": 0.4574, "step": 30710 }, { - "epoch": 1.08, - "learning_rate": 4.5596518142501556e-05, - "loss": 0.289, + "epoch": 1.1069665189029445, + "grad_norm": 0.21061739325523376, + "learning_rate": 4.535240970288181e-05, + "loss": 0.402, "step": 30715 }, { - "epoch": 1.08, - "learning_rate": 4.5594903375500786e-05, - "loss": 0.2716, + "epoch": 1.107146718564169, + "grad_norm": 0.18841953575611115, + "learning_rate": 4.535071491713999e-05, + "loss": 0.4477, "step": 30720 }, { - "epoch": 1.08, - "learning_rate": 4.559328834108876e-05, - "loss": 0.2837, + "epoch": 1.1073269182253938, + "grad_norm": 0.15210363268852234, + "learning_rate": 4.534901985412363e-05, + "loss": 0.4299, "step": 30725 }, { - "epoch": 1.08, - "learning_rate": 4.559167303928646e-05, - "loss": 0.2884, + "epoch": 1.1075071178866183, + "grad_norm": 0.18912887573242188, + "learning_rate": 4.53473245138558e-05, + "loss": 0.4201, "step": 30730 }, { - "epoch": 1.08, - "learning_rate": 4.559005747011486e-05, - "loss": 0.2818, + "epoch": 1.107687317547843, + "grad_norm": 0.1371183693408966, + "learning_rate": 4.5345628896359625e-05, + "loss": 0.4116, "step": 30735 }, { - "epoch": 1.08, - "learning_rate": 4.5588441633594926e-05, - "loss": 0.304, + "epoch": 1.1078675172090677, + "grad_norm": 0.15921291708946228, + "learning_rate": 4.5343933001658194e-05, + "loss": 0.4222, "step": 30740 }, { - "epoch": 1.08, - "learning_rate": 4.558682552974765e-05, - "loss": 0.306, + "epoch": 1.1080477168702922, + "grad_norm": 0.23919863998889923, + "learning_rate": 4.5342236829774617e-05, + "loss": 0.453, "step": 30745 }, { - "epoch": 1.08, - "learning_rate": 4.5585209158594014e-05, - "loss": 0.2925, + "epoch": 1.108227916531517, + "grad_norm": 0.2175007313489914, + "learning_rate": 4.534054038073199e-05, + "loss": 0.4146, "step": 30750 }, { - "epoch": 1.08, - "learning_rate": 4.558359252015499e-05, - "loss": 0.2952, + "epoch": 1.1084081161927415, + 
"grad_norm": 0.16294798254966736, + "learning_rate": 4.533884365455345e-05, + "loss": 0.3633, "step": 30755 }, { - "epoch": 1.08, - "learning_rate": 4.55819756144516e-05, - "loss": 0.2705, + "epoch": 1.1085883158539662, + "grad_norm": 0.15248903632164001, + "learning_rate": 4.5337146651262094e-05, + "loss": 0.4316, "step": 30760 }, { - "epoch": 1.08, - "learning_rate": 4.558035844150481e-05, - "loss": 0.2831, + "epoch": 1.1087685155151907, + "grad_norm": 0.18344025313854218, + "learning_rate": 4.533544937088106e-05, + "loss": 0.3941, "step": 30765 }, { - "epoch": 1.08, - "learning_rate": 4.557874100133563e-05, - "loss": 0.2785, + "epoch": 1.1089487151764155, + "grad_norm": 0.2048654705286026, + "learning_rate": 4.533375181343346e-05, + "loss": 0.4528, "step": 30770 }, { - "epoch": 1.08, - "learning_rate": 4.557712329396507e-05, - "loss": 0.3199, + "epoch": 1.1091289148376402, + "grad_norm": 0.20084497332572937, + "learning_rate": 4.5332053978942436e-05, + "loss": 0.4366, "step": 30775 }, { - "epoch": 1.08, - "learning_rate": 4.5575505319414117e-05, - "loss": 0.3042, + "epoch": 1.1093091144988647, + "grad_norm": 0.13244780898094177, + "learning_rate": 4.5330355867431106e-05, + "loss": 0.4239, "step": 30780 }, { - "epoch": 1.08, - "learning_rate": 4.5573887077703786e-05, - "loss": 0.291, + "epoch": 1.1094893141600894, + "grad_norm": 0.21777132153511047, + "learning_rate": 4.532865747892261e-05, + "loss": 0.4454, "step": 30785 }, { - "epoch": 1.08, - "learning_rate": 4.557226856885509e-05, - "loss": 0.2976, + "epoch": 1.109669513821314, + "grad_norm": 0.19449090957641602, + "learning_rate": 4.53269588134401e-05, + "loss": 0.4324, "step": 30790 }, { - "epoch": 1.08, - "learning_rate": 4.557064979288905e-05, - "loss": 0.3074, + "epoch": 1.1098497134825387, + "grad_norm": 0.16358676552772522, + "learning_rate": 4.532525987100671e-05, + "loss": 0.4083, "step": 30795 }, { - "epoch": 1.08, - "learning_rate": 4.5569030749826666e-05, - "loss": 0.28, + "epoch": 1.1100299131437632, + "grad_norm": 0.1827852874994278, + "learning_rate": 4.532356065164558e-05, + "loss": 0.411, "step": 30800 }, { - "epoch": 1.08, - "learning_rate": 4.556741143968899e-05, - "loss": 0.2845, + "epoch": 1.110210112804988, + "grad_norm": 0.16371290385723114, + "learning_rate": 4.5321861155379884e-05, + "loss": 0.4314, "step": 30805 }, { - "epoch": 1.08, - "learning_rate": 4.556579186249702e-05, - "loss": 0.303, + "epoch": 1.1103903124662127, + "grad_norm": 0.18602146208286285, + "learning_rate": 4.532016138223276e-05, + "loss": 0.4158, "step": 30810 }, { - "epoch": 1.08, - "learning_rate": 4.55641720182718e-05, - "loss": 0.3016, + "epoch": 1.1105705121274372, + "grad_norm": 0.14630118012428284, + "learning_rate": 4.5318461332227365e-05, + "loss": 0.3876, "step": 30815 }, { - "epoch": 1.08, - "learning_rate": 4.556255190703436e-05, - "loss": 0.2929, + "epoch": 1.110750711788662, + "grad_norm": 0.1606942117214203, + "learning_rate": 4.531676100538688e-05, + "loss": 0.4428, "step": 30820 }, { - "epoch": 1.08, - "learning_rate": 4.556093152880573e-05, - "loss": 0.28, + "epoch": 1.1109309114498864, + "grad_norm": 0.16617447137832642, + "learning_rate": 4.5315060401734445e-05, + "loss": 0.4363, "step": 30825 }, { - "epoch": 1.08, - "learning_rate": 4.555931088360695e-05, - "loss": 0.3086, + "epoch": 1.1111111111111112, + "grad_norm": 0.1629336029291153, + "learning_rate": 4.5313359521293254e-05, + "loss": 0.403, "step": 30830 }, { - "epoch": 1.08, - "learning_rate": 4.555768997145908e-05, - "loss": 0.3224, + "epoch": 
1.1112913107723357, + "grad_norm": 0.1905163824558258, + "learning_rate": 4.5311658364086474e-05, + "loss": 0.4358, "step": 30835 }, { - "epoch": 1.09, - "learning_rate": 4.555606879238314e-05, - "loss": 0.3191, + "epoch": 1.1114715104335604, + "grad_norm": 0.17347170412540436, + "learning_rate": 4.531029723906539e-05, + "loss": 0.4344, "step": 30840 }, { - "epoch": 1.09, - "learning_rate": 4.555444734640021e-05, - "loss": 0.2812, + "epoch": 1.111651710094785, + "grad_norm": 0.1710389107465744, + "learning_rate": 4.530859558373896e-05, + "loss": 0.4036, "step": 30845 }, { - "epoch": 1.09, - "learning_rate": 4.555282563353132e-05, - "loss": 0.2703, + "epoch": 1.1118319097560097, + "grad_norm": 0.15823177993297577, + "learning_rate": 4.530689365171184e-05, + "loss": 0.3619, "step": 30850 }, { - "epoch": 1.09, - "learning_rate": 4.5551203653797525e-05, - "loss": 0.2794, + "epoch": 1.1120121094172344, + "grad_norm": 0.17477105557918549, + "learning_rate": 4.530519144300722e-05, + "loss": 0.4299, "step": 30855 }, { - "epoch": 1.09, - "learning_rate": 4.55495814072199e-05, - "loss": 0.287, + "epoch": 1.112192309078459, + "grad_norm": 0.15027157962322235, + "learning_rate": 4.530348895764831e-05, + "loss": 0.4332, "step": 30860 }, { - "epoch": 1.09, - "learning_rate": 4.554795889381951e-05, - "loss": 0.2831, + "epoch": 1.1123725087396836, + "grad_norm": 0.1768927425146103, + "learning_rate": 4.530178619565829e-05, + "loss": 0.434, "step": 30865 }, { - "epoch": 1.09, - "learning_rate": 4.554633611361741e-05, - "loss": 0.2908, + "epoch": 1.1125527084009081, + "grad_norm": 0.17332357168197632, + "learning_rate": 4.5300083157060356e-05, + "loss": 0.4292, "step": 30870 }, { - "epoch": 1.09, - "learning_rate": 4.554471306663467e-05, - "loss": 0.2641, + "epoch": 1.1127329080621329, + "grad_norm": 0.22046640515327454, + "learning_rate": 4.529837984187773e-05, + "loss": 0.4375, "step": 30875 }, { - "epoch": 1.09, - "learning_rate": 4.5543089752892375e-05, - "loss": 0.3155, + "epoch": 1.1129131077233574, + "grad_norm": 0.15392176806926727, + "learning_rate": 4.529667625013361e-05, + "loss": 0.4253, "step": 30880 }, { - "epoch": 1.09, - "learning_rate": 4.554146617241161e-05, - "loss": 0.2905, + "epoch": 1.1130933073845821, + "grad_norm": 0.1741206794977188, + "learning_rate": 4.52949723818512e-05, + "loss": 0.3929, "step": 30885 }, { - "epoch": 1.09, - "learning_rate": 4.553984232521342e-05, - "loss": 0.3115, + "epoch": 1.1132735070458069, + "grad_norm": 0.1908791959285736, + "learning_rate": 4.529326823705372e-05, + "loss": 0.4403, "step": 30890 }, { - "epoch": 1.09, - "learning_rate": 4.553821821131893e-05, - "loss": 0.3043, + "epoch": 1.1134537067070314, + "grad_norm": 0.20315563678741455, + "learning_rate": 4.5291563815764384e-05, + "loss": 0.4401, "step": 30895 }, { - "epoch": 1.09, - "learning_rate": 4.553659383074921e-05, - "loss": 0.2819, + "epoch": 1.113633906368256, + "grad_norm": 0.16982267796993256, + "learning_rate": 4.528985911800643e-05, + "loss": 0.4372, "step": 30900 }, { - "epoch": 1.09, - "learning_rate": 4.5534969183525354e-05, - "loss": 0.2957, + "epoch": 1.1138141060294806, + "grad_norm": 0.1527600884437561, + "learning_rate": 4.5288154143803066e-05, + "loss": 0.3923, "step": 30905 }, { - "epoch": 1.09, - "learning_rate": 4.553334426966846e-05, - "loss": 0.2755, + "epoch": 1.1139943056907053, + "grad_norm": 0.17226542532444, + "learning_rate": 4.528644889317753e-05, + "loss": 0.4158, "step": 30910 }, { - "epoch": 1.09, - "learning_rate": 4.553171908919961e-05, - "loss": 0.2928, + 
"epoch": 1.1141745053519299, + "grad_norm": 0.178422212600708, + "learning_rate": 4.528474336615306e-05, + "loss": 0.4268, "step": 30915 }, { - "epoch": 1.09, - "learning_rate": 4.5530093642139927e-05, - "loss": 0.2944, + "epoch": 1.1143547050131546, + "grad_norm": 0.1679055094718933, + "learning_rate": 4.528303756275288e-05, + "loss": 0.4216, "step": 30920 }, { - "epoch": 1.09, - "learning_rate": 4.55284679285105e-05, - "loss": 0.2891, + "epoch": 1.1145349046743793, + "grad_norm": 0.21220412850379944, + "learning_rate": 4.528133148300026e-05, + "loss": 0.4276, "step": 30925 }, { - "epoch": 1.09, - "learning_rate": 4.5526841948332455e-05, - "loss": 0.2939, + "epoch": 1.1147151043356038, + "grad_norm": 0.16649261116981506, + "learning_rate": 4.52796251269184e-05, + "loss": 0.4486, "step": 30930 }, { - "epoch": 1.09, - "learning_rate": 4.55252157016269e-05, - "loss": 0.2685, + "epoch": 1.1148953039968286, + "grad_norm": 0.19539640843868256, + "learning_rate": 4.527791849453059e-05, + "loss": 0.4124, "step": 30935 }, { - "epoch": 1.09, - "learning_rate": 4.5523589188414926e-05, - "loss": 0.2761, + "epoch": 1.115075503658053, + "grad_norm": 0.17714886367321014, + "learning_rate": 4.5276211585860064e-05, + "loss": 0.4119, "step": 30940 }, { - "epoch": 1.09, - "learning_rate": 4.552196240871769e-05, - "loss": 0.2695, + "epoch": 1.1152557033192778, + "grad_norm": 0.17066708207130432, + "learning_rate": 4.527450440093008e-05, + "loss": 0.4194, "step": 30945 }, { - "epoch": 1.09, - "learning_rate": 4.5520335362556286e-05, - "loss": 0.2848, + "epoch": 1.1154359029805023, + "grad_norm": 0.1813109666109085, + "learning_rate": 4.527279693976389e-05, + "loss": 0.4032, "step": 30950 }, { - "epoch": 1.09, - "learning_rate": 4.551870804995186e-05, - "loss": 0.2786, + "epoch": 1.115616102641727, + "grad_norm": 0.1931227445602417, + "learning_rate": 4.527108920238478e-05, + "loss": 0.4249, "step": 30955 }, { - "epoch": 1.09, - "learning_rate": 4.551708047092552e-05, - "loss": 0.2955, + "epoch": 1.1157963023029516, + "grad_norm": 0.16914038360118866, + "learning_rate": 4.5269381188815996e-05, + "loss": 0.381, "step": 30960 }, { - "epoch": 1.09, - "learning_rate": 4.551545262549842e-05, - "loss": 0.2877, + "epoch": 1.1159765019641763, + "grad_norm": 0.1841052621603012, + "learning_rate": 4.526767289908083e-05, + "loss": 0.4565, "step": 30965 }, { - "epoch": 1.09, - "learning_rate": 4.5513824513691686e-05, - "loss": 0.2801, + "epoch": 1.116156701625401, + "grad_norm": 0.1776377409696579, + "learning_rate": 4.5265964333202526e-05, + "loss": 0.4371, "step": 30970 }, { - "epoch": 1.09, - "learning_rate": 4.551219613552646e-05, - "loss": 0.2774, + "epoch": 1.1163369012866255, + "grad_norm": 0.17121796309947968, + "learning_rate": 4.526425549120439e-05, + "loss": 0.4383, "step": 30975 }, { - "epoch": 1.09, - "learning_rate": 4.551056749102389e-05, - "loss": 0.2859, + "epoch": 1.1165171009478503, + "grad_norm": 0.16264458000659943, + "learning_rate": 4.526254637310971e-05, + "loss": 0.4006, "step": 30980 }, { - "epoch": 1.09, - "learning_rate": 4.550893858020511e-05, - "loss": 0.2759, + "epoch": 1.1166973006090748, + "grad_norm": 0.19844268262386322, + "learning_rate": 4.526083697894173e-05, + "loss": 0.4223, "step": 30985 }, { - "epoch": 1.09, - "learning_rate": 4.550730940309128e-05, - "loss": 0.3017, + "epoch": 1.1168775002702995, + "grad_norm": 0.19541612267494202, + "learning_rate": 4.525912730872379e-05, + "loss": 0.42, "step": 30990 }, { - "epoch": 1.09, - "learning_rate": 4.5505679959703565e-05, - "loss": 
0.3095, + "epoch": 1.117057699931524, + "grad_norm": 0.19237811863422394, + "learning_rate": 4.525741736247916e-05, + "loss": 0.3994, "step": 30995 }, { - "epoch": 1.09, - "learning_rate": 4.550405025006309e-05, - "loss": 0.2957, + "epoch": 1.1172378995927488, + "grad_norm": 0.15686160326004028, + "learning_rate": 4.5255707140231136e-05, + "loss": 0.4275, "step": 31000 }, { - "epoch": 1.09, - "eval_loss": 0.2897602617740631, - "eval_runtime": 10.5439, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 1.1172378995927488, + "eval_loss": 0.4531842768192291, + "eval_runtime": 3.5417, + "eval_samples_per_second": 28.235, + "eval_steps_per_second": 7.059, "step": 31000 }, { - "epoch": 1.09, - "learning_rate": 4.550242027419105e-05, - "loss": 0.285, + "epoch": 1.1174180992539733, + "grad_norm": 0.16874189674854279, + "learning_rate": 4.5253996642003025e-05, + "loss": 0.4781, "step": 31005 }, { - "epoch": 1.09, - "learning_rate": 4.550079003210859e-05, - "loss": 0.2983, + "epoch": 1.117598298915198, + "grad_norm": 0.17031274735927582, + "learning_rate": 4.525228586781813e-05, + "loss": 0.4237, "step": 31010 }, { - "epoch": 1.09, - "learning_rate": 4.5499159523836875e-05, - "loss": 0.2819, + "epoch": 1.1177784985764228, + "grad_norm": 0.19716662168502808, + "learning_rate": 4.525057481769976e-05, + "loss": 0.415, "step": 31015 }, { - "epoch": 1.09, - "learning_rate": 4.549752874939709e-05, - "loss": 0.3313, + "epoch": 1.1179586982376473, + "grad_norm": 0.15895715355873108, + "learning_rate": 4.524886349167124e-05, + "loss": 0.3885, "step": 31020 }, { - "epoch": 1.09, - "learning_rate": 4.5495897708810395e-05, - "loss": 0.266, + "epoch": 1.118138897898872, + "grad_norm": 0.1836315244436264, + "learning_rate": 4.5247151889755855e-05, + "loss": 0.4201, "step": 31025 }, { - "epoch": 1.09, - "learning_rate": 4.549426640209799e-05, - "loss": 0.3073, + "epoch": 1.1183190975600965, + "grad_norm": 0.17792055010795593, + "learning_rate": 4.5245440011976966e-05, + "loss": 0.4348, "step": 31030 }, { - "epoch": 1.09, - "learning_rate": 4.5492634829281026e-05, - "loss": 0.3227, + "epoch": 1.1184992972213212, + "grad_norm": 0.17918603122234344, + "learning_rate": 4.524372785835787e-05, + "loss": 0.4039, "step": 31035 }, { - "epoch": 1.09, - "learning_rate": 4.549100299038072e-05, - "loss": 0.2894, + "epoch": 1.118679496882546, + "grad_norm": 0.1892392635345459, + "learning_rate": 4.524201542892189e-05, + "loss": 0.4318, "step": 31040 }, { - "epoch": 1.09, - "learning_rate": 4.548937088541823e-05, - "loss": 0.2956, + "epoch": 1.1188596965437705, + "grad_norm": 0.17238296568393707, + "learning_rate": 4.524030272369238e-05, + "loss": 0.4143, "step": 31045 }, { - "epoch": 1.09, - "learning_rate": 4.548773851441477e-05, - "loss": 0.2959, + "epoch": 1.1190398962049952, + "grad_norm": 0.1613350510597229, + "learning_rate": 4.523858974269266e-05, + "loss": 0.4531, "step": 31050 }, { - "epoch": 1.09, - "learning_rate": 4.548610587739152e-05, - "loss": 0.2917, + "epoch": 1.1192200958662197, + "grad_norm": 0.20245467126369476, + "learning_rate": 4.5236876485946076e-05, + "loss": 0.4655, "step": 31055 }, { - "epoch": 1.09, - "learning_rate": 4.54844729743697e-05, - "loss": 0.3038, + "epoch": 1.1194002955274445, + "grad_norm": 0.16436737775802612, + "learning_rate": 4.5235162953475966e-05, + "loss": 0.4167, "step": 31060 }, { - "epoch": 1.09, - "learning_rate": 4.548283980537048e-05, - "loss": 0.2948, + "epoch": 1.119580495188669, + "grad_norm": 0.15972478687763214, + "learning_rate": 
4.523344914530568e-05, + "loss": 0.4051, "step": 31065 }, { - "epoch": 1.09, - "learning_rate": 4.5481206370415094e-05, - "loss": 0.2723, + "epoch": 1.1197606948498937, + "grad_norm": 0.15999290347099304, + "learning_rate": 4.523173506145856e-05, + "loss": 0.3884, "step": 31070 }, { - "epoch": 1.09, - "learning_rate": 4.547989943097659e-05, - "loss": 0.2976, + "epoch": 1.1199408945111182, + "grad_norm": 0.15831471979618073, + "learning_rate": 4.5230020701957976e-05, + "loss": 0.3952, "step": 31075 }, { - "epoch": 1.09, - "learning_rate": 4.5478265517353534e-05, - "loss": 0.3196, + "epoch": 1.120121094172343, + "grad_norm": 0.17444223165512085, + "learning_rate": 4.522830606682727e-05, + "loss": 0.4452, "step": 31080 }, { - "epoch": 1.09, - "learning_rate": 4.54766313378337e-05, - "loss": 0.2942, + "epoch": 1.1203012938335677, + "grad_norm": 0.16463379561901093, + "learning_rate": 4.5226591156089816e-05, + "loss": 0.4349, "step": 31085 }, { - "epoch": 1.09, - "learning_rate": 4.5474996892438294e-05, - "loss": 0.2788, + "epoch": 1.1204814934947922, + "grad_norm": 0.16084179282188416, + "learning_rate": 4.522487596976897e-05, + "loss": 0.4415, "step": 31090 }, { - "epoch": 1.09, - "learning_rate": 4.5473362181188554e-05, - "loss": 0.269, + "epoch": 1.120661693156017, + "grad_norm": 0.16912023723125458, + "learning_rate": 4.5223160507888106e-05, + "loss": 0.4269, "step": 31095 }, { - "epoch": 1.09, - "learning_rate": 4.5471727204105695e-05, - "loss": 0.2889, + "epoch": 1.1208418928172414, + "grad_norm": 0.1586826741695404, + "learning_rate": 4.5221444770470595e-05, + "loss": 0.4002, "step": 31100 }, { - "epoch": 1.09, - "learning_rate": 4.5470091961210955e-05, - "loss": 0.268, + "epoch": 1.1210220924784662, + "grad_norm": 0.1658790409564972, + "learning_rate": 4.521972875753981e-05, + "loss": 0.41, "step": 31105 }, { - "epoch": 1.09, - "learning_rate": 4.5468456452525556e-05, - "loss": 0.2981, + "epoch": 1.1212022921396907, + "grad_norm": 0.18843096494674683, + "learning_rate": 4.521801246911914e-05, + "loss": 0.4006, "step": 31110 }, { - "epoch": 1.09, - "learning_rate": 4.546682067807074e-05, - "loss": 0.2968, + "epoch": 1.1213824918009154, + "grad_norm": 0.1666111797094345, + "learning_rate": 4.521629590523197e-05, + "loss": 0.3906, "step": 31115 }, { - "epoch": 1.09, - "learning_rate": 4.546518463786775e-05, - "loss": 0.2593, + "epoch": 1.12156269146214, + "grad_norm": 0.15296362340450287, + "learning_rate": 4.521457906590167e-05, + "loss": 0.4152, "step": 31120 }, { - "epoch": 1.1, - "learning_rate": 4.546354833193782e-05, - "loss": 0.3151, + "epoch": 1.1217428911233647, + "grad_norm": 0.20949307084083557, + "learning_rate": 4.521286195115165e-05, + "loss": 0.4181, "step": 31125 }, { - "epoch": 1.1, - "learning_rate": 4.54619117603022e-05, - "loss": 0.2953, + "epoch": 1.1219230907845894, + "grad_norm": 0.1738746613264084, + "learning_rate": 4.5211144561005295e-05, + "loss": 0.4134, "step": 31130 }, { - "epoch": 1.1, - "learning_rate": 4.546027492298215e-05, - "loss": 0.2878, + "epoch": 1.122103290445814, + "grad_norm": 0.199616476893425, + "learning_rate": 4.520942689548601e-05, + "loss": 0.4121, "step": 31135 }, { - "epoch": 1.1, - "learning_rate": 4.54586378199989e-05, - "loss": 0.2767, + "epoch": 1.1222834901070387, + "grad_norm": 0.13446038961410522, + "learning_rate": 4.52077089546172e-05, + "loss": 0.4055, "step": 31140 }, { - "epoch": 1.1, - "learning_rate": 4.545700045137373e-05, - "loss": 0.2915, + "epoch": 1.1224636897682632, + "grad_norm": 0.15235783159732819, + 
"learning_rate": 4.520599073842226e-05, + "loss": 0.416, "step": 31145 }, { - "epoch": 1.1, - "learning_rate": 4.545536281712789e-05, - "loss": 0.2771, + "epoch": 1.122643889429488, + "grad_norm": 0.1988137811422348, + "learning_rate": 4.520427224692462e-05, + "loss": 0.4166, "step": 31150 }, { - "epoch": 1.1, - "learning_rate": 4.5453724917282635e-05, - "loss": 0.3109, + "epoch": 1.1228240890907124, + "grad_norm": 0.19993172585964203, + "learning_rate": 4.520255348014768e-05, + "loss": 0.4663, "step": 31155 }, { - "epoch": 1.1, - "learning_rate": 4.5452086751859254e-05, - "loss": 0.2773, + "epoch": 1.1230042887519371, + "grad_norm": 0.2015010416507721, + "learning_rate": 4.520083443811485e-05, + "loss": 0.3944, "step": 31160 }, { - "epoch": 1.1, - "learning_rate": 4.5450448320879004e-05, - "loss": 0.283, + "epoch": 1.1231844884131619, + "grad_norm": 0.17933201789855957, + "learning_rate": 4.519911512084957e-05, + "loss": 0.4343, "step": 31165 }, { - "epoch": 1.1, - "learning_rate": 4.544880962436315e-05, - "loss": 0.2982, + "epoch": 1.1233646880743864, + "grad_norm": 0.15731388330459595, + "learning_rate": 4.519739552837526e-05, + "loss": 0.4049, "step": 31170 }, { - "epoch": 1.1, - "learning_rate": 4.544717066233298e-05, - "loss": 0.2781, + "epoch": 1.1235448877356111, + "grad_norm": 0.17282412946224213, + "learning_rate": 4.519567566071534e-05, + "loss": 0.4227, "step": 31175 }, { - "epoch": 1.1, - "learning_rate": 4.544553143480978e-05, - "loss": 0.2913, + "epoch": 1.1237250873968356, + "grad_norm": 0.165944442152977, + "learning_rate": 4.519395551789325e-05, + "loss": 0.4523, "step": 31180 }, { - "epoch": 1.1, - "learning_rate": 4.544389194181483e-05, - "loss": 0.28, + "epoch": 1.1239052870580604, + "grad_norm": 0.1765991449356079, + "learning_rate": 4.5192235099932425e-05, + "loss": 0.4304, "step": 31185 }, { - "epoch": 1.1, - "learning_rate": 4.544225218336942e-05, - "loss": 0.2506, + "epoch": 1.1240854867192849, + "grad_norm": 0.17275585234165192, + "learning_rate": 4.519051440685631e-05, + "loss": 0.4245, "step": 31190 }, { - "epoch": 1.1, - "learning_rate": 4.544061215949483e-05, - "loss": 0.2924, + "epoch": 1.1242656863805096, + "grad_norm": 0.18365871906280518, + "learning_rate": 4.518879343868834e-05, + "loss": 0.4363, "step": 31195 }, { - "epoch": 1.1, - "learning_rate": 4.543897187021236e-05, - "loss": 0.2654, + "epoch": 1.1244458860417343, + "grad_norm": 0.17454713582992554, + "learning_rate": 4.5187072195451975e-05, + "loss": 0.4329, "step": 31200 }, { - "epoch": 1.1, - "learning_rate": 4.543733131554332e-05, - "loss": 0.2989, + "epoch": 1.1246260857029589, + "grad_norm": 0.1820690780878067, + "learning_rate": 4.518535067717066e-05, + "loss": 0.4217, "step": 31205 }, { - "epoch": 1.1, - "learning_rate": 4.5435690495508984e-05, - "loss": 0.2829, + "epoch": 1.1248062853641836, + "grad_norm": 0.16349612176418304, + "learning_rate": 4.518362888386784e-05, + "loss": 0.4456, "step": 31210 }, { - "epoch": 1.1, - "learning_rate": 4.543404941013069e-05, - "loss": 0.2898, + "epoch": 1.124986485025408, + "grad_norm": 0.17479030787944794, + "learning_rate": 4.5181906815566986e-05, + "loss": 0.4347, "step": 31215 }, { - "epoch": 1.1, - "learning_rate": 4.543240805942973e-05, - "loss": 0.2847, + "epoch": 1.1251666846866328, + "grad_norm": 0.14878250658512115, + "learning_rate": 4.5180184472291566e-05, + "loss": 0.4315, "step": 31220 }, { - "epoch": 1.1, - "learning_rate": 4.5430766443427405e-05, - "loss": 0.3066, + "epoch": 1.1253468843478573, + "grad_norm": 0.18707720935344696, + 
"learning_rate": 4.5178461854065036e-05, + "loss": 0.3981, "step": 31225 }, { - "epoch": 1.1, - "learning_rate": 4.542912456214505e-05, - "loss": 0.3007, + "epoch": 1.125527084009082, + "grad_norm": 0.19858436286449432, + "learning_rate": 4.517673896091087e-05, + "loss": 0.405, "step": 31230 }, { - "epoch": 1.1, - "learning_rate": 4.542748241560398e-05, - "loss": 0.2982, + "epoch": 1.1257072836703066, + "grad_norm": 0.19132393598556519, + "learning_rate": 4.517501579285255e-05, + "loss": 0.4229, "step": 31235 }, { - "epoch": 1.1, - "learning_rate": 4.542584000382552e-05, - "loss": 0.3101, + "epoch": 1.1258874833315313, + "grad_norm": 0.1902177333831787, + "learning_rate": 4.5173292349913534e-05, + "loss": 0.3822, "step": 31240 }, { - "epoch": 1.1, - "learning_rate": 4.542419732683097e-05, - "loss": 0.2978, + "epoch": 1.126067682992756, + "grad_norm": 0.1548781394958496, + "learning_rate": 4.517156863211732e-05, + "loss": 0.4134, "step": 31245 }, { - "epoch": 1.1, - "learning_rate": 4.542255438464169e-05, - "loss": 0.2544, + "epoch": 1.1262478826539806, + "grad_norm": 0.20979823172092438, + "learning_rate": 4.517018945999895e-05, + "loss": 0.4261, "step": 31250 }, { - "epoch": 1.1, - "learning_rate": 4.542091117727899e-05, - "loss": 0.3042, + "epoch": 1.1264280823152053, + "grad_norm": 0.17488163709640503, + "learning_rate": 4.5168465247518955e-05, + "loss": 0.4078, "step": 31255 }, { - "epoch": 1.1, - "learning_rate": 4.5419267704764227e-05, - "loss": 0.3146, + "epoch": 1.1266082819764298, + "grad_norm": 0.16054439544677734, + "learning_rate": 4.516674076024752e-05, + "loss": 0.4304, "step": 31260 }, { - "epoch": 1.1, - "learning_rate": 4.541762396711872e-05, - "loss": 0.3158, + "epoch": 1.1267884816376545, + "grad_norm": 0.16727547347545624, + "learning_rate": 4.516501599820816e-05, + "loss": 0.4086, "step": 31265 }, { - "epoch": 1.1, - "learning_rate": 4.5415979964363825e-05, - "loss": 0.2852, + "epoch": 1.1269686812988793, + "grad_norm": 0.17022760212421417, + "learning_rate": 4.5163290961424355e-05, + "loss": 0.4378, "step": 31270 }, { - "epoch": 1.1, - "learning_rate": 4.541433569652089e-05, - "loss": 0.2918, + "epoch": 1.1271488809601038, + "grad_norm": 0.1838621199131012, + "learning_rate": 4.5161565649919614e-05, + "loss": 0.4176, "step": 31275 }, { - "epoch": 1.1, - "learning_rate": 4.541269116361125e-05, - "loss": 0.3133, + "epoch": 1.1273290806213285, + "grad_norm": 0.16698120534420013, + "learning_rate": 4.515984006371744e-05, + "loss": 0.4196, "step": 31280 }, { - "epoch": 1.1, - "learning_rate": 4.541104636565627e-05, - "loss": 0.2998, + "epoch": 1.127509280282553, + "grad_norm": 0.19196081161499023, + "learning_rate": 4.5158114202841354e-05, + "loss": 0.3924, "step": 31285 }, { - "epoch": 1.1, - "learning_rate": 4.540940130267731e-05, - "loss": 0.2574, + "epoch": 1.1276894799437778, + "grad_norm": 0.17519588768482208, + "learning_rate": 4.515638806731486e-05, + "loss": 0.4042, "step": 31290 }, { - "epoch": 1.1, - "learning_rate": 4.5407755974695726e-05, - "loss": 0.305, + "epoch": 1.1278696796050023, + "grad_norm": 0.1803540289402008, + "learning_rate": 4.515466165716149e-05, + "loss": 0.3852, "step": 31295 }, { - "epoch": 1.1, - "learning_rate": 4.540611038173288e-05, - "loss": 0.2872, + "epoch": 1.128049879266227, + "grad_norm": 0.19045265018939972, + "learning_rate": 4.5152934972404746e-05, + "loss": 0.4543, "step": 31300 }, { - "epoch": 1.1, - "learning_rate": 4.540446452381013e-05, - "loss": 0.2743, + "epoch": 1.1282300789274515, + "grad_norm": 0.2028571367263794, + 
"learning_rate": 4.515120801306818e-05, + "loss": 0.4327, "step": 31305 }, { - "epoch": 1.1, - "learning_rate": 4.540281840094886e-05, - "loss": 0.2836, + "epoch": 1.1284102785886763, + "grad_norm": 0.18844293057918549, + "learning_rate": 4.514948077917529e-05, + "loss": 0.4408, "step": 31310 }, { - "epoch": 1.1, - "learning_rate": 4.540117201317044e-05, - "loss": 0.2902, + "epoch": 1.128590478249901, + "grad_norm": 0.17702354490756989, + "learning_rate": 4.514775327074963e-05, + "loss": 0.4077, "step": 31315 }, { - "epoch": 1.1, - "learning_rate": 4.539952536049624e-05, - "loss": 0.2857, + "epoch": 1.1287706779111255, + "grad_norm": 0.1727331429719925, + "learning_rate": 4.514602548781474e-05, + "loss": 0.4235, "step": 31320 }, { - "epoch": 1.1, - "learning_rate": 4.539787844294766e-05, - "loss": 0.2743, + "epoch": 1.1289508775723502, + "grad_norm": 0.18904700875282288, + "learning_rate": 4.514429743039414e-05, + "loss": 0.4351, "step": 31325 }, { - "epoch": 1.1, - "learning_rate": 4.539623126054606e-05, - "loss": 0.2676, + "epoch": 1.1291310772335748, + "grad_norm": 0.15556100010871887, + "learning_rate": 4.51425690985114e-05, + "loss": 0.4397, "step": 31330 }, { - "epoch": 1.1, - "learning_rate": 4.539458381331284e-05, - "loss": 0.2786, + "epoch": 1.1293112768947995, + "grad_norm": 0.16988936066627502, + "learning_rate": 4.514084049219005e-05, + "loss": 0.3974, "step": 31335 }, { - "epoch": 1.1, - "learning_rate": 4.53929361012694e-05, - "loss": 0.2938, + "epoch": 1.129491476556024, + "grad_norm": 0.16999125480651855, + "learning_rate": 4.513911161145365e-05, + "loss": 0.3972, "step": 31340 }, { - "epoch": 1.1, - "learning_rate": 4.5391288124437116e-05, - "loss": 0.3097, + "epoch": 1.1296716762172487, + "grad_norm": 0.20566906034946442, + "learning_rate": 4.513738245632575e-05, + "loss": 0.4368, "step": 31345 }, { - "epoch": 1.1, - "learning_rate": 4.5389639882837386e-05, - "loss": 0.2933, + "epoch": 1.1298518758784732, + "grad_norm": 0.2229662835597992, + "learning_rate": 4.513565302682992e-05, + "loss": 0.4017, "step": 31350 }, { - "epoch": 1.1, - "learning_rate": 4.538799137649163e-05, - "loss": 0.2772, + "epoch": 1.130032075539698, + "grad_norm": 0.1942766159772873, + "learning_rate": 4.513392332298971e-05, + "loss": 0.4144, "step": 31355 }, { - "epoch": 1.1, - "learning_rate": 4.538634260542124e-05, - "loss": 0.2808, + "epoch": 1.1302122752009227, + "grad_norm": 0.1725579798221588, + "learning_rate": 4.513219334482869e-05, + "loss": 0.386, "step": 31360 }, { - "epoch": 1.1, - "learning_rate": 4.538469356964763e-05, - "loss": 0.3264, + "epoch": 1.1303924748621472, + "grad_norm": 0.1760469377040863, + "learning_rate": 4.513046309237044e-05, + "loss": 0.4181, "step": 31365 }, { - "epoch": 1.1, - "learning_rate": 4.5383044269192196e-05, - "loss": 0.2675, + "epoch": 1.130572674523372, + "grad_norm": 0.1780589520931244, + "learning_rate": 4.512873256563852e-05, + "loss": 0.4206, "step": 31370 }, { - "epoch": 1.1, - "learning_rate": 4.5381394704076374e-05, - "loss": 0.3035, + "epoch": 1.1307528741845965, + "grad_norm": 0.16992972791194916, + "learning_rate": 4.5127001764656526e-05, + "loss": 0.4126, "step": 31375 }, { - "epoch": 1.1, - "learning_rate": 4.537974487432157e-05, - "loss": 0.3087, + "epoch": 1.1309330738458212, + "grad_norm": 0.1983700394630432, + "learning_rate": 4.5125270689448015e-05, + "loss": 0.4072, "step": 31380 }, { - "epoch": 1.1, - "learning_rate": 4.537809477994922e-05, - "loss": 0.3148, + "epoch": 1.1311132735070457, + "grad_norm": 0.17569278180599213, + 
"learning_rate": 4.51235393400366e-05, + "loss": 0.4082, "step": 31385 }, { - "epoch": 1.1, - "learning_rate": 4.537644442098073e-05, - "loss": 0.3031, + "epoch": 1.1312934731682704, + "grad_norm": 0.22225306928157806, + "learning_rate": 4.512180771644585e-05, + "loss": 0.4641, "step": 31390 }, { - "epoch": 1.1, - "learning_rate": 4.537479379743754e-05, - "loss": 0.2633, + "epoch": 1.131473672829495, + "grad_norm": 0.1481614112854004, + "learning_rate": 4.512007581869937e-05, + "loss": 0.3931, "step": 31395 }, { - "epoch": 1.1, - "learning_rate": 4.537314290934108e-05, - "loss": 0.2997, + "epoch": 1.1316538724907197, + "grad_norm": 0.21973121166229248, + "learning_rate": 4.511834364682076e-05, + "loss": 0.4273, "step": 31400 }, { - "epoch": 1.1, - "learning_rate": 4.537149175671278e-05, - "loss": 0.3034, + "epoch": 1.1318340721519444, + "grad_norm": 0.21641644835472107, + "learning_rate": 4.51166112008336e-05, + "loss": 0.4151, "step": 31405 }, { - "epoch": 1.11, - "learning_rate": 4.5369840339574096e-05, - "loss": 0.2862, + "epoch": 1.132014271813169, + "grad_norm": 0.18511059880256653, + "learning_rate": 4.511487848076151e-05, + "loss": 0.4423, "step": 31410 }, { - "epoch": 1.11, - "learning_rate": 4.536818865794645e-05, - "loss": 0.2872, + "epoch": 1.1321944714743937, + "grad_norm": 0.2063097059726715, + "learning_rate": 4.5113145486628095e-05, + "loss": 0.4257, "step": 31415 }, { - "epoch": 1.11, - "learning_rate": 4.53665367118513e-05, - "loss": 0.2752, + "epoch": 1.1323746711356182, + "grad_norm": 0.17430323362350464, + "learning_rate": 4.511141221845697e-05, + "loss": 0.4076, "step": 31420 }, { - "epoch": 1.11, - "learning_rate": 4.5364884501310086e-05, - "loss": 0.3041, + "epoch": 1.132554870796843, + "grad_norm": 0.1422608643770218, + "learning_rate": 4.510967867627175e-05, + "loss": 0.3761, "step": 31425 }, { - "epoch": 1.11, - "learning_rate": 4.5363232026344284e-05, - "loss": 0.2794, + "epoch": 1.1327350704580676, + "grad_norm": 0.2072693109512329, + "learning_rate": 4.510794486009604e-05, + "loss": 0.4133, "step": 31430 }, { - "epoch": 1.11, - "learning_rate": 4.536157928697532e-05, - "loss": 0.295, + "epoch": 1.1329152701192922, + "grad_norm": 0.1433665156364441, + "learning_rate": 4.5106210769953484e-05, + "loss": 0.3992, "step": 31435 }, { - "epoch": 1.11, - "learning_rate": 4.5359926283224674e-05, - "loss": 0.2901, + "epoch": 1.133095469780517, + "grad_norm": 0.20627710223197937, + "learning_rate": 4.5104476405867705e-05, + "loss": 0.4233, "step": 31440 }, { - "epoch": 1.11, - "learning_rate": 4.535827301511381e-05, - "loss": 0.2754, + "epoch": 1.1332756694417414, + "grad_norm": 0.1617184430360794, + "learning_rate": 4.510274176786231e-05, + "loss": 0.4564, "step": 31445 }, { - "epoch": 1.11, - "learning_rate": 4.535661948266418e-05, - "loss": 0.2856, + "epoch": 1.1334558691029661, + "grad_norm": 0.1667255014181137, + "learning_rate": 4.5101006855960956e-05, + "loss": 0.4173, "step": 31450 }, { - "epoch": 1.11, - "learning_rate": 4.5354965685897255e-05, - "loss": 0.2911, + "epoch": 1.1336360687641907, + "grad_norm": 0.17955441772937775, + "learning_rate": 4.5099271670187285e-05, + "loss": 0.4203, "step": 31455 }, { - "epoch": 1.11, - "learning_rate": 4.5353311624834516e-05, - "loss": 0.2583, + "epoch": 1.1338162684254154, + "grad_norm": 0.1967443972826004, + "learning_rate": 4.5097536210564915e-05, + "loss": 0.4261, "step": 31460 }, { - "epoch": 1.11, - "learning_rate": 4.5351657299497444e-05, - "loss": 0.2752, + "epoch": 1.13399646808664, + "grad_norm": 
0.1767064779996872, + "learning_rate": 4.5095800477117514e-05, + "loss": 0.423, "step": 31465 }, { - "epoch": 1.11, - "learning_rate": 4.535000270990751e-05, - "loss": 0.2796, + "epoch": 1.1341766677478646, + "grad_norm": 0.15333469212055206, + "learning_rate": 4.5094064469868726e-05, + "loss": 0.4139, "step": 31470 }, { - "epoch": 1.11, - "learning_rate": 4.5348347856086214e-05, - "loss": 0.2898, + "epoch": 1.1343568674090894, + "grad_norm": 0.1856624186038971, + "learning_rate": 4.509232818884219e-05, + "loss": 0.4326, "step": 31475 }, { - "epoch": 1.11, - "learning_rate": 4.534669273805502e-05, - "loss": 0.2915, + "epoch": 1.1345370670703139, + "grad_norm": 0.20386086404323578, + "learning_rate": 4.5090591634061577e-05, + "loss": 0.4424, "step": 31480 }, { - "epoch": 1.11, - "learning_rate": 4.534503735583543e-05, - "loss": 0.2943, + "epoch": 1.1347172667315386, + "grad_norm": 0.2295321226119995, + "learning_rate": 4.508885480555055e-05, + "loss": 0.4369, "step": 31485 }, { - "epoch": 1.11, - "learning_rate": 4.5343381709448943e-05, - "loss": 0.2976, + "epoch": 1.1348974663927631, + "grad_norm": 0.2000851184129715, + "learning_rate": 4.5087117703332755e-05, + "loss": 0.3865, "step": 31490 }, { - "epoch": 1.11, - "learning_rate": 4.534172579891705e-05, - "loss": 0.3062, + "epoch": 1.1350776660539879, + "grad_norm": 0.19415894150733948, + "learning_rate": 4.508538032743187e-05, + "loss": 0.3817, "step": 31495 }, { - "epoch": 1.11, - "learning_rate": 4.534006962426125e-05, - "loss": 0.2871, + "epoch": 1.1352578657152124, + "grad_norm": 0.23722659051418304, + "learning_rate": 4.5083642677871575e-05, + "loss": 0.4131, "step": 31500 }, { - "epoch": 1.11, - "eval_loss": 0.28987738490104675, - "eval_runtime": 10.5447, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 1.1352578657152124, + "eval_loss": 0.45174121856689453, + "eval_runtime": 3.5299, + "eval_samples_per_second": 28.33, + "eval_steps_per_second": 7.082, "step": 31500 }, { - "epoch": 1.11, - "learning_rate": 4.5338413185503056e-05, - "loss": 0.2818, + "epoch": 1.135438065376437, + "grad_norm": 0.1844346821308136, + "learning_rate": 4.508190475467553e-05, + "loss": 0.4107, "step": 31505 }, { - "epoch": 1.11, - "learning_rate": 4.533675648266397e-05, - "loss": 0.2519, + "epoch": 1.1356182650376616, + "grad_norm": 0.16183345019817352, + "learning_rate": 4.508016655786742e-05, + "loss": 0.4014, "step": 31510 }, { - "epoch": 1.11, - "learning_rate": 4.533509951576549e-05, - "loss": 0.2737, + "epoch": 1.1357984646988863, + "grad_norm": 0.20634831488132477, + "learning_rate": 4.507842808747093e-05, + "loss": 0.4184, "step": 31515 }, { - "epoch": 1.11, - "learning_rate": 4.5333442284829164e-05, - "loss": 0.3031, + "epoch": 1.135978664360111, + "grad_norm": 0.15508055686950684, + "learning_rate": 4.507668934350975e-05, + "loss": 0.4209, "step": 31520 }, { - "epoch": 1.11, - "learning_rate": 4.533178478987647e-05, - "loss": 0.3129, + "epoch": 1.1361588640213356, + "grad_norm": 0.1634722650051117, + "learning_rate": 4.507495032600756e-05, + "loss": 0.3867, "step": 31525 }, { - "epoch": 1.11, - "learning_rate": 4.533012703092897e-05, - "loss": 0.2801, + "epoch": 1.1363390636825603, + "grad_norm": 0.1492082178592682, + "learning_rate": 4.5073211034988055e-05, + "loss": 0.3896, "step": 31530 }, { - "epoch": 1.11, - "learning_rate": 4.5328469008008154e-05, - "loss": 0.2885, + "epoch": 1.1365192633437848, + "grad_norm": 0.17373375594615936, + "learning_rate": 4.507147147047494e-05, + "loss": 0.4057, "step": 31535 }, { 
- "epoch": 1.11, - "learning_rate": 4.532681072113557e-05, - "loss": 0.2672, + "epoch": 1.1366994630050096, + "grad_norm": 0.178725004196167, + "learning_rate": 4.5069731632491914e-05, + "loss": 0.4372, "step": 31540 }, { - "epoch": 1.11, - "learning_rate": 4.532515217033275e-05, - "loss": 0.3211, + "epoch": 1.1368796626662343, + "grad_norm": 0.17013704776763916, + "learning_rate": 4.506799152106268e-05, + "loss": 0.441, "step": 31545 }, { - "epoch": 1.11, - "learning_rate": 4.5323493355621214e-05, - "loss": 0.3024, + "epoch": 1.1370598623274588, + "grad_norm": 0.1601504683494568, + "learning_rate": 4.506625113621094e-05, + "loss": 0.4496, "step": 31550 }, { - "epoch": 1.11, - "learning_rate": 4.5321834277022515e-05, - "loss": 0.3081, + "epoch": 1.1372400619886835, + "grad_norm": 0.16774295270442963, + "learning_rate": 4.506451047796042e-05, + "loss": 0.4396, "step": 31555 }, { - "epoch": 1.11, - "learning_rate": 4.532017493455819e-05, - "loss": 0.3033, + "epoch": 1.137420261649908, + "grad_norm": 0.18739162385463715, + "learning_rate": 4.506276954633483e-05, + "loss": 0.4404, "step": 31560 }, { - "epoch": 1.11, - "learning_rate": 4.531851532824979e-05, - "loss": 0.3097, + "epoch": 1.1376004613111328, + "grad_norm": 0.2417900562286377, + "learning_rate": 4.506102834135788e-05, + "loss": 0.4549, "step": 31565 }, { - "epoch": 1.11, - "learning_rate": 4.531685545811885e-05, - "loss": 0.2925, + "epoch": 1.1377806609723573, + "grad_norm": 0.182041198015213, + "learning_rate": 4.5059286863053314e-05, + "loss": 0.4239, "step": 31570 }, { - "epoch": 1.11, - "learning_rate": 4.5315195324186935e-05, - "loss": 0.2856, + "epoch": 1.137960860633582, + "grad_norm": 0.18919362127780914, + "learning_rate": 4.5057545111444846e-05, + "loss": 0.4025, "step": 31575 }, { - "epoch": 1.11, - "learning_rate": 4.53135349264756e-05, - "loss": 0.2676, + "epoch": 1.1381410602948066, + "grad_norm": 0.17599248886108398, + "learning_rate": 4.50558030865562e-05, + "loss": 0.4041, "step": 31580 }, { - "epoch": 1.11, - "learning_rate": 4.531187426500639e-05, - "loss": 0.279, + "epoch": 1.1383212599560313, + "grad_norm": 0.15774129331111908, + "learning_rate": 4.505406078841113e-05, + "loss": 0.4701, "step": 31585 }, { - "epoch": 1.11, - "learning_rate": 4.5310213339800895e-05, - "loss": 0.3094, + "epoch": 1.138501459617256, + "grad_norm": 0.17122262716293335, + "learning_rate": 4.505231821703336e-05, + "loss": 0.4054, "step": 31590 }, { - "epoch": 1.11, - "learning_rate": 4.5308552150880655e-05, - "loss": 0.2832, + "epoch": 1.1386816592784805, + "grad_norm": 0.15675261616706848, + "learning_rate": 4.505057537244664e-05, + "loss": 0.4538, "step": 31595 }, { - "epoch": 1.11, - "learning_rate": 4.5306890698267254e-05, - "loss": 0.2985, + "epoch": 1.1388618589397053, + "grad_norm": 0.1774090677499771, + "learning_rate": 4.504883225467471e-05, + "loss": 0.4258, "step": 31600 }, { - "epoch": 1.11, - "learning_rate": 4.5305228981982254e-05, - "loss": 0.2949, + "epoch": 1.1390420586009298, + "grad_norm": 0.13083837926387787, + "learning_rate": 4.5047088863741314e-05, + "loss": 0.3618, "step": 31605 }, { - "epoch": 1.11, - "learning_rate": 4.5303567002047246e-05, - "loss": 0.2908, + "epoch": 1.1392222582621545, + "grad_norm": 0.1870373785495758, + "learning_rate": 4.5045345199670227e-05, + "loss": 0.4286, "step": 31610 }, { - "epoch": 1.11, - "learning_rate": 4.530190475848379e-05, - "loss": 0.3151, + "epoch": 1.139402457923379, + "grad_norm": 0.1769174039363861, + "learning_rate": 4.504360126248518e-05, + "loss": 0.4424, "step": 
31615 }, { - "epoch": 1.11, - "learning_rate": 4.530024225131349e-05, - "loss": 0.2759, + "epoch": 1.1395826575846038, + "grad_norm": 0.19420921802520752, + "learning_rate": 4.5041857052209954e-05, + "loss": 0.4086, "step": 31620 }, { - "epoch": 1.11, - "learning_rate": 4.529857948055791e-05, - "loss": 0.2868, + "epoch": 1.1397628572458283, + "grad_norm": 0.19101811945438385, + "learning_rate": 4.504011256886831e-05, + "loss": 0.4177, "step": 31625 }, { - "epoch": 1.11, - "learning_rate": 4.529691644623866e-05, - "loss": 0.2885, + "epoch": 1.139943056907053, + "grad_norm": 0.20886962115764618, + "learning_rate": 4.503836781248401e-05, + "loss": 0.4463, "step": 31630 }, { - "epoch": 1.11, - "learning_rate": 4.5295253148377334e-05, - "loss": 0.2702, + "epoch": 1.1401232565682777, + "grad_norm": 0.20521090924739838, + "learning_rate": 4.503662278308082e-05, + "loss": 0.408, "step": 31635 }, { - "epoch": 1.11, - "learning_rate": 4.529358958699551e-05, - "loss": 0.2905, + "epoch": 1.1403034562295022, + "grad_norm": 0.16079290211200714, + "learning_rate": 4.503487748068253e-05, + "loss": 0.437, "step": 31640 }, { - "epoch": 1.11, - "learning_rate": 4.5291925762114805e-05, - "loss": 0.3137, + "epoch": 1.140483655890727, + "grad_norm": 0.18102915585041046, + "learning_rate": 4.5033131905312906e-05, + "loss": 0.4308, "step": 31645 }, { - "epoch": 1.11, - "learning_rate": 4.529026167375681e-05, - "loss": 0.2886, + "epoch": 1.1406638555519515, + "grad_norm": 0.1715448796749115, + "learning_rate": 4.503138605699575e-05, + "loss": 0.3977, "step": 31650 }, { - "epoch": 1.11, - "learning_rate": 4.5288597321943146e-05, - "loss": 0.3019, + "epoch": 1.1408440552131762, + "grad_norm": 0.18790487945079803, + "learning_rate": 4.502963993575483e-05, + "loss": 0.4169, "step": 31655 }, { - "epoch": 1.11, - "learning_rate": 4.528693270669542e-05, - "loss": 0.2911, + "epoch": 1.141024254874401, + "grad_norm": 0.15706005692481995, + "learning_rate": 4.502789354161395e-05, + "loss": 0.416, "step": 31660 }, { - "epoch": 1.11, - "learning_rate": 4.528526782803523e-05, - "loss": 0.2665, + "epoch": 1.1412044545356255, + "grad_norm": 0.1588856726884842, + "learning_rate": 4.5026146874596895e-05, + "loss": 0.4035, "step": 31665 }, { - "epoch": 1.11, - "learning_rate": 4.5283602685984215e-05, - "loss": 0.2793, + "epoch": 1.1413846541968502, + "grad_norm": 0.2316998541355133, + "learning_rate": 4.502439993472746e-05, + "loss": 0.4545, "step": 31670 }, { - "epoch": 1.11, - "learning_rate": 4.528193728056398e-05, - "loss": 0.2908, + "epoch": 1.1415648538580747, + "grad_norm": 0.20132675766944885, + "learning_rate": 4.502265272202946e-05, + "loss": 0.4461, "step": 31675 }, { - "epoch": 1.11, - "learning_rate": 4.5280271611796165e-05, - "loss": 0.2861, + "epoch": 1.1417450535192994, + "grad_norm": 0.1755920946598053, + "learning_rate": 4.502090523652669e-05, + "loss": 0.4417, "step": 31680 }, { - "epoch": 1.11, - "learning_rate": 4.527860567970238e-05, - "loss": 0.301, + "epoch": 1.141925253180524, + "grad_norm": 0.19191905856132507, + "learning_rate": 4.501915747824296e-05, + "loss": 0.4183, "step": 31685 }, { - "epoch": 1.11, - "learning_rate": 4.527693948430427e-05, - "loss": 0.2825, + "epoch": 1.1421054528417487, + "grad_norm": 0.16491484642028809, + "learning_rate": 4.501740944720209e-05, + "loss": 0.4373, "step": 31690 }, { - "epoch": 1.12, - "learning_rate": 4.527527302562346e-05, - "loss": 0.298, + "epoch": 1.1422856525029732, + "grad_norm": 0.16868291795253754, + "learning_rate": 4.501566114342789e-05, + "loss": 0.4038, 
"step": 31695 }, { - "epoch": 1.12, - "learning_rate": 4.5273606303681596e-05, - "loss": 0.2914, + "epoch": 1.142465852164198, + "grad_norm": 0.1392037272453308, + "learning_rate": 4.501391256694418e-05, + "loss": 0.3929, "step": 31700 }, { - "epoch": 1.12, - "learning_rate": 4.527193931850031e-05, - "loss": 0.2769, + "epoch": 1.1426460518254227, + "grad_norm": 0.1932985484600067, + "learning_rate": 4.501216371777479e-05, + "loss": 0.4323, "step": 31705 }, { - "epoch": 1.12, - "learning_rate": 4.527027207010125e-05, - "loss": 0.3404, + "epoch": 1.1428262514866472, + "grad_norm": 0.17676390707492828, + "learning_rate": 4.501041459594354e-05, + "loss": 0.419, "step": 31710 }, { - "epoch": 1.12, - "learning_rate": 4.5268604558506075e-05, - "loss": 0.2766, + "epoch": 1.143006451147872, + "grad_norm": 0.20211122930049896, + "learning_rate": 4.5008665201474273e-05, + "loss": 0.4147, "step": 31715 }, { - "epoch": 1.12, - "learning_rate": 4.526693678373642e-05, - "loss": 0.2803, + "epoch": 1.1431866508090964, + "grad_norm": 0.16196738183498383, + "learning_rate": 4.50069155343908e-05, + "loss": 0.4052, "step": 31720 }, { - "epoch": 1.12, - "learning_rate": 4.526526874581395e-05, - "loss": 0.2841, + "epoch": 1.1433668504703212, + "grad_norm": 0.16365759074687958, + "learning_rate": 4.500516559471699e-05, + "loss": 0.424, "step": 31725 }, { - "epoch": 1.12, - "learning_rate": 4.5263600444760316e-05, - "loss": 0.2872, + "epoch": 1.1435470501315457, + "grad_norm": 0.16799362003803253, + "learning_rate": 4.500341538247667e-05, + "loss": 0.4237, "step": 31730 }, { - "epoch": 1.12, - "learning_rate": 4.526193188059718e-05, - "loss": 0.2733, + "epoch": 1.1437272497927704, + "grad_norm": 0.1993078589439392, + "learning_rate": 4.500166489769369e-05, + "loss": 0.4194, "step": 31735 }, { - "epoch": 1.12, - "learning_rate": 4.526026305334623e-05, - "loss": 0.2901, + "epoch": 1.143907449453995, + "grad_norm": 0.15421169996261597, + "learning_rate": 4.49999141403919e-05, + "loss": 0.3786, "step": 31740 }, { - "epoch": 1.12, - "learning_rate": 4.52585939630291e-05, - "loss": 0.2933, + "epoch": 1.1440876491152197, + "grad_norm": 0.17669521272182465, + "learning_rate": 4.499816311059514e-05, + "loss": 0.3992, "step": 31745 }, { - "epoch": 1.12, - "learning_rate": 4.5256924609667484e-05, - "loss": 0.3002, + "epoch": 1.1442678487764444, + "grad_norm": 0.18035532534122467, + "learning_rate": 4.499641180832729e-05, + "loss": 0.433, "step": 31750 }, { - "epoch": 1.12, - "learning_rate": 4.525525499328306e-05, - "loss": 0.2962, + "epoch": 1.144448048437669, + "grad_norm": 0.15400253236293793, + "learning_rate": 4.49946602336122e-05, + "loss": 0.4367, "step": 31755 }, { - "epoch": 1.12, - "learning_rate": 4.525358511389749e-05, - "loss": 0.2934, + "epoch": 1.1446282480988936, + "grad_norm": 0.16569051146507263, + "learning_rate": 4.4992908386473727e-05, + "loss": 0.3999, "step": 31760 }, { - "epoch": 1.12, - "learning_rate": 4.525191497153246e-05, - "loss": 0.3064, + "epoch": 1.1448084477601181, + "grad_norm": 0.16333115100860596, + "learning_rate": 4.4991156266935755e-05, + "loss": 0.4458, "step": 31765 }, { - "epoch": 1.12, - "learning_rate": 4.5250244566209666e-05, - "loss": 0.2713, + "epoch": 1.1449886474213429, + "grad_norm": 0.17189821600914001, + "learning_rate": 4.498940387502214e-05, + "loss": 0.399, "step": 31770 }, { - "epoch": 1.12, - "learning_rate": 4.52485738979508e-05, - "loss": 0.3043, + "epoch": 1.1451688470825676, + "grad_norm": 0.2206844836473465, + "learning_rate": 4.498765121075678e-05, + "loss": 
0.4515, "step": 31775 }, { - "epoch": 1.12, - "learning_rate": 4.5246902966777535e-05, - "loss": 0.2654, + "epoch": 1.1453490467437921, + "grad_norm": 0.18912701308727264, + "learning_rate": 4.498589827416354e-05, + "loss": 0.4375, "step": 31780 }, { - "epoch": 1.12, - "learning_rate": 4.524523177271158e-05, - "loss": 0.2602, + "epoch": 1.1455292464050169, + "grad_norm": 0.17603172361850739, + "learning_rate": 4.49841450652663e-05, + "loss": 0.407, "step": 31785 }, { - "epoch": 1.12, - "learning_rate": 4.5243560315774634e-05, - "loss": 0.2898, + "epoch": 1.1457094460662414, + "grad_norm": 0.1851511150598526, + "learning_rate": 4.498239158408896e-05, + "loss": 0.4273, "step": 31790 }, { - "epoch": 1.12, - "learning_rate": 4.52418885959884e-05, - "loss": 0.2941, + "epoch": 1.145889645727466, + "grad_norm": 0.13465797901153564, + "learning_rate": 4.498063783065539e-05, + "loss": 0.4008, "step": 31795 }, { - "epoch": 1.12, - "learning_rate": 4.524021661337458e-05, - "loss": 0.3295, + "epoch": 1.1460698453886906, + "grad_norm": 0.16686324775218964, + "learning_rate": 4.4978883804989516e-05, + "loss": 0.4083, "step": 31800 }, { - "epoch": 1.12, - "learning_rate": 4.5238544367954884e-05, - "loss": 0.2934, + "epoch": 1.1462500450499153, + "grad_norm": 0.16204862296581268, + "learning_rate": 4.49771295071152e-05, + "loss": 0.4026, "step": 31805 }, { - "epoch": 1.12, - "learning_rate": 4.5236871859751026e-05, - "loss": 0.2955, + "epoch": 1.1464302447111399, + "grad_norm": 0.19073139131069183, + "learning_rate": 4.4975374937056366e-05, + "loss": 0.4004, "step": 31810 }, { - "epoch": 1.12, - "learning_rate": 4.523519908878473e-05, - "loss": 0.2837, + "epoch": 1.1466104443723646, + "grad_norm": 0.20284846425056458, + "learning_rate": 4.4973620094836926e-05, + "loss": 0.4195, "step": 31815 }, { - "epoch": 1.12, - "learning_rate": 4.5233526055077705e-05, - "loss": 0.3061, + "epoch": 1.1467906440335893, + "grad_norm": 0.18009348213672638, + "learning_rate": 4.497186498048077e-05, + "loss": 0.4421, "step": 31820 }, { - "epoch": 1.12, - "learning_rate": 4.5231852758651685e-05, - "loss": 0.2806, + "epoch": 1.1469708436948138, + "grad_norm": 0.18369971215724945, + "learning_rate": 4.497010959401183e-05, + "loss": 0.417, "step": 31825 }, { - "epoch": 1.12, - "learning_rate": 4.5230179199528384e-05, - "loss": 0.3167, + "epoch": 1.1471510433560386, + "grad_norm": 0.17240361869335175, + "learning_rate": 4.4968353935454004e-05, + "loss": 0.3953, "step": 31830 }, { - "epoch": 1.12, - "learning_rate": 4.5228505377729534e-05, - "loss": 0.2961, + "epoch": 1.147331243017263, + "grad_norm": 0.182241752743721, + "learning_rate": 4.496659800483123e-05, + "loss": 0.4441, "step": 31835 }, { - "epoch": 1.12, - "learning_rate": 4.522683129327688e-05, - "loss": 0.3087, + "epoch": 1.1475114426784878, + "grad_norm": 0.20979800820350647, + "learning_rate": 4.4964841802167414e-05, + "loss": 0.4349, "step": 31840 }, { - "epoch": 1.12, - "learning_rate": 4.522515694619215e-05, - "loss": 0.2866, + "epoch": 1.1476916423397123, + "grad_norm": 0.1706288456916809, + "learning_rate": 4.49630853274865e-05, + "loss": 0.3891, "step": 31845 }, { - "epoch": 1.12, - "learning_rate": 4.522348233649709e-05, - "loss": 0.2903, + "epoch": 1.147871842000937, + "grad_norm": 0.17474350333213806, + "learning_rate": 4.496132858081241e-05, + "loss": 0.4184, "step": 31850 }, { - "epoch": 1.12, - "learning_rate": 4.522180746421343e-05, - "loss": 0.2939, + "epoch": 1.1480520416621616, + "grad_norm": 0.16004018485546112, + "learning_rate": 
4.495957156216908e-05, + "loss": 0.4254, "step": 31855 }, { - "epoch": 1.12, - "learning_rate": 4.522013232936293e-05, - "loss": 0.2891, + "epoch": 1.1482322413233863, + "grad_norm": 0.20258988440036774, + "learning_rate": 4.495781427158046e-05, + "loss": 0.4425, "step": 31860 }, { - "epoch": 1.12, - "learning_rate": 4.521845693196735e-05, - "loss": 0.3175, + "epoch": 1.148412440984611, + "grad_norm": 0.18658919632434845, + "learning_rate": 4.4956056709070485e-05, + "loss": 0.4369, "step": 31865 }, { - "epoch": 1.12, - "learning_rate": 4.5216781272048414e-05, - "loss": 0.3182, + "epoch": 1.1485926406458355, + "grad_norm": 0.16538068652153015, + "learning_rate": 4.495429887466309e-05, + "loss": 0.4099, "step": 31870 }, { - "epoch": 1.12, - "learning_rate": 4.5215105349627906e-05, - "loss": 0.2788, + "epoch": 1.1487728403070603, + "grad_norm": 0.16032880544662476, + "learning_rate": 4.495254076838225e-05, + "loss": 0.3942, "step": 31875 }, { - "epoch": 1.12, - "learning_rate": 4.5213429164727575e-05, - "loss": 0.2906, + "epoch": 1.1489530399682848, + "grad_norm": 0.14562112092971802, + "learning_rate": 4.495078239025189e-05, + "loss": 0.4076, "step": 31880 }, { - "epoch": 1.12, - "learning_rate": 4.521175271736918e-05, - "loss": 0.2794, + "epoch": 1.1491332396295095, + "grad_norm": 0.20045393705368042, + "learning_rate": 4.494902374029599e-05, + "loss": 0.4033, "step": 31885 }, { - "epoch": 1.12, - "learning_rate": 4.521007600757451e-05, - "loss": 0.2859, + "epoch": 1.149313439290734, + "grad_norm": 0.1704903542995453, + "learning_rate": 4.4947264818538505e-05, + "loss": 0.4255, "step": 31890 }, { - "epoch": 1.12, - "learning_rate": 4.520839903536531e-05, - "loss": 0.2792, + "epoch": 1.1494936389519588, + "grad_norm": 0.18456216156482697, + "learning_rate": 4.49455056250034e-05, + "loss": 0.4368, "step": 31895 }, { - "epoch": 1.12, - "learning_rate": 4.520672180076337e-05, - "loss": 0.2994, + "epoch": 1.1496738386131833, + "grad_norm": 0.18716961145401, + "learning_rate": 4.494374615971464e-05, + "loss": 0.4268, "step": 31900 }, { - "epoch": 1.12, - "learning_rate": 4.520504430379047e-05, - "loss": 0.2815, + "epoch": 1.149854038274408, + "grad_norm": 0.1741458624601364, + "learning_rate": 4.49419864226962e-05, + "loss": 0.3761, "step": 31905 }, { - "epoch": 1.12, - "learning_rate": 4.520336654446838e-05, - "loss": 0.2966, + "epoch": 1.1500342379356328, + "grad_norm": 0.17461341619491577, + "learning_rate": 4.494022641397205e-05, + "loss": 0.4028, "step": 31910 }, { - "epoch": 1.12, - "learning_rate": 4.5201688522818895e-05, - "loss": 0.3142, + "epoch": 1.1502144375968573, + "grad_norm": 0.16203176975250244, + "learning_rate": 4.493846613356619e-05, + "loss": 0.4028, "step": 31915 }, { - "epoch": 1.12, - "learning_rate": 4.5200010238863794e-05, - "loss": 0.2811, + "epoch": 1.150394637258082, + "grad_norm": 0.16421377658843994, + "learning_rate": 4.493670558150258e-05, + "loss": 0.4365, "step": 31920 }, { - "epoch": 1.12, - "learning_rate": 4.5198331692624874e-05, - "loss": 0.2741, + "epoch": 1.1505748369193065, + "grad_norm": 0.17700999975204468, + "learning_rate": 4.493494475780521e-05, + "loss": 0.4262, "step": 31925 }, { - "epoch": 1.12, - "learning_rate": 4.5196652884123926e-05, - "loss": 0.3182, + "epoch": 1.1507550365805312, + "grad_norm": 0.1917300820350647, + "learning_rate": 4.493318366249809e-05, + "loss": 0.4477, "step": 31930 }, { - "epoch": 1.12, - "learning_rate": 4.5194973813382744e-05, - "loss": 0.2964, + "epoch": 1.150935236241756, + "grad_norm": 0.24866171181201935, + 
"learning_rate": 4.4931422295605196e-05, + "loss": 0.3896, "step": 31935 }, { - "epoch": 1.12, - "learning_rate": 4.5193294480423146e-05, - "loss": 0.3003, + "epoch": 1.1511154359029805, + "grad_norm": 0.16452571749687195, + "learning_rate": 4.492966065715053e-05, + "loss": 0.4155, "step": 31940 }, { - "epoch": 1.12, - "learning_rate": 4.5191614885266925e-05, - "loss": 0.2881, + "epoch": 1.1512956355642052, + "grad_norm": 0.15788938105106354, + "learning_rate": 4.4927898747158095e-05, + "loss": 0.424, "step": 31945 }, { - "epoch": 1.12, - "learning_rate": 4.5189935027935894e-05, - "loss": 0.2914, + "epoch": 1.1514758352254297, + "grad_norm": 0.16340243816375732, + "learning_rate": 4.4926136565651904e-05, + "loss": 0.4527, "step": 31950 }, { - "epoch": 1.12, - "learning_rate": 4.518825490845185e-05, - "loss": 0.2768, + "epoch": 1.1516560348866545, + "grad_norm": 0.16550558805465698, + "learning_rate": 4.492437411265596e-05, + "loss": 0.4677, "step": 31955 }, { - "epoch": 1.12, - "learning_rate": 4.518657452683663e-05, - "loss": 0.2805, + "epoch": 1.151836234547879, + "grad_norm": 0.14602138102054596, + "learning_rate": 4.492261138819427e-05, + "loss": 0.4225, "step": 31960 }, { - "epoch": 1.12, - "learning_rate": 4.518489388311205e-05, - "loss": 0.3325, + "epoch": 1.1520164342091037, + "grad_norm": 0.19070053100585938, + "learning_rate": 4.492084839229086e-05, + "loss": 0.4458, "step": 31965 }, { - "epoch": 1.12, - "learning_rate": 4.518321297729991e-05, - "loss": 0.3182, + "epoch": 1.1521966338703282, + "grad_norm": 0.18122611939907074, + "learning_rate": 4.491908512496975e-05, + "loss": 0.4202, "step": 31970 }, { - "epoch": 1.12, - "learning_rate": 4.518153180942206e-05, - "loss": 0.2958, + "epoch": 1.152376833531553, + "grad_norm": 0.1686059683561325, + "learning_rate": 4.491732158625496e-05, + "loss": 0.4264, "step": 31975 }, { - "epoch": 1.13, - "learning_rate": 4.517985037950032e-05, - "loss": 0.2981, + "epoch": 1.1525570331927777, + "grad_norm": 0.15016207098960876, + "learning_rate": 4.491555777617051e-05, + "loss": 0.4277, "step": 31980 }, { - "epoch": 1.13, - "learning_rate": 4.5178168687556525e-05, - "loss": 0.2792, + "epoch": 1.1527372328540022, + "grad_norm": 0.15645290911197662, + "learning_rate": 4.491379369474046e-05, + "loss": 0.4294, "step": 31985 }, { - "epoch": 1.13, - "learning_rate": 4.5176486733612497e-05, - "loss": 0.2914, + "epoch": 1.152917432515227, + "grad_norm": 0.14731116592884064, + "learning_rate": 4.491202934198881e-05, + "loss": 0.3946, "step": 31990 }, { - "epoch": 1.13, - "learning_rate": 4.517480451769009e-05, - "loss": 0.3029, + "epoch": 1.1530976321764514, + "grad_norm": 0.1814972460269928, + "learning_rate": 4.491026471793962e-05, + "loss": 0.4306, "step": 31995 }, { - "epoch": 1.13, - "learning_rate": 4.517312203981114e-05, - "loss": 0.2882, + "epoch": 1.1532778318376762, + "grad_norm": 0.2180231809616089, + "learning_rate": 4.4908499822616934e-05, + "loss": 0.421, "step": 32000 }, { - "epoch": 1.13, - "eval_loss": 0.2893707752227783, - "eval_runtime": 10.5373, - "eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 1.1532778318376762, + "eval_loss": 0.4514605402946472, + "eval_runtime": 3.5405, + "eval_samples_per_second": 28.244, + "eval_steps_per_second": 7.061, "step": 32000 }, { - "epoch": 1.13, - "learning_rate": 4.51714392999975e-05, - "loss": 0.2729, + "epoch": 1.1534580314989007, + "grad_norm": 0.18036217987537384, + "learning_rate": 4.4906734656044786e-05, + "loss": 0.4261, "step": 32005 }, { - "epoch": 1.13, - 
"learning_rate": 4.516975629827101e-05, - "loss": 0.3085, + "epoch": 1.1536382311601254, + "grad_norm": 0.2106885462999344, + "learning_rate": 4.4904969218247234e-05, + "loss": 0.3771, "step": 32010 }, { - "epoch": 1.13, - "learning_rate": 4.516807303465353e-05, - "loss": 0.2839, + "epoch": 1.15381843082135, + "grad_norm": 0.15591959655284882, + "learning_rate": 4.490320350924833e-05, + "loss": 0.4009, "step": 32015 }, { - "epoch": 1.13, - "learning_rate": 4.516638950916691e-05, - "loss": 0.2747, + "epoch": 1.1539986304825747, + "grad_norm": 0.14869220554828644, + "learning_rate": 4.490143752907213e-05, + "loss": 0.4101, "step": 32020 }, { - "epoch": 1.13, - "learning_rate": 4.516470572183302e-05, - "loss": 0.2748, + "epoch": 1.1541788301437994, + "grad_norm": 0.1626279205083847, + "learning_rate": 4.489967127774271e-05, + "loss": 0.4174, "step": 32025 }, { - "epoch": 1.13, - "learning_rate": 4.5163021672673714e-05, - "loss": 0.2887, + "epoch": 1.154359029805024, + "grad_norm": 0.1796058565378189, + "learning_rate": 4.48979047552841e-05, + "loss": 0.4132, "step": 32030 }, { - "epoch": 1.13, - "learning_rate": 4.516133736171084e-05, - "loss": 0.3063, + "epoch": 1.1545392294662487, + "grad_norm": 0.16110989451408386, + "learning_rate": 4.48961379617204e-05, + "loss": 0.4238, "step": 32035 }, { - "epoch": 1.13, - "learning_rate": 4.5159652788966314e-05, - "loss": 0.2861, + "epoch": 1.1547194291274732, + "grad_norm": 0.17553108930587769, + "learning_rate": 4.489437089707568e-05, + "loss": 0.3808, "step": 32040 }, { - "epoch": 1.13, - "learning_rate": 4.5157967954461967e-05, - "loss": 0.2965, + "epoch": 1.154899628788698, + "grad_norm": 0.289981484413147, + "learning_rate": 4.489260356137399e-05, + "loss": 0.4202, "step": 32045 }, { - "epoch": 1.13, - "learning_rate": 4.515628285821969e-05, - "loss": 0.3038, + "epoch": 1.1550798284499226, + "grad_norm": 0.162216454744339, + "learning_rate": 4.489083595463944e-05, + "loss": 0.409, "step": 32050 }, { - "epoch": 1.13, - "learning_rate": 4.515459750026137e-05, - "loss": 0.288, + "epoch": 1.1552600281111471, + "grad_norm": 0.18123595416545868, + "learning_rate": 4.48890680768961e-05, + "loss": 0.4316, "step": 32055 }, { - "epoch": 1.13, - "learning_rate": 4.515291188060888e-05, - "loss": 0.2806, + "epoch": 1.1554402277723719, + "grad_norm": 0.22130350768566132, + "learning_rate": 4.488729992816806e-05, + "loss": 0.4034, "step": 32060 }, { - "epoch": 1.13, - "learning_rate": 4.5151225999284115e-05, - "loss": 0.3051, + "epoch": 1.1556204274335964, + "grad_norm": 0.15505443513393402, + "learning_rate": 4.488553150847941e-05, + "loss": 0.3919, "step": 32065 }, { - "epoch": 1.13, - "learning_rate": 4.514953985630895e-05, - "loss": 0.3021, + "epoch": 1.1558006270948211, + "grad_norm": 0.17959943413734436, + "learning_rate": 4.4883762817854236e-05, + "loss": 0.4241, "step": 32070 }, { - "epoch": 1.13, - "learning_rate": 4.51478534517053e-05, - "loss": 0.2987, + "epoch": 1.1559808267560456, + "grad_norm": 0.18637138605117798, + "learning_rate": 4.488199385631665e-05, + "loss": 0.4389, "step": 32075 }, { - "epoch": 1.13, - "learning_rate": 4.514616678549505e-05, - "loss": 0.2964, + "epoch": 1.1561610264172704, + "grad_norm": 0.16619151830673218, + "learning_rate": 4.488022462389074e-05, + "loss": 0.4582, "step": 32080 }, { - "epoch": 1.13, - "learning_rate": 4.51444798577001e-05, - "loss": 0.273, + "epoch": 1.1563412260784949, + "grad_norm": 0.22853337228298187, + "learning_rate": 4.487845512060063e-05, + "loss": 0.4317, "step": 32085 }, { - "epoch": 1.13, - 
"learning_rate": 4.514279266834235e-05, - "loss": 0.2763, + "epoch": 1.1565214257397196, + "grad_norm": 0.15449535846710205, + "learning_rate": 4.487668534647041e-05, + "loss": 0.4514, "step": 32090 }, { - "epoch": 1.13, - "learning_rate": 4.514110521744371e-05, - "loss": 0.2772, + "epoch": 1.1567016254009443, + "grad_norm": 0.18602608144283295, + "learning_rate": 4.4874915301524194e-05, + "loss": 0.4577, "step": 32095 }, { - "epoch": 1.13, - "learning_rate": 4.5139417505026105e-05, - "loss": 0.2844, + "epoch": 1.1568818250621689, + "grad_norm": 0.178580179810524, + "learning_rate": 4.487314498578611e-05, + "loss": 0.4175, "step": 32100 }, { - "epoch": 1.13, - "learning_rate": 4.513772953111143e-05, - "loss": 0.2938, + "epoch": 1.1570620247233936, + "grad_norm": 0.17809760570526123, + "learning_rate": 4.4871374399280273e-05, + "loss": 0.4256, "step": 32105 }, { - "epoch": 1.13, - "learning_rate": 4.5136041295721606e-05, - "loss": 0.3007, + "epoch": 1.157242224384618, + "grad_norm": 0.16492916643619537, + "learning_rate": 4.4869603542030806e-05, + "loss": 0.4098, "step": 32110 }, { - "epoch": 1.13, - "learning_rate": 4.5134352798878556e-05, - "loss": 0.3043, + "epoch": 1.1574224240458428, + "grad_norm": 0.1905374526977539, + "learning_rate": 4.486783241406184e-05, + "loss": 0.4009, "step": 32115 }, { - "epoch": 1.13, - "learning_rate": 4.51326640406042e-05, - "loss": 0.2907, + "epoch": 1.1576026237070673, + "grad_norm": 0.16066482663154602, + "learning_rate": 4.486606101539751e-05, + "loss": 0.398, "step": 32120 }, { - "epoch": 1.13, - "learning_rate": 4.513097502092047e-05, - "loss": 0.3222, + "epoch": 1.157782823368292, + "grad_norm": 0.1904597133398056, + "learning_rate": 4.486428934606194e-05, + "loss": 0.405, "step": 32125 }, { - "epoch": 1.13, - "learning_rate": 4.5129285739849304e-05, - "loss": 0.3052, + "epoch": 1.1579630230295166, + "grad_norm": 0.16945073008537292, + "learning_rate": 4.486251740607927e-05, + "loss": 0.4205, "step": 32130 }, { - "epoch": 1.13, - "learning_rate": 4.512759619741262e-05, - "loss": 0.2914, + "epoch": 1.1581432226907413, + "grad_norm": 0.19391489028930664, + "learning_rate": 4.486074519547365e-05, + "loss": 0.4073, "step": 32135 }, { - "epoch": 1.13, - "learning_rate": 4.5125906393632376e-05, - "loss": 0.3115, + "epoch": 1.158323422351966, + "grad_norm": 0.13818098604679108, + "learning_rate": 4.4858972714269215e-05, + "loss": 0.3963, "step": 32140 }, { - "epoch": 1.13, - "learning_rate": 4.512421632853049e-05, - "loss": 0.2945, + "epoch": 1.1585036220131906, + "grad_norm": 0.1501578539609909, + "learning_rate": 4.485719996249012e-05, + "loss": 0.4193, "step": 32145 }, { - "epoch": 1.13, - "learning_rate": 4.5122526002128924e-05, - "loss": 0.3217, + "epoch": 1.1586838216744153, + "grad_norm": 0.19974437355995178, + "learning_rate": 4.4855426940160536e-05, + "loss": 0.4491, "step": 32150 }, { - "epoch": 1.13, - "learning_rate": 4.5120835414449615e-05, - "loss": 0.3123, + "epoch": 1.1588640213356398, + "grad_norm": 0.1437712460756302, + "learning_rate": 4.4853653647304596e-05, + "loss": 0.4116, "step": 32155 }, { - "epoch": 1.13, - "learning_rate": 4.5119144565514514e-05, - "loss": 0.2885, + "epoch": 1.1590442209968645, + "grad_norm": 0.22510062158107758, + "learning_rate": 4.485188008394646e-05, + "loss": 0.419, "step": 32160 }, { - "epoch": 1.13, - "learning_rate": 4.5117453455345594e-05, - "loss": 0.2797, + "epoch": 1.1592244206580893, + "grad_norm": 0.16433167457580566, + "learning_rate": 4.485010625011031e-05, + "loss": 0.4199, "step": 32165 }, { - 
"epoch": 1.13, - "learning_rate": 4.5115762083964786e-05, - "loss": 0.2813, + "epoch": 1.1594046203193138, + "grad_norm": 0.18741507828235626, + "learning_rate": 4.48483321458203e-05, + "loss": 0.4125, "step": 32170 }, { - "epoch": 1.13, - "learning_rate": 4.511407045139406e-05, - "loss": 0.3063, + "epoch": 1.1595848199805385, + "grad_norm": 0.171631321310997, + "learning_rate": 4.484655777110062e-05, + "loss": 0.4034, "step": 32175 }, { - "epoch": 1.13, - "learning_rate": 4.51123785576554e-05, - "loss": 0.2874, + "epoch": 1.159765019641763, + "grad_norm": 0.20612791180610657, + "learning_rate": 4.484478312597542e-05, + "loss": 0.4072, "step": 32180 }, { - "epoch": 1.13, - "learning_rate": 4.511068640277075e-05, - "loss": 0.2988, + "epoch": 1.1599452193029878, + "grad_norm": 0.162516251206398, + "learning_rate": 4.4843008210468896e-05, + "loss": 0.4092, "step": 32185 }, { - "epoch": 1.13, - "learning_rate": 4.5108993986762096e-05, - "loss": 0.2857, + "epoch": 1.1601254189642123, + "grad_norm": 0.17401380836963654, + "learning_rate": 4.484123302460523e-05, + "loss": 0.3899, "step": 32190 }, { - "epoch": 1.13, - "learning_rate": 4.5107301309651404e-05, - "loss": 0.2855, + "epoch": 1.160305618625437, + "grad_norm": 0.1766970008611679, + "learning_rate": 4.48394575684086e-05, + "loss": 0.4243, "step": 32195 }, { - "epoch": 1.13, - "learning_rate": 4.5105608371460654e-05, - "loss": 0.3222, + "epoch": 1.1604858182866615, + "grad_norm": 0.17945876717567444, + "learning_rate": 4.483768184190321e-05, + "loss": 0.4148, "step": 32200 }, { - "epoch": 1.13, - "learning_rate": 4.510391517221183e-05, - "loss": 0.289, + "epoch": 1.1606660179478863, + "grad_norm": 0.17272137105464935, + "learning_rate": 4.4835905845113234e-05, + "loss": 0.4174, "step": 32205 }, { - "epoch": 1.13, - "learning_rate": 4.510222171192692e-05, - "loss": 0.2758, + "epoch": 1.160846217609111, + "grad_norm": 0.14284390211105347, + "learning_rate": 4.483412957806289e-05, + "loss": 0.4218, "step": 32210 }, { - "epoch": 1.13, - "learning_rate": 4.51005279906279e-05, - "loss": 0.3126, + "epoch": 1.1610264172703355, + "grad_norm": 0.18068622052669525, + "learning_rate": 4.483235304077636e-05, + "loss": 0.4128, "step": 32215 }, { - "epoch": 1.13, - "learning_rate": 4.509883400833678e-05, - "loss": 0.2726, + "epoch": 1.1612066169315602, + "grad_norm": 0.17758698761463165, + "learning_rate": 4.483057623327787e-05, + "loss": 0.4533, "step": 32220 }, { - "epoch": 1.13, - "learning_rate": 4.509713976507554e-05, - "loss": 0.2743, + "epoch": 1.1613868165927848, + "grad_norm": 0.17569084465503693, + "learning_rate": 4.4828799155591615e-05, + "loss": 0.448, "step": 32225 }, { - "epoch": 1.13, - "learning_rate": 4.509544526086619e-05, - "loss": 0.2863, + "epoch": 1.1615670162540095, + "grad_norm": 0.18562377989292145, + "learning_rate": 4.4827021807741806e-05, + "loss": 0.4072, "step": 32230 }, { - "epoch": 1.13, - "learning_rate": 4.509375049573072e-05, - "loss": 0.2754, + "epoch": 1.161747215915234, + "grad_norm": 0.19902653992176056, + "learning_rate": 4.482524418975267e-05, + "loss": 0.4286, "step": 32235 }, { - "epoch": 1.13, - "learning_rate": 4.5092055469691145e-05, - "loss": 0.3209, + "epoch": 1.1619274155764587, + "grad_norm": 0.17017509043216705, + "learning_rate": 4.482346630164842e-05, + "loss": 0.4172, "step": 32240 }, { - "epoch": 1.13, - "learning_rate": 4.509036018276948e-05, - "loss": 0.2833, + "epoch": 1.1621076152376832, + "grad_norm": 0.14589948952198029, + "learning_rate": 4.4821688143453275e-05, + "loss": 0.4425, "step": 
32245 }, { - "epoch": 1.13, - "learning_rate": 4.508866463498772e-05, - "loss": 0.2931, + "epoch": 1.162287814898908, + "grad_norm": 0.17183731496334076, + "learning_rate": 4.4819909715191475e-05, + "loss": 0.4243, "step": 32250 }, { - "epoch": 1.13, - "learning_rate": 4.508696882636788e-05, - "loss": 0.3042, + "epoch": 1.1624680145601327, + "grad_norm": 0.14236903190612793, + "learning_rate": 4.481813101688723e-05, + "loss": 0.4096, "step": 32255 }, { - "epoch": 1.13, - "learning_rate": 4.5085272756932e-05, - "loss": 0.282, + "epoch": 1.1626482142213572, + "grad_norm": 0.21945489943027496, + "learning_rate": 4.4816352048564806e-05, + "loss": 0.4207, "step": 32260 }, { - "epoch": 1.14, - "learning_rate": 4.508357642670209e-05, - "loss": 0.2859, + "epoch": 1.162828413882582, + "grad_norm": 0.21350398659706116, + "learning_rate": 4.48145728102484e-05, + "loss": 0.4225, "step": 32265 }, { - "epoch": 1.14, - "learning_rate": 4.508187983570017e-05, - "loss": 0.2901, + "epoch": 1.1630086135438065, + "grad_norm": 0.11839442700147629, + "learning_rate": 4.481279330196229e-05, + "loss": 0.3818, "step": 32270 }, { - "epoch": 1.14, - "learning_rate": 4.5080182983948274e-05, - "loss": 0.3145, + "epoch": 1.1631888132050312, + "grad_norm": 0.16868416965007782, + "learning_rate": 4.481101352373071e-05, + "loss": 0.4231, "step": 32275 }, { - "epoch": 1.14, - "learning_rate": 4.5078485871468436e-05, - "loss": 0.2789, + "epoch": 1.163369012866256, + "grad_norm": 0.1809322088956833, + "learning_rate": 4.480923347557789e-05, + "loss": 0.4284, "step": 32280 }, { - "epoch": 1.14, - "learning_rate": 4.50767884982827e-05, - "loss": 0.3104, + "epoch": 1.1635492125274804, + "grad_norm": 0.2081749439239502, + "learning_rate": 4.48074531575281e-05, + "loss": 0.4275, "step": 32285 }, { - "epoch": 1.14, - "learning_rate": 4.5075090864413085e-05, - "loss": 0.2759, + "epoch": 1.1637294121887052, + "grad_norm": 0.17321893572807312, + "learning_rate": 4.480567256960561e-05, + "loss": 0.4338, "step": 32290 }, { - "epoch": 1.14, - "learning_rate": 4.507339296988165e-05, - "loss": 0.3297, + "epoch": 1.1639096118499297, + "grad_norm": 0.19908635318279266, + "learning_rate": 4.480389171183466e-05, + "loss": 0.3936, "step": 32295 }, { - "epoch": 1.14, - "learning_rate": 4.507169481471044e-05, - "loss": 0.2993, + "epoch": 1.1640898115111544, + "grad_norm": 0.19130414724349976, + "learning_rate": 4.480211058423952e-05, + "loss": 0.4122, "step": 32300 }, { - "epoch": 1.14, - "learning_rate": 4.506999639892149e-05, - "loss": 0.3079, + "epoch": 1.164270011172379, + "grad_norm": 0.20319342613220215, + "learning_rate": 4.480032918684446e-05, + "loss": 0.3795, "step": 32305 }, { - "epoch": 1.14, - "learning_rate": 4.506829772253687e-05, - "loss": 0.3061, + "epoch": 1.1644502108336037, + "grad_norm": 0.19457338750362396, + "learning_rate": 4.479854751967374e-05, + "loss": 0.4287, "step": 32310 }, { - "epoch": 1.14, - "learning_rate": 4.506659878557863e-05, - "loss": 0.2899, + "epoch": 1.1646304104948282, + "grad_norm": 0.14478561282157898, + "learning_rate": 4.479676558275164e-05, + "loss": 0.413, "step": 32315 }, { - "epoch": 1.14, - "learning_rate": 4.506489958806883e-05, - "loss": 0.3119, + "epoch": 1.164810610156053, + "grad_norm": 0.19652974605560303, + "learning_rate": 4.479498337610246e-05, + "loss": 0.4123, "step": 32320 }, { - "epoch": 1.14, - "learning_rate": 4.506320013002953e-05, - "loss": 0.2938, + "epoch": 1.1649908098172777, + "grad_norm": 0.14270542562007904, + "learning_rate": 4.4793200899750445e-05, + "loss": 0.4011, 
"step": 32325 }, { - "epoch": 1.14, - "learning_rate": 4.506150041148281e-05, - "loss": 0.3046, + "epoch": 1.1651710094785022, + "grad_norm": 0.16907188296318054, + "learning_rate": 4.4791418153719914e-05, + "loss": 0.4427, "step": 32330 }, { - "epoch": 1.14, - "learning_rate": 4.5059800432450705e-05, - "loss": 0.2817, + "epoch": 1.165351209139727, + "grad_norm": 0.19798724353313446, + "learning_rate": 4.4789635138035137e-05, + "loss": 0.4178, "step": 32335 }, { - "epoch": 1.14, - "learning_rate": 4.5058100192955327e-05, - "loss": 0.279, + "epoch": 1.1655314088009514, + "grad_norm": 0.1947140246629715, + "learning_rate": 4.4787851852720405e-05, + "loss": 0.405, "step": 32340 }, { - "epoch": 1.14, - "learning_rate": 4.505639969301874e-05, - "loss": 0.2616, + "epoch": 1.1657116084621761, + "grad_norm": 0.19974881410598755, + "learning_rate": 4.4786068297800034e-05, + "loss": 0.4248, "step": 32345 }, { - "epoch": 1.14, - "learning_rate": 4.5054698932663006e-05, - "loss": 0.287, + "epoch": 1.1658918081234007, + "grad_norm": 0.23895248770713806, + "learning_rate": 4.478428447329831e-05, + "loss": 0.4351, "step": 32350 }, { - "epoch": 1.14, - "learning_rate": 4.505299791191023e-05, - "loss": 0.292, + "epoch": 1.1660720077846254, + "grad_norm": 0.1716042160987854, + "learning_rate": 4.4782500379239534e-05, + "loss": 0.4104, "step": 32355 }, { - "epoch": 1.14, - "learning_rate": 4.50512966307825e-05, - "loss": 0.3002, + "epoch": 1.16625220744585, + "grad_norm": 0.14665000140666962, + "learning_rate": 4.4780716015648026e-05, + "loss": 0.4237, "step": 32360 }, { - "epoch": 1.14, - "learning_rate": 4.504959508930189e-05, - "loss": 0.2942, + "epoch": 1.1664324071070746, + "grad_norm": 0.1704597920179367, + "learning_rate": 4.4778931382548096e-05, + "loss": 0.4007, "step": 32365 }, { - "epoch": 1.14, - "learning_rate": 4.50478932874905e-05, - "loss": 0.3056, + "epoch": 1.1666126067682994, + "grad_norm": 0.17646756768226624, + "learning_rate": 4.477714647996405e-05, + "loss": 0.4092, "step": 32370 }, { - "epoch": 1.14, - "learning_rate": 4.504619122537043e-05, - "loss": 0.2633, + "epoch": 1.1667928064295239, + "grad_norm": 0.13408124446868896, + "learning_rate": 4.4775361307920205e-05, + "loss": 0.4013, "step": 32375 }, { - "epoch": 1.14, - "learning_rate": 4.504448890296377e-05, - "loss": 0.2736, + "epoch": 1.1669730060907486, + "grad_norm": 0.1914110779762268, + "learning_rate": 4.47735758664409e-05, + "loss": 0.4323, "step": 32380 }, { - "epoch": 1.14, - "learning_rate": 4.504278632029264e-05, - "loss": 0.2809, + "epoch": 1.1671532057519731, + "grad_norm": 0.19718259572982788, + "learning_rate": 4.4771790155550455e-05, + "loss": 0.4199, "step": 32385 }, { - "epoch": 1.14, - "learning_rate": 4.5041083477379134e-05, - "loss": 0.3094, + "epoch": 1.1673334054131979, + "grad_norm": 0.19468161463737488, + "learning_rate": 4.477000417527319e-05, + "loss": 0.4024, "step": 32390 }, { - "epoch": 1.14, - "learning_rate": 4.503938037424537e-05, - "loss": 0.2886, + "epoch": 1.1675136050744224, + "grad_norm": 0.16953888535499573, + "learning_rate": 4.476821792563345e-05, + "loss": 0.4586, "step": 32395 }, { - "epoch": 1.14, - "learning_rate": 4.503767701091346e-05, - "loss": 0.2882, + "epoch": 1.167693804735647, + "grad_norm": 0.18917879462242126, + "learning_rate": 4.4766788731997023e-05, + "loss": 0.4336, "step": 32400 }, { - "epoch": 1.14, - "learning_rate": 4.5035973387405514e-05, - "loss": 0.3187, + "epoch": 1.1678740043968716, + "grad_norm": 0.16631363332271576, + "learning_rate": 4.476500199756615e-05, + 
"loss": 0.3989, "step": 32405 }, { - "epoch": 1.14, - "learning_rate": 4.5034269503743665e-05, - "loss": 0.2848, + "epoch": 1.1680542040580963, + "grad_norm": 0.17123185098171234, + "learning_rate": 4.476321499384096e-05, + "loss": 0.4394, "step": 32410 }, { - "epoch": 1.14, - "learning_rate": 4.5032565359950016e-05, - "loss": 0.3036, + "epoch": 1.168234403719321, + "grad_norm": 0.14406786859035492, + "learning_rate": 4.476142772084578e-05, + "loss": 0.4602, "step": 32415 }, { - "epoch": 1.14, - "learning_rate": 4.5030860956046725e-05, - "loss": 0.3061, + "epoch": 1.1684146033805456, + "grad_norm": 0.17469120025634766, + "learning_rate": 4.475964017860498e-05, + "loss": 0.4058, "step": 32420 }, { - "epoch": 1.14, - "learning_rate": 4.5029156292055895e-05, - "loss": 0.2894, + "epoch": 1.1685948030417703, + "grad_norm": 0.15574227273464203, + "learning_rate": 4.47578523671429e-05, + "loss": 0.4586, "step": 32425 }, { - "epoch": 1.14, - "learning_rate": 4.502745136799967e-05, - "loss": 0.2715, + "epoch": 1.1687750027029948, + "grad_norm": 0.22920864820480347, + "learning_rate": 4.4756064286483915e-05, + "loss": 0.4453, "step": 32430 }, { - "epoch": 1.14, - "learning_rate": 4.5025746183900195e-05, - "loss": 0.3355, + "epoch": 1.1689552023642196, + "grad_norm": 0.19061683118343353, + "learning_rate": 4.475427593665238e-05, + "loss": 0.4672, "step": 32435 }, { - "epoch": 1.14, - "learning_rate": 4.502404073977959e-05, - "loss": 0.2947, + "epoch": 1.1691354020254443, + "grad_norm": 0.20378006994724274, + "learning_rate": 4.475248731767265e-05, + "loss": 0.4179, "step": 32440 }, { - "epoch": 1.14, - "learning_rate": 4.5022335035660026e-05, - "loss": 0.2973, + "epoch": 1.1693156016866688, + "grad_norm": 0.16971811652183533, + "learning_rate": 4.475069842956911e-05, + "loss": 0.397, "step": 32445 }, { - "epoch": 1.14, - "learning_rate": 4.502062907156363e-05, - "loss": 0.304, + "epoch": 1.1694958013478935, + "grad_norm": 0.17052094638347626, + "learning_rate": 4.4748909272366133e-05, + "loss": 0.3946, "step": 32450 }, { - "epoch": 1.14, - "learning_rate": 4.5018922847512555e-05, - "loss": 0.3056, + "epoch": 1.169676001009118, + "grad_norm": 0.14758871495723724, + "learning_rate": 4.474711984608809e-05, + "loss": 0.3939, "step": 32455 }, { - "epoch": 1.14, - "learning_rate": 4.501721636352897e-05, - "loss": 0.2712, + "epoch": 1.1698562006703428, + "grad_norm": 0.18905098736286163, + "learning_rate": 4.474533015075936e-05, + "loss": 0.4125, "step": 32460 }, { - "epoch": 1.14, - "learning_rate": 4.501550961963502e-05, - "loss": 0.275, + "epoch": 1.1700364003315673, + "grad_norm": 0.1888333261013031, + "learning_rate": 4.474354018640432e-05, + "loss": 0.4127, "step": 32465 }, { - "epoch": 1.14, - "learning_rate": 4.501380261585286e-05, - "loss": 0.2669, + "epoch": 1.170216599992792, + "grad_norm": 0.15173350274562836, + "learning_rate": 4.4741749953047374e-05, + "loss": 0.4341, "step": 32470 }, { - "epoch": 1.14, - "learning_rate": 4.501209535220468e-05, - "loss": 0.3307, + "epoch": 1.1703967996540166, + "grad_norm": 0.18733574450016022, + "learning_rate": 4.473995945071291e-05, + "loss": 0.4164, "step": 32475 }, { - "epoch": 1.14, - "learning_rate": 4.501038782871261e-05, - "loss": 0.2873, + "epoch": 1.1705769993152413, + "grad_norm": 0.18692295253276825, + "learning_rate": 4.473816867942532e-05, + "loss": 0.4247, "step": 32480 }, { - "epoch": 1.14, - "learning_rate": 4.500868004539885e-05, - "loss": 0.3082, + "epoch": 1.170757198976466, + "grad_norm": 0.22081652283668518, + "learning_rate": 
4.473637763920901e-05, + "loss": 0.4228, "step": 32485 }, { - "epoch": 1.14, - "learning_rate": 4.5006972002285574e-05, - "loss": 0.2735, + "epoch": 1.1709373986376905, + "grad_norm": 0.15377368032932281, + "learning_rate": 4.4734586330088365e-05, + "loss": 0.381, "step": 32490 }, { - "epoch": 1.14, - "learning_rate": 4.500526369939494e-05, - "loss": 0.3092, + "epoch": 1.1711175982989153, + "grad_norm": 0.17810076475143433, + "learning_rate": 4.47327947520878e-05, + "loss": 0.413, "step": 32495 }, { - "epoch": 1.14, - "learning_rate": 4.500355513674914e-05, - "loss": 0.297, + "epoch": 1.1712977979601398, + "grad_norm": 0.17749875783920288, + "learning_rate": 4.4731002905231735e-05, + "loss": 0.4012, "step": 32500 }, { - "epoch": 1.14, - "eval_loss": 0.28957316279411316, - "eval_runtime": 10.558, - "eval_samples_per_second": 9.471, - "eval_steps_per_second": 9.471, + "epoch": 1.1712977979601398, + "eval_loss": 0.4519113600254059, + "eval_runtime": 3.5356, + "eval_samples_per_second": 28.284, + "eval_steps_per_second": 7.071, "step": 32500 }, { - "epoch": 1.14, - "learning_rate": 4.500184631437037e-05, - "loss": 0.2703, + "epoch": 1.1714779976213645, + "grad_norm": 0.20969074964523315, + "learning_rate": 4.4729210789544576e-05, + "loss": 0.4399, "step": 32505 }, { - "epoch": 1.14, - "learning_rate": 4.5000137232280806e-05, - "loss": 0.2789, + "epoch": 1.171658197282589, + "grad_norm": 0.17028845846652985, + "learning_rate": 4.472741840505073e-05, + "loss": 0.4074, "step": 32510 }, { - "epoch": 1.14, - "learning_rate": 4.499842789050263e-05, - "loss": 0.2644, + "epoch": 1.1718383969438138, + "grad_norm": 0.19405877590179443, + "learning_rate": 4.472562575177464e-05, + "loss": 0.4364, "step": 32515 }, { - "epoch": 1.14, - "learning_rate": 4.499671828905806e-05, - "loss": 0.286, + "epoch": 1.1720185966050383, + "grad_norm": 0.20871876180171967, + "learning_rate": 4.472383282974071e-05, + "loss": 0.4526, "step": 32520 }, { - "epoch": 1.14, - "learning_rate": 4.499500842796927e-05, - "loss": 0.285, + "epoch": 1.172198796266263, + "grad_norm": 0.16674499213695526, + "learning_rate": 4.4722039638973374e-05, + "loss": 0.4057, "step": 32525 }, { - "epoch": 1.14, - "learning_rate": 4.499329830725848e-05, - "loss": 0.2909, + "epoch": 1.1723789959274877, + "grad_norm": 0.17715777456760406, + "learning_rate": 4.4720246179497074e-05, + "loss": 0.4047, "step": 32530 }, { - "epoch": 1.14, - "learning_rate": 4.4991587926947886e-05, - "loss": 0.2899, + "epoch": 1.1725591955887122, + "grad_norm": 0.15576201677322388, + "learning_rate": 4.471845245133623e-05, + "loss": 0.4307, "step": 32535 }, { - "epoch": 1.14, - "learning_rate": 4.4989877287059694e-05, - "loss": 0.2982, + "epoch": 1.172739395249937, + "grad_norm": 0.19848684966564178, + "learning_rate": 4.4716658454515294e-05, + "loss": 0.4309, "step": 32540 }, { - "epoch": 1.15, - "learning_rate": 4.4988166387616125e-05, - "loss": 0.2857, + "epoch": 1.1729195949111615, + "grad_norm": 0.21592764556407928, + "learning_rate": 4.4714864189058705e-05, + "loss": 0.4146, "step": 32545 }, { - "epoch": 1.15, - "learning_rate": 4.498645522863938e-05, - "loss": 0.2861, + "epoch": 1.1730997945723862, + "grad_norm": 0.17954400181770325, + "learning_rate": 4.47130696549909e-05, + "loss": 0.4367, "step": 32550 }, { - "epoch": 1.15, - "learning_rate": 4.49847438101517e-05, - "loss": 0.2805, + "epoch": 1.173279994233611, + "grad_norm": 0.17298442125320435, + "learning_rate": 4.471127485233635e-05, + "loss": 0.3967, "step": 32555 }, { - "epoch": 1.15, - "learning_rate": 
4.498303213217527e-05, - "loss": 0.3046, + "epoch": 1.1734601938948355, + "grad_norm": 0.17591506242752075, + "learning_rate": 4.4709479781119485e-05, + "loss": 0.4144, "step": 32560 }, { - "epoch": 1.15, - "learning_rate": 4.498132019473236e-05, - "loss": 0.2983, + "epoch": 1.1736403935560602, + "grad_norm": 0.1792951375246048, + "learning_rate": 4.470768444136478e-05, + "loss": 0.429, "step": 32565 }, { - "epoch": 1.15, - "learning_rate": 4.497960799784516e-05, - "loss": 0.2732, + "epoch": 1.1738205932172847, + "grad_norm": 0.19630871713161469, + "learning_rate": 4.470588883309669e-05, + "loss": 0.4472, "step": 32570 }, { - "epoch": 1.15, - "learning_rate": 4.497789554153593e-05, - "loss": 0.295, + "epoch": 1.1740007928785094, + "grad_norm": 0.1824900358915329, + "learning_rate": 4.470409295633968e-05, + "loss": 0.4047, "step": 32575 }, { - "epoch": 1.15, - "learning_rate": 4.4976182825826877e-05, - "loss": 0.2888, + "epoch": 1.174180992539734, + "grad_norm": 0.21130244433879852, + "learning_rate": 4.470229681111822e-05, + "loss": 0.4292, "step": 32580 }, { - "epoch": 1.15, - "learning_rate": 4.4974469850740264e-05, - "loss": 0.2696, + "epoch": 1.1743611922009587, + "grad_norm": 0.22852078080177307, + "learning_rate": 4.470050039745677e-05, + "loss": 0.4134, "step": 32585 }, { - "epoch": 1.15, - "learning_rate": 4.4972756616298323e-05, - "loss": 0.2795, + "epoch": 1.1745413918621832, + "grad_norm": 0.1561303436756134, + "learning_rate": 4.4698703715379823e-05, + "loss": 0.4225, "step": 32590 }, { - "epoch": 1.15, - "learning_rate": 4.49710431225233e-05, - "loss": 0.2967, + "epoch": 1.174721591523408, + "grad_norm": 0.19721920788288116, + "learning_rate": 4.4696906764911855e-05, + "loss": 0.4508, "step": 32595 }, { - "epoch": 1.15, - "learning_rate": 4.496932936943744e-05, - "loss": 0.2909, + "epoch": 1.1749017911846327, + "grad_norm": 0.19223082065582275, + "learning_rate": 4.469510954607734e-05, + "loss": 0.4363, "step": 32600 }, { - "epoch": 1.15, - "learning_rate": 4.4967615357063006e-05, - "loss": 0.2927, + "epoch": 1.1750819908458572, + "grad_norm": 0.15646933019161224, + "learning_rate": 4.4693312058900774e-05, + "loss": 0.4086, "step": 32605 }, { - "epoch": 1.15, - "learning_rate": 4.4965901085422235e-05, - "loss": 0.2756, + "epoch": 1.175262190507082, + "grad_norm": 0.19095897674560547, + "learning_rate": 4.469151430340665e-05, + "loss": 0.4511, "step": 32610 }, { - "epoch": 1.15, - "learning_rate": 4.49641865545374e-05, - "loss": 0.2814, + "epoch": 1.1754423901683064, + "grad_norm": 0.1620524823665619, + "learning_rate": 4.4689716279619444e-05, + "loss": 0.4414, "step": 32615 }, { - "epoch": 1.15, - "learning_rate": 4.496247176443077e-05, - "loss": 0.3063, + "epoch": 1.1756225898295312, + "grad_norm": 0.14250415563583374, + "learning_rate": 4.468791798756367e-05, + "loss": 0.4214, "step": 32620 }, { - "epoch": 1.15, - "learning_rate": 4.4960756715124586e-05, - "loss": 0.3235, + "epoch": 1.1758027894907557, + "grad_norm": 0.16954278945922852, + "learning_rate": 4.4686119427263826e-05, + "loss": 0.4095, "step": 32625 }, { - "epoch": 1.15, - "learning_rate": 4.495904140664114e-05, - "loss": 0.2832, + "epoch": 1.1759829891519804, + "grad_norm": 0.19082613289356232, + "learning_rate": 4.4684320598744426e-05, + "loss": 0.4455, "step": 32630 }, { - "epoch": 1.15, - "learning_rate": 4.495732583900269e-05, - "loss": 0.2814, + "epoch": 1.176163188813205, + "grad_norm": 0.1630188673734665, + "learning_rate": 4.468252150202995e-05, + "loss": 0.4229, "step": 32635 }, { - "epoch": 1.15, - 
"learning_rate": 4.495561001223152e-05, - "loss": 0.299, + "epoch": 1.1763433884744297, + "grad_norm": 0.20702870190143585, + "learning_rate": 4.4680722137144935e-05, + "loss": 0.4234, "step": 32640 }, { - "epoch": 1.15, - "learning_rate": 4.4953893926349896e-05, - "loss": 0.2991, + "epoch": 1.1765235881356544, + "grad_norm": 0.1635637730360031, + "learning_rate": 4.46789225041139e-05, + "loss": 0.3663, "step": 32645 }, { - "epoch": 1.15, - "learning_rate": 4.495217758138012e-05, - "loss": 0.2541, + "epoch": 1.176703787796879, + "grad_norm": 0.18059010803699493, + "learning_rate": 4.467712260296135e-05, + "loss": 0.3952, "step": 32650 }, { - "epoch": 1.15, - "learning_rate": 4.495046097734447e-05, - "loss": 0.2964, + "epoch": 1.1768839874581036, + "grad_norm": 0.18861469626426697, + "learning_rate": 4.467532243371181e-05, + "loss": 0.436, "step": 32655 }, { - "epoch": 1.15, - "learning_rate": 4.494874411426522e-05, - "loss": 0.2943, + "epoch": 1.1770641871193281, + "grad_norm": 0.1615755259990692, + "learning_rate": 4.467352199638982e-05, + "loss": 0.4543, "step": 32660 }, { - "epoch": 1.15, - "learning_rate": 4.4947026992164677e-05, - "loss": 0.2958, + "epoch": 1.1772443867805529, + "grad_norm": 0.19088678061962128, + "learning_rate": 4.46717212910199e-05, + "loss": 0.4356, "step": 32665 }, { - "epoch": 1.15, - "learning_rate": 4.4945309611065136e-05, - "loss": 0.3157, + "epoch": 1.1774245864417776, + "grad_norm": 0.1913667619228363, + "learning_rate": 4.466992031762658e-05, + "loss": 0.4021, "step": 32670 }, { - "epoch": 1.15, - "learning_rate": 4.49435919709889e-05, - "loss": 0.3251, + "epoch": 1.1776047861030021, + "grad_norm": 0.1861899197101593, + "learning_rate": 4.466811907623441e-05, + "loss": 0.4698, "step": 32675 }, { - "epoch": 1.15, - "learning_rate": 4.4941874071958254e-05, - "loss": 0.2771, + "epoch": 1.1777849857642269, + "grad_norm": 0.21820510923862457, + "learning_rate": 4.466631756686792e-05, + "loss": 0.4119, "step": 32680 }, { - "epoch": 1.15, - "learning_rate": 4.494015591399553e-05, - "loss": 0.2911, + "epoch": 1.1779651854254514, + "grad_norm": 0.20619237422943115, + "learning_rate": 4.4664515789551665e-05, + "loss": 0.4625, "step": 32685 }, { - "epoch": 1.15, - "learning_rate": 4.493843749712301e-05, - "loss": 0.3239, + "epoch": 1.178145385086676, + "grad_norm": 0.19361612200737, + "learning_rate": 4.466271374431019e-05, + "loss": 0.4151, "step": 32690 }, { - "epoch": 1.15, - "learning_rate": 4.493671882136302e-05, - "loss": 0.3032, + "epoch": 1.1783255847479006, + "grad_norm": 0.2008073925971985, + "learning_rate": 4.466091143116804e-05, + "loss": 0.4312, "step": 32695 }, { - "epoch": 1.15, - "learning_rate": 4.493499988673788e-05, - "loss": 0.3173, + "epoch": 1.1785057844091253, + "grad_norm": 0.15440912544727325, + "learning_rate": 4.465910885014979e-05, + "loss": 0.3915, "step": 32700 }, { - "epoch": 1.15, - "learning_rate": 4.493328069326991e-05, - "loss": 0.286, + "epoch": 1.1786859840703499, + "grad_norm": 0.37958988547325134, + "learning_rate": 4.4657306001279975e-05, + "loss": 0.4194, "step": 32705 }, { - "epoch": 1.15, - "learning_rate": 4.493156124098141e-05, - "loss": 0.2682, + "epoch": 1.1788661837315746, + "grad_norm": 0.19730152189731598, + "learning_rate": 4.4655502884583177e-05, + "loss": 0.4029, "step": 32710 }, { - "epoch": 1.15, - "learning_rate": 4.4929841529894735e-05, - "loss": 0.2971, + "epoch": 1.1790463833927993, + "grad_norm": 0.19884982705116272, + "learning_rate": 4.465369950008396e-05, + "loss": 0.4183, "step": 32715 }, { - "epoch": 
1.15, - "learning_rate": 4.4928121560032196e-05, - "loss": 0.2826, + "epoch": 1.1792265830540238, + "grad_norm": 0.22939956188201904, + "learning_rate": 4.465189584780689e-05, + "loss": 0.4118, "step": 32720 }, { - "epoch": 1.15, - "learning_rate": 4.4926401331416135e-05, - "loss": 0.2779, + "epoch": 1.1794067827152486, + "grad_norm": 0.19102820754051208, + "learning_rate": 4.465009192777655e-05, + "loss": 0.4277, "step": 32725 }, { - "epoch": 1.15, - "learning_rate": 4.4924680844068887e-05, - "loss": 0.2962, + "epoch": 1.179586982376473, + "grad_norm": 0.19602443277835846, + "learning_rate": 4.4648287740017506e-05, + "loss": 0.4216, "step": 32730 }, { - "epoch": 1.15, - "learning_rate": 4.492296009801279e-05, - "loss": 0.277, + "epoch": 1.1797671820376978, + "grad_norm": 0.1469677835702896, + "learning_rate": 4.464648328455434e-05, + "loss": 0.4232, "step": 32735 }, { - "epoch": 1.15, - "learning_rate": 4.492123909327018e-05, - "loss": 0.3074, + "epoch": 1.1799473816989223, + "grad_norm": 0.15973035991191864, + "learning_rate": 4.464467856141166e-05, + "loss": 0.4254, "step": 32740 }, { - "epoch": 1.15, - "learning_rate": 4.491951782986341e-05, - "loss": 0.3198, + "epoch": 1.180127581360147, + "grad_norm": 0.18778270483016968, + "learning_rate": 4.4642873570614016e-05, + "loss": 0.4131, "step": 32745 }, { - "epoch": 1.15, - "learning_rate": 4.491779630781483e-05, - "loss": 0.3118, + "epoch": 1.1803077810213716, + "grad_norm": 0.16484954953193665, + "learning_rate": 4.464106831218604e-05, + "loss": 0.4222, "step": 32750 }, { - "epoch": 1.15, - "learning_rate": 4.491607452714679e-05, - "loss": 0.298, + "epoch": 1.1804879806825963, + "grad_norm": 0.17252211272716522, + "learning_rate": 4.4639262786152306e-05, + "loss": 0.4331, "step": 32755 }, { - "epoch": 1.15, - "learning_rate": 4.491435248788165e-05, - "loss": 0.293, + "epoch": 1.180668180343821, + "grad_norm": 0.16965141892433167, + "learning_rate": 4.463745699253742e-05, + "loss": 0.4342, "step": 32760 }, { - "epoch": 1.15, - "learning_rate": 4.4912630190041765e-05, - "loss": 0.2902, + "epoch": 1.1808483800050456, + "grad_norm": 0.1783820539712906, + "learning_rate": 4.463565093136598e-05, + "loss": 0.4189, "step": 32765 }, { - "epoch": 1.15, - "learning_rate": 4.49109076336495e-05, - "loss": 0.2658, + "epoch": 1.1810285796662703, + "grad_norm": 0.2075047343969345, + "learning_rate": 4.46338446026626e-05, + "loss": 0.4277, "step": 32770 }, { - "epoch": 1.15, - "learning_rate": 4.4909184818727224e-05, - "loss": 0.2954, + "epoch": 1.1812087793274948, + "grad_norm": 0.21429748833179474, + "learning_rate": 4.463203800645188e-05, + "loss": 0.4372, "step": 32775 }, { - "epoch": 1.15, - "learning_rate": 4.4907461745297294e-05, - "loss": 0.2952, + "epoch": 1.1813889789887195, + "grad_norm": 0.1777476817369461, + "learning_rate": 4.4630231142758455e-05, + "loss": 0.4113, "step": 32780 }, { - "epoch": 1.15, - "learning_rate": 4.49057384133821e-05, - "loss": 0.2875, + "epoch": 1.1815691786499443, + "grad_norm": 0.1785525679588318, + "learning_rate": 4.4628424011606927e-05, + "loss": 0.4065, "step": 32785 }, { - "epoch": 1.15, - "learning_rate": 4.490401482300401e-05, - "loss": 0.3302, + "epoch": 1.1817493783111688, + "grad_norm": 0.18620218336582184, + "learning_rate": 4.462661661302192e-05, + "loss": 0.4236, "step": 32790 }, { - "epoch": 1.15, - "learning_rate": 4.490229097418541e-05, - "loss": 0.32, + "epoch": 1.1819295779723935, + "grad_norm": 0.16326984763145447, + "learning_rate": 4.462480894702806e-05, + "loss": 0.3865, "step": 32795 }, { - 
"epoch": 1.15, - "learning_rate": 4.490056686694867e-05, - "loss": 0.3094, + "epoch": 1.182109777633618, + "grad_norm": 0.16543501615524292, + "learning_rate": 4.462300101364999e-05, + "loss": 0.4208, "step": 32800 }, { - "epoch": 1.15, - "learning_rate": 4.489884250131619e-05, - "loss": 0.3041, + "epoch": 1.1822899772948428, + "grad_norm": 0.1949724704027176, + "learning_rate": 4.4621192812912316e-05, + "loss": 0.4216, "step": 32805 }, { - "epoch": 1.15, - "learning_rate": 4.489711787731036e-05, - "loss": 0.3089, + "epoch": 1.1824701769560673, + "grad_norm": 0.19553053379058838, + "learning_rate": 4.46193843448397e-05, + "loss": 0.4616, "step": 32810 }, { - "epoch": 1.15, - "learning_rate": 4.489539299495356e-05, - "loss": 0.286, + "epoch": 1.182650376617292, + "grad_norm": 0.17806103825569153, + "learning_rate": 4.461757560945676e-05, + "loss": 0.4171, "step": 32815 }, { - "epoch": 1.15, - "learning_rate": 4.4893667854268184e-05, - "loss": 0.3166, + "epoch": 1.1828305762785165, + "grad_norm": 0.21131014823913574, + "learning_rate": 4.461576660678815e-05, + "loss": 0.4127, "step": 32820 }, { - "epoch": 1.15, - "learning_rate": 4.489194245527665e-05, - "loss": 0.308, + "epoch": 1.1830107759397412, + "grad_norm": 0.16493719816207886, + "learning_rate": 4.461395733685853e-05, + "loss": 0.4213, "step": 32825 }, { - "epoch": 1.16, - "learning_rate": 4.489021679800135e-05, - "loss": 0.2992, + "epoch": 1.183190975600966, + "grad_norm": 0.21300533413887024, + "learning_rate": 4.461214779969253e-05, + "loss": 0.4309, "step": 32830 }, { - "epoch": 1.16, - "learning_rate": 4.4888490882464696e-05, - "loss": 0.2923, + "epoch": 1.1833711752621905, + "grad_norm": 0.17048995196819305, + "learning_rate": 4.4610337995314816e-05, + "loss": 0.4518, "step": 32835 }, { - "epoch": 1.16, - "learning_rate": 4.48867647086891e-05, - "loss": 0.312, + "epoch": 1.1835513749234152, + "grad_norm": 0.17380398511886597, + "learning_rate": 4.460852792375004e-05, + "loss": 0.4386, "step": 32840 }, { - "epoch": 1.16, - "learning_rate": 4.488503827669696e-05, - "loss": 0.2999, + "epoch": 1.1837315745846397, + "grad_norm": 0.16541922092437744, + "learning_rate": 4.4606717585022876e-05, + "loss": 0.4223, "step": 32845 }, { - "epoch": 1.16, - "learning_rate": 4.4883311586510706e-05, - "loss": 0.3034, + "epoch": 1.1839117742458645, + "grad_norm": 0.17973947525024414, + "learning_rate": 4.460490697915797e-05, + "loss": 0.4302, "step": 32850 }, { - "epoch": 1.16, - "learning_rate": 4.4881584638152756e-05, - "loss": 0.2815, + "epoch": 1.184091973907089, + "grad_norm": 0.24981901049613953, + "learning_rate": 4.460309610618001e-05, + "loss": 0.4425, "step": 32855 }, { - "epoch": 1.16, - "learning_rate": 4.4879857431645536e-05, - "loss": 0.289, + "epoch": 1.1842721735683137, + "grad_norm": 0.23169773817062378, + "learning_rate": 4.4601284966113656e-05, + "loss": 0.4564, "step": 32860 }, { - "epoch": 1.16, - "learning_rate": 4.4878129967011464e-05, - "loss": 0.3124, + "epoch": 1.1844523732295382, + "grad_norm": 0.15463878214359283, + "learning_rate": 4.4599473558983596e-05, + "loss": 0.3629, "step": 32865 }, { - "epoch": 1.16, - "learning_rate": 4.4876402244272974e-05, - "loss": 0.2979, + "epoch": 1.184632572890763, + "grad_norm": 0.17254050076007843, + "learning_rate": 4.459766188481449e-05, + "loss": 0.414, "step": 32870 }, { - "epoch": 1.16, - "learning_rate": 4.4874674263452496e-05, - "loss": 0.2549, + "epoch": 1.1848127725519877, + "grad_norm": 0.1448955237865448, + "learning_rate": 4.459584994363105e-05, + "loss": 0.4048, "step": 
32875 }, { - "epoch": 1.16, - "learning_rate": 4.4872946024572474e-05, - "loss": 0.3019, + "epoch": 1.1849929722132122, + "grad_norm": 0.15114130079746246, + "learning_rate": 4.459403773545794e-05, + "loss": 0.4154, "step": 32880 }, { - "epoch": 1.16, - "learning_rate": 4.4871217527655344e-05, - "loss": 0.2989, + "epoch": 1.185173171874437, + "grad_norm": 0.16808383166790009, + "learning_rate": 4.459222526031987e-05, + "loss": 0.4226, "step": 32885 }, { - "epoch": 1.16, - "learning_rate": 4.4869488772723555e-05, - "loss": 0.2659, + "epoch": 1.1853533715356614, + "grad_norm": 0.18770961463451385, + "learning_rate": 4.45904125182415e-05, + "loss": 0.4326, "step": 32890 }, { - "epoch": 1.16, - "learning_rate": 4.4867759759799544e-05, - "loss": 0.2789, + "epoch": 1.1855335711968862, + "grad_norm": 0.18759319186210632, + "learning_rate": 4.458859950924758e-05, + "loss": 0.4325, "step": 32895 }, { - "epoch": 1.16, - "learning_rate": 4.4866030488905766e-05, - "loss": 0.2879, + "epoch": 1.1857137708581107, + "grad_norm": 0.19363155961036682, + "learning_rate": 4.458678623336277e-05, + "loss": 0.4474, "step": 32900 }, { - "epoch": 1.16, - "learning_rate": 4.486430096006467e-05, - "loss": 0.3129, + "epoch": 1.1858939705193354, + "grad_norm": 0.13609229028224945, + "learning_rate": 4.4584972690611784e-05, + "loss": 0.3956, "step": 32905 }, { - "epoch": 1.16, - "learning_rate": 4.4862571173298727e-05, - "loss": 0.2921, + "epoch": 1.18607417018056, + "grad_norm": 0.20877060294151306, + "learning_rate": 4.458315888101935e-05, + "loss": 0.4139, "step": 32910 }, { - "epoch": 1.16, - "learning_rate": 4.4860841128630384e-05, - "loss": 0.3155, + "epoch": 1.1862543698417847, + "grad_norm": 0.1942521333694458, + "learning_rate": 4.4581344804610156e-05, + "loss": 0.4022, "step": 32915 }, { - "epoch": 1.16, - "learning_rate": 4.485911082608211e-05, - "loss": 0.2996, + "epoch": 1.1864345695030094, + "grad_norm": 0.19765913486480713, + "learning_rate": 4.457953046140894e-05, + "loss": 0.4402, "step": 32920 }, { - "epoch": 1.16, - "learning_rate": 4.4857380265676363e-05, - "loss": 0.282, + "epoch": 1.186614769164234, + "grad_norm": 0.15261493623256683, + "learning_rate": 4.4577715851440405e-05, + "loss": 0.4131, "step": 32925 }, { - "epoch": 1.16, - "learning_rate": 4.485564944743562e-05, - "loss": 0.2971, + "epoch": 1.1867949688254587, + "grad_norm": 0.17721693217754364, + "learning_rate": 4.4575900974729284e-05, + "loss": 0.434, "step": 32930 }, { - "epoch": 1.16, - "learning_rate": 4.4853918371382365e-05, - "loss": 0.2768, + "epoch": 1.1869751684866832, + "grad_norm": 0.1746007800102234, + "learning_rate": 4.45740858313003e-05, + "loss": 0.4198, "step": 32935 }, { - "epoch": 1.16, - "learning_rate": 4.485218703753905e-05, - "loss": 0.3019, + "epoch": 1.187155368147908, + "grad_norm": 0.18980930745601654, + "learning_rate": 4.45722704211782e-05, + "loss": 0.4456, "step": 32940 }, { - "epoch": 1.16, - "learning_rate": 4.485045544592818e-05, - "loss": 0.2769, + "epoch": 1.1873355678091326, + "grad_norm": 0.15787626802921295, + "learning_rate": 4.457045474438769e-05, + "loss": 0.3863, "step": 32945 }, { - "epoch": 1.16, - "learning_rate": 4.484872359657223e-05, - "loss": 0.2651, + "epoch": 1.1875157674703571, + "grad_norm": 0.2203458696603775, + "learning_rate": 4.4568638800953524e-05, + "loss": 0.3924, "step": 32950 }, { - "epoch": 1.16, - "learning_rate": 4.4846991489493674e-05, - "loss": 0.3109, + "epoch": 1.1876959671315819, + "grad_norm": 0.18546192348003387, + "learning_rate": 4.4566822590900445e-05, + "loss": 
0.4048, "step": 32955 }, { - "epoch": 1.16, - "learning_rate": 4.4845259124715014e-05, - "loss": 0.2813, + "epoch": 1.1878761667928064, + "grad_norm": 0.19496969878673553, + "learning_rate": 4.45650061142532e-05, + "loss": 0.3955, "step": 32960 }, { - "epoch": 1.16, - "learning_rate": 4.4843526502258747e-05, - "loss": 0.2924, + "epoch": 1.1880563664540311, + "grad_norm": 0.1746780127286911, + "learning_rate": 4.456318937103653e-05, + "loss": 0.4654, "step": 32965 }, { - "epoch": 1.16, - "learning_rate": 4.484179362214736e-05, - "loss": 0.2847, + "epoch": 1.1882365661152556, + "grad_norm": 0.17759990692138672, + "learning_rate": 4.45613723612752e-05, + "loss": 0.4105, "step": 32970 }, { - "epoch": 1.16, - "learning_rate": 4.484006048440337e-05, - "loss": 0.2839, + "epoch": 1.1884167657764804, + "grad_norm": 0.21956616640090942, + "learning_rate": 4.455955508499395e-05, + "loss": 0.4118, "step": 32975 }, { - "epoch": 1.16, - "learning_rate": 4.483832708904927e-05, - "loss": 0.2725, + "epoch": 1.1885969654377049, + "grad_norm": 0.1860971599817276, + "learning_rate": 4.455773754221755e-05, + "loss": 0.4431, "step": 32980 }, { - "epoch": 1.16, - "learning_rate": 4.4836593436107555e-05, - "loss": 0.3009, + "epoch": 1.1887771650989296, + "grad_norm": 0.17252309620380402, + "learning_rate": 4.455591973297077e-05, + "loss": 0.4097, "step": 32985 }, { - "epoch": 1.16, - "learning_rate": 4.483485952560076e-05, - "loss": 0.2911, + "epoch": 1.1889573647601543, + "grad_norm": 0.16842429339885712, + "learning_rate": 4.455410165727836e-05, + "loss": 0.4194, "step": 32990 }, { - "epoch": 1.16, - "learning_rate": 4.4833125357551375e-05, - "loss": 0.2847, + "epoch": 1.1891375644213789, + "grad_norm": 0.16480675339698792, + "learning_rate": 4.4552283315165114e-05, + "loss": 0.4398, "step": 32995 }, { - "epoch": 1.16, - "learning_rate": 4.4831390931981934e-05, - "loss": 0.3052, + "epoch": 1.1893177640826036, + "grad_norm": 0.1965269148349762, + "learning_rate": 4.455046470665578e-05, + "loss": 0.435, "step": 33000 }, { - "epoch": 1.16, - "eval_loss": 0.28887951374053955, - "eval_runtime": 10.5389, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 1.1893177640826036, + "eval_loss": 0.4501653015613556, + "eval_runtime": 3.5279, + "eval_samples_per_second": 28.345, + "eval_steps_per_second": 7.086, "step": 33000 }, { - "epoch": 1.16, - "learning_rate": 4.4829656248914945e-05, - "loss": 0.3001, + "epoch": 1.189497963743828, + "grad_norm": 0.16842001676559448, + "learning_rate": 4.454864583177515e-05, + "loss": 0.4186, "step": 33005 }, { - "epoch": 1.16, - "learning_rate": 4.4827921308372945e-05, - "loss": 0.2907, + "epoch": 1.1896781634050528, + "grad_norm": 0.17782749235630035, + "learning_rate": 4.454682669054801e-05, + "loss": 0.4134, "step": 33010 }, { - "epoch": 1.16, - "learning_rate": 4.482618611037845e-05, - "loss": 0.2813, + "epoch": 1.1898583630662773, + "grad_norm": 0.20939677953720093, + "learning_rate": 4.454500728299914e-05, + "loss": 0.4094, "step": 33015 }, { - "epoch": 1.16, - "learning_rate": 4.482445065495399e-05, - "loss": 0.2769, + "epoch": 1.190038562727502, + "grad_norm": 0.22223712503910065, + "learning_rate": 4.454318760915333e-05, + "loss": 0.4279, "step": 33020 }, { - "epoch": 1.16, - "learning_rate": 4.4822714942122105e-05, - "loss": 0.3057, + "epoch": 1.1902187623887266, + "grad_norm": 0.19512085616588593, + "learning_rate": 4.4541367669035373e-05, + "loss": 0.4125, "step": 33025 }, { - "epoch": 1.16, - "learning_rate": 4.482097897190534e-05, - "loss": 
0.2956, + "epoch": 1.1903989620499513, + "grad_norm": 0.17766328155994415, + "learning_rate": 4.453954746267006e-05, + "loss": 0.4323, "step": 33030 }, { - "epoch": 1.16, - "learning_rate": 4.481924274432622e-05, - "loss": 0.282, + "epoch": 1.190579161711176, + "grad_norm": 0.17056851089000702, + "learning_rate": 4.45377269900822e-05, + "loss": 0.4184, "step": 33035 }, { - "epoch": 1.16, - "learning_rate": 4.481750625940729e-05, - "loss": 0.2964, + "epoch": 1.1907593613724006, + "grad_norm": 0.18727493286132812, + "learning_rate": 4.45359062512966e-05, + "loss": 0.42, "step": 33040 }, { - "epoch": 1.16, - "learning_rate": 4.4815769517171105e-05, - "loss": 0.3054, + "epoch": 1.1909395610336253, + "grad_norm": 0.16119909286499023, + "learning_rate": 4.453408524633805e-05, + "loss": 0.3888, "step": 33045 }, { - "epoch": 1.16, - "learning_rate": 4.481403251764021e-05, - "loss": 0.2802, + "epoch": 1.1911197606948498, + "grad_norm": 0.16698874533176422, + "learning_rate": 4.453226397523137e-05, + "loss": 0.4427, "step": 33050 }, { - "epoch": 1.16, - "learning_rate": 4.481229526083715e-05, - "loss": 0.302, + "epoch": 1.1912999603560745, + "grad_norm": 0.1766866147518158, + "learning_rate": 4.453044243800137e-05, + "loss": 0.4434, "step": 33055 }, { - "epoch": 1.16, - "learning_rate": 4.4810557746784514e-05, - "loss": 0.2758, + "epoch": 1.1914801600172993, + "grad_norm": 0.19949309527873993, + "learning_rate": 4.452862063467289e-05, + "loss": 0.4333, "step": 33060 }, { - "epoch": 1.16, - "learning_rate": 4.4808819975504835e-05, - "loss": 0.2762, + "epoch": 1.1916603596785238, + "grad_norm": 0.18705400824546814, + "learning_rate": 4.452679856527072e-05, + "loss": 0.4109, "step": 33065 }, { - "epoch": 1.16, - "learning_rate": 4.480708194702068e-05, - "loss": 0.3195, + "epoch": 1.1918405593397485, + "grad_norm": 0.19949448108673096, + "learning_rate": 4.45249762298197e-05, + "loss": 0.3773, "step": 33070 }, { - "epoch": 1.16, - "learning_rate": 4.480534366135462e-05, - "loss": 0.2812, + "epoch": 1.192020759000973, + "grad_norm": 0.1769210696220398, + "learning_rate": 4.452315362834467e-05, + "loss": 0.4424, "step": 33075 }, { - "epoch": 1.16, - "learning_rate": 4.480360511852922e-05, - "loss": 0.2912, + "epoch": 1.1922009586621978, + "grad_norm": 0.19094079732894897, + "learning_rate": 4.452133076087045e-05, + "loss": 0.3962, "step": 33080 }, { - "epoch": 1.16, - "learning_rate": 4.480186631856707e-05, - "loss": 0.2869, + "epoch": 1.1923811583234223, + "grad_norm": 0.2079572230577469, + "learning_rate": 4.4519507627421873e-05, + "loss": 0.4552, "step": 33085 }, { - "epoch": 1.16, - "learning_rate": 4.480012726149072e-05, - "loss": 0.2889, + "epoch": 1.192561357984647, + "grad_norm": 0.24924804270267487, + "learning_rate": 4.451768422802378e-05, + "loss": 0.3995, "step": 33090 }, { - "epoch": 1.16, - "learning_rate": 4.479838794732277e-05, - "loss": 0.302, + "epoch": 1.1927415576458715, + "grad_norm": 0.1824015974998474, + "learning_rate": 4.451586056270103e-05, + "loss": 0.4448, "step": 33095 }, { - "epoch": 1.16, - "learning_rate": 4.4796648376085815e-05, - "loss": 0.2824, + "epoch": 1.1929217573070963, + "grad_norm": 0.1912459284067154, + "learning_rate": 4.4514036631478444e-05, + "loss": 0.4076, "step": 33100 }, { - "epoch": 1.16, - "learning_rate": 4.479490854780242e-05, - "loss": 0.2879, + "epoch": 1.193101956968321, + "grad_norm": 0.1705092191696167, + "learning_rate": 4.4512212434380894e-05, + "loss": 0.4206, "step": 33105 }, { - "epoch": 1.16, - "learning_rate": 4.479316846249517e-05, - 
"loss": 0.31, + "epoch": 1.1932821566295455, + "grad_norm": 0.17675426602363586, + "learning_rate": 4.4510387971433225e-05, + "loss": 0.4243, "step": 33110 }, { - "epoch": 1.17, - "learning_rate": 4.479142812018669e-05, - "loss": 0.2899, + "epoch": 1.1934623562907702, + "grad_norm": 0.21491315960884094, + "learning_rate": 4.45085632426603e-05, + "loss": 0.4422, "step": 33115 }, { - "epoch": 1.17, - "learning_rate": 4.478968752089955e-05, - "loss": 0.3027, + "epoch": 1.1936425559519948, + "grad_norm": 0.2086547613143921, + "learning_rate": 4.4506738248086974e-05, + "loss": 0.4128, "step": 33120 }, { - "epoch": 1.17, - "learning_rate": 4.478794666465636e-05, - "loss": 0.2736, + "epoch": 1.1938227556132195, + "grad_norm": 0.1773192584514618, + "learning_rate": 4.4504912987738124e-05, + "loss": 0.4244, "step": 33125 }, { - "epoch": 1.17, - "learning_rate": 4.478620555147972e-05, - "loss": 0.2686, + "epoch": 1.194002955274444, + "grad_norm": 0.19399385154247284, + "learning_rate": 4.45030874616386e-05, + "loss": 0.4028, "step": 33130 }, { - "epoch": 1.17, - "learning_rate": 4.478446418139224e-05, - "loss": 0.3045, + "epoch": 1.1941831549356687, + "grad_norm": 0.22621554136276245, + "learning_rate": 4.45012616698133e-05, + "loss": 0.4292, "step": 33135 }, { - "epoch": 1.17, - "learning_rate": 4.478272255441653e-05, - "loss": 0.273, + "epoch": 1.1943633545968932, + "grad_norm": 0.17529787123203278, + "learning_rate": 4.449943561228707e-05, + "loss": 0.3879, "step": 33140 }, { - "epoch": 1.17, - "learning_rate": 4.478098067057522e-05, - "loss": 0.2836, + "epoch": 1.194543554258118, + "grad_norm": 0.18595170974731445, + "learning_rate": 4.449760928908481e-05, + "loss": 0.4049, "step": 33145 }, { - "epoch": 1.17, - "learning_rate": 4.47792385298909e-05, - "loss": 0.2809, + "epoch": 1.1947237539193427, + "grad_norm": 0.17124305665493011, + "learning_rate": 4.44957827002314e-05, + "loss": 0.3847, "step": 33150 }, { - "epoch": 1.17, - "learning_rate": 4.477749613238621e-05, - "loss": 0.3096, + "epoch": 1.1949039535805672, + "grad_norm": 0.1618146151304245, + "learning_rate": 4.449395584575172e-05, + "loss": 0.4069, "step": 33155 }, { - "epoch": 1.17, - "learning_rate": 4.4775753478083756e-05, - "loss": 0.3059, + "epoch": 1.195084153241792, + "grad_norm": 0.15872132778167725, + "learning_rate": 4.449212872567068e-05, + "loss": 0.4154, "step": 33160 }, { - "epoch": 1.17, - "learning_rate": 4.477401056700618e-05, - "loss": 0.2698, + "epoch": 1.1952643529030165, + "grad_norm": 0.18358559906482697, + "learning_rate": 4.4490301340013146e-05, + "loss": 0.4185, "step": 33165 }, { - "epoch": 1.17, - "learning_rate": 4.477226739917611e-05, - "loss": 0.2845, + "epoch": 1.1954445525642412, + "grad_norm": 0.139307901263237, + "learning_rate": 4.4488473688804034e-05, + "loss": 0.3846, "step": 33170 }, { - "epoch": 1.17, - "learning_rate": 4.477052397461617e-05, - "loss": 0.3061, + "epoch": 1.195624752225466, + "grad_norm": 0.16078798472881317, + "learning_rate": 4.448664577206824e-05, + "loss": 0.4072, "step": 33175 }, { - "epoch": 1.17, - "learning_rate": 4.476878029334902e-05, - "loss": 0.2727, + "epoch": 1.1958049518866904, + "grad_norm": 0.20551976561546326, + "learning_rate": 4.448481758983067e-05, + "loss": 0.4151, "step": 33180 }, { - "epoch": 1.17, - "learning_rate": 4.4767036355397276e-05, - "loss": 0.2941, + "epoch": 1.1959851515479152, + "grad_norm": 0.17032408714294434, + "learning_rate": 4.4482989142116236e-05, + "loss": 0.4053, "step": 33185 }, { - "epoch": 1.17, - "learning_rate": 
4.47652921607836e-05, - "loss": 0.284, + "epoch": 1.1961653512091397, + "grad_norm": 0.19917094707489014, + "learning_rate": 4.4481160428949845e-05, + "loss": 0.3863, "step": 33190 }, { - "epoch": 1.17, - "learning_rate": 4.476354770953062e-05, - "loss": 0.2993, + "epoch": 1.1963455508703644, + "grad_norm": 0.17138609290122986, + "learning_rate": 4.447933145035642e-05, + "loss": 0.4144, "step": 33195 }, { - "epoch": 1.17, - "learning_rate": 4.476180300166101e-05, - "loss": 0.2959, + "epoch": 1.196525750531589, + "grad_norm": 0.16464346647262573, + "learning_rate": 4.447750220636086e-05, + "loss": 0.4573, "step": 33200 }, { - "epoch": 1.17, - "learning_rate": 4.476005803719741e-05, - "loss": 0.2831, + "epoch": 1.1967059501928137, + "grad_norm": 0.18972603976726532, + "learning_rate": 4.4475672696988117e-05, + "loss": 0.4093, "step": 33205 }, { - "epoch": 1.17, - "learning_rate": 4.4758312816162466e-05, - "loss": 0.3011, + "epoch": 1.1968861498540382, + "grad_norm": 0.1480892151594162, + "learning_rate": 4.44738429222631e-05, + "loss": 0.3913, "step": 33210 }, { - "epoch": 1.17, - "learning_rate": 4.4756567338578856e-05, - "loss": 0.3106, + "epoch": 1.197066349515263, + "grad_norm": 0.1947953850030899, + "learning_rate": 4.4472012882210744e-05, + "loss": 0.4281, "step": 33215 }, { - "epoch": 1.17, - "learning_rate": 4.4754821604469245e-05, - "loss": 0.2985, + "epoch": 1.1972465491764877, + "grad_norm": 0.1418490707874298, + "learning_rate": 4.4470182576855984e-05, + "loss": 0.4228, "step": 33220 }, { - "epoch": 1.17, - "learning_rate": 4.47530756138563e-05, - "loss": 0.3007, + "epoch": 1.1974267488377122, + "grad_norm": 0.21313992142677307, + "learning_rate": 4.446835200622376e-05, + "loss": 0.4354, "step": 33225 }, { - "epoch": 1.17, - "learning_rate": 4.4751329366762677e-05, - "loss": 0.3026, + "epoch": 1.197606948498937, + "grad_norm": 0.16371306777000427, + "learning_rate": 4.4466521170339e-05, + "loss": 0.4339, "step": 33230 }, { - "epoch": 1.17, - "learning_rate": 4.474958286321105e-05, - "loss": 0.2842, + "epoch": 1.1977871481601614, + "grad_norm": 0.17602042853832245, + "learning_rate": 4.446469006922666e-05, + "loss": 0.4083, "step": 33235 }, { - "epoch": 1.17, - "learning_rate": 4.4747836103224115e-05, - "loss": 0.2882, + "epoch": 1.1979673478213861, + "grad_norm": 0.2149427831172943, + "learning_rate": 4.446285870291168e-05, + "loss": 0.4426, "step": 33240 }, { - "epoch": 1.17, - "learning_rate": 4.474608908682455e-05, - "loss": 0.2853, + "epoch": 1.1981475474826107, + "grad_norm": 0.19372370839118958, + "learning_rate": 4.4461027071419034e-05, + "loss": 0.4215, "step": 33245 }, { - "epoch": 1.17, - "learning_rate": 4.474434181403502e-05, - "loss": 0.3047, + "epoch": 1.1983277471438354, + "grad_norm": 0.1823178380727768, + "learning_rate": 4.445919517477365e-05, + "loss": 0.4066, "step": 33250 }, { - "epoch": 1.17, - "learning_rate": 4.4742594284878225e-05, - "loss": 0.2728, + "epoch": 1.19850794680506, + "grad_norm": 0.2029699683189392, + "learning_rate": 4.445736301300051e-05, + "loss": 0.4223, "step": 33255 }, { - "epoch": 1.17, - "learning_rate": 4.4740846499376856e-05, - "loss": 0.2977, + "epoch": 1.1986881464662846, + "grad_norm": 0.16662827134132385, + "learning_rate": 4.445553058612455e-05, + "loss": 0.4364, "step": 33260 }, { - "epoch": 1.17, - "learning_rate": 4.4739098457553606e-05, - "loss": 0.3427, + "epoch": 1.1988683461275094, + "grad_norm": 0.1548469513654709, + "learning_rate": 4.445369789417077e-05, + "loss": 0.4124, "step": 33265 }, { - "epoch": 1.17, - 
"learning_rate": 4.473735015943117e-05, - "loss": 0.2878, + "epoch": 1.1990485457887339, + "grad_norm": 0.15706267952919006, + "learning_rate": 4.445186493716411e-05, + "loss": 0.3692, "step": 33270 }, { - "epoch": 1.17, - "learning_rate": 4.473560160503225e-05, - "loss": 0.2875, + "epoch": 1.1992287454499586, + "grad_norm": 0.19710016250610352, + "learning_rate": 4.4450031715129556e-05, + "loss": 0.4461, "step": 33275 }, { - "epoch": 1.17, - "learning_rate": 4.473385279437955e-05, - "loss": 0.2912, + "epoch": 1.1994089451111831, + "grad_norm": 0.16916310787200928, + "learning_rate": 4.4448198228092095e-05, + "loss": 0.3842, "step": 33280 }, { - "epoch": 1.17, - "learning_rate": 4.4732103727495775e-05, - "loss": 0.2663, + "epoch": 1.1995891447724079, + "grad_norm": 0.24299342930316925, + "learning_rate": 4.444636447607669e-05, + "loss": 0.4048, "step": 33285 }, { - "epoch": 1.17, - "learning_rate": 4.473035440440364e-05, - "loss": 0.2863, + "epoch": 1.1997693444336326, + "grad_norm": 0.1836657077074051, + "learning_rate": 4.444453045910834e-05, + "loss": 0.4123, "step": 33290 }, { - "epoch": 1.17, - "learning_rate": 4.472860482512585e-05, - "loss": 0.2773, + "epoch": 1.199949544094857, + "grad_norm": 0.2232903391122818, + "learning_rate": 4.444269617721202e-05, + "loss": 0.4128, "step": 33295 }, { - "epoch": 1.17, - "learning_rate": 4.472685498968513e-05, - "loss": 0.2813, + "epoch": 1.2001297437560818, + "grad_norm": 0.150344580411911, + "learning_rate": 4.444086163041273e-05, + "loss": 0.4184, "step": 33300 }, { - "epoch": 1.17, - "learning_rate": 4.47251048981042e-05, - "loss": 0.302, + "epoch": 1.2003099434173063, + "grad_norm": 0.16514082252979279, + "learning_rate": 4.443902681873547e-05, + "loss": 0.4122, "step": 33305 }, { - "epoch": 1.17, - "learning_rate": 4.4723354550405784e-05, - "loss": 0.2719, + "epoch": 1.200490143078531, + "grad_norm": 0.16925464570522308, + "learning_rate": 4.443719174220523e-05, + "loss": 0.4041, "step": 33310 }, { - "epoch": 1.17, - "learning_rate": 4.4721603946612604e-05, - "loss": 0.2746, + "epoch": 1.2006703427397556, + "grad_norm": 0.2036067545413971, + "learning_rate": 4.443535640084702e-05, + "loss": 0.4335, "step": 33315 }, { - "epoch": 1.17, - "learning_rate": 4.47198530867474e-05, - "loss": 0.3027, + "epoch": 1.2008505424009803, + "grad_norm": 0.19253499805927277, + "learning_rate": 4.443352079468583e-05, + "loss": 0.4581, "step": 33320 }, { - "epoch": 1.17, - "learning_rate": 4.4718101970832895e-05, - "loss": 0.2748, + "epoch": 1.2010307420622048, + "grad_norm": 0.16766758263111115, + "learning_rate": 4.4431684923746695e-05, + "loss": 0.4244, "step": 33325 }, { - "epoch": 1.17, - "learning_rate": 4.4716350598891834e-05, - "loss": 0.2948, + "epoch": 1.2012109417234296, + "grad_norm": 0.14643500745296478, + "learning_rate": 4.442984878805461e-05, + "loss": 0.4007, "step": 33330 }, { - "epoch": 1.17, - "learning_rate": 4.4714598970946944e-05, - "loss": 0.3105, + "epoch": 1.2013911413846543, + "grad_norm": 0.19802068173885345, + "learning_rate": 4.44280123876346e-05, + "loss": 0.4579, "step": 33335 }, { - "epoch": 1.17, - "learning_rate": 4.471284708702098e-05, - "loss": 0.3052, + "epoch": 1.2015713410458788, + "grad_norm": 0.2025717943906784, + "learning_rate": 4.4426175722511674e-05, + "loss": 0.4056, "step": 33340 }, { - "epoch": 1.17, - "learning_rate": 4.47110949471367e-05, - "loss": 0.3049, + "epoch": 1.2017515407071035, + "grad_norm": 0.181138813495636, + "learning_rate": 4.442433879271087e-05, + "loss": 0.4282, "step": 33345 }, { - "epoch": 
1.17, - "learning_rate": 4.4709342551316836e-05, - "loss": 0.2955, + "epoch": 1.201931740368328, + "grad_norm": 0.1720162183046341, + "learning_rate": 4.4422501598257216e-05, + "loss": 0.4089, "step": 33350 }, { - "epoch": 1.17, - "learning_rate": 4.470758989958415e-05, - "loss": 0.276, + "epoch": 1.2021119400295528, + "grad_norm": 0.23699156939983368, + "learning_rate": 4.442066413917573e-05, + "loss": 0.4118, "step": 33355 }, { - "epoch": 1.17, - "learning_rate": 4.470583699196139e-05, - "loss": 0.2762, + "epoch": 1.2022921396907773, + "grad_norm": 0.16807523369789124, + "learning_rate": 4.441882641549145e-05, + "loss": 0.4238, "step": 33360 }, { - "epoch": 1.17, - "learning_rate": 4.470408382847133e-05, - "loss": 0.3054, + "epoch": 1.202472339352002, + "grad_norm": 0.16875354945659637, + "learning_rate": 4.441698842722943e-05, + "loss": 0.41, "step": 33365 }, { - "epoch": 1.17, - "learning_rate": 4.4702330409136716e-05, - "loss": 0.2919, + "epoch": 1.2026525390132266, + "grad_norm": 0.15263767540454865, + "learning_rate": 4.44151501744147e-05, + "loss": 0.3937, "step": 33370 }, { - "epoch": 1.17, - "learning_rate": 4.470057673398034e-05, - "loss": 0.2929, + "epoch": 1.2028327386744513, + "grad_norm": 0.19248110055923462, + "learning_rate": 4.441331165707231e-05, + "loss": 0.4074, "step": 33375 }, { - "epoch": 1.17, - "learning_rate": 4.469882280302495e-05, - "loss": 0.3022, + "epoch": 1.203012938335676, + "grad_norm": 0.1765611320734024, + "learning_rate": 4.44114728752273e-05, + "loss": 0.4095, "step": 33380 }, { - "epoch": 1.17, - "learning_rate": 4.469706861629333e-05, - "loss": 0.3106, + "epoch": 1.2031931379969005, + "grad_norm": 0.1671658307313919, + "learning_rate": 4.440963382890474e-05, + "loss": 0.4064, "step": 33385 }, { - "epoch": 1.17, - "learning_rate": 4.469531417380826e-05, - "loss": 0.2885, + "epoch": 1.2033733376581253, + "grad_norm": 0.1801038384437561, + "learning_rate": 4.440779451812967e-05, + "loss": 0.391, "step": 33390 }, { - "epoch": 1.17, - "learning_rate": 4.46935594755925e-05, - "loss": 0.2992, + "epoch": 1.2035535373193498, + "grad_norm": 0.18195895850658417, + "learning_rate": 4.4405954942927155e-05, + "loss": 0.4165, "step": 33395 }, { - "epoch": 1.18, - "learning_rate": 4.4691804521668865e-05, - "loss": 0.2822, + "epoch": 1.2037337369805745, + "grad_norm": 0.22114919126033783, + "learning_rate": 4.440411510332226e-05, + "loss": 0.4255, "step": 33400 }, { - "epoch": 1.18, - "learning_rate": 4.4690049312060126e-05, - "loss": 0.3032, + "epoch": 1.203913936641799, + "grad_norm": 0.18893185257911682, + "learning_rate": 4.4402274999340065e-05, + "loss": 0.4319, "step": 33405 }, { - "epoch": 1.18, - "learning_rate": 4.468829384678907e-05, - "loss": 0.2915, + "epoch": 1.2040941363030238, + "grad_norm": 0.19673962891101837, + "learning_rate": 4.4400434631005626e-05, + "loss": 0.4201, "step": 33410 }, { - "epoch": 1.18, - "learning_rate": 4.4686538125878485e-05, - "loss": 0.2831, + "epoch": 1.2042743359642483, + "grad_norm": 0.21751564741134644, + "learning_rate": 4.439859399834402e-05, + "loss": 0.4748, "step": 33415 }, { - "epoch": 1.18, - "learning_rate": 4.468478214935118e-05, - "loss": 0.2915, + "epoch": 1.204454535625473, + "grad_norm": 0.17174293100833893, + "learning_rate": 4.4396753101380316e-05, + "loss": 0.3964, "step": 33420 }, { - "epoch": 1.18, - "learning_rate": 4.4683025917229956e-05, - "loss": 0.3001, + "epoch": 1.2046347352866977, + "grad_norm": 0.18520551919937134, + "learning_rate": 4.4394911940139616e-05, + "loss": 0.4367, "step": 33425 }, { - 
"epoch": 1.18, - "learning_rate": 4.468126942953761e-05, - "loss": 0.3102, + "epoch": 1.2048149349479222, + "grad_norm": 0.16532555222511292, + "learning_rate": 4.4393070514647e-05, + "loss": 0.4135, "step": 33430 }, { - "epoch": 1.18, - "learning_rate": 4.467951268629695e-05, - "loss": 0.2899, + "epoch": 1.204995134609147, + "grad_norm": 0.1470910906791687, + "learning_rate": 4.439122882492754e-05, + "loss": 0.4098, "step": 33435 }, { - "epoch": 1.18, - "learning_rate": 4.4677755687530787e-05, - "loss": 0.2786, + "epoch": 1.2051753342703715, + "grad_norm": 0.18562817573547363, + "learning_rate": 4.438938687100636e-05, + "loss": 0.4498, "step": 33440 }, { - "epoch": 1.18, - "learning_rate": 4.467599843326193e-05, - "loss": 0.2695, + "epoch": 1.2053555339315962, + "grad_norm": 0.14606645703315735, + "learning_rate": 4.438754465290852e-05, + "loss": 0.4095, "step": 33445 }, { - "epoch": 1.18, - "learning_rate": 4.467424092351321e-05, - "loss": 0.3129, + "epoch": 1.205535733592821, + "grad_norm": 0.15203700959682465, + "learning_rate": 4.4385702170659144e-05, + "loss": 0.3668, "step": 33450 }, { - "epoch": 1.18, - "learning_rate": 4.467248315830743e-05, - "loss": 0.273, + "epoch": 1.2057159332540455, + "grad_norm": 0.17846938967704773, + "learning_rate": 4.4383859424283325e-05, + "loss": 0.4517, "step": 33455 }, { - "epoch": 1.18, - "learning_rate": 4.4670725137667425e-05, - "loss": 0.2893, + "epoch": 1.2058961329152702, + "grad_norm": 0.16596698760986328, + "learning_rate": 4.438201641380618e-05, + "loss": 0.3891, "step": 33460 }, { - "epoch": 1.18, - "learning_rate": 4.466896686161601e-05, - "loss": 0.2975, + "epoch": 1.2060763325764947, + "grad_norm": 0.14986424148082733, + "learning_rate": 4.438017313925281e-05, + "loss": 0.3901, "step": 33465 }, { - "epoch": 1.18, - "learning_rate": 4.466720833017604e-05, - "loss": 0.2891, + "epoch": 1.2062565322377194, + "grad_norm": 0.18405571579933167, + "learning_rate": 4.4378329600648337e-05, + "loss": 0.4215, "step": 33470 }, { - "epoch": 1.18, - "learning_rate": 4.4665449543370316e-05, - "loss": 0.2937, + "epoch": 1.206436731898944, + "grad_norm": 0.16309815645217896, + "learning_rate": 4.437648579801788e-05, + "loss": 0.4015, "step": 33475 }, { - "epoch": 1.18, - "learning_rate": 4.466369050122169e-05, - "loss": 0.2957, + "epoch": 1.2066169315601687, + "grad_norm": 0.16883867979049683, + "learning_rate": 4.437464173138655e-05, + "loss": 0.4407, "step": 33480 }, { - "epoch": 1.18, - "learning_rate": 4.466193120375301e-05, - "loss": 0.2962, + "epoch": 1.2067971312213932, + "grad_norm": 0.18659423291683197, + "learning_rate": 4.437279740077947e-05, + "loss": 0.4177, "step": 33485 }, { - "epoch": 1.18, - "learning_rate": 4.46601716509871e-05, - "loss": 0.3038, + "epoch": 1.206977330882618, + "grad_norm": 0.2079888880252838, + "learning_rate": 4.437095280622178e-05, + "loss": 0.4201, "step": 33490 }, { - "epoch": 1.18, - "learning_rate": 4.465841184294682e-05, - "loss": 0.2816, + "epoch": 1.2071575305438427, + "grad_norm": 0.15758785605430603, + "learning_rate": 4.4369107947738606e-05, + "loss": 0.4037, "step": 33495 }, { - "epoch": 1.18, - "learning_rate": 4.465665177965502e-05, - "loss": 0.3047, + "epoch": 1.2073377302050672, + "grad_norm": 0.18078656494617462, + "learning_rate": 4.436726282535509e-05, + "loss": 0.4328, "step": 33500 }, { - "epoch": 1.18, - "eval_loss": 0.2884669601917267, - "eval_runtime": 10.5481, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 1.2073377302050672, + "eval_loss": 0.45027703046798706, 
+ "eval_runtime": 3.5418, + "eval_samples_per_second": 28.234, + "eval_steps_per_second": 7.058, "step": 33500 }, { - "epoch": 1.18, - "learning_rate": 4.4654891461134556e-05, - "loss": 0.2963, + "epoch": 1.207517929866292, + "grad_norm": 0.18480472266674042, + "learning_rate": 4.436541743909637e-05, + "loss": 0.4078, "step": 33505 }, { - "epoch": 1.18, - "learning_rate": 4.465313088740828e-05, - "loss": 0.2943, + "epoch": 1.2076981295275164, + "grad_norm": 0.24424877762794495, + "learning_rate": 4.436357178898759e-05, + "loss": 0.4427, "step": 33510 }, { - "epoch": 1.18, - "learning_rate": 4.4651370058499045e-05, - "loss": 0.3003, + "epoch": 1.2078783291887412, + "grad_norm": 0.15822868049144745, + "learning_rate": 4.436172587505389e-05, + "loss": 0.4074, "step": 33515 }, { - "epoch": 1.18, - "learning_rate": 4.464960897442973e-05, - "loss": 0.2907, + "epoch": 1.2080585288499657, + "grad_norm": 0.1882435828447342, + "learning_rate": 4.435987969732042e-05, + "loss": 0.4272, "step": 33520 }, { - "epoch": 1.18, - "learning_rate": 4.4647847635223175e-05, - "loss": 0.2912, + "epoch": 1.2082387285111904, + "grad_norm": 0.18066862225532532, + "learning_rate": 4.4358033255812334e-05, + "loss": 0.3978, "step": 33525 }, { - "epoch": 1.18, - "learning_rate": 4.464608604090228e-05, - "loss": 0.2866, + "epoch": 1.208418928172415, + "grad_norm": 0.18334537744522095, + "learning_rate": 4.43561865505548e-05, + "loss": 0.4683, "step": 33530 }, { - "epoch": 1.18, - "learning_rate": 4.46443241914899e-05, - "loss": 0.2882, + "epoch": 1.2085991278336397, + "grad_norm": 0.17291270196437836, + "learning_rate": 4.4354339581572974e-05, + "loss": 0.4201, "step": 33535 }, { - "epoch": 1.18, - "learning_rate": 4.464256208700892e-05, - "loss": 0.2638, + "epoch": 1.2087793274948644, + "grad_norm": 0.19278649985790253, + "learning_rate": 4.4352492348892015e-05, + "loss": 0.4244, "step": 33540 }, { - "epoch": 1.18, - "learning_rate": 4.464079972748221e-05, - "loss": 0.3069, + "epoch": 1.208959527156089, + "grad_norm": 0.17571572959423065, + "learning_rate": 4.435064485253709e-05, + "loss": 0.4039, "step": 33545 }, { - "epoch": 1.18, - "learning_rate": 4.463903711293267e-05, - "loss": 0.283, + "epoch": 1.2091397268173136, + "grad_norm": 0.21595270931720734, + "learning_rate": 4.434879709253338e-05, + "loss": 0.4619, "step": 33550 }, { - "epoch": 1.18, - "learning_rate": 4.463727424338316e-05, - "loss": 0.2771, + "epoch": 1.2093199264785381, + "grad_norm": 0.16592943668365479, + "learning_rate": 4.434694906890605e-05, + "loss": 0.4259, "step": 33555 }, { - "epoch": 1.18, - "learning_rate": 4.46355111188566e-05, - "loss": 0.2906, + "epoch": 1.2095001261397629, + "grad_norm": 0.1708192229270935, + "learning_rate": 4.434510078168029e-05, + "loss": 0.4389, "step": 33560 }, { - "epoch": 1.18, - "learning_rate": 4.463374773937587e-05, - "loss": 0.3029, + "epoch": 1.2096803258009876, + "grad_norm": 0.13730975985527039, + "learning_rate": 4.434325223088128e-05, + "loss": 0.3832, "step": 33565 }, { - "epoch": 1.18, - "learning_rate": 4.4631984104963854e-05, - "loss": 0.2964, + "epoch": 1.2098605254622121, + "grad_norm": 0.1926315575838089, + "learning_rate": 4.43414034165342e-05, + "loss": 0.3929, "step": 33570 }, { - "epoch": 1.18, - "learning_rate": 4.463022021564346e-05, - "loss": 0.2757, + "epoch": 1.2100407251234369, + "grad_norm": 0.16301114857196808, + "learning_rate": 4.433955433866424e-05, + "loss": 0.3981, "step": 33575 }, { - "epoch": 1.18, - "learning_rate": 4.46284560714376e-05, - "loss": 0.2749, + "epoch": 
1.2102209247846614, + "grad_norm": 0.16956037282943726, + "learning_rate": 4.4337704997296604e-05, + "loss": 0.4255, "step": 33580 }, { - "epoch": 1.18, - "learning_rate": 4.462669167236917e-05, - "loss": 0.3006, + "epoch": 1.210401124445886, + "grad_norm": 0.23975764214992523, + "learning_rate": 4.4335855392456474e-05, + "loss": 0.4243, "step": 33585 }, { - "epoch": 1.18, - "learning_rate": 4.462492701846109e-05, - "loss": 0.2939, + "epoch": 1.2105813241071106, + "grad_norm": 0.1944325715303421, + "learning_rate": 4.4334005524169066e-05, + "loss": 0.4094, "step": 33590 }, { - "epoch": 1.18, - "learning_rate": 4.462316210973626e-05, - "loss": 0.2729, + "epoch": 1.2107615237683353, + "grad_norm": 0.17085929214954376, + "learning_rate": 4.433215539245956e-05, + "loss": 0.4084, "step": 33595 }, { - "epoch": 1.18, - "learning_rate": 4.46213969462176e-05, - "loss": 0.2856, + "epoch": 1.2109417234295599, + "grad_norm": 0.21070963144302368, + "learning_rate": 4.43303049973532e-05, + "loss": 0.4491, "step": 33600 }, { - "epoch": 1.18, - "learning_rate": 4.4619631527928043e-05, - "loss": 0.3168, + "epoch": 1.2111219230907846, + "grad_norm": 0.158106729388237, + "learning_rate": 4.4328454338875156e-05, + "loss": 0.3992, "step": 33605 }, { - "epoch": 1.18, - "learning_rate": 4.4617865854890495e-05, - "loss": 0.2675, + "epoch": 1.2113021227520093, + "grad_norm": 0.19057874381542206, + "learning_rate": 4.4326603417050676e-05, + "loss": 0.3716, "step": 33610 }, { - "epoch": 1.18, - "learning_rate": 4.461609992712788e-05, - "loss": 0.2795, + "epoch": 1.2114823224132338, + "grad_norm": 0.1798359900712967, + "learning_rate": 4.4324752231904965e-05, + "loss": 0.4243, "step": 33615 }, { - "epoch": 1.18, - "learning_rate": 4.461433374466314e-05, - "loss": 0.2885, + "epoch": 1.2116625220744586, + "grad_norm": 0.20672789216041565, + "learning_rate": 4.432290078346324e-05, + "loss": 0.4384, "step": 33620 }, { - "epoch": 1.18, - "learning_rate": 4.4612567307519205e-05, - "loss": 0.2828, + "epoch": 1.211842721735683, + "grad_norm": 0.18144282698631287, + "learning_rate": 4.4321049071750743e-05, + "loss": 0.3966, "step": 33625 }, { - "epoch": 1.18, - "learning_rate": 4.4610800615719007e-05, - "loss": 0.299, + "epoch": 1.2120229213969078, + "grad_norm": 0.2156475931406021, + "learning_rate": 4.431919709679269e-05, + "loss": 0.4488, "step": 33630 }, { - "epoch": 1.18, - "learning_rate": 4.4609033669285485e-05, - "loss": 0.2943, + "epoch": 1.2122031210581323, + "grad_norm": 0.1874171942472458, + "learning_rate": 4.431734485861432e-05, + "loss": 0.4277, "step": 33635 }, { - "epoch": 1.18, - "learning_rate": 4.460726646824159e-05, - "loss": 0.2989, + "epoch": 1.212383320719357, + "grad_norm": 0.20030780136585236, + "learning_rate": 4.431549235724086e-05, + "loss": 0.3958, "step": 33640 }, { - "epoch": 1.18, - "learning_rate": 4.460549901261025e-05, - "loss": 0.2855, + "epoch": 1.2125635203805816, + "grad_norm": 0.16703949868679047, + "learning_rate": 4.431363959269755e-05, + "loss": 0.4372, "step": 33645 }, { - "epoch": 1.18, - "learning_rate": 4.4603731302414445e-05, - "loss": 0.2958, + "epoch": 1.2127437200418063, + "grad_norm": 0.1703837811946869, + "learning_rate": 4.431178656500965e-05, + "loss": 0.404, "step": 33650 }, { - "epoch": 1.18, - "learning_rate": 4.460196333767709e-05, - "loss": 0.2847, + "epoch": 1.212923919703031, + "grad_norm": 0.1517026126384735, + "learning_rate": 4.43099332742024e-05, + "loss": 0.4263, "step": 33655 }, { - "epoch": 1.18, - "learning_rate": 4.460019511842117e-05, - "loss": 0.2869, + 
"epoch": 1.2131041193642556, + "grad_norm": 0.21069294214248657, + "learning_rate": 4.430807972030104e-05, + "loss": 0.3994, "step": 33660 }, { - "epoch": 1.18, - "learning_rate": 4.459842664466962e-05, - "loss": 0.2747, + "epoch": 1.2132843190254803, + "grad_norm": 0.1706804782152176, + "learning_rate": 4.430622590333083e-05, + "loss": 0.4113, "step": 33665 }, { - "epoch": 1.18, - "learning_rate": 4.459665791644543e-05, - "loss": 0.2878, + "epoch": 1.2134645186867048, + "grad_norm": 0.20271262526512146, + "learning_rate": 4.430437182331704e-05, + "loss": 0.4189, "step": 33670 }, { - "epoch": 1.18, - "learning_rate": 4.459488893377155e-05, - "loss": 0.3042, + "epoch": 1.2136447183479295, + "grad_norm": 0.1703236848115921, + "learning_rate": 4.430251748028492e-05, + "loss": 0.4197, "step": 33675 }, { - "epoch": 1.18, - "learning_rate": 4.459311969667095e-05, - "loss": 0.2931, + "epoch": 1.2138249180091543, + "grad_norm": 0.16944654285907745, + "learning_rate": 4.430066287425973e-05, + "loss": 0.4716, "step": 33680 }, { - "epoch": 1.19, - "learning_rate": 4.459135020516659e-05, - "loss": 0.2853, + "epoch": 1.2140051176703788, + "grad_norm": 0.17633916437625885, + "learning_rate": 4.429880800526675e-05, + "loss": 0.3921, "step": 33685 }, { - "epoch": 1.19, - "learning_rate": 4.4589580459281474e-05, - "loss": 0.287, + "epoch": 1.2141853173316035, + "grad_norm": 0.2126268744468689, + "learning_rate": 4.4296952873331235e-05, + "loss": 0.4292, "step": 33690 }, { - "epoch": 1.19, - "learning_rate": 4.4587810459038556e-05, - "loss": 0.2946, + "epoch": 1.214365516992828, + "grad_norm": 0.1921420842409134, + "learning_rate": 4.4295097478478484e-05, + "loss": 0.4163, "step": 33695 }, { - "epoch": 1.19, - "learning_rate": 4.458604020446084e-05, - "loss": 0.292, + "epoch": 1.2145457166540528, + "grad_norm": 0.19828033447265625, + "learning_rate": 4.4293241820733764e-05, + "loss": 0.4112, "step": 33700 }, { - "epoch": 1.19, - "learning_rate": 4.458426969557129e-05, - "loss": 0.3181, + "epoch": 1.2147259163152773, + "grad_norm": 0.171315535902977, + "learning_rate": 4.429138590012236e-05, + "loss": 0.4352, "step": 33705 }, { - "epoch": 1.19, - "learning_rate": 4.4582498932392906e-05, - "loss": 0.2827, + "epoch": 1.214906115976502, + "grad_norm": 0.15419135987758636, + "learning_rate": 4.428952971666956e-05, + "loss": 0.4077, "step": 33710 }, { - "epoch": 1.19, - "learning_rate": 4.4580727914948685e-05, - "loss": 0.2826, + "epoch": 1.2150863156377265, + "grad_norm": 0.16436229646205902, + "learning_rate": 4.428767327040065e-05, + "loss": 0.4068, "step": 33715 }, { - "epoch": 1.19, - "learning_rate": 4.45789566432616e-05, - "loss": 0.2858, + "epoch": 1.2152665152989512, + "grad_norm": 0.17633068561553955, + "learning_rate": 4.428581656134092e-05, + "loss": 0.4457, "step": 33720 }, { - "epoch": 1.19, - "learning_rate": 4.457718511735468e-05, - "loss": 0.2845, + "epoch": 1.215446714960176, + "grad_norm": 0.1803673654794693, + "learning_rate": 4.4283959589515686e-05, + "loss": 0.4286, "step": 33725 }, { - "epoch": 1.19, - "learning_rate": 4.4575413337250914e-05, - "loss": 0.3087, + "epoch": 1.2156269146214005, + "grad_norm": 0.14655368030071259, + "learning_rate": 4.428210235495023e-05, + "loss": 0.4373, "step": 33730 }, { - "epoch": 1.19, - "learning_rate": 4.45736413029733e-05, - "loss": 0.2915, + "epoch": 1.2158071142826252, + "grad_norm": 0.18454967439174652, + "learning_rate": 4.428024485766986e-05, + "loss": 0.4165, "step": 33735 }, { - "epoch": 1.19, - "learning_rate": 4.457186901454485e-05, - "loss": 
0.2721, + "epoch": 1.2159873139438497, + "grad_norm": 0.1603035181760788, + "learning_rate": 4.427838709769989e-05, + "loss": 0.3995, "step": 33740 }, { - "epoch": 1.19, - "learning_rate": 4.4570096471988596e-05, - "loss": 0.3164, + "epoch": 1.2161675136050745, + "grad_norm": 0.1812167763710022, + "learning_rate": 4.427652907506562e-05, + "loss": 0.4046, "step": 33745 }, { - "epoch": 1.19, - "learning_rate": 4.4568323675327525e-05, - "loss": 0.2786, + "epoch": 1.216347713266299, + "grad_norm": 0.16783830523490906, + "learning_rate": 4.427467078979238e-05, + "loss": 0.4103, "step": 33750 }, { - "epoch": 1.19, - "learning_rate": 4.456655062458467e-05, - "loss": 0.2895, + "epoch": 1.2165279129275237, + "grad_norm": 0.20408573746681213, + "learning_rate": 4.427281224190548e-05, + "loss": 0.4424, "step": 33755 }, { - "epoch": 1.19, - "learning_rate": 4.4564777319783047e-05, - "loss": 0.2568, + "epoch": 1.2167081125887482, + "grad_norm": 0.13193145394325256, + "learning_rate": 4.427095343143025e-05, + "loss": 0.4065, "step": 33760 }, { - "epoch": 1.19, - "learning_rate": 4.456300376094568e-05, - "loss": 0.2961, + "epoch": 1.216888312249973, + "grad_norm": 0.1420491337776184, + "learning_rate": 4.4269094358392e-05, + "loss": 0.4043, "step": 33765 }, { - "epoch": 1.19, - "learning_rate": 4.456122994809561e-05, - "loss": 0.3038, + "epoch": 1.2170685119111977, + "grad_norm": 0.19709770381450653, + "learning_rate": 4.4267235022816084e-05, + "loss": 0.4288, "step": 33770 }, { - "epoch": 1.19, - "learning_rate": 4.455945588125585e-05, - "loss": 0.2928, + "epoch": 1.2172487115724222, + "grad_norm": 0.17064298689365387, + "learning_rate": 4.4265375424727815e-05, + "loss": 0.4346, "step": 33775 }, { - "epoch": 1.19, - "learning_rate": 4.4557681560449456e-05, - "loss": 0.2999, + "epoch": 1.217428911233647, + "grad_norm": 0.16574378311634064, + "learning_rate": 4.4263515564152534e-05, + "loss": 0.4228, "step": 33780 }, { - "epoch": 1.19, - "learning_rate": 4.455590698569945e-05, - "loss": 0.3193, + "epoch": 1.2176091108948714, + "grad_norm": 0.20119209587574005, + "learning_rate": 4.426165544111558e-05, + "loss": 0.4042, "step": 33785 }, { - "epoch": 1.19, - "learning_rate": 4.4554132157028885e-05, - "loss": 0.2992, + "epoch": 1.2177893105560962, + "grad_norm": 0.16486892104148865, + "learning_rate": 4.4259795055642305e-05, + "loss": 0.4075, "step": 33790 }, { - "epoch": 1.19, - "learning_rate": 4.4552357074460796e-05, - "loss": 0.3181, + "epoch": 1.2179695102173207, + "grad_norm": 0.17339356243610382, + "learning_rate": 4.425793440775805e-05, + "loss": 0.4147, "step": 33795 }, { - "epoch": 1.19, - "learning_rate": 4.4550581738018236e-05, - "loss": 0.2944, + "epoch": 1.2181497098785454, + "grad_norm": 0.16329234838485718, + "learning_rate": 4.425607349748816e-05, + "loss": 0.411, "step": 33800 }, { - "epoch": 1.19, - "learning_rate": 4.454880614772426e-05, - "loss": 0.2957, + "epoch": 1.2183299095397702, + "grad_norm": 0.17299103736877441, + "learning_rate": 4.425421232485801e-05, + "loss": 0.4047, "step": 33805 }, { - "epoch": 1.19, - "learning_rate": 4.454703030360192e-05, - "loss": 0.3041, + "epoch": 1.2185101092009947, + "grad_norm": 0.18187008798122406, + "learning_rate": 4.4252350889892936e-05, + "loss": 0.4287, "step": 33810 }, { - "epoch": 1.19, - "learning_rate": 4.4545254205674266e-05, - "loss": 0.2972, + "epoch": 1.2186903088622194, + "grad_norm": 0.19990107417106628, + "learning_rate": 4.425048919261832e-05, + "loss": 0.4089, "step": 33815 }, { - "epoch": 1.19, - "learning_rate": 
4.4543477853964375e-05, - "loss": 0.2742, + "epoch": 1.218870508523444, + "grad_norm": 0.1538524031639099, + "learning_rate": 4.4248627233059505e-05, + "loss": 0.4271, "step": 33820 }, { - "epoch": 1.19, - "learning_rate": 4.45417012484953e-05, - "loss": 0.279, + "epoch": 1.2190507081846687, + "grad_norm": 0.18129165470600128, + "learning_rate": 4.4246765011241864e-05, + "loss": 0.3741, "step": 33825 }, { - "epoch": 1.19, - "learning_rate": 4.453992438929011e-05, - "loss": 0.2805, + "epoch": 1.2192309078458932, + "grad_norm": 0.20447011291980743, + "learning_rate": 4.4244902527190785e-05, + "loss": 0.4431, "step": 33830 }, { - "epoch": 1.19, - "learning_rate": 4.4538147276371884e-05, - "loss": 0.2776, + "epoch": 1.219411107507118, + "grad_norm": 0.148980513215065, + "learning_rate": 4.424303978093163e-05, + "loss": 0.3938, "step": 33835 }, { - "epoch": 1.19, - "learning_rate": 4.4536369909763683e-05, - "loss": 0.2843, + "epoch": 1.2195913071683426, + "grad_norm": 0.14329323172569275, + "learning_rate": 4.424117677248979e-05, + "loss": 0.4257, "step": 33840 }, { - "epoch": 1.19, - "learning_rate": 4.453459228948861e-05, - "loss": 0.2555, + "epoch": 1.2197715068295671, + "grad_norm": 0.22545427083969116, + "learning_rate": 4.423931350189065e-05, + "loss": 0.4461, "step": 33845 }, { - "epoch": 1.19, - "learning_rate": 4.453281441556972e-05, - "loss": 0.3132, + "epoch": 1.2199517064907919, + "grad_norm": 0.17364872992038727, + "learning_rate": 4.423744996915957e-05, + "loss": 0.445, "step": 33850 }, { - "epoch": 1.19, - "learning_rate": 4.45310362880301e-05, - "loss": 0.3091, + "epoch": 1.2201319061520164, + "grad_norm": 0.2247709333896637, + "learning_rate": 4.4235586174321964e-05, + "loss": 0.4471, "step": 33855 }, { - "epoch": 1.19, - "learning_rate": 4.4529257906892864e-05, - "loss": 0.2793, + "epoch": 1.2203121058132411, + "grad_norm": 0.18981455266475677, + "learning_rate": 4.423372211740323e-05, + "loss": 0.4089, "step": 33860 }, { - "epoch": 1.19, - "learning_rate": 4.4527479272181074e-05, - "loss": 0.2814, + "epoch": 1.2204923054744656, + "grad_norm": 0.15154671669006348, + "learning_rate": 4.423185779842874e-05, + "loss": 0.4149, "step": 33865 }, { - "epoch": 1.19, - "learning_rate": 4.452570038391783e-05, - "loss": 0.3016, + "epoch": 1.2206725051356904, + "grad_norm": 0.1997554749250412, + "learning_rate": 4.422999321742393e-05, + "loss": 0.3822, "step": 33870 }, { - "epoch": 1.19, - "learning_rate": 4.452392124212624e-05, - "loss": 0.2923, + "epoch": 1.2208527047969149, + "grad_norm": 0.21992482244968414, + "learning_rate": 4.422812837441417e-05, + "loss": 0.4158, "step": 33875 }, { - "epoch": 1.19, - "learning_rate": 4.45221418468294e-05, - "loss": 0.2731, + "epoch": 1.2210329044581396, + "grad_norm": 0.19677501916885376, + "learning_rate": 4.422626326942489e-05, + "loss": 0.4037, "step": 33880 }, { - "epoch": 1.19, - "learning_rate": 4.452036219805041e-05, - "loss": 0.3242, + "epoch": 1.2212131041193643, + "grad_norm": 0.1691318303346634, + "learning_rate": 4.422439790248149e-05, + "loss": 0.4369, "step": 33885 }, { - "epoch": 1.19, - "learning_rate": 4.451858229581238e-05, - "loss": 0.2567, + "epoch": 1.2213933037805889, + "grad_norm": 0.15583454072475433, + "learning_rate": 4.4222532273609396e-05, + "loss": 0.3988, "step": 33890 }, { - "epoch": 1.19, - "learning_rate": 4.451680214013842e-05, - "loss": 0.3106, + "epoch": 1.2215735034418136, + "grad_norm": 0.1846490055322647, + "learning_rate": 4.422066638283402e-05, + "loss": 0.4234, "step": 33895 }, { - "epoch": 1.19, - 
"learning_rate": 4.4515021731051654e-05, - "loss": 0.3036, + "epoch": 1.221753703103038, + "grad_norm": 0.18129578232765198, + "learning_rate": 4.421880023018079e-05, + "loss": 0.4312, "step": 33900 }, { - "epoch": 1.19, - "learning_rate": 4.4513241068575196e-05, - "loss": 0.2825, + "epoch": 1.2219339027642628, + "grad_norm": 0.15360191464424133, + "learning_rate": 4.421693381567512e-05, + "loss": 0.4302, "step": 33905 }, { - "epoch": 1.19, - "learning_rate": 4.4511460152732156e-05, - "loss": 0.2796, + "epoch": 1.2221141024254873, + "grad_norm": 0.2045409232378006, + "learning_rate": 4.421506713934245e-05, + "loss": 0.4176, "step": 33910 }, { - "epoch": 1.19, - "learning_rate": 4.4509678983545654e-05, - "loss": 0.2792, + "epoch": 1.222294302086712, + "grad_norm": 0.15887264907360077, + "learning_rate": 4.421320020120821e-05, + "loss": 0.4114, "step": 33915 }, { - "epoch": 1.19, - "learning_rate": 4.4507897561038845e-05, - "loss": 0.266, + "epoch": 1.2224745017479366, + "grad_norm": 0.18806084990501404, + "learning_rate": 4.4211333001297836e-05, + "loss": 0.4254, "step": 33920 }, { - "epoch": 1.19, - "learning_rate": 4.4506115885234836e-05, - "loss": 0.2884, + "epoch": 1.2226547014091613, + "grad_norm": 0.21067681908607483, + "learning_rate": 4.420946553963677e-05, + "loss": 0.4186, "step": 33925 }, { - "epoch": 1.19, - "learning_rate": 4.450433395615676e-05, - "loss": 0.283, + "epoch": 1.222834901070386, + "grad_norm": 0.15859293937683105, + "learning_rate": 4.4207597816250454e-05, + "loss": 0.4144, "step": 33930 }, { - "epoch": 1.19, - "learning_rate": 4.4502551773827775e-05, - "loss": 0.2885, + "epoch": 1.2230151007316106, + "grad_norm": 0.1685830056667328, + "learning_rate": 4.420572983116434e-05, + "loss": 0.4313, "step": 33935 }, { - "epoch": 1.19, - "learning_rate": 4.4500769338271e-05, - "loss": 0.2917, + "epoch": 1.2231953003928353, + "grad_norm": 0.17942924797534943, + "learning_rate": 4.420386158440388e-05, + "loss": 0.4175, "step": 33940 }, { - "epoch": 1.19, - "learning_rate": 4.4498986649509584e-05, - "loss": 0.2898, + "epoch": 1.2233755000540598, + "grad_norm": 0.19803176820278168, + "learning_rate": 4.420199307599452e-05, + "loss": 0.4151, "step": 33945 }, { - "epoch": 1.19, - "learning_rate": 4.449720370756668e-05, - "loss": 0.2922, + "epoch": 1.2235556997152846, + "grad_norm": 0.18446727097034454, + "learning_rate": 4.420012430596172e-05, + "loss": 0.4478, "step": 33950 }, { - "epoch": 1.19, - "learning_rate": 4.449542051246544e-05, - "loss": 0.3001, + "epoch": 1.2237358993765093, + "grad_norm": 0.15331429243087769, + "learning_rate": 4.419825527433095e-05, + "loss": 0.424, "step": 33955 }, { - "epoch": 1.19, - "learning_rate": 4.4493637064229e-05, - "loss": 0.2668, + "epoch": 1.2239160990377338, + "grad_norm": 0.21690736711025238, + "learning_rate": 4.419638598112765e-05, + "loss": 0.4068, "step": 33960 }, { - "epoch": 1.19, - "learning_rate": 4.449185336288054e-05, - "loss": 0.2862, + "epoch": 1.2240962986989585, + "grad_norm": 0.18475008010864258, + "learning_rate": 4.4194516426377326e-05, + "loss": 0.4087, "step": 33965 }, { - "epoch": 1.2, - "learning_rate": 4.44900694084432e-05, - "loss": 0.2975, + "epoch": 1.224276498360183, + "grad_norm": 0.19182796776294708, + "learning_rate": 4.4192646610105425e-05, + "loss": 0.4304, "step": 33970 }, { - "epoch": 1.2, - "learning_rate": 4.448828520094017e-05, - "loss": 0.293, + "epoch": 1.2244566980214078, + "grad_norm": 0.15352657437324524, + "learning_rate": 4.419077653233743e-05, + "loss": 0.4087, "step": 33975 }, { - "epoch": 
1.2, - "learning_rate": 4.448650074039459e-05, - "loss": 0.2925, + "epoch": 1.2246368976826323, + "grad_norm": 0.18293914198875427, + "learning_rate": 4.418890619309882e-05, + "loss": 0.4337, "step": 33980 }, { - "epoch": 1.2, - "learning_rate": 4.4484716026829635e-05, - "loss": 0.2922, + "epoch": 1.224817097343857, + "grad_norm": 0.22879280149936676, + "learning_rate": 4.4187035592415085e-05, + "loss": 0.4208, "step": 33985 }, { - "epoch": 1.2, - "learning_rate": 4.448293106026849e-05, - "loss": 0.2837, + "epoch": 1.2249972970050815, + "grad_norm": 0.18083889782428741, + "learning_rate": 4.41851647303117e-05, + "loss": 0.4357, "step": 33990 }, { - "epoch": 1.2, - "learning_rate": 4.448114584073432e-05, - "loss": 0.308, + "epoch": 1.2251774966663063, + "grad_norm": 0.17108914256095886, + "learning_rate": 4.4183293606814155e-05, + "loss": 0.3936, "step": 33995 }, { - "epoch": 1.2, - "learning_rate": 4.447936036825031e-05, - "loss": 0.3173, + "epoch": 1.225357696327531, + "grad_norm": 0.15355034172534943, + "learning_rate": 4.418142222194795e-05, + "loss": 0.3946, "step": 34000 }, { - "epoch": 1.2, - "eval_loss": 0.28807368874549866, - "eval_runtime": 10.551, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 1.225357696327531, + "eval_loss": 0.45026490092277527, + "eval_runtime": 3.5278, + "eval_samples_per_second": 28.346, + "eval_steps_per_second": 7.086, "step": 34000 }, { - "epoch": 1.2, - "learning_rate": 4.447757464283965e-05, - "loss": 0.2874, + "epoch": 1.2255378959887555, + "grad_norm": 0.18301257491111755, + "learning_rate": 4.4179550575738584e-05, + "loss": 0.4336, "step": 34005 }, { - "epoch": 1.2, - "learning_rate": 4.447578866452552e-05, - "loss": 0.2941, + "epoch": 1.2257180956499802, + "grad_norm": 0.16814136505126953, + "learning_rate": 4.4177678668211555e-05, + "loss": 0.4443, "step": 34010 }, { - "epoch": 1.2, - "learning_rate": 4.44740024333311e-05, - "loss": 0.2793, + "epoch": 1.2258982953112048, + "grad_norm": 0.22709350287914276, + "learning_rate": 4.417580649939237e-05, + "loss": 0.4527, "step": 34015 }, { - "epoch": 1.2, - "learning_rate": 4.4472215949279594e-05, - "loss": 0.2921, + "epoch": 1.2260784949724295, + "grad_norm": 0.15572187304496765, + "learning_rate": 4.417393406930652e-05, + "loss": 0.3646, "step": 34020 }, { - "epoch": 1.2, - "learning_rate": 4.44704292123942e-05, - "loss": 0.2979, + "epoch": 1.226258694633654, + "grad_norm": 0.17748330533504486, + "learning_rate": 4.4172061377979545e-05, + "loss": 0.4317, "step": 34025 }, { - "epoch": 1.2, - "learning_rate": 4.446864222269812e-05, - "loss": 0.3073, + "epoch": 1.2264388942948787, + "grad_norm": 0.18140548467636108, + "learning_rate": 4.417018842543694e-05, + "loss": 0.4283, "step": 34030 }, { - "epoch": 1.2, - "learning_rate": 4.446685498021455e-05, - "loss": 0.3038, + "epoch": 1.2266190939561032, + "grad_norm": 0.1755266636610031, + "learning_rate": 4.416831521170424e-05, + "loss": 0.4276, "step": 34035 }, { - "epoch": 1.2, - "learning_rate": 4.446506748496668e-05, - "loss": 0.2936, + "epoch": 1.226799293617328, + "grad_norm": 0.1404469907283783, + "learning_rate": 4.416644173680694e-05, + "loss": 0.3802, "step": 34040 }, { - "epoch": 1.2, - "learning_rate": 4.4463279736977754e-05, - "loss": 0.3041, + "epoch": 1.2269794932785527, + "grad_norm": 0.18569141626358032, + "learning_rate": 4.416456800077059e-05, + "loss": 0.4175, "step": 34045 }, { - "epoch": 1.2, - "learning_rate": 4.446149173627096e-05, - "loss": 0.3029, + "epoch": 1.2271596929397772, + "grad_norm": 
0.20885035395622253, + "learning_rate": 4.416269400362071e-05, + "loss": 0.4017, "step": 34050 }, { - "epoch": 1.2, - "learning_rate": 4.445970348286952e-05, - "loss": 0.3014, + "epoch": 1.227339892601002, + "grad_norm": 0.18960420787334442, + "learning_rate": 4.416081974538283e-05, + "loss": 0.4186, "step": 34055 }, { - "epoch": 1.2, - "learning_rate": 4.445791497679666e-05, - "loss": 0.2813, + "epoch": 1.2275200922622265, + "grad_norm": 0.15756478905677795, + "learning_rate": 4.41589452260825e-05, + "loss": 0.3757, "step": 34060 }, { - "epoch": 1.2, - "learning_rate": 4.445612621807559e-05, - "loss": 0.2693, + "epoch": 1.2277002919234512, + "grad_norm": 0.16769862174987793, + "learning_rate": 4.415707044574524e-05, + "loss": 0.3859, "step": 34065 }, { - "epoch": 1.2, - "learning_rate": 4.445433720672955e-05, - "loss": 0.2814, + "epoch": 1.227880491584676, + "grad_norm": 0.21524082124233246, + "learning_rate": 4.415519540439661e-05, + "loss": 0.46, "step": 34070 }, { - "epoch": 1.2, - "learning_rate": 4.4452547942781765e-05, - "loss": 0.2809, + "epoch": 1.2280606912459004, + "grad_norm": 0.1766262948513031, + "learning_rate": 4.4153320102062155e-05, + "loss": 0.3977, "step": 34075 }, { - "epoch": 1.2, - "learning_rate": 4.4450758426255456e-05, - "loss": 0.285, + "epoch": 1.2282408909071252, + "grad_norm": 0.15509352087974548, + "learning_rate": 4.4151444538767414e-05, + "loss": 0.4288, "step": 34080 }, { - "epoch": 1.2, - "learning_rate": 4.444896865717387e-05, - "loss": 0.2821, + "epoch": 1.2284210905683497, + "grad_norm": 0.14794203639030457, + "learning_rate": 4.414956871453796e-05, + "loss": 0.4274, "step": 34085 }, { - "epoch": 1.2, - "learning_rate": 4.444717863556025e-05, - "loss": 0.3087, + "epoch": 1.2286012902295744, + "grad_norm": 0.1935451328754425, + "learning_rate": 4.4147692629399326e-05, + "loss": 0.3819, "step": 34090 }, { - "epoch": 1.2, - "learning_rate": 4.444538836143782e-05, - "loss": 0.2888, + "epoch": 1.228781489890799, + "grad_norm": 0.14869855344295502, + "learning_rate": 4.414581628337709e-05, + "loss": 0.4268, "step": 34095 }, { - "epoch": 1.2, - "learning_rate": 4.444359783482985e-05, - "loss": 0.2941, + "epoch": 1.2289616895520237, + "grad_norm": 0.17884673178195953, + "learning_rate": 4.4143939676496825e-05, + "loss": 0.4162, "step": 34100 }, { - "epoch": 1.2, - "learning_rate": 4.4441807055759574e-05, - "loss": 0.3051, + "epoch": 1.2291418892132482, + "grad_norm": 0.19956587255001068, + "learning_rate": 4.414206280878408e-05, + "loss": 0.4309, "step": 34105 }, { - "epoch": 1.2, - "learning_rate": 4.444001602425024e-05, - "loss": 0.2595, + "epoch": 1.229322088874473, + "grad_norm": 0.1776251643896103, + "learning_rate": 4.414018568026443e-05, + "loss": 0.4051, "step": 34110 }, { - "epoch": 1.2, - "learning_rate": 4.443822474032511e-05, - "loss": 0.2836, + "epoch": 1.2295022885356977, + "grad_norm": 0.16299593448638916, + "learning_rate": 4.413830829096347e-05, + "loss": 0.4617, "step": 34115 }, { - "epoch": 1.2, - "learning_rate": 4.443643320400744e-05, - "loss": 0.3046, + "epoch": 1.2296824881969222, + "grad_norm": 0.17670361697673798, + "learning_rate": 4.413643064090675e-05, + "loss": 0.399, "step": 34120 }, { - "epoch": 1.2, - "learning_rate": 4.44346414153205e-05, - "loss": 0.2819, + "epoch": 1.229862687858147, + "grad_norm": 0.16211190819740295, + "learning_rate": 4.4134552730119874e-05, + "loss": 0.4074, "step": 34125 }, { - "epoch": 1.2, - "learning_rate": 4.443284937428754e-05, - "loss": 0.2821, + "epoch": 1.2300428875193714, + "grad_norm": 
0.1310056746006012, + "learning_rate": 4.413267455862842e-05, + "loss": 0.4109, "step": 34130 }, { - "epoch": 1.2, - "learning_rate": 4.443105708093185e-05, - "loss": 0.3211, + "epoch": 1.2302230871805961, + "grad_norm": 0.1611911952495575, + "learning_rate": 4.4130796126457984e-05, + "loss": 0.3891, "step": 34135 }, { - "epoch": 1.2, - "learning_rate": 4.4429264535276684e-05, - "loss": 0.2882, + "epoch": 1.2304032868418207, + "grad_norm": 0.17254267632961273, + "learning_rate": 4.412891743363416e-05, + "loss": 0.3842, "step": 34140 }, { - "epoch": 1.2, - "learning_rate": 4.442747173734532e-05, - "loss": 0.3001, + "epoch": 1.2305834865030454, + "grad_norm": 0.2114710956811905, + "learning_rate": 4.412703848018253e-05, + "loss": 0.421, "step": 34145 }, { - "epoch": 1.2, - "learning_rate": 4.4425678687161046e-05, - "loss": 0.2723, + "epoch": 1.23076368616427, + "grad_norm": 0.1724889725446701, + "learning_rate": 4.4125159266128696e-05, + "loss": 0.4537, "step": 34150 }, { - "epoch": 1.2, - "learning_rate": 4.4423885384747124e-05, - "loss": 0.2883, + "epoch": 1.2309438858254946, + "grad_norm": 0.17160102725028992, + "learning_rate": 4.412327979149828e-05, + "loss": 0.4127, "step": 34155 }, { - "epoch": 1.2, - "learning_rate": 4.442209183012685e-05, - "loss": 0.2654, + "epoch": 1.2311240854867194, + "grad_norm": 0.15789945423603058, + "learning_rate": 4.412140005631688e-05, + "loss": 0.3848, "step": 34160 }, { - "epoch": 1.2, - "learning_rate": 4.442029802332353e-05, - "loss": 0.2737, + "epoch": 1.2313042851479439, + "grad_norm": 0.1946706771850586, + "learning_rate": 4.4119520060610105e-05, + "loss": 0.4288, "step": 34165 }, { - "epoch": 1.2, - "learning_rate": 4.4418503964360424e-05, - "loss": 0.29, + "epoch": 1.2314844848091686, + "grad_norm": 0.1840618997812271, + "learning_rate": 4.411763980440357e-05, + "loss": 0.3903, "step": 34170 }, { - "epoch": 1.2, - "learning_rate": 4.441670965326085e-05, - "loss": 0.2884, + "epoch": 1.2316646844703931, + "grad_norm": 0.16430874168872833, + "learning_rate": 4.411575928772289e-05, + "loss": 0.4252, "step": 34175 }, { - "epoch": 1.2, - "learning_rate": 4.4414915090048095e-05, - "loss": 0.3039, + "epoch": 1.2318448841316179, + "grad_norm": 0.16133743524551392, + "learning_rate": 4.41138785105937e-05, + "loss": 0.4184, "step": 34180 }, { - "epoch": 1.2, - "learning_rate": 4.4413120274745455e-05, - "loss": 0.2871, + "epoch": 1.2320250837928426, + "grad_norm": 0.18589156866073608, + "learning_rate": 4.411199747304161e-05, + "loss": 0.3987, "step": 34185 }, { - "epoch": 1.2, - "learning_rate": 4.441132520737625e-05, - "loss": 0.2728, + "epoch": 1.232205283454067, + "grad_norm": 0.19146478176116943, + "learning_rate": 4.4110116175092254e-05, + "loss": 0.437, "step": 34190 }, { - "epoch": 1.2, - "learning_rate": 4.440952988796378e-05, - "loss": 0.3025, + "epoch": 1.2323854831152918, + "grad_norm": 0.1762928068637848, + "learning_rate": 4.410823461677126e-05, + "loss": 0.3935, "step": 34195 }, { - "epoch": 1.2, - "learning_rate": 4.440773431653136e-05, - "loss": 0.2866, + "epoch": 1.2325656827765163, + "grad_norm": 0.17464253306388855, + "learning_rate": 4.4106352798104276e-05, + "loss": 0.4299, "step": 34200 }, { - "epoch": 1.2, - "learning_rate": 4.440593849310229e-05, - "loss": 0.2843, + "epoch": 1.232745882437741, + "grad_norm": 0.15182888507843018, + "learning_rate": 4.410447071911693e-05, + "loss": 0.4296, "step": 34205 }, { - "epoch": 1.2, - "learning_rate": 4.4404142417699904e-05, - "loss": 0.2922, + "epoch": 1.2329260820989656, + "grad_norm": 
0.20655085146427155, + "learning_rate": 4.410258837983488e-05, + "loss": 0.4122, "step": 34210 }, { - "epoch": 1.2, - "learning_rate": 4.4402346090347515e-05, - "loss": 0.2985, + "epoch": 1.2331062817601903, + "grad_norm": 0.15774188935756683, + "learning_rate": 4.4100705780283746e-05, + "loss": 0.3955, "step": 34215 }, { - "epoch": 1.2, - "learning_rate": 4.4400549511068445e-05, - "loss": 0.2766, + "epoch": 1.2332864814214148, + "grad_norm": 0.16499753296375275, + "learning_rate": 4.40988229204892e-05, + "loss": 0.4257, "step": 34220 }, { - "epoch": 1.2, - "learning_rate": 4.4398752679886026e-05, - "loss": 0.2857, + "epoch": 1.2334666810826396, + "grad_norm": 0.21794357895851135, + "learning_rate": 4.4096939800476894e-05, + "loss": 0.4236, "step": 34225 }, { - "epoch": 1.2, - "learning_rate": 4.4396955596823594e-05, - "loss": 0.3026, + "epoch": 1.2336468807438643, + "grad_norm": 0.16437794268131256, + "learning_rate": 4.409505642027248e-05, + "loss": 0.4395, "step": 34230 }, { - "epoch": 1.2, - "learning_rate": 4.439515826190447e-05, - "loss": 0.2888, + "epoch": 1.2338270804050888, + "grad_norm": 0.152201309800148, + "learning_rate": 4.409317277990161e-05, + "loss": 0.4321, "step": 34235 }, { - "epoch": 1.2, - "learning_rate": 4.4393360675151996e-05, - "loss": 0.3029, + "epoch": 1.2340072800663135, + "grad_norm": 0.1949222981929779, + "learning_rate": 4.409128887938997e-05, + "loss": 0.4265, "step": 34240 }, { - "epoch": 1.2, - "learning_rate": 4.439156283658952e-05, - "loss": 0.2839, + "epoch": 1.234187479727538, + "grad_norm": 0.18767017126083374, + "learning_rate": 4.408940471876321e-05, + "loss": 0.4111, "step": 34245 }, { - "epoch": 1.21, - "learning_rate": 4.438976474624038e-05, - "loss": 0.2773, + "epoch": 1.2343676793887628, + "grad_norm": 0.22580619156360626, + "learning_rate": 4.4087520298047003e-05, + "loss": 0.416, "step": 34250 }, { - "epoch": 1.21, - "learning_rate": 4.4387966404127926e-05, - "loss": 0.3019, + "epoch": 1.2345478790499873, + "grad_norm": 0.14515608549118042, + "learning_rate": 4.4085635617267026e-05, + "loss": 0.4118, "step": 34255 }, { - "epoch": 1.21, - "learning_rate": 4.4386167810275493e-05, - "loss": 0.3107, + "epoch": 1.234728078711212, + "grad_norm": 0.17137964069843292, + "learning_rate": 4.408375067644897e-05, + "loss": 0.3908, "step": 34260 }, { - "epoch": 1.21, - "learning_rate": 4.438436896470646e-05, - "loss": 0.2845, + "epoch": 1.2349082783724366, + "grad_norm": 0.17997115850448608, + "learning_rate": 4.40818654756185e-05, + "loss": 0.4645, "step": 34265 }, { - "epoch": 1.21, - "learning_rate": 4.4382569867444164e-05, - "loss": 0.2895, + "epoch": 1.2350884780336613, + "grad_norm": 0.16378553211688995, + "learning_rate": 4.40799800148013e-05, + "loss": 0.4137, "step": 34270 }, { - "epoch": 1.21, - "learning_rate": 4.438077051851197e-05, - "loss": 0.2912, + "epoch": 1.235268677694886, + "grad_norm": 0.1839524507522583, + "learning_rate": 4.407809429402308e-05, + "loss": 0.4448, "step": 34275 }, { - "epoch": 1.21, - "learning_rate": 4.437897091793325e-05, - "loss": 0.2887, + "epoch": 1.2354488773561105, + "grad_norm": 0.1332423835992813, + "learning_rate": 4.407620831330951e-05, + "loss": 0.4172, "step": 34280 }, { - "epoch": 1.21, - "learning_rate": 4.437717106573136e-05, - "loss": 0.2649, + "epoch": 1.2356290770173353, + "grad_norm": 0.18333803117275238, + "learning_rate": 4.407432207268629e-05, + "loss": 0.4224, "step": 34285 }, { - "epoch": 1.21, - "learning_rate": 4.437537096192967e-05, - "loss": 0.2903, + "epoch": 1.2358092766785598, + 
"grad_norm": 0.2039324939250946, + "learning_rate": 4.4072435572179136e-05, + "loss": 0.4289, "step": 34290 }, { - "epoch": 1.21, - "learning_rate": 4.437357060655156e-05, - "loss": 0.2955, + "epoch": 1.2359894763397845, + "grad_norm": 0.17370297014713287, + "learning_rate": 4.407054881181373e-05, + "loss": 0.4226, "step": 34295 }, { - "epoch": 1.21, - "learning_rate": 4.4371769999620406e-05, - "loss": 0.2914, + "epoch": 1.236169676001009, + "grad_norm": 0.16956277191638947, + "learning_rate": 4.40686617916158e-05, + "loss": 0.4354, "step": 34300 }, { - "epoch": 1.21, - "learning_rate": 4.4369969141159585e-05, - "loss": 0.2676, + "epoch": 1.2363498756622338, + "grad_norm": 0.16114924848079681, + "learning_rate": 4.406677451161103e-05, + "loss": 0.4433, "step": 34305 }, { - "epoch": 1.21, - "learning_rate": 4.436816803119248e-05, - "loss": 0.2737, + "epoch": 1.2365300753234585, + "grad_norm": 0.15897580981254578, + "learning_rate": 4.406488697182516e-05, + "loss": 0.3884, "step": 34310 }, { - "epoch": 1.21, - "learning_rate": 4.4366366669742476e-05, - "loss": 0.3046, + "epoch": 1.236710274984683, + "grad_norm": 0.17210477590560913, + "learning_rate": 4.406299917228389e-05, + "loss": 0.4553, "step": 34315 }, { - "epoch": 1.21, - "learning_rate": 4.4364565056832966e-05, - "loss": 0.3038, + "epoch": 1.2368904746459077, + "grad_norm": 0.17768850922584534, + "learning_rate": 4.406111111301295e-05, + "loss": 0.4372, "step": 34320 }, { - "epoch": 1.21, - "learning_rate": 4.4362763192487336e-05, - "loss": 0.2929, + "epoch": 1.2370706743071322, + "grad_norm": 0.20289890468120575, + "learning_rate": 4.405922279403807e-05, + "loss": 0.4266, "step": 34325 }, { - "epoch": 1.21, - "learning_rate": 4.436096107672899e-05, - "loss": 0.2979, + "epoch": 1.237250873968357, + "grad_norm": 0.18634875118732452, + "learning_rate": 4.405733421538496e-05, + "loss": 0.39, "step": 34330 }, { - "epoch": 1.21, - "learning_rate": 4.435915870958132e-05, - "loss": 0.3026, + "epoch": 1.2374310736295815, + "grad_norm": 0.16916252672672272, + "learning_rate": 4.4055445377079364e-05, + "loss": 0.3812, "step": 34335 }, { - "epoch": 1.21, - "learning_rate": 4.4357356091067734e-05, - "loss": 0.3056, + "epoch": 1.2376112732908062, + "grad_norm": 0.17390431463718414, + "learning_rate": 4.405355627914701e-05, + "loss": 0.4682, "step": 34340 }, { - "epoch": 1.21, - "learning_rate": 4.435555322121164e-05, - "loss": 0.3093, + "epoch": 1.237791472952031, + "grad_norm": 0.17190834879875183, + "learning_rate": 4.405166692161365e-05, + "loss": 0.4097, "step": 34345 }, { - "epoch": 1.21, - "learning_rate": 4.435375010003644e-05, - "loss": 0.2713, + "epoch": 1.2379716726132555, + "grad_norm": 0.15604445338249207, + "learning_rate": 4.4049777304505e-05, + "loss": 0.4193, "step": 34350 }, { - "epoch": 1.21, - "learning_rate": 4.435194672756555e-05, - "loss": 0.295, + "epoch": 1.2381518722744802, + "grad_norm": 0.19317157566547394, + "learning_rate": 4.404788742784683e-05, + "loss": 0.4087, "step": 34355 }, { - "epoch": 1.21, - "learning_rate": 4.4350143103822385e-05, - "loss": 0.2824, + "epoch": 1.2383320719357047, + "grad_norm": 0.15867052972316742, + "learning_rate": 4.404599729166489e-05, + "loss": 0.4123, "step": 34360 }, { - "epoch": 1.21, - "learning_rate": 4.434833922883036e-05, - "loss": 0.2898, + "epoch": 1.2385122715969294, + "grad_norm": 0.18117307126522064, + "learning_rate": 4.404410689598491e-05, + "loss": 0.4088, "step": 34365 }, { - "epoch": 1.21, - "learning_rate": 4.434653510261291e-05, - "loss": 0.2953, + "epoch": 
1.238692471258154, + "grad_norm": 0.20402644574642181, + "learning_rate": 4.404221624083267e-05, + "loss": 0.4284, "step": 34370 }, { - "epoch": 1.21, - "learning_rate": 4.434473072519344e-05, - "loss": 0.3097, + "epoch": 1.2388726709193787, + "grad_norm": 0.20701655745506287, + "learning_rate": 4.4040325326233914e-05, + "loss": 0.386, "step": 34375 }, { - "epoch": 1.21, - "learning_rate": 4.4342926096595396e-05, - "loss": 0.301, + "epoch": 1.2390528705806032, + "grad_norm": 0.20911046862602234, + "learning_rate": 4.403843415221442e-05, + "loss": 0.3965, "step": 34380 }, { - "epoch": 1.21, - "learning_rate": 4.434112121684221e-05, - "loss": 0.2811, + "epoch": 1.239233070241828, + "grad_norm": 0.22463898360729218, + "learning_rate": 4.4036542718799944e-05, + "loss": 0.4292, "step": 34385 }, { - "epoch": 1.21, - "learning_rate": 4.4339316085957305e-05, - "loss": 0.2814, + "epoch": 1.2394132699030527, + "grad_norm": 0.1606704443693161, + "learning_rate": 4.403465102601626e-05, + "loss": 0.4427, "step": 34390 }, { - "epoch": 1.21, - "learning_rate": 4.433751070396412e-05, - "loss": 0.2852, + "epoch": 1.2395934695642772, + "grad_norm": 0.18783807754516602, + "learning_rate": 4.4032759073889134e-05, + "loss": 0.4545, "step": 34395 }, { - "epoch": 1.21, - "learning_rate": 4.433570507088611e-05, - "loss": 0.283, + "epoch": 1.239773669225502, + "grad_norm": 0.20419709384441376, + "learning_rate": 4.403086686244435e-05, + "loss": 0.4332, "step": 34400 }, { - "epoch": 1.21, - "learning_rate": 4.433389918674671e-05, - "loss": 0.2712, + "epoch": 1.2399538688867264, + "grad_norm": 0.16795575618743896, + "learning_rate": 4.40289743917077e-05, + "loss": 0.4359, "step": 34405 }, { - "epoch": 1.21, - "learning_rate": 4.4332093051569374e-05, - "loss": 0.3063, + "epoch": 1.2401340685479512, + "grad_norm": 0.157429039478302, + "learning_rate": 4.402708166170495e-05, + "loss": 0.4176, "step": 34410 }, { - "epoch": 1.21, - "learning_rate": 4.433028666537755e-05, - "loss": 0.2805, + "epoch": 1.2403142682091757, + "grad_norm": 0.1853724867105484, + "learning_rate": 4.4025188672461903e-05, + "loss": 0.4474, "step": 34415 }, { - "epoch": 1.21, - "learning_rate": 4.4328480028194684e-05, - "loss": 0.3026, + "epoch": 1.2404944678704004, + "grad_norm": 0.1714290827512741, + "learning_rate": 4.402329542400434e-05, + "loss": 0.4237, "step": 34420 }, { - "epoch": 1.21, - "learning_rate": 4.432667314004425e-05, - "loss": 0.2853, + "epoch": 1.240674667531625, + "grad_norm": 0.2036142498254776, + "learning_rate": 4.402140191635806e-05, + "loss": 0.4049, "step": 34425 }, { - "epoch": 1.21, - "learning_rate": 4.432486600094971e-05, - "loss": 0.3051, + "epoch": 1.2408548671928497, + "grad_norm": 0.1990082859992981, + "learning_rate": 4.401950814954886e-05, + "loss": 0.4252, "step": 34430 }, { - "epoch": 1.21, - "learning_rate": 4.432305861093451e-05, - "loss": 0.2682, + "epoch": 1.2410350668540744, + "grad_norm": 0.17865248024463654, + "learning_rate": 4.4017614123602546e-05, + "loss": 0.4072, "step": 34435 }, { - "epoch": 1.21, - "learning_rate": 4.4321250970022126e-05, - "loss": 0.2969, + "epoch": 1.241215266515299, + "grad_norm": 0.1914636194705963, + "learning_rate": 4.401571983854492e-05, + "loss": 0.4452, "step": 34440 }, { - "epoch": 1.21, - "learning_rate": 4.431944307823604e-05, - "loss": 0.2952, + "epoch": 1.2413954661765236, + "grad_norm": 0.1847231239080429, + "learning_rate": 4.401382529440179e-05, + "loss": 0.3943, "step": 34445 }, { - "epoch": 1.21, - "learning_rate": 4.431763493559971e-05, - "loss": 0.2963, + 
"epoch": 1.2415756658377481, + "grad_norm": 0.20364885032176971, + "learning_rate": 4.401193049119898e-05, + "loss": 0.4116, "step": 34450 }, { - "epoch": 1.21, - "learning_rate": 4.431582654213663e-05, - "loss": 0.296, + "epoch": 1.2417558654989729, + "grad_norm": 0.1959732621908188, + "learning_rate": 4.4010035428962295e-05, + "loss": 0.4072, "step": 34455 }, { - "epoch": 1.21, - "learning_rate": 4.431401789787026e-05, - "loss": 0.2775, + "epoch": 1.2419360651601976, + "grad_norm": 0.18121816217899323, + "learning_rate": 4.400814010771755e-05, + "loss": 0.4325, "step": 34460 }, { - "epoch": 1.21, - "learning_rate": 4.43122090028241e-05, - "loss": 0.2827, + "epoch": 1.2421162648214221, + "grad_norm": 0.23917900025844574, + "learning_rate": 4.400624452749058e-05, + "loss": 0.4233, "step": 34465 }, { - "epoch": 1.21, - "learning_rate": 4.431039985702164e-05, - "loss": 0.3046, + "epoch": 1.2422964644826469, + "grad_norm": 0.21127377450466156, + "learning_rate": 4.400434868830721e-05, + "loss": 0.4266, "step": 34470 }, { - "epoch": 1.21, - "learning_rate": 4.430859046048636e-05, - "loss": 0.2818, + "epoch": 1.2424766641438714, + "grad_norm": 0.21649089455604553, + "learning_rate": 4.4002452590193265e-05, + "loss": 0.4478, "step": 34475 }, { - "epoch": 1.21, - "learning_rate": 4.430678081324175e-05, - "loss": 0.2823, + "epoch": 1.242656863805096, + "grad_norm": 0.18373538553714752, + "learning_rate": 4.400055623317459e-05, + "loss": 0.3895, "step": 34480 }, { - "epoch": 1.21, - "learning_rate": 4.4304970915311326e-05, - "loss": 0.2867, + "epoch": 1.2428370634663206, + "grad_norm": 0.1825505495071411, + "learning_rate": 4.399865961727701e-05, + "loss": 0.3957, "step": 34485 }, { - "epoch": 1.21, - "learning_rate": 4.4303160766718574e-05, - "loss": 0.3043, + "epoch": 1.2430172631275453, + "grad_norm": 0.15860360860824585, + "learning_rate": 4.399676274252637e-05, + "loss": 0.4106, "step": 34490 }, { - "epoch": 1.21, - "learning_rate": 4.4301350367487005e-05, - "loss": 0.3266, + "epoch": 1.2431974627887699, + "grad_norm": 0.14994730055332184, + "learning_rate": 4.399486560894852e-05, + "loss": 0.3668, "step": 34495 }, { - "epoch": 1.21, - "learning_rate": 4.429953971764012e-05, - "loss": 0.287, + "epoch": 1.2433776624499946, + "grad_norm": 0.18823125958442688, + "learning_rate": 4.39929682165693e-05, + "loss": 0.419, "step": 34500 }, { - "epoch": 1.21, - "eval_loss": 0.2872275710105896, - "eval_runtime": 10.5666, - "eval_samples_per_second": 9.464, - "eval_steps_per_second": 9.464, + "epoch": 1.2433776624499946, + "eval_loss": 0.45045050978660583, + "eval_runtime": 3.5444, + "eval_samples_per_second": 28.213, + "eval_steps_per_second": 7.053, "step": 34500 }, { - "epoch": 1.21, - "learning_rate": 4.429772881720142e-05, - "loss": 0.2876, + "epoch": 1.2435578621112193, + "grad_norm": 0.20211850106716156, + "learning_rate": 4.399107056541456e-05, + "loss": 0.3954, "step": 34505 }, { - "epoch": 1.21, - "learning_rate": 4.429591766619444e-05, - "loss": 0.2798, + "epoch": 1.2437380617724438, + "grad_norm": 0.19048206508159637, + "learning_rate": 4.398917265551017e-05, + "loss": 0.4312, "step": 34510 }, { - "epoch": 1.21, - "learning_rate": 4.429410626464268e-05, - "loss": 0.285, + "epoch": 1.2439182614336686, + "grad_norm": 0.17858850955963135, + "learning_rate": 4.398727448688198e-05, + "loss": 0.4522, "step": 34515 }, { - "epoch": 1.21, - "learning_rate": 4.429229461256967e-05, - "loss": 0.2681, + "epoch": 1.244098461094893, + "grad_norm": 0.14461615681648254, + "learning_rate": 
4.398537605955584e-05, + "loss": 0.4295, "step": 34520 }, { - "epoch": 1.21, - "learning_rate": 4.429048270999892e-05, - "loss": 0.2791, + "epoch": 1.2442786607561178, + "grad_norm": 0.16220664978027344, + "learning_rate": 4.398347737355764e-05, + "loss": 0.4194, "step": 34525 }, { - "epoch": 1.21, - "learning_rate": 4.428867055695398e-05, - "loss": 0.3229, + "epoch": 1.2444588604173423, + "grad_norm": 0.17861701548099518, + "learning_rate": 4.398157842891323e-05, + "loss": 0.4393, "step": 34530 }, { - "epoch": 1.22, - "learning_rate": 4.4286858153458344e-05, - "loss": 0.3223, + "epoch": 1.244639060078567, + "grad_norm": 0.2174644023180008, + "learning_rate": 4.3979679225648484e-05, + "loss": 0.4721, "step": 34535 }, { - "epoch": 1.22, - "learning_rate": 4.4285045499535584e-05, - "loss": 0.3156, + "epoch": 1.2448192597397916, + "grad_norm": 0.19384269416332245, + "learning_rate": 4.397777976378929e-05, + "loss": 0.4112, "step": 34540 }, { - "epoch": 1.22, - "learning_rate": 4.428323259520921e-05, - "loss": 0.2721, + "epoch": 1.2449994594010163, + "grad_norm": 0.16836018860340118, + "learning_rate": 4.397588004336152e-05, + "loss": 0.4547, "step": 34545 }, { - "epoch": 1.22, - "learning_rate": 4.428141944050277e-05, - "loss": 0.2948, + "epoch": 1.245179659062241, + "grad_norm": 0.163166344165802, + "learning_rate": 4.397398006439105e-05, + "loss": 0.4291, "step": 34550 }, { - "epoch": 1.22, - "learning_rate": 4.42796060354398e-05, - "loss": 0.2897, + "epoch": 1.2453598587234656, + "grad_norm": 0.17786313593387604, + "learning_rate": 4.397207982690378e-05, + "loss": 0.4256, "step": 34555 }, { - "epoch": 1.22, - "learning_rate": 4.427779238004386e-05, - "loss": 0.2914, + "epoch": 1.2455400583846903, + "grad_norm": 0.1747133880853653, + "learning_rate": 4.39701793309256e-05, + "loss": 0.4184, "step": 34560 }, { - "epoch": 1.22, - "learning_rate": 4.4275978474338485e-05, - "loss": 0.2896, + "epoch": 1.2457202580459148, + "grad_norm": 0.1763535887002945, + "learning_rate": 4.3968278576482394e-05, + "loss": 0.4077, "step": 34565 }, { - "epoch": 1.22, - "learning_rate": 4.4274164318347235e-05, - "loss": 0.2832, + "epoch": 1.2459004577071395, + "grad_norm": 0.22853651642799377, + "learning_rate": 4.396637756360007e-05, + "loss": 0.4367, "step": 34570 }, { - "epoch": 1.22, - "learning_rate": 4.427234991209366e-05, - "loss": 0.3122, + "epoch": 1.2460806573683643, + "grad_norm": 0.21163353323936462, + "learning_rate": 4.396447629230452e-05, + "loss": 0.4155, "step": 34575 }, { - "epoch": 1.22, - "learning_rate": 4.4270535255601324e-05, - "loss": 0.2831, + "epoch": 1.2462608570295888, + "grad_norm": 0.16000016033649445, + "learning_rate": 4.396257476262165e-05, + "loss": 0.3771, "step": 34580 }, { - "epoch": 1.22, - "learning_rate": 4.4268720348893786e-05, - "loss": 0.2953, + "epoch": 1.2464410566908135, + "grad_norm": 0.23140566051006317, + "learning_rate": 4.396067297457738e-05, + "loss": 0.4196, "step": 34585 }, { - "epoch": 1.22, - "learning_rate": 4.4266905191994615e-05, - "loss": 0.2978, + "epoch": 1.246621256352038, + "grad_norm": 0.16207090020179749, + "learning_rate": 4.3958770928197604e-05, + "loss": 0.4025, "step": 34590 }, { - "epoch": 1.22, - "learning_rate": 4.426508978492738e-05, - "loss": 0.2993, + "epoch": 1.2468014560132628, + "grad_norm": 0.18664728105068207, + "learning_rate": 4.395686862350824e-05, + "loss": 0.4154, "step": 34595 }, { - "epoch": 1.22, - "learning_rate": 4.426327412771565e-05, - "loss": 0.3028, + "epoch": 1.2469816556744873, + "grad_norm": 0.1926645040512085, + 
"learning_rate": 4.395496606053522e-05, + "loss": 0.4093, "step": 34600 }, { - "epoch": 1.22, - "learning_rate": 4.4261458220383e-05, - "loss": 0.2977, + "epoch": 1.247161855335712, + "grad_norm": 0.16075952351093292, + "learning_rate": 4.395306323930445e-05, + "loss": 0.4158, "step": 34605 }, { - "epoch": 1.22, - "learning_rate": 4.425964206295301e-05, - "loss": 0.2806, + "epoch": 1.2473420549969365, + "grad_norm": 0.1828758716583252, + "learning_rate": 4.3951160159841864e-05, + "loss": 0.3779, "step": 34610 }, { - "epoch": 1.22, - "learning_rate": 4.4257825655449257e-05, - "loss": 0.2709, + "epoch": 1.2475222546581612, + "grad_norm": 0.2084207683801651, + "learning_rate": 4.394925682217339e-05, + "loss": 0.4011, "step": 34615 }, { - "epoch": 1.22, - "learning_rate": 4.425600899789533e-05, - "loss": 0.2556, + "epoch": 1.247702454319386, + "grad_norm": 0.17692120373249054, + "learning_rate": 4.3947353226324964e-05, + "loss": 0.4318, "step": 34620 }, { - "epoch": 1.22, - "learning_rate": 4.425419209031482e-05, - "loss": 0.2851, + "epoch": 1.2478826539806105, + "grad_norm": 0.16040922701358795, + "learning_rate": 4.394544937232252e-05, + "loss": 0.396, "step": 34625 }, { - "epoch": 1.22, - "learning_rate": 4.42523749327313e-05, - "loss": 0.2911, + "epoch": 1.2480628536418352, + "grad_norm": 0.19275468587875366, + "learning_rate": 4.3943545260192e-05, + "loss": 0.4325, "step": 34630 }, { - "epoch": 1.22, - "learning_rate": 4.425055752516839e-05, - "loss": 0.2835, + "epoch": 1.2482430533030597, + "grad_norm": 0.1728275716304779, + "learning_rate": 4.394164088995933e-05, + "loss": 0.4156, "step": 34635 }, { - "epoch": 1.22, - "learning_rate": 4.424873986764968e-05, - "loss": 0.2993, + "epoch": 1.2484232529642845, + "grad_norm": 0.2221703678369522, + "learning_rate": 4.393973626165048e-05, + "loss": 0.3882, "step": 34640 }, { - "epoch": 1.22, - "learning_rate": 4.424692196019877e-05, - "loss": 0.289, + "epoch": 1.248603452625509, + "grad_norm": 0.18804728984832764, + "learning_rate": 4.393783137529139e-05, + "loss": 0.4215, "step": 34645 }, { - "epoch": 1.22, - "learning_rate": 4.4245103802839256e-05, - "loss": 0.2912, + "epoch": 1.2487836522867337, + "grad_norm": 0.15801158547401428, + "learning_rate": 4.393592623090801e-05, + "loss": 0.364, "step": 34650 }, { - "epoch": 1.22, - "learning_rate": 4.4243285395594754e-05, - "loss": 0.2923, + "epoch": 1.2489638519479582, + "grad_norm": 0.1544542908668518, + "learning_rate": 4.39340208285263e-05, + "loss": 0.4171, "step": 34655 }, { - "epoch": 1.22, - "learning_rate": 4.424146673848887e-05, - "loss": 0.291, + "epoch": 1.249144051609183, + "grad_norm": 0.19875742495059967, + "learning_rate": 4.3932115168172225e-05, + "loss": 0.4258, "step": 34660 }, { - "epoch": 1.22, - "learning_rate": 4.4239647831545225e-05, - "loss": 0.3019, + "epoch": 1.2493242512704077, + "grad_norm": 0.16528360545635223, + "learning_rate": 4.3930209249871744e-05, + "loss": 0.4562, "step": 34665 }, { - "epoch": 1.22, - "learning_rate": 4.423782867478743e-05, - "loss": 0.2934, + "epoch": 1.2495044509316322, + "grad_norm": 0.18268528580665588, + "learning_rate": 4.3928303073650835e-05, + "loss": 0.407, "step": 34670 }, { - "epoch": 1.22, - "learning_rate": 4.423600926823911e-05, - "loss": 0.2817, + "epoch": 1.249684650592857, + "grad_norm": 0.1825907826423645, + "learning_rate": 4.392639663953545e-05, + "loss": 0.4098, "step": 34675 }, { - "epoch": 1.22, - "learning_rate": 4.423418961192389e-05, - "loss": 0.3051, + "epoch": 1.2498648502540814, + "grad_norm": 0.17392845451831818, + 
"learning_rate": 4.3924489947551586e-05, + "loss": 0.4377, "step": 34680 }, { - "epoch": 1.22, - "learning_rate": 4.423236970586538e-05, - "loss": 0.2773, + "epoch": 1.2500450499153062, + "grad_norm": 0.1909123808145523, + "learning_rate": 4.39225829977252e-05, + "loss": 0.4298, "step": 34685 }, { - "epoch": 1.22, - "learning_rate": 4.423054955008724e-05, - "loss": 0.2957, + "epoch": 1.250225249576531, + "grad_norm": 0.13296571373939514, + "learning_rate": 4.392067579008229e-05, + "loss": 0.4111, "step": 34690 }, { - "epoch": 1.22, - "learning_rate": 4.422872914461308e-05, - "loss": 0.2919, + "epoch": 1.2504054492377554, + "grad_norm": 0.21893556416034698, + "learning_rate": 4.391876832464883e-05, + "loss": 0.4422, "step": 34695 }, { - "epoch": 1.22, - "learning_rate": 4.4226908489466544e-05, - "loss": 0.2951, + "epoch": 1.25058564889898, + "grad_norm": 0.18834765255451202, + "learning_rate": 4.3916860601450825e-05, + "loss": 0.4363, "step": 34700 }, { - "epoch": 1.22, - "learning_rate": 4.422508758467128e-05, - "loss": 0.278, + "epoch": 1.2507658485602047, + "grad_norm": 0.18092414736747742, + "learning_rate": 4.391495262051425e-05, + "loss": 0.4227, "step": 34705 }, { - "epoch": 1.22, - "learning_rate": 4.4223266430250915e-05, - "loss": 0.2756, + "epoch": 1.2509460482214294, + "grad_norm": 0.18126097321510315, + "learning_rate": 4.391304438186511e-05, + "loss": 0.3999, "step": 34710 }, { - "epoch": 1.22, - "learning_rate": 4.42214450262291e-05, - "loss": 0.2845, + "epoch": 1.251126247882654, + "grad_norm": 0.19100238382816315, + "learning_rate": 4.39111358855294e-05, + "loss": 0.3802, "step": 34715 }, { - "epoch": 1.22, - "learning_rate": 4.42196233726295e-05, - "loss": 0.2854, + "epoch": 1.2513064475438787, + "grad_norm": 0.18125472962856293, + "learning_rate": 4.390922713153312e-05, + "loss": 0.4184, "step": 34720 }, { - "epoch": 1.22, - "learning_rate": 4.421780146947575e-05, - "loss": 0.287, + "epoch": 1.2514866472051032, + "grad_norm": 0.2104109823703766, + "learning_rate": 4.3907318119902286e-05, + "loss": 0.3942, "step": 34725 }, { - "epoch": 1.22, - "learning_rate": 4.4215979316791516e-05, - "loss": 0.2951, + "epoch": 1.251666846866328, + "grad_norm": 0.18633076548576355, + "learning_rate": 4.390540885066291e-05, + "loss": 0.409, "step": 34730 }, { - "epoch": 1.22, - "learning_rate": 4.4214156914600454e-05, - "loss": 0.2786, + "epoch": 1.2518470465275526, + "grad_norm": 0.18093355000019073, + "learning_rate": 4.3903499323840985e-05, + "loss": 0.411, "step": 34735 }, { - "epoch": 1.22, - "learning_rate": 4.4212334262926226e-05, - "loss": 0.3042, + "epoch": 1.2520272461887771, + "grad_norm": 0.18245679140090942, + "learning_rate": 4.390158953946255e-05, + "loss": 0.4236, "step": 34740 }, { - "epoch": 1.22, - "learning_rate": 4.42105113617925e-05, - "loss": 0.2986, + "epoch": 1.2522074458500019, + "grad_norm": 0.16724418103694916, + "learning_rate": 4.3899679497553616e-05, + "loss": 0.4244, "step": 34745 }, { - "epoch": 1.22, - "learning_rate": 4.420868821122295e-05, - "loss": 0.2869, + "epoch": 1.2523876455112264, + "grad_norm": 0.1758909821510315, + "learning_rate": 4.3897769198140204e-05, + "loss": 0.438, "step": 34750 }, { - "epoch": 1.22, - "learning_rate": 4.420686481124123e-05, - "loss": 0.2745, + "epoch": 1.2525678451724511, + "grad_norm": 0.17863698303699493, + "learning_rate": 4.389585864124835e-05, + "loss": 0.4163, "step": 34755 }, { - "epoch": 1.22, - "learning_rate": 4.420504116187103e-05, - "loss": 0.2817, + "epoch": 1.2527480448336756, + "grad_norm": 
0.19687694311141968, + "learning_rate": 4.389394782690408e-05, + "loss": 0.4004, "step": 34760 }, { - "epoch": 1.22, - "learning_rate": 4.4203217263136034e-05, - "loss": 0.2904, + "epoch": 1.2529282444949004, + "grad_norm": 0.17366833984851837, + "learning_rate": 4.389203675513343e-05, + "loss": 0.4307, "step": 34765 }, { - "epoch": 1.22, - "learning_rate": 4.420139311505991e-05, - "loss": 0.3057, + "epoch": 1.2531084441561249, + "grad_norm": 0.14887477457523346, + "learning_rate": 4.389012542596244e-05, + "loss": 0.435, "step": 34770 }, { - "epoch": 1.22, - "learning_rate": 4.419956871766636e-05, - "loss": 0.2846, + "epoch": 1.2532886438173496, + "grad_norm": 0.17449168860912323, + "learning_rate": 4.388821383941714e-05, + "loss": 0.4406, "step": 34775 }, { - "epoch": 1.22, - "learning_rate": 4.4197744070979056e-05, - "loss": 0.2672, + "epoch": 1.2534688434785743, + "grad_norm": 0.1995227187871933, + "learning_rate": 4.388630199552358e-05, + "loss": 0.4066, "step": 34780 }, { - "epoch": 1.22, - "learning_rate": 4.41959191750217e-05, - "loss": 0.3128, + "epoch": 1.2536490431397989, + "grad_norm": 0.19903253018856049, + "learning_rate": 4.388438989430782e-05, + "loss": 0.459, "step": 34785 }, { - "epoch": 1.22, - "learning_rate": 4.419409402981799e-05, - "loss": 0.2899, + "epoch": 1.2538292428010236, + "grad_norm": 0.23275376856327057, + "learning_rate": 4.3882477535795904e-05, + "loss": 0.4503, "step": 34790 }, { - "epoch": 1.22, - "learning_rate": 4.41922686353916e-05, - "loss": 0.3013, + "epoch": 1.254009442462248, + "grad_norm": 0.17660613358020782, + "learning_rate": 4.3880564920013885e-05, + "loss": 0.3901, "step": 34795 }, { - "epoch": 1.22, - "learning_rate": 4.4190442991766264e-05, - "loss": 0.2865, + "epoch": 1.2541896421234728, + "grad_norm": 0.1641213297843933, + "learning_rate": 4.3878652046987824e-05, + "loss": 0.4275, "step": 34800 }, { - "epoch": 1.22, - "learning_rate": 4.4188617098965675e-05, - "loss": 0.2979, + "epoch": 1.2543698417846976, + "grad_norm": 0.19240440428256989, + "learning_rate": 4.387673891674379e-05, + "loss": 0.4089, "step": 34805 }, { - "epoch": 1.22, - "learning_rate": 4.4186790957013536e-05, - "loss": 0.2925, + "epoch": 1.254550041445922, + "grad_norm": 0.18231160938739777, + "learning_rate": 4.387482552930783e-05, + "loss": 0.4007, "step": 34810 }, { - "epoch": 1.22, - "learning_rate": 4.418496456593356e-05, - "loss": 0.2984, + "epoch": 1.2547302411071466, + "grad_norm": 0.15601201355457306, + "learning_rate": 4.387291188470603e-05, + "loss": 0.4185, "step": 34815 }, { - "epoch": 1.23, - "learning_rate": 4.418313792574946e-05, - "loss": 0.2933, + "epoch": 1.2549104407683713, + "grad_norm": 0.1767847090959549, + "learning_rate": 4.387099798296447e-05, + "loss": 0.4317, "step": 34820 }, { - "epoch": 1.23, - "learning_rate": 4.4181311036484955e-05, - "loss": 0.2885, + "epoch": 1.255090640429596, + "grad_norm": 0.1789582371711731, + "learning_rate": 4.386908382410922e-05, + "loss": 0.4234, "step": 34825 }, { - "epoch": 1.23, - "learning_rate": 4.417948389816377e-05, - "loss": 0.3076, + "epoch": 1.2552708400908206, + "grad_norm": 0.1394336223602295, + "learning_rate": 4.3867169408166333e-05, + "loss": 0.3885, "step": 34830 }, { - "epoch": 1.23, - "learning_rate": 4.417765651080963e-05, - "loss": 0.271, + "epoch": 1.2554510397520453, + "grad_norm": 0.1963426172733307, + "learning_rate": 4.3865254735161934e-05, + "loss": 0.3865, "step": 34835 }, { - "epoch": 1.23, - "learning_rate": 4.417582887444625e-05, - "loss": 0.3028, + "epoch": 1.2556312394132698, + 
"grad_norm": 0.19364525377750397, + "learning_rate": 4.3863339805122086e-05, + "loss": 0.4117, "step": 34840 }, { - "epoch": 1.23, - "learning_rate": 4.4174000989097375e-05, - "loss": 0.3184, + "epoch": 1.2558114390744946, + "grad_norm": 0.19295062124729156, + "learning_rate": 4.386142461807288e-05, + "loss": 0.4395, "step": 34845 }, { - "epoch": 1.23, - "learning_rate": 4.417217285478673e-05, - "loss": 0.2935, + "epoch": 1.2559916387357193, + "grad_norm": 0.16617421805858612, + "learning_rate": 4.385950917404042e-05, + "loss": 0.3931, "step": 34850 }, { - "epoch": 1.23, - "learning_rate": 4.417034447153806e-05, - "loss": 0.2841, + "epoch": 1.2561718383969438, + "grad_norm": 0.16993173956871033, + "learning_rate": 4.3857593473050804e-05, + "loss": 0.4174, "step": 34855 }, { - "epoch": 1.23, - "learning_rate": 4.41685158393751e-05, - "loss": 0.2943, + "epoch": 1.2563520380581685, + "grad_norm": 0.1860906183719635, + "learning_rate": 4.3855677515130125e-05, + "loss": 0.4128, "step": 34860 }, { - "epoch": 1.23, - "learning_rate": 4.41666869583216e-05, - "loss": 0.2861, + "epoch": 1.256532237719393, + "grad_norm": 0.17956295609474182, + "learning_rate": 4.385376130030448e-05, + "loss": 0.4174, "step": 34865 }, { - "epoch": 1.23, - "learning_rate": 4.416485782840129e-05, - "loss": 0.2744, + "epoch": 1.2567124373806178, + "grad_norm": 0.17807215452194214, + "learning_rate": 4.38518448286e-05, + "loss": 0.4009, "step": 34870 }, { - "epoch": 1.23, - "learning_rate": 4.4163028449637933e-05, - "loss": 0.2977, + "epoch": 1.2568926370418423, + "grad_norm": 0.15387162566184998, + "learning_rate": 4.384992810004278e-05, + "loss": 0.4004, "step": 34875 }, { - "epoch": 1.23, - "learning_rate": 4.416119882205528e-05, - "loss": 0.3126, + "epoch": 1.257072836703067, + "grad_norm": 0.1546095311641693, + "learning_rate": 4.3848011114658934e-05, + "loss": 0.3967, "step": 34880 }, { - "epoch": 1.23, - "learning_rate": 4.41593689456771e-05, - "loss": 0.2921, + "epoch": 1.2572530363642915, + "grad_norm": 0.18044285476207733, + "learning_rate": 4.384609387247459e-05, + "loss": 0.4163, "step": 34885 }, { - "epoch": 1.23, - "learning_rate": 4.415753882052713e-05, - "loss": 0.3003, + "epoch": 1.2574332360255163, + "grad_norm": 0.2026679515838623, + "learning_rate": 4.384417637351587e-05, + "loss": 0.4258, "step": 34890 }, { - "epoch": 1.23, - "learning_rate": 4.415570844662914e-05, - "loss": 0.27, + "epoch": 1.257613435686741, + "grad_norm": 0.15723782777786255, + "learning_rate": 4.3842258617808895e-05, + "loss": 0.395, "step": 34895 }, { - "epoch": 1.23, - "learning_rate": 4.415387782400691e-05, - "loss": 0.2767, + "epoch": 1.2577936353479655, + "grad_norm": 0.19395868480205536, + "learning_rate": 4.384034060537979e-05, + "loss": 0.425, "step": 34900 }, { - "epoch": 1.23, - "learning_rate": 4.415204695268419e-05, - "loss": 0.2865, + "epoch": 1.2579738350091902, + "grad_norm": 0.1924101859331131, + "learning_rate": 4.383842233625469e-05, + "loss": 0.432, "step": 34905 }, { - "epoch": 1.23, - "learning_rate": 4.415021583268477e-05, - "loss": 0.3064, + "epoch": 1.2581540346704148, + "grad_norm": 0.13849791884422302, + "learning_rate": 4.383650381045974e-05, + "loss": 0.4305, "step": 34910 }, { - "epoch": 1.23, - "learning_rate": 4.414838446403242e-05, - "loss": 0.2653, + "epoch": 1.2583342343316395, + "grad_norm": 0.16291098296642303, + "learning_rate": 4.383458502802107e-05, + "loss": 0.4011, "step": 34915 }, { - "epoch": 1.23, - "learning_rate": 4.414655284675091e-05, - "loss": 0.2878, + "epoch": 1.2585144339928642, 
+ "grad_norm": 0.19101671874523163, + "learning_rate": 4.383266598896482e-05, + "loss": 0.4103, "step": 34920 }, { - "epoch": 1.23, - "learning_rate": 4.414472098086403e-05, - "loss": 0.2892, + "epoch": 1.2586946336540887, + "grad_norm": 0.21897098422050476, + "learning_rate": 4.383074669331715e-05, + "loss": 0.4145, "step": 34925 }, { - "epoch": 1.23, - "learning_rate": 4.4142888866395566e-05, - "loss": 0.2682, + "epoch": 1.2588748333153132, + "grad_norm": 0.22563257813453674, + "learning_rate": 4.3828827141104186e-05, + "loss": 0.414, "step": 34930 }, { - "epoch": 1.23, - "learning_rate": 4.41410565033693e-05, - "loss": 0.3151, + "epoch": 1.259055032976538, + "grad_norm": 0.16169485449790955, + "learning_rate": 4.382690733235212e-05, + "loss": 0.4148, "step": 34935 }, { - "epoch": 1.23, - "learning_rate": 4.413922389180904e-05, - "loss": 0.2662, + "epoch": 1.2592352326377627, + "grad_norm": 0.20008553564548492, + "learning_rate": 4.382498726708707e-05, + "loss": 0.4406, "step": 34940 }, { - "epoch": 1.23, - "learning_rate": 4.413739103173856e-05, - "loss": 0.2859, + "epoch": 1.2594154322989872, + "grad_norm": 0.21007391810417175, + "learning_rate": 4.3823066945335225e-05, + "loss": 0.4308, "step": 34945 }, { - "epoch": 1.23, - "learning_rate": 4.4135557923181684e-05, - "loss": 0.302, + "epoch": 1.259595631960212, + "grad_norm": 0.17790304124355316, + "learning_rate": 4.3821146367122726e-05, + "loss": 0.3854, "step": 34950 }, { - "epoch": 1.23, - "learning_rate": 4.4133724566162195e-05, - "loss": 0.2936, + "epoch": 1.2597758316214365, + "grad_norm": 0.221872016787529, + "learning_rate": 4.381922553247576e-05, + "loss": 0.3961, "step": 34955 }, { - "epoch": 1.23, - "learning_rate": 4.413189096070389e-05, - "loss": 0.2965, + "epoch": 1.2599560312826612, + "grad_norm": 0.16703324019908905, + "learning_rate": 4.381730444142048e-05, + "loss": 0.4415, "step": 34960 }, { - "epoch": 1.23, - "learning_rate": 4.41300571068306e-05, - "loss": 0.2763, + "epoch": 1.260136230943886, + "grad_norm": 0.21276666224002838, + "learning_rate": 4.3815383093983084e-05, + "loss": 0.4316, "step": 34965 }, { - "epoch": 1.23, - "learning_rate": 4.412822300456613e-05, - "loss": 0.2747, + "epoch": 1.2603164306051104, + "grad_norm": 0.2033817023038864, + "learning_rate": 4.381346149018973e-05, + "loss": 0.409, "step": 34970 }, { - "epoch": 1.23, - "learning_rate": 4.4126388653934276e-05, - "loss": 0.2767, + "epoch": 1.260496630266335, + "grad_norm": 0.15015016496181488, + "learning_rate": 4.38115396300666e-05, + "loss": 0.4541, "step": 34975 }, { - "epoch": 1.23, - "learning_rate": 4.412455405495888e-05, - "loss": 0.295, + "epoch": 1.2606768299275597, + "grad_norm": 0.19646312296390533, + "learning_rate": 4.3809617513639886e-05, + "loss": 0.4066, "step": 34980 }, { - "epoch": 1.23, - "learning_rate": 4.412271920766375e-05, - "loss": 0.2844, + "epoch": 1.2608570295887844, + "grad_norm": 0.17837940156459808, + "learning_rate": 4.380769514093578e-05, + "loss": 0.4212, "step": 34985 }, { - "epoch": 1.23, - "learning_rate": 4.4120884112072714e-05, - "loss": 0.2889, + "epoch": 1.261037229250009, + "grad_norm": 0.20663444697856903, + "learning_rate": 4.380577251198047e-05, + "loss": 0.4459, "step": 34990 }, { - "epoch": 1.23, - "learning_rate": 4.41190487682096e-05, - "loss": 0.303, + "epoch": 1.2612174289112337, + "grad_norm": 0.20016391575336456, + "learning_rate": 4.380384962680015e-05, + "loss": 0.4125, "step": 34995 }, { - "epoch": 1.23, - "learning_rate": 4.411721317609824e-05, - "loss": 0.2971, + "epoch": 
1.2613976285724582, + "grad_norm": 0.20151177048683167, + "learning_rate": 4.380192648542101e-05, + "loss": 0.4607, "step": 35000 }, { - "epoch": 1.23, - "eval_loss": 0.2867205739021301, - "eval_runtime": 10.5517, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 1.2613976285724582, + "eval_loss": 0.44922810792922974, + "eval_runtime": 3.5387, + "eval_samples_per_second": 28.259, + "eval_steps_per_second": 7.065, "step": 35000 }, { - "epoch": 1.23, - "learning_rate": 4.4115377335762456e-05, - "loss": 0.3064, + "epoch": 1.261577828233683, + "grad_norm": 0.18609340488910675, + "learning_rate": 4.380000308786927e-05, + "loss": 0.4066, "step": 35005 }, { - "epoch": 1.23, - "learning_rate": 4.41135412472261e-05, - "loss": 0.2653, + "epoch": 1.2617580278949077, + "grad_norm": 0.18463589251041412, + "learning_rate": 4.3798079434171124e-05, + "loss": 0.4624, "step": 35010 }, { - "epoch": 1.23, - "learning_rate": 4.411170491051301e-05, - "loss": 0.295, + "epoch": 1.2619382275561322, + "grad_norm": 0.21645288169384003, + "learning_rate": 4.379615552435279e-05, + "loss": 0.4172, "step": 35015 }, { - "epoch": 1.23, - "learning_rate": 4.410986832564702e-05, - "loss": 0.2904, + "epoch": 1.262118427217357, + "grad_norm": 0.18142879009246826, + "learning_rate": 4.379423135844048e-05, + "loss": 0.445, "step": 35020 }, { - "epoch": 1.23, - "learning_rate": 4.410803149265199e-05, - "loss": 0.256, + "epoch": 1.2622986268785814, + "grad_norm": 0.17872460186481476, + "learning_rate": 4.379230693646039e-05, + "loss": 0.4329, "step": 35025 }, { - "epoch": 1.23, - "learning_rate": 4.4106194411551766e-05, - "loss": 0.2899, + "epoch": 1.2624788265398061, + "grad_norm": 0.16483642160892487, + "learning_rate": 4.3790382258438776e-05, + "loss": 0.436, "step": 35030 }, { - "epoch": 1.23, - "learning_rate": 4.410435708237019e-05, - "loss": 0.2897, + "epoch": 1.2626590262010309, + "grad_norm": 0.2143619805574417, + "learning_rate": 4.3788457324401826e-05, + "loss": 0.4184, "step": 35035 }, { - "epoch": 1.23, - "learning_rate": 4.410251950513113e-05, - "loss": 0.3124, + "epoch": 1.2628392258622554, + "grad_norm": 0.16062313318252563, + "learning_rate": 4.378653213437579e-05, + "loss": 0.4251, "step": 35040 }, { - "epoch": 1.23, - "learning_rate": 4.410068167985845e-05, - "loss": 0.2721, + "epoch": 1.26301942552348, + "grad_norm": 0.206187441945076, + "learning_rate": 4.3784606688386885e-05, + "loss": 0.4143, "step": 35045 }, { - "epoch": 1.23, - "learning_rate": 4.4098843606576004e-05, - "loss": 0.2894, + "epoch": 1.2631996251847046, + "grad_norm": 0.15511402487754822, + "learning_rate": 4.3782680986461356e-05, + "loss": 0.3726, "step": 35050 }, { - "epoch": 1.23, - "learning_rate": 4.4097005285307654e-05, - "loss": 0.2857, + "epoch": 1.2633798248459294, + "grad_norm": 0.15216435492038727, + "learning_rate": 4.3780755028625434e-05, + "loss": 0.407, "step": 35055 }, { - "epoch": 1.23, - "learning_rate": 4.409516671607729e-05, - "loss": 0.277, + "epoch": 1.2635600245071539, + "grad_norm": 0.1635829508304596, + "learning_rate": 4.377882881490536e-05, + "loss": 0.4166, "step": 35060 }, { - "epoch": 1.23, - "learning_rate": 4.409332789890876e-05, - "loss": 0.3018, + "epoch": 1.2637402241683786, + "grad_norm": 0.2377825379371643, + "learning_rate": 4.377690234532739e-05, + "loss": 0.4311, "step": 35065 }, { - "epoch": 1.23, - "learning_rate": 4.409148883382595e-05, - "loss": 0.2893, + "epoch": 1.2639204238296031, + "grad_norm": 0.1933947205543518, + "learning_rate": 4.3774975619917744e-05, + 
"loss": 0.4266, "step": 35070 }, { - "epoch": 1.23, - "learning_rate": 4.4089649520852736e-05, - "loss": 0.285, + "epoch": 1.2641006234908279, + "grad_norm": 0.1821034997701645, + "learning_rate": 4.3773048638702694e-05, + "loss": 0.4347, "step": 35075 }, { - "epoch": 1.23, - "learning_rate": 4.4087809960013013e-05, - "loss": 0.3, + "epoch": 1.2642808231520526, + "grad_norm": 0.21266570687294006, + "learning_rate": 4.3771121401708495e-05, + "loss": 0.4509, "step": 35080 }, { - "epoch": 1.23, - "learning_rate": 4.408597015133065e-05, - "loss": 0.2834, + "epoch": 1.264461022813277, + "grad_norm": 0.16414852440357208, + "learning_rate": 4.3769193908961405e-05, + "loss": 0.44, "step": 35085 }, { - "epoch": 1.23, - "learning_rate": 4.408413009482954e-05, - "loss": 0.3, + "epoch": 1.2646412224745016, + "grad_norm": 0.15532633662223816, + "learning_rate": 4.3767266160487675e-05, + "loss": 0.4466, "step": 35090 }, { - "epoch": 1.23, - "learning_rate": 4.4082289790533586e-05, - "loss": 0.2805, + "epoch": 1.2648214221357263, + "grad_norm": 0.1961180716753006, + "learning_rate": 4.376533815631357e-05, + "loss": 0.4613, "step": 35095 }, { - "epoch": 1.23, - "learning_rate": 4.408044923846667e-05, - "loss": 0.3079, + "epoch": 1.265001621796951, + "grad_norm": 0.18854406476020813, + "learning_rate": 4.3763409896465376e-05, + "loss": 0.4098, "step": 35100 }, { - "epoch": 1.24, - "learning_rate": 4.4078608438652705e-05, - "loss": 0.3199, + "epoch": 1.2651818214581756, + "grad_norm": 0.181373730301857, + "learning_rate": 4.376148138096936e-05, + "loss": 0.4146, "step": 35105 }, { - "epoch": 1.24, - "learning_rate": 4.407676739111557e-05, - "loss": 0.2876, + "epoch": 1.2653620211194003, + "grad_norm": 0.16534672677516937, + "learning_rate": 4.3759552609851785e-05, + "loss": 0.4341, "step": 35110 }, { - "epoch": 1.24, - "learning_rate": 4.407492609587919e-05, - "loss": 0.2647, + "epoch": 1.2655422207806248, + "grad_norm": 0.22548224031925201, + "learning_rate": 4.375762358313894e-05, + "loss": 0.3947, "step": 35115 }, { - "epoch": 1.24, - "learning_rate": 4.407308455296746e-05, - "loss": 0.3269, + "epoch": 1.2657224204418496, + "grad_norm": 0.18645992875099182, + "learning_rate": 4.37556943008571e-05, + "loss": 0.4356, "step": 35120 }, { - "epoch": 1.24, - "learning_rate": 4.4071242762404294e-05, - "loss": 0.2996, + "epoch": 1.2659026201030743, + "grad_norm": 0.1680716574192047, + "learning_rate": 4.375376476303256e-05, + "loss": 0.3941, "step": 35125 }, { - "epoch": 1.24, - "learning_rate": 4.406940072421362e-05, - "loss": 0.2991, + "epoch": 1.2660828197642988, + "grad_norm": 0.18578827381134033, + "learning_rate": 4.375183496969161e-05, + "loss": 0.4242, "step": 35130 }, { - "epoch": 1.24, - "learning_rate": 4.406755843841933e-05, - "loss": 0.2908, + "epoch": 1.2662630194255236, + "grad_norm": 0.1809617131948471, + "learning_rate": 4.374990492086053e-05, + "loss": 0.4275, "step": 35135 }, { - "epoch": 1.24, - "learning_rate": 4.406571590504537e-05, - "loss": 0.2929, + "epoch": 1.266443219086748, + "grad_norm": 0.21433116495609283, + "learning_rate": 4.3747974616565634e-05, + "loss": 0.4564, "step": 35140 }, { - "epoch": 1.24, - "learning_rate": 4.406387312411565e-05, - "loss": 0.3158, + "epoch": 1.2666234187479728, + "grad_norm": 0.18926386535167694, + "learning_rate": 4.3746044056833205e-05, + "loss": 0.4426, "step": 35145 }, { - "epoch": 1.24, - "learning_rate": 4.4062030095654104e-05, - "loss": 0.2799, + "epoch": 1.2668036184091975, + "grad_norm": 0.19316570460796356, + "learning_rate": 
4.3744113241689565e-05, + "loss": 0.4352, "step": 35150 }, { - "epoch": 1.24, - "learning_rate": 4.406018681968466e-05, - "loss": 0.2741, + "epoch": 1.266983818070422, + "grad_norm": 0.20651282370090485, + "learning_rate": 4.3742182171161005e-05, + "loss": 0.4239, "step": 35155 }, { - "epoch": 1.24, - "learning_rate": 4.405834329623125e-05, - "loss": 0.3071, + "epoch": 1.2671640177316466, + "grad_norm": 0.15775299072265625, + "learning_rate": 4.3740250845273845e-05, + "loss": 0.3809, "step": 35160 }, { - "epoch": 1.24, - "learning_rate": 4.405649952531781e-05, - "loss": 0.2802, + "epoch": 1.2673442173928713, + "grad_norm": 0.1828775703907013, + "learning_rate": 4.373831926405439e-05, + "loss": 0.4036, "step": 35165 }, { - "epoch": 1.24, - "learning_rate": 4.405465550696829e-05, - "loss": 0.2824, + "epoch": 1.267524417054096, + "grad_norm": 0.17841170728206635, + "learning_rate": 4.373638742752897e-05, + "loss": 0.4488, "step": 35170 }, { - "epoch": 1.24, - "learning_rate": 4.4052811241206614e-05, - "loss": 0.2678, + "epoch": 1.2677046167153205, + "grad_norm": 0.1723710149526596, + "learning_rate": 4.373445533572389e-05, + "loss": 0.4132, "step": 35175 }, { - "epoch": 1.24, - "learning_rate": 4.405096672805675e-05, - "loss": 0.2921, + "epoch": 1.2678848163765453, + "grad_norm": 0.2052830010652542, + "learning_rate": 4.373252298866549e-05, + "loss": 0.4367, "step": 35180 }, { - "epoch": 1.24, - "learning_rate": 4.404912196754264e-05, - "loss": 0.2858, + "epoch": 1.2680650160377698, + "grad_norm": 0.17017756402492523, + "learning_rate": 4.3730590386380086e-05, + "loss": 0.4235, "step": 35185 }, { - "epoch": 1.24, - "learning_rate": 4.404727695968823e-05, - "loss": 0.306, + "epoch": 1.2682452156989945, + "grad_norm": 0.1878291219472885, + "learning_rate": 4.372865752889402e-05, + "loss": 0.4508, "step": 35190 }, { - "epoch": 1.24, - "learning_rate": 4.4045431704517474e-05, - "loss": 0.3048, + "epoch": 1.2684254153602192, + "grad_norm": 0.1998129040002823, + "learning_rate": 4.3726724416233625e-05, + "loss": 0.4286, "step": 35195 }, { - "epoch": 1.24, - "learning_rate": 4.404358620205435e-05, - "loss": 0.2906, + "epoch": 1.2686056150214438, + "grad_norm": 0.1536918580532074, + "learning_rate": 4.372479104842522e-05, + "loss": 0.4135, "step": 35200 }, { - "epoch": 1.24, - "learning_rate": 4.40417404523228e-05, - "loss": 0.2993, + "epoch": 1.2687858146826683, + "grad_norm": 0.2284386157989502, + "learning_rate": 4.372285742549517e-05, + "loss": 0.4026, "step": 35205 }, { - "epoch": 1.24, - "learning_rate": 4.403989445534681e-05, - "loss": 0.2896, + "epoch": 1.268966014343893, + "grad_norm": 0.1657029688358307, + "learning_rate": 4.372092354746982e-05, + "loss": 0.4326, "step": 35210 }, { - "epoch": 1.24, - "learning_rate": 4.403804821115033e-05, - "loss": 0.3059, + "epoch": 1.2691462140051177, + "grad_norm": 0.1616412103176117, + "learning_rate": 4.37189894143755e-05, + "loss": 0.424, "step": 35215 }, { - "epoch": 1.24, - "learning_rate": 4.403620171975734e-05, - "loss": 0.286, + "epoch": 1.2693264136663422, + "grad_norm": 0.18977968394756317, + "learning_rate": 4.371705502623858e-05, + "loss": 0.4001, "step": 35220 }, { - "epoch": 1.24, - "learning_rate": 4.403435498119182e-05, - "loss": 0.324, + "epoch": 1.269506613327567, + "grad_norm": 0.20277686417102814, + "learning_rate": 4.371512038308541e-05, + "loss": 0.4667, "step": 35225 }, { - "epoch": 1.24, - "learning_rate": 4.403250799547774e-05, - "loss": 0.2888, + "epoch": 1.2696868129887915, + "grad_norm": 0.16292032599449158, + 
"learning_rate": 4.371318548494234e-05, + "loss": 0.4241, "step": 35230 }, { - "epoch": 1.24, - "learning_rate": 4.40306607626391e-05, - "loss": 0.2763, + "epoch": 1.2698670126500162, + "grad_norm": 0.18846076726913452, + "learning_rate": 4.3711250331835754e-05, + "loss": 0.3928, "step": 35235 }, { - "epoch": 1.24, - "learning_rate": 4.402881328269985e-05, - "loss": 0.3169, + "epoch": 1.270047212311241, + "grad_norm": 0.17085112631320953, + "learning_rate": 4.370931492379199e-05, + "loss": 0.4118, "step": 35240 }, { - "epoch": 1.24, - "learning_rate": 4.402696555568402e-05, - "loss": 0.3118, + "epoch": 1.2702274119724655, + "grad_norm": 0.18065665662288666, + "learning_rate": 4.3707379260837444e-05, + "loss": 0.4184, "step": 35245 }, { - "epoch": 1.24, - "learning_rate": 4.402511758161557e-05, - "loss": 0.2793, + "epoch": 1.2704076116336902, + "grad_norm": 0.278125137090683, + "learning_rate": 4.370544334299847e-05, + "loss": 0.4567, "step": 35250 }, { - "epoch": 1.24, - "learning_rate": 4.402326936051852e-05, - "loss": 0.3025, + "epoch": 1.2705878112949147, + "grad_norm": 0.1816800832748413, + "learning_rate": 4.3703507170301454e-05, + "loss": 0.4395, "step": 35255 }, { - "epoch": 1.24, - "learning_rate": 4.402142089241684e-05, - "loss": 0.261, + "epoch": 1.2707680109561394, + "grad_norm": 0.16282232105731964, + "learning_rate": 4.370157074277278e-05, + "loss": 0.3854, "step": 35260 }, { - "epoch": 1.24, - "learning_rate": 4.4019572177334545e-05, - "loss": 0.2901, + "epoch": 1.270948210617364, + "grad_norm": 0.1948549896478653, + "learning_rate": 4.369963406043881e-05, + "loss": 0.4173, "step": 35265 }, { - "epoch": 1.24, - "learning_rate": 4.401772321529565e-05, - "loss": 0.2884, + "epoch": 1.2711284102785887, + "grad_norm": 0.18401391804218292, + "learning_rate": 4.3697697123325956e-05, + "loss": 0.3982, "step": 35270 }, { - "epoch": 1.24, - "learning_rate": 4.401587400632414e-05, - "loss": 0.3042, + "epoch": 1.2713086099398132, + "grad_norm": 0.17478926479816437, + "learning_rate": 4.36957599314606e-05, + "loss": 0.4542, "step": 35275 }, { - "epoch": 1.24, - "learning_rate": 4.4014024550444044e-05, - "loss": 0.2762, + "epoch": 1.271488809601038, + "grad_norm": 0.20537783205509186, + "learning_rate": 4.369382248486914e-05, + "loss": 0.4193, "step": 35280 }, { - "epoch": 1.24, - "learning_rate": 4.401217484767937e-05, - "loss": 0.2782, + "epoch": 1.2716690092622627, + "grad_norm": 0.1872226595878601, + "learning_rate": 4.3691884783577966e-05, + "loss": 0.4263, "step": 35285 }, { - "epoch": 1.24, - "learning_rate": 4.401032489805413e-05, - "loss": 0.2859, + "epoch": 1.2718492089234872, + "grad_norm": 0.18654055893421173, + "learning_rate": 4.368994682761347e-05, + "loss": 0.4464, "step": 35290 }, { - "epoch": 1.24, - "learning_rate": 4.400847470159236e-05, - "loss": 0.2922, + "epoch": 1.272029408584712, + "grad_norm": 0.17078270018100739, + "learning_rate": 4.3688008617002076e-05, + "loss": 0.4367, "step": 35295 }, { - "epoch": 1.24, - "learning_rate": 4.400662425831807e-05, - "loss": 0.2677, + "epoch": 1.2722096082459364, + "grad_norm": 0.1947806477546692, + "learning_rate": 4.368607015177018e-05, + "loss": 0.4056, "step": 35300 }, { - "epoch": 1.24, - "learning_rate": 4.4004773568255284e-05, - "loss": 0.2656, + "epoch": 1.2723898079071612, + "grad_norm": 0.17518731951713562, + "learning_rate": 4.368413143194419e-05, + "loss": 0.4531, "step": 35305 }, { - "epoch": 1.24, - "learning_rate": 4.400292263142805e-05, - "loss": 0.2953, + "epoch": 1.272570007568386, + "grad_norm": 
0.19480538368225098, + "learning_rate": 4.368219245755053e-05, + "loss": 0.448, "step": 35310 }, { - "epoch": 1.24, - "learning_rate": 4.400107144786038e-05, - "loss": 0.2876, + "epoch": 1.2727502072296104, + "grad_norm": 0.15128688514232635, + "learning_rate": 4.368025322861562e-05, + "loss": 0.4321, "step": 35315 }, { - "epoch": 1.24, - "learning_rate": 4.399922001757631e-05, - "loss": 0.2982, + "epoch": 1.272930406890835, + "grad_norm": 0.16583947837352753, + "learning_rate": 4.367831374516588e-05, + "loss": 0.4399, "step": 35320 }, { - "epoch": 1.24, - "learning_rate": 4.3997368340599896e-05, - "loss": 0.2891, + "epoch": 1.2731106065520597, + "grad_norm": 0.22142437100410461, + "learning_rate": 4.3676374007227715e-05, + "loss": 0.4275, "step": 35325 }, { - "epoch": 1.24, - "learning_rate": 4.3995516416955187e-05, - "loss": 0.2719, + "epoch": 1.2732908062132844, + "grad_norm": 0.1897389143705368, + "learning_rate": 4.367443401482758e-05, + "loss": 0.4079, "step": 35330 }, { - "epoch": 1.24, - "learning_rate": 4.39936642466662e-05, - "loss": 0.3066, + "epoch": 1.273471005874509, + "grad_norm": 0.18949352204799652, + "learning_rate": 4.36724937679919e-05, + "loss": 0.4659, "step": 35335 }, { - "epoch": 1.24, - "learning_rate": 4.399181182975701e-05, - "loss": 0.2932, + "epoch": 1.2736512055357336, + "grad_norm": 0.1753978729248047, + "learning_rate": 4.367055326674711e-05, + "loss": 0.4273, "step": 35340 }, { - "epoch": 1.24, - "learning_rate": 4.398995916625165e-05, - "loss": 0.3074, + "epoch": 1.2738314051969581, + "grad_norm": 0.20065420866012573, + "learning_rate": 4.366861251111963e-05, + "loss": 0.4463, "step": 35345 }, { - "epoch": 1.24, - "learning_rate": 4.398810625617419e-05, - "loss": 0.294, + "epoch": 1.2740116048581829, + "grad_norm": 0.19253739714622498, + "learning_rate": 4.366667150113594e-05, + "loss": 0.4188, "step": 35350 }, { - "epoch": 1.24, - "learning_rate": 4.398625309954868e-05, - "loss": 0.2983, + "epoch": 1.2741918045194076, + "grad_norm": 0.1715114414691925, + "learning_rate": 4.366473023682245e-05, + "loss": 0.4355, "step": 35355 }, { - "epoch": 1.24, - "learning_rate": 4.398439969639919e-05, - "loss": 0.2994, + "epoch": 1.2743720041806321, + "grad_norm": 0.17242814600467682, + "learning_rate": 4.3662788718205625e-05, + "loss": 0.4026, "step": 35360 }, { - "epoch": 1.24, - "learning_rate": 4.398254604674978e-05, - "loss": 0.265, + "epoch": 1.2745522038418569, + "grad_norm": 0.20738811790943146, + "learning_rate": 4.366084694531192e-05, + "loss": 0.4046, "step": 35365 }, { - "epoch": 1.24, - "learning_rate": 4.3980692150624524e-05, - "loss": 0.2759, + "epoch": 1.2747324035030814, + "grad_norm": 0.19424347579479218, + "learning_rate": 4.365890491816779e-05, + "loss": 0.4434, "step": 35370 }, { - "epoch": 1.24, - "learning_rate": 4.397883800804749e-05, - "loss": 0.2845, + "epoch": 1.274912603164306, + "grad_norm": 0.21286608278751373, + "learning_rate": 4.365696263679969e-05, + "loss": 0.4375, "step": 35375 }, { - "epoch": 1.24, - "learning_rate": 4.397698361904274e-05, - "loss": 0.2766, + "epoch": 1.2750928028255306, + "grad_norm": 0.20882990956306458, + "learning_rate": 4.365502010123409e-05, + "loss": 0.4316, "step": 35380 }, { - "epoch": 1.24, - "learning_rate": 4.397512898363437e-05, - "loss": 0.2817, + "epoch": 1.2752730024867553, + "grad_norm": 0.21594053506851196, + "learning_rate": 4.365307731149745e-05, + "loss": 0.3852, "step": 35385 }, { - "epoch": 1.25, - "learning_rate": 4.397327410184646e-05, - "loss": 0.293, + "epoch": 1.2754532021479799, + 
"grad_norm": 0.1830471307039261, + "learning_rate": 4.3651134267616244e-05, + "loss": 0.4404, "step": 35390 }, { - "epoch": 1.25, - "learning_rate": 4.3971418973703085e-05, - "loss": 0.3059, + "epoch": 1.2756334018092046, + "grad_norm": 0.17027118802070618, + "learning_rate": 4.3649190969616946e-05, + "loss": 0.4043, "step": 35395 }, { - "epoch": 1.25, - "learning_rate": 4.396956359922834e-05, - "loss": 0.2993, + "epoch": 1.2758136014704293, + "grad_norm": 0.16624897718429565, + "learning_rate": 4.364724741752603e-05, + "loss": 0.4311, "step": 35400 }, { - "epoch": 1.25, - "learning_rate": 4.3967707978446316e-05, - "loss": 0.2904, + "epoch": 1.2759938011316538, + "grad_norm": 0.17142795026302338, + "learning_rate": 4.364530361136998e-05, + "loss": 0.4524, "step": 35405 }, { - "epoch": 1.25, - "learning_rate": 4.39658521113811e-05, - "loss": 0.2725, + "epoch": 1.2761740007928786, + "grad_norm": 0.197305366396904, + "learning_rate": 4.364335955117528e-05, + "loss": 0.4254, "step": 35410 }, { - "epoch": 1.25, - "learning_rate": 4.3963995998056795e-05, - "loss": 0.2539, + "epoch": 1.276354200454103, + "grad_norm": 0.1855136901140213, + "learning_rate": 4.364141523696841e-05, + "loss": 0.4182, "step": 35415 }, { - "epoch": 1.25, - "learning_rate": 4.396213963849749e-05, - "loss": 0.3026, + "epoch": 1.2765344001153278, + "grad_norm": 0.1479266881942749, + "learning_rate": 4.3639470668775865e-05, + "loss": 0.3851, "step": 35420 }, { - "epoch": 1.25, - "learning_rate": 4.3960283032727313e-05, - "loss": 0.3028, + "epoch": 1.2767145997765526, + "grad_norm": 0.18044587969779968, + "learning_rate": 4.363752584662415e-05, + "loss": 0.4205, "step": 35425 }, { - "epoch": 1.25, - "learning_rate": 4.395842618077034e-05, - "loss": 0.2885, + "epoch": 1.276894799437777, + "grad_norm": 0.2061435431241989, + "learning_rate": 4.3635580770539744e-05, + "loss": 0.4146, "step": 35430 }, { - "epoch": 1.25, - "learning_rate": 4.395656908265071e-05, - "loss": 0.2749, + "epoch": 1.2770749990990016, + "grad_norm": 0.2000425010919571, + "learning_rate": 4.363363544054916e-05, + "loss": 0.4167, "step": 35435 }, { - "epoch": 1.25, - "learning_rate": 4.3954711738392526e-05, - "loss": 0.3019, + "epoch": 1.2772551987602263, + "grad_norm": 0.18255402147769928, + "learning_rate": 4.3631689856678905e-05, + "loss": 0.393, "step": 35440 }, { - "epoch": 1.25, - "learning_rate": 4.395285414801989e-05, - "loss": 0.3046, + "epoch": 1.277435398421451, + "grad_norm": 0.20933406054973602, + "learning_rate": 4.362974401895547e-05, + "loss": 0.4177, "step": 35445 }, { - "epoch": 1.25, - "learning_rate": 4.395099631155694e-05, - "loss": 0.2637, + "epoch": 1.2776155980826756, + "grad_norm": 0.1986079066991806, + "learning_rate": 4.36277979274054e-05, + "loss": 0.4176, "step": 35450 }, { - "epoch": 1.25, - "learning_rate": 4.3949138229027795e-05, - "loss": 0.2905, + "epoch": 1.2777957977439003, + "grad_norm": 0.21557162702083588, + "learning_rate": 4.3625851582055174e-05, + "loss": 0.3766, "step": 35455 }, { - "epoch": 1.25, - "learning_rate": 4.394727990045657e-05, - "loss": 0.2931, + "epoch": 1.2779759974051248, + "grad_norm": 0.17857031524181366, + "learning_rate": 4.362390498293134e-05, + "loss": 0.4071, "step": 35460 }, { - "epoch": 1.25, - "learning_rate": 4.394542132586741e-05, - "loss": 0.3078, + "epoch": 1.2781561970663495, + "grad_norm": 0.19413699209690094, + "learning_rate": 4.362195813006039e-05, + "loss": 0.4288, "step": 35465 }, { - "epoch": 1.25, - "learning_rate": 4.394356250528443e-05, - "loss": 0.2762, + "epoch": 
1.2783363967275743, + "grad_norm": 0.16761010885238647, + "learning_rate": 4.362001102346888e-05, + "loss": 0.4273, "step": 35470 }, { - "epoch": 1.25, - "learning_rate": 4.394170343873178e-05, - "loss": 0.2756, + "epoch": 1.2785165963887988, + "grad_norm": 0.20636701583862305, + "learning_rate": 4.3618063663183315e-05, + "loss": 0.4188, "step": 35475 }, { - "epoch": 1.25, - "learning_rate": 4.39398441262336e-05, - "loss": 0.2865, + "epoch": 1.2786967960500233, + "grad_norm": 0.22335347533226013, + "learning_rate": 4.361611604923025e-05, + "loss": 0.4317, "step": 35480 }, { - "epoch": 1.25, - "learning_rate": 4.393798456781402e-05, - "loss": 0.2863, + "epoch": 1.278876995711248, + "grad_norm": 0.23554205894470215, + "learning_rate": 4.361416818163619e-05, + "loss": 0.4478, "step": 35485 }, { - "epoch": 1.25, - "learning_rate": 4.393612476349719e-05, - "loss": 0.2765, + "epoch": 1.2790571953724728, + "grad_norm": 0.22054627537727356, + "learning_rate": 4.361222006042771e-05, + "loss": 0.3942, "step": 35490 }, { - "epoch": 1.25, - "learning_rate": 4.393426471330726e-05, - "loss": 0.2794, + "epoch": 1.2792373950336973, + "grad_norm": 0.19049589335918427, + "learning_rate": 4.361027168563132e-05, + "loss": 0.4183, "step": 35495 }, { - "epoch": 1.25, - "learning_rate": 4.393240441726838e-05, - "loss": 0.3137, + "epoch": 1.279417594694922, + "grad_norm": 0.1891845464706421, + "learning_rate": 4.360832305727359e-05, + "loss": 0.4341, "step": 35500 }, { - "epoch": 1.25, - "eval_loss": 0.285795658826828, - "eval_runtime": 10.5526, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 1.279417594694922, + "eval_loss": 0.44991669058799744, + "eval_runtime": 3.8804, + "eval_samples_per_second": 25.771, + "eval_steps_per_second": 6.443, "step": 35500 }, { - "epoch": 1.25, - "learning_rate": 4.393054387540471e-05, - "loss": 0.2958, + "epoch": 1.2795977943561465, + "grad_norm": 0.16972802579402924, + "learning_rate": 4.360637417538106e-05, + "loss": 0.3922, "step": 35505 }, { - "epoch": 1.25, - "learning_rate": 4.3928683087740394e-05, - "loss": 0.27, + "epoch": 1.2797779940173712, + "grad_norm": 0.15279355645179749, + "learning_rate": 4.360442503998028e-05, + "loss": 0.387, "step": 35510 }, { - "epoch": 1.25, - "learning_rate": 4.392682205429962e-05, - "loss": 0.296, + "epoch": 1.279958193678596, + "grad_norm": 0.19364698231220245, + "learning_rate": 4.360247565109782e-05, + "loss": 0.4193, "step": 35515 }, { - "epoch": 1.25, - "learning_rate": 4.392496077510652e-05, - "loss": 0.3007, + "epoch": 1.2801383933398205, + "grad_norm": 0.13126322627067566, + "learning_rate": 4.3600526008760226e-05, + "loss": 0.3859, "step": 35520 }, { - "epoch": 1.25, - "learning_rate": 4.392309925018528e-05, - "loss": 0.3007, + "epoch": 1.2803185930010452, + "grad_norm": 0.18221980333328247, + "learning_rate": 4.359857611299406e-05, + "loss": 0.4055, "step": 35525 }, { - "epoch": 1.25, - "learning_rate": 4.3921237479560066e-05, - "loss": 0.2779, + "epoch": 1.2804987926622697, + "grad_norm": 0.13313446938991547, + "learning_rate": 4.35966259638259e-05, + "loss": 0.4021, "step": 35530 }, { - "epoch": 1.25, - "learning_rate": 4.3919375463255055e-05, - "loss": 0.2885, + "epoch": 1.2806789923234945, + "grad_norm": 0.20920483767986298, + "learning_rate": 4.359467556128232e-05, + "loss": 0.4436, "step": 35535 }, { - "epoch": 1.25, - "learning_rate": 4.391751320129442e-05, - "loss": 0.2823, + "epoch": 1.2808591919847192, + "grad_norm": 0.15556688606739044, + "learning_rate": 4.359272490538987e-05, + "loss": 
0.3783, "step": 35540 }, { - "epoch": 1.25, - "learning_rate": 4.3915650693702345e-05, - "loss": 0.2783, + "epoch": 1.2810393916459437, + "grad_norm": 0.17486773431301117, + "learning_rate": 4.359077399617515e-05, + "loss": 0.4021, "step": 35545 }, { - "epoch": 1.25, - "learning_rate": 4.391378794050302e-05, - "loss": 0.29, + "epoch": 1.2812195913071682, + "grad_norm": 0.1680872142314911, + "learning_rate": 4.358882283366473e-05, + "loss": 0.3872, "step": 35550 }, { - "epoch": 1.25, - "learning_rate": 4.391192494172061e-05, - "loss": 0.2846, + "epoch": 1.281399790968393, + "grad_norm": 0.17405681312084198, + "learning_rate": 4.3586871417885204e-05, + "loss": 0.4278, "step": 35555 }, { - "epoch": 1.25, - "learning_rate": 4.391006169737932e-05, - "loss": 0.2778, + "epoch": 1.2815799906296177, + "grad_norm": 0.1495596170425415, + "learning_rate": 4.358491974886315e-05, + "loss": 0.4176, "step": 35560 }, { - "epoch": 1.25, - "learning_rate": 4.390819820750334e-05, - "loss": 0.2847, + "epoch": 1.2817601902908422, + "grad_norm": 0.188494473695755, + "learning_rate": 4.358296782662517e-05, + "loss": 0.4321, "step": 35565 }, { - "epoch": 1.25, - "learning_rate": 4.390633447211687e-05, - "loss": 0.3156, + "epoch": 1.281940389952067, + "grad_norm": 0.20558683574199677, + "learning_rate": 4.358101565119784e-05, + "loss": 0.4239, "step": 35570 }, { - "epoch": 1.25, - "learning_rate": 4.390447049124411e-05, - "loss": 0.3048, + "epoch": 1.2821205896132915, + "grad_norm": 0.16552236676216125, + "learning_rate": 4.3579063222607776e-05, + "loss": 0.4232, "step": 35575 }, { - "epoch": 1.25, - "learning_rate": 4.390260626490926e-05, - "loss": 0.2952, + "epoch": 1.2823007892745162, + "grad_norm": 0.17347842454910278, + "learning_rate": 4.357711054088157e-05, + "loss": 0.4082, "step": 35580 }, { - "epoch": 1.25, - "learning_rate": 4.390074179313652e-05, - "loss": 0.2893, + "epoch": 1.282480988935741, + "grad_norm": 0.17773760855197906, + "learning_rate": 4.357515760604583e-05, + "loss": 0.3855, "step": 35585 }, { - "epoch": 1.25, - "learning_rate": 4.389887707595011e-05, - "loss": 0.3191, + "epoch": 1.2826611885969654, + "grad_norm": 0.16329875588417053, + "learning_rate": 4.3573204418127165e-05, + "loss": 0.3963, "step": 35590 }, { - "epoch": 1.25, - "learning_rate": 4.3897012113374224e-05, - "loss": 0.2988, + "epoch": 1.28284138825819, + "grad_norm": 0.195121169090271, + "learning_rate": 4.357125097715218e-05, + "loss": 0.4459, "step": 35595 }, { - "epoch": 1.25, - "learning_rate": 4.3895146905433096e-05, - "loss": 0.3018, + "epoch": 1.2830215879194147, + "grad_norm": 0.22871240973472595, + "learning_rate": 4.35692972831475e-05, + "loss": 0.4106, "step": 35600 }, { - "epoch": 1.25, - "learning_rate": 4.389328145215094e-05, - "loss": 0.2972, + "epoch": 1.2832017875806394, + "grad_norm": 0.13964316248893738, + "learning_rate": 4.356734333613974e-05, + "loss": 0.4066, "step": 35605 }, { - "epoch": 1.25, - "learning_rate": 4.389141575355198e-05, - "loss": 0.2698, + "epoch": 1.283381987241864, + "grad_norm": 0.16373074054718018, + "learning_rate": 4.356538913615553e-05, + "loss": 0.4039, "step": 35610 }, { - "epoch": 1.25, - "learning_rate": 4.3889549809660425e-05, - "loss": 0.2763, + "epoch": 1.2835621869030887, + "grad_norm": 0.18498462438583374, + "learning_rate": 4.3563434683221475e-05, + "loss": 0.4177, "step": 35615 }, { - "epoch": 1.25, - "learning_rate": 4.388768362050052e-05, - "loss": 0.278, + "epoch": 1.2837423865643132, + "grad_norm": 0.2500152289867401, + "learning_rate": 4.356147997736422e-05, + 
"loss": 0.4178, "step": 35620 }, { - "epoch": 1.25, - "learning_rate": 4.388581718609649e-05, - "loss": 0.2908, + "epoch": 1.283922586225538, + "grad_norm": 0.17622318863868713, + "learning_rate": 4.3559525018610395e-05, + "loss": 0.3932, "step": 35625 }, { - "epoch": 1.25, - "learning_rate": 4.388395050647257e-05, - "loss": 0.2866, + "epoch": 1.2841027858867626, + "grad_norm": 0.16591259837150574, + "learning_rate": 4.355756980698664e-05, + "loss": 0.402, "step": 35630 }, { - "epoch": 1.25, - "learning_rate": 4.3882083581652996e-05, - "loss": 0.3039, + "epoch": 1.2842829855479871, + "grad_norm": 0.22070670127868652, + "learning_rate": 4.355561434251958e-05, + "loss": 0.4183, "step": 35635 }, { - "epoch": 1.25, - "learning_rate": 4.3880216411662015e-05, - "loss": 0.2785, + "epoch": 1.2844631852092119, + "grad_norm": 0.205961674451828, + "learning_rate": 4.3553658625235874e-05, + "loss": 0.4136, "step": 35640 }, { - "epoch": 1.25, - "learning_rate": 4.387834899652387e-05, - "loss": 0.2908, + "epoch": 1.2846433848704364, + "grad_norm": 0.1596502810716629, + "learning_rate": 4.355170265516216e-05, + "loss": 0.4191, "step": 35645 }, { - "epoch": 1.25, - "learning_rate": 4.38764813362628e-05, - "loss": 0.3119, + "epoch": 1.2848235845316611, + "grad_norm": 0.15234126150608063, + "learning_rate": 4.354974643232508e-05, + "loss": 0.4161, "step": 35650 }, { - "epoch": 1.25, - "learning_rate": 4.387461343090305e-05, - "loss": 0.3132, + "epoch": 1.2850037841928856, + "grad_norm": 0.181291401386261, + "learning_rate": 4.354778995675131e-05, + "loss": 0.4216, "step": 35655 }, { - "epoch": 1.25, - "learning_rate": 4.38727452804689e-05, - "loss": 0.2776, + "epoch": 1.2851839838541104, + "grad_norm": 0.16666920483112335, + "learning_rate": 4.354583322846748e-05, + "loss": 0.4508, "step": 35660 }, { - "epoch": 1.25, - "learning_rate": 4.387087688498458e-05, - "loss": 0.2854, + "epoch": 1.2853641835153349, + "grad_norm": 0.22122518718242645, + "learning_rate": 4.354387624750027e-05, + "loss": 0.413, "step": 35665 }, { - "epoch": 1.25, - "learning_rate": 4.386900824447436e-05, - "loss": 0.2701, + "epoch": 1.2855443831765596, + "grad_norm": 0.18632946908473969, + "learning_rate": 4.354191901387634e-05, + "loss": 0.42, "step": 35670 }, { - "epoch": 1.26, - "learning_rate": 4.3867139358962506e-05, - "loss": 0.2914, + "epoch": 1.2857245828377843, + "grad_norm": 0.16238003969192505, + "learning_rate": 4.3539961527622345e-05, + "loss": 0.3928, "step": 35675 }, { - "epoch": 1.26, - "learning_rate": 4.386527022847328e-05, - "loss": 0.2743, + "epoch": 1.2859047824990089, + "grad_norm": 0.21126984059810638, + "learning_rate": 4.353800378876497e-05, + "loss": 0.3968, "step": 35680 }, { - "epoch": 1.26, - "learning_rate": 4.3863400853030954e-05, - "loss": 0.3009, + "epoch": 1.2860849821602336, + "grad_norm": 0.17129042744636536, + "learning_rate": 4.3536045797330885e-05, + "loss": 0.4172, "step": 35685 }, { - "epoch": 1.26, - "learning_rate": 4.3861531232659796e-05, - "loss": 0.2845, + "epoch": 1.286265181821458, + "grad_norm": 0.18168915808200836, + "learning_rate": 4.353408755334676e-05, + "loss": 0.4172, "step": 35690 }, { - "epoch": 1.26, - "learning_rate": 4.3859661367384084e-05, - "loss": 0.2956, + "epoch": 1.2864453814826828, + "grad_norm": 0.15571817755699158, + "learning_rate": 4.3532129056839274e-05, + "loss": 0.3766, "step": 35695 }, { - "epoch": 1.26, - "learning_rate": 4.385779125722811e-05, - "loss": 0.2896, + "epoch": 1.2866255811439076, + "grad_norm": 0.2171332985162735, + "learning_rate": 
4.353017030783513e-05, + "loss": 0.4286, "step": 35700 }, { - "epoch": 1.26, - "learning_rate": 4.385592090221613e-05, - "loss": 0.2897, + "epoch": 1.286805780805132, + "grad_norm": 0.1743398904800415, + "learning_rate": 4.3528211306360986e-05, + "loss": 0.4258, "step": 35705 }, { - "epoch": 1.26, - "learning_rate": 4.3854050302372453e-05, - "loss": 0.299, + "epoch": 1.2869859804663566, + "grad_norm": 0.1896362006664276, + "learning_rate": 4.352625205244357e-05, + "loss": 0.4056, "step": 35710 }, { - "epoch": 1.26, - "learning_rate": 4.3852179457721354e-05, - "loss": 0.2715, + "epoch": 1.2871661801275813, + "grad_norm": 0.1765444278717041, + "learning_rate": 4.352429254610955e-05, + "loss": 0.4137, "step": 35715 }, { - "epoch": 1.26, - "learning_rate": 4.385030836828713e-05, - "loss": 0.3069, + "epoch": 1.287346379788806, + "grad_norm": 0.20674960315227509, + "learning_rate": 4.352233278738562e-05, + "loss": 0.4382, "step": 35720 }, { - "epoch": 1.26, - "learning_rate": 4.384843703409407e-05, - "loss": 0.2927, + "epoch": 1.2875265794500306, + "grad_norm": 0.1722467541694641, + "learning_rate": 4.35203727762985e-05, + "loss": 0.429, "step": 35725 }, { - "epoch": 1.26, - "learning_rate": 4.384656545516649e-05, - "loss": 0.3173, + "epoch": 1.2877067791112553, + "grad_norm": 0.1646967977285385, + "learning_rate": 4.3518412512874885e-05, + "loss": 0.3841, "step": 35730 }, { - "epoch": 1.26, - "learning_rate": 4.384469363152867e-05, - "loss": 0.2901, + "epoch": 1.2878869787724798, + "grad_norm": 0.19040453433990479, + "learning_rate": 4.3516451997141485e-05, + "loss": 0.4182, "step": 35735 }, { - "epoch": 1.26, - "learning_rate": 4.3842821563204916e-05, - "loss": 0.2724, + "epoch": 1.2880671784337046, + "grad_norm": 0.1459205448627472, + "learning_rate": 4.3514491229125015e-05, + "loss": 0.3975, "step": 35740 }, { - "epoch": 1.26, - "learning_rate": 4.3840949250219546e-05, - "loss": 0.2853, + "epoch": 1.2882473780949293, + "grad_norm": 0.1318344622850418, + "learning_rate": 4.3512530208852185e-05, + "loss": 0.4007, "step": 35745 }, { - "epoch": 1.26, - "learning_rate": 4.383907669259687e-05, - "loss": 0.2798, + "epoch": 1.2884275777561538, + "grad_norm": 0.21362341940402985, + "learning_rate": 4.3510568936349714e-05, + "loss": 0.4328, "step": 35750 }, { - "epoch": 1.26, - "learning_rate": 4.383720389036119e-05, - "loss": 0.2963, + "epoch": 1.2886077774173785, + "grad_norm": 0.2101558893918991, + "learning_rate": 4.350860741164432e-05, + "loss": 0.468, "step": 35755 }, { - "epoch": 1.26, - "learning_rate": 4.383533084353684e-05, - "loss": 0.2845, + "epoch": 1.288787977078603, + "grad_norm": 0.1813066005706787, + "learning_rate": 4.350664563476274e-05, + "loss": 0.4452, "step": 35760 }, { - "epoch": 1.26, - "learning_rate": 4.383345755214813e-05, - "loss": 0.2914, + "epoch": 1.2889681767398278, + "grad_norm": 0.1613730937242508, + "learning_rate": 4.35046836057317e-05, + "loss": 0.4398, "step": 35765 }, { - "epoch": 1.26, - "learning_rate": 4.3831584016219385e-05, - "loss": 0.2825, + "epoch": 1.2891483764010523, + "grad_norm": 0.17162039875984192, + "learning_rate": 4.350272132457792e-05, + "loss": 0.4311, "step": 35770 }, { - "epoch": 1.26, - "learning_rate": 4.382971023577493e-05, - "loss": 0.3114, + "epoch": 1.289328576062277, + "grad_norm": 0.16017718613147736, + "learning_rate": 4.350075879132815e-05, + "loss": 0.413, "step": 35775 }, { - "epoch": 1.26, - "learning_rate": 4.382783621083911e-05, - "loss": 0.3057, + "epoch": 1.2895087757235015, + "grad_norm": 0.16685138642787933, + 
"learning_rate": 4.349879600600912e-05, + "loss": 0.4324, "step": 35780 }, { - "epoch": 1.26, - "learning_rate": 4.3825961941436236e-05, - "loss": 0.27, + "epoch": 1.2896889753847263, + "grad_norm": 0.18246233463287354, + "learning_rate": 4.349683296864758e-05, + "loss": 0.4127, "step": 35785 }, { - "epoch": 1.26, - "learning_rate": 4.3824087427590645e-05, - "loss": 0.2814, + "epoch": 1.289869175045951, + "grad_norm": 0.17236393690109253, + "learning_rate": 4.349486967927027e-05, + "loss": 0.4159, "step": 35790 }, { - "epoch": 1.26, - "learning_rate": 4.382221266932669e-05, - "loss": 0.2841, + "epoch": 1.2900493747071755, + "grad_norm": 0.17806027829647064, + "learning_rate": 4.349290613790393e-05, + "loss": 0.4077, "step": 35795 }, { - "epoch": 1.26, - "learning_rate": 4.382033766666871e-05, - "loss": 0.2814, + "epoch": 1.2902295743684002, + "grad_norm": 0.17833392322063446, + "learning_rate": 4.3490942344575336e-05, + "loss": 0.4335, "step": 35800 }, { - "epoch": 1.26, - "learning_rate": 4.381846241964105e-05, - "loss": 0.2871, + "epoch": 1.2904097740296248, + "grad_norm": 0.2036380022764206, + "learning_rate": 4.348897829931123e-05, + "loss": 0.4366, "step": 35805 }, { - "epoch": 1.26, - "learning_rate": 4.381658692826805e-05, - "loss": 0.2807, + "epoch": 1.2905899736908495, + "grad_norm": 0.1541140377521515, + "learning_rate": 4.348701400213838e-05, + "loss": 0.4208, "step": 35810 }, { - "epoch": 1.26, - "learning_rate": 4.3814711192574075e-05, - "loss": 0.2895, + "epoch": 1.2907701733520742, + "grad_norm": 0.19483250379562378, + "learning_rate": 4.3485049453083536e-05, + "loss": 0.3919, "step": 35815 }, { - "epoch": 1.26, - "learning_rate": 4.381283521258347e-05, - "loss": 0.3185, + "epoch": 1.2909503730132987, + "grad_norm": 0.1808329075574875, + "learning_rate": 4.348308465217348e-05, + "loss": 0.4261, "step": 35820 }, { - "epoch": 1.26, - "learning_rate": 4.38109589883206e-05, - "loss": 0.2811, + "epoch": 1.2911305726745232, + "grad_norm": 0.208456888794899, + "learning_rate": 4.348111959943496e-05, + "loss": 0.39, "step": 35825 }, { - "epoch": 1.26, - "learning_rate": 4.3809082519809826e-05, - "loss": 0.291, + "epoch": 1.291310772335748, + "grad_norm": 0.19897425174713135, + "learning_rate": 4.3479154294894774e-05, + "loss": 0.4752, "step": 35830 }, { - "epoch": 1.26, - "learning_rate": 4.380720580707551e-05, - "loss": 0.2967, + "epoch": 1.2914909719969727, + "grad_norm": 0.16709640622138977, + "learning_rate": 4.347718873857969e-05, + "loss": 0.4283, "step": 35835 }, { - "epoch": 1.26, - "learning_rate": 4.380532885014202e-05, - "loss": 0.2848, + "epoch": 1.2916711716581972, + "grad_norm": 0.19628377258777618, + "learning_rate": 4.347522293051648e-05, + "loss": 0.4476, "step": 35840 }, { - "epoch": 1.26, - "learning_rate": 4.380345164903373e-05, - "loss": 0.2688, + "epoch": 1.291851371319422, + "grad_norm": 0.19053253531455994, + "learning_rate": 4.3473256870731935e-05, + "loss": 0.4143, "step": 35845 }, { - "epoch": 1.26, - "learning_rate": 4.380157420377501e-05, - "loss": 0.2852, + "epoch": 1.2920315709806465, + "grad_norm": 0.16357672214508057, + "learning_rate": 4.347129055925285e-05, + "loss": 0.4099, "step": 35850 }, { - "epoch": 1.26, - "learning_rate": 4.379969651439024e-05, - "loss": 0.297, + "epoch": 1.2922117706418712, + "grad_norm": 0.17562684416770935, + "learning_rate": 4.3469323996106e-05, + "loss": 0.3779, "step": 35855 }, { - "epoch": 1.26, - "learning_rate": 4.3797818580903806e-05, - "loss": 0.2875, + "epoch": 1.292391970303096, + "grad_norm": 
0.21830067038536072, + "learning_rate": 4.346735718131819e-05, + "loss": 0.4124, "step": 35860 }, { - "epoch": 1.26, - "learning_rate": 4.3795940403340076e-05, - "loss": 0.2677, + "epoch": 1.2925721699643204, + "grad_norm": 0.21300473809242249, + "learning_rate": 4.3465390114916206e-05, + "loss": 0.3939, "step": 35865 }, { - "epoch": 1.26, - "learning_rate": 4.379406198172345e-05, - "loss": 0.2658, + "epoch": 1.2927523696255452, + "grad_norm": 0.15682797133922577, + "learning_rate": 4.3463422796926864e-05, + "loss": 0.3946, "step": 35870 }, { - "epoch": 1.26, - "learning_rate": 4.3792183316078324e-05, - "loss": 0.2732, + "epoch": 1.2929325692867697, + "grad_norm": 0.1903832107782364, + "learning_rate": 4.3461455227376956e-05, + "loss": 0.4147, "step": 35875 }, { - "epoch": 1.26, - "learning_rate": 4.379030440642907e-05, - "loss": 0.3065, + "epoch": 1.2931127689479944, + "grad_norm": 0.1733843833208084, + "learning_rate": 4.3459487406293296e-05, + "loss": 0.4034, "step": 35880 }, { - "epoch": 1.26, - "learning_rate": 4.3788425252800104e-05, - "loss": 0.294, + "epoch": 1.293292968609219, + "grad_norm": 0.193365678191185, + "learning_rate": 4.34575193337027e-05, + "loss": 0.4185, "step": 35885 }, { - "epoch": 1.26, - "learning_rate": 4.378654585521582e-05, - "loss": 0.2804, + "epoch": 1.2934731682704437, + "grad_norm": 0.19296406209468842, + "learning_rate": 4.345555100963198e-05, + "loss": 0.4011, "step": 35890 }, { - "epoch": 1.26, - "learning_rate": 4.378466621370061e-05, - "loss": 0.2957, + "epoch": 1.2936533679316682, + "grad_norm": 0.16157175600528717, + "learning_rate": 4.3453582434107934e-05, + "loss": 0.4164, "step": 35895 }, { - "epoch": 1.26, - "learning_rate": 4.378278632827889e-05, - "loss": 0.2936, + "epoch": 1.293833567592893, + "grad_norm": 0.17810019850730896, + "learning_rate": 4.3451613607157416e-05, + "loss": 0.394, "step": 35900 }, { - "epoch": 1.26, - "learning_rate": 4.378090619897508e-05, - "loss": 0.2891, + "epoch": 1.2940137672541177, + "grad_norm": 0.1655871868133545, + "learning_rate": 4.344964452880723e-05, + "loss": 0.3975, "step": 35905 }, { - "epoch": 1.26, - "learning_rate": 4.377902582581357e-05, - "loss": 0.267, + "epoch": 1.2941939669153422, + "grad_norm": 0.24324651062488556, + "learning_rate": 4.3447675199084204e-05, + "loss": 0.4328, "step": 35910 }, { - "epoch": 1.26, - "learning_rate": 4.377714520881879e-05, - "loss": 0.3045, + "epoch": 1.294374166576567, + "grad_norm": 0.15802602469921112, + "learning_rate": 4.344570561801518e-05, + "loss": 0.4295, "step": 35915 }, { - "epoch": 1.26, - "learning_rate": 4.377526434801515e-05, - "loss": 0.3042, + "epoch": 1.2945543662377914, + "grad_norm": 0.1700577437877655, + "learning_rate": 4.344373578562698e-05, + "loss": 0.3671, "step": 35920 }, { - "epoch": 1.26, - "learning_rate": 4.377338324342708e-05, - "loss": 0.3083, + "epoch": 1.2947345658990161, + "grad_norm": 0.1449105143547058, + "learning_rate": 4.3441765701946455e-05, + "loss": 0.4034, "step": 35925 }, { - "epoch": 1.26, - "learning_rate": 4.377150189507899e-05, - "loss": 0.292, + "epoch": 1.2949147655602409, + "grad_norm": 0.18655553460121155, + "learning_rate": 4.343979536700045e-05, + "loss": 0.3888, "step": 35930 }, { - "epoch": 1.26, - "learning_rate": 4.376962030299533e-05, - "loss": 0.311, + "epoch": 1.2950949652214654, + "grad_norm": 0.1947435736656189, + "learning_rate": 4.34378247808158e-05, + "loss": 0.4335, "step": 35935 }, { - "epoch": 1.26, - "learning_rate": 4.376773846720051e-05, - "loss": 0.2774, + "epoch": 1.29527516488269, + 
"grad_norm": 0.1621570587158203, + "learning_rate": 4.343585394341936e-05, + "loss": 0.4024, "step": 35940 }, { - "epoch": 1.26, - "learning_rate": 4.3765856387718984e-05, - "loss": 0.2974, + "epoch": 1.2954553645439146, + "grad_norm": 0.15728051960468292, + "learning_rate": 4.343388285483797e-05, + "loss": 0.3716, "step": 35945 }, { - "epoch": 1.26, - "learning_rate": 4.3763974064575175e-05, - "loss": 0.2974, + "epoch": 1.2956355642051394, + "grad_norm": 0.17615413665771484, + "learning_rate": 4.34319115150985e-05, + "loss": 0.4256, "step": 35950 }, { - "epoch": 1.26, - "learning_rate": 4.376209149779353e-05, - "loss": 0.2679, + "epoch": 1.2958157638663639, + "grad_norm": 0.17523658275604248, + "learning_rate": 4.3429939924227806e-05, + "loss": 0.4552, "step": 35955 }, { - "epoch": 1.27, - "learning_rate": 4.37602086873985e-05, - "loss": 0.294, + "epoch": 1.2959959635275886, + "grad_norm": 0.15205228328704834, + "learning_rate": 4.3427968082252744e-05, + "loss": 0.4318, "step": 35960 }, { - "epoch": 1.27, - "learning_rate": 4.375832563341451e-05, - "loss": 0.3024, + "epoch": 1.2961761631888131, + "grad_norm": 0.18132284283638, + "learning_rate": 4.3425995989200184e-05, + "loss": 0.4419, "step": 35965 }, { - "epoch": 1.27, - "learning_rate": 4.3756442335866034e-05, - "loss": 0.267, + "epoch": 1.2963563628500379, + "grad_norm": 0.16100190579891205, + "learning_rate": 4.3424023645097e-05, + "loss": 0.4099, "step": 35970 }, { - "epoch": 1.27, - "learning_rate": 4.3754558794777514e-05, - "loss": 0.291, + "epoch": 1.2965365625112626, + "grad_norm": 0.22151976823806763, + "learning_rate": 4.342205104997006e-05, + "loss": 0.456, "step": 35975 }, { - "epoch": 1.27, - "learning_rate": 4.37526750101734e-05, - "loss": 0.2738, + "epoch": 1.296716762172487, + "grad_norm": 0.19415588676929474, + "learning_rate": 4.3420078203846245e-05, + "loss": 0.4543, "step": 35980 }, { - "epoch": 1.27, - "learning_rate": 4.375079098207816e-05, - "loss": 0.2847, + "epoch": 1.2968969618337116, + "grad_norm": 0.18258939683437347, + "learning_rate": 4.341810510675243e-05, + "loss": 0.429, "step": 35985 }, { - "epoch": 1.27, - "learning_rate": 4.374890671051626e-05, - "loss": 0.2949, + "epoch": 1.2970771614949363, + "grad_norm": 0.20076502859592438, + "learning_rate": 4.3416131758715496e-05, + "loss": 0.4105, "step": 35990 }, { - "epoch": 1.27, - "learning_rate": 4.3747022195512165e-05, - "loss": 0.3078, + "epoch": 1.297257361156161, + "grad_norm": 0.1802350878715515, + "learning_rate": 4.3414158159762334e-05, + "loss": 0.4423, "step": 35995 }, { - "epoch": 1.27, - "learning_rate": 4.374513743709034e-05, - "loss": 0.2833, + "epoch": 1.2974375608173856, + "grad_norm": 0.2174658179283142, + "learning_rate": 4.341218430991982e-05, + "loss": 0.4242, "step": 36000 }, { - "epoch": 1.27, - "eval_loss": 0.2862606644630432, - "eval_runtime": 10.5286, - "eval_samples_per_second": 9.498, - "eval_steps_per_second": 9.498, + "epoch": 1.2974375608173856, + "eval_loss": 0.4492260217666626, + "eval_runtime": 3.5485, + "eval_samples_per_second": 28.181, + "eval_steps_per_second": 7.045, "step": 36000 }, { - "epoch": 1.27, - "learning_rate": 4.374325243527526e-05, - "loss": 0.2925, + "epoch": 1.2976177604786103, + "grad_norm": 0.1733621507883072, + "learning_rate": 4.3410210209214875e-05, + "loss": 0.4091, "step": 36005 }, { - "epoch": 1.27, - "learning_rate": 4.3741367190091396e-05, - "loss": 0.2695, + "epoch": 1.2977979601398348, + "grad_norm": 0.20660637319087982, + "learning_rate": 4.3408235857674376e-05, + "loss": 0.4355, "step": 
36010 }, { - "epoch": 1.27, - "learning_rate": 4.3739481701563236e-05, - "loss": 0.2741, + "epoch": 1.2979781598010596, + "grad_norm": 0.16919872164726257, + "learning_rate": 4.340626125532522e-05, + "loss": 0.4069, "step": 36015 }, { - "epoch": 1.27, - "learning_rate": 4.3737595969715254e-05, - "loss": 0.3076, + "epoch": 1.2981583594622843, + "grad_norm": 0.19883739948272705, + "learning_rate": 4.3404286402194326e-05, + "loss": 0.4177, "step": 36020 }, { - "epoch": 1.27, - "learning_rate": 4.3735709994571924e-05, - "loss": 0.2953, + "epoch": 1.2983385591235088, + "grad_norm": 0.13305337727069855, + "learning_rate": 4.340231129830859e-05, + "loss": 0.4168, "step": 36025 }, { - "epoch": 1.27, - "learning_rate": 4.3733823776157764e-05, - "loss": 0.2976, + "epoch": 1.2985187587847336, + "grad_norm": 0.19353942573070526, + "learning_rate": 4.3400335943694925e-05, + "loss": 0.3978, "step": 36030 }, { - "epoch": 1.27, - "learning_rate": 4.373193731449724e-05, - "loss": 0.3005, + "epoch": 1.298698958445958, + "grad_norm": 0.2054118812084198, + "learning_rate": 4.339836033838025e-05, + "loss": 0.4347, "step": 36035 }, { - "epoch": 1.27, - "learning_rate": 4.373005060961486e-05, - "loss": 0.3054, + "epoch": 1.2988791581071828, + "grad_norm": 0.16488704085350037, + "learning_rate": 4.339638448239147e-05, + "loss": 0.3811, "step": 36040 }, { - "epoch": 1.27, - "learning_rate": 4.372816366153511e-05, - "loss": 0.288, + "epoch": 1.2990593577684075, + "grad_norm": 0.1690049171447754, + "learning_rate": 4.3394408375755526e-05, + "loss": 0.4393, "step": 36045 }, { - "epoch": 1.27, - "learning_rate": 4.37262764702825e-05, - "loss": 0.2644, + "epoch": 1.299239557429632, + "grad_norm": 0.21596601605415344, + "learning_rate": 4.339243201849932e-05, + "loss": 0.4089, "step": 36050 }, { - "epoch": 1.27, - "learning_rate": 4.372438903588153e-05, - "loss": 0.3041, + "epoch": 1.2994197570908566, + "grad_norm": 0.17720189690589905, + "learning_rate": 4.339045541064978e-05, + "loss": 0.4117, "step": 36055 }, { - "epoch": 1.27, - "learning_rate": 4.3722501358356705e-05, - "loss": 0.2938, + "epoch": 1.2995999567520813, + "grad_norm": 0.16456130146980286, + "learning_rate": 4.3388478552233856e-05, + "loss": 0.3757, "step": 36060 }, { - "epoch": 1.27, - "learning_rate": 4.372061343773255e-05, - "loss": 0.2867, + "epoch": 1.299780156413306, + "grad_norm": 0.1337253451347351, + "learning_rate": 4.338650144327847e-05, + "loss": 0.4465, "step": 36065 }, { - "epoch": 1.27, - "learning_rate": 4.3718725274033556e-05, - "loss": 0.2751, + "epoch": 1.2999603560745305, + "grad_norm": 0.17448823153972626, + "learning_rate": 4.338452408381056e-05, + "loss": 0.4445, "step": 36070 }, { - "epoch": 1.27, - "learning_rate": 4.371683686728426e-05, - "loss": 0.3224, + "epoch": 1.3001405557357553, + "grad_norm": 0.1743369847536087, + "learning_rate": 4.338254647385708e-05, + "loss": 0.4016, "step": 36075 }, { - "epoch": 1.27, - "learning_rate": 4.371494821750916e-05, - "loss": 0.2743, + "epoch": 1.3003207553969798, + "grad_norm": 0.2010771483182907, + "learning_rate": 4.338056861344495e-05, + "loss": 0.4215, "step": 36080 }, { - "epoch": 1.27, - "learning_rate": 4.3713059324732796e-05, - "loss": 0.3075, + "epoch": 1.3005009550582045, + "grad_norm": 0.21414510905742645, + "learning_rate": 4.337859050260113e-05, + "loss": 0.4478, "step": 36085 }, { - "epoch": 1.27, - "learning_rate": 4.371117018897969e-05, - "loss": 0.2818, + "epoch": 1.3006811547194292, + "grad_norm": 0.20338131487369537, + "learning_rate": 4.337661214135258e-05, + "loss": 
0.4265, "step": 36090 }, { - "epoch": 1.27, - "learning_rate": 4.370928081027437e-05, - "loss": 0.2828, + "epoch": 1.3008613543806538, + "grad_norm": 0.17728644609451294, + "learning_rate": 4.3374633529726247e-05, + "loss": 0.4039, "step": 36095 }, { - "epoch": 1.27, - "learning_rate": 4.370739118864137e-05, - "loss": 0.3118, + "epoch": 1.3010415540418783, + "grad_norm": 0.13623587787151337, + "learning_rate": 4.3372654667749086e-05, + "loss": 0.3892, "step": 36100 }, { - "epoch": 1.27, - "learning_rate": 4.3705501324105224e-05, - "loss": 0.2779, + "epoch": 1.301221753703103, + "grad_norm": 0.20998680591583252, + "learning_rate": 4.337067555544806e-05, + "loss": 0.4134, "step": 36105 }, { - "epoch": 1.27, - "learning_rate": 4.370361121669047e-05, - "loss": 0.2905, + "epoch": 1.3014019533643277, + "grad_norm": 0.1900775134563446, + "learning_rate": 4.336869619285014e-05, + "loss": 0.4382, "step": 36110 }, { - "epoch": 1.27, - "learning_rate": 4.370172086642165e-05, - "loss": 0.3131, + "epoch": 1.3015821530255522, + "grad_norm": 0.17663796246051788, + "learning_rate": 4.33667165799823e-05, + "loss": 0.4058, "step": 36115 }, { - "epoch": 1.27, - "learning_rate": 4.369983027332331e-05, - "loss": 0.2999, + "epoch": 1.301762352686777, + "grad_norm": 0.17739155888557434, + "learning_rate": 4.336473671687149e-05, + "loss": 0.3889, "step": 36120 }, { - "epoch": 1.27, - "learning_rate": 4.369793943742e-05, - "loss": 0.3009, + "epoch": 1.3019425523480015, + "grad_norm": 0.204684317111969, + "learning_rate": 4.33627566035447e-05, + "loss": 0.435, "step": 36125 }, { - "epoch": 1.27, - "learning_rate": 4.369604835873626e-05, - "loss": 0.281, + "epoch": 1.3021227520092262, + "grad_norm": 0.17078737914562225, + "learning_rate": 4.336077624002891e-05, + "loss": 0.3845, "step": 36130 }, { - "epoch": 1.27, - "learning_rate": 4.3694157037296665e-05, - "loss": 0.2928, + "epoch": 1.302302951670451, + "grad_norm": 0.20431089401245117, + "learning_rate": 4.33587956263511e-05, + "loss": 0.3966, "step": 36135 }, { - "epoch": 1.27, - "learning_rate": 4.3692265473125756e-05, - "loss": 0.3069, + "epoch": 1.3024831513316755, + "grad_norm": 0.18561053276062012, + "learning_rate": 4.335681476253824e-05, + "loss": 0.4092, "step": 36140 }, { - "epoch": 1.27, - "learning_rate": 4.3690373666248094e-05, - "loss": 0.3013, + "epoch": 1.3026633509929002, + "grad_norm": 0.1851317286491394, + "learning_rate": 4.335483364861734e-05, + "loss": 0.4035, "step": 36145 }, { - "epoch": 1.27, - "learning_rate": 4.3688481616688255e-05, - "loss": 0.2989, + "epoch": 1.3028435506541247, + "grad_norm": 0.18310926854610443, + "learning_rate": 4.3352852284615395e-05, + "loss": 0.4261, "step": 36150 }, { - "epoch": 1.27, - "learning_rate": 4.3686589324470796e-05, - "loss": 0.2663, + "epoch": 1.3030237503153494, + "grad_norm": 0.19984884560108185, + "learning_rate": 4.335087067055938e-05, + "loss": 0.4395, "step": 36155 }, { - "epoch": 1.27, - "learning_rate": 4.368469678962028e-05, - "loss": 0.2916, + "epoch": 1.303203949976574, + "grad_norm": 0.17451319098472595, + "learning_rate": 4.334888880647631e-05, + "loss": 0.4466, "step": 36160 }, { - "epoch": 1.27, - "learning_rate": 4.3682804012161305e-05, - "loss": 0.2982, + "epoch": 1.3033841496377987, + "grad_norm": 0.18329133093357086, + "learning_rate": 4.3346906692393184e-05, + "loss": 0.4218, "step": 36165 }, { - "epoch": 1.27, - "learning_rate": 4.368091099211843e-05, - "loss": 0.3024, + "epoch": 1.3035643492990232, + "grad_norm": 0.19069811701774597, + "learning_rate": 4.3344924328337e-05, + 
"loss": 0.4008, "step": 36170 }, { - "epoch": 1.27, - "learning_rate": 4.3679017729516226e-05, - "loss": 0.2766, + "epoch": 1.303744548960248, + "grad_norm": 0.18040628731250763, + "learning_rate": 4.334294171433478e-05, + "loss": 0.3912, "step": 36175 }, { - "epoch": 1.27, - "learning_rate": 4.367712422437929e-05, - "loss": 0.3051, + "epoch": 1.3039247486214727, + "grad_norm": 0.20285740494728088, + "learning_rate": 4.3340958850413526e-05, + "loss": 0.4664, "step": 36180 }, { - "epoch": 1.27, - "learning_rate": 4.3675230476732206e-05, - "loss": 0.2863, + "epoch": 1.3041049482826972, + "grad_norm": 0.2134329378604889, + "learning_rate": 4.3338975736600266e-05, + "loss": 0.4236, "step": 36185 }, { - "epoch": 1.27, - "learning_rate": 4.367333648659956e-05, - "loss": 0.3048, + "epoch": 1.304285147943922, + "grad_norm": 0.18839861452579498, + "learning_rate": 4.3336992372922e-05, + "loss": 0.4059, "step": 36190 }, { - "epoch": 1.27, - "learning_rate": 4.3671442254005955e-05, - "loss": 0.2668, + "epoch": 1.3044653476051464, + "grad_norm": 0.22417490184307098, + "learning_rate": 4.333500875940577e-05, + "loss": 0.4422, "step": 36195 }, { - "epoch": 1.27, - "learning_rate": 4.366954777897597e-05, - "loss": 0.2889, + "epoch": 1.3046455472663712, + "grad_norm": 0.1869698464870453, + "learning_rate": 4.33330248960786e-05, + "loss": 0.4479, "step": 36200 }, { - "epoch": 1.27, - "learning_rate": 4.366765306153421e-05, - "loss": 0.2853, + "epoch": 1.304825746927596, + "grad_norm": 0.14332622289657593, + "learning_rate": 4.33310407829675e-05, + "loss": 0.3966, "step": 36205 }, { - "epoch": 1.27, - "learning_rate": 4.366575810170528e-05, - "loss": 0.2799, + "epoch": 1.3050059465888204, + "grad_norm": 0.23910625278949738, + "learning_rate": 4.3329056420099534e-05, + "loss": 0.409, "step": 36210 }, { - "epoch": 1.27, - "learning_rate": 4.3663862899513784e-05, - "loss": 0.312, + "epoch": 1.305186146250045, + "grad_norm": 0.178507000207901, + "learning_rate": 4.332707180750172e-05, + "loss": 0.4216, "step": 36215 }, { - "epoch": 1.27, - "learning_rate": 4.366196745498432e-05, - "loss": 0.2744, + "epoch": 1.3053663459112697, + "grad_norm": 0.1948593556880951, + "learning_rate": 4.3325086945201096e-05, + "loss": 0.3968, "step": 36220 }, { - "epoch": 1.27, - "learning_rate": 4.366007176814152e-05, - "loss": 0.3083, + "epoch": 1.3055465455724944, + "grad_norm": 0.2139938473701477, + "learning_rate": 4.332310183322471e-05, + "loss": 0.4525, "step": 36225 }, { - "epoch": 1.27, - "learning_rate": 4.365817583900998e-05, - "loss": 0.3109, + "epoch": 1.305726745233719, + "grad_norm": 0.1733957827091217, + "learning_rate": 4.332111647159962e-05, + "loss": 0.4319, "step": 36230 }, { - "epoch": 1.27, - "learning_rate": 4.365627966761432e-05, - "loss": 0.2955, + "epoch": 1.3059069448949436, + "grad_norm": 0.18427705764770508, + "learning_rate": 4.331913086035285e-05, + "loss": 0.4355, "step": 36235 }, { - "epoch": 1.28, - "learning_rate": 4.3654383253979163e-05, - "loss": 0.2991, + "epoch": 1.3060871445561681, + "grad_norm": 0.17030684649944305, + "learning_rate": 4.3317144999511474e-05, + "loss": 0.4461, "step": 36240 }, { - "epoch": 1.28, - "learning_rate": 4.365248659812914e-05, - "loss": 0.2743, + "epoch": 1.3062673442173929, + "grad_norm": 0.17609108984470367, + "learning_rate": 4.3315158889102546e-05, + "loss": 0.4154, "step": 36245 }, { - "epoch": 1.28, - "learning_rate": 4.365058970008886e-05, - "loss": 0.2983, + "epoch": 1.3064475438786176, + "grad_norm": 0.22373346984386444, + "learning_rate": 
4.3313172529153124e-05, + "loss": 0.394, "step": 36250 }, { - "epoch": 1.28, - "learning_rate": 4.364869255988298e-05, - "loss": 0.2837, + "epoch": 1.3066277435398421, + "grad_norm": 0.18238912522792816, + "learning_rate": 4.331118591969027e-05, + "loss": 0.4212, "step": 36255 }, { - "epoch": 1.28, - "learning_rate": 4.36467951775361e-05, - "loss": 0.2954, + "epoch": 1.3068079432010669, + "grad_norm": 0.17490154504776, + "learning_rate": 4.330919906074106e-05, + "loss": 0.3987, "step": 36260 }, { - "epoch": 1.28, - "learning_rate": 4.3644897553072885e-05, - "loss": 0.3065, + "epoch": 1.3069881428622914, + "grad_norm": 0.17882512509822845, + "learning_rate": 4.330721195233255e-05, + "loss": 0.4248, "step": 36265 }, { - "epoch": 1.28, - "learning_rate": 4.364299968651795e-05, - "loss": 0.2985, + "epoch": 1.307168342523516, + "grad_norm": 0.21549130976200104, + "learning_rate": 4.330522459449182e-05, + "loss": 0.4354, "step": 36270 }, { - "epoch": 1.28, - "learning_rate": 4.364110157789596e-05, - "loss": 0.2885, + "epoch": 1.3073485421847406, + "grad_norm": 0.19337189197540283, + "learning_rate": 4.330323698724596e-05, + "loss": 0.4282, "step": 36275 }, { - "epoch": 1.28, - "learning_rate": 4.363920322723155e-05, - "loss": 0.2916, + "epoch": 1.3075287418459653, + "grad_norm": 0.16519051790237427, + "learning_rate": 4.330124913062203e-05, + "loss": 0.4204, "step": 36280 }, { - "epoch": 1.28, - "learning_rate": 4.363730463454937e-05, - "loss": 0.301, + "epoch": 1.3077089415071899, + "grad_norm": 0.18474717438220978, + "learning_rate": 4.329926102464712e-05, + "loss": 0.4295, "step": 36285 }, { - "epoch": 1.28, - "learning_rate": 4.3635405799874076e-05, - "loss": 0.2864, + "epoch": 1.3078891411684146, + "grad_norm": 0.21671119332313538, + "learning_rate": 4.3297272669348325e-05, + "loss": 0.4407, "step": 36290 }, { - "epoch": 1.28, - "learning_rate": 4.363350672323031e-05, - "loss": 0.303, + "epoch": 1.3080693408296393, + "grad_norm": Infinity, + "learning_rate": 4.32956818056143e-05, + "loss": 0.4212, "step": 36295 }, { - "epoch": 1.28, - "learning_rate": 4.3631607404642747e-05, - "loss": 0.2987, + "epoch": 1.3082495404908638, + "grad_norm": 0.16523273289203644, + "learning_rate": 4.329369300160078e-05, + "loss": 0.4094, "step": 36300 }, { - "epoch": 1.28, - "learning_rate": 4.362970784413604e-05, - "loss": 0.2927, + "epoch": 1.3084297401520886, + "grad_norm": 0.18672731518745422, + "learning_rate": 4.329170394833923e-05, + "loss": 0.4582, "step": 36305 }, { - "epoch": 1.28, - "learning_rate": 4.362780804173484e-05, - "loss": 0.2928, + "epoch": 1.308609939813313, + "grad_norm": 0.19107048213481903, + "learning_rate": 4.328971464585676e-05, + "loss": 0.3931, "step": 36310 }, { - "epoch": 1.28, - "learning_rate": 4.362590799746384e-05, - "loss": 0.2916, + "epoch": 1.3087901394745378, + "grad_norm": 0.17582692205905914, + "learning_rate": 4.3287725094180466e-05, + "loss": 0.3489, "step": 36315 }, { - "epoch": 1.28, - "learning_rate": 4.36240077113477e-05, - "loss": 0.2699, + "epoch": 1.3089703391357626, + "grad_norm": 0.20227184891700745, + "learning_rate": 4.328573529333746e-05, + "loss": 0.4136, "step": 36320 }, { - "epoch": 1.28, - "learning_rate": 4.3622107183411096e-05, - "loss": 0.2861, + "epoch": 1.309150538796987, + "grad_norm": 0.17152424156665802, + "learning_rate": 4.328374524335485e-05, + "loss": 0.4413, "step": 36325 }, { - "epoch": 1.28, - "learning_rate": 4.36202064136787e-05, - "loss": 0.2675, + "epoch": 1.3093307384582116, + "grad_norm": 0.20434246957302094, + "learning_rate": 
4.328175494425975e-05, + "loss": 0.4357, "step": 36330 }, { - "epoch": 1.28, - "learning_rate": 4.3618305402175196e-05, - "loss": 0.2824, + "epoch": 1.3095109381194363, + "grad_norm": 0.18232357501983643, + "learning_rate": 4.327976439607928e-05, + "loss": 0.4028, "step": 36335 }, { - "epoch": 1.28, - "learning_rate": 4.361640414892526e-05, - "loss": 0.2912, + "epoch": 1.309691137780661, + "grad_norm": 0.1636376529932022, + "learning_rate": 4.327777359884056e-05, + "loss": 0.3889, "step": 36340 }, { - "epoch": 1.28, - "learning_rate": 4.3614502653953596e-05, - "loss": 0.2873, + "epoch": 1.3098713374418856, + "grad_norm": 0.21091686189174652, + "learning_rate": 4.327578255257071e-05, + "loss": 0.4193, "step": 36345 }, { - "epoch": 1.28, - "learning_rate": 4.3612600917284874e-05, - "loss": 0.2911, + "epoch": 1.3100515371031103, + "grad_norm": 0.2029031366109848, + "learning_rate": 4.327379125729687e-05, + "loss": 0.4471, "step": 36350 }, { - "epoch": 1.28, - "learning_rate": 4.3610698938943795e-05, - "loss": 0.2893, + "epoch": 1.3102317367643348, + "grad_norm": 0.17957919836044312, + "learning_rate": 4.327179971304615e-05, + "loss": 0.384, "step": 36355 }, { - "epoch": 1.28, - "learning_rate": 4.360879671895506e-05, - "loss": 0.3058, + "epoch": 1.3104119364255595, + "grad_norm": 0.2107733190059662, + "learning_rate": 4.3269807919845705e-05, + "loss": 0.3978, "step": 36360 }, { - "epoch": 1.28, - "learning_rate": 4.3606894257343356e-05, - "loss": 0.2766, + "epoch": 1.3105921360867843, + "grad_norm": 0.16910696029663086, + "learning_rate": 4.326781587772266e-05, + "loss": 0.4386, "step": 36365 }, { - "epoch": 1.28, - "learning_rate": 4.36049915541334e-05, - "loss": 0.2626, + "epoch": 1.3107723357480088, + "grad_norm": 0.18517418205738068, + "learning_rate": 4.326582358670416e-05, + "loss": 0.3967, "step": 36370 }, { - "epoch": 1.28, - "learning_rate": 4.360308860934989e-05, - "loss": 0.2879, + "epoch": 1.3109525354092335, + "grad_norm": 0.21739526093006134, + "learning_rate": 4.326383104681735e-05, + "loss": 0.4325, "step": 36375 }, { - "epoch": 1.28, - "learning_rate": 4.360118542301753e-05, - "loss": 0.2785, + "epoch": 1.311132735070458, + "grad_norm": 0.18001195788383484, + "learning_rate": 4.3261838258089384e-05, + "loss": 0.4216, "step": 36380 }, { - "epoch": 1.28, - "learning_rate": 4.3599281995161035e-05, - "loss": 0.2919, + "epoch": 1.3113129347316828, + "grad_norm": 0.2097054123878479, + "learning_rate": 4.32598452205474e-05, + "loss": 0.4309, "step": 36385 }, { - "epoch": 1.28, - "learning_rate": 4.359737832580512e-05, - "loss": 0.2894, + "epoch": 1.3114931343929073, + "grad_norm": 0.15929162502288818, + "learning_rate": 4.325785193421856e-05, + "loss": 0.4344, "step": 36390 }, { - "epoch": 1.28, - "learning_rate": 4.359547441497451e-05, - "loss": 0.2673, + "epoch": 1.311673334054132, + "grad_norm": 0.17544053494930267, + "learning_rate": 4.325585839913003e-05, + "loss": 0.4344, "step": 36395 }, { - "epoch": 1.28, - "learning_rate": 4.359357026269392e-05, - "loss": 0.2778, + "epoch": 1.3118535337153565, + "grad_norm": 0.2082267850637436, + "learning_rate": 4.3253864615308956e-05, + "loss": 0.4096, "step": 36400 }, { - "epoch": 1.28, - "learning_rate": 4.3591665868988064e-05, - "loss": 0.3002, + "epoch": 1.3120337333765812, + "grad_norm": 0.1683303415775299, + "learning_rate": 4.3251870582782516e-05, + "loss": 0.4234, "step": 36405 }, { - "epoch": 1.28, - "learning_rate": 4.358976123388168e-05, - "loss": 0.3183, + "epoch": 1.312213933037806, + "grad_norm": 0.15545429289340973, + 
"learning_rate": 4.3249876301577877e-05, + "loss": 0.3982, "step": 36410 }, { - "epoch": 1.28, - "learning_rate": 4.3587856357399504e-05, - "loss": 0.276, + "epoch": 1.3123941326990305, + "grad_norm": 0.22306688129901886, + "learning_rate": 4.3247881771722195e-05, + "loss": 0.4617, "step": 36415 }, { - "epoch": 1.28, - "learning_rate": 4.358595123956626e-05, - "loss": 0.3022, + "epoch": 1.3125743323602552, + "grad_norm": 0.19142894446849823, + "learning_rate": 4.3245886993242666e-05, + "loss": 0.4333, "step": 36420 }, { - "epoch": 1.28, - "learning_rate": 4.358404588040669e-05, - "loss": 0.2946, + "epoch": 1.3127545320214797, + "grad_norm": 0.14523069560527802, + "learning_rate": 4.324389196616645e-05, + "loss": 0.394, "step": 36425 }, { - "epoch": 1.28, - "learning_rate": 4.358214027994553e-05, - "loss": 0.2858, + "epoch": 1.3129347316827045, + "grad_norm": 0.1527920961380005, + "learning_rate": 4.3241896690520746e-05, + "loss": 0.448, "step": 36430 }, { - "epoch": 1.28, - "learning_rate": 4.358023443820752e-05, - "loss": 0.3093, + "epoch": 1.3131149313439292, + "grad_norm": 0.1837022453546524, + "learning_rate": 4.323990116633273e-05, + "loss": 0.3912, "step": 36435 }, { - "epoch": 1.28, - "learning_rate": 4.357832835521742e-05, - "loss": 0.2731, + "epoch": 1.3132951310051537, + "grad_norm": 0.192708358168602, + "learning_rate": 4.323790539362958e-05, + "loss": 0.4349, "step": 36440 }, { - "epoch": 1.28, - "learning_rate": 4.357642203099996e-05, - "loss": 0.2918, + "epoch": 1.3134753306663782, + "grad_norm": 0.16803932189941406, + "learning_rate": 4.323590937243852e-05, + "loss": 0.4473, "step": 36445 }, { - "epoch": 1.28, - "learning_rate": 4.3574515465579915e-05, - "loss": 0.3219, + "epoch": 1.313655530327603, + "grad_norm": 0.18764148652553558, + "learning_rate": 4.323391310278672e-05, + "loss": 0.4169, "step": 36450 }, { - "epoch": 1.28, - "learning_rate": 4.3572608658982016e-05, - "loss": 0.2716, + "epoch": 1.3138357299888277, + "grad_norm": 0.14096736907958984, + "learning_rate": 4.3231916584701374e-05, + "loss": 0.389, "step": 36455 }, { - "epoch": 1.28, - "learning_rate": 4.3570701611231045e-05, - "loss": 0.2929, + "epoch": 1.3140159296500522, + "grad_norm": 0.16323328018188477, + "learning_rate": 4.32299198182097e-05, + "loss": 0.3844, "step": 36460 }, { - "epoch": 1.28, - "learning_rate": 4.356879432235175e-05, - "loss": 0.3035, + "epoch": 1.314196129311277, + "grad_norm": 0.2195003479719162, + "learning_rate": 4.32279228033389e-05, + "loss": 0.4448, "step": 36465 }, { - "epoch": 1.28, - "learning_rate": 4.356688679236889e-05, - "loss": 0.2904, + "epoch": 1.3143763289725015, + "grad_norm": 0.1823636144399643, + "learning_rate": 4.3225925540116174e-05, + "loss": 0.3907, "step": 36470 }, { - "epoch": 1.28, - "learning_rate": 4.356497902130724e-05, - "loss": 0.3162, + "epoch": 1.3145565286337262, + "grad_norm": 0.1974102109670639, + "learning_rate": 4.322392802856875e-05, + "loss": 0.4337, "step": 36475 }, { - "epoch": 1.28, - "learning_rate": 4.356307100919158e-05, - "loss": 0.3035, + "epoch": 1.314736728294951, + "grad_norm": 0.14405225217342377, + "learning_rate": 4.3221930268723834e-05, + "loss": 0.3845, "step": 36480 }, { - "epoch": 1.28, - "learning_rate": 4.356116275604667e-05, - "loss": 0.3243, + "epoch": 1.3149169279561754, + "grad_norm": 0.15730611979961395, + "learning_rate": 4.321993226060864e-05, + "loss": 0.4251, "step": 36485 }, { - "epoch": 1.28, - "learning_rate": 4.35592542618973e-05, - "loss": 0.3113, + "epoch": 1.3150971276174, + "grad_norm": 
0.17731639742851257, + "learning_rate": 4.3217934004250396e-05, + "loss": 0.4128, "step": 36490 }, { - "epoch": 1.28, - "learning_rate": 4.355734552676823e-05, - "loss": 0.2986, + "epoch": 1.3152773272786247, + "grad_norm": 0.17964981496334076, + "learning_rate": 4.321593549967634e-05, + "loss": 0.4303, "step": 36495 }, { - "epoch": 1.28, - "learning_rate": 4.355543655068428e-05, - "loss": 0.2644, + "epoch": 1.3154575269398494, + "grad_norm": 0.17006689310073853, + "learning_rate": 4.3213936746913675e-05, + "loss": 0.418, "step": 36500 }, { - "epoch": 1.28, - "eval_loss": 0.28553855419158936, - "eval_runtime": 10.5423, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 1.3154575269398494, + "eval_loss": 0.4494481384754181, + "eval_runtime": 3.5496, + "eval_samples_per_second": 28.172, + "eval_steps_per_second": 7.043, "step": 36500 }, { - "epoch": 1.28, - "learning_rate": 4.35535273336702e-05, - "loss": 0.2938, + "epoch": 1.315637726601074, + "grad_norm": 0.18565762042999268, + "learning_rate": 4.321193774598966e-05, + "loss": 0.4313, "step": 36505 }, { - "epoch": 1.28, - "learning_rate": 4.35516178757508e-05, - "loss": 0.2965, + "epoch": 1.3158179262622987, + "grad_norm": 0.18812930583953857, + "learning_rate": 4.3209938496931514e-05, + "loss": 0.4287, "step": 36510 }, { - "epoch": 1.28, - "learning_rate": 4.3549708176950864e-05, - "loss": 0.2701, + "epoch": 1.3159981259235232, + "grad_norm": 0.21864257752895355, + "learning_rate": 4.320793899976648e-05, + "loss": 0.4444, "step": 36515 }, { - "epoch": 1.28, - "learning_rate": 4.35477982372952e-05, - "loss": 0.2727, + "epoch": 1.316178325584748, + "grad_norm": 0.18846680223941803, + "learning_rate": 4.32059392545218e-05, + "loss": 0.4351, "step": 36520 }, { - "epoch": 1.29, - "learning_rate": 4.3545888056808595e-05, - "loss": 0.3053, + "epoch": 1.3163585252459726, + "grad_norm": 0.2109052538871765, + "learning_rate": 4.320393926122473e-05, + "loss": 0.3791, "step": 36525 }, { - "epoch": 1.29, - "learning_rate": 4.3543977635515856e-05, - "loss": 0.2859, + "epoch": 1.3165387249071971, + "grad_norm": 0.16090500354766846, + "learning_rate": 4.320193901990251e-05, + "loss": 0.4113, "step": 36530 }, { - "epoch": 1.29, - "learning_rate": 4.3542066973441784e-05, - "loss": 0.2769, + "epoch": 1.3167189245684219, + "grad_norm": 0.15768031775951385, + "learning_rate": 4.31999385305824e-05, + "loss": 0.3952, "step": 36535 }, { - "epoch": 1.29, - "learning_rate": 4.3540156070611194e-05, - "loss": 0.2967, + "epoch": 1.3168991242296464, + "grad_norm": 0.18743303418159485, + "learning_rate": 4.319793779329163e-05, + "loss": 0.4256, "step": 36540 }, { - "epoch": 1.29, - "learning_rate": 4.35382449270489e-05, - "loss": 0.2793, + "epoch": 1.3170793238908711, + "grad_norm": 0.2007930725812912, + "learning_rate": 4.31959368080575e-05, + "loss": 0.4352, "step": 36545 }, { - "epoch": 1.29, - "learning_rate": 4.3536333542779704e-05, - "loss": 0.3308, + "epoch": 1.3172595235520959, + "grad_norm": 0.21530933678150177, + "learning_rate": 4.319393557490725e-05, + "loss": 0.4444, "step": 36550 }, { - "epoch": 1.29, - "learning_rate": 4.353442191782844e-05, - "loss": 0.2875, + "epoch": 1.3174397232133204, + "grad_norm": 0.173873171210289, + "learning_rate": 4.3191934093868146e-05, + "loss": 0.4345, "step": 36555 }, { - "epoch": 1.29, - "learning_rate": 4.3532510052219924e-05, - "loss": 0.2916, + "epoch": 1.3176199228745449, + "grad_norm": 0.17242655158042908, + "learning_rate": 4.318993236496747e-05, + "loss": 0.3883, "step": 36560 }, { - 
"epoch": 1.29, - "learning_rate": 4.353059794597898e-05, - "loss": 0.3129, + "epoch": 1.3178001225357696, + "grad_norm": 0.14753524959087372, + "learning_rate": 4.318793038823248e-05, + "loss": 0.4319, "step": 36565 }, { - "epoch": 1.29, - "learning_rate": 4.3528685599130427e-05, - "loss": 0.2945, + "epoch": 1.3179803221969943, + "grad_norm": 0.18761220574378967, + "learning_rate": 4.318592816369046e-05, + "loss": 0.4511, "step": 36570 }, { - "epoch": 1.29, - "learning_rate": 4.35267730116991e-05, - "loss": 0.3076, + "epoch": 1.3181605218582189, + "grad_norm": 0.15937355160713196, + "learning_rate": 4.3183925691368695e-05, + "loss": 0.3816, "step": 36575 }, { - "epoch": 1.29, - "learning_rate": 4.3524860183709836e-05, - "loss": 0.2925, + "epoch": 1.3183407215194436, + "grad_norm": 0.18989646434783936, + "learning_rate": 4.318192297129446e-05, + "loss": 0.3873, "step": 36580 }, { - "epoch": 1.29, - "learning_rate": 4.352294711518748e-05, - "loss": 0.3183, + "epoch": 1.318520921180668, + "grad_norm": 0.1784542053937912, + "learning_rate": 4.3179920003495045e-05, + "loss": 0.4507, "step": 36585 }, { - "epoch": 1.29, - "learning_rate": 4.352103380615686e-05, - "loss": 0.3042, + "epoch": 1.3187011208418928, + "grad_norm": 0.162387415766716, + "learning_rate": 4.3177916787997735e-05, + "loss": 0.4457, "step": 36590 }, { - "epoch": 1.29, - "learning_rate": 4.3519120256642814e-05, - "loss": 0.2945, + "epoch": 1.3188813205031176, + "grad_norm": 0.1939140409231186, + "learning_rate": 4.3175913324829834e-05, + "loss": 0.377, "step": 36595 }, { - "epoch": 1.29, - "learning_rate": 4.35172064666702e-05, - "loss": 0.3112, + "epoch": 1.319061520164342, + "grad_norm": 0.19154348969459534, + "learning_rate": 4.317390961401862e-05, + "loss": 0.3741, "step": 36600 }, { - "epoch": 1.29, - "learning_rate": 4.3515292436263863e-05, - "loss": 0.2821, + "epoch": 1.3192417198255666, + "grad_norm": 0.18536292016506195, + "learning_rate": 4.3171905655591425e-05, + "loss": 0.3934, "step": 36605 }, { - "epoch": 1.29, - "learning_rate": 4.351337816544866e-05, - "loss": 0.2871, + "epoch": 1.3194219194867913, + "grad_norm": 0.1465887874364853, + "learning_rate": 4.316990144957553e-05, + "loss": 0.4129, "step": 36610 }, { - "epoch": 1.29, - "learning_rate": 4.351146365424944e-05, - "loss": 0.3181, + "epoch": 1.319602119148016, + "grad_norm": 0.21748924255371094, + "learning_rate": 4.316789699599824e-05, + "loss": 0.4058, "step": 36615 }, { - "epoch": 1.29, - "learning_rate": 4.350954890269106e-05, - "loss": 0.2962, + "epoch": 1.3197823188092406, + "grad_norm": 0.18273966014385223, + "learning_rate": 4.316589229488687e-05, + "loss": 0.429, "step": 36620 }, { - "epoch": 1.29, - "learning_rate": 4.3507633910798394e-05, - "loss": 0.2991, + "epoch": 1.3199625184704653, + "grad_norm": 0.21654346585273743, + "learning_rate": 4.3163887346268735e-05, + "loss": 0.4469, "step": 36625 }, { - "epoch": 1.29, - "learning_rate": 4.3505718678596294e-05, - "loss": 0.3041, + "epoch": 1.3201427181316898, + "grad_norm": 0.18244554102420807, + "learning_rate": 4.316188215017116e-05, + "loss": 0.4285, "step": 36630 }, { - "epoch": 1.29, - "learning_rate": 4.350380320610963e-05, - "loss": 0.301, + "epoch": 1.3203229177929146, + "grad_norm": 0.20105114579200745, + "learning_rate": 4.315987670662145e-05, + "loss": 0.4308, "step": 36635 }, { - "epoch": 1.29, - "learning_rate": 4.350188749336328e-05, - "loss": 0.2916, + "epoch": 1.3205031174541393, + "grad_norm": 0.20013387501239777, + "learning_rate": 4.315787101564693e-05, + "loss": 0.4312, "step": 
36640 }, { - "epoch": 1.29, - "learning_rate": 4.349997154038211e-05, - "loss": 0.2933, + "epoch": 1.3206833171153638, + "grad_norm": 0.20948466658592224, + "learning_rate": 4.315586507727494e-05, + "loss": 0.4199, "step": 36645 }, { - "epoch": 1.29, - "learning_rate": 4.3498055347191005e-05, - "loss": 0.2878, + "epoch": 1.3208635167765885, + "grad_norm": 0.19690115749835968, + "learning_rate": 4.3153858891532804e-05, + "loss": 0.4365, "step": 36650 }, { - "epoch": 1.29, - "learning_rate": 4.3496138913814837e-05, - "loss": 0.3054, + "epoch": 1.321043716437813, + "grad_norm": 0.17856040596961975, + "learning_rate": 4.315185245844785e-05, + "loss": 0.4259, "step": 36655 }, { - "epoch": 1.29, - "learning_rate": 4.34942222402785e-05, - "loss": 0.2858, + "epoch": 1.3212239160990378, + "grad_norm": 0.16999214887619019, + "learning_rate": 4.314984577804743e-05, + "loss": 0.4338, "step": 36660 }, { - "epoch": 1.29, - "learning_rate": 4.3492305326606876e-05, - "loss": 0.2823, + "epoch": 1.3214041157602623, + "grad_norm": 0.1723853200674057, + "learning_rate": 4.314783885035887e-05, + "loss": 0.4113, "step": 36665 }, { - "epoch": 1.29, - "learning_rate": 4.349038817282485e-05, - "loss": 0.2597, + "epoch": 1.321584315421487, + "grad_norm": 0.19773799180984497, + "learning_rate": 4.314583167540952e-05, + "loss": 0.436, "step": 36670 }, { - "epoch": 1.29, - "learning_rate": 4.3488470778957316e-05, - "loss": 0.2985, + "epoch": 1.3217645150827115, + "grad_norm": 0.15514503419399261, + "learning_rate": 4.314382425322672e-05, + "loss": 0.4356, "step": 36675 }, { - "epoch": 1.29, - "learning_rate": 4.348655314502919e-05, - "loss": 0.2786, + "epoch": 1.3219447147439363, + "grad_norm": 0.20653900504112244, + "learning_rate": 4.314181658383783e-05, + "loss": 0.3912, "step": 36680 }, { - "epoch": 1.29, - "learning_rate": 4.348463527106533e-05, - "loss": 0.2555, + "epoch": 1.322124914405161, + "grad_norm": 0.15066716074943542, + "learning_rate": 4.313980866727021e-05, + "loss": 0.4433, "step": 36685 }, { - "epoch": 1.29, - "learning_rate": 4.348271715709068e-05, - "loss": 0.2963, + "epoch": 1.3223051140663855, + "grad_norm": 0.1296568661928177, + "learning_rate": 4.313780050355119e-05, + "loss": 0.3966, "step": 36690 }, { - "epoch": 1.29, - "learning_rate": 4.348079880313013e-05, - "loss": 0.2941, + "epoch": 1.3224853137276102, + "grad_norm": 0.2106575220823288, + "learning_rate": 4.313579209270817e-05, + "loss": 0.3952, "step": 36695 }, { - "epoch": 1.29, - "learning_rate": 4.347888020920858e-05, - "loss": 0.2925, + "epoch": 1.3226655133888348, + "grad_norm": 0.1847856044769287, + "learning_rate": 4.313378343476849e-05, + "loss": 0.4303, "step": 36700 }, { - "epoch": 1.29, - "learning_rate": 4.3476961375350945e-05, - "loss": 0.2928, + "epoch": 1.3228457130500595, + "grad_norm": 0.20785640180110931, + "learning_rate": 4.313177452975952e-05, + "loss": 0.4227, "step": 36705 }, { - "epoch": 1.29, - "learning_rate": 4.3475042301582145e-05, - "loss": 0.28, + "epoch": 1.3230259127112842, + "grad_norm": 0.14709509909152985, + "learning_rate": 4.312976537770863e-05, + "loss": 0.4353, "step": 36710 }, { - "epoch": 1.29, - "learning_rate": 4.347312298792711e-05, - "loss": 0.301, + "epoch": 1.3232061123725087, + "grad_norm": 0.19536839425563812, + "learning_rate": 4.312775597864319e-05, + "loss": 0.4323, "step": 36715 }, { - "epoch": 1.29, - "learning_rate": 4.3471203434410726e-05, - "loss": 0.2977, + "epoch": 1.3233863120337332, + "grad_norm": 0.15768684446811676, + "learning_rate": 4.31257463325906e-05, + "loss": 0.416, 
"step": 36720 }, { - "epoch": 1.29, - "learning_rate": 4.3469283641057954e-05, - "loss": 0.2834, + "epoch": 1.323566511694958, + "grad_norm": 0.20642277598381042, + "learning_rate": 4.312373643957821e-05, + "loss": 0.4052, "step": 36725 }, { - "epoch": 1.29, - "learning_rate": 4.3467363607893695e-05, - "loss": 0.2851, + "epoch": 1.3237467113561827, + "grad_norm": 0.1643398553133011, + "learning_rate": 4.312172629963343e-05, + "loss": 0.4166, "step": 36730 }, { - "epoch": 1.29, - "learning_rate": 4.34654433349429e-05, - "loss": 0.2715, + "epoch": 1.3239269110174072, + "grad_norm": 0.1844344139099121, + "learning_rate": 4.311971591278363e-05, + "loss": 0.4033, "step": 36735 }, { - "epoch": 1.29, - "learning_rate": 4.346352282223048e-05, - "loss": 0.2796, + "epoch": 1.324107110678632, + "grad_norm": 0.1502874493598938, + "learning_rate": 4.311770527905622e-05, + "loss": 0.3871, "step": 36740 }, { - "epoch": 1.29, - "learning_rate": 4.3461602069781396e-05, - "loss": 0.3012, + "epoch": 1.3242873103398565, + "grad_norm": 0.16832222044467926, + "learning_rate": 4.3115694398478574e-05, + "loss": 0.4425, "step": 36745 }, { - "epoch": 1.29, - "learning_rate": 4.3459681077620576e-05, - "loss": 0.2873, + "epoch": 1.3244675100010812, + "grad_norm": 0.14828276634216309, + "learning_rate": 4.31136832710781e-05, + "loss": 0.3985, "step": 36750 }, { - "epoch": 1.29, - "learning_rate": 4.345775984577295e-05, - "loss": 0.2934, + "epoch": 1.324647709662306, + "grad_norm": 0.20640629529953003, + "learning_rate": 4.31116718968822e-05, + "loss": 0.4414, "step": 36755 }, { - "epoch": 1.29, - "learning_rate": 4.345583837426349e-05, - "loss": 0.2894, + "epoch": 1.3248279093235305, + "grad_norm": 0.1458745151758194, + "learning_rate": 4.310966027591828e-05, + "loss": 0.4191, "step": 36760 }, { - "epoch": 1.29, - "learning_rate": 4.345391666311712e-05, - "loss": 0.2864, + "epoch": 1.3250081089847552, + "grad_norm": 0.2120623141527176, + "learning_rate": 4.3107648408213744e-05, + "loss": 0.4122, "step": 36765 }, { - "epoch": 1.29, - "learning_rate": 4.3451994712358815e-05, - "loss": 0.2974, + "epoch": 1.3251883086459797, + "grad_norm": 0.17051535844802856, + "learning_rate": 4.3105636293795995e-05, + "loss": 0.4658, "step": 36770 }, { - "epoch": 1.29, - "learning_rate": 4.345007252201351e-05, - "loss": 0.2784, + "epoch": 1.3253685083072044, + "grad_norm": 0.19851194322109222, + "learning_rate": 4.310362393269247e-05, + "loss": 0.4223, "step": 36775 }, { - "epoch": 1.29, - "learning_rate": 4.3448150092106185e-05, - "loss": 0.3068, + "epoch": 1.325548707968429, + "grad_norm": 0.2429044246673584, + "learning_rate": 4.310161132493057e-05, + "loss": 0.4051, "step": 36780 }, { - "epoch": 1.29, - "learning_rate": 4.344622742266178e-05, - "loss": 0.2844, + "epoch": 1.3257289076296537, + "grad_norm": 0.21794357895851135, + "learning_rate": 4.3099598470537716e-05, + "loss": 0.4102, "step": 36785 }, { - "epoch": 1.29, - "learning_rate": 4.3444689114656336e-05, - "loss": 0.2789, + "epoch": 1.3259091072908782, + "grad_norm": 0.17098356783390045, + "learning_rate": 4.3097585369541336e-05, + "loss": 0.4288, "step": 36790 }, { - "epoch": 1.29, - "learning_rate": 4.344276601410812e-05, - "loss": 0.2819, + "epoch": 1.326089306952103, + "grad_norm": 0.17869386076927185, + "learning_rate": 4.309557202196887e-05, + "loss": 0.4128, "step": 36795 }, { - "epoch": 1.29, - "learning_rate": 4.3440842674092734e-05, - "loss": 0.2916, + "epoch": 1.3262695066133277, + "grad_norm": 0.17495180666446686, + "learning_rate": 4.309355842784773e-05, + 
"loss": 0.4169, "step": 36800 }, { - "epoch": 1.29, - "learning_rate": 4.343891909463517e-05, - "loss": 0.3131, + "epoch": 1.3264497062745522, + "grad_norm": 0.15498143434524536, + "learning_rate": 4.309154458720536e-05, + "loss": 0.4238, "step": 36805 }, { - "epoch": 1.3, - "learning_rate": 4.3436995275760376e-05, - "loss": 0.2675, + "epoch": 1.326629905935777, + "grad_norm": 0.17504778504371643, + "learning_rate": 4.3089530500069194e-05, + "loss": 0.433, "step": 36810 }, { - "epoch": 1.3, - "learning_rate": 4.343507121749336e-05, - "loss": 0.2875, + "epoch": 1.3268101055970014, + "grad_norm": 0.1534842997789383, + "learning_rate": 4.308751616646668e-05, + "loss": 0.4162, "step": 36815 }, { - "epoch": 1.3, - "learning_rate": 4.3433146919859094e-05, - "loss": 0.2685, + "epoch": 1.3269903052582261, + "grad_norm": 0.14600083231925964, + "learning_rate": 4.308550158642526e-05, + "loss": 0.4286, "step": 36820 }, { - "epoch": 1.3, - "learning_rate": 4.343122238288257e-05, - "loss": 0.2749, + "epoch": 1.3271705049194509, + "grad_norm": 0.15580976009368896, + "learning_rate": 4.3083486759972384e-05, + "loss": 0.4622, "step": 36825 }, { - "epoch": 1.3, - "learning_rate": 4.3429297606588747e-05, - "loss": 0.2847, + "epoch": 1.3273507045806754, + "grad_norm": 0.14176791906356812, + "learning_rate": 4.30814716871355e-05, + "loss": 0.4127, "step": 36830 }, { - "epoch": 1.3, - "learning_rate": 4.342737259100266e-05, - "loss": 0.2959, + "epoch": 1.3275309042419, + "grad_norm": 0.1750148981809616, + "learning_rate": 4.3079456367942065e-05, + "loss": 0.4201, "step": 36835 }, { - "epoch": 1.3, - "learning_rate": 4.3425447336149275e-05, - "loss": 0.2964, + "epoch": 1.3277111039031246, + "grad_norm": 0.18358172476291656, + "learning_rate": 4.3077440802419544e-05, + "loss": 0.4328, "step": 36840 }, { - "epoch": 1.3, - "learning_rate": 4.3423521842053606e-05, - "loss": 0.2817, + "epoch": 1.3278913035643494, + "grad_norm": 0.20503877103328705, + "learning_rate": 4.307542499059538e-05, + "loss": 0.4361, "step": 36845 }, { - "epoch": 1.3, - "learning_rate": 4.342159610874063e-05, - "loss": 0.2893, + "epoch": 1.3280715032255739, + "grad_norm": 0.20601756870746613, + "learning_rate": 4.307340893249706e-05, + "loss": 0.4039, "step": 36850 }, { - "epoch": 1.3, - "learning_rate": 4.341967013623539e-05, - "loss": 0.2597, + "epoch": 1.3282517028867986, + "grad_norm": 0.1701328456401825, + "learning_rate": 4.307139262815204e-05, + "loss": 0.3766, "step": 36855 }, { - "epoch": 1.3, - "learning_rate": 4.341774392456286e-05, - "loss": 0.3009, + "epoch": 1.3284319025480231, + "grad_norm": 0.1465604156255722, + "learning_rate": 4.30693760775878e-05, + "loss": 0.4086, "step": 36860 }, { - "epoch": 1.3, - "learning_rate": 4.3415817473748074e-05, - "loss": 0.2799, + "epoch": 1.3286121022092479, + "grad_norm": 0.15594859421253204, + "learning_rate": 4.3067359280831797e-05, + "loss": 0.3908, "step": 36865 }, { - "epoch": 1.3, - "learning_rate": 4.3413890783816026e-05, - "loss": 0.2736, + "epoch": 1.3287923018704726, + "grad_norm": 0.15714693069458008, + "learning_rate": 4.306534223791153e-05, + "loss": 0.4307, "step": 36870 }, { - "epoch": 1.3, - "learning_rate": 4.3411963854791745e-05, - "loss": 0.3029, + "epoch": 1.328972501531697, + "grad_norm": 0.17976239323616028, + "learning_rate": 4.306332494885446e-05, + "loss": 0.4154, "step": 36875 }, { - "epoch": 1.3, - "learning_rate": 4.341003668670025e-05, - "loss": 0.2777, + "epoch": 1.3291527011929218, + "grad_norm": 0.17832331359386444, + "learning_rate": 4.30613074136881e-05, + 
"loss": 0.4376, "step": 36880 }, { - "epoch": 1.3, - "learning_rate": 4.340810927956656e-05, - "loss": 0.2807, + "epoch": 1.3293329008541463, + "grad_norm": 0.15694265067577362, + "learning_rate": 4.305928963243992e-05, + "loss": 0.4172, "step": 36885 }, { - "epoch": 1.3, - "learning_rate": 4.340618163341569e-05, - "loss": 0.2852, + "epoch": 1.329513100515371, + "grad_norm": 0.1645699441432953, + "learning_rate": 4.305727160513741e-05, + "loss": 0.459, "step": 36890 }, { - "epoch": 1.3, - "learning_rate": 4.340425374827269e-05, - "loss": 0.2986, + "epoch": 1.3296933001765956, + "grad_norm": 0.16790169477462769, + "learning_rate": 4.305525333180807e-05, + "loss": 0.3762, "step": 36895 }, { - "epoch": 1.3, - "learning_rate": 4.3402325624162585e-05, - "loss": 0.2926, + "epoch": 1.3298734998378203, + "grad_norm": 0.1775948405265808, + "learning_rate": 4.3053234812479406e-05, + "loss": 0.3997, "step": 36900 }, { - "epoch": 1.3, - "learning_rate": 4.340039726111041e-05, - "loss": 0.273, + "epoch": 1.3300536994990448, + "grad_norm": 0.19268429279327393, + "learning_rate": 4.305121604717891e-05, + "loss": 0.4283, "step": 36905 }, { - "epoch": 1.3, - "learning_rate": 4.3398468659141206e-05, - "loss": 0.2762, + "epoch": 1.3302338991602696, + "grad_norm": 0.1967247873544693, + "learning_rate": 4.304919703593409e-05, + "loss": 0.4265, "step": 36910 }, { - "epoch": 1.3, - "learning_rate": 4.3396539818280004e-05, - "loss": 0.2821, + "epoch": 1.3304140988214943, + "grad_norm": 0.1895369440317154, + "learning_rate": 4.304717777877246e-05, + "loss": 0.3815, "step": 36915 }, { - "epoch": 1.3, - "learning_rate": 4.339461073855186e-05, - "loss": 0.3022, + "epoch": 1.3305942984827188, + "grad_norm": 0.1666320413351059, + "learning_rate": 4.304515827572152e-05, + "loss": 0.4227, "step": 36920 }, { - "epoch": 1.3, - "learning_rate": 4.3392681419981815e-05, - "loss": 0.3041, + "epoch": 1.3307744981439436, + "grad_norm": 0.19722457230091095, + "learning_rate": 4.304313852680879e-05, + "loss": 0.4094, "step": 36925 }, { - "epoch": 1.3, - "learning_rate": 4.339075186259493e-05, - "loss": 0.2892, + "epoch": 1.330954697805168, + "grad_norm": 0.20568078756332397, + "learning_rate": 4.3041118532061794e-05, + "loss": 0.415, "step": 36930 }, { - "epoch": 1.3, - "learning_rate": 4.338882206641625e-05, - "loss": 0.2866, + "epoch": 1.3311348974663928, + "grad_norm": 0.16985160112380981, + "learning_rate": 4.303909829150805e-05, + "loss": 0.4198, "step": 36935 }, { - "epoch": 1.3, - "learning_rate": 4.3386892031470826e-05, - "loss": 0.2874, + "epoch": 1.3313150971276175, + "grad_norm": 0.1945793330669403, + "learning_rate": 4.3037077805175085e-05, + "loss": 0.4086, "step": 36940 }, { - "epoch": 1.3, - "learning_rate": 4.338496175778374e-05, - "loss": 0.2866, + "epoch": 1.331495296788842, + "grad_norm": 0.19523586332798004, + "learning_rate": 4.303505707309043e-05, + "loss": 0.4424, "step": 36945 }, { - "epoch": 1.3, - "learning_rate": 4.338303124538004e-05, - "loss": 0.2786, + "epoch": 1.3316754964500666, + "grad_norm": 0.18730002641677856, + "learning_rate": 4.303303609528161e-05, + "loss": 0.4097, "step": 36950 }, { - "epoch": 1.3, - "learning_rate": 4.338110049428478e-05, - "loss": 0.264, + "epoch": 1.3318556961112913, + "grad_norm": 0.22210468351840973, + "learning_rate": 4.303101487177616e-05, + "loss": 0.4432, "step": 36955 }, { - "epoch": 1.3, - "learning_rate": 4.3379169504523054e-05, - "loss": 0.2933, + "epoch": 1.332035895772516, + "grad_norm": 0.16588768362998962, + "learning_rate": 4.3028993402601636e-05, + 
"loss": 0.4628, "step": 36960 }, { - "epoch": 1.3, - "learning_rate": 4.337723827611992e-05, - "loss": 0.2882, + "epoch": 1.3322160954337405, + "grad_norm": 0.19048090279102325, + "learning_rate": 4.302697168778556e-05, + "loss": 0.4337, "step": 36965 }, { - "epoch": 1.3, - "learning_rate": 4.3375306809100455e-05, - "loss": 0.276, + "epoch": 1.3323962950949653, + "grad_norm": 0.18421435356140137, + "learning_rate": 4.302494972735549e-05, + "loss": 0.4059, "step": 36970 }, { - "epoch": 1.3, - "learning_rate": 4.337337510348975e-05, - "loss": 0.2903, + "epoch": 1.3325764947561898, + "grad_norm": 0.15311434864997864, + "learning_rate": 4.3022927521338965e-05, + "loss": 0.3721, "step": 36975 }, { - "epoch": 1.3, - "learning_rate": 4.337144315931288e-05, - "loss": 0.2877, + "epoch": 1.3327566944174145, + "grad_norm": 0.1969848871231079, + "learning_rate": 4.302090506976354e-05, + "loss": 0.4358, "step": 36980 }, { - "epoch": 1.3, - "learning_rate": 4.3369510976594917e-05, - "loss": 0.2836, + "epoch": 1.3329368940786392, + "grad_norm": 0.19193395972251892, + "learning_rate": 4.301888237265678e-05, + "loss": 0.4397, "step": 36985 }, { - "epoch": 1.3, - "learning_rate": 4.336757855536097e-05, - "loss": 0.3013, + "epoch": 1.3331170937398638, + "grad_norm": 0.19378533959388733, + "learning_rate": 4.301685943004622e-05, + "loss": 0.4522, "step": 36990 }, { - "epoch": 1.3, - "learning_rate": 4.336564589563611e-05, - "loss": 0.2935, + "epoch": 1.3332972934010883, + "grad_norm": 0.19879907369613647, + "learning_rate": 4.301483624195945e-05, + "loss": 0.4517, "step": 36995 }, { - "epoch": 1.3, - "learning_rate": 4.336371299744545e-05, - "loss": 0.295, + "epoch": 1.333477493062313, + "grad_norm": 0.16110192239284515, + "learning_rate": 4.301281280842403e-05, + "loss": 0.4311, "step": 37000 }, { - "epoch": 1.3, - "eval_loss": 0.2844620645046234, - "eval_runtime": 10.5304, - "eval_samples_per_second": 9.496, - "eval_steps_per_second": 9.496, + "epoch": 1.333477493062313, + "eval_loss": 0.4481649100780487, + "eval_runtime": 3.5338, + "eval_samples_per_second": 28.298, + "eval_steps_per_second": 7.075, "step": 37000 }, { - "epoch": 1.3, - "learning_rate": 4.336177986081407e-05, - "loss": 0.3063, + "epoch": 1.3336576927235377, + "grad_norm": 0.1890387237071991, + "learning_rate": 4.301078912946751e-05, + "loss": 0.4437, "step": 37005 }, { - "epoch": 1.3, - "learning_rate": 4.3359846485767083e-05, - "loss": 0.2739, + "epoch": 1.3338378923847622, + "grad_norm": 0.2009790688753128, + "learning_rate": 4.300876520511748e-05, + "loss": 0.4314, "step": 37010 }, { - "epoch": 1.3, - "learning_rate": 4.33579128723296e-05, - "loss": 0.2798, + "epoch": 1.334018092045987, + "grad_norm": 0.23660525679588318, + "learning_rate": 4.300674103540151e-05, + "loss": 0.4406, "step": 37015 }, { - "epoch": 1.3, - "learning_rate": 4.3355979020526704e-05, - "loss": 0.3129, + "epoch": 1.3341982917072115, + "grad_norm": 0.20665594935417175, + "learning_rate": 4.300471662034719e-05, + "loss": 0.3956, "step": 37020 }, { - "epoch": 1.3, - "learning_rate": 4.335404493038352e-05, - "loss": 0.2926, + "epoch": 1.3343784913684362, + "grad_norm": 0.19279490411281586, + "learning_rate": 4.3002691959982076e-05, + "loss": 0.4062, "step": 37025 }, { - "epoch": 1.3, - "learning_rate": 4.3352110601925155e-05, - "loss": 0.3111, + "epoch": 1.334558691029661, + "grad_norm": 0.20935142040252686, + "learning_rate": 4.3000667054333775e-05, + "loss": 0.4324, "step": 37030 }, { - "epoch": 1.3, - "learning_rate": 4.3350176035176734e-05, - "loss": 0.3063, + 
"epoch": 1.3347388906908855, + "grad_norm": 0.1611267477273941, + "learning_rate": 4.2998641903429875e-05, + "loss": 0.4183, "step": 37035 }, { - "epoch": 1.3, - "learning_rate": 4.3348241230163367e-05, - "loss": 0.2934, + "epoch": 1.3349190903521102, + "grad_norm": 0.2519780993461609, + "learning_rate": 4.299661650729796e-05, + "loss": 0.447, "step": 37040 }, { - "epoch": 1.3, - "learning_rate": 4.3346306186910185e-05, - "loss": 0.3079, + "epoch": 1.3350992900133347, + "grad_norm": 0.1985885351896286, + "learning_rate": 4.2994590865965634e-05, + "loss": 0.4577, "step": 37045 }, { - "epoch": 1.3, - "learning_rate": 4.33443709054423e-05, - "loss": 0.3001, + "epoch": 1.3352794896745594, + "grad_norm": 0.14682763814926147, + "learning_rate": 4.2992564979460484e-05, + "loss": 0.397, "step": 37050 }, { - "epoch": 1.3, - "learning_rate": 4.334243538578486e-05, - "loss": 0.303, + "epoch": 1.3354596893357842, + "grad_norm": 0.17361211776733398, + "learning_rate": 4.299053884781012e-05, + "loss": 0.3837, "step": 37055 }, { - "epoch": 1.3, - "learning_rate": 4.334049962796297e-05, - "loss": 0.3096, + "epoch": 1.3356398889970087, + "grad_norm": 0.17351488769054413, + "learning_rate": 4.2988512471042154e-05, + "loss": 0.4613, "step": 37060 }, { - "epoch": 1.3, - "learning_rate": 4.333856363200179e-05, - "loss": 0.2888, + "epoch": 1.3358200886582332, + "grad_norm": 0.16963542997837067, + "learning_rate": 4.2986485849184185e-05, + "loss": 0.4384, "step": 37065 }, { - "epoch": 1.3, - "learning_rate": 4.3336627397926444e-05, - "loss": 0.294, + "epoch": 1.336000288319458, + "grad_norm": 0.16560989618301392, + "learning_rate": 4.298445898226383e-05, + "loss": 0.4036, "step": 37070 }, { - "epoch": 1.3, - "learning_rate": 4.3334690925762074e-05, - "loss": 0.3129, + "epoch": 1.3361804879806827, + "grad_norm": 0.1843060851097107, + "learning_rate": 4.29824318703087e-05, + "loss": 0.384, "step": 37075 }, { - "epoch": 1.3, - "learning_rate": 4.333275421553382e-05, - "loss": 0.2793, + "epoch": 1.3363606876419072, + "grad_norm": 0.20342443883419037, + "learning_rate": 4.298040451334642e-05, + "loss": 0.4224, "step": 37080 }, { - "epoch": 1.3, - "learning_rate": 4.333081726726684e-05, - "loss": 0.275, + "epoch": 1.336540887303132, + "grad_norm": 0.17561762034893036, + "learning_rate": 4.297837691140461e-05, + "loss": 0.4238, "step": 37085 }, { - "epoch": 1.3, - "learning_rate": 4.332888008098629e-05, - "loss": 0.2776, + "epoch": 1.3367210869643564, + "grad_norm": 0.20807789266109467, + "learning_rate": 4.297634906451089e-05, + "loss": 0.4155, "step": 37090 }, { - "epoch": 1.31, - "learning_rate": 4.33269426567173e-05, - "loss": 0.286, + "epoch": 1.3369012866255812, + "grad_norm": 0.19749189913272858, + "learning_rate": 4.29743209726929e-05, + "loss": 0.4417, "step": 37095 }, { - "epoch": 1.31, - "learning_rate": 4.332500499448503e-05, - "loss": 0.2853, + "epoch": 1.337081486286806, + "grad_norm": 0.15271851420402527, + "learning_rate": 4.297229263597827e-05, + "loss": 0.3862, "step": 37100 }, { - "epoch": 1.31, - "learning_rate": 4.3323067094314654e-05, - "loss": 0.2795, + "epoch": 1.3372616859480304, + "grad_norm": 0.19541774690151215, + "learning_rate": 4.2970264054394625e-05, + "loss": 0.3878, "step": 37105 }, { - "epoch": 1.31, - "learning_rate": 4.332112895623133e-05, - "loss": 0.2884, + "epoch": 1.337441885609255, + "grad_norm": 0.20491370558738708, + "learning_rate": 4.296823522796961e-05, + "loss": 0.4285, "step": 37110 }, { - "epoch": 1.31, - "learning_rate": 4.3319190580260215e-05, - "loss": 0.2788, + 
"epoch": 1.3376220852704797, + "grad_norm": 0.19413931667804718, + "learning_rate": 4.2966206156730875e-05, + "loss": 0.4268, "step": 37115 }, { - "epoch": 1.31, - "learning_rate": 4.331725196642649e-05, - "loss": 0.2847, + "epoch": 1.3378022849317044, + "grad_norm": 0.17734622955322266, + "learning_rate": 4.296417684070606e-05, + "loss": 0.3678, "step": 37120 }, { - "epoch": 1.31, - "learning_rate": 4.331531311475531e-05, - "loss": 0.3001, + "epoch": 1.337982484592929, + "grad_norm": 0.1772347390651703, + "learning_rate": 4.29621472799228e-05, + "loss": 0.4416, "step": 37125 }, { - "epoch": 1.31, - "learning_rate": 4.331337402527187e-05, - "loss": 0.2949, + "epoch": 1.3381626842541536, + "grad_norm": 0.2039223164319992, + "learning_rate": 4.296011747440878e-05, + "loss": 0.4219, "step": 37130 }, { - "epoch": 1.31, - "learning_rate": 4.331143469800133e-05, - "loss": 0.3044, + "epoch": 1.3383428839153781, + "grad_norm": 0.17975406348705292, + "learning_rate": 4.295808742419163e-05, + "loss": 0.4512, "step": 37135 }, { - "epoch": 1.31, - "learning_rate": 4.330949513296888e-05, - "loss": 0.2916, + "epoch": 1.3385230835766029, + "grad_norm": 0.21606260538101196, + "learning_rate": 4.295605712929901e-05, + "loss": 0.4352, "step": 37140 }, { - "epoch": 1.31, - "learning_rate": 4.33075553301997e-05, - "loss": 0.2822, + "epoch": 1.3387032832378276, + "grad_norm": 0.16076348721981049, + "learning_rate": 4.295402658975859e-05, + "loss": 0.4171, "step": 37145 }, { - "epoch": 1.31, - "learning_rate": 4.330561528971898e-05, - "loss": 0.2897, + "epoch": 1.3388834828990521, + "grad_norm": 0.14485345780849457, + "learning_rate": 4.295199580559804e-05, + "loss": 0.4193, "step": 37150 }, { - "epoch": 1.31, - "learning_rate": 4.330367501155191e-05, - "loss": 0.2808, + "epoch": 1.3390636825602769, + "grad_norm": 0.17786848545074463, + "learning_rate": 4.2949964776845014e-05, + "loss": 0.3925, "step": 37155 }, { - "epoch": 1.31, - "learning_rate": 4.3301734495723687e-05, - "loss": 0.2806, + "epoch": 1.3392438822215014, + "grad_norm": 0.2276924103498459, + "learning_rate": 4.29479335035272e-05, + "loss": 0.4485, "step": 37160 }, { - "epoch": 1.31, - "learning_rate": 4.32997937422595e-05, - "loss": 0.2932, + "epoch": 1.339424081882726, + "grad_norm": 0.17060185968875885, + "learning_rate": 4.294590198567226e-05, + "loss": 0.4437, "step": 37165 }, { - "epoch": 1.31, - "learning_rate": 4.3297852751184555e-05, - "loss": 0.2838, + "epoch": 1.3396042815439506, + "grad_norm": 0.18180356919765472, + "learning_rate": 4.294387022330789e-05, + "loss": 0.3794, "step": 37170 }, { - "epoch": 1.31, - "learning_rate": 4.3295911522524044e-05, - "loss": 0.2988, + "epoch": 1.3397844812051753, + "grad_norm": 0.17206217348575592, + "learning_rate": 4.294183821646175e-05, + "loss": 0.4358, "step": 37175 }, { - "epoch": 1.31, - "learning_rate": 4.329397005630318e-05, - "loss": 0.2963, + "epoch": 1.3399646808663999, + "grad_norm": 0.19352799654006958, + "learning_rate": 4.293980596516155e-05, + "loss": 0.4043, "step": 37180 }, { - "epoch": 1.31, - "learning_rate": 4.3292028352547176e-05, - "loss": 0.2902, + "epoch": 1.3401448805276246, + "grad_norm": 0.2113627791404724, + "learning_rate": 4.2937773469434963e-05, + "loss": 0.3979, "step": 37185 }, { - "epoch": 1.31, - "learning_rate": 4.3290086411281234e-05, - "loss": 0.2953, + "epoch": 1.3403250801888493, + "grad_norm": 0.1969025582075119, + "learning_rate": 4.293574072930968e-05, + "loss": 0.4085, "step": 37190 }, { - "epoch": 1.31, - "learning_rate": 4.328814423253058e-05, - "loss": 
0.2826, + "epoch": 1.3405052798500738, + "grad_norm": 0.1911691427230835, + "learning_rate": 4.29337077448134e-05, + "loss": 0.4124, "step": 37195 }, { - "epoch": 1.31, - "learning_rate": 4.3286201816320415e-05, - "loss": 0.2786, + "epoch": 1.3406854795112986, + "grad_norm": 0.16924938559532166, + "learning_rate": 4.293167451597383e-05, + "loss": 0.4439, "step": 37200 }, { - "epoch": 1.31, - "learning_rate": 4.328425916267598e-05, - "loss": 0.288, + "epoch": 1.340865679172523, + "grad_norm": 0.15736305713653564, + "learning_rate": 4.292964104281867e-05, + "loss": 0.3968, "step": 37205 }, { - "epoch": 1.31, - "learning_rate": 4.328231627162248e-05, - "loss": 0.285, + "epoch": 1.3410458788337478, + "grad_norm": 0.22912681102752686, + "learning_rate": 4.2927607325375616e-05, + "loss": 0.4219, "step": 37210 }, { - "epoch": 1.31, - "learning_rate": 4.328037314318516e-05, - "loss": 0.2882, + "epoch": 1.3412260784949726, + "grad_norm": 0.1928148865699768, + "learning_rate": 4.292557336367239e-05, + "loss": 0.3937, "step": 37215 }, { - "epoch": 1.31, - "learning_rate": 4.327842977738924e-05, - "loss": 0.287, + "epoch": 1.341406278156197, + "grad_norm": 0.19849920272827148, + "learning_rate": 4.2923539157736695e-05, + "loss": 0.4182, "step": 37220 }, { - "epoch": 1.31, - "learning_rate": 4.327648617425995e-05, - "loss": 0.2864, + "epoch": 1.3415864778174216, + "grad_norm": 0.14143873751163483, + "learning_rate": 4.292150470759624e-05, + "loss": 0.3873, "step": 37225 }, { - "epoch": 1.31, - "learning_rate": 4.3274542333822544e-05, - "loss": 0.3049, + "epoch": 1.3417666774786463, + "grad_norm": 0.17856481671333313, + "learning_rate": 4.291947001327876e-05, + "loss": 0.435, "step": 37230 }, { - "epoch": 1.31, - "learning_rate": 4.327259825610224e-05, - "loss": 0.2877, + "epoch": 1.341946877139871, + "grad_norm": 0.15934741497039795, + "learning_rate": 4.291743507481197e-05, + "loss": 0.4061, "step": 37235 }, { - "epoch": 1.31, - "learning_rate": 4.327065394112429e-05, - "loss": 0.2963, + "epoch": 1.3421270768010956, + "grad_norm": 0.193971648812294, + "learning_rate": 4.2915399892223595e-05, + "loss": 0.3881, "step": 37240 }, { - "epoch": 1.31, - "learning_rate": 4.3268709388913944e-05, - "loss": 0.2902, + "epoch": 1.3423072764623203, + "grad_norm": 0.22746896743774414, + "learning_rate": 4.2913364465541366e-05, + "loss": 0.411, "step": 37245 }, { - "epoch": 1.31, - "learning_rate": 4.326676459949644e-05, - "loss": 0.293, + "epoch": 1.3424874761235448, + "grad_norm": 0.21635814011096954, + "learning_rate": 4.291132879479302e-05, + "loss": 0.4091, "step": 37250 }, { - "epoch": 1.31, - "learning_rate": 4.326481957289704e-05, - "loss": 0.2932, + "epoch": 1.3426676757847695, + "grad_norm": 0.19370247423648834, + "learning_rate": 4.290929288000628e-05, + "loss": 0.4045, "step": 37255 }, { - "epoch": 1.31, - "learning_rate": 4.326287430914099e-05, - "loss": 0.3053, + "epoch": 1.3428478754459943, + "grad_norm": 0.14422830939292908, + "learning_rate": 4.290725672120889e-05, + "loss": 0.3829, "step": 37260 }, { - "epoch": 1.31, - "learning_rate": 4.326092880825355e-05, - "loss": 0.2989, + "epoch": 1.3430280751072188, + "grad_norm": 0.16739609837532043, + "learning_rate": 4.29052203184286e-05, + "loss": 0.4541, "step": 37265 }, { - "epoch": 1.31, - "learning_rate": 4.3258983070259994e-05, - "loss": 0.2813, + "epoch": 1.3432082747684435, + "grad_norm": 0.17968153953552246, + "learning_rate": 4.290318367169314e-05, + "loss": 0.4112, "step": 37270 }, { - "epoch": 1.31, - "learning_rate": 4.325703709518557e-05, - 
"loss": 0.2908, + "epoch": 1.343388474429668, + "grad_norm": 0.20018011331558228, + "learning_rate": 4.290114678103028e-05, + "loss": 0.4536, "step": 37275 }, { - "epoch": 1.31, - "learning_rate": 4.325509088305556e-05, - "loss": 0.2831, + "epoch": 1.3435686740908928, + "grad_norm": 0.1976526975631714, + "learning_rate": 4.289910964646776e-05, + "loss": 0.4011, "step": 37280 }, { - "epoch": 1.31, - "learning_rate": 4.325314443389522e-05, - "loss": 0.2986, + "epoch": 1.3437488737521173, + "grad_norm": 0.19108478724956512, + "learning_rate": 4.289707226803333e-05, + "loss": 0.4303, "step": 37285 }, { - "epoch": 1.31, - "learning_rate": 4.325119774772982e-05, - "loss": 0.2859, + "epoch": 1.343929073413342, + "grad_norm": 0.1997978538274765, + "learning_rate": 4.289503464575476e-05, + "loss": 0.4182, "step": 37290 }, { - "epoch": 1.31, - "learning_rate": 4.324925082458465e-05, - "loss": 0.2937, + "epoch": 1.3441092730745665, + "grad_norm": 0.20176932215690613, + "learning_rate": 4.28929967796598e-05, + "loss": 0.4345, "step": 37295 }, { - "epoch": 1.31, - "learning_rate": 4.3247303664485e-05, - "loss": 0.2865, + "epoch": 1.3442894727357912, + "grad_norm": 0.1834762841463089, + "learning_rate": 4.289095866977623e-05, + "loss": 0.4147, "step": 37300 }, { - "epoch": 1.31, - "learning_rate": 4.324535626745612e-05, - "loss": 0.2826, + "epoch": 1.344469672397016, + "grad_norm": 0.18414629995822906, + "learning_rate": 4.288892031613181e-05, + "loss": 0.4264, "step": 37305 }, { - "epoch": 1.31, - "learning_rate": 4.324340863352332e-05, - "loss": 0.2826, + "epoch": 1.3446498720582405, + "grad_norm": 0.175818532705307, + "learning_rate": 4.288688171875431e-05, + "loss": 0.379, "step": 37310 }, { - "epoch": 1.31, - "learning_rate": 4.3241460762711874e-05, - "loss": 0.2433, + "epoch": 1.3448300717194652, + "grad_norm": 0.16372618079185486, + "learning_rate": 4.288484287767152e-05, + "loss": 0.3926, "step": 37315 }, { - "epoch": 1.31, - "learning_rate": 4.3239512655047086e-05, - "loss": 0.2867, + "epoch": 1.3450102713806897, + "grad_norm": 0.17011037468910217, + "learning_rate": 4.2882803792911205e-05, + "loss": 0.4143, "step": 37320 }, { - "epoch": 1.31, - "learning_rate": 4.323756431055425e-05, - "loss": 0.2893, + "epoch": 1.3451904710419145, + "grad_norm": 0.13715289533138275, + "learning_rate": 4.288076446450115e-05, + "loss": 0.3912, "step": 37325 }, { - "epoch": 1.31, - "learning_rate": 4.323561572925866e-05, - "loss": 0.3029, + "epoch": 1.3453706707031392, + "grad_norm": 0.17241427302360535, + "learning_rate": 4.2878724892469135e-05, + "loss": 0.3965, "step": 37330 }, { - "epoch": 1.31, - "learning_rate": 4.323366691118561e-05, - "loss": 0.2687, + "epoch": 1.3455508703643637, + "grad_norm": 0.17453129589557648, + "learning_rate": 4.287668507684296e-05, + "loss": 0.4192, "step": 37335 }, { - "epoch": 1.31, - "learning_rate": 4.323171785636042e-05, - "loss": 0.2693, + "epoch": 1.3457310700255882, + "grad_norm": 0.22554276883602142, + "learning_rate": 4.287464501765041e-05, + "loss": 0.3842, "step": 37340 }, { - "epoch": 1.31, - "learning_rate": 4.322976856480839e-05, - "loss": 0.2726, + "epoch": 1.345911269686813, + "grad_norm": 0.1914464235305786, + "learning_rate": 4.2872604714919285e-05, + "loss": 0.4156, "step": 37345 }, { - "epoch": 1.31, - "learning_rate": 4.322781903655482e-05, - "loss": 0.2934, + "epoch": 1.3460914693480377, + "grad_norm": 0.16608940064907074, + "learning_rate": 4.287056416867738e-05, + "loss": 0.4284, "step": 37350 }, { - "epoch": 1.31, - "learning_rate": 
4.322586927162504e-05, - "loss": 0.2632, + "epoch": 1.3462716690092622, + "grad_norm": 0.20731467008590698, + "learning_rate": 4.28685233789525e-05, + "loss": 0.4522, "step": 37355 }, { - "epoch": 1.31, - "learning_rate": 4.322391927004435e-05, - "loss": 0.3015, + "epoch": 1.346451868670487, + "grad_norm": 0.1691913902759552, + "learning_rate": 4.286648234577244e-05, + "loss": 0.4299, "step": 37360 }, { - "epoch": 1.31, - "learning_rate": 4.322196903183809e-05, - "loss": 0.2796, + "epoch": 1.3466320683317115, + "grad_norm": 0.19073499739170074, + "learning_rate": 4.286444106916503e-05, + "loss": 0.4108, "step": 37365 }, { - "epoch": 1.31, - "learning_rate": 4.322001855703156e-05, - "loss": 0.292, + "epoch": 1.3468122679929362, + "grad_norm": 0.1724918931722641, + "learning_rate": 4.286239954915806e-05, + "loss": 0.4408, "step": 37370 }, { - "epoch": 1.31, - "learning_rate": 4.3218067845650103e-05, - "loss": 0.2613, + "epoch": 1.346992467654161, + "grad_norm": 0.19788911938667297, + "learning_rate": 4.2860357785779356e-05, + "loss": 0.4212, "step": 37375 }, { - "epoch": 1.32, - "learning_rate": 4.321611689771904e-05, - "loss": 0.2833, + "epoch": 1.3471726673153854, + "grad_norm": 0.17659629881381989, + "learning_rate": 4.2858315779056734e-05, + "loss": 0.4535, "step": 37380 }, { - "epoch": 1.32, - "learning_rate": 4.321416571326371e-05, - "loss": 0.3139, + "epoch": 1.3473528669766102, + "grad_norm": 0.1844049096107483, + "learning_rate": 4.285627352901802e-05, + "loss": 0.388, "step": 37385 }, { - "epoch": 1.32, - "learning_rate": 4.3212214292309435e-05, - "loss": 0.2846, + "epoch": 1.3475330666378347, + "grad_norm": 0.19511617720127106, + "learning_rate": 4.285423103569103e-05, + "loss": 0.4225, "step": 37390 }, { - "epoch": 1.32, - "learning_rate": 4.321026263488156e-05, - "loss": 0.2884, + "epoch": 1.3477132662990594, + "grad_norm": 0.1395874172449112, + "learning_rate": 4.2852188299103614e-05, + "loss": 0.3867, "step": 37395 }, { - "epoch": 1.32, - "learning_rate": 4.320831074100542e-05, - "loss": 0.2962, + "epoch": 1.347893465960284, + "grad_norm": 0.1697840690612793, + "learning_rate": 4.2850145319283575e-05, + "loss": 0.3939, "step": 37400 }, { - "epoch": 1.32, - "learning_rate": 4.3206358610706376e-05, - "loss": 0.3143, + "epoch": 1.3480736656215087, + "grad_norm": 0.20716294646263123, + "learning_rate": 4.284810209625876e-05, + "loss": 0.4563, "step": 37405 }, { - "epoch": 1.32, - "learning_rate": 4.3204406244009754e-05, - "loss": 0.2929, + "epoch": 1.3482538652827332, + "grad_norm": 0.19542010128498077, + "learning_rate": 4.2846058630057016e-05, + "loss": 0.4322, "step": 37410 }, { - "epoch": 1.32, - "learning_rate": 4.3202453640940924e-05, - "loss": 0.2903, + "epoch": 1.348434064943958, + "grad_norm": 0.15762588381767273, + "learning_rate": 4.2844014920706176e-05, + "loss": 0.4279, "step": 37415 }, { - "epoch": 1.32, - "learning_rate": 4.320050080152522e-05, - "loss": 0.2909, + "epoch": 1.3486142646051826, + "grad_norm": 0.20713409781455994, + "learning_rate": 4.284197096823409e-05, + "loss": 0.4345, "step": 37420 }, { - "epoch": 1.32, - "learning_rate": 4.319854772578801e-05, - "loss": 0.2865, + "epoch": 1.3487944642664071, + "grad_norm": 0.18533405661582947, + "learning_rate": 4.2839926772668605e-05, + "loss": 0.4219, "step": 37425 }, { - "epoch": 1.32, - "learning_rate": 4.319659441375465e-05, - "loss": 0.3007, + "epoch": 1.3489746639276319, + "grad_norm": 0.19821499288082123, + "learning_rate": 4.283788233403757e-05, + "loss": 0.4249, "step": 37430 }, { - "epoch": 1.32, - 
"learning_rate": 4.319464086545052e-05, - "loss": 0.2722, + "epoch": 1.3491548635888564, + "grad_norm": 0.20067061483860016, + "learning_rate": 4.283583765236884e-05, + "loss": 0.4566, "step": 37435 }, { - "epoch": 1.32, - "learning_rate": 4.319268708090095e-05, - "loss": 0.3013, + "epoch": 1.3493350632500811, + "grad_norm": 0.16984498500823975, + "learning_rate": 4.2833792727690275e-05, + "loss": 0.38, "step": 37440 }, { - "epoch": 1.32, - "learning_rate": 4.3190733060131326e-05, - "loss": 0.2961, + "epoch": 1.3495152629113059, + "grad_norm": 0.1640443056821823, + "learning_rate": 4.283174756002973e-05, + "loss": 0.4147, "step": 37445 }, { - "epoch": 1.32, - "learning_rate": 4.318877880316703e-05, - "loss": 0.3062, + "epoch": 1.3496954625725304, + "grad_norm": 0.16982309520244598, + "learning_rate": 4.2829702149415094e-05, + "loss": 0.4274, "step": 37450 }, { - "epoch": 1.32, - "learning_rate": 4.318682431003343e-05, - "loss": 0.2742, + "epoch": 1.3498756622337549, + "grad_norm": 0.19028659164905548, + "learning_rate": 4.2827656495874205e-05, + "loss": 0.4304, "step": 37455 }, { - "epoch": 1.32, - "learning_rate": 4.31848695807559e-05, - "loss": 0.2973, + "epoch": 1.3500558618949796, + "grad_norm": 0.20685800909996033, + "learning_rate": 4.282561059943495e-05, + "loss": 0.4145, "step": 37460 }, { - "epoch": 1.32, - "learning_rate": 4.318291461535982e-05, - "loss": 0.2853, + "epoch": 1.3502360615562043, + "grad_norm": 0.18476605415344238, + "learning_rate": 4.2823564460125206e-05, + "loss": 0.4054, "step": 37465 }, { - "epoch": 1.32, - "learning_rate": 4.318095941387058e-05, - "loss": 0.2627, + "epoch": 1.3504162612174289, + "grad_norm": 0.17075756192207336, + "learning_rate": 4.2821518077972845e-05, + "loss": 0.3853, "step": 37470 }, { - "epoch": 1.32, - "learning_rate": 4.3179003976313556e-05, - "loss": 0.2873, + "epoch": 1.3505964608786536, + "grad_norm": 0.1620296984910965, + "learning_rate": 4.281947145300574e-05, + "loss": 0.3927, "step": 37475 }, { - "epoch": 1.32, - "learning_rate": 4.3177048302714154e-05, - "loss": 0.2737, + "epoch": 1.350776660539878, + "grad_norm": 0.1557290405035019, + "learning_rate": 4.2817424585251804e-05, + "loss": 0.4178, "step": 37480 }, { - "epoch": 1.32, - "learning_rate": 4.317509239309776e-05, - "loss": 0.2906, + "epoch": 1.3509568602011028, + "grad_norm": 0.17383769154548645, + "learning_rate": 4.2815377474738894e-05, + "loss": 0.4483, "step": 37485 }, { - "epoch": 1.32, - "learning_rate": 4.3173136247489766e-05, - "loss": 0.2583, + "epoch": 1.3511370598623276, + "grad_norm": 0.19885095953941345, + "learning_rate": 4.2813330121494924e-05, + "loss": 0.4472, "step": 37490 }, { - "epoch": 1.32, - "learning_rate": 4.317117986591557e-05, - "loss": 0.3009, + "epoch": 1.351317259523552, + "grad_norm": 0.18634334206581116, + "learning_rate": 4.281128252554778e-05, + "loss": 0.4156, "step": 37495 }, { - "epoch": 1.32, - "learning_rate": 4.316922324840058e-05, - "loss": 0.2795, + "epoch": 1.3514974591847766, + "grad_norm": 0.21078574657440186, + "learning_rate": 4.280923468692535e-05, + "loss": 0.4509, "step": 37500 }, { - "epoch": 1.32, - "eval_loss": 0.28416377305984497, - "eval_runtime": 10.5464, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 1.3514974591847766, + "eval_loss": 0.44750291109085083, + "eval_runtime": 3.5455, + "eval_samples_per_second": 28.205, + "eval_steps_per_second": 7.051, "step": 37500 }, { - "epoch": 1.32, - "learning_rate": 4.3167266394970206e-05, - "loss": 0.2812, + "epoch": 1.3516776588460013, + 
"grad_norm": 0.21261395514011383, + "learning_rate": 4.280718660565556e-05, + "loss": 0.4191, "step": 37505 }, { - "epoch": 1.32, - "learning_rate": 4.316530930564985e-05, - "loss": 0.2841, + "epoch": 1.351857858507226, + "grad_norm": 0.1816295087337494, + "learning_rate": 4.280513828176629e-05, + "loss": 0.4367, "step": 37510 }, { - "epoch": 1.32, - "learning_rate": 4.3163351980464916e-05, - "loss": 0.2898, + "epoch": 1.3520380581684506, + "grad_norm": 0.16183680295944214, + "learning_rate": 4.280308971528546e-05, + "loss": 0.4168, "step": 37515 }, { - "epoch": 1.32, - "learning_rate": 4.3161394419440836e-05, - "loss": 0.2741, + "epoch": 1.3522182578296753, + "grad_norm": 0.19457785785198212, + "learning_rate": 4.280104090624097e-05, + "loss": 0.4259, "step": 37520 }, { - "epoch": 1.32, - "learning_rate": 4.3159436622603013e-05, - "loss": 0.3133, + "epoch": 1.3523984574908998, + "grad_norm": 0.16114507615566254, + "learning_rate": 4.279899185466077e-05, + "loss": 0.4355, "step": 37525 }, { - "epoch": 1.32, - "learning_rate": 4.3157478589976874e-05, - "loss": 0.3004, + "epoch": 1.3525786571521246, + "grad_norm": 0.17467471957206726, + "learning_rate": 4.2796942560572725e-05, + "loss": 0.4127, "step": 37530 }, { - "epoch": 1.32, - "learning_rate": 4.315552032158784e-05, - "loss": 0.3081, + "epoch": 1.3527588568133493, + "grad_norm": 0.18271581828594208, + "learning_rate": 4.279489302400479e-05, + "loss": 0.4033, "step": 37535 }, { - "epoch": 1.32, - "learning_rate": 4.315356181746134e-05, - "loss": 0.2947, + "epoch": 1.3529390564745738, + "grad_norm": 0.15047767758369446, + "learning_rate": 4.279284324498489e-05, + "loss": 0.3937, "step": 37540 }, { - "epoch": 1.32, - "learning_rate": 4.31516030776228e-05, - "loss": 0.3216, + "epoch": 1.3531192561357985, + "grad_norm": 0.1880033016204834, + "learning_rate": 4.2790793223540944e-05, + "loss": 0.4112, "step": 37545 }, { - "epoch": 1.32, - "learning_rate": 4.314964410209767e-05, - "loss": 0.2855, + "epoch": 1.353299455797023, + "grad_norm": 0.2034742534160614, + "learning_rate": 4.278874295970088e-05, + "loss": 0.4222, "step": 37550 }, { - "epoch": 1.32, - "learning_rate": 4.3147684890911365e-05, - "loss": 0.308, + "epoch": 1.3534796554582478, + "grad_norm": 0.14780563116073608, + "learning_rate": 4.278669245349264e-05, + "loss": 0.4239, "step": 37555 }, { - "epoch": 1.32, - "learning_rate": 4.314572544408932e-05, - "loss": 0.3049, + "epoch": 1.3536598551194725, + "grad_norm": 0.15537936985492706, + "learning_rate": 4.278464170494416e-05, + "loss": 0.3862, "step": 37560 }, { - "epoch": 1.32, - "learning_rate": 4.314376576165701e-05, - "loss": 0.2787, + "epoch": 1.353840054780697, + "grad_norm": 0.14298701286315918, + "learning_rate": 4.278259071408338e-05, + "loss": 0.4007, "step": 37565 }, { - "epoch": 1.32, - "learning_rate": 4.314180584363984e-05, - "loss": 0.2797, + "epoch": 1.3540202544419215, + "grad_norm": 0.1611703783273697, + "learning_rate": 4.278053948093824e-05, + "loss": 0.4007, "step": 37570 }, { - "epoch": 1.32, - "learning_rate": 4.313984569006329e-05, - "loss": 0.3024, + "epoch": 1.3542004541031463, + "grad_norm": 0.2151387333869934, + "learning_rate": 4.27784880055367e-05, + "loss": 0.4169, "step": 37575 }, { - "epoch": 1.32, - "learning_rate": 4.313788530095279e-05, - "loss": 0.2949, + "epoch": 1.354380653764371, + "grad_norm": 0.17721928656101227, + "learning_rate": 4.277643628790669e-05, + "loss": 0.4298, "step": 37580 }, { - "epoch": 1.32, - "learning_rate": 4.3135924676333806e-05, - "loss": 0.2951, + "epoch": 
1.3545608534255955, + "grad_norm": 0.17061589658260345, + "learning_rate": 4.277438432807619e-05, + "loss": 0.4005, "step": 37585 }, { - "epoch": 1.32, - "learning_rate": 4.313396381623179e-05, - "loss": 0.2992, + "epoch": 1.3547410530868202, + "grad_norm": 0.159605473279953, + "learning_rate": 4.277233212607315e-05, + "loss": 0.4158, "step": 37590 }, { - "epoch": 1.32, - "learning_rate": 4.313200272067221e-05, - "loss": 0.2875, + "epoch": 1.3549212527480448, + "grad_norm": 0.15221120417118073, + "learning_rate": 4.2770279681925506e-05, + "loss": 0.4054, "step": 37595 }, { - "epoch": 1.32, - "learning_rate": 4.313004138968052e-05, - "loss": 0.2947, + "epoch": 1.3551014524092695, + "grad_norm": 0.1982593834400177, + "learning_rate": 4.2768226995661255e-05, + "loss": 0.3861, "step": 37600 }, { - "epoch": 1.32, - "learning_rate": 4.3128079823282185e-05, - "loss": 0.2795, + "epoch": 1.3552816520704942, + "grad_norm": 0.1528361439704895, + "learning_rate": 4.276617406730835e-05, + "loss": 0.4002, "step": 37605 }, { - "epoch": 1.32, - "learning_rate": 4.312611802150269e-05, - "loss": 0.29, + "epoch": 1.3554618517317187, + "grad_norm": 0.17486098408699036, + "learning_rate": 4.2764120896894755e-05, + "loss": 0.3899, "step": 37610 }, { - "epoch": 1.32, - "learning_rate": 4.31241559843675e-05, - "loss": 0.3024, + "epoch": 1.3556420513929432, + "grad_norm": 0.16037683188915253, + "learning_rate": 4.276206748444846e-05, + "loss": 0.4031, "step": 37615 }, { - "epoch": 1.32, - "learning_rate": 4.312219371190208e-05, - "loss": 0.2777, + "epoch": 1.355822251054168, + "grad_norm": 0.1568463146686554, + "learning_rate": 4.2760013829997434e-05, + "loss": 0.4136, "step": 37620 }, { - "epoch": 1.32, - "learning_rate": 4.3120231204131924e-05, - "loss": 0.2749, + "epoch": 1.3560024507153927, + "grad_norm": 0.18203102052211761, + "learning_rate": 4.2757959933569656e-05, + "loss": 0.3986, "step": 37625 }, { - "epoch": 1.32, - "learning_rate": 4.31182684610825e-05, - "loss": 0.2936, + "epoch": 1.3561826503766172, + "grad_norm": 0.16312682628631592, + "learning_rate": 4.275590579519311e-05, + "loss": 0.4104, "step": 37630 }, { - "epoch": 1.32, - "learning_rate": 4.311630548277931e-05, - "loss": 0.293, + "epoch": 1.356362850037842, + "grad_norm": 0.18882383406162262, + "learning_rate": 4.275385141489578e-05, + "loss": 0.4471, "step": 37635 }, { - "epoch": 1.32, - "learning_rate": 4.311434226924782e-05, - "loss": 0.305, + "epoch": 1.3565430496990665, + "grad_norm": 0.21008935570716858, + "learning_rate": 4.275179679270568e-05, + "loss": 0.4276, "step": 37640 }, { - "epoch": 1.32, - "learning_rate": 4.311237882051354e-05, - "loss": 0.2852, + "epoch": 1.3567232493602912, + "grad_norm": 0.21513846516609192, + "learning_rate": 4.274974192865077e-05, + "loss": 0.4229, "step": 37645 }, { - "epoch": 1.32, - "learning_rate": 4.311041513660195e-05, - "loss": 0.2913, + "epoch": 1.356903449021516, + "grad_norm": 0.2062099725008011, + "learning_rate": 4.274768682275907e-05, + "loss": 0.4215, "step": 37650 }, { - "epoch": 1.32, - "learning_rate": 4.310845121753857e-05, - "loss": 0.2762, + "epoch": 1.3570836486827405, + "grad_norm": 0.1856852024793625, + "learning_rate": 4.274563147505857e-05, + "loss": 0.4214, "step": 37655 }, { - "epoch": 1.32, - "learning_rate": 4.310648706334888e-05, - "loss": 0.3097, + "epoch": 1.3572638483439652, + "grad_norm": 0.2162654846906662, + "learning_rate": 4.2743575885577277e-05, + "loss": 0.415, "step": 37660 }, { - "epoch": 1.33, - "learning_rate": 4.310452267405839e-05, - "loss": 0.2954, + 
"epoch": 1.3574440480051897, + "grad_norm": 0.14745785295963287, + "learning_rate": 4.27415200543432e-05, + "loss": 0.4311, "step": 37665 }, { - "epoch": 1.33, - "learning_rate": 4.31025580496926e-05, - "loss": 0.2794, + "epoch": 1.3576242476664144, + "grad_norm": 0.16586333513259888, + "learning_rate": 4.2739463981384345e-05, + "loss": 0.4549, "step": 37670 }, { - "epoch": 1.33, - "learning_rate": 4.310059319027702e-05, - "loss": 0.3138, + "epoch": 1.357804447327639, + "grad_norm": 0.19415488839149475, + "learning_rate": 4.2737407666728724e-05, + "loss": 0.4612, "step": 37675 }, { - "epoch": 1.33, - "learning_rate": 4.309862809583717e-05, - "loss": 0.27, + "epoch": 1.3579846469888637, + "grad_norm": 0.20370012521743774, + "learning_rate": 4.2735351110404365e-05, + "loss": 0.3613, "step": 37680 }, { - "epoch": 1.33, - "learning_rate": 4.309666276639856e-05, - "loss": 0.3006, + "epoch": 1.3581648466500882, + "grad_norm": 0.16060635447502136, + "learning_rate": 4.273329431243927e-05, + "loss": 0.3881, "step": 37685 }, { - "epoch": 1.33, - "learning_rate": 4.309469720198671e-05, - "loss": 0.2863, + "epoch": 1.358345046311313, + "grad_norm": 0.1804780513048172, + "learning_rate": 4.273123727286148e-05, + "loss": 0.3816, "step": 37690 }, { - "epoch": 1.33, - "learning_rate": 4.309273140262715e-05, - "loss": 0.2867, + "epoch": 1.3585252459725377, + "grad_norm": 0.21636547148227692, + "learning_rate": 4.272917999169902e-05, + "loss": 0.4187, "step": 37695 }, { - "epoch": 1.33, - "learning_rate": 4.3090765368345384e-05, - "loss": 0.2901, + "epoch": 1.3587054456337622, + "grad_norm": 0.20415888726711273, + "learning_rate": 4.272712246897991e-05, + "loss": 0.4345, "step": 37700 }, { - "epoch": 1.33, - "learning_rate": 4.308879909916696e-05, - "loss": 0.2751, + "epoch": 1.358885645294987, + "grad_norm": 0.17378808557987213, + "learning_rate": 4.272506470473219e-05, + "loss": 0.4309, "step": 37705 }, { - "epoch": 1.33, - "learning_rate": 4.3086832595117397e-05, - "loss": 0.3008, + "epoch": 1.3590658449562114, + "grad_norm": 0.1880928874015808, + "learning_rate": 4.2723006698983894e-05, + "loss": 0.421, "step": 37710 }, { - "epoch": 1.33, - "learning_rate": 4.3084865856222234e-05, - "loss": 0.2783, + "epoch": 1.3592460446174361, + "grad_norm": 0.18104448914527893, + "learning_rate": 4.272094845176307e-05, + "loss": 0.4287, "step": 37715 }, { - "epoch": 1.33, - "learning_rate": 4.308289888250701e-05, - "loss": 0.2733, + "epoch": 1.3594262442786609, + "grad_norm": 0.17074403166770935, + "learning_rate": 4.2718889963097744e-05, + "loss": 0.4521, "step": 37720 }, { - "epoch": 1.33, - "learning_rate": 4.3080931673997256e-05, - "loss": 0.2934, + "epoch": 1.3596064439398854, + "grad_norm": 0.2121085524559021, + "learning_rate": 4.2716831233015974e-05, + "loss": 0.4466, "step": 37725 }, { - "epoch": 1.33, - "learning_rate": 4.307896423071852e-05, - "loss": 0.2764, + "epoch": 1.35978664360111, + "grad_norm": 0.14753827452659607, + "learning_rate": 4.2714772261545813e-05, + "loss": 0.4041, "step": 37730 }, { - "epoch": 1.33, - "learning_rate": 4.307699655269635e-05, - "loss": 0.2962, + "epoch": 1.3599668432623346, + "grad_norm": 0.18550336360931396, + "learning_rate": 4.27127130487153e-05, + "loss": 0.4358, "step": 37735 }, { - "epoch": 1.33, - "learning_rate": 4.3075028639956295e-05, - "loss": 0.2987, + "epoch": 1.3601470429235594, + "grad_norm": 0.1991180032491684, + "learning_rate": 4.2710653594552506e-05, + "loss": 0.4421, "step": 37740 }, { - "epoch": 1.33, - "learning_rate": 4.30730604925239e-05, - "loss": 
0.298, + "epoch": 1.3603272425847839, + "grad_norm": 0.16525886952877045, + "learning_rate": 4.2708593899085494e-05, + "loss": 0.4052, "step": 37745 }, { - "epoch": 1.33, - "learning_rate": 4.3071092110424724e-05, - "loss": 0.2706, + "epoch": 1.3605074422460086, + "grad_norm": 0.17231962084770203, + "learning_rate": 4.270653396234231e-05, + "loss": 0.4103, "step": 37750 }, { - "epoch": 1.33, - "learning_rate": 4.306912349368433e-05, - "loss": 0.2879, + "epoch": 1.3606876419072331, + "grad_norm": 0.1659233570098877, + "learning_rate": 4.2704473784351036e-05, + "loss": 0.3799, "step": 37755 }, { - "epoch": 1.33, - "learning_rate": 4.306715464232828e-05, - "loss": 0.2762, + "epoch": 1.3608678415684579, + "grad_norm": 0.15923145413398743, + "learning_rate": 4.2702413365139724e-05, + "loss": 0.3827, "step": 37760 }, { - "epoch": 1.33, - "learning_rate": 4.3065185556382124e-05, - "loss": 0.2766, + "epoch": 1.3610480412296826, + "grad_norm": 0.13835641741752625, + "learning_rate": 4.270035270473647e-05, + "loss": 0.3854, "step": 37765 }, { - "epoch": 1.33, - "learning_rate": 4.306321623587144e-05, - "loss": 0.305, + "epoch": 1.361228240890907, + "grad_norm": 0.24770118296146393, + "learning_rate": 4.269829180316932e-05, + "loss": 0.4457, "step": 37770 }, { - "epoch": 1.33, - "learning_rate": 4.3061246680821806e-05, - "loss": 0.2775, + "epoch": 1.3614084405521318, + "grad_norm": 0.15679314732551575, + "learning_rate": 4.269623066046639e-05, + "loss": 0.421, "step": 37775 }, { - "epoch": 1.33, - "learning_rate": 4.3059276891258784e-05, - "loss": 0.2644, + "epoch": 1.3615886402133563, + "grad_norm": 0.20325350761413574, + "learning_rate": 4.269416927665573e-05, + "loss": 0.3852, "step": 37780 }, { - "epoch": 1.33, - "learning_rate": 4.3057306867207946e-05, - "loss": 0.2929, + "epoch": 1.361768839874581, + "grad_norm": 0.18375974893569946, + "learning_rate": 4.269210765176544e-05, + "loss": 0.4165, "step": 37785 }, { - "epoch": 1.33, - "learning_rate": 4.3055336608694884e-05, - "loss": 0.3116, + "epoch": 1.3619490395358056, + "grad_norm": 0.1880822330713272, + "learning_rate": 4.269004578582362e-05, + "loss": 0.4473, "step": 37790 }, { - "epoch": 1.33, - "learning_rate": 4.305336611574517e-05, - "loss": 0.2964, + "epoch": 1.3621292391970303, + "grad_norm": 0.16802115738391876, + "learning_rate": 4.2687983678858346e-05, + "loss": 0.4006, "step": 37795 }, { - "epoch": 1.33, - "learning_rate": 4.3051395388384405e-05, - "loss": 0.2832, + "epoch": 1.3623094388582548, + "grad_norm": 0.18482767045497894, + "learning_rate": 4.268592133089771e-05, + "loss": 0.4042, "step": 37800 }, { - "epoch": 1.33, - "learning_rate": 4.3049424426638156e-05, - "loss": 0.2944, + "epoch": 1.3624896385194796, + "grad_norm": 0.18209148943424225, + "learning_rate": 4.268385874196983e-05, + "loss": 0.419, "step": 37805 }, { - "epoch": 1.33, - "learning_rate": 4.304745323053203e-05, - "loss": 0.2928, + "epoch": 1.3626698381807043, + "grad_norm": 0.20778799057006836, + "learning_rate": 4.268179591210279e-05, + "loss": 0.4544, "step": 37810 }, { - "epoch": 1.33, - "learning_rate": 4.304548180009162e-05, - "loss": 0.2924, + "epoch": 1.3628500378419288, + "grad_norm": 0.19366760551929474, + "learning_rate": 4.267973284132471e-05, + "loss": 0.4157, "step": 37815 }, { - "epoch": 1.33, - "learning_rate": 4.304351013534252e-05, - "loss": 0.2803, + "epoch": 1.3630302375031536, + "grad_norm": 0.15186309814453125, + "learning_rate": 4.267766952966369e-05, + "loss": 0.3843, "step": 37820 }, { - "epoch": 1.33, - "learning_rate": 
4.304153823631033e-05, - "loss": 0.3049, + "epoch": 1.363210437164378, + "grad_norm": 0.19888971745967865, + "learning_rate": 4.267560597714785e-05, + "loss": 0.4345, "step": 37825 }, { - "epoch": 1.33, - "learning_rate": 4.3039566103020654e-05, - "loss": 0.2881, + "epoch": 1.3633906368256028, + "grad_norm": 0.205244779586792, + "learning_rate": 4.2673542183805295e-05, + "loss": 0.437, "step": 37830 }, { - "epoch": 1.33, - "learning_rate": 4.30375937354991e-05, - "loss": 0.2739, + "epoch": 1.3635708364868275, + "grad_norm": 0.21422690153121948, + "learning_rate": 4.267147814966415e-05, + "loss": 0.4205, "step": 37835 }, { - "epoch": 1.33, - "learning_rate": 4.303562113377129e-05, - "loss": 0.2623, + "epoch": 1.363751036148052, + "grad_norm": 0.15978442132472992, + "learning_rate": 4.266941387475254e-05, + "loss": 0.4362, "step": 37840 }, { - "epoch": 1.33, - "learning_rate": 4.3033648297862814e-05, - "loss": 0.2819, + "epoch": 1.3639312358092766, + "grad_norm": 0.17365151643753052, + "learning_rate": 4.2667349359098586e-05, + "loss": 0.4162, "step": 37845 }, { - "epoch": 1.33, - "learning_rate": 4.303167522779931e-05, - "loss": 0.2592, + "epoch": 1.3641114354705013, + "grad_norm": 0.17353810369968414, + "learning_rate": 4.266528460273041e-05, + "loss": 0.3909, "step": 37850 }, { - "epoch": 1.33, - "learning_rate": 4.302970192360638e-05, - "loss": 0.3023, + "epoch": 1.364291635131726, + "grad_norm": 0.16570428013801575, + "learning_rate": 4.266321960567616e-05, + "loss": 0.3856, "step": 37855 }, { - "epoch": 1.33, - "learning_rate": 4.3027728385309655e-05, - "loss": 0.2909, + "epoch": 1.3644718347929505, + "grad_norm": 0.17112329602241516, + "learning_rate": 4.2661154367963965e-05, + "loss": 0.4471, "step": 37860 }, { - "epoch": 1.33, - "learning_rate": 4.302575461293476e-05, - "loss": 0.3025, + "epoch": 1.3646520344541753, + "grad_norm": 0.14121274650096893, + "learning_rate": 4.2659088889621954e-05, + "loss": 0.4194, "step": 37865 }, { - "epoch": 1.33, - "learning_rate": 4.302378060650731e-05, - "loss": 0.2759, + "epoch": 1.3648322341153998, + "grad_norm": 0.17881150543689728, + "learning_rate": 4.265702317067828e-05, + "loss": 0.4413, "step": 37870 }, { - "epoch": 1.33, - "learning_rate": 4.302180636605295e-05, - "loss": 0.2955, + "epoch": 1.3650124337766245, + "grad_norm": 0.22597812116146088, + "learning_rate": 4.2654957211161085e-05, + "loss": 0.4434, "step": 37875 }, { - "epoch": 1.33, - "learning_rate": 4.301983189159731e-05, - "loss": 0.2767, + "epoch": 1.3651926334378492, + "grad_norm": 0.16727763414382935, + "learning_rate": 4.2652891011098505e-05, + "loss": 0.4055, "step": 37880 }, { - "epoch": 1.33, - "learning_rate": 4.301785718316603e-05, - "loss": 0.3205, + "epoch": 1.3653728330990738, + "grad_norm": 0.19390372931957245, + "learning_rate": 4.265082457051872e-05, + "loss": 0.3988, "step": 37885 }, { - "epoch": 1.33, - "learning_rate": 4.3015882240784755e-05, - "loss": 0.3078, + "epoch": 1.3655530327602985, + "grad_norm": 0.177637979388237, + "learning_rate": 4.264875788944985e-05, + "loss": 0.411, "step": 37890 }, { - "epoch": 1.33, - "learning_rate": 4.301390706447912e-05, - "loss": 0.2932, + "epoch": 1.365733232421523, + "grad_norm": 0.17487314343452454, + "learning_rate": 4.2646690967920086e-05, + "loss": 0.4127, "step": 37895 }, { - "epoch": 1.33, - "learning_rate": 4.3011931654274776e-05, - "loss": 0.292, + "epoch": 1.3659134320827477, + "grad_norm": 0.203982412815094, + "learning_rate": 4.264462380595756e-05, + "loss": 0.4603, "step": 37900 }, { - "epoch": 1.33, - 
"learning_rate": 4.300995601019736e-05, - "loss": 0.2999, + "epoch": 1.3660936317439722, + "grad_norm": 0.23240825533866882, + "learning_rate": 4.264255640359046e-05, + "loss": 0.418, "step": 37905 }, { - "epoch": 1.33, - "learning_rate": 4.300798013227254e-05, - "loss": 0.2601, + "epoch": 1.366273831405197, + "grad_norm": 0.15722376108169556, + "learning_rate": 4.2640488760846945e-05, + "loss": 0.4449, "step": 37910 }, { - "epoch": 1.33, - "learning_rate": 4.300600402052597e-05, - "loss": 0.2811, + "epoch": 1.3664540310664215, + "grad_norm": 0.14210271835327148, + "learning_rate": 4.263842087775518e-05, + "loss": 0.3789, "step": 37915 }, { - "epoch": 1.33, - "learning_rate": 4.3004027674983294e-05, - "loss": 0.2905, + "epoch": 1.3666342307276462, + "grad_norm": 0.15911325812339783, + "learning_rate": 4.263635275434336e-05, + "loss": 0.4281, "step": 37920 }, { - "epoch": 1.33, - "learning_rate": 4.3002051095670195e-05, - "loss": 0.2913, + "epoch": 1.366814430388871, + "grad_norm": 0.17531517148017883, + "learning_rate": 4.263428439063963e-05, + "loss": 0.4233, "step": 37925 }, { - "epoch": 1.33, - "learning_rate": 4.300007428261231e-05, - "loss": 0.2848, + "epoch": 1.3669946300500955, + "grad_norm": 0.16672587394714355, + "learning_rate": 4.263221578667219e-05, + "loss": 0.4255, "step": 37930 }, { - "epoch": 1.33, - "learning_rate": 4.299809723583534e-05, - "loss": 0.2817, + "epoch": 1.3671748297113202, + "grad_norm": 0.21964746713638306, + "learning_rate": 4.263014694246924e-05, + "loss": 0.4298, "step": 37935 }, { - "epoch": 1.33, - "learning_rate": 4.2996119955364936e-05, - "loss": 0.2777, + "epoch": 1.3673550293725447, + "grad_norm": 0.161033034324646, + "learning_rate": 4.262807785805894e-05, + "loss": 0.3999, "step": 37940 }, { - "epoch": 1.34, - "learning_rate": 4.2994142441226764e-05, - "loss": 0.2785, + "epoch": 1.3675352290337695, + "grad_norm": 0.20498016476631165, + "learning_rate": 4.262600853346949e-05, + "loss": 0.4068, "step": 37945 }, { - "epoch": 1.34, - "learning_rate": 4.299216469344652e-05, - "loss": 0.2951, + "epoch": 1.3677154286949942, + "grad_norm": 0.20285950601100922, + "learning_rate": 4.262393896872909e-05, + "loss": 0.4023, "step": 37950 }, { - "epoch": 1.34, - "learning_rate": 4.2990186712049874e-05, - "loss": 0.3124, + "epoch": 1.3678956283562187, + "grad_norm": 0.16656413674354553, + "learning_rate": 4.262186916386594e-05, + "loss": 0.4094, "step": 37955 }, { - "epoch": 1.34, - "learning_rate": 4.2988208497062506e-05, - "loss": 0.3078, + "epoch": 1.3680758280174432, + "grad_norm": 0.1727702021598816, + "learning_rate": 4.261979911890822e-05, + "loss": 0.3972, "step": 37960 }, { - "epoch": 1.34, - "learning_rate": 4.29862300485101e-05, - "loss": 0.2748, + "epoch": 1.368256027678668, + "grad_norm": 0.14835266768932343, + "learning_rate": 4.261772883388416e-05, + "loss": 0.4069, "step": 37965 }, { - "epoch": 1.34, - "learning_rate": 4.298425136641836e-05, - "loss": 0.2856, + "epoch": 1.3684362273398927, + "grad_norm": 0.22121186554431915, + "learning_rate": 4.261565830882195e-05, + "loss": 0.4128, "step": 37970 }, { - "epoch": 1.34, - "learning_rate": 4.2982272450812964e-05, - "loss": 0.2934, + "epoch": 1.3686164270011172, + "grad_norm": 0.15944576263427734, + "learning_rate": 4.261358754374981e-05, + "loss": 0.4228, "step": 37975 }, { - "epoch": 1.34, - "learning_rate": 4.2980293301719614e-05, - "loss": 0.293, + "epoch": 1.368796626662342, + "grad_norm": 0.19683369994163513, + "learning_rate": 4.261151653869595e-05, + "loss": 0.436, "step": 37980 }, { - 
"epoch": 1.34, - "learning_rate": 4.297831391916401e-05, - "loss": 0.272, + "epoch": 1.3689768263235664, + "grad_norm": 0.16129469871520996, + "learning_rate": 4.260944529368858e-05, + "loss": 0.4114, "step": 37985 }, { - "epoch": 1.34, - "learning_rate": 4.297633430317183e-05, - "loss": 0.2812, + "epoch": 1.3691570259847912, + "grad_norm": 0.18280497193336487, + "learning_rate": 4.260737380875593e-05, + "loss": 0.4122, "step": 37990 }, { - "epoch": 1.34, - "learning_rate": 4.297435445376881e-05, - "loss": 0.2859, + "epoch": 1.369337225646016, + "grad_norm": 0.20879638195037842, + "learning_rate": 4.260530208392622e-05, + "loss": 0.3891, "step": 37995 }, { - "epoch": 1.34, - "learning_rate": 4.297237437098064e-05, - "loss": 0.3075, + "epoch": 1.3695174253072404, + "grad_norm": 0.21062108874320984, + "learning_rate": 4.260323011922768e-05, + "loss": 0.3986, "step": 38000 }, { - "epoch": 1.34, - "eval_loss": 0.28374922275543213, - "eval_runtime": 10.5368, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 1.3695174253072404, + "eval_loss": 0.4474603235721588, + "eval_runtime": 3.5425, + "eval_samples_per_second": 28.229, + "eval_steps_per_second": 7.057, "step": 38000 }, { - "epoch": 1.34, - "learning_rate": 4.297039405483303e-05, - "loss": 0.2702, + "epoch": 1.369697624968465, + "grad_norm": 0.22914621233940125, + "learning_rate": 4.2601157914688535e-05, + "loss": 0.418, "step": 38005 }, { - "epoch": 1.34, - "learning_rate": 4.2968413505351704e-05, - "loss": 0.2736, + "epoch": 1.3698778246296897, + "grad_norm": 0.2014915496110916, + "learning_rate": 4.259908547033703e-05, + "loss": 0.4176, "step": 38010 }, { - "epoch": 1.34, - "learning_rate": 4.296643272256237e-05, - "loss": 0.2756, + "epoch": 1.3700580242909144, + "grad_norm": 0.18956822156906128, + "learning_rate": 4.2597012786201384e-05, + "loss": 0.414, "step": 38015 }, { - "epoch": 1.34, - "learning_rate": 4.296445170649074e-05, - "loss": 0.2682, + "epoch": 1.370238223952139, + "grad_norm": 0.18128299713134766, + "learning_rate": 4.2594939862309845e-05, + "loss": 0.4348, "step": 38020 }, { - "epoch": 1.34, - "learning_rate": 4.296247045716254e-05, - "loss": 0.2696, + "epoch": 1.3704184236133636, + "grad_norm": 0.22281454503536224, + "learning_rate": 4.259286669869066e-05, + "loss": 0.4026, "step": 38025 }, { - "epoch": 1.34, - "learning_rate": 4.2960488974603506e-05, - "loss": 0.2754, + "epoch": 1.3705986232745881, + "grad_norm": 0.21276813745498657, + "learning_rate": 4.259079329537208e-05, + "loss": 0.4402, "step": 38030 }, { - "epoch": 1.34, - "learning_rate": 4.295850725883936e-05, - "loss": 0.2919, + "epoch": 1.3707788229358129, + "grad_norm": 0.14526169002056122, + "learning_rate": 4.2588719652382336e-05, + "loss": 0.4397, "step": 38035 }, { - "epoch": 1.34, - "learning_rate": 4.295652530989583e-05, - "loss": 0.2839, + "epoch": 1.3709590225970376, + "grad_norm": 0.18278780579566956, + "learning_rate": 4.25866457697497e-05, + "loss": 0.393, "step": 38040 }, { - "epoch": 1.34, - "learning_rate": 4.295454312779865e-05, - "loss": 0.2828, + "epoch": 1.3711392222582621, + "grad_norm": 0.15444017946720123, + "learning_rate": 4.2584571647502417e-05, + "loss": 0.427, "step": 38045 }, { - "epoch": 1.34, - "learning_rate": 4.2952560712573565e-05, - "loss": 0.2947, + "epoch": 1.3713194219194869, + "grad_norm": 0.1620875895023346, + "learning_rate": 4.2582497285668746e-05, + "loss": 0.405, "step": 38050 }, { - "epoch": 1.34, - "learning_rate": 4.2950578064246305e-05, - "loss": 0.2927, + "epoch": 
1.3714996215807114, + "grad_norm": 0.15654486417770386, + "learning_rate": 4.258042268427695e-05, + "loss": 0.3709, "step": 38055 }, { - "epoch": 1.34, - "learning_rate": 4.294859518284263e-05, - "loss": 0.3115, + "epoch": 1.371679821241936, + "grad_norm": 0.14211991429328918, + "learning_rate": 4.257834784335531e-05, + "loss": 0.4132, "step": 38060 }, { - "epoch": 1.34, - "learning_rate": 4.294661206838827e-05, - "loss": 0.3096, + "epoch": 1.3718600209031608, + "grad_norm": 0.17219579219818115, + "learning_rate": 4.257627276293208e-05, + "loss": 0.4233, "step": 38065 }, { - "epoch": 1.34, - "learning_rate": 4.294462872090897e-05, - "loss": 0.2757, + "epoch": 1.3720402205643853, + "grad_norm": 0.19686567783355713, + "learning_rate": 4.257419744303553e-05, + "loss": 0.4242, "step": 38070 }, { - "epoch": 1.34, - "learning_rate": 4.29426451404305e-05, - "loss": 0.2621, + "epoch": 1.3722204202256099, + "grad_norm": 0.1756589412689209, + "learning_rate": 4.257212188369395e-05, + "loss": 0.4201, "step": 38075 }, { - "epoch": 1.34, - "learning_rate": 4.2940661326978604e-05, - "loss": 0.2941, + "epoch": 1.3724006198868346, + "grad_norm": 0.17567414045333862, + "learning_rate": 4.257004608493561e-05, + "loss": 0.4198, "step": 38080 }, { - "epoch": 1.34, - "learning_rate": 4.293867728057905e-05, - "loss": 0.288, + "epoch": 1.3725808195480593, + "grad_norm": 0.2001543641090393, + "learning_rate": 4.256797004678879e-05, + "loss": 0.425, "step": 38085 }, { - "epoch": 1.34, - "learning_rate": 4.2936693001257586e-05, - "loss": 0.295, + "epoch": 1.3727610192092838, + "grad_norm": 0.20645958185195923, + "learning_rate": 4.2565893769281787e-05, + "loss": 0.4048, "step": 38090 }, { - "epoch": 1.34, - "learning_rate": 4.293470848903999e-05, - "loss": 0.3041, + "epoch": 1.3729412188705086, + "grad_norm": 0.18848280608654022, + "learning_rate": 4.256381725244287e-05, + "loss": 0.4125, "step": 38095 }, { - "epoch": 1.34, - "learning_rate": 4.293272374395202e-05, - "loss": 0.3014, + "epoch": 1.373121418531733, + "grad_norm": 0.16142655909061432, + "learning_rate": 4.2561740496300353e-05, + "loss": 0.3701, "step": 38100 }, { - "epoch": 1.34, - "learning_rate": 4.293073876601945e-05, - "loss": 0.245, + "epoch": 1.3733016181929578, + "grad_norm": 0.17975199222564697, + "learning_rate": 4.2559663500882515e-05, + "loss": 0.4448, "step": 38105 }, { - "epoch": 1.34, - "learning_rate": 4.292875355526806e-05, - "loss": 0.2874, + "epoch": 1.3734818178541826, + "grad_norm": 0.16210007667541504, + "learning_rate": 4.255758626621767e-05, + "loss": 0.4238, "step": 38110 }, { - "epoch": 1.34, - "learning_rate": 4.2926768111723615e-05, - "loss": 0.2582, + "epoch": 1.373662017515407, + "grad_norm": 0.1924877017736435, + "learning_rate": 4.2555508792334105e-05, + "loss": 0.42, "step": 38115 }, { - "epoch": 1.34, - "learning_rate": 4.29247824354119e-05, - "loss": 0.307, + "epoch": 1.3738422171766316, + "grad_norm": 0.18068957328796387, + "learning_rate": 4.255343107926013e-05, + "loss": 0.378, "step": 38120 }, { - "epoch": 1.34, - "learning_rate": 4.29227965263587e-05, - "loss": 0.2779, + "epoch": 1.3740224168378563, + "grad_norm": 0.21288029849529266, + "learning_rate": 4.255135312702406e-05, + "loss": 0.427, "step": 38125 }, { - "epoch": 1.34, - "learning_rate": 4.29208103845898e-05, - "loss": 0.3095, + "epoch": 1.374202616499081, + "grad_norm": 0.18913322687149048, + "learning_rate": 4.25492749356542e-05, + "loss": 0.4031, "step": 38130 }, { - "epoch": 1.34, - "learning_rate": 4.291882401013099e-05, - "loss": 0.2944, + "epoch": 
1.3743828161603056, + "grad_norm": 0.1928529441356659, + "learning_rate": 4.2547196505178866e-05, + "loss": 0.3974, "step": 38135 }, { - "epoch": 1.34, - "learning_rate": 4.291683740300805e-05, - "loss": 0.2929, + "epoch": 1.3745630158215303, + "grad_norm": 0.186807319521904, + "learning_rate": 4.254511783562638e-05, + "loss": 0.4134, "step": 38140 }, { - "epoch": 1.34, - "learning_rate": 4.291485056324679e-05, - "loss": 0.2837, + "epoch": 1.3747432154827548, + "grad_norm": 0.19627200067043304, + "learning_rate": 4.254303892702506e-05, + "loss": 0.4457, "step": 38145 }, { - "epoch": 1.34, - "learning_rate": 4.2912863490873e-05, - "loss": 0.2728, + "epoch": 1.3749234151439795, + "grad_norm": 0.17085368931293488, + "learning_rate": 4.254095977940323e-05, + "loss": 0.4452, "step": 38150 }, { - "epoch": 1.34, - "learning_rate": 4.291087618591249e-05, - "loss": 0.2735, + "epoch": 1.3751036148052043, + "grad_norm": 0.15709124505519867, + "learning_rate": 4.2538880392789214e-05, + "loss": 0.4101, "step": 38155 }, { - "epoch": 1.34, - "learning_rate": 4.290888864839104e-05, - "loss": 0.2843, + "epoch": 1.3752838144664288, + "grad_norm": 0.2019108682870865, + "learning_rate": 4.253680076721136e-05, + "loss": 0.403, "step": 38160 }, { - "epoch": 1.34, - "learning_rate": 4.290690087833449e-05, - "loss": 0.267, + "epoch": 1.3754640141276535, + "grad_norm": 0.17493942379951477, + "learning_rate": 4.253472090269798e-05, + "loss": 0.4093, "step": 38165 }, { - "epoch": 1.34, - "learning_rate": 4.2904912875768624e-05, - "loss": 0.3167, + "epoch": 1.375644213788878, + "grad_norm": 0.18124589323997498, + "learning_rate": 4.253264079927742e-05, + "loss": 0.4191, "step": 38170 }, { - "epoch": 1.34, - "learning_rate": 4.2902924640719267e-05, - "loss": 0.2905, + "epoch": 1.3758244134501028, + "grad_norm": 0.20440387725830078, + "learning_rate": 4.253056045697803e-05, + "loss": 0.4026, "step": 38175 }, { - "epoch": 1.34, - "learning_rate": 4.290093617321223e-05, - "loss": 0.2805, + "epoch": 1.3760046131113273, + "grad_norm": 0.2788972854614258, + "learning_rate": 4.252847987582815e-05, + "loss": 0.4271, "step": 38180 }, { - "epoch": 1.34, - "learning_rate": 4.289894747327333e-05, - "loss": 0.2999, + "epoch": 1.376184812772552, + "grad_norm": 0.20534437894821167, + "learning_rate": 4.252639905585613e-05, + "loss": 0.4404, "step": 38185 }, { - "epoch": 1.34, - "learning_rate": 4.28969585409284e-05, - "loss": 0.2963, + "epoch": 1.3763650124337765, + "grad_norm": 0.18792025744915009, + "learning_rate": 4.2524317997090304e-05, + "loss": 0.427, "step": 38190 }, { - "epoch": 1.34, - "learning_rate": 4.2894969376203257e-05, - "loss": 0.2979, + "epoch": 1.3765452120950012, + "grad_norm": 0.2014329433441162, + "learning_rate": 4.2522236699559045e-05, + "loss": 0.4211, "step": 38195 }, { - "epoch": 1.34, - "learning_rate": 4.289297997912373e-05, - "loss": 0.2835, + "epoch": 1.376725411756226, + "grad_norm": 0.1882011443376541, + "learning_rate": 4.25201551632907e-05, + "loss": 0.431, "step": 38200 }, { - "epoch": 1.34, - "learning_rate": 4.289099034971565e-05, - "loss": 0.2844, + "epoch": 1.3769056114174505, + "grad_norm": 0.1633293330669403, + "learning_rate": 4.251807338831363e-05, + "loss": 0.4322, "step": 38205 }, { - "epoch": 1.34, - "learning_rate": 4.288900048800485e-05, - "loss": 0.303, + "epoch": 1.3770858110786752, + "grad_norm": 0.1637231707572937, + "learning_rate": 4.2515991374656204e-05, + "loss": 0.3997, "step": 38210 }, { - "epoch": 1.34, - "learning_rate": 4.288701039401717e-05, - "loss": 0.291, + "epoch": 
1.3772660107398997, + "grad_norm": 0.19281378388404846, + "learning_rate": 4.251390912234679e-05, + "loss": 0.4273, "step": 38215 }, { - "epoch": 1.34, - "learning_rate": 4.288502006777844e-05, - "loss": 0.2979, + "epoch": 1.3774462104011245, + "grad_norm": 0.18837355077266693, + "learning_rate": 4.251182663141375e-05, + "loss": 0.4359, "step": 38220 }, { - "epoch": 1.34, - "learning_rate": 4.288302950931452e-05, - "loss": 0.2875, + "epoch": 1.3776264100623492, + "grad_norm": 0.17957091331481934, + "learning_rate": 4.2509743901885474e-05, + "loss": 0.4057, "step": 38225 }, { - "epoch": 1.35, - "learning_rate": 4.288103871865124e-05, - "loss": 0.286, + "epoch": 1.3778066097235737, + "grad_norm": 0.1847553253173828, + "learning_rate": 4.2507660933790314e-05, + "loss": 0.44, "step": 38230 }, { - "epoch": 1.35, - "learning_rate": 4.287904769581446e-05, - "loss": 0.2736, + "epoch": 1.3779868093847982, + "grad_norm": 0.16359250247478485, + "learning_rate": 4.250557772715667e-05, + "loss": 0.3645, "step": 38235 }, { - "epoch": 1.35, - "learning_rate": 4.287705644083003e-05, - "loss": 0.277, + "epoch": 1.378167009046023, + "grad_norm": 0.18574312329292297, + "learning_rate": 4.250349428201292e-05, + "loss": 0.4006, "step": 38240 }, { - "epoch": 1.35, - "learning_rate": 4.28750649537238e-05, - "loss": 0.2884, + "epoch": 1.3783472087072477, + "grad_norm": 0.16163067519664764, + "learning_rate": 4.250141059838745e-05, + "loss": 0.4349, "step": 38245 }, { - "epoch": 1.35, - "learning_rate": 4.287307323452163e-05, - "loss": 0.2813, + "epoch": 1.3785274083684722, + "grad_norm": 0.15968047082424164, + "learning_rate": 4.249932667630864e-05, + "loss": 0.4231, "step": 38250 }, { - "epoch": 1.35, - "learning_rate": 4.2871081283249395e-05, - "loss": 0.2952, + "epoch": 1.378707608029697, + "grad_norm": 0.17597581446170807, + "learning_rate": 4.24972425158049e-05, + "loss": 0.402, "step": 38255 }, { - "epoch": 1.35, - "learning_rate": 4.2869089099932936e-05, - "loss": 0.288, + "epoch": 1.3788878076909215, + "grad_norm": 0.19579049944877625, + "learning_rate": 4.249515811690461e-05, + "loss": 0.3995, "step": 38260 }, { - "epoch": 1.35, - "learning_rate": 4.2867096684598134e-05, - "loss": 0.2726, + "epoch": 1.3790680073521462, + "grad_norm": 0.1438038945198059, + "learning_rate": 4.249307347963619e-05, + "loss": 0.4186, "step": 38265 }, { - "epoch": 1.35, - "learning_rate": 4.286510403727086e-05, - "loss": 0.3139, + "epoch": 1.379248207013371, + "grad_norm": 0.19650869071483612, + "learning_rate": 4.249098860402802e-05, + "loss": 0.4032, "step": 38270 }, { - "epoch": 1.35, - "learning_rate": 4.2863111157976974e-05, - "loss": 0.3018, + "epoch": 1.3794284066745954, + "grad_norm": 0.16172637045383453, + "learning_rate": 4.2488903490108524e-05, + "loss": 0.4299, "step": 38275 }, { - "epoch": 1.35, - "learning_rate": 4.286111804674237e-05, - "loss": 0.2958, + "epoch": 1.3796086063358202, + "grad_norm": 0.21487955749034882, + "learning_rate": 4.2486818137906095e-05, + "loss": 0.4782, "step": 38280 }, { - "epoch": 1.35, - "learning_rate": 4.2859124703592915e-05, - "loss": 0.2771, + "epoch": 1.3797888059970447, + "grad_norm": 0.20923011004924774, + "learning_rate": 4.248473254744917e-05, + "loss": 0.442, "step": 38285 }, { - "epoch": 1.35, - "learning_rate": 4.28571311285545e-05, - "loss": 0.3014, + "epoch": 1.3799690056582694, + "grad_norm": 0.15938104689121246, + "learning_rate": 4.2482646718766136e-05, + "loss": 0.4118, "step": 38290 }, { - "epoch": 1.35, - "learning_rate": 4.2855137321653007e-05, - "loss": 0.2514, + 
"epoch": 1.380149205319494, + "grad_norm": 0.1880597025156021, + "learning_rate": 4.2480560651885425e-05, + "loss": 0.4324, "step": 38295 }, { - "epoch": 1.35, - "learning_rate": 4.285314328291431e-05, - "loss": 0.3112, + "epoch": 1.3803294049807187, + "grad_norm": 0.1785939335823059, + "learning_rate": 4.247847434683546e-05, + "loss": 0.4128, "step": 38300 }, { - "epoch": 1.35, - "learning_rate": 4.285114901236433e-05, - "loss": 0.2921, + "epoch": 1.3805096046419432, + "grad_norm": 0.2147594839334488, + "learning_rate": 4.247638780364468e-05, + "loss": 0.3871, "step": 38305 }, { - "epoch": 1.35, - "learning_rate": 4.284915451002893e-05, - "loss": 0.2843, + "epoch": 1.380689804303168, + "grad_norm": 0.13881224393844604, + "learning_rate": 4.247430102234149e-05, + "loss": 0.4052, "step": 38310 }, { - "epoch": 1.35, - "learning_rate": 4.284715977593403e-05, - "loss": 0.282, + "epoch": 1.3808700039643926, + "grad_norm": 0.18304836750030518, + "learning_rate": 4.2472214002954324e-05, + "loss": 0.3941, "step": 38315 }, { - "epoch": 1.35, - "learning_rate": 4.284516481010552e-05, - "loss": 0.2915, + "epoch": 1.3810502036256171, + "grad_norm": 0.1778329461812973, + "learning_rate": 4.247012674551163e-05, + "loss": 0.4741, "step": 38320 }, { - "epoch": 1.35, - "learning_rate": 4.2843169612569306e-05, - "loss": 0.2836, + "epoch": 1.3812304032868419, + "grad_norm": 0.18518294394016266, + "learning_rate": 4.246803925004185e-05, + "loss": 0.443, "step": 38325 }, { - "epoch": 1.35, - "learning_rate": 4.2841174183351294e-05, - "loss": 0.2788, + "epoch": 1.3814106029480664, + "grad_norm": 0.21796804666519165, + "learning_rate": 4.2465951516573406e-05, + "loss": 0.4183, "step": 38330 }, { - "epoch": 1.35, - "learning_rate": 4.2839178522477394e-05, - "loss": 0.295, + "epoch": 1.3815908026092911, + "grad_norm": 0.2137499749660492, + "learning_rate": 4.246386354513475e-05, + "loss": 0.4216, "step": 38335 }, { - "epoch": 1.35, - "learning_rate": 4.283718262997352e-05, - "loss": 0.2832, + "epoch": 1.3817710022705159, + "grad_norm": 0.18902204930782318, + "learning_rate": 4.246177533575435e-05, + "loss": 0.4524, "step": 38340 }, { - "epoch": 1.35, - "learning_rate": 4.2835186505865574e-05, - "loss": 0.3233, + "epoch": 1.3819512019317404, + "grad_norm": 0.171752467751503, + "learning_rate": 4.2459686888460635e-05, + "loss": 0.4111, "step": 38345 }, { - "epoch": 1.35, - "learning_rate": 4.283319015017949e-05, - "loss": 0.2912, + "epoch": 1.3821314015929649, + "grad_norm": 0.15336576104164124, + "learning_rate": 4.245759820328206e-05, + "loss": 0.3991, "step": 38350 }, { - "epoch": 1.35, - "learning_rate": 4.2831193562941185e-05, - "loss": 0.2738, + "epoch": 1.3823116012541896, + "grad_norm": 0.19525669515132904, + "learning_rate": 4.2455509280247097e-05, + "loss": 0.4658, "step": 38355 }, { - "epoch": 1.35, - "learning_rate": 4.282919674417658e-05, - "loss": 0.2797, + "epoch": 1.3824918009154143, + "grad_norm": 0.15848879516124725, + "learning_rate": 4.2453420119384195e-05, + "loss": 0.427, "step": 38360 }, { - "epoch": 1.35, - "learning_rate": 4.2827199693911616e-05, - "loss": 0.3054, + "epoch": 1.3826720005766389, + "grad_norm": 0.16411955654621124, + "learning_rate": 4.245133072072183e-05, + "loss": 0.4179, "step": 38365 }, { - "epoch": 1.35, - "learning_rate": 4.28252024121722e-05, - "loss": 0.2882, + "epoch": 1.3828522002378636, + "grad_norm": 0.21341267228126526, + "learning_rate": 4.244924108428846e-05, + "loss": 0.4251, "step": 38370 }, { - "epoch": 1.35, - "learning_rate": 4.282320489898428e-05, - 
"loss": 0.2655, + "epoch": 1.383032399899088, + "grad_norm": 0.16351541876792908, + "learning_rate": 4.2447151210112555e-05, + "loss": 0.4354, "step": 38375 }, { - "epoch": 1.35, - "learning_rate": 4.2821207154373796e-05, - "loss": 0.2844, + "epoch": 1.3832125995603128, + "grad_norm": 0.176330104470253, + "learning_rate": 4.2445061098222596e-05, + "loss": 0.4136, "step": 38380 }, { - "epoch": 1.35, - "learning_rate": 4.281920917836668e-05, - "loss": 0.2826, + "epoch": 1.3833927992215376, + "grad_norm": 0.16498027741909027, + "learning_rate": 4.244297074864705e-05, + "loss": 0.4119, "step": 38385 }, { - "epoch": 1.35, - "learning_rate": 4.2817210970988875e-05, - "loss": 0.2975, + "epoch": 1.383572998882762, + "grad_norm": 0.20347841084003448, + "learning_rate": 4.244088016141441e-05, + "loss": 0.4565, "step": 38390 }, { - "epoch": 1.35, - "learning_rate": 4.2815212532266325e-05, - "loss": 0.2714, + "epoch": 1.3837531985439868, + "grad_norm": 0.20916853845119476, + "learning_rate": 4.2438789336553154e-05, + "loss": 0.4209, "step": 38395 }, { - "epoch": 1.35, - "learning_rate": 4.2813213862224986e-05, - "loss": 0.2979, + "epoch": 1.3839333982052113, + "grad_norm": 0.20273546874523163, + "learning_rate": 4.2436698274091765e-05, + "loss": 0.4065, "step": 38400 }, { - "epoch": 1.35, - "learning_rate": 4.28112149608908e-05, - "loss": 0.2812, + "epoch": 1.384113597866436, + "grad_norm": 0.15745113790035248, + "learning_rate": 4.2434606974058756e-05, + "loss": 0.3952, "step": 38405 }, { - "epoch": 1.35, - "learning_rate": 4.2809215828289725e-05, - "loss": 0.2823, + "epoch": 1.3842937975276606, + "grad_norm": 0.20275966823101044, + "learning_rate": 4.243251543648258e-05, + "loss": 0.3806, "step": 38410 }, { - "epoch": 1.35, - "learning_rate": 4.2807216464447724e-05, - "loss": 0.282, + "epoch": 1.3844739971888853, + "grad_norm": 0.15785571932792664, + "learning_rate": 4.243042366139177e-05, + "loss": 0.4033, "step": 38415 }, { - "epoch": 1.35, - "learning_rate": 4.280521686939075e-05, - "loss": 0.3143, + "epoch": 1.3846541968501098, + "grad_norm": 0.19757528603076935, + "learning_rate": 4.24283316488148e-05, + "loss": 0.3708, "step": 38420 }, { - "epoch": 1.35, - "learning_rate": 4.280321704314477e-05, - "loss": 0.3048, + "epoch": 1.3848343965113346, + "grad_norm": 0.2014017403125763, + "learning_rate": 4.24262393987802e-05, + "loss": 0.4544, "step": 38425 }, { - "epoch": 1.35, - "learning_rate": 4.280121698573575e-05, - "loss": 0.3021, + "epoch": 1.3850145961725593, + "grad_norm": 0.20693495869636536, + "learning_rate": 4.242414691131645e-05, + "loss": 0.4509, "step": 38430 }, { - "epoch": 1.35, - "learning_rate": 4.279921669718966e-05, - "loss": 0.3082, + "epoch": 1.3851947958337838, + "grad_norm": 0.18149085342884064, + "learning_rate": 4.242205418645208e-05, + "loss": 0.4171, "step": 38435 }, { - "epoch": 1.35, - "learning_rate": 4.279721617753247e-05, - "loss": 0.2914, + "epoch": 1.3853749954950085, + "grad_norm": 0.2000938504934311, + "learning_rate": 4.2419961224215595e-05, + "loss": 0.3862, "step": 38440 }, { - "epoch": 1.35, - "learning_rate": 4.2795215426790155e-05, - "loss": 0.2763, + "epoch": 1.385555195156233, + "grad_norm": 0.19279134273529053, + "learning_rate": 4.2417868024635504e-05, + "loss": 0.4, "step": 38445 }, { - "epoch": 1.35, - "learning_rate": 4.279321444498869e-05, - "loss": 0.2776, + "epoch": 1.3857353948174578, + "grad_norm": 0.18606650829315186, + "learning_rate": 4.241577458774034e-05, + "loss": 0.4044, "step": 38450 }, { - "epoch": 1.35, - "learning_rate": 
4.279121323215407e-05, - "loss": 0.2933, + "epoch": 1.3859155944786825, + "grad_norm": 0.1808106005191803, + "learning_rate": 4.241368091355862e-05, + "loss": 0.4383, "step": 38455 }, { - "epoch": 1.35, - "learning_rate": 4.278921178831227e-05, - "loss": 0.2893, + "epoch": 1.386095794139907, + "grad_norm": 0.15027718245983124, + "learning_rate": 4.241158700211886e-05, + "loss": 0.4218, "step": 38460 }, { - "epoch": 1.35, - "learning_rate": 4.278721011348927e-05, - "loss": 0.2816, + "epoch": 1.3862759938011315, + "grad_norm": 0.15103080868721008, + "learning_rate": 4.24094928534496e-05, + "loss": 0.416, "step": 38465 }, { - "epoch": 1.35, - "learning_rate": 4.278520820771108e-05, - "loss": 0.2731, + "epoch": 1.3864561934623563, + "grad_norm": 0.1560763716697693, + "learning_rate": 4.2407398467579376e-05, + "loss": 0.4568, "step": 38470 }, { - "epoch": 1.35, - "learning_rate": 4.278320607100368e-05, - "loss": 0.2938, + "epoch": 1.386636393123581, + "grad_norm": 0.17564137279987335, + "learning_rate": 4.2405303844536714e-05, + "loss": 0.3982, "step": 38475 }, { - "epoch": 1.35, - "learning_rate": 4.278120370339307e-05, - "loss": 0.2871, + "epoch": 1.3868165927848055, + "grad_norm": 0.1863342970609665, + "learning_rate": 4.2403208984350164e-05, + "loss": 0.4215, "step": 38480 }, { - "epoch": 1.35, - "learning_rate": 4.277920110490524e-05, - "loss": 0.2997, + "epoch": 1.3869967924460302, + "grad_norm": 0.2165493667125702, + "learning_rate": 4.240111388704825e-05, + "loss": 0.4043, "step": 38485 }, { - "epoch": 1.35, - "learning_rate": 4.2777198275566214e-05, - "loss": 0.2904, + "epoch": 1.3871769921072548, + "grad_norm": 0.17703358829021454, + "learning_rate": 4.2399018552659536e-05, + "loss": 0.4201, "step": 38490 }, { - "epoch": 1.35, - "learning_rate": 4.277519521540197e-05, - "loss": 0.2833, + "epoch": 1.3873571917684795, + "grad_norm": 0.12427543848752975, + "learning_rate": 4.239692298121256e-05, + "loss": 0.4103, "step": 38495 }, { - "epoch": 1.35, - "learning_rate": 4.277319192443854e-05, - "loss": 0.2809, + "epoch": 1.3875373914297042, + "grad_norm": 0.161130890250206, + "learning_rate": 4.239482717273587e-05, + "loss": 0.4157, "step": 38500 }, { - "epoch": 1.35, - "eval_loss": 0.28302499651908875, - "eval_runtime": 10.5824, - "eval_samples_per_second": 9.45, - "eval_steps_per_second": 9.45, + "epoch": 1.3875373914297042, + "eval_loss": 0.44684794545173645, + "eval_runtime": 3.5231, + "eval_samples_per_second": 28.384, + "eval_steps_per_second": 7.096, "step": 38500 }, { - "epoch": 1.35, - "learning_rate": 4.277118840270192e-05, - "loss": 0.3101, + "epoch": 1.3877175910909287, + "grad_norm": 0.20817585289478302, + "learning_rate": 4.2392731127258037e-05, + "loss": 0.4488, "step": 38505 }, { - "epoch": 1.35, - "learning_rate": 4.2769184650218133e-05, - "loss": 0.2771, + "epoch": 1.3878977907521532, + "grad_norm": 0.17625221610069275, + "learning_rate": 4.239063484480761e-05, + "loss": 0.4266, "step": 38510 }, { - "epoch": 1.36, - "learning_rate": 4.276718066701319e-05, - "loss": 0.278, + "epoch": 1.388077990413378, + "grad_norm": 0.2073509842157364, + "learning_rate": 4.238853832541315e-05, + "loss": 0.4415, "step": 38515 }, { - "epoch": 1.36, - "learning_rate": 4.276517645311312e-05, - "loss": 0.2896, + "epoch": 1.3882581900746027, + "grad_norm": 0.2082330286502838, + "learning_rate": 4.238644156910322e-05, + "loss": 0.388, "step": 38520 }, { - "epoch": 1.36, - "learning_rate": 4.276317200854394e-05, - "loss": 0.288, + "epoch": 1.3884383897358272, + "grad_norm": 0.19567419588565826, + 
"learning_rate": 4.238434457590639e-05, + "loss": 0.4268, "step": 38525 }, { - "epoch": 1.36, - "learning_rate": 4.276116733333167e-05, - "loss": 0.2893, + "epoch": 1.388618589397052, + "grad_norm": 0.19679030776023865, + "learning_rate": 4.238224734585123e-05, + "loss": 0.4052, "step": 38530 }, { - "epoch": 1.36, - "learning_rate": 4.275916242750236e-05, - "loss": 0.294, + "epoch": 1.3887987890582765, + "grad_norm": 0.219960555434227, + "learning_rate": 4.238014987896631e-05, + "loss": 0.4552, "step": 38535 }, { - "epoch": 1.36, - "learning_rate": 4.2757157291082026e-05, - "loss": 0.3026, + "epoch": 1.3889789887195012, + "grad_norm": 0.16919073462486267, + "learning_rate": 4.2378052175280216e-05, + "loss": 0.3934, "step": 38540 }, { - "epoch": 1.36, - "learning_rate": 4.27551519240967e-05, - "loss": 0.2939, + "epoch": 1.389159188380726, + "grad_norm": 0.15269456803798676, + "learning_rate": 4.237595423482153e-05, + "loss": 0.3468, "step": 38545 }, { - "epoch": 1.36, - "learning_rate": 4.2753146326572436e-05, - "loss": 0.2771, + "epoch": 1.3893393880419505, + "grad_norm": 0.18729466199874878, + "learning_rate": 4.237385605761883e-05, + "loss": 0.4197, "step": 38550 }, { - "epoch": 1.36, - "learning_rate": 4.275114049853526e-05, - "loss": 0.3041, + "epoch": 1.3895195877031752, + "grad_norm": 0.19090360403060913, + "learning_rate": 4.2371757643700705e-05, + "loss": 0.4401, "step": 38555 }, { - "epoch": 1.36, - "learning_rate": 4.274913444001123e-05, - "loss": 0.2886, + "epoch": 1.3896997873643997, + "grad_norm": 0.17647488415241241, + "learning_rate": 4.236965899309574e-05, + "loss": 0.4282, "step": 38560 }, { - "epoch": 1.36, - "learning_rate": 4.274712815102638e-05, - "loss": 0.2751, + "epoch": 1.3898799870256244, + "grad_norm": 0.2209496647119522, + "learning_rate": 4.236756010583254e-05, + "loss": 0.4473, "step": 38565 }, { - "epoch": 1.36, - "learning_rate": 4.274512163160677e-05, - "loss": 0.2692, + "epoch": 1.3900601866868492, + "grad_norm": 0.16806313395500183, + "learning_rate": 4.23654609819397e-05, + "loss": 0.4122, "step": 38570 }, { - "epoch": 1.36, - "learning_rate": 4.274311488177845e-05, - "loss": 0.3038, + "epoch": 1.3902403863480737, + "grad_norm": 0.20954453945159912, + "learning_rate": 4.236336162144581e-05, + "loss": 0.4201, "step": 38575 }, { - "epoch": 1.36, - "learning_rate": 4.274110790156747e-05, - "loss": 0.281, + "epoch": 1.3904205860092982, + "grad_norm": 0.14711293578147888, + "learning_rate": 4.236126202437948e-05, + "loss": 0.3762, "step": 38580 }, { - "epoch": 1.36, - "learning_rate": 4.273910069099991e-05, - "loss": 0.2563, + "epoch": 1.390600785670523, + "grad_norm": 0.23657748103141785, + "learning_rate": 4.235916219076931e-05, + "loss": 0.4146, "step": 38585 }, { - "epoch": 1.36, - "learning_rate": 4.273709325010181e-05, - "loss": 0.286, + "epoch": 1.3907809853317477, + "grad_norm": 0.16697244346141815, + "learning_rate": 4.235706212064392e-05, + "loss": 0.4094, "step": 38590 }, { - "epoch": 1.36, - "learning_rate": 4.2735085578899236e-05, - "loss": 0.2666, + "epoch": 1.3909611849929722, + "grad_norm": 0.18580500781536102, + "learning_rate": 4.2354961814031924e-05, + "loss": 0.402, "step": 38595 }, { - "epoch": 1.36, - "learning_rate": 4.2733077677418275e-05, - "loss": 0.2747, + "epoch": 1.391141384654197, + "grad_norm": 0.1857820749282837, + "learning_rate": 4.235286127096193e-05, + "loss": 0.3989, "step": 38600 }, { - "epoch": 1.36, - "learning_rate": 4.2731069545684974e-05, - "loss": 0.2701, + "epoch": 1.3913215843154214, + "grad_norm": 
0.1920892894268036, + "learning_rate": 4.235076049146257e-05, + "loss": 0.4196, "step": 38605 }, { - "epoch": 1.36, - "learning_rate": 4.272906118372544e-05, - "loss": 0.3037, + "epoch": 1.3915017839766461, + "grad_norm": 0.18875955045223236, + "learning_rate": 4.2348659475562445e-05, + "loss": 0.4203, "step": 38610 }, { - "epoch": 1.36, - "learning_rate": 4.272705259156572e-05, - "loss": 0.2786, + "epoch": 1.3916819836378709, + "grad_norm": 0.15751686692237854, + "learning_rate": 4.23465582232902e-05, + "loss": 0.4026, "step": 38615 }, { - "epoch": 1.36, - "learning_rate": 4.27250437692319e-05, - "loss": 0.2938, + "epoch": 1.3918621832990954, + "grad_norm": 0.1680629700422287, + "learning_rate": 4.2344456734674454e-05, + "loss": 0.3999, "step": 38620 }, { - "epoch": 1.36, - "learning_rate": 4.272303471675007e-05, - "loss": 0.2668, + "epoch": 1.39204238296032, + "grad_norm": 0.14273864030838013, + "learning_rate": 4.234235500974384e-05, + "loss": 0.427, "step": 38625 }, { - "epoch": 1.36, - "learning_rate": 4.272102543414631e-05, - "loss": 0.2861, + "epoch": 1.3922225826215446, + "grad_norm": 0.17425033450126648, + "learning_rate": 4.2340253048527e-05, + "loss": 0.4206, "step": 38630 }, { - "epoch": 1.36, - "learning_rate": 4.2719015921446726e-05, - "loss": 0.2727, + "epoch": 1.3924027822827694, + "grad_norm": 0.14705787599086761, + "learning_rate": 4.233815085105257e-05, + "loss": 0.4026, "step": 38635 }, { - "epoch": 1.36, - "learning_rate": 4.2717006178677386e-05, - "loss": 0.2813, + "epoch": 1.3925829819439939, + "grad_norm": 0.19145555794239044, + "learning_rate": 4.233604841734919e-05, + "loss": 0.3902, "step": 38640 }, { - "epoch": 1.36, - "learning_rate": 4.271499620586441e-05, - "loss": 0.2772, + "epoch": 1.3927631816052186, + "grad_norm": 0.20708268880844116, + "learning_rate": 4.2333945747445516e-05, + "loss": 0.4036, "step": 38645 }, { - "epoch": 1.36, - "learning_rate": 4.271298600303387e-05, - "loss": 0.3153, + "epoch": 1.3929433812664431, + "grad_norm": 0.17952348291873932, + "learning_rate": 4.2331842841370175e-05, + "loss": 0.4346, "step": 38650 }, { - "epoch": 1.36, - "learning_rate": 4.2710975570211884e-05, - "loss": 0.3017, + "epoch": 1.3931235809276679, + "grad_norm": 0.21833153069019318, + "learning_rate": 4.232973969915184e-05, + "loss": 0.4475, "step": 38655 }, { - "epoch": 1.36, - "learning_rate": 4.2708964907424555e-05, - "loss": 0.3075, + "epoch": 1.3933037805888926, + "grad_norm": 0.18720868229866028, + "learning_rate": 4.232763632081915e-05, + "loss": 0.3696, "step": 38660 }, { - "epoch": 1.36, - "learning_rate": 4.2706954014697984e-05, - "loss": 0.2937, + "epoch": 1.393483980250117, + "grad_norm": 0.1900511085987091, + "learning_rate": 4.232553270640077e-05, + "loss": 0.3998, "step": 38665 }, { - "epoch": 1.36, - "learning_rate": 4.2704942892058285e-05, - "loss": 0.2905, + "epoch": 1.3936641799113418, + "grad_norm": 0.19259464740753174, + "learning_rate": 4.232342885592536e-05, + "loss": 0.3862, "step": 38670 }, { - "epoch": 1.36, - "learning_rate": 4.2702931539531567e-05, - "loss": 0.2858, + "epoch": 1.3938443795725663, + "grad_norm": 0.19600766897201538, + "learning_rate": 4.232132476942159e-05, + "loss": 0.3997, "step": 38675 }, { - "epoch": 1.36, - "learning_rate": 4.270091995714396e-05, - "loss": 0.2628, + "epoch": 1.394024579233791, + "grad_norm": 0.17231407761573792, + "learning_rate": 4.231922044691813e-05, + "loss": 0.406, "step": 38680 }, { - "epoch": 1.36, - "learning_rate": 4.269890814492156e-05, - "loss": 0.3051, + "epoch": 1.3942047788950156, + 
"grad_norm": 0.19340550899505615, + "learning_rate": 4.231711588844363e-05, + "loss": 0.4058, "step": 38685 }, { - "epoch": 1.36, - "learning_rate": 4.269689610289051e-05, - "loss": 0.3129, + "epoch": 1.3943849785562403, + "grad_norm": 0.19323815405368805, + "learning_rate": 4.2315011094026784e-05, + "loss": 0.4092, "step": 38690 }, { - "epoch": 1.36, - "learning_rate": 4.2694883831076924e-05, - "loss": 0.2792, + "epoch": 1.3945651782174648, + "grad_norm": 0.19978420436382294, + "learning_rate": 4.231290606369627e-05, + "loss": 0.442, "step": 38695 }, { - "epoch": 1.36, - "learning_rate": 4.2692871329506936e-05, - "loss": 0.3053, + "epoch": 1.3947453778786896, + "grad_norm": 0.1691822111606598, + "learning_rate": 4.2310800797480756e-05, + "loss": 0.4045, "step": 38700 }, { - "epoch": 1.36, - "learning_rate": 4.2690858598206665e-05, - "loss": 0.2971, + "epoch": 1.3949255775399143, + "grad_norm": 0.2136184573173523, + "learning_rate": 4.230869529540894e-05, + "loss": 0.3986, "step": 38705 }, { - "epoch": 1.36, - "learning_rate": 4.268884563720226e-05, - "loss": 0.2868, + "epoch": 1.3951057772011388, + "grad_norm": 0.1695156693458557, + "learning_rate": 4.230658955750949e-05, + "loss": 0.4319, "step": 38710 }, { - "epoch": 1.36, - "learning_rate": 4.2686832446519856e-05, - "loss": 0.2846, + "epoch": 1.3952859768623636, + "grad_norm": 0.17145298421382904, + "learning_rate": 4.230448358381112e-05, + "loss": 0.3985, "step": 38715 }, { - "epoch": 1.36, - "learning_rate": 4.2684819026185583e-05, - "loss": 0.2756, + "epoch": 1.395466176523588, + "grad_norm": 0.18987350165843964, + "learning_rate": 4.2302377374342505e-05, + "loss": 0.4378, "step": 38720 }, { - "epoch": 1.36, - "learning_rate": 4.268280537622559e-05, - "loss": 0.291, + "epoch": 1.3956463761848128, + "grad_norm": 0.17236654460430145, + "learning_rate": 4.2300270929132344e-05, + "loss": 0.4128, "step": 38725 }, { - "epoch": 1.36, - "learning_rate": 4.268079149666603e-05, - "loss": 0.3187, + "epoch": 1.3958265758460375, + "grad_norm": 0.18958859145641327, + "learning_rate": 4.229816424820935e-05, + "loss": 0.4303, "step": 38730 }, { - "epoch": 1.36, - "learning_rate": 4.267877738753303e-05, - "loss": 0.2888, + "epoch": 1.396006775507262, + "grad_norm": 0.19442634284496307, + "learning_rate": 4.229605733160221e-05, + "loss": 0.4405, "step": 38735 }, { - "epoch": 1.36, - "learning_rate": 4.2676763048852764e-05, - "loss": 0.2993, + "epoch": 1.3961869751684866, + "grad_norm": 0.16832728683948517, + "learning_rate": 4.2293950179339645e-05, + "loss": 0.4066, "step": 38740 }, { - "epoch": 1.36, - "learning_rate": 4.2674748480651385e-05, - "loss": 0.2921, + "epoch": 1.3963671748297113, + "grad_norm": 0.20120170712471008, + "learning_rate": 4.2291842791450356e-05, + "loss": 0.4036, "step": 38745 }, { - "epoch": 1.36, - "learning_rate": 4.267273368295504e-05, - "loss": 0.2735, + "epoch": 1.396547374490936, + "grad_norm": 0.17381571233272552, + "learning_rate": 4.2289735167963054e-05, + "loss": 0.4033, "step": 38750 }, { - "epoch": 1.36, - "learning_rate": 4.2670718655789896e-05, - "loss": 0.296, + "epoch": 1.3967275741521605, + "grad_norm": 0.17229481041431427, + "learning_rate": 4.228762730890645e-05, + "loss": 0.4159, "step": 38755 }, { - "epoch": 1.36, - "learning_rate": 4.266870339918211e-05, - "loss": 0.3009, + "epoch": 1.3969077738133853, + "grad_norm": 0.1683155596256256, + "learning_rate": 4.228551921430928e-05, + "loss": 0.3986, "step": 38760 }, { - "epoch": 1.36, - "learning_rate": 4.266668791315786e-05, - "loss": 0.2583, + "epoch": 
1.3970879734746098, + "grad_norm": 0.1502419412136078, + "learning_rate": 4.228341088420026e-05, + "loss": 0.4202, "step": 38765 }, { - "epoch": 1.36, - "learning_rate": 4.266467219774331e-05, - "loss": 0.28, + "epoch": 1.3972681731358345, + "grad_norm": 0.16682939231395721, + "learning_rate": 4.2281302318608106e-05, + "loss": 0.4485, "step": 38770 }, { - "epoch": 1.36, - "learning_rate": 4.2662656252964635e-05, - "loss": 0.2812, + "epoch": 1.3974483727970592, + "grad_norm": 0.20583415031433105, + "learning_rate": 4.227919351756155e-05, + "loss": 0.4224, "step": 38775 }, { - "epoch": 1.36, - "learning_rate": 4.2660640078848e-05, - "loss": 0.3001, + "epoch": 1.3976285724582838, + "grad_norm": 0.1680290699005127, + "learning_rate": 4.227708448108934e-05, + "loss": 0.4034, "step": 38780 }, { - "epoch": 1.36, - "learning_rate": 4.26586236754196e-05, - "loss": 0.2902, + "epoch": 1.3978087721195085, + "grad_norm": 0.1613532453775406, + "learning_rate": 4.227497520922019e-05, + "loss": 0.4187, "step": 38785 }, { - "epoch": 1.36, - "learning_rate": 4.265660704270561e-05, - "loss": 0.2587, + "epoch": 1.397988971780733, + "grad_norm": 0.19061127305030823, + "learning_rate": 4.2272865701982855e-05, + "loss": 0.4305, "step": 38790 }, { - "epoch": 1.36, - "learning_rate": 4.2654590180732215e-05, - "loss": 0.3016, + "epoch": 1.3981691714419577, + "grad_norm": 0.1994105577468872, + "learning_rate": 4.227075595940606e-05, + "loss": 0.4196, "step": 38795 }, { - "epoch": 1.37, - "learning_rate": 4.2652573089525594e-05, - "loss": 0.2793, + "epoch": 1.3983493711031822, + "grad_norm": 0.16450746357440948, + "learning_rate": 4.226864598151857e-05, + "loss": 0.4488, "step": 38800 }, { - "epoch": 1.37, - "learning_rate": 4.2650555769111945e-05, - "loss": 0.2888, + "epoch": 1.398529570764407, + "grad_norm": 0.15390661358833313, + "learning_rate": 4.226653576834911e-05, + "loss": 0.4042, "step": 38805 }, { - "epoch": 1.37, - "learning_rate": 4.2648538219517466e-05, - "loss": 0.3073, + "epoch": 1.3987097704256315, + "grad_norm": 0.22952650487422943, + "learning_rate": 4.226442531992645e-05, + "loss": 0.4069, "step": 38810 }, { - "epoch": 1.37, - "learning_rate": 4.2646520440768346e-05, - "loss": 0.2823, + "epoch": 1.3988899700868562, + "grad_norm": 0.18804705142974854, + "learning_rate": 4.2262314636279334e-05, + "loss": 0.3962, "step": 38815 }, { - "epoch": 1.37, - "learning_rate": 4.264450243289079e-05, - "loss": 0.3026, + "epoch": 1.399070169748081, + "grad_norm": 0.15264515578746796, + "learning_rate": 4.226020371743653e-05, + "loss": 0.4237, "step": 38820 }, { - "epoch": 1.37, - "learning_rate": 4.2642484195911e-05, - "loss": 0.2741, + "epoch": 1.3992503694093055, + "grad_norm": 0.18798717856407166, + "learning_rate": 4.225809256342678e-05, + "loss": 0.4252, "step": 38825 }, { - "epoch": 1.37, - "learning_rate": 4.2640465729855186e-05, - "loss": 0.2949, + "epoch": 1.3994305690705302, + "grad_norm": 0.21027231216430664, + "learning_rate": 4.225598117427887e-05, + "loss": 0.4366, "step": 38830 }, { - "epoch": 1.37, - "learning_rate": 4.263844703474954e-05, - "loss": 0.2495, + "epoch": 1.3996107687317547, + "grad_norm": 0.17129452526569366, + "learning_rate": 4.225386955002155e-05, + "loss": 0.4157, "step": 38835 }, { - "epoch": 1.37, - "learning_rate": 4.2636428110620274e-05, - "loss": 0.2771, + "epoch": 1.3997909683929795, + "grad_norm": 0.19141530990600586, + "learning_rate": 4.2251757690683604e-05, + "loss": 0.3856, "step": 38840 }, { - "epoch": 1.37, - "learning_rate": 4.263440895749363e-05, - "loss": 0.2698, 
+ "epoch": 1.3999711680542042, + "grad_norm": 0.1746709644794464, + "learning_rate": 4.22496455962938e-05, + "loss": 0.395, "step": 38845 }, { - "epoch": 1.37, - "learning_rate": 4.2632389575395795e-05, - "loss": 0.3005, + "epoch": 1.4001513677154287, + "grad_norm": 0.20320837199687958, + "learning_rate": 4.22475332668809e-05, + "loss": 0.4106, "step": 38850 }, { - "epoch": 1.37, - "learning_rate": 4.263036996435301e-05, - "loss": 0.2822, + "epoch": 1.4003315673766532, + "grad_norm": 0.17430174350738525, + "learning_rate": 4.224542070247371e-05, + "loss": 0.4004, "step": 38855 }, { - "epoch": 1.37, - "learning_rate": 4.262835012439148e-05, - "loss": 0.2735, + "epoch": 1.400511767037878, + "grad_norm": 0.1601121872663498, + "learning_rate": 4.224330790310101e-05, + "loss": 0.4212, "step": 38860 }, { - "epoch": 1.37, - "learning_rate": 4.262633005553745e-05, - "loss": 0.2999, + "epoch": 1.4006919666991027, + "grad_norm": 0.16500531136989594, + "learning_rate": 4.2241194868791565e-05, + "loss": 0.3749, "step": 38865 }, { - "epoch": 1.37, - "learning_rate": 4.262430975781714e-05, - "loss": 0.2825, + "epoch": 1.4008721663603272, + "grad_norm": 0.18439029157161713, + "learning_rate": 4.2239081599574184e-05, + "loss": 0.4193, "step": 38870 }, { - "epoch": 1.37, - "learning_rate": 4.2622289231256776e-05, - "loss": 0.2627, + "epoch": 1.401052366021552, + "grad_norm": 0.20299525558948517, + "learning_rate": 4.223696809547766e-05, + "loss": 0.4482, "step": 38875 }, { - "epoch": 1.37, - "learning_rate": 4.26202684758826e-05, - "loss": 0.2937, + "epoch": 1.4012325656827764, + "grad_norm": 0.17272146046161652, + "learning_rate": 4.2234854356530776e-05, + "loss": 0.4225, "step": 38880 }, { - "epoch": 1.37, - "learning_rate": 4.261824749172086e-05, - "loss": 0.283, + "epoch": 1.4014127653440012, + "grad_norm": 0.1982017606496811, + "learning_rate": 4.223274038276233e-05, + "loss": 0.4437, "step": 38885 }, { - "epoch": 1.37, - "learning_rate": 4.2616226278797776e-05, - "loss": 0.2989, + "epoch": 1.401592965005226, + "grad_norm": 0.16172081232070923, + "learning_rate": 4.223062617420114e-05, + "loss": 0.4323, "step": 38890 }, { - "epoch": 1.37, - "learning_rate": 4.2614204837139616e-05, - "loss": 0.2897, + "epoch": 1.4017731646664504, + "grad_norm": 0.1717664897441864, + "learning_rate": 4.2228511730876006e-05, + "loss": 0.3999, "step": 38895 }, { - "epoch": 1.37, - "learning_rate": 4.26121831667726e-05, - "loss": 0.3178, + "epoch": 1.4019533643276751, + "grad_norm": 0.18911094963550568, + "learning_rate": 4.2226397052815734e-05, + "loss": 0.409, "step": 38900 }, { - "epoch": 1.37, - "learning_rate": 4.2610161267723005e-05, - "loss": 0.281, + "epoch": 1.4021335639888997, + "grad_norm": 0.19404666125774384, + "learning_rate": 4.2224282140049145e-05, + "loss": 0.411, "step": 38905 }, { - "epoch": 1.37, - "learning_rate": 4.260813914001706e-05, - "loss": 0.2685, + "epoch": 1.4023137636501244, + "grad_norm": 0.19572977721691132, + "learning_rate": 4.2222166992605037e-05, + "loss": 0.4489, "step": 38910 }, { - "epoch": 1.37, - "learning_rate": 4.2606116783681045e-05, - "loss": 0.291, + "epoch": 1.402493963311349, + "grad_norm": 0.20973151922225952, + "learning_rate": 4.2220051610512236e-05, + "loss": 0.4269, "step": 38915 }, { - "epoch": 1.37, - "learning_rate": 4.2604094198741195e-05, - "loss": 0.2739, + "epoch": 1.4026741629725736, + "grad_norm": 0.17148537933826447, + "learning_rate": 4.2217935993799576e-05, + "loss": 0.3622, "step": 38920 }, { - "epoch": 1.37, - "learning_rate": 4.260207138522379e-05, - 
"loss": 0.3218, + "epoch": 1.4028543626337981, + "grad_norm": 0.1789887696504593, + "learning_rate": 4.221582014249586e-05, + "loss": 0.436, "step": 38925 }, { - "epoch": 1.37, - "learning_rate": 4.260004834315509e-05, - "loss": 0.2865, + "epoch": 1.4030345622950229, + "grad_norm": 0.17625224590301514, + "learning_rate": 4.2213704056629936e-05, + "loss": 0.4053, "step": 38930 }, { - "epoch": 1.37, - "learning_rate": 4.259802507256135e-05, - "loss": 0.3042, + "epoch": 1.4032147619562476, + "grad_norm": 0.1674962043762207, + "learning_rate": 4.2211587736230614e-05, + "loss": 0.4552, "step": 38935 }, { - "epoch": 1.37, - "learning_rate": 4.2596001573468864e-05, - "loss": 0.2767, + "epoch": 1.4033949616174721, + "grad_norm": 0.20774921774864197, + "learning_rate": 4.220947118132676e-05, + "loss": 0.4201, "step": 38940 }, { - "epoch": 1.37, - "learning_rate": 4.259397784590389e-05, - "loss": 0.2787, + "epoch": 1.4035751612786969, + "grad_norm": 0.15896476805210114, + "learning_rate": 4.220735439194718e-05, + "loss": 0.4324, "step": 38945 }, { - "epoch": 1.37, - "learning_rate": 4.2591953889892706e-05, - "loss": 0.2751, + "epoch": 1.4037553609399214, + "grad_norm": 0.21336419880390167, + "learning_rate": 4.220523736812073e-05, + "loss": 0.4731, "step": 38950 }, { - "epoch": 1.37, - "learning_rate": 4.25899297054616e-05, - "loss": 0.2917, + "epoch": 1.403935560601146, + "grad_norm": 0.23092104494571686, + "learning_rate": 4.220312010987626e-05, + "loss": 0.422, "step": 38955 }, { - "epoch": 1.37, - "learning_rate": 4.2587905292636845e-05, - "loss": 0.279, + "epoch": 1.4041157602623708, + "grad_norm": 0.20718926191329956, + "learning_rate": 4.22010026172426e-05, + "loss": 0.394, "step": 38960 }, { - "epoch": 1.37, - "learning_rate": 4.258588065144473e-05, - "loss": 0.2934, + "epoch": 1.4042959599235953, + "grad_norm": 0.18056684732437134, + "learning_rate": 4.219888489024861e-05, + "loss": 0.421, "step": 38965 }, { - "epoch": 1.37, - "learning_rate": 4.2583855781911544e-05, - "loss": 0.2789, + "epoch": 1.4044761595848199, + "grad_norm": 0.19858917593955994, + "learning_rate": 4.2196766928923147e-05, + "loss": 0.4036, "step": 38970 }, { - "epoch": 1.37, - "learning_rate": 4.258183068406358e-05, - "loss": 0.2685, + "epoch": 1.4046563592460446, + "grad_norm": 0.23689039051532745, + "learning_rate": 4.219464873329506e-05, + "loss": 0.4278, "step": 38975 }, { - "epoch": 1.37, - "learning_rate": 4.257980535792713e-05, - "loss": 0.3244, + "epoch": 1.4048365589072693, + "grad_norm": 0.1857483685016632, + "learning_rate": 4.219253030339322e-05, + "loss": 0.4156, "step": 38980 }, { - "epoch": 1.37, - "learning_rate": 4.2577779803528495e-05, - "loss": 0.3174, + "epoch": 1.4050167585684938, + "grad_norm": 0.1789952665567398, + "learning_rate": 4.2190411639246474e-05, + "loss": 0.4147, "step": 38985 }, { - "epoch": 1.37, - "learning_rate": 4.2575754020893966e-05, - "loss": 0.2866, + "epoch": 1.4051969582297186, + "grad_norm": 0.21986453235149384, + "learning_rate": 4.218829274088371e-05, + "loss": 0.454, "step": 38990 }, { - "epoch": 1.37, - "learning_rate": 4.257372801004986e-05, - "loss": 0.2718, + "epoch": 1.405377157890943, + "grad_norm": 0.16250832378864288, + "learning_rate": 4.2186173608333776e-05, + "loss": 0.3945, "step": 38995 }, { - "epoch": 1.37, - "learning_rate": 4.257170177102248e-05, - "loss": 0.2912, + "epoch": 1.4055573575521678, + "grad_norm": 0.16607147455215454, + "learning_rate": 4.2184054241625556e-05, + "loss": 0.429, "step": 39000 }, { - "epoch": 1.37, - "eval_loss": 
0.28303414583206177, - "eval_runtime": 10.5465, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 1.4055573575521678, + "eval_loss": 0.4471230208873749, + "eval_runtime": 3.5385, + "eval_samples_per_second": 28.261, + "eval_steps_per_second": 7.065, "step": 39000 }, { - "epoch": 1.37, - "learning_rate": 4.256967530383813e-05, - "loss": 0.3285, + "epoch": 1.4057375572133926, + "grad_norm": 0.18567611277103424, + "learning_rate": 4.218193464078792e-05, + "loss": 0.4589, "step": 39005 }, { - "epoch": 1.37, - "learning_rate": 4.2567648608523127e-05, - "loss": 0.2956, + "epoch": 1.405917756874617, + "grad_norm": 0.14248013496398926, + "learning_rate": 4.217981480584976e-05, + "loss": 0.4067, "step": 39010 }, { - "epoch": 1.37, - "learning_rate": 4.2565621685103784e-05, - "loss": 0.3043, + "epoch": 1.4060979565358416, + "grad_norm": 0.18082767724990845, + "learning_rate": 4.217769473683994e-05, + "loss": 0.4069, "step": 39015 }, { - "epoch": 1.37, - "learning_rate": 4.256359453360642e-05, - "loss": 0.2776, + "epoch": 1.4062781561970663, + "grad_norm": 0.2055099606513977, + "learning_rate": 4.217557443378736e-05, + "loss": 0.4208, "step": 39020 }, { - "epoch": 1.37, - "learning_rate": 4.2561567154057355e-05, - "loss": 0.2852, + "epoch": 1.406458355858291, + "grad_norm": 0.20879048109054565, + "learning_rate": 4.2173453896720906e-05, + "loss": 0.3962, "step": 39025 }, { - "epoch": 1.37, - "learning_rate": 4.255953954648291e-05, - "loss": 0.2754, + "epoch": 1.4066385555195156, + "grad_norm": 0.18732579052448273, + "learning_rate": 4.217133312566946e-05, + "loss": 0.4423, "step": 39030 }, { - "epoch": 1.37, - "learning_rate": 4.255751171090942e-05, - "loss": 0.3048, + "epoch": 1.4068187551807403, + "grad_norm": 0.199151873588562, + "learning_rate": 4.2169212120661926e-05, + "loss": 0.4394, "step": 39035 }, { - "epoch": 1.37, - "learning_rate": 4.255548364736321e-05, - "loss": 0.2933, + "epoch": 1.4069989548419648, + "grad_norm": 0.15201795101165771, + "learning_rate": 4.21670908817272e-05, + "loss": 0.4055, "step": 39040 }, { - "epoch": 1.37, - "learning_rate": 4.2553455355870615e-05, - "loss": 0.288, + "epoch": 1.4071791545031895, + "grad_norm": 0.15921539068222046, + "learning_rate": 4.2164969408894194e-05, + "loss": 0.4533, "step": 39045 }, { - "epoch": 1.37, - "learning_rate": 4.2551426836457976e-05, - "loss": 0.2834, + "epoch": 1.4073593541644143, + "grad_norm": 0.16528940200805664, + "learning_rate": 4.21628477021918e-05, + "loss": 0.4136, "step": 39050 }, { - "epoch": 1.37, - "learning_rate": 4.254939808915162e-05, - "loss": 0.3058, + "epoch": 1.4075395538256388, + "grad_norm": 0.20184743404388428, + "learning_rate": 4.216072576164892e-05, + "loss": 0.453, "step": 39055 }, { - "epoch": 1.37, - "learning_rate": 4.254736911397789e-05, - "loss": 0.275, + "epoch": 1.4077197534868635, + "grad_norm": 0.2002110779285431, + "learning_rate": 4.215860358729448e-05, + "loss": 0.4098, "step": 39060 }, { - "epoch": 1.37, - "learning_rate": 4.2545339910963146e-05, - "loss": 0.2764, + "epoch": 1.407899953148088, + "grad_norm": 0.16042189300060272, + "learning_rate": 4.215648117915739e-05, + "loss": 0.3906, "step": 39065 }, { - "epoch": 1.37, - "learning_rate": 4.2543310480133725e-05, - "loss": 0.2942, + "epoch": 1.4080801528093128, + "grad_norm": 0.21840621531009674, + "learning_rate": 4.2154358537266564e-05, + "loss": 0.4053, "step": 39070 }, { - "epoch": 1.37, - "learning_rate": 4.254128082151597e-05, - "loss": 0.2854, + "epoch": 1.4082603524705375, + "grad_norm": 
0.15878549218177795, + "learning_rate": 4.2152235661650925e-05, + "loss": 0.4222, "step": 39075 }, { - "epoch": 1.37, - "learning_rate": 4.253925093513626e-05, - "loss": 0.2886, + "epoch": 1.408440552131762, + "grad_norm": 0.17753714323043823, + "learning_rate": 4.215011255233939e-05, + "loss": 0.4452, "step": 39080 }, { - "epoch": 1.38, - "learning_rate": 4.2537220821020916e-05, - "loss": 0.2735, + "epoch": 1.4086207517929865, + "grad_norm": 0.1962030678987503, + "learning_rate": 4.2147989209360903e-05, + "loss": 0.4588, "step": 39085 }, { - "epoch": 1.38, - "learning_rate": 4.253519047919633e-05, - "loss": 0.2643, + "epoch": 1.4088009514542112, + "grad_norm": 0.17643392086029053, + "learning_rate": 4.2145865632744376e-05, + "loss": 0.4102, "step": 39090 }, { - "epoch": 1.38, - "learning_rate": 4.2533159909688844e-05, - "loss": 0.3042, + "epoch": 1.408981151115436, + "grad_norm": 0.2033703774213791, + "learning_rate": 4.214374182251874e-05, + "loss": 0.3898, "step": 39095 }, { - "epoch": 1.38, - "learning_rate": 4.253112911252484e-05, - "loss": 0.2939, + "epoch": 1.4091613507766605, + "grad_norm": 0.2141045182943344, + "learning_rate": 4.214161777871296e-05, + "loss": 0.4343, "step": 39100 }, { - "epoch": 1.38, - "learning_rate": 4.2529098087730665e-05, - "loss": 0.2816, + "epoch": 1.4093415504378852, + "grad_norm": 0.19499428570270538, + "learning_rate": 4.213949350135594e-05, + "loss": 0.4263, "step": 39105 }, { - "epoch": 1.38, - "learning_rate": 4.252706683533272e-05, - "loss": 0.2897, + "epoch": 1.4095217500991097, + "grad_norm": 0.1868494302034378, + "learning_rate": 4.2137368990476655e-05, + "loss": 0.4148, "step": 39110 }, { - "epoch": 1.38, - "learning_rate": 4.252503535535735e-05, - "loss": 0.279, + "epoch": 1.4097019497603345, + "grad_norm": 0.16455568373203278, + "learning_rate": 4.213524424610402e-05, + "loss": 0.3985, "step": 39115 }, { - "epoch": 1.38, - "learning_rate": 4.252300364783095e-05, - "loss": 0.2804, + "epoch": 1.4098821494215592, + "grad_norm": 0.23864524066448212, + "learning_rate": 4.213311926826701e-05, + "loss": 0.4044, "step": 39120 }, { - "epoch": 1.38, - "learning_rate": 4.25209717127799e-05, - "loss": 0.2836, + "epoch": 1.4100623490827837, + "grad_norm": 0.20035329461097717, + "learning_rate": 4.213099405699457e-05, + "loss": 0.3905, "step": 39125 }, { - "epoch": 1.38, - "learning_rate": 4.2518939550230574e-05, - "loss": 0.2867, + "epoch": 1.4102425487440082, + "grad_norm": 0.18260158598423004, + "learning_rate": 4.212886861231564e-05, + "loss": 0.4173, "step": 39130 }, { - "epoch": 1.38, - "learning_rate": 4.251690716020936e-05, - "loss": 0.3086, + "epoch": 1.410422748405233, + "grad_norm": 0.14662878215312958, + "learning_rate": 4.212674293425919e-05, + "loss": 0.3954, "step": 39135 }, { - "epoch": 1.38, - "learning_rate": 4.2514874542742653e-05, - "loss": 0.2838, + "epoch": 1.4106029480664577, + "grad_norm": 0.2082986682653427, + "learning_rate": 4.2124617022854195e-05, + "loss": 0.4102, "step": 39140 }, { - "epoch": 1.38, - "learning_rate": 4.2512841697856855e-05, - "loss": 0.2926, + "epoch": 1.4107831477276822, + "grad_norm": 0.19178947806358337, + "learning_rate": 4.212249087812961e-05, + "loss": 0.4261, "step": 39145 }, { - "epoch": 1.38, - "learning_rate": 4.251080862557833e-05, - "loss": 0.293, + "epoch": 1.410963347388907, + "grad_norm": 0.18366897106170654, + "learning_rate": 4.2120364500114394e-05, + "loss": 0.3988, "step": 39150 }, { - "epoch": 1.38, - "learning_rate": 4.25087753259335e-05, - "loss": 0.2791, + "epoch": 1.4111435470501315, + 
"grad_norm": 0.20114761590957642, + "learning_rate": 4.2118237888837534e-05, + "loss": 0.4235, "step": 39155 }, { - "epoch": 1.38, - "learning_rate": 4.250674179894877e-05, - "loss": 0.2782, + "epoch": 1.4113237467113562, + "grad_norm": 0.17763690650463104, + "learning_rate": 4.2116111044327984e-05, + "loss": 0.4264, "step": 39160 }, { - "epoch": 1.38, - "learning_rate": 4.250470804465053e-05, - "loss": 0.3029, + "epoch": 1.411503946372581, + "grad_norm": 0.2246585637331009, + "learning_rate": 4.2113983966614745e-05, + "loss": 0.4089, "step": 39165 }, { - "epoch": 1.38, - "learning_rate": 4.250267406306519e-05, - "loss": 0.2718, + "epoch": 1.4116841460338054, + "grad_norm": 0.19135521352291107, + "learning_rate": 4.211185665572678e-05, + "loss": 0.4371, "step": 39170 }, { - "epoch": 1.38, - "learning_rate": 4.250063985421917e-05, - "loss": 0.2931, + "epoch": 1.4118643456950302, + "grad_norm": 0.19484780728816986, + "learning_rate": 4.2109729111693085e-05, + "loss": 0.3799, "step": 39175 }, { - "epoch": 1.38, - "learning_rate": 4.2498605418138865e-05, - "loss": 0.2912, + "epoch": 1.4120445453562547, + "grad_norm": 0.16387687623500824, + "learning_rate": 4.210760133454265e-05, + "loss": 0.3739, "step": 39180 }, { - "epoch": 1.38, - "learning_rate": 4.2496570754850704e-05, - "loss": 0.2946, + "epoch": 1.4122247450174794, + "grad_norm": 0.169037863612175, + "learning_rate": 4.210547332430446e-05, + "loss": 0.4141, "step": 39185 }, { - "epoch": 1.38, - "learning_rate": 4.2494535864381104e-05, - "loss": 0.2748, + "epoch": 1.412404944678704, + "grad_norm": 0.2004702091217041, + "learning_rate": 4.21033450810075e-05, + "loss": 0.4108, "step": 39190 }, { - "epoch": 1.38, - "learning_rate": 4.2492500746756484e-05, - "loss": 0.2726, + "epoch": 1.4125851443399287, + "grad_norm": 0.16014528274536133, + "learning_rate": 4.210121660468077e-05, + "loss": 0.3964, "step": 39195 }, { - "epoch": 1.38, - "learning_rate": 4.2490465402003273e-05, - "loss": 0.2799, + "epoch": 1.4127653440011532, + "grad_norm": 0.16173921525478363, + "learning_rate": 4.209908789535328e-05, + "loss": 0.4055, "step": 39200 }, { - "epoch": 1.38, - "learning_rate": 4.248842983014789e-05, - "loss": 0.2898, + "epoch": 1.412945543662378, + "grad_norm": 0.1848604828119278, + "learning_rate": 4.209695895305403e-05, + "loss": 0.4002, "step": 39205 }, { - "epoch": 1.38, - "learning_rate": 4.2486394031216766e-05, - "loss": 0.2735, + "epoch": 1.4131257433236026, + "grad_norm": 0.14384835958480835, + "learning_rate": 4.209482977781202e-05, + "loss": 0.4028, "step": 39210 }, { - "epoch": 1.38, - "learning_rate": 4.248435800523636e-05, - "loss": 0.2879, + "epoch": 1.4133059429848271, + "grad_norm": 0.15797820687294006, + "learning_rate": 4.209270036965627e-05, + "loss": 0.4436, "step": 39215 }, { - "epoch": 1.38, - "learning_rate": 4.248232175223307e-05, - "loss": 0.2592, + "epoch": 1.4134861426460519, + "grad_norm": 0.17530421912670135, + "learning_rate": 4.2090570728615774e-05, + "loss": 0.4092, "step": 39220 }, { - "epoch": 1.38, - "learning_rate": 4.248028527223335e-05, - "loss": 0.2795, + "epoch": 1.4136663423072764, + "grad_norm": 0.17297030985355377, + "learning_rate": 4.208844085471957e-05, + "loss": 0.4609, "step": 39225 }, { - "epoch": 1.38, - "learning_rate": 4.247824856526366e-05, - "loss": 0.2916, + "epoch": 1.4138465419685011, + "grad_norm": 0.1550433486700058, + "learning_rate": 4.2086310747996674e-05, + "loss": 0.4124, "step": 39230 }, { - "epoch": 1.38, - "learning_rate": 4.247621163135043e-05, - "loss": 0.2857, + "epoch": 
1.4140267416297259, + "grad_norm": 0.19032645225524902, + "learning_rate": 4.2084180408476094e-05, + "loss": 0.4298, "step": 39235 }, { - "epoch": 1.38, - "learning_rate": 4.2474174470520103e-05, - "loss": 0.2693, + "epoch": 1.4142069412909504, + "grad_norm": 0.18603280186653137, + "learning_rate": 4.208204983618687e-05, + "loss": 0.4064, "step": 39240 }, { - "epoch": 1.38, - "learning_rate": 4.247213708279913e-05, - "loss": 0.2577, + "epoch": 1.4143871409521749, + "grad_norm": 0.21045638620853424, + "learning_rate": 4.2079919031158014e-05, + "loss": 0.435, "step": 39245 }, { - "epoch": 1.38, - "learning_rate": 4.247009946821398e-05, - "loss": 0.2838, + "epoch": 1.4145673406133996, + "grad_norm": 0.22773757576942444, + "learning_rate": 4.2077787993418574e-05, + "loss": 0.4148, "step": 39250 }, { - "epoch": 1.38, - "learning_rate": 4.24680616267911e-05, - "loss": 0.2802, + "epoch": 1.4147475402746243, + "grad_norm": 0.22381030023097992, + "learning_rate": 4.2075656722997583e-05, + "loss": 0.3881, "step": 39255 }, { - "epoch": 1.38, - "learning_rate": 4.246602355855696e-05, - "loss": 0.2706, + "epoch": 1.4149277399358489, + "grad_norm": 0.2066371738910675, + "learning_rate": 4.207352521992407e-05, + "loss": 0.4107, "step": 39260 }, { - "epoch": 1.38, - "learning_rate": 4.2463985263537995e-05, - "loss": 0.2977, + "epoch": 1.4151079395970736, + "grad_norm": 0.1946861892938614, + "learning_rate": 4.207139348422708e-05, + "loss": 0.4056, "step": 39265 }, { - "epoch": 1.38, - "learning_rate": 4.2461946741760704e-05, - "loss": 0.2698, + "epoch": 1.415288139258298, + "grad_norm": 0.16008664667606354, + "learning_rate": 4.2069261515935656e-05, + "loss": 0.4325, "step": 39270 }, { - "epoch": 1.38, - "learning_rate": 4.2459907993251545e-05, - "loss": 0.2963, + "epoch": 1.4154683389195228, + "grad_norm": 0.20454731583595276, + "learning_rate": 4.206712931507886e-05, + "loss": 0.3848, "step": 39275 }, { - "epoch": 1.38, - "learning_rate": 4.245786901803698e-05, - "loss": 0.2669, + "epoch": 1.4156485385807476, + "grad_norm": 0.19149497151374817, + "learning_rate": 4.206499688168572e-05, + "loss": 0.4211, "step": 39280 }, { - "epoch": 1.38, - "learning_rate": 4.245582981614349e-05, - "loss": 0.3193, + "epoch": 1.415828738241972, + "grad_norm": 0.18504855036735535, + "learning_rate": 4.2062864215785304e-05, + "loss": 0.4501, "step": 39285 }, { - "epoch": 1.38, - "learning_rate": 4.245379038759756e-05, - "loss": 0.2918, + "epoch": 1.4160089379031968, + "grad_norm": 0.18371212482452393, + "learning_rate": 4.206073131740668e-05, + "loss": 0.4172, "step": 39290 }, { - "epoch": 1.38, - "learning_rate": 4.2451750732425665e-05, - "loss": 0.2896, + "epoch": 1.4161891375644213, + "grad_norm": 0.17143423855304718, + "learning_rate": 4.205859818657888e-05, + "loss": 0.3977, "step": 39295 }, { - "epoch": 1.38, - "learning_rate": 4.244971085065429e-05, - "loss": 0.295, + "epoch": 1.416369337225646, + "grad_norm": 0.18957088887691498, + "learning_rate": 4.205646482333098e-05, + "loss": 0.4387, "step": 39300 }, { - "epoch": 1.38, - "learning_rate": 4.244767074230991e-05, - "loss": 0.2789, + "epoch": 1.4165495368868706, + "grad_norm": 0.18221431970596313, + "learning_rate": 4.205433122769206e-05, + "loss": 0.3961, "step": 39305 }, { - "epoch": 1.38, - "learning_rate": 4.244563040741903e-05, - "loss": 0.2951, + "epoch": 1.4167297365480953, + "grad_norm": 0.1832798719406128, + "learning_rate": 4.2052197399691174e-05, + "loss": 0.4188, "step": 39310 }, { - "epoch": 1.38, - "learning_rate": 4.244358984600813e-05, - "loss": 
0.2768, + "epoch": 1.4169099362093198, + "grad_norm": 0.16603875160217285, + "learning_rate": 4.205006333935739e-05, + "loss": 0.3877, "step": 39315 }, { - "epoch": 1.38, - "learning_rate": 4.244154905810371e-05, - "loss": 0.302, + "epoch": 1.4170901358705446, + "grad_norm": 0.17114681005477905, + "learning_rate": 4.204792904671981e-05, + "loss": 0.4252, "step": 39320 }, { - "epoch": 1.38, - "learning_rate": 4.243950804373228e-05, - "loss": 0.2966, + "epoch": 1.4172703355317693, + "grad_norm": 0.16899357736110687, + "learning_rate": 4.204579452180749e-05, + "loss": 0.392, "step": 39325 }, { - "epoch": 1.38, - "learning_rate": 4.243746680292033e-05, - "loss": 0.2965, + "epoch": 1.4174505351929938, + "grad_norm": 0.20568254590034485, + "learning_rate": 4.2043659764649527e-05, + "loss": 0.4218, "step": 39330 }, { - "epoch": 1.38, - "learning_rate": 4.2435425335694355e-05, - "loss": 0.2845, + "epoch": 1.4176307348542185, + "grad_norm": 0.217452272772789, + "learning_rate": 4.2041524775274985e-05, + "loss": 0.3834, "step": 39335 }, { - "epoch": 1.38, - "learning_rate": 4.2433383642080884e-05, - "loss": 0.3013, + "epoch": 1.417810934515443, + "grad_norm": 0.18286210298538208, + "learning_rate": 4.2039389553712986e-05, + "loss": 0.4243, "step": 39340 }, { - "epoch": 1.38, - "learning_rate": 4.243134172210641e-05, - "loss": 0.2755, + "epoch": 1.4179911341766678, + "grad_norm": 0.176735058426857, + "learning_rate": 4.2037254099992584e-05, + "loss": 0.392, "step": 39345 }, { - "epoch": 1.38, - "learning_rate": 4.2429299575797454e-05, - "loss": 0.2901, + "epoch": 1.4181713338378925, + "grad_norm": 0.18584541976451874, + "learning_rate": 4.2035118414142905e-05, + "loss": 0.415, "step": 39350 }, { - "epoch": 1.38, - "learning_rate": 4.2427257203180526e-05, - "loss": 0.2814, + "epoch": 1.418351533499117, + "grad_norm": 0.20618844032287598, + "learning_rate": 4.203298249619303e-05, + "loss": 0.4303, "step": 39355 }, { - "epoch": 1.38, - "learning_rate": 4.242521460428215e-05, - "loss": 0.2813, + "epoch": 1.4185317331603415, + "grad_norm": 0.1963784545660019, + "learning_rate": 4.203084634617207e-05, + "loss": 0.418, "step": 39360 }, { - "epoch": 1.38, - "learning_rate": 4.242317177912885e-05, - "loss": 0.2902, + "epoch": 1.4187119328215663, + "grad_norm": 0.19487901031970978, + "learning_rate": 4.202870996410913e-05, + "loss": 0.4129, "step": 39365 }, { - "epoch": 1.39, - "learning_rate": 4.2421128727747136e-05, - "loss": 0.2907, + "epoch": 1.418892132482791, + "grad_norm": 0.19040003418922424, + "learning_rate": 4.2026573350033304e-05, + "loss": 0.4042, "step": 39370 }, { - "epoch": 1.39, - "learning_rate": 4.2419085450163555e-05, - "loss": 0.2991, + "epoch": 1.4190723321440155, + "grad_norm": 0.21092675626277924, + "learning_rate": 4.2024436503973716e-05, + "loss": 0.4216, "step": 39375 }, { - "epoch": 1.39, - "learning_rate": 4.2417041946404626e-05, - "loss": 0.2856, + "epoch": 1.4192525318052402, + "grad_norm": 0.17777122557163239, + "learning_rate": 4.2022299425959476e-05, + "loss": 0.4004, "step": 39380 }, { - "epoch": 1.39, - "learning_rate": 4.241499821649689e-05, - "loss": 0.2923, + "epoch": 1.4194327314664648, + "grad_norm": 0.1416267305612564, + "learning_rate": 4.2020162116019706e-05, + "loss": 0.3728, "step": 39385 }, { - "epoch": 1.39, - "learning_rate": 4.2412954260466876e-05, - "loss": 0.3112, + "epoch": 1.4196129311276895, + "grad_norm": 0.1654488891363144, + "learning_rate": 4.2018024574183525e-05, + "loss": 0.412, "step": 39390 }, { - "epoch": 1.39, - "learning_rate": 
4.241091007834113e-05, - "loss": 0.2896, + "epoch": 1.4197931307889142, + "grad_norm": 0.19146573543548584, + "learning_rate": 4.2015886800480044e-05, + "loss": 0.393, "step": 39395 }, { - "epoch": 1.39, - "learning_rate": 4.240886567014618e-05, - "loss": 0.3001, + "epoch": 1.4199733304501387, + "grad_norm": 0.17153365910053253, + "learning_rate": 4.20137487949384e-05, + "loss": 0.4341, "step": 39400 }, { - "epoch": 1.39, - "learning_rate": 4.240682103590859e-05, - "loss": 0.291, + "epoch": 1.4201535301113635, + "grad_norm": 0.24907280504703522, + "learning_rate": 4.201161055758773e-05, + "loss": 0.4512, "step": 39405 }, { - "epoch": 1.39, - "learning_rate": 4.2404776175654906e-05, - "loss": 0.2911, + "epoch": 1.420333729772588, + "grad_norm": 0.1950104534626007, + "learning_rate": 4.200947208845716e-05, + "loss": 0.438, "step": 39410 }, { - "epoch": 1.39, - "learning_rate": 4.2402731089411664e-05, - "loss": 0.2845, + "epoch": 1.4205139294338127, + "grad_norm": 0.16665388643741608, + "learning_rate": 4.200733338757582e-05, + "loss": 0.4124, "step": 39415 }, { - "epoch": 1.39, - "learning_rate": 4.240068577720543e-05, - "loss": 0.3, + "epoch": 1.4206941290950372, + "grad_norm": 0.18930195271968842, + "learning_rate": 4.2005194454972864e-05, + "loss": 0.4393, "step": 39420 }, { - "epoch": 1.39, - "learning_rate": 4.239864023906276e-05, - "loss": 0.2983, + "epoch": 1.420874328756262, + "grad_norm": 0.1515640914440155, + "learning_rate": 4.2003055290677416e-05, + "loss": 0.3813, "step": 39425 }, { - "epoch": 1.39, - "learning_rate": 4.239659447501021e-05, - "loss": 0.277, + "epoch": 1.4210545284174865, + "grad_norm": 0.14799198508262634, + "learning_rate": 4.200091589471863e-05, + "loss": 0.4036, "step": 39430 }, { - "epoch": 1.39, - "learning_rate": 4.239454848507435e-05, - "loss": 0.2754, + "epoch": 1.4212347280787112, + "grad_norm": 0.18082498013973236, + "learning_rate": 4.199877626712567e-05, + "loss": 0.4354, "step": 39435 }, { - "epoch": 1.39, - "learning_rate": 4.239250226928174e-05, - "loss": 0.2756, + "epoch": 1.421414927739936, + "grad_norm": 0.1811361163854599, + "learning_rate": 4.199663640792767e-05, + "loss": 0.4331, "step": 39440 }, { - "epoch": 1.39, - "learning_rate": 4.239045582765895e-05, - "loss": 0.2923, + "epoch": 1.4215951274011605, + "grad_norm": 0.1772868037223816, + "learning_rate": 4.199449631715378e-05, + "loss": 0.3759, "step": 39445 }, { - "epoch": 1.39, - "learning_rate": 4.2388409160232545e-05, - "loss": 0.2859, + "epoch": 1.4217753270623852, + "grad_norm": 0.17968173325061798, + "learning_rate": 4.1992355994833175e-05, + "loss": 0.3889, "step": 39450 }, { - "epoch": 1.39, - "learning_rate": 4.2386362267029116e-05, - "loss": 0.2837, + "epoch": 1.4219555267236097, + "grad_norm": 0.2115190178155899, + "learning_rate": 4.199021544099501e-05, + "loss": 0.3891, "step": 39455 }, { - "epoch": 1.39, - "learning_rate": 4.2384315148075225e-05, - "loss": 0.2795, + "epoch": 1.4221357263848344, + "grad_norm": 0.2003733217716217, + "learning_rate": 4.1988074655668446e-05, + "loss": 0.4274, "step": 39460 }, { - "epoch": 1.39, - "learning_rate": 4.2382267803397454e-05, - "loss": 0.2934, + "epoch": 1.4223159260460592, + "grad_norm": 0.1704542636871338, + "learning_rate": 4.1985933638882655e-05, + "loss": 0.4479, "step": 39465 }, { - "epoch": 1.39, - "learning_rate": 4.23802202330224e-05, - "loss": 0.2652, + "epoch": 1.4224961257072837, + "grad_norm": 0.18098214268684387, + "learning_rate": 4.198379239066681e-05, + "loss": 0.4312, "step": 39470 }, { - "epoch": 1.39, - 
"learning_rate": 4.2378172436976644e-05, - "loss": 0.3227, + "epoch": 1.4226763253685082, + "grad_norm": 0.17561987042427063, + "learning_rate": 4.198165091105007e-05, + "loss": 0.396, "step": 39475 }, { - "epoch": 1.39, - "learning_rate": 4.2376124415286764e-05, - "loss": 0.2581, + "epoch": 1.422856525029733, + "grad_norm": 0.14922846853733063, + "learning_rate": 4.197950920006164e-05, + "loss": 0.4461, "step": 39480 }, { - "epoch": 1.39, - "learning_rate": 4.237407616797936e-05, - "loss": 0.2916, + "epoch": 1.4230367246909577, + "grad_norm": 0.189297616481781, + "learning_rate": 4.197736725773068e-05, + "loss": 0.3796, "step": 39485 }, { - "epoch": 1.39, - "learning_rate": 4.237202769508103e-05, - "loss": 0.2714, + "epoch": 1.4232169243521822, + "grad_norm": 0.18948666751384735, + "learning_rate": 4.197522508408637e-05, + "loss": 0.4309, "step": 39490 }, { - "epoch": 1.39, - "learning_rate": 4.2369978996618365e-05, - "loss": 0.2715, + "epoch": 1.423397124013407, + "grad_norm": 0.21630705893039703, + "learning_rate": 4.197308267915791e-05, + "loss": 0.4242, "step": 39495 }, { - "epoch": 1.39, - "learning_rate": 4.236793007261798e-05, - "loss": 0.2822, + "epoch": 1.4235773236746314, + "grad_norm": 0.2401585429906845, + "learning_rate": 4.1970940042974485e-05, + "loss": 0.4483, "step": 39500 }, { - "epoch": 1.39, - "eval_loss": 0.28314095735549927, - "eval_runtime": 10.5583, - "eval_samples_per_second": 9.471, - "eval_steps_per_second": 9.471, + "epoch": 1.4235773236746314, + "eval_loss": 0.44728443026542664, + "eval_runtime": 3.5187, + "eval_samples_per_second": 28.419, + "eval_steps_per_second": 7.105, "step": 39500 }, { - "epoch": 1.39, - "learning_rate": 4.236588092310646e-05, - "loss": 0.2802, + "epoch": 1.4237575233358561, + "grad_norm": 0.2015465348958969, + "learning_rate": 4.196879717556529e-05, + "loss": 0.4352, "step": 39505 }, { - "epoch": 1.39, - "learning_rate": 4.236383154811042e-05, - "loss": 0.284, + "epoch": 1.4239377229970809, + "grad_norm": 0.17503204941749573, + "learning_rate": 4.1966654076959516e-05, + "loss": 0.3889, "step": 39510 }, { - "epoch": 1.39, - "learning_rate": 4.236178194765647e-05, - "loss": 0.3279, + "epoch": 1.4241179226583054, + "grad_norm": 0.15715694427490234, + "learning_rate": 4.196451074718637e-05, + "loss": 0.4131, "step": 39515 }, { - "epoch": 1.39, - "learning_rate": 4.2359732121771226e-05, - "loss": 0.2795, + "epoch": 1.42429812231953, + "grad_norm": 0.20377200841903687, + "learning_rate": 4.196236718627504e-05, + "loss": 0.4144, "step": 39520 }, { - "epoch": 1.39, - "learning_rate": 4.23576820704813e-05, - "loss": 0.2839, + "epoch": 1.4244783219807546, + "grad_norm": 0.18509119749069214, + "learning_rate": 4.196022339425475e-05, + "loss": 0.4244, "step": 39525 }, { - "epoch": 1.39, - "learning_rate": 4.235563179381331e-05, - "loss": 0.271, + "epoch": 1.4246585216419794, + "grad_norm": 0.1515013426542282, + "learning_rate": 4.195807937115469e-05, + "loss": 0.4429, "step": 39530 }, { - "epoch": 1.39, - "learning_rate": 4.235358129179389e-05, - "loss": 0.2893, + "epoch": 1.4248387213032039, + "grad_norm": 0.2172863632440567, + "learning_rate": 4.1955935117004095e-05, + "loss": 0.3957, "step": 39535 }, { - "epoch": 1.39, - "learning_rate": 4.2351530564449635e-05, - "loss": 0.2983, + "epoch": 1.4250189209644286, + "grad_norm": 0.16076506674289703, + "learning_rate": 4.1953790631832156e-05, + "loss": 0.426, "step": 39540 }, { - "epoch": 1.39, - "learning_rate": 4.23494796118072e-05, - "loss": 0.2856, + "epoch": 1.4251991206256531, + "grad_norm": 
0.24314238131046295, + "learning_rate": 4.1951645915668105e-05, + "loss": 0.4287, "step": 39545 }, { - "epoch": 1.39, - "learning_rate": 4.234742843389321e-05, - "loss": 0.2899, + "epoch": 1.4253793202868779, + "grad_norm": 0.22566382586956024, + "learning_rate": 4.1949500968541154e-05, + "loss": 0.4242, "step": 39550 }, { - "epoch": 1.39, - "learning_rate": 4.234537703073429e-05, - "loss": 0.2832, + "epoch": 1.4255595199481026, + "grad_norm": 0.1787545531988144, + "learning_rate": 4.194735579048055e-05, + "loss": 0.4171, "step": 39555 }, { - "epoch": 1.39, - "learning_rate": 4.234332540235708e-05, - "loss": 0.3104, + "epoch": 1.425739719609327, + "grad_norm": 0.142437145113945, + "learning_rate": 4.1945210381515485e-05, + "loss": 0.4205, "step": 39560 }, { - "epoch": 1.39, - "learning_rate": 4.234127354878822e-05, - "loss": 0.2856, + "epoch": 1.4259199192705518, + "grad_norm": 0.16321620345115662, + "learning_rate": 4.194306474167522e-05, + "loss": 0.4164, "step": 39565 }, { - "epoch": 1.39, - "learning_rate": 4.2339221470054355e-05, - "loss": 0.2836, + "epoch": 1.4261001189317764, + "grad_norm": 0.24486680328845978, + "learning_rate": 4.1940918870988976e-05, + "loss": 0.4363, "step": 39570 }, { - "epoch": 1.39, - "learning_rate": 4.2337169166182125e-05, - "loss": 0.2826, + "epoch": 1.426280318593001, + "grad_norm": 0.23074302077293396, + "learning_rate": 4.1938772769486e-05, + "loss": 0.4505, "step": 39575 }, { - "epoch": 1.39, - "learning_rate": 4.233511663719817e-05, - "loss": 0.2842, + "epoch": 1.4264605182542258, + "grad_norm": 0.1574547439813614, + "learning_rate": 4.193662643719552e-05, + "loss": 0.3959, "step": 39580 }, { - "epoch": 1.39, - "learning_rate": 4.2333063883129156e-05, - "loss": 0.2884, + "epoch": 1.4266407179154503, + "grad_norm": 0.18195997178554535, + "learning_rate": 4.193447987414678e-05, + "loss": 0.414, "step": 39585 }, { - "epoch": 1.39, - "learning_rate": 4.233101090400173e-05, - "loss": 0.2684, + "epoch": 1.4268209175766748, + "grad_norm": 0.19354818761348724, + "learning_rate": 4.1932333080369036e-05, + "loss": 0.4372, "step": 39590 }, { - "epoch": 1.39, - "learning_rate": 4.232895769984255e-05, - "loss": 0.2712, + "epoch": 1.4270011172378996, + "grad_norm": 0.1999470740556717, + "learning_rate": 4.1930186055891525e-05, + "loss": 0.4069, "step": 39595 }, { - "epoch": 1.39, - "learning_rate": 4.232690427067827e-05, - "loss": 0.2711, + "epoch": 1.4271813168991243, + "grad_norm": 0.18640463054180145, + "learning_rate": 4.1928038800743504e-05, + "loss": 0.4398, "step": 39600 }, { - "epoch": 1.39, - "learning_rate": 4.232485061653556e-05, - "loss": 0.3023, + "epoch": 1.4273615165603488, + "grad_norm": 0.18737342953681946, + "learning_rate": 4.192589131495424e-05, + "loss": 0.4102, "step": 39605 }, { - "epoch": 1.39, - "learning_rate": 4.232279673744108e-05, - "loss": 0.292, + "epoch": 1.4275417162215736, + "grad_norm": 0.16932520270347595, + "learning_rate": 4.1923743598552984e-05, + "loss": 0.4147, "step": 39610 }, { - "epoch": 1.39, - "learning_rate": 4.232074263342151e-05, - "loss": 0.2956, + "epoch": 1.427721915882798, + "grad_norm": 0.14350475370883942, + "learning_rate": 4.192159565156899e-05, + "loss": 0.4086, "step": 39615 }, { - "epoch": 1.39, - "learning_rate": 4.2318688304503505e-05, - "loss": 0.3008, + "epoch": 1.4279021155440228, + "grad_norm": 0.19839464128017426, + "learning_rate": 4.1919447474031546e-05, + "loss": 0.4067, "step": 39620 }, { - "epoch": 1.39, - "learning_rate": 4.231663375071375e-05, - "loss": 0.2826, + "epoch": 1.4280823152052475, 
+ "grad_norm": 0.17800691723823547, + "learning_rate": 4.19172990659699e-05, + "loss": 0.4202, "step": 39625 }, { - "epoch": 1.39, - "learning_rate": 4.231457897207891e-05, - "loss": 0.2865, + "epoch": 1.428262514866472, + "grad_norm": 0.19023603200912476, + "learning_rate": 4.191515042741332e-05, + "loss": 0.394, "step": 39630 }, { - "epoch": 1.39, - "learning_rate": 4.231252396862568e-05, - "loss": 0.2763, + "epoch": 1.4284427145276966, + "grad_norm": 0.14899520576000214, + "learning_rate": 4.1913001558391095e-05, + "loss": 0.4398, "step": 39635 }, { - "epoch": 1.39, - "learning_rate": 4.2310468740380725e-05, - "loss": 0.315, + "epoch": 1.4286229141889213, + "grad_norm": 0.1971236616373062, + "learning_rate": 4.19108524589325e-05, + "loss": 0.407, "step": 39640 }, { - "epoch": 1.39, - "learning_rate": 4.230841328737075e-05, - "loss": 0.3025, + "epoch": 1.428803113850146, + "grad_norm": 0.16257238388061523, + "learning_rate": 4.190870312906682e-05, + "loss": 0.4173, "step": 39645 }, { - "epoch": 1.39, - "learning_rate": 4.230635760962244e-05, - "loss": 0.2954, + "epoch": 1.4289833135113705, + "grad_norm": 0.19401347637176514, + "learning_rate": 4.190655356882332e-05, + "loss": 0.4268, "step": 39650 }, { - "epoch": 1.4, - "learning_rate": 4.2304301707162475e-05, - "loss": 0.2741, + "epoch": 1.4291635131725953, + "grad_norm": 0.16988852620124817, + "learning_rate": 4.1904403778231313e-05, + "loss": 0.4242, "step": 39655 }, { - "epoch": 1.4, - "learning_rate": 4.230224558001755e-05, - "loss": 0.2777, + "epoch": 1.4293437128338198, + "grad_norm": 0.16938932240009308, + "learning_rate": 4.1902253757320074e-05, + "loss": 0.4149, "step": 39660 }, { - "epoch": 1.4, - "learning_rate": 4.230018922821438e-05, - "loss": 0.2835, + "epoch": 1.4295239124950445, + "grad_norm": 0.2068401277065277, + "learning_rate": 4.190010350611889e-05, + "loss": 0.4358, "step": 39665 }, { - "epoch": 1.4, - "learning_rate": 4.229813265177965e-05, - "loss": 0.2953, + "epoch": 1.4297041121562692, + "grad_norm": 0.1826406568288803, + "learning_rate": 4.1897953024657084e-05, + "loss": 0.4312, "step": 39670 }, { - "epoch": 1.4, - "learning_rate": 4.2296075850740066e-05, - "loss": 0.2712, + "epoch": 1.4298843118174938, + "grad_norm": 0.17122584581375122, + "learning_rate": 4.189580231296393e-05, + "loss": 0.44, "step": 39675 }, { - "epoch": 1.4, - "learning_rate": 4.229401882512234e-05, - "loss": 0.3011, + "epoch": 1.4300645114787185, + "grad_norm": 0.18926846981048584, + "learning_rate": 4.1893651371068743e-05, + "loss": 0.4346, "step": 39680 }, { - "epoch": 1.4, - "learning_rate": 4.229196157495318e-05, - "loss": 0.2996, + "epoch": 1.430244711139943, + "grad_norm": 0.16383080184459686, + "learning_rate": 4.1891500199000827e-05, + "loss": 0.3993, "step": 39685 }, { - "epoch": 1.4, - "learning_rate": 4.228990410025928e-05, - "loss": 0.2695, + "epoch": 1.4304249108011677, + "grad_norm": 0.1707577407360077, + "learning_rate": 4.1889348796789484e-05, + "loss": 0.3975, "step": 39690 }, { - "epoch": 1.4, - "learning_rate": 4.2287846401067374e-05, - "loss": 0.2822, + "epoch": 1.4306051104623922, + "grad_norm": 0.20824764668941498, + "learning_rate": 4.188719716446404e-05, + "loss": 0.417, "step": 39695 }, { - "epoch": 1.4, - "learning_rate": 4.228578847740419e-05, - "loss": 0.2804, + "epoch": 1.430785310123617, + "grad_norm": 0.15967229008674622, + "learning_rate": 4.188504530205381e-05, + "loss": 0.404, "step": 39700 }, { - "epoch": 1.4, - "learning_rate": 4.2283730329296417e-05, - "loss": 0.2755, + "epoch": 1.4309655097848415, + 
"grad_norm": 0.18324309587478638, + "learning_rate": 4.1882893209588104e-05, + "loss": 0.4453, "step": 39705 }, { - "epoch": 1.4, - "learning_rate": 4.228167195677081e-05, - "loss": 0.3029, + "epoch": 1.4311457094460662, + "grad_norm": 0.1739649921655655, + "learning_rate": 4.188074088709624e-05, + "loss": 0.4023, "step": 39710 }, { - "epoch": 1.4, - "learning_rate": 4.227961335985407e-05, - "loss": 0.2772, + "epoch": 1.431325909107291, + "grad_norm": 0.17934979498386383, + "learning_rate": 4.187858833460755e-05, + "loss": 0.4329, "step": 39715 }, { - "epoch": 1.4, - "learning_rate": 4.227755453857295e-05, - "loss": 0.2997, + "epoch": 1.4315061087685155, + "grad_norm": 0.19043073058128357, + "learning_rate": 4.187643555215137e-05, + "loss": 0.3626, "step": 39720 }, { - "epoch": 1.4, - "learning_rate": 4.227549549295415e-05, - "loss": 0.2893, + "epoch": 1.4316863084297402, + "grad_norm": 0.1956600397825241, + "learning_rate": 4.187428253975702e-05, + "loss": 0.4218, "step": 39725 }, { - "epoch": 1.4, - "learning_rate": 4.2273436223024445e-05, - "loss": 0.2749, + "epoch": 1.4318665080909647, + "grad_norm": 0.16748665273189545, + "learning_rate": 4.1872129297453835e-05, + "loss": 0.4228, "step": 39730 }, { - "epoch": 1.4, - "learning_rate": 4.227137672881055e-05, - "loss": 0.2954, + "epoch": 1.4320467077521895, + "grad_norm": 0.24490876495838165, + "learning_rate": 4.186997582527115e-05, + "loss": 0.4301, "step": 39735 }, { - "epoch": 1.4, - "learning_rate": 4.2269317010339215e-05, - "loss": 0.2996, + "epoch": 1.4322269074134142, + "grad_norm": 0.19111377000808716, + "learning_rate": 4.186782212323832e-05, + "loss": 0.4255, "step": 39740 }, { - "epoch": 1.4, - "learning_rate": 4.226725706763717e-05, - "loss": 0.2949, + "epoch": 1.4324071070746387, + "grad_norm": 0.236825630068779, + "learning_rate": 4.186566819138467e-05, + "loss": 0.4159, "step": 39745 }, { - "epoch": 1.4, - "learning_rate": 4.2265196900731176e-05, - "loss": 0.296, + "epoch": 1.4325873067358632, + "grad_norm": 0.17333528399467468, + "learning_rate": 4.186351402973956e-05, + "loss": 0.4423, "step": 39750 }, { - "epoch": 1.4, - "learning_rate": 4.226313650964798e-05, - "loss": 0.288, + "epoch": 1.432767506397088, + "grad_norm": 0.15992626547813416, + "learning_rate": 4.1861359638332343e-05, + "loss": 0.4119, "step": 39755 }, { - "epoch": 1.4, - "learning_rate": 4.226107589441432e-05, - "loss": 0.2819, + "epoch": 1.4329477060583127, + "grad_norm": 0.18964506685733795, + "learning_rate": 4.185920501719236e-05, + "loss": 0.4394, "step": 39760 }, { - "epoch": 1.4, - "learning_rate": 4.225901505505698e-05, - "loss": 0.2796, + "epoch": 1.4331279057195372, + "grad_norm": 0.1828775703907013, + "learning_rate": 4.185705016634897e-05, + "loss": 0.4115, "step": 39765 }, { - "epoch": 1.4, - "learning_rate": 4.2256953991602705e-05, - "loss": 0.262, + "epoch": 1.433308105380762, + "grad_norm": 0.17985029518604279, + "learning_rate": 4.1854895085831533e-05, + "loss": 0.4361, "step": 39770 }, { - "epoch": 1.4, - "learning_rate": 4.225489270407825e-05, - "loss": 0.2852, + "epoch": 1.4334883050419864, + "grad_norm": 0.19158676266670227, + "learning_rate": 4.1852739775669425e-05, + "loss": 0.4207, "step": 39775 }, { - "epoch": 1.4, - "learning_rate": 4.225283119251039e-05, - "loss": 0.2634, + "epoch": 1.4336685047032112, + "grad_norm": 0.1590370237827301, + "learning_rate": 4.185058423589199e-05, + "loss": 0.4402, "step": 39780 }, { - "epoch": 1.4, - "learning_rate": 4.225076945692587e-05, - "loss": 0.2904, + "epoch": 1.433848704364436, + 
"grad_norm": 0.17683321237564087, + "learning_rate": 4.184842846652861e-05, + "loss": 0.4283, "step": 39785 }, { - "epoch": 1.4, - "learning_rate": 4.224870749735149e-05, - "loss": 0.2756, + "epoch": 1.4340289040256604, + "grad_norm": 0.16492830216884613, + "learning_rate": 4.1846272467608655e-05, + "loss": 0.3904, "step": 39790 }, { - "epoch": 1.4, - "learning_rate": 4.224664531381402e-05, - "loss": 0.2935, + "epoch": 1.4342091036868851, + "grad_norm": 0.22784380614757538, + "learning_rate": 4.18441162391615e-05, + "loss": 0.4632, "step": 39795 }, { - "epoch": 1.4, - "learning_rate": 4.224458290634021e-05, - "loss": 0.2877, + "epoch": 1.4343893033481097, + "grad_norm": 0.17077568173408508, + "learning_rate": 4.184195978121652e-05, + "loss": 0.4248, "step": 39800 }, { - "epoch": 1.4, - "learning_rate": 4.224252027495686e-05, - "loss": 0.2858, + "epoch": 1.4345695030093344, + "grad_norm": 0.17400763928890228, + "learning_rate": 4.1839803093803106e-05, + "loss": 0.4172, "step": 39805 }, { - "epoch": 1.4, - "learning_rate": 4.224045741969075e-05, - "loss": 0.2794, + "epoch": 1.434749702670559, + "grad_norm": 0.1851196140050888, + "learning_rate": 4.183764617695063e-05, + "loss": 0.4743, "step": 39810 }, { - "epoch": 1.4, - "learning_rate": 4.223839434056866e-05, - "loss": 0.3027, + "epoch": 1.4349299023317836, + "grad_norm": 0.16248086094856262, + "learning_rate": 4.183548903068848e-05, + "loss": 0.4334, "step": 39815 }, { - "epoch": 1.4, - "learning_rate": 4.223633103761738e-05, - "loss": 0.2658, + "epoch": 1.4351101019930081, + "grad_norm": 0.19981388747692108, + "learning_rate": 4.1833331655046055e-05, + "loss": 0.4594, "step": 39820 }, { - "epoch": 1.4, - "learning_rate": 4.22342675108637e-05, - "loss": 0.2767, + "epoch": 1.4352903016542329, + "grad_norm": 0.21425208449363708, + "learning_rate": 4.1831174050052745e-05, + "loss": 0.4465, "step": 39825 }, { - "epoch": 1.4, - "learning_rate": 4.223220376033442e-05, - "loss": 0.2956, + "epoch": 1.4354705013154576, + "grad_norm": 0.16509120166301727, + "learning_rate": 4.182901621573795e-05, + "loss": 0.4481, "step": 39830 }, { - "epoch": 1.4, - "learning_rate": 4.223013978605633e-05, - "loss": 0.2936, + "epoch": 1.4356507009766821, + "grad_norm": 0.18470360338687897, + "learning_rate": 4.1826858152131064e-05, + "loss": 0.4352, "step": 39835 }, { - "epoch": 1.4, - "learning_rate": 4.222807558805622e-05, - "loss": 0.3004, + "epoch": 1.4358309006379069, + "grad_norm": 0.1776283085346222, + "learning_rate": 4.182469985926149e-05, + "loss": 0.3918, "step": 39840 }, { - "epoch": 1.4, - "learning_rate": 4.222601116636091e-05, - "loss": 0.2876, + "epoch": 1.4360111002991314, + "grad_norm": 0.16285589337348938, + "learning_rate": 4.1822541337158646e-05, + "loss": 0.4151, "step": 39845 }, { - "epoch": 1.4, - "learning_rate": 4.22239465209972e-05, - "loss": 0.2749, + "epoch": 1.436191299960356, + "grad_norm": 0.15581516921520233, + "learning_rate": 4.1820382585851925e-05, + "loss": 0.4004, "step": 39850 }, { - "epoch": 1.4, - "learning_rate": 4.222188165199189e-05, - "loss": 0.293, + "epoch": 1.4363714996215808, + "grad_norm": 0.15109944343566895, + "learning_rate": 4.1818223605370756e-05, + "loss": 0.4049, "step": 39855 }, { - "epoch": 1.4, - "learning_rate": 4.221981655937179e-05, - "loss": 0.3175, + "epoch": 1.4365516992828053, + "grad_norm": 0.1799648404121399, + "learning_rate": 4.1816064395744536e-05, + "loss": 0.4167, "step": 39860 }, { - "epoch": 1.4, - "learning_rate": 4.221775124316374e-05, - "loss": 0.2766, + "epoch": 1.4367318989440299, + 
"grad_norm": 0.19493626058101654, + "learning_rate": 4.181390495700271e-05, + "loss": 0.4467, "step": 39865 }, { - "epoch": 1.4, - "learning_rate": 4.2215685703394525e-05, - "loss": 0.2864, + "epoch": 1.4369120986052546, + "grad_norm": 0.21801850199699402, + "learning_rate": 4.181174528917468e-05, + "loss": 0.4298, "step": 39870 }, { - "epoch": 1.4, - "learning_rate": 4.221361994009098e-05, - "loss": 0.2821, + "epoch": 1.4370922982664793, + "grad_norm": 0.24591484665870667, + "learning_rate": 4.1809585392289865e-05, + "loss": 0.4069, "step": 39875 }, { - "epoch": 1.4, - "learning_rate": 4.2211553953279914e-05, - "loss": 0.2932, + "epoch": 1.4372724979277038, + "grad_norm": 0.16199952363967896, + "learning_rate": 4.180742526637771e-05, + "loss": 0.4273, "step": 39880 }, { - "epoch": 1.4, - "learning_rate": 4.220948774298817e-05, - "loss": 0.2625, + "epoch": 1.4374526975889286, + "grad_norm": 0.16908341646194458, + "learning_rate": 4.180526491146764e-05, + "loss": 0.4104, "step": 39885 }, { - "epoch": 1.4, - "learning_rate": 4.220742130924257e-05, - "loss": 0.2784, + "epoch": 1.437632897250153, + "grad_norm": 0.18602709472179413, + "learning_rate": 4.180310432758908e-05, + "loss": 0.4457, "step": 39890 }, { - "epoch": 1.4, - "learning_rate": 4.2205354652069946e-05, - "loss": 0.2753, + "epoch": 1.4378130969113778, + "grad_norm": 0.21784497797489166, + "learning_rate": 4.1800943514771486e-05, + "loss": 0.4454, "step": 39895 }, { - "epoch": 1.4, - "learning_rate": 4.2203287771497125e-05, - "loss": 0.3228, + "epoch": 1.4379932965726026, + "grad_norm": 0.166326642036438, + "learning_rate": 4.179878247304429e-05, + "loss": 0.4221, "step": 39900 }, { - "epoch": 1.4, - "learning_rate": 4.2201220667550955e-05, - "loss": 0.2661, + "epoch": 1.438173496233827, + "grad_norm": 0.17365773022174835, + "learning_rate": 4.1796621202436934e-05, + "loss": 0.3897, "step": 39905 }, { - "epoch": 1.4, - "learning_rate": 4.219915334025827e-05, - "loss": 0.2924, + "epoch": 1.4383536958950518, + "grad_norm": 0.231038898229599, + "learning_rate": 4.179445970297887e-05, + "loss": 0.4301, "step": 39910 }, { - "epoch": 1.4, - "learning_rate": 4.219708578964591e-05, - "loss": 0.2912, + "epoch": 1.4385338955562763, + "grad_norm": 0.19160155951976776, + "learning_rate": 4.179229797469954e-05, + "loss": 0.3991, "step": 39915 }, { - "epoch": 1.4, - "learning_rate": 4.2195018015740726e-05, - "loss": 0.2761, + "epoch": 1.438714095217501, + "grad_norm": 0.1742529571056366, + "learning_rate": 4.179013601762839e-05, + "loss": 0.4099, "step": 39920 }, { - "epoch": 1.4, - "learning_rate": 4.219295001856957e-05, - "loss": 0.2652, + "epoch": 1.4388942948787256, + "grad_norm": 0.17680296301841736, + "learning_rate": 4.17879738317949e-05, + "loss": 0.4349, "step": 39925 }, { - "epoch": 1.4, - "learning_rate": 4.219088179815928e-05, - "loss": 0.2966, + "epoch": 1.4390744945399503, + "grad_norm": 0.21508027613162994, + "learning_rate": 4.1785811417228513e-05, + "loss": 0.4284, "step": 39930 }, { - "epoch": 1.41, - "learning_rate": 4.218881335453673e-05, - "loss": 0.2974, + "epoch": 1.4392546942011748, + "grad_norm": 0.19093069434165955, + "learning_rate": 4.1783648773958706e-05, + "loss": 0.4541, "step": 39935 }, { - "epoch": 1.41, - "learning_rate": 4.218674468772875e-05, - "loss": 0.2852, + "epoch": 1.4394348938623995, + "grad_norm": 0.18556737899780273, + "learning_rate": 4.178148590201492e-05, + "loss": 0.416, "step": 39940 }, { - "epoch": 1.41, - "learning_rate": 4.218467579776223e-05, - "loss": 0.2864, + "epoch": 1.4396150935236243, 
+ "grad_norm": 0.17068710923194885, + "learning_rate": 4.177932280142665e-05, + "loss": 0.4588, "step": 39945 }, { - "epoch": 1.41, - "learning_rate": 4.218260668466402e-05, - "loss": 0.2779, + "epoch": 1.4397952931848488, + "grad_norm": 0.195718452334404, + "learning_rate": 4.177715947222334e-05, + "loss": 0.4256, "step": 39950 }, { - "epoch": 1.41, - "learning_rate": 4.218053734846098e-05, - "loss": 0.2762, + "epoch": 1.4399754928460735, + "grad_norm": 0.21339944005012512, + "learning_rate": 4.177499591443449e-05, + "loss": 0.4219, "step": 39955 }, { - "epoch": 1.41, - "learning_rate": 4.2178467789179986e-05, - "loss": 0.2944, + "epoch": 1.440155692507298, + "grad_norm": 0.21166032552719116, + "learning_rate": 4.1772832128089564e-05, + "loss": 0.4104, "step": 39960 }, { - "epoch": 1.41, - "learning_rate": 4.217639800684791e-05, - "loss": 0.2905, + "epoch": 1.4403358921685228, + "grad_norm": 0.1573396623134613, + "learning_rate": 4.177066811321805e-05, + "loss": 0.4435, "step": 39965 }, { - "epoch": 1.41, - "learning_rate": 4.217432800149162e-05, - "loss": 0.2807, + "epoch": 1.4405160918297475, + "grad_norm": 0.2001795619726181, + "learning_rate": 4.176850386984943e-05, + "loss": 0.3743, "step": 39970 }, { - "epoch": 1.41, - "learning_rate": 4.217267183664723e-05, - "loss": 0.2647, + "epoch": 1.440696291490972, + "grad_norm": 0.19709444046020508, + "learning_rate": 4.176633939801319e-05, + "loss": 0.4321, "step": 39975 }, { - "epoch": 1.41, - "learning_rate": 4.2170601429915105e-05, - "loss": 0.3072, + "epoch": 1.4408764911521965, + "grad_norm": 0.1635955274105072, + "learning_rate": 4.176417469773882e-05, + "loss": 0.4251, "step": 39980 }, { - "epoch": 1.41, - "learning_rate": 4.2168530800234035e-05, - "loss": 0.2968, + "epoch": 1.4410566908134212, + "grad_norm": 0.15509602427482605, + "learning_rate": 4.1762009769055835e-05, + "loss": 0.4027, "step": 39985 }, { - "epoch": 1.41, - "learning_rate": 4.216645994763091e-05, - "loss": 0.272, + "epoch": 1.441236890474646, + "grad_norm": 0.1558351367712021, + "learning_rate": 4.1759844611993685e-05, + "loss": 0.4319, "step": 39990 }, { - "epoch": 1.41, - "learning_rate": 4.2164388872132607e-05, - "loss": 0.2814, + "epoch": 1.4414170901358705, + "grad_norm": 0.19999557733535767, + "learning_rate": 4.175767922658191e-05, + "loss": 0.3745, "step": 39995 }, { - "epoch": 1.41, - "learning_rate": 4.216231757376603e-05, - "loss": 0.3099, + "epoch": 1.4415972897970952, + "grad_norm": 0.16591276228427887, + "learning_rate": 4.1755513612849993e-05, + "loss": 0.4251, "step": 40000 }, { - "epoch": 1.41, - "eval_loss": 0.2824369966983795, - "eval_runtime": 10.5321, - "eval_samples_per_second": 9.495, - "eval_steps_per_second": 9.495, + "epoch": 1.4415972897970952, + "eval_loss": 0.4461284279823303, + "eval_runtime": 3.5457, + "eval_samples_per_second": 28.203, + "eval_steps_per_second": 7.051, "step": 40000 }, { - "epoch": 1.41, - "learning_rate": 4.216024605255807e-05, - "loss": 0.2774, + "epoch": 1.4417774894583197, + "grad_norm": 0.1718062311410904, + "learning_rate": 4.1753347770827454e-05, + "loss": 0.4254, "step": 40005 }, { - "epoch": 1.41, - "learning_rate": 4.215817430853563e-05, - "loss": 0.2988, + "epoch": 1.4419576891195445, + "grad_norm": 0.19657757878303528, + "learning_rate": 4.17511817005438e-05, + "loss": 0.4246, "step": 40010 }, { - "epoch": 1.41, - "learning_rate": 4.21561023417256e-05, - "loss": 0.3029, + "epoch": 1.4421378887807692, + "grad_norm": 0.18001805245876312, + "learning_rate": 4.1749015402028526e-05, + "loss": 0.409, "step": 
40015 }, { - "epoch": 1.41, - "learning_rate": 4.215403015215489e-05, - "loss": 0.2871, + "epoch": 1.4423180884419937, + "grad_norm": 0.158418208360672, + "learning_rate": 4.174684887531116e-05, + "loss": 0.3819, "step": 40020 }, { - "epoch": 1.41, - "learning_rate": 4.21519577398504e-05, - "loss": 0.291, + "epoch": 1.4424982881032182, + "grad_norm": 0.23636119067668915, + "learning_rate": 4.174468212042123e-05, + "loss": 0.399, "step": 40025 }, { - "epoch": 1.41, - "learning_rate": 4.214988510483904e-05, - "loss": 0.2806, + "epoch": 1.442678487764443, + "grad_norm": 0.1882403939962387, + "learning_rate": 4.1742515137388246e-05, + "loss": 0.4072, "step": 40030 }, { - "epoch": 1.41, - "learning_rate": 4.214781224714773e-05, - "loss": 0.286, + "epoch": 1.4428586874256677, + "grad_norm": 0.17548276484012604, + "learning_rate": 4.174034792624173e-05, + "loss": 0.4078, "step": 40035 }, { - "epoch": 1.41, - "learning_rate": 4.214573916680338e-05, - "loss": 0.2788, + "epoch": 1.4430388870868922, + "grad_norm": 0.15663708746433258, + "learning_rate": 4.1738180487011214e-05, + "loss": 0.4162, "step": 40040 }, { - "epoch": 1.41, - "learning_rate": 4.2143665863832896e-05, - "loss": 0.2832, + "epoch": 1.443219086748117, + "grad_norm": 0.19377632439136505, + "learning_rate": 4.173601281972623e-05, + "loss": 0.4313, "step": 40045 }, { - "epoch": 1.41, - "learning_rate": 4.214159233826322e-05, - "loss": 0.2754, + "epoch": 1.4433992864093415, + "grad_norm": 0.22673170268535614, + "learning_rate": 4.173384492441632e-05, + "loss": 0.4314, "step": 40050 }, { - "epoch": 1.41, - "learning_rate": 4.2139518590121256e-05, - "loss": 0.2872, + "epoch": 1.4435794860705662, + "grad_norm": 0.15245167911052704, + "learning_rate": 4.173167680111101e-05, + "loss": 0.3985, "step": 40055 }, { - "epoch": 1.41, - "learning_rate": 4.213744461943395e-05, - "loss": 0.2831, + "epoch": 1.443759685731791, + "grad_norm": 0.1588246375322342, + "learning_rate": 4.1729508449839834e-05, + "loss": 0.42, "step": 40060 }, { - "epoch": 1.41, - "learning_rate": 4.213537042622821e-05, - "loss": 0.2739, + "epoch": 1.4439398853930154, + "grad_norm": 0.20271897315979004, + "learning_rate": 4.1727339870632345e-05, + "loss": 0.4485, "step": 40065 }, { - "epoch": 1.41, - "learning_rate": 4.213329601053098e-05, - "loss": 0.3059, + "epoch": 1.4441200850542402, + "grad_norm": 0.1787983775138855, + "learning_rate": 4.17251710635181e-05, + "loss": 0.3992, "step": 40070 }, { - "epoch": 1.41, - "learning_rate": 4.213122137236919e-05, - "loss": 0.2987, + "epoch": 1.4443002847154647, + "grad_norm": 0.17043855786323547, + "learning_rate": 4.1723002028526625e-05, + "loss": 0.4319, "step": 40075 }, { - "epoch": 1.41, - "learning_rate": 4.212914651176978e-05, - "loss": 0.2718, + "epoch": 1.4444804843766894, + "grad_norm": 0.1581939458847046, + "learning_rate": 4.172083276568749e-05, + "loss": 0.4217, "step": 40080 }, { - "epoch": 1.41, - "learning_rate": 4.212707142875969e-05, - "loss": 0.2723, + "epoch": 1.4446606840379141, + "grad_norm": 0.19269004464149475, + "learning_rate": 4.1718663275030246e-05, + "loss": 0.3794, "step": 40085 }, { - "epoch": 1.41, - "learning_rate": 4.212499612336587e-05, - "loss": 0.2886, + "epoch": 1.4448408836991387, + "grad_norm": 0.1823306381702423, + "learning_rate": 4.1716493556584455e-05, + "loss": 0.3788, "step": 40090 }, { - "epoch": 1.41, - "learning_rate": 4.212292059561526e-05, - "loss": 0.3042, + "epoch": 1.4450210833603632, + "grad_norm": 0.22795365750789642, + "learning_rate": 4.171432361037968e-05, + "loss": 0.4046, 
"step": 40095 }, { - "epoch": 1.41, - "learning_rate": 4.2120844845534816e-05, - "loss": 0.3038, + "epoch": 1.445201283021588, + "grad_norm": 0.1868722289800644, + "learning_rate": 4.1712153436445464e-05, + "loss": 0.4358, "step": 40100 }, { - "epoch": 1.41, - "learning_rate": 4.211876887315147e-05, - "loss": 0.293, + "epoch": 1.4453814826828126, + "grad_norm": 0.2065182775259018, + "learning_rate": 4.1709983034811406e-05, + "loss": 0.4051, "step": 40105 }, { - "epoch": 1.41, - "learning_rate": 4.211669267849221e-05, - "loss": 0.2877, + "epoch": 1.4455616823440371, + "grad_norm": 0.14879098534584045, + "learning_rate": 4.170781240550706e-05, + "loss": 0.3798, "step": 40110 }, { - "epoch": 1.41, - "learning_rate": 4.211461626158396e-05, - "loss": 0.2785, + "epoch": 1.4457418820052619, + "grad_norm": 0.15872523188591003, + "learning_rate": 4.170564154856201e-05, + "loss": 0.406, "step": 40115 }, { - "epoch": 1.41, - "learning_rate": 4.211253962245371e-05, - "loss": 0.297, + "epoch": 1.4459220816664864, + "grad_norm": 0.15561115741729736, + "learning_rate": 4.170347046400583e-05, + "loss": 0.3787, "step": 40120 }, { - "epoch": 1.41, - "learning_rate": 4.21104627611284e-05, - "loss": 0.2807, + "epoch": 1.4461022813277111, + "grad_norm": 0.22237730026245117, + "learning_rate": 4.170129915186809e-05, + "loss": 0.4211, "step": 40125 }, { - "epoch": 1.41, - "learning_rate": 4.210838567763501e-05, - "loss": 0.3068, + "epoch": 1.4462824809889359, + "grad_norm": 0.19073696434497833, + "learning_rate": 4.169912761217839e-05, + "loss": 0.4023, "step": 40130 }, { - "epoch": 1.41, - "learning_rate": 4.21063083720005e-05, - "loss": 0.2605, + "epoch": 1.4464626806501604, + "grad_norm": 0.18226531147956848, + "learning_rate": 4.16969558449663e-05, + "loss": 0.4144, "step": 40135 }, { - "epoch": 1.41, - "learning_rate": 4.210423084425186e-05, - "loss": 0.2947, + "epoch": 1.4466428803113849, + "grad_norm": 0.19188474118709564, + "learning_rate": 4.169478385026142e-05, + "loss": 0.4288, "step": 40140 }, { - "epoch": 1.41, - "learning_rate": 4.2102153094416055e-05, - "loss": 0.3167, + "epoch": 1.4468230799726096, + "grad_norm": 0.2194652110338211, + "learning_rate": 4.169261162809335e-05, + "loss": 0.4719, "step": 40145 }, { - "epoch": 1.41, - "learning_rate": 4.2100075122520055e-05, - "loss": 0.2757, + "epoch": 1.4470032796338343, + "grad_norm": 0.192906454205513, + "learning_rate": 4.169043917849168e-05, + "loss": 0.4207, "step": 40150 }, { - "epoch": 1.41, - "learning_rate": 4.2097996928590854e-05, - "loss": 0.2864, + "epoch": 1.4471834792950589, + "grad_norm": 0.17282317578792572, + "learning_rate": 4.1688266501486004e-05, + "loss": 0.388, "step": 40155 }, { - "epoch": 1.41, - "learning_rate": 4.209591851265543e-05, - "loss": 0.308, + "epoch": 1.4473636789562836, + "grad_norm": 0.2374262660741806, + "learning_rate": 4.168609359710592e-05, + "loss": 0.4231, "step": 40160 }, { - "epoch": 1.41, - "learning_rate": 4.209383987474077e-05, - "loss": 0.3065, + "epoch": 1.447543878617508, + "grad_norm": 0.14937429130077362, + "learning_rate": 4.1683920465381054e-05, + "loss": 0.4245, "step": 40165 }, { - "epoch": 1.41, - "learning_rate": 4.209176101487387e-05, - "loss": 0.3025, + "epoch": 1.4477240782787328, + "grad_norm": 0.15380804240703583, + "learning_rate": 4.1681747106340995e-05, + "loss": 0.4201, "step": 40170 }, { - "epoch": 1.41, - "learning_rate": 4.2089681933081716e-05, - "loss": 0.2801, + "epoch": 1.4479042779399576, + "grad_norm": 0.22721770405769348, + "learning_rate": 4.167957352001537e-05, + "loss": 
0.3973, "step": 40175 }, { - "epoch": 1.41, - "learning_rate": 4.20876026293913e-05, - "loss": 0.308, + "epoch": 1.448084477601182, + "grad_norm": 0.21499106287956238, + "learning_rate": 4.167739970643377e-05, + "loss": 0.4453, "step": 40180 }, { - "epoch": 1.41, - "learning_rate": 4.208552310382963e-05, - "loss": 0.3071, + "epoch": 1.4482646772624068, + "grad_norm": 0.19283875823020935, + "learning_rate": 4.167522566562584e-05, + "loss": 0.4062, "step": 40185 }, { - "epoch": 1.41, - "learning_rate": 4.2083443356423694e-05, - "loss": 0.2772, + "epoch": 1.4484448769236313, + "grad_norm": 0.1777193248271942, + "learning_rate": 4.167305139762119e-05, + "loss": 0.4055, "step": 40190 }, { - "epoch": 1.41, - "learning_rate": 4.2081363387200504e-05, - "loss": 0.2632, + "epoch": 1.448625076584856, + "grad_norm": 0.2283918261528015, + "learning_rate": 4.167087690244943e-05, + "loss": 0.4338, "step": 40195 }, { - "epoch": 1.41, - "learning_rate": 4.2079283196187077e-05, - "loss": 0.2998, + "epoch": 1.4488052762460806, + "grad_norm": 0.17080770432949066, + "learning_rate": 4.1668702180140206e-05, + "loss": 0.4003, "step": 40200 }, { - "epoch": 1.41, - "learning_rate": 4.207720278341041e-05, - "loss": 0.2964, + "epoch": 1.4489854759073053, + "grad_norm": 0.17411410808563232, + "learning_rate": 4.166652723072314e-05, + "loss": 0.4115, "step": 40205 }, { - "epoch": 1.41, - "learning_rate": 4.2075122148897516e-05, - "loss": 0.3021, + "epoch": 1.4491656755685298, + "grad_norm": 0.18166224658489227, + "learning_rate": 4.166435205422787e-05, + "loss": 0.4184, "step": 40210 }, { - "epoch": 1.41, - "learning_rate": 4.207304129267542e-05, - "loss": 0.295, + "epoch": 1.4493458752297546, + "grad_norm": 0.17343877255916595, + "learning_rate": 4.166217665068403e-05, + "loss": 0.4373, "step": 40215 }, { - "epoch": 1.42, - "learning_rate": 4.207096021477113e-05, - "loss": 0.2857, + "epoch": 1.4495260748909793, + "grad_norm": 0.1936635822057724, + "learning_rate": 4.166000102012126e-05, + "loss": 0.387, "step": 40220 }, { - "epoch": 1.42, - "learning_rate": 4.2068878915211665e-05, - "loss": 0.2762, + "epoch": 1.4497062745522038, + "grad_norm": 0.1682083159685135, + "learning_rate": 4.16578251625692e-05, + "loss": 0.422, "step": 40225 }, { - "epoch": 1.42, - "learning_rate": 4.206679739402406e-05, - "loss": 0.2891, + "epoch": 1.4498864742134285, + "grad_norm": 0.20389603078365326, + "learning_rate": 4.16556490780575e-05, + "loss": 0.4434, "step": 40230 }, { - "epoch": 1.42, - "learning_rate": 4.2064715651235344e-05, - "loss": 0.2594, + "epoch": 1.450066673874653, + "grad_norm": 0.16445006430149078, + "learning_rate": 4.16534727666158e-05, + "loss": 0.4393, "step": 40235 }, { - "epoch": 1.42, - "learning_rate": 4.2062633686872545e-05, - "loss": 0.2997, + "epoch": 1.4502468735358778, + "grad_norm": 0.1689777970314026, + "learning_rate": 4.165129622827376e-05, + "loss": 0.3975, "step": 40240 }, { - "epoch": 1.42, - "learning_rate": 4.2060551500962676e-05, - "loss": 0.2933, + "epoch": 1.4504270731971025, + "grad_norm": 0.17561562359333038, + "learning_rate": 4.164911946306104e-05, + "loss": 0.4218, "step": 40245 }, { - "epoch": 1.42, - "learning_rate": 4.2058469093532806e-05, - "loss": 0.2704, + "epoch": 1.450607272858327, + "grad_norm": 0.18199895322322845, + "learning_rate": 4.164694247100728e-05, + "loss": 0.4344, "step": 40250 }, { - "epoch": 1.42, - "learning_rate": 4.205638646460994e-05, - "loss": 0.2835, + "epoch": 1.4507874725195515, + "grad_norm": 0.17951489984989166, + "learning_rate": 4.164476525214216e-05, + 
"loss": 0.4282, "step": 40255 }, { - "epoch": 1.42, - "learning_rate": 4.2054303614221154e-05, - "loss": 0.2882, + "epoch": 1.4509676721807763, + "grad_norm": 0.1799135059118271, + "learning_rate": 4.1642587806495324e-05, + "loss": 0.4252, "step": 40260 }, { - "epoch": 1.42, - "learning_rate": 4.205222054239346e-05, - "loss": 0.2911, + "epoch": 1.451147871842001, + "grad_norm": 0.172772616147995, + "learning_rate": 4.1640410134096465e-05, + "loss": 0.413, "step": 40265 }, { - "epoch": 1.42, - "learning_rate": 4.205013724915393e-05, - "loss": 0.2964, + "epoch": 1.4513280715032255, + "grad_norm": 0.15084902942180634, + "learning_rate": 4.1638232234975225e-05, + "loss": 0.4336, "step": 40270 }, { - "epoch": 1.42, - "learning_rate": 4.2048053734529604e-05, - "loss": 0.2897, + "epoch": 1.4515082711644502, + "grad_norm": 0.17858850955963135, + "learning_rate": 4.163605410916131e-05, + "loss": 0.443, "step": 40275 }, { - "epoch": 1.42, - "learning_rate": 4.204596999854753e-05, - "loss": 0.2759, + "epoch": 1.4516884708256748, + "grad_norm": 0.1724681556224823, + "learning_rate": 4.163387575668437e-05, + "loss": 0.3995, "step": 40280 }, { - "epoch": 1.42, - "learning_rate": 4.204388604123477e-05, - "loss": 0.2827, + "epoch": 1.4518686704868995, + "grad_norm": 0.15848155319690704, + "learning_rate": 4.163169717757409e-05, + "loss": 0.3968, "step": 40285 }, { - "epoch": 1.42, - "learning_rate": 4.204180186261839e-05, - "loss": 0.2804, + "epoch": 1.4520488701481242, + "grad_norm": 0.17804880440235138, + "learning_rate": 4.162951837186016e-05, + "loss": 0.4272, "step": 40290 }, { - "epoch": 1.42, - "learning_rate": 4.2039717462725435e-05, - "loss": 0.2762, + "epoch": 1.4522290698093487, + "grad_norm": 0.19191712141036987, + "learning_rate": 4.1627339339572256e-05, + "loss": 0.4096, "step": 40295 }, { - "epoch": 1.42, - "learning_rate": 4.203763284158299e-05, - "loss": 0.3067, + "epoch": 1.4524092694705735, + "grad_norm": 0.19533273577690125, + "learning_rate": 4.1625160080740075e-05, + "loss": 0.4129, "step": 40300 }, { - "epoch": 1.42, - "learning_rate": 4.203554799921809e-05, - "loss": 0.3022, + "epoch": 1.452589469131798, + "grad_norm": 0.28732794523239136, + "learning_rate": 4.162298059539331e-05, + "loss": 0.4749, "step": 40305 }, { - "epoch": 1.42, - "learning_rate": 4.203346293565784e-05, - "loss": 0.3145, + "epoch": 1.4527696687930227, + "grad_norm": 0.16327139735221863, + "learning_rate": 4.1620800883561656e-05, + "loss": 0.4015, "step": 40310 }, { - "epoch": 1.42, - "learning_rate": 4.203137765092929e-05, - "loss": 0.2803, + "epoch": 1.4529498684542472, + "grad_norm": 0.1888490468263626, + "learning_rate": 4.16186209452748e-05, + "loss": 0.4243, "step": 40315 }, { - "epoch": 1.42, - "learning_rate": 4.202929214505954e-05, - "loss": 0.306, + "epoch": 1.453130068115472, + "grad_norm": 0.20011357963085175, + "learning_rate": 4.161644078056246e-05, + "loss": 0.4201, "step": 40320 }, { - "epoch": 1.42, - "learning_rate": 4.202720641807565e-05, - "loss": 0.2882, + "epoch": 1.4533102677766965, + "grad_norm": 0.1774764508008957, + "learning_rate": 4.161426038945432e-05, + "loss": 0.3993, "step": 40325 }, { - "epoch": 1.42, - "learning_rate": 4.20251204700047e-05, - "loss": 0.2888, + "epoch": 1.4534904674379212, + "grad_norm": 0.18601520359516144, + "learning_rate": 4.1612079771980106e-05, + "loss": 0.382, "step": 40330 }, { - "epoch": 1.42, - "learning_rate": 4.202303430087379e-05, - "loss": 0.286, + "epoch": 1.453670667099146, + "grad_norm": 0.18623565137386322, + "learning_rate": 
4.160989892816952e-05, + "loss": 0.4032, "step": 40335 }, { - "epoch": 1.42, - "learning_rate": 4.202094791070998e-05, - "loss": 0.2982, + "epoch": 1.4538508667603705, + "grad_norm": 0.16086627542972565, + "learning_rate": 4.160771785805228e-05, + "loss": 0.4054, "step": 40340 }, { - "epoch": 1.42, - "learning_rate": 4.201886129954039e-05, - "loss": 0.2865, + "epoch": 1.4540310664215952, + "grad_norm": 0.1630929857492447, + "learning_rate": 4.16055365616581e-05, + "loss": 0.4016, "step": 40345 }, { - "epoch": 1.42, - "learning_rate": 4.2016774467392106e-05, - "loss": 0.3012, + "epoch": 1.4542112660828197, + "grad_norm": 0.16197270154953003, + "learning_rate": 4.160335503901669e-05, + "loss": 0.4461, "step": 40350 }, { - "epoch": 1.42, - "learning_rate": 4.201468741429222e-05, - "loss": 0.2862, + "epoch": 1.4543914657440444, + "grad_norm": 0.160113126039505, + "learning_rate": 4.1601173290157794e-05, + "loss": 0.429, "step": 40355 }, { - "epoch": 1.42, - "learning_rate": 4.201260014026782e-05, - "loss": 0.2935, + "epoch": 1.4545716654052692, + "grad_norm": 0.20948779582977295, + "learning_rate": 4.159899131511111e-05, + "loss": 0.4058, "step": 40360 }, { - "epoch": 1.42, - "learning_rate": 4.2010512645346034e-05, - "loss": 0.2814, + "epoch": 1.4547518650664937, + "grad_norm": 0.19424784183502197, + "learning_rate": 4.15968091139064e-05, + "loss": 0.4113, "step": 40365 }, { - "epoch": 1.42, - "learning_rate": 4.2008424929553934e-05, - "loss": 0.3095, + "epoch": 1.4549320647277182, + "grad_norm": 0.19249606132507324, + "learning_rate": 4.159462668657337e-05, + "loss": 0.4407, "step": 40370 }, { - "epoch": 1.42, - "learning_rate": 4.200633699291866e-05, - "loss": 0.2908, + "epoch": 1.455112264388943, + "grad_norm": 0.13863718509674072, + "learning_rate": 4.159244403314176e-05, + "loss": 0.4143, "step": 40375 }, { - "epoch": 1.42, - "learning_rate": 4.20042488354673e-05, - "loss": 0.2814, + "epoch": 1.4552924640501677, + "grad_norm": 0.16577591001987457, + "learning_rate": 4.159026115364132e-05, + "loss": 0.4468, "step": 40380 }, { - "epoch": 1.42, - "learning_rate": 4.200216045722698e-05, - "loss": 0.2764, + "epoch": 1.4554726637113922, + "grad_norm": 0.16115480661392212, + "learning_rate": 4.1588078048101784e-05, + "loss": 0.4213, "step": 40385 }, { - "epoch": 1.42, - "learning_rate": 4.200007185822481e-05, - "loss": 0.2903, + "epoch": 1.455652863372617, + "grad_norm": 0.17219962179660797, + "learning_rate": 4.15858947165529e-05, + "loss": 0.4411, "step": 40390 }, { - "epoch": 1.42, - "learning_rate": 4.199798303848791e-05, - "loss": 0.2933, + "epoch": 1.4558330630338414, + "grad_norm": 0.17890863120555878, + "learning_rate": 4.15837111590244e-05, + "loss": 0.4052, "step": 40395 }, { - "epoch": 1.42, - "learning_rate": 4.19958939980434e-05, - "loss": 0.2782, + "epoch": 1.4560132626950661, + "grad_norm": 0.19050833582878113, + "learning_rate": 4.158152737554606e-05, + "loss": 0.4257, "step": 40400 }, { - "epoch": 1.42, - "learning_rate": 4.199380473691841e-05, - "loss": 0.2777, + "epoch": 1.4561934623562909, + "grad_norm": 0.1742664873600006, + "learning_rate": 4.1579343366147604e-05, + "loss": 0.4023, "step": 40405 }, { - "epoch": 1.42, - "learning_rate": 4.199171525514006e-05, - "loss": 0.2911, + "epoch": 1.4563736620175154, + "grad_norm": 0.19294539093971252, + "learning_rate": 4.157715913085881e-05, + "loss": 0.4072, "step": 40410 }, { - "epoch": 1.42, - "learning_rate": 4.198962555273549e-05, - "loss": 0.2774, + "epoch": 1.4565538616787401, + "grad_norm": 0.1636057049036026, + 
"learning_rate": 4.1574974669709435e-05, + "loss": 0.4023, "step": 40415 }, { - "epoch": 1.42, - "learning_rate": 4.1987535629731835e-05, - "loss": 0.3019, + "epoch": 1.4567340613399646, + "grad_norm": 0.15918225049972534, + "learning_rate": 4.1572789982729244e-05, + "loss": 0.3778, "step": 40420 }, { - "epoch": 1.42, - "learning_rate": 4.1985445486156216e-05, - "loss": 0.2958, + "epoch": 1.4569142610011894, + "grad_norm": 0.17120924592018127, + "learning_rate": 4.1570605069947986e-05, + "loss": 0.4285, "step": 40425 }, { - "epoch": 1.42, - "learning_rate": 4.198335512203578e-05, - "loss": 0.2622, + "epoch": 1.4570944606624139, + "grad_norm": 0.16076339781284332, + "learning_rate": 4.1568419931395456e-05, + "loss": 0.4201, "step": 40430 }, { - "epoch": 1.42, - "learning_rate": 4.1981264537397674e-05, - "loss": 0.2592, + "epoch": 1.4572746603236386, + "grad_norm": 0.14771609008312225, + "learning_rate": 4.15662345671014e-05, + "loss": 0.3823, "step": 40435 }, { - "epoch": 1.42, - "learning_rate": 4.197917373226904e-05, - "loss": 0.2674, + "epoch": 1.4574548599848631, + "grad_norm": 0.19097311794757843, + "learning_rate": 4.156404897709562e-05, + "loss": 0.4274, "step": 40440 }, { - "epoch": 1.42, - "learning_rate": 4.1977082706677026e-05, - "loss": 0.2983, + "epoch": 1.4576350596460879, + "grad_norm": 0.16327013075351715, + "learning_rate": 4.1561863161407866e-05, + "loss": 0.4176, "step": 40445 }, { - "epoch": 1.42, - "learning_rate": 4.197499146064877e-05, - "loss": 0.2961, + "epoch": 1.4578152593073126, + "grad_norm": 0.15996558964252472, + "learning_rate": 4.1559677120067935e-05, + "loss": 0.4001, "step": 40450 }, { - "epoch": 1.42, - "learning_rate": 4.197289999421145e-05, - "loss": 0.2746, + "epoch": 1.457995458968537, + "grad_norm": 0.1909141093492508, + "learning_rate": 4.1557490853105614e-05, + "loss": 0.3835, "step": 40455 }, { - "epoch": 1.42, - "learning_rate": 4.1970808307392204e-05, - "loss": 0.2803, + "epoch": 1.4581756586297618, + "grad_norm": 0.21793286502361298, + "learning_rate": 4.155530436055068e-05, + "loss": 0.3851, "step": 40460 }, { - "epoch": 1.42, - "learning_rate": 4.1968716400218196e-05, - "loss": 0.2771, + "epoch": 1.4583558582909864, + "grad_norm": 0.14985129237174988, + "learning_rate": 4.155311764243294e-05, + "loss": 0.385, "step": 40465 }, { - "epoch": 1.42, - "learning_rate": 4.1966624272716594e-05, - "loss": 0.2805, + "epoch": 1.458536057952211, + "grad_norm": 0.16198906302452087, + "learning_rate": 4.155093069878216e-05, + "loss": 0.3893, "step": 40470 }, { - "epoch": 1.42, - "learning_rate": 4.196453192491455e-05, - "loss": 0.2842, + "epoch": 1.4587162576134358, + "grad_norm": 0.20363600552082062, + "learning_rate": 4.154874352962816e-05, + "loss": 0.4146, "step": 40475 }, { - "epoch": 1.42, - "learning_rate": 4.1962439356839236e-05, - "loss": 0.2983, + "epoch": 1.4588964572746603, + "grad_norm": 0.1638229936361313, + "learning_rate": 4.154655613500075e-05, + "loss": 0.4149, "step": 40480 }, { - "epoch": 1.42, - "learning_rate": 4.196034656851783e-05, - "loss": 0.2932, + "epoch": 1.4590766569358848, + "grad_norm": 0.18865369260311127, + "learning_rate": 4.15443685149297e-05, + "loss": 0.3961, "step": 40485 }, { - "epoch": 1.42, - "learning_rate": 4.1958253559977506e-05, - "loss": 0.2799, + "epoch": 1.4592568565971096, + "grad_norm": 0.17428448796272278, + "learning_rate": 4.154218066944483e-05, + "loss": 0.4314, "step": 40490 }, { - "epoch": 1.42, - "learning_rate": 4.195616033124543e-05, - "loss": 0.2829, + "epoch": 1.4594370562583343, + "grad_norm": 
0.18061710894107819, + "learning_rate": 4.1539992598575954e-05, + "loss": 0.4091, "step": 40495 }, { - "epoch": 1.42, - "learning_rate": 4.195406688234879e-05, - "loss": 0.3083, + "epoch": 1.4596172559195588, + "grad_norm": 0.14302043616771698, + "learning_rate": 4.153780430235289e-05, + "loss": 0.3872, "step": 40500 }, { - "epoch": 1.42, - "eval_loss": 0.2803879678249359, - "eval_runtime": 10.5501, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 1.4596172559195588, + "eval_loss": 0.44571349024772644, + "eval_runtime": 3.5421, + "eval_samples_per_second": 28.232, + "eval_steps_per_second": 7.058, "step": 40500 }, { - "epoch": 1.43, - "learning_rate": 4.195197321331476e-05, - "loss": 0.3007, + "epoch": 1.4597974555807836, + "grad_norm": 0.1947942078113556, + "learning_rate": 4.153561578080543e-05, + "loss": 0.413, "step": 40505 }, { - "epoch": 1.43, - "learning_rate": 4.194987932417054e-05, - "loss": 0.3135, + "epoch": 1.459977655242008, + "grad_norm": 0.1369856297969818, + "learning_rate": 4.153342703396341e-05, + "loss": 0.3949, "step": 40510 }, { - "epoch": 1.43, - "learning_rate": 4.194778521494329e-05, - "loss": 0.2749, + "epoch": 1.4601578549032328, + "grad_norm": 0.17609436810016632, + "learning_rate": 4.153123806185666e-05, + "loss": 0.4322, "step": 40515 }, { - "epoch": 1.43, - "learning_rate": 4.194569088566023e-05, - "loss": 0.2535, + "epoch": 1.4603380545644575, + "grad_norm": 0.2042810618877411, + "learning_rate": 4.152904886451498e-05, + "loss": 0.4339, "step": 40520 }, { - "epoch": 1.43, - "learning_rate": 4.194359633634854e-05, - "loss": 0.2984, + "epoch": 1.460518254225682, + "grad_norm": 0.22225399315357208, + "learning_rate": 4.152685944196821e-05, + "loss": 0.4344, "step": 40525 }, { - "epoch": 1.43, - "learning_rate": 4.194150156703542e-05, - "loss": 0.2547, + "epoch": 1.4606984538869066, + "grad_norm": 0.47528132796287537, + "learning_rate": 4.152466979424619e-05, + "loss": 0.4446, "step": 40530 }, { - "epoch": 1.43, - "learning_rate": 4.1939406577748065e-05, - "loss": 0.3001, + "epoch": 1.4608786535481313, + "grad_norm": 0.16672192513942719, + "learning_rate": 4.1522479921378733e-05, + "loss": 0.4329, "step": 40535 }, { - "epoch": 1.43, - "learning_rate": 4.193731136851368e-05, - "loss": 0.2765, + "epoch": 1.461058853209356, + "grad_norm": 0.19320343434810638, + "learning_rate": 4.1520289823395686e-05, + "loss": 0.4102, "step": 40540 }, { - "epoch": 1.43, - "learning_rate": 4.193521593935946e-05, - "loss": 0.2817, + "epoch": 1.4612390528705805, + "grad_norm": 0.17399749159812927, + "learning_rate": 4.151809950032689e-05, + "loss": 0.4194, "step": 40545 }, { - "epoch": 1.43, - "learning_rate": 4.1933120290312636e-05, - "loss": 0.2948, + "epoch": 1.4614192525318053, + "grad_norm": 0.1759127378463745, + "learning_rate": 4.1515908952202186e-05, + "loss": 0.3828, "step": 40550 }, { - "epoch": 1.43, - "learning_rate": 4.19310244214004e-05, - "loss": 0.3034, + "epoch": 1.4615994521930298, + "grad_norm": 0.17152760922908783, + "learning_rate": 4.151371817905142e-05, + "loss": 0.4291, "step": 40555 }, { - "epoch": 1.43, - "learning_rate": 4.1928928332649965e-05, - "loss": 0.3064, + "epoch": 1.4617796518542545, + "grad_norm": 0.18539777398109436, + "learning_rate": 4.151152718090445e-05, + "loss": 0.376, "step": 40560 }, { - "epoch": 1.43, - "learning_rate": 4.192683202408855e-05, - "loss": 0.2673, + "epoch": 1.4619598515154792, + "grad_norm": 0.1786716729402542, + "learning_rate": 4.1509335957791106e-05, + "loss": 0.3922, "step": 40565 }, { - 
"epoch": 1.43, - "learning_rate": 4.1924735495743384e-05, - "loss": 0.2798, + "epoch": 1.4621400511767038, + "grad_norm": 0.15850621461868286, + "learning_rate": 4.150714450974126e-05, + "loss": 0.4005, "step": 40570 }, { - "epoch": 1.43, - "learning_rate": 4.1922638747641676e-05, - "loss": 0.2704, + "epoch": 1.4623202508379285, + "grad_norm": 0.19711154699325562, + "learning_rate": 4.150495283678477e-05, + "loss": 0.4157, "step": 40575 }, { - "epoch": 1.43, - "learning_rate": 4.192054177981065e-05, - "loss": 0.2958, + "epoch": 1.462500450499153, + "grad_norm": 0.2046595960855484, + "learning_rate": 4.150276093895149e-05, + "loss": 0.4197, "step": 40580 }, { - "epoch": 1.43, - "learning_rate": 4.191844459227756e-05, - "loss": 0.2943, + "epoch": 1.4626806501603777, + "grad_norm": 0.23600243031978607, + "learning_rate": 4.1500568816271285e-05, + "loss": 0.4453, "step": 40585 }, { - "epoch": 1.43, - "learning_rate": 4.1916347185069596e-05, - "loss": 0.2997, + "epoch": 1.4628608498216025, + "grad_norm": 0.18028078973293304, + "learning_rate": 4.149837646877402e-05, + "loss": 0.449, "step": 40590 }, { - "epoch": 1.43, - "learning_rate": 4.1914249558214025e-05, - "loss": 0.3073, + "epoch": 1.463041049482827, + "grad_norm": 0.21281985938549042, + "learning_rate": 4.149618389648958e-05, + "loss": 0.434, "step": 40595 }, { - "epoch": 1.43, - "learning_rate": 4.191215171173807e-05, - "loss": 0.3079, + "epoch": 1.4632212491440515, + "grad_norm": 0.18634112179279327, + "learning_rate": 4.149399109944783e-05, + "loss": 0.3694, "step": 40600 }, { - "epoch": 1.43, - "learning_rate": 4.1910053645668965e-05, - "loss": 0.2856, + "epoch": 1.4634014488052762, + "grad_norm": 0.2347174882888794, + "learning_rate": 4.1491798077678636e-05, + "loss": 0.4023, "step": 40605 }, { - "epoch": 1.43, - "learning_rate": 4.190795536003396e-05, - "loss": 0.2899, + "epoch": 1.463581648466501, + "grad_norm": 0.2317020446062088, + "learning_rate": 4.148960483121189e-05, + "loss": 0.4539, "step": 40610 }, { - "epoch": 1.43, - "learning_rate": 4.190585685486029e-05, - "loss": 0.2806, + "epoch": 1.4637618481277255, + "grad_norm": 0.16629308462142944, + "learning_rate": 4.148741136007747e-05, + "loss": 0.4403, "step": 40615 }, { - "epoch": 1.43, - "learning_rate": 4.190375813017522e-05, - "loss": 0.2648, + "epoch": 1.4639420477889502, + "grad_norm": 0.19857972860336304, + "learning_rate": 4.148521766430527e-05, + "loss": 0.3936, "step": 40620 }, { - "epoch": 1.43, - "learning_rate": 4.1901659186005996e-05, - "loss": 0.2655, + "epoch": 1.4641222474501747, + "grad_norm": 0.18918615579605103, + "learning_rate": 4.148302374392516e-05, + "loss": 0.4059, "step": 40625 }, { - "epoch": 1.43, - "learning_rate": 4.1899560022379856e-05, - "loss": 0.317, + "epoch": 1.4643024471113995, + "grad_norm": 0.1877117156982422, + "learning_rate": 4.148082959896704e-05, + "loss": 0.3964, "step": 40630 }, { - "epoch": 1.43, - "learning_rate": 4.189746063932407e-05, - "loss": 0.2878, + "epoch": 1.4644826467726242, + "grad_norm": 0.1804433912038803, + "learning_rate": 4.1478635229460814e-05, + "loss": 0.4587, "step": 40635 }, { - "epoch": 1.43, - "learning_rate": 4.189536103686589e-05, - "loss": 0.2867, + "epoch": 1.4646628464338487, + "grad_norm": 0.16584636270999908, + "learning_rate": 4.1476440635436376e-05, + "loss": 0.3789, "step": 40640 }, { - "epoch": 1.43, - "learning_rate": 4.189326121503258e-05, - "loss": 0.2737, + "epoch": 1.4648430460950732, + "grad_norm": 0.16475588083267212, + "learning_rate": 4.1474245816923616e-05, + "loss": 0.3847, 
"step": 40645 }, { - "epoch": 1.43, - "learning_rate": 4.1891161173851415e-05, - "loss": 0.2758, + "epoch": 1.465023245756298, + "grad_norm": 0.18502764403820038, + "learning_rate": 4.147205077395245e-05, + "loss": 0.4038, "step": 40650 }, { - "epoch": 1.43, - "learning_rate": 4.188906091334964e-05, - "loss": 0.2886, + "epoch": 1.4652034454175227, + "grad_norm": 0.14497093856334686, + "learning_rate": 4.146985550655279e-05, + "loss": 0.3932, "step": 40655 }, { - "epoch": 1.43, - "learning_rate": 4.188696043355455e-05, - "loss": 0.2923, + "epoch": 1.4653836450787472, + "grad_norm": 0.21168319880962372, + "learning_rate": 4.146766001475453e-05, + "loss": 0.411, "step": 40660 }, { - "epoch": 1.43, - "learning_rate": 4.188485973449341e-05, - "loss": 0.2712, + "epoch": 1.465563844739972, + "grad_norm": 0.17997999489307404, + "learning_rate": 4.146546429858759e-05, + "loss": 0.3992, "step": 40665 }, { - "epoch": 1.43, - "learning_rate": 4.1882758816193484e-05, - "loss": 0.3056, + "epoch": 1.4657440444011964, + "grad_norm": 0.1749783307313919, + "learning_rate": 4.146326835808188e-05, + "loss": 0.463, "step": 40670 }, { - "epoch": 1.43, - "learning_rate": 4.188065767868207e-05, - "loss": 0.3079, + "epoch": 1.4659242440624212, + "grad_norm": 0.19459916651248932, + "learning_rate": 4.1461072193267344e-05, + "loss": 0.4353, "step": 40675 }, { - "epoch": 1.43, - "learning_rate": 4.187855632198643e-05, - "loss": 0.279, + "epoch": 1.466104443723646, + "grad_norm": 0.18047486245632172, + "learning_rate": 4.145887580417387e-05, + "loss": 0.414, "step": 40680 }, { - "epoch": 1.43, - "learning_rate": 4.187645474613387e-05, - "loss": 0.2859, + "epoch": 1.4662846433848704, + "grad_norm": 0.21394000947475433, + "learning_rate": 4.145667919083141e-05, + "loss": 0.3916, "step": 40685 }, { - "epoch": 1.43, - "learning_rate": 4.187435295115166e-05, - "loss": 0.2993, + "epoch": 1.4664648430460951, + "grad_norm": 0.2331513613462448, + "learning_rate": 4.1454482353269875e-05, + "loss": 0.4231, "step": 40690 }, { - "epoch": 1.43, - "learning_rate": 4.18722509370671e-05, - "loss": 0.2918, + "epoch": 1.4666450427073197, + "grad_norm": 0.18889982998371124, + "learning_rate": 4.145228529151921e-05, + "loss": 0.4509, "step": 40695 }, { - "epoch": 1.43, - "learning_rate": 4.187014870390749e-05, - "loss": 0.2852, + "epoch": 1.4668252423685444, + "grad_norm": 0.1711355745792389, + "learning_rate": 4.1450088005609335e-05, + "loss": 0.4098, "step": 40700 }, { - "epoch": 1.43, - "learning_rate": 4.186804625170011e-05, - "loss": 0.2632, + "epoch": 1.467005442029769, + "grad_norm": 0.22893080115318298, + "learning_rate": 4.1447890495570205e-05, + "loss": 0.406, "step": 40705 }, { - "epoch": 1.43, - "learning_rate": 4.186594358047226e-05, - "loss": 0.3012, + "epoch": 1.4671856416909936, + "grad_norm": 0.1730494350194931, + "learning_rate": 4.1445692761431743e-05, + "loss": 0.4144, "step": 40710 }, { - "epoch": 1.43, - "learning_rate": 4.1863840690251255e-05, - "loss": 0.2882, + "epoch": 1.4673658413522181, + "grad_norm": 0.22228698432445526, + "learning_rate": 4.14434948032239e-05, + "loss": 0.4327, "step": 40715 }, { - "epoch": 1.43, - "learning_rate": 4.1861737581064386e-05, - "loss": 0.2928, + "epoch": 1.4675460410134429, + "grad_norm": 0.18846584856510162, + "learning_rate": 4.144129662097663e-05, + "loss": 0.4167, "step": 40720 }, { - "epoch": 1.43, - "learning_rate": 4.1859634252938974e-05, - "loss": 0.2965, + "epoch": 1.4677262406746676, + "grad_norm": 0.18671122193336487, + "learning_rate": 4.143909821471988e-05, + "loss": 
0.4348, "step": 40725 }, { - "epoch": 1.43, - "learning_rate": 4.1857530705902316e-05, - "loss": 0.2917, + "epoch": 1.4679064403358921, + "grad_norm": 0.15065552294254303, + "learning_rate": 4.143689958448359e-05, + "loss": 0.4215, "step": 40730 }, { - "epoch": 1.43, - "learning_rate": 4.185542693998173e-05, - "loss": 0.2961, + "epoch": 1.4680866399971169, + "grad_norm": 0.19978941977024078, + "learning_rate": 4.143470073029774e-05, + "loss": 0.4065, "step": 40735 }, { - "epoch": 1.43, - "learning_rate": 4.1853322955204545e-05, - "loss": 0.2923, + "epoch": 1.4682668396583414, + "grad_norm": 0.1559489220380783, + "learning_rate": 4.143250165219226e-05, + "loss": 0.423, "step": 40740 }, { - "epoch": 1.43, - "learning_rate": 4.1851218751598056e-05, - "loss": 0.2859, + "epoch": 1.468447039319566, + "grad_norm": 0.1872124969959259, + "learning_rate": 4.143030235019713e-05, + "loss": 0.4196, "step": 40745 }, { - "epoch": 1.43, - "learning_rate": 4.18491143291896e-05, - "loss": 0.2739, + "epoch": 1.4686272389807908, + "grad_norm": 0.20608900487422943, + "learning_rate": 4.142810282434231e-05, + "loss": 0.419, "step": 40750 }, { - "epoch": 1.43, - "learning_rate": 4.18470096880065e-05, - "loss": 0.2854, + "epoch": 1.4688074386420154, + "grad_norm": 0.16788601875305176, + "learning_rate": 4.1425903074657776e-05, + "loss": 0.4231, "step": 40755 }, { - "epoch": 1.43, - "learning_rate": 4.1844904828076086e-05, - "loss": 0.2799, + "epoch": 1.4689876383032399, + "grad_norm": 0.305698037147522, + "learning_rate": 4.142370310117348e-05, + "loss": 0.3935, "step": 40760 }, { - "epoch": 1.43, - "learning_rate": 4.184279974942568e-05, - "loss": 0.3104, + "epoch": 1.4691678379644646, + "grad_norm": 0.13668397068977356, + "learning_rate": 4.142150290391943e-05, + "loss": 0.387, "step": 40765 }, { - "epoch": 1.43, - "learning_rate": 4.184069445208262e-05, - "loss": 0.2642, + "epoch": 1.4693480376256893, + "grad_norm": 0.15131601691246033, + "learning_rate": 4.141930248292557e-05, + "loss": 0.4168, "step": 40770 }, { - "epoch": 1.43, - "learning_rate": 4.183858893607423e-05, - "loss": 0.2641, + "epoch": 1.4695282372869138, + "grad_norm": 0.19707903265953064, + "learning_rate": 4.141710183822189e-05, + "loss": 0.4628, "step": 40775 }, { - "epoch": 1.43, - "learning_rate": 4.1836483201427876e-05, - "loss": 0.3153, + "epoch": 1.4697084369481386, + "grad_norm": 0.19335195422172546, + "learning_rate": 4.1414900969838375e-05, + "loss": 0.4157, "step": 40780 }, { - "epoch": 1.43, - "learning_rate": 4.183437724817087e-05, - "loss": 0.2718, + "epoch": 1.469888636609363, + "grad_norm": 0.20567668974399567, + "learning_rate": 4.141269987780502e-05, + "loss": 0.4535, "step": 40785 }, { - "epoch": 1.44, - "learning_rate": 4.183227107633058e-05, - "loss": 0.2915, + "epoch": 1.4700688362705878, + "grad_norm": 0.17595280706882477, + "learning_rate": 4.14104985621518e-05, + "loss": 0.4263, "step": 40790 }, { - "epoch": 1.44, - "learning_rate": 4.183016468593434e-05, - "loss": 0.2779, + "epoch": 1.4702490359318126, + "grad_norm": 0.18598760664463043, + "learning_rate": 4.140829702290872e-05, + "loss": 0.4139, "step": 40795 }, { - "epoch": 1.44, - "learning_rate": 4.18280580770095e-05, - "loss": 0.2813, + "epoch": 1.470429235593037, + "grad_norm": 0.19767579436302185, + "learning_rate": 4.140609526010576e-05, + "loss": 0.3817, "step": 40800 }, { - "epoch": 1.44, - "learning_rate": 4.182595124958342e-05, - "loss": 0.2932, + "epoch": 1.4706094352542618, + "grad_norm": 0.18627217411994934, + "learning_rate": 4.140389327377294e-05, + 
"loss": 0.3926, "step": 40805 }, { - "epoch": 1.44, - "learning_rate": 4.1823844203683456e-05, - "loss": 0.2671, + "epoch": 1.4707896349154863, + "grad_norm": 0.20653992891311646, + "learning_rate": 4.140169106394024e-05, + "loss": 0.4532, "step": 40810 }, { - "epoch": 1.44, - "learning_rate": 4.182173693933695e-05, - "loss": 0.3033, + "epoch": 1.470969834576711, + "grad_norm": 0.16096976399421692, + "learning_rate": 4.139948863063768e-05, + "loss": 0.3833, "step": 40815 }, { - "epoch": 1.44, - "learning_rate": 4.1819629456571285e-05, - "loss": 0.3036, + "epoch": 1.4711500342379356, + "grad_norm": 0.14477817714214325, + "learning_rate": 4.1397285973895264e-05, + "loss": 0.4148, "step": 40820 }, { - "epoch": 1.44, - "learning_rate": 4.181752175541381e-05, - "loss": 0.3172, + "epoch": 1.4713302338991603, + "grad_norm": 0.17661945521831512, + "learning_rate": 4.1395083093743006e-05, + "loss": 0.4228, "step": 40825 }, { - "epoch": 1.44, - "learning_rate": 4.1815413835891904e-05, - "loss": 0.2817, + "epoch": 1.4715104335603848, + "grad_norm": 0.16946406662464142, + "learning_rate": 4.139287999021091e-05, + "loss": 0.4413, "step": 40830 }, { - "epoch": 1.44, - "learning_rate": 4.181330569803293e-05, - "loss": 0.2732, + "epoch": 1.4716906332216095, + "grad_norm": 0.2282014936208725, + "learning_rate": 4.1390676663328995e-05, + "loss": 0.4072, "step": 40835 }, { - "epoch": 1.44, - "learning_rate": 4.181119734186426e-05, - "loss": 0.2779, + "epoch": 1.4718708328828343, + "grad_norm": 0.1720079779624939, + "learning_rate": 4.138847311312728e-05, + "loss": 0.4107, "step": 40840 }, { - "epoch": 1.44, - "learning_rate": 4.180908876741327e-05, - "loss": 0.2961, + "epoch": 1.4720510325440588, + "grad_norm": 0.1870976686477661, + "learning_rate": 4.13862693396358e-05, + "loss": 0.4437, "step": 40845 }, { - "epoch": 1.44, - "learning_rate": 4.180697997470734e-05, - "loss": 0.2648, + "epoch": 1.4722312322052835, + "grad_norm": 0.14838680624961853, + "learning_rate": 4.138406534288457e-05, + "loss": 0.4164, "step": 40850 }, { - "epoch": 1.44, - "learning_rate": 4.1804870963773855e-05, - "loss": 0.2808, + "epoch": 1.472411431866508, + "grad_norm": 0.146432027220726, + "learning_rate": 4.138186112290362e-05, + "loss": 0.3791, "step": 40855 }, { - "epoch": 1.44, - "learning_rate": 4.180276173464019e-05, - "loss": 0.2897, + "epoch": 1.4725916315277328, + "grad_norm": 0.1663684993982315, + "learning_rate": 4.137965667972298e-05, + "loss": 0.3969, "step": 40860 }, { - "epoch": 1.44, - "learning_rate": 4.1800652287333745e-05, - "loss": 0.2875, + "epoch": 1.4727718311889575, + "grad_norm": 0.24084417521953583, + "learning_rate": 4.1377452013372695e-05, + "loss": 0.4105, "step": 40865 }, { - "epoch": 1.44, - "learning_rate": 4.17985426218819e-05, - "loss": 0.2783, + "epoch": 1.472952030850182, + "grad_norm": 0.21911956369876862, + "learning_rate": 4.13752471238828e-05, + "loss": 0.45, "step": 40870 }, { - "epoch": 1.44, - "learning_rate": 4.1796432738312046e-05, - "loss": 0.2906, + "epoch": 1.4731322305114065, + "grad_norm": 0.20108048617839813, + "learning_rate": 4.137304201128334e-05, + "loss": 0.432, "step": 40875 }, { - "epoch": 1.44, - "learning_rate": 4.179432263665159e-05, - "loss": 0.2797, + "epoch": 1.4733124301726312, + "grad_norm": 0.16961334645748138, + "learning_rate": 4.1370836675604326e-05, + "loss": 0.401, "step": 40880 }, { - "epoch": 1.44, - "learning_rate": 4.1792212316927917e-05, - "loss": 0.3023, + "epoch": 1.473492629833856, + "grad_norm": 0.16937093436717987, + "learning_rate": 
4.1368631116875856e-05, + "loss": 0.4159, "step": 40885 }, { - "epoch": 1.44, - "learning_rate": 4.1790101779168425e-05, - "loss": 0.301, + "epoch": 1.4736728294950805, + "grad_norm": 0.19566501677036285, + "learning_rate": 4.136642533512795e-05, + "loss": 0.4196, "step": 40890 }, { - "epoch": 1.44, - "learning_rate": 4.1787991023400534e-05, - "loss": 0.2779, + "epoch": 1.4738530291563052, + "grad_norm": 0.17469613254070282, + "learning_rate": 4.136421933039066e-05, + "loss": 0.3932, "step": 40895 }, { - "epoch": 1.44, - "learning_rate": 4.178588004965165e-05, - "loss": 0.2833, + "epoch": 1.4740332288175297, + "grad_norm": 0.18725554645061493, + "learning_rate": 4.136201310269406e-05, + "loss": 0.4323, "step": 40900 }, { - "epoch": 1.44, - "learning_rate": 4.1783768857949166e-05, - "loss": 0.3119, + "epoch": 1.4742134284787545, + "grad_norm": 0.21251484751701355, + "learning_rate": 4.135980665206819e-05, + "loss": 0.4335, "step": 40905 }, { - "epoch": 1.44, - "learning_rate": 4.178165744832051e-05, - "loss": 0.293, + "epoch": 1.4743936281399792, + "grad_norm": 0.21791920065879822, + "learning_rate": 4.135759997854313e-05, + "loss": 0.4479, "step": 40910 }, { - "epoch": 1.44, - "learning_rate": 4.1779545820793085e-05, - "loss": 0.2687, + "epoch": 1.4745738278012037, + "grad_norm": 0.19280044734477997, + "learning_rate": 4.1355393082148936e-05, + "loss": 0.387, "step": 40915 }, { - "epoch": 1.44, - "learning_rate": 4.177743397539432e-05, - "loss": 0.2857, + "epoch": 1.4747540274624285, + "grad_norm": 0.14178399741649628, + "learning_rate": 4.1353185962915675e-05, + "loss": 0.4095, "step": 40920 }, { - "epoch": 1.44, - "learning_rate": 4.177532191215163e-05, - "loss": 0.3076, + "epoch": 1.474934227123653, + "grad_norm": 0.24102537333965302, + "learning_rate": 4.135097862087342e-05, + "loss": 0.4249, "step": 40925 }, { - "epoch": 1.44, - "learning_rate": 4.177320963109244e-05, - "loss": 0.2961, + "epoch": 1.4751144267848777, + "grad_norm": 0.2044147253036499, + "learning_rate": 4.134877105605225e-05, + "loss": 0.3982, "step": 40930 }, { - "epoch": 1.44, - "learning_rate": 4.177109713224418e-05, - "loss": 0.2973, + "epoch": 1.4752946264461022, + "grad_norm": 0.14155633747577667, + "learning_rate": 4.1346563268482245e-05, + "loss": 0.4237, "step": 40935 }, { - "epoch": 1.44, - "learning_rate": 4.176898441563428e-05, - "loss": 0.298, + "epoch": 1.475474826107327, + "grad_norm": 0.20636247098445892, + "learning_rate": 4.134435525819347e-05, + "loss": 0.414, "step": 40940 }, { - "epoch": 1.44, - "learning_rate": 4.176687148129016e-05, - "loss": 0.2911, + "epoch": 1.4756550257685515, + "grad_norm": 0.16069145500659943, + "learning_rate": 4.1342147025216015e-05, + "loss": 0.4105, "step": 40945 }, { - "epoch": 1.44, - "learning_rate": 4.176475832923927e-05, - "loss": 0.3035, + "epoch": 1.4758352254297762, + "grad_norm": 0.2055576741695404, + "learning_rate": 4.1339938569579985e-05, + "loss": 0.4265, "step": 40950 }, { - "epoch": 1.44, - "learning_rate": 4.1762644959509035e-05, - "loss": 0.2867, + "epoch": 1.476015425091001, + "grad_norm": 0.1672385036945343, + "learning_rate": 4.1337729891315445e-05, + "loss": 0.4226, "step": 40955 }, { - "epoch": 1.44, - "learning_rate": 4.176053137212691e-05, - "loss": 0.2858, + "epoch": 1.4761956247522254, + "grad_norm": 0.1980341076850891, + "learning_rate": 4.1335520990452504e-05, + "loss": 0.4252, "step": 40960 }, { - "epoch": 1.44, - "learning_rate": 4.175841756712032e-05, - "loss": 0.296, + "epoch": 1.4763758244134502, + "grad_norm": 0.19401240348815918, + 
"learning_rate": 4.1333311867021254e-05, + "loss": 0.4046, "step": 40965 }, { - "epoch": 1.44, - "learning_rate": 4.175630354451673e-05, - "loss": 0.2876, + "epoch": 1.4765560240746747, + "grad_norm": 0.2943877875804901, + "learning_rate": 4.133110252105178e-05, + "loss": 0.4149, "step": 40970 }, { - "epoch": 1.44, - "learning_rate": 4.1754189304343584e-05, - "loss": 0.2937, + "epoch": 1.4767362237358994, + "grad_norm": 0.15814274549484253, + "learning_rate": 4.13288929525742e-05, + "loss": 0.4066, "step": 40975 }, { - "epoch": 1.44, - "learning_rate": 4.175207484662833e-05, - "loss": 0.2828, + "epoch": 1.4769164233971241, + "grad_norm": 0.1717575490474701, + "learning_rate": 4.132668316161863e-05, + "loss": 0.4235, "step": 40980 }, { - "epoch": 1.44, - "learning_rate": 4.1749960171398413e-05, - "loss": 0.262, + "epoch": 1.4770966230583487, + "grad_norm": 0.1958007961511612, + "learning_rate": 4.1324473148215146e-05, + "loss": 0.4493, "step": 40985 }, { - "epoch": 1.44, - "learning_rate": 4.174784527868132e-05, - "loss": 0.3013, + "epoch": 1.4772768227195732, + "grad_norm": 0.17059576511383057, + "learning_rate": 4.1322262912393884e-05, + "loss": 0.3968, "step": 40990 }, { - "epoch": 1.44, - "learning_rate": 4.174573016850448e-05, - "loss": 0.3128, + "epoch": 1.477457022380798, + "grad_norm": 0.18559856712818146, + "learning_rate": 4.132005245418495e-05, + "loss": 0.4099, "step": 40995 }, { - "epoch": 1.44, - "learning_rate": 4.174361484089537e-05, - "loss": 0.2979, + "epoch": 1.4776372220420226, + "grad_norm": 0.206581249833107, + "learning_rate": 4.131784177361845e-05, + "loss": 0.4063, "step": 41000 }, { - "epoch": 1.44, - "eval_loss": 0.28040316700935364, - "eval_runtime": 10.542, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 1.4776372220420226, + "eval_loss": 0.44472381472587585, + "eval_runtime": 3.5272, + "eval_samples_per_second": 28.351, + "eval_steps_per_second": 7.088, "step": 41000 }, { - "epoch": 1.44, - "learning_rate": 4.174149929588146e-05, - "loss": 0.2828, + "epoch": 1.4778174217032471, + "grad_norm": 0.2029978483915329, + "learning_rate": 4.131563087072453e-05, + "loss": 0.4428, "step": 41005 }, { - "epoch": 1.44, - "learning_rate": 4.173938353349022e-05, - "loss": 0.296, + "epoch": 1.4779976213644719, + "grad_norm": 0.1657671481370926, + "learning_rate": 4.131341974553329e-05, + "loss": 0.4282, "step": 41010 }, { - "epoch": 1.44, - "learning_rate": 4.1737267553749105e-05, - "loss": 0.3014, + "epoch": 1.4781778210256964, + "grad_norm": 0.22293435037136078, + "learning_rate": 4.131120839807487e-05, + "loss": 0.4178, "step": 41015 }, { - "epoch": 1.44, - "learning_rate": 4.173515135668561e-05, - "loss": 0.2585, + "epoch": 1.4783580206869211, + "grad_norm": 0.18376168608665466, + "learning_rate": 4.130899682837939e-05, + "loss": 0.4239, "step": 41020 }, { - "epoch": 1.44, - "learning_rate": 4.17330349423272e-05, - "loss": 0.2728, + "epoch": 1.4785382203481459, + "grad_norm": 0.20025895535945892, + "learning_rate": 4.130678503647698e-05, + "loss": 0.4262, "step": 41025 }, { - "epoch": 1.44, - "learning_rate": 4.173091831070135e-05, - "loss": 0.2747, + "epoch": 1.4787184200093704, + "grad_norm": 0.16391725838184357, + "learning_rate": 4.1304573022397784e-05, + "loss": 0.3756, "step": 41030 }, { - "epoch": 1.44, - "learning_rate": 4.172880146183556e-05, - "loss": 0.2934, + "epoch": 1.4788986196705949, + "grad_norm": 0.16140343248844147, + "learning_rate": 4.1302360786171946e-05, + "loss": 0.4036, "step": 41035 }, { - "epoch": 1.44, - 
"learning_rate": 4.172668439575731e-05, - "loss": 0.2874, + "epoch": 1.4790788193318196, + "grad_norm": 0.23130172491073608, + "learning_rate": 4.1300148327829593e-05, + "loss": 0.4511, "step": 41040 }, { - "epoch": 1.44, - "learning_rate": 4.1724567112494087e-05, - "loss": 0.2871, + "epoch": 1.4792590189930444, + "grad_norm": 0.17555512487888336, + "learning_rate": 4.1297935647400874e-05, + "loss": 0.4236, "step": 41045 }, { - "epoch": 1.44, - "learning_rate": 4.172244961207338e-05, - "loss": 0.2739, + "epoch": 1.4794392186542689, + "grad_norm": 0.18214865028858185, + "learning_rate": 4.1295722744915934e-05, + "loss": 0.4806, "step": 41050 }, { - "epoch": 1.44, - "learning_rate": 4.172033189452268e-05, - "loss": 0.2983, + "epoch": 1.4796194183154936, + "grad_norm": 0.17145216464996338, + "learning_rate": 4.129350962040494e-05, + "loss": 0.4215, "step": 41055 }, { - "epoch": 1.44, - "learning_rate": 4.17182139598695e-05, - "loss": 0.2867, + "epoch": 1.479799617976718, + "grad_norm": 0.1968265026807785, + "learning_rate": 4.1291296273898015e-05, + "loss": 0.4054, "step": 41060 }, { - "epoch": 1.44, - "learning_rate": 4.171609580814132e-05, - "loss": 0.2692, + "epoch": 1.4799798176379428, + "grad_norm": 0.14597874879837036, + "learning_rate": 4.1289082705425344e-05, + "loss": 0.4229, "step": 41065 }, { - "epoch": 1.44, - "learning_rate": 4.171397743936565e-05, - "loss": 0.2837, + "epoch": 1.4801600172991676, + "grad_norm": 0.1973959356546402, + "learning_rate": 4.1286868915017064e-05, + "loss": 0.3867, "step": 41070 }, { - "epoch": 1.45, - "learning_rate": 4.171185885357001e-05, - "loss": 0.2832, + "epoch": 1.480340216960392, + "grad_norm": 0.18178540468215942, + "learning_rate": 4.1284654902703356e-05, + "loss": 0.4134, "step": 41075 }, { - "epoch": 1.45, - "learning_rate": 4.170974005078189e-05, - "loss": 0.2724, + "epoch": 1.4805204166216168, + "grad_norm": 0.2181033045053482, + "learning_rate": 4.1282440668514376e-05, + "loss": 0.4203, "step": 41080 }, { - "epoch": 1.45, - "learning_rate": 4.170762103102881e-05, - "loss": 0.2777, + "epoch": 1.4807006162828413, + "grad_norm": 0.18515872955322266, + "learning_rate": 4.128022621248029e-05, + "loss": 0.4207, "step": 41085 }, { - "epoch": 1.45, - "learning_rate": 4.1705501794338274e-05, - "loss": 0.3046, + "epoch": 1.480880815944066, + "grad_norm": 0.16878053545951843, + "learning_rate": 4.1278011534631276e-05, + "loss": 0.4058, "step": 41090 }, { - "epoch": 1.45, - "learning_rate": 4.170338234073781e-05, - "loss": 0.2767, + "epoch": 1.4810610156052908, + "grad_norm": 0.1830851435661316, + "learning_rate": 4.127579663499752e-05, + "loss": 0.3987, "step": 41095 }, { - "epoch": 1.45, - "learning_rate": 4.170126267025494e-05, - "loss": 0.2689, + "epoch": 1.4812412152665153, + "grad_norm": 0.1524893045425415, + "learning_rate": 4.1273581513609173e-05, + "loss": 0.4154, "step": 41100 }, { - "epoch": 1.45, - "learning_rate": 4.169914278291718e-05, - "loss": 0.2693, + "epoch": 1.4814214149277398, + "grad_norm": 0.18265849351882935, + "learning_rate": 4.127136617049643e-05, + "loss": 0.4195, "step": 41105 }, { - "epoch": 1.45, - "learning_rate": 4.169702267875205e-05, - "loss": 0.2948, + "epoch": 1.4816016145889646, + "grad_norm": 0.18311630189418793, + "learning_rate": 4.126915060568947e-05, + "loss": 0.4548, "step": 41110 }, { - "epoch": 1.45, - "learning_rate": 4.169490235778709e-05, - "loss": 0.295, + "epoch": 1.4817818142501893, + "grad_norm": 0.1863049566745758, + "learning_rate": 4.126693481921848e-05, + "loss": 0.4483, "step": 41115 }, { - 
"epoch": 1.45, - "learning_rate": 4.169278182004982e-05, - "loss": 0.2893, + "epoch": 1.4819620139114138, + "grad_norm": 0.15636593103408813, + "learning_rate": 4.126471881111367e-05, + "loss": 0.4159, "step": 41120 }, { - "epoch": 1.45, - "learning_rate": 4.169066106556778e-05, - "loss": 0.2922, + "epoch": 1.4821422135726385, + "grad_norm": 0.1574103981256485, + "learning_rate": 4.1262502581405196e-05, + "loss": 0.4247, "step": 41125 }, { - "epoch": 1.45, - "learning_rate": 4.1688540094368505e-05, - "loss": 0.3208, + "epoch": 1.482322413233863, + "grad_norm": 0.1982761174440384, + "learning_rate": 4.1260286130123285e-05, + "loss": 0.4114, "step": 41130 }, { - "epoch": 1.45, - "learning_rate": 4.168641890647954e-05, - "loss": 0.2858, + "epoch": 1.4825026128950878, + "grad_norm": 0.17163503170013428, + "learning_rate": 4.125806945729812e-05, + "loss": 0.4174, "step": 41135 }, { - "epoch": 1.45, - "learning_rate": 4.1684297501928416e-05, - "loss": 0.2928, + "epoch": 1.4826828125563125, + "grad_norm": 0.1855458766222, + "learning_rate": 4.1255852562959904e-05, + "loss": 0.4327, "step": 41140 }, { - "epoch": 1.45, - "learning_rate": 4.1682175880742694e-05, - "loss": 0.2622, + "epoch": 1.482863012217537, + "grad_norm": 0.21407297253608704, + "learning_rate": 4.125363544713884e-05, + "loss": 0.4342, "step": 41145 }, { - "epoch": 1.45, - "learning_rate": 4.16800540429499e-05, - "loss": 0.2929, + "epoch": 1.4830432118787615, + "grad_norm": 0.20228859782218933, + "learning_rate": 4.1251418109865146e-05, + "loss": 0.4116, "step": 41150 }, { - "epoch": 1.45, - "learning_rate": 4.167793198857761e-05, - "loss": 0.2812, + "epoch": 1.4832234115399863, + "grad_norm": 0.19245915114879608, + "learning_rate": 4.124920055116903e-05, + "loss": 0.3937, "step": 41155 }, { - "epoch": 1.45, - "learning_rate": 4.167580971765335e-05, - "loss": 0.2923, + "epoch": 1.483403611201211, + "grad_norm": 0.1870017945766449, + "learning_rate": 4.12469827710807e-05, + "loss": 0.426, "step": 41160 }, { - "epoch": 1.45, - "learning_rate": 4.1673687230204695e-05, - "loss": 0.3185, + "epoch": 1.4835838108624355, + "grad_norm": 0.1986088752746582, + "learning_rate": 4.1244764769630375e-05, + "loss": 0.4125, "step": 41165 }, { - "epoch": 1.45, - "learning_rate": 4.16715645262592e-05, - "loss": 0.2985, + "epoch": 1.4837640105236602, + "grad_norm": 0.18899187445640564, + "learning_rate": 4.124254654684827e-05, + "loss": 0.4205, "step": 41170 }, { - "epoch": 1.45, - "learning_rate": 4.166944160584443e-05, - "loss": 0.3063, + "epoch": 1.4839442101848848, + "grad_norm": 0.1863652914762497, + "learning_rate": 4.1240328102764614e-05, + "loss": 0.4055, "step": 41175 }, { - "epoch": 1.45, - "learning_rate": 4.166731846898795e-05, - "loss": 0.2869, + "epoch": 1.4841244098461095, + "grad_norm": 0.2102532535791397, + "learning_rate": 4.1238109437409635e-05, + "loss": 0.4175, "step": 41180 }, { - "epoch": 1.45, - "learning_rate": 4.166519511571731e-05, - "loss": 0.3082, + "epoch": 1.4843046095073342, + "grad_norm": 0.17602580785751343, + "learning_rate": 4.123589055081356e-05, + "loss": 0.3868, "step": 41185 }, { - "epoch": 1.45, - "learning_rate": 4.166307154606011e-05, - "loss": 0.2986, + "epoch": 1.4844848091685587, + "grad_norm": 0.1657838225364685, + "learning_rate": 4.123367144300662e-05, + "loss": 0.3956, "step": 41190 }, { - "epoch": 1.45, - "learning_rate": 4.16609477600439e-05, - "loss": 0.2614, + "epoch": 1.4846650088297835, + "grad_norm": 0.15461941063404083, + "learning_rate": 4.123145211401904e-05, + "loss": 0.432, "step": 41195 }, 
{ - "epoch": 1.45, - "learning_rate": 4.1658823757696265e-05, - "loss": 0.2923, + "epoch": 1.484845208491008, + "grad_norm": 0.17926329374313354, + "learning_rate": 4.1229232563881084e-05, + "loss": 0.4127, "step": 41200 }, { - "epoch": 1.45, - "learning_rate": 4.165669953904478e-05, - "loss": 0.2819, + "epoch": 1.4850254081522327, + "grad_norm": 0.15599684417247772, + "learning_rate": 4.122701279262296e-05, + "loss": 0.4382, "step": 41205 }, { - "epoch": 1.45, - "learning_rate": 4.165457510411703e-05, - "loss": 0.2844, + "epoch": 1.4852056078134572, + "grad_norm": 0.17323412001132965, + "learning_rate": 4.1224792800274936e-05, + "loss": 0.39, "step": 41210 }, { - "epoch": 1.45, - "learning_rate": 4.1652450452940594e-05, - "loss": 0.2923, + "epoch": 1.485385807474682, + "grad_norm": 0.19088657200336456, + "learning_rate": 4.122257258686725e-05, + "loss": 0.4044, "step": 41215 }, { - "epoch": 1.45, - "learning_rate": 4.1650325585543073e-05, - "loss": 0.2964, + "epoch": 1.4855660071359065, + "grad_norm": 0.1666693538427353, + "learning_rate": 4.122035215243015e-05, + "loss": 0.422, "step": 41220 }, { - "epoch": 1.45, - "learning_rate": 4.164820050195204e-05, - "loss": 0.2783, + "epoch": 1.4857462067971312, + "grad_norm": 0.23202933371067047, + "learning_rate": 4.12181314969939e-05, + "loss": 0.4344, "step": 41225 }, { - "epoch": 1.45, - "learning_rate": 4.164607520219509e-05, - "loss": 0.2901, + "epoch": 1.485926406458356, + "grad_norm": 0.19096602499485016, + "learning_rate": 4.1215910620588745e-05, + "loss": 0.4192, "step": 41230 }, { - "epoch": 1.45, - "learning_rate": 4.164394968629983e-05, - "loss": 0.2434, + "epoch": 1.4861066061195805, + "grad_norm": 0.21714234352111816, + "learning_rate": 4.1213689523244945e-05, + "loss": 0.397, "step": 41235 }, { - "epoch": 1.45, - "learning_rate": 4.1641823954293855e-05, - "loss": 0.2941, + "epoch": 1.4862868057808052, + "grad_norm": 0.22744521498680115, + "learning_rate": 4.121146820499277e-05, + "loss": 0.439, "step": 41240 }, { - "epoch": 1.45, - "learning_rate": 4.163969800620476e-05, - "loss": 0.3284, + "epoch": 1.4864670054420297, + "grad_norm": 0.1769517958164215, + "learning_rate": 4.120924666586248e-05, + "loss": 0.4124, "step": 41245 }, { - "epoch": 1.45, - "learning_rate": 4.1637571842060156e-05, - "loss": 0.3151, + "epoch": 1.4866472051032544, + "grad_norm": 0.16635587811470032, + "learning_rate": 4.1207024905884335e-05, + "loss": 0.4265, "step": 41250 }, { - "epoch": 1.45, - "learning_rate": 4.163544546188764e-05, - "loss": 0.2672, + "epoch": 1.4868274047644792, + "grad_norm": 0.1759326308965683, + "learning_rate": 4.120480292508861e-05, + "loss": 0.4143, "step": 41255 }, { - "epoch": 1.45, - "learning_rate": 4.163331886571483e-05, - "loss": 0.2914, + "epoch": 1.4870076044257037, + "grad_norm": 0.15628643333911896, + "learning_rate": 4.120258072350559e-05, + "loss": 0.4084, "step": 41260 }, { - "epoch": 1.45, - "learning_rate": 4.163119205356934e-05, - "loss": 0.2725, + "epoch": 1.4871878040869282, + "grad_norm": 0.1729431450366974, + "learning_rate": 4.1200358301165544e-05, + "loss": 0.4391, "step": 41265 }, { - "epoch": 1.45, - "learning_rate": 4.162906502547877e-05, - "loss": 0.2869, + "epoch": 1.487368003748153, + "grad_norm": 0.20765365660190582, + "learning_rate": 4.1198135658098755e-05, + "loss": 0.4579, "step": 41270 }, { - "epoch": 1.45, - "learning_rate": 4.162693778147077e-05, - "loss": 0.2658, + "epoch": 1.4875482034093777, + "grad_norm": 0.18058431148529053, + "learning_rate": 4.11959127943355e-05, + "loss": 0.4406, 
"step": 41275 }, { - "epoch": 1.45, - "learning_rate": 4.162481032157293e-05, - "loss": 0.2572, + "epoch": 1.4877284030706022, + "grad_norm": 0.16490638256072998, + "learning_rate": 4.119368970990607e-05, + "loss": 0.4603, "step": 41280 }, { - "epoch": 1.45, - "learning_rate": 4.1622682645812886e-05, - "loss": 0.2777, + "epoch": 1.487908602731827, + "grad_norm": 0.20363958179950714, + "learning_rate": 4.119146640484075e-05, + "loss": 0.3987, "step": 41285 }, { - "epoch": 1.45, - "learning_rate": 4.162055475421825e-05, - "loss": 0.307, + "epoch": 1.4880888023930514, + "grad_norm": 0.17337477207183838, + "learning_rate": 4.118924287916984e-05, + "loss": 0.4447, "step": 41290 }, { - "epoch": 1.45, - "learning_rate": 4.161842664681668e-05, - "loss": 0.2797, + "epoch": 1.4882690020542761, + "grad_norm": 0.1706027388572693, + "learning_rate": 4.118701913292363e-05, + "loss": 0.4281, "step": 41295 }, { - "epoch": 1.45, - "learning_rate": 4.1616298323635776e-05, - "loss": 0.2818, + "epoch": 1.4884492017155009, + "grad_norm": 0.22588284313678741, + "learning_rate": 4.118479516613242e-05, + "loss": 0.4214, "step": 41300 }, { - "epoch": 1.45, - "learning_rate": 4.161416978470321e-05, - "loss": 0.298, + "epoch": 1.4886294013767254, + "grad_norm": 0.17973540723323822, + "learning_rate": 4.1182570978826496e-05, + "loss": 0.4239, "step": 41305 }, { - "epoch": 1.45, - "learning_rate": 4.1612041030046576e-05, - "loss": 0.2989, + "epoch": 1.4888096010379501, + "grad_norm": 0.1527833491563797, + "learning_rate": 4.118034657103619e-05, + "loss": 0.3976, "step": 41310 }, { - "epoch": 1.45, - "learning_rate": 4.1609912059693545e-05, - "loss": 0.3038, + "epoch": 1.4889898006991746, + "grad_norm": 0.15691448748111725, + "learning_rate": 4.1178121942791786e-05, + "loss": 0.4011, "step": 41315 }, { - "epoch": 1.45, - "learning_rate": 4.1607782873671756e-05, - "loss": 0.2784, + "epoch": 1.4891700003603994, + "grad_norm": 0.20908258855342865, + "learning_rate": 4.117589709412361e-05, + "loss": 0.4172, "step": 41320 }, { - "epoch": 1.45, - "learning_rate": 4.160565347200885e-05, - "loss": 0.3006, + "epoch": 1.4893502000216239, + "grad_norm": 0.18858399987220764, + "learning_rate": 4.117367202506196e-05, + "loss": 0.4134, "step": 41325 }, { - "epoch": 1.45, - "learning_rate": 4.160352385473248e-05, - "loss": 0.2945, + "epoch": 1.4895303996828486, + "grad_norm": 0.2324933558702469, + "learning_rate": 4.117144673563717e-05, + "loss": 0.4321, "step": 41330 }, { - "epoch": 1.45, - "learning_rate": 4.160139402187029e-05, - "loss": 0.2757, + "epoch": 1.4897105993440731, + "grad_norm": 0.1979554146528244, + "learning_rate": 4.116922122587954e-05, + "loss": 0.3956, "step": 41335 }, { - "epoch": 1.45, - "learning_rate": 4.1599263973449944e-05, - "loss": 0.2844, + "epoch": 1.4898907990052979, + "grad_norm": 0.17453639209270477, + "learning_rate": 4.11669954958194e-05, + "loss": 0.4303, "step": 41340 }, { - "epoch": 1.45, - "learning_rate": 4.1597133709499095e-05, - "loss": 0.2848, + "epoch": 1.4900709986665226, + "grad_norm": 0.17398759722709656, + "learning_rate": 4.116476954548708e-05, + "loss": 0.4485, "step": 41345 }, { - "epoch": 1.45, - "learning_rate": 4.159500323004539e-05, - "loss": 0.2781, + "epoch": 1.490251198327747, + "grad_norm": 0.1593339741230011, + "learning_rate": 4.116254337491291e-05, + "loss": 0.4429, "step": 41350 }, { - "epoch": 1.45, - "learning_rate": 4.159287253511652e-05, - "loss": 0.2788, + "epoch": 1.4904313979889718, + "grad_norm": 0.15613697469234467, + "learning_rate": 4.1160316984127205e-05, + 
"loss": 0.3826, "step": 41355 }, { - "epoch": 1.46, - "learning_rate": 4.159074162474013e-05, - "loss": 0.2766, + "epoch": 1.4906115976501964, + "grad_norm": 0.1693560630083084, + "learning_rate": 4.115809037316032e-05, + "loss": 0.4813, "step": 41360 }, { - "epoch": 1.46, - "learning_rate": 4.1588610498943895e-05, - "loss": 0.3108, + "epoch": 1.490791797311421, + "grad_norm": 0.15100261569023132, + "learning_rate": 4.1155863542042575e-05, + "loss": 0.4032, "step": 41365 }, { - "epoch": 1.46, - "learning_rate": 4.1586479157755483e-05, - "loss": 0.3063, + "epoch": 1.4909719969726458, + "grad_norm": 0.2202003002166748, + "learning_rate": 4.115363649080432e-05, + "loss": 0.3974, "step": 41370 }, { - "epoch": 1.46, - "learning_rate": 4.158434760120257e-05, - "loss": 0.2931, + "epoch": 1.4911521966338703, + "grad_norm": 0.1720680594444275, + "learning_rate": 4.11514092194759e-05, + "loss": 0.3987, "step": 41375 }, { - "epoch": 1.46, - "learning_rate": 4.158221582931283e-05, - "loss": 0.2764, + "epoch": 1.4913323962950948, + "grad_norm": 0.16511781513690948, + "learning_rate": 4.114918172808765e-05, + "loss": 0.3955, "step": 41380 }, { - "epoch": 1.46, - "learning_rate": 4.158008384211395e-05, - "loss": 0.2896, + "epoch": 1.4915125959563196, + "grad_norm": 0.18977214395999908, + "learning_rate": 4.1146954016669925e-05, + "loss": 0.3936, "step": 41385 }, { - "epoch": 1.46, - "learning_rate": 4.15779516396336e-05, - "loss": 0.3045, + "epoch": 1.4916927956175443, + "grad_norm": 0.18784664571285248, + "learning_rate": 4.1144726085253084e-05, + "loss": 0.4507, "step": 41390 }, { - "epoch": 1.46, - "learning_rate": 4.157581922189949e-05, - "loss": 0.3006, + "epoch": 1.4918729952787688, + "grad_norm": 0.1858154982328415, + "learning_rate": 4.1142497933867465e-05, + "loss": 0.4643, "step": 41395 }, { - "epoch": 1.46, - "learning_rate": 4.157368658893928e-05, - "loss": 0.2764, + "epoch": 1.4920531949399936, + "grad_norm": 0.19693292677402496, + "learning_rate": 4.1140269562543445e-05, + "loss": 0.411, "step": 41400 }, { - "epoch": 1.46, - "learning_rate": 4.1571553740780665e-05, - "loss": 0.287, + "epoch": 1.492233394601218, + "grad_norm": 0.14796632528305054, + "learning_rate": 4.113804097131138e-05, + "loss": 0.4257, "step": 41405 }, { - "epoch": 1.46, - "learning_rate": 4.156942067745136e-05, - "loss": 0.3057, + "epoch": 1.4924135942624428, + "grad_norm": 0.2132243663072586, + "learning_rate": 4.1135812160201624e-05, + "loss": 0.4285, "step": 41410 }, { - "epoch": 1.46, - "learning_rate": 4.1567287398979036e-05, - "loss": 0.2763, + "epoch": 1.4925937939236675, + "grad_norm": 0.151437446475029, + "learning_rate": 4.113358312924455e-05, + "loss": 0.3778, "step": 41415 }, { - "epoch": 1.46, - "learning_rate": 4.156515390539142e-05, - "loss": 0.2769, + "epoch": 1.492773993584892, + "grad_norm": 0.1567225605249405, + "learning_rate": 4.1131353878470536e-05, + "loss": 0.365, "step": 41420 }, { - "epoch": 1.46, - "learning_rate": 4.156302019671618e-05, - "loss": 0.2767, + "epoch": 1.4929541932461168, + "grad_norm": 0.19728878140449524, + "learning_rate": 4.1129124407909944e-05, + "loss": 0.4408, "step": 41425 }, { - "epoch": 1.46, - "learning_rate": 4.1560886272981045e-05, - "loss": 0.2798, + "epoch": 1.4931343929073413, + "grad_norm": 0.1711881160736084, + "learning_rate": 4.112689471759316e-05, + "loss": 0.3945, "step": 41430 }, { - "epoch": 1.46, - "learning_rate": 4.155875213421372e-05, - "loss": 0.318, + "epoch": 1.493314592568566, + "grad_norm": 0.18180076777935028, + "learning_rate": 
4.112466480755055e-05, + "loss": 0.4029, "step": 41435 }, { - "epoch": 1.46, - "learning_rate": 4.1556617780441895e-05, - "loss": 0.3104, + "epoch": 1.4934947922297905, + "grad_norm": 0.17946167290210724, + "learning_rate": 4.1122434677812506e-05, + "loss": 0.4141, "step": 41440 }, { - "epoch": 1.46, - "learning_rate": 4.155448321169332e-05, - "loss": 0.2767, + "epoch": 1.4936749918910153, + "grad_norm": 0.17549100518226624, + "learning_rate": 4.1120204328409416e-05, + "loss": 0.4119, "step": 41445 }, { - "epoch": 1.46, - "learning_rate": 4.155234842799568e-05, - "loss": 0.3217, + "epoch": 1.4938551915522398, + "grad_norm": 0.18163146078586578, + "learning_rate": 4.1117973759371666e-05, + "loss": 0.4345, "step": 41450 }, { - "epoch": 1.46, - "learning_rate": 4.1550213429376705e-05, - "loss": 0.2916, + "epoch": 1.4940353912134645, + "grad_norm": 0.1497826874256134, + "learning_rate": 4.111574297072963e-05, + "loss": 0.3586, "step": 41455 }, { - "epoch": 1.46, - "learning_rate": 4.154807821586412e-05, - "loss": 0.2939, + "epoch": 1.4942155908746892, + "grad_norm": 0.17524641752243042, + "learning_rate": 4.111351196251373e-05, + "loss": 0.3819, "step": 41460 }, { - "epoch": 1.46, - "learning_rate": 4.1545942787485636e-05, - "loss": 0.3086, + "epoch": 1.4943957905359138, + "grad_norm": 0.19001320004463196, + "learning_rate": 4.1111280734754345e-05, + "loss": 0.4198, "step": 41465 }, { - "epoch": 1.46, - "learning_rate": 4.1543807144268996e-05, - "loss": 0.266, + "epoch": 1.4945759901971385, + "grad_norm": 0.15616640448570251, + "learning_rate": 4.1109049287481874e-05, + "loss": 0.4113, "step": 41470 }, { - "epoch": 1.46, - "learning_rate": 4.154167128624191e-05, - "loss": 0.3067, + "epoch": 1.494756189858363, + "grad_norm": 0.1617770940065384, + "learning_rate": 4.110681762072672e-05, + "loss": 0.4383, "step": 41475 }, { - "epoch": 1.46, - "learning_rate": 4.153953521343214e-05, - "loss": 0.2628, + "epoch": 1.4949363895195877, + "grad_norm": 0.20571884512901306, + "learning_rate": 4.110458573451931e-05, + "loss": 0.4229, "step": 41480 }, { - "epoch": 1.46, - "learning_rate": 4.15373989258674e-05, - "loss": 0.2992, + "epoch": 1.4951165891808125, + "grad_norm": 0.18498611450195312, + "learning_rate": 4.110235362889003e-05, + "loss": 0.4011, "step": 41485 }, { - "epoch": 1.46, - "learning_rate": 4.153526242357543e-05, - "loss": 0.2798, + "epoch": 1.495296788842037, + "grad_norm": 0.19509319961071014, + "learning_rate": 4.1100121303869296e-05, + "loss": 0.41, "step": 41490 }, { - "epoch": 1.46, - "learning_rate": 4.1533125706583974e-05, - "loss": 0.3018, + "epoch": 1.4954769885032615, + "grad_norm": 0.21900251507759094, + "learning_rate": 4.109788875948754e-05, + "loss": 0.4099, "step": 41495 }, { - "epoch": 1.46, - "learning_rate": 4.153098877492079e-05, - "loss": 0.2966, + "epoch": 1.4956571881644862, + "grad_norm": 0.1650031954050064, + "learning_rate": 4.109565599577515e-05, + "loss": 0.386, "step": 41500 }, { - "epoch": 1.46, - "eval_loss": 0.28131207823753357, - "eval_runtime": 10.5489, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 1.4956571881644862, + "eval_loss": 0.44493234157562256, + "eval_runtime": 3.5367, + "eval_samples_per_second": 28.275, + "eval_steps_per_second": 7.069, "step": 41500 }, { - "epoch": 1.46, - "learning_rate": 4.15288516286136e-05, - "loss": 0.2731, + "epoch": 1.495837387825711, + "grad_norm": 0.19879759848117828, + "learning_rate": 4.109342301276257e-05, + "loss": 0.4323, "step": 41505 }, { - "epoch": 1.46, - "learning_rate": 
4.152671426769016e-05, - "loss": 0.2762, + "epoch": 1.4960175874869355, + "grad_norm": 0.22263413667678833, + "learning_rate": 4.109118981048022e-05, + "loss": 0.4187, "step": 41510 }, { - "epoch": 1.46, - "learning_rate": 4.1524576692178235e-05, - "loss": 0.2886, + "epoch": 1.4961977871481602, + "grad_norm": 0.18593116104602814, + "learning_rate": 4.1088956388958524e-05, + "loss": 0.4388, "step": 41515 }, { - "epoch": 1.46, - "learning_rate": 4.1522438902105564e-05, - "loss": 0.299, + "epoch": 1.4963779868093847, + "grad_norm": 0.16802819073200226, + "learning_rate": 4.1086722748227903e-05, + "loss": 0.4141, "step": 41520 }, { - "epoch": 1.46, - "learning_rate": 4.152030089749992e-05, - "loss": 0.282, + "epoch": 1.4965581864706095, + "grad_norm": 0.19732625782489777, + "learning_rate": 4.108448888831881e-05, + "loss": 0.4034, "step": 41525 }, { - "epoch": 1.46, - "learning_rate": 4.1518162678389054e-05, - "loss": 0.3056, + "epoch": 1.4967383861318342, + "grad_norm": 0.15761670470237732, + "learning_rate": 4.108225480926167e-05, + "loss": 0.4235, "step": 41530 }, { - "epoch": 1.46, - "learning_rate": 4.1516024244800734e-05, - "loss": 0.2771, + "epoch": 1.4969185857930587, + "grad_norm": 0.195955291390419, + "learning_rate": 4.108002051108691e-05, + "loss": 0.4061, "step": 41535 }, { - "epoch": 1.46, - "learning_rate": 4.151388559676272e-05, - "loss": 0.2835, + "epoch": 1.4970987854542832, + "grad_norm": 0.19495412707328796, + "learning_rate": 4.107778599382499e-05, + "loss": 0.4036, "step": 41540 }, { - "epoch": 1.46, - "learning_rate": 4.1511746734302786e-05, - "loss": 0.2921, + "epoch": 1.497278985115508, + "grad_norm": 0.2070777267217636, + "learning_rate": 4.1075551257506354e-05, + "loss": 0.4182, "step": 41545 }, { - "epoch": 1.46, - "learning_rate": 4.1509607657448704e-05, - "loss": 0.2958, + "epoch": 1.4974591847767327, + "grad_norm": 0.1688157469034195, + "learning_rate": 4.1073316302161435e-05, + "loss": 0.4271, "step": 41550 }, { - "epoch": 1.46, - "learning_rate": 4.150746836622824e-05, - "loss": 0.2752, + "epoch": 1.4976393844379572, + "grad_norm": 0.15762291848659515, + "learning_rate": 4.1071081127820696e-05, + "loss": 0.4158, "step": 41555 }, { - "epoch": 1.46, - "learning_rate": 4.150532886066918e-05, - "loss": 0.3075, + "epoch": 1.497819584099182, + "grad_norm": 0.19532495737075806, + "learning_rate": 4.1068845734514593e-05, + "loss": 0.4508, "step": 41560 }, { - "epoch": 1.46, - "learning_rate": 4.1503189140799315e-05, - "loss": 0.2905, + "epoch": 1.4979997837604064, + "grad_norm": 0.15557561814785004, + "learning_rate": 4.1066610122273575e-05, + "loss": 0.4036, "step": 41565 }, { - "epoch": 1.46, - "learning_rate": 4.15010492066464e-05, - "loss": 0.298, + "epoch": 1.4981799834216312, + "grad_norm": 0.18481487035751343, + "learning_rate": 4.1064374291128106e-05, + "loss": 0.4475, "step": 41570 }, { - "epoch": 1.46, - "learning_rate": 4.1498909058238246e-05, - "loss": 0.2838, + "epoch": 1.498360183082856, + "grad_norm": 0.17046202719211578, + "learning_rate": 4.1062138241108645e-05, + "loss": 0.4168, "step": 41575 }, { - "epoch": 1.46, - "learning_rate": 4.149676869560262e-05, - "loss": 0.2928, + "epoch": 1.4985403827440804, + "grad_norm": 0.18078990280628204, + "learning_rate": 4.105990197224566e-05, + "loss": 0.4325, "step": 41580 }, { - "epoch": 1.46, - "learning_rate": 4.149462811876733e-05, - "loss": 0.2737, + "epoch": 1.4987205824053051, + "grad_norm": 0.1715507060289383, + "learning_rate": 4.105766548456962e-05, + "loss": 0.422, "step": 41585 }, { - "epoch": 1.46, - 
"learning_rate": 4.1492487327760165e-05, - "loss": 0.2936, + "epoch": 1.4989007820665297, + "grad_norm": 0.17720800638198853, + "learning_rate": 4.1055428778111004e-05, + "loss": 0.4115, "step": 41590 }, { - "epoch": 1.46, - "learning_rate": 4.149034632260892e-05, - "loss": 0.2766, + "epoch": 1.4990809817277544, + "grad_norm": 0.16280311346054077, + "learning_rate": 4.105319185290027e-05, + "loss": 0.4068, "step": 41595 }, { - "epoch": 1.46, - "learning_rate": 4.1488205103341395e-05, - "loss": 0.279, + "epoch": 1.4992611813889791, + "grad_norm": 0.1896113157272339, + "learning_rate": 4.10509547089679e-05, + "loss": 0.4245, "step": 41600 }, { - "epoch": 1.46, - "learning_rate": 4.14860636699854e-05, - "loss": 0.2808, + "epoch": 1.4994413810502036, + "grad_norm": 0.17678354680538177, + "learning_rate": 4.10487173463444e-05, + "loss": 0.4236, "step": 41605 }, { - "epoch": 1.46, - "learning_rate": 4.148392202256872e-05, - "loss": 0.2937, + "epoch": 1.4996215807114281, + "grad_norm": 0.15229156613349915, + "learning_rate": 4.104647976506022e-05, + "loss": 0.3917, "step": 41610 }, { - "epoch": 1.46, - "learning_rate": 4.148178016111919e-05, - "loss": 0.2873, + "epoch": 1.4998017803726529, + "grad_norm": 0.2196597158908844, + "learning_rate": 4.104424196514586e-05, + "loss": 0.4439, "step": 41615 }, { - "epoch": 1.46, - "learning_rate": 4.147963808566459e-05, - "loss": 0.2778, + "epoch": 1.4999819800338776, + "grad_norm": 0.18850156664848328, + "learning_rate": 4.104200394663181e-05, + "loss": 0.4067, "step": 41620 }, { - "epoch": 1.46, - "learning_rate": 4.147749579623276e-05, - "loss": 0.2742, + "epoch": 1.5001621796951021, + "grad_norm": 0.19405673444271088, + "learning_rate": 4.103976570954856e-05, + "loss": 0.4496, "step": 41625 }, { - "epoch": 1.46, - "learning_rate": 4.14753532928515e-05, - "loss": 0.296, + "epoch": 1.5003423793563269, + "grad_norm": 0.20671923458576202, + "learning_rate": 4.103752725392661e-05, + "loss": 0.4071, "step": 41630 }, { - "epoch": 1.46, - "learning_rate": 4.1473210575548644e-05, - "loss": 0.2929, + "epoch": 1.5005225790175514, + "grad_norm": 0.2034417986869812, + "learning_rate": 4.103528857979646e-05, + "loss": 0.3851, "step": 41635 }, { - "epoch": 1.47, - "learning_rate": 4.1471067644352e-05, - "loss": 0.2728, + "epoch": 1.500702778678776, + "grad_norm": 0.1623309999704361, + "learning_rate": 4.10330496871886e-05, + "loss": 0.4161, "step": 41640 }, { - "epoch": 1.47, - "learning_rate": 4.1468924499289396e-05, - "loss": 0.2815, + "epoch": 1.5008829783400008, + "grad_norm": 0.1519644856452942, + "learning_rate": 4.103081057613355e-05, + "loss": 0.4051, "step": 41645 }, { - "epoch": 1.47, - "learning_rate": 4.1466781140388666e-05, - "loss": 0.3016, + "epoch": 1.5010631780012254, + "grad_norm": 0.14681312441825867, + "learning_rate": 4.1028571246661804e-05, + "loss": 0.4235, "step": 41650 }, { - "epoch": 1.47, - "learning_rate": 4.1464637567677636e-05, - "loss": 0.2837, + "epoch": 1.5012433776624499, + "grad_norm": 0.189476877450943, + "learning_rate": 4.102633169880388e-05, + "loss": 0.3896, "step": 41655 }, { - "epoch": 1.47, - "learning_rate": 4.146249378118413e-05, - "loss": 0.28, + "epoch": 1.5014235773236746, + "grad_norm": 0.17130400240421295, + "learning_rate": 4.102409193259029e-05, + "loss": 0.4145, "step": 41660 }, { - "epoch": 1.47, - "learning_rate": 4.1460349780936e-05, - "loss": 0.2909, + "epoch": 1.5016037769848993, + "grad_norm": 0.16844402253627777, + "learning_rate": 4.102185194805155e-05, + "loss": 0.445, "step": 41665 }, { - "epoch": 1.47, - 
"learning_rate": 4.145820556696107e-05, - "loss": 0.313, + "epoch": 1.501783976646124, + "grad_norm": 0.17054221034049988, + "learning_rate": 4.101961174521818e-05, + "loss": 0.4269, "step": 41670 }, { - "epoch": 1.47, - "learning_rate": 4.1456061139287186e-05, - "loss": 0.2919, + "epoch": 1.5019641763073486, + "grad_norm": 0.1802477240562439, + "learning_rate": 4.101737132412069e-05, + "loss": 0.4183, "step": 41675 }, { - "epoch": 1.47, - "learning_rate": 4.14539164979422e-05, - "loss": 0.2965, + "epoch": 1.502144375968573, + "grad_norm": 0.20035143196582794, + "learning_rate": 4.101513068478963e-05, + "loss": 0.4227, "step": 41680 }, { - "epoch": 1.47, - "learning_rate": 4.145177164295395e-05, - "loss": 0.2947, + "epoch": 1.5023245756297978, + "grad_norm": 0.165787011384964, + "learning_rate": 4.101288982725551e-05, + "loss": 0.4357, "step": 41685 }, { - "epoch": 1.47, - "learning_rate": 4.144962657435028e-05, - "loss": 0.2752, + "epoch": 1.5025047752910226, + "grad_norm": 0.1805458515882492, + "learning_rate": 4.1010648751548876e-05, + "loss": 0.4115, "step": 41690 }, { - "epoch": 1.47, - "learning_rate": 4.144748129215905e-05, - "loss": 0.2919, + "epoch": 1.502684974952247, + "grad_norm": 0.17907939851284027, + "learning_rate": 4.1008407457700234e-05, + "loss": 0.3908, "step": 41695 }, { - "epoch": 1.47, - "learning_rate": 4.144533579640812e-05, - "loss": 0.2917, + "epoch": 1.5028651746134716, + "grad_norm": 0.24330779910087585, + "learning_rate": 4.1006165945740154e-05, + "loss": 0.4361, "step": 41700 }, { - "epoch": 1.47, - "learning_rate": 4.1443190087125344e-05, - "loss": 0.2771, + "epoch": 1.5030453742746963, + "grad_norm": 0.21545155346393585, + "learning_rate": 4.100392421569916e-05, + "loss": 0.4375, "step": 41705 }, { - "epoch": 1.47, - "learning_rate": 4.144104416433858e-05, - "loss": 0.3088, + "epoch": 1.503225573935921, + "grad_norm": 0.18745043873786926, + "learning_rate": 4.1001682267607796e-05, + "loss": 0.3854, "step": 41710 }, { - "epoch": 1.47, - "learning_rate": 4.143889802807569e-05, - "loss": 0.3076, + "epoch": 1.5034057735971458, + "grad_norm": 0.16889239847660065, + "learning_rate": 4.0999440101496606e-05, + "loss": 0.3463, "step": 41715 }, { - "epoch": 1.47, - "learning_rate": 4.143675167836455e-05, - "loss": 0.2778, + "epoch": 1.5035859732583703, + "grad_norm": 0.18077971041202545, + "learning_rate": 4.099719771739614e-05, + "loss": 0.3698, "step": 41720 }, { - "epoch": 1.47, - "learning_rate": 4.143460511523302e-05, - "loss": 0.276, + "epoch": 1.5037661729195948, + "grad_norm": 0.20077045261859894, + "learning_rate": 4.099495511533696e-05, + "loss": 0.4117, "step": 41725 }, { - "epoch": 1.47, - "learning_rate": 4.1432458338708965e-05, - "loss": 0.2951, + "epoch": 1.5039463725808195, + "grad_norm": 0.1883951872587204, + "learning_rate": 4.099271229534961e-05, + "loss": 0.4191, "step": 41730 }, { - "epoch": 1.47, - "learning_rate": 4.143031134882027e-05, - "loss": 0.2916, + "epoch": 1.5041265722420443, + "grad_norm": 0.18416355550289154, + "learning_rate": 4.0990469257464645e-05, + "loss": 0.3896, "step": 41735 }, { - "epoch": 1.47, - "learning_rate": 4.142816414559481e-05, - "loss": 0.2642, + "epoch": 1.5043067719032688, + "grad_norm": 0.20206254720687866, + "learning_rate": 4.0988226001712646e-05, + "loss": 0.4027, "step": 41740 }, { - "epoch": 1.47, - "learning_rate": 4.142601672906047e-05, - "loss": 0.2762, + "epoch": 1.5044869715644935, + "grad_norm": 0.18239402770996094, + "learning_rate": 4.0985982528124156e-05, + "loss": 0.3764, "step": 41745 }, { - 
"epoch": 1.47, - "learning_rate": 4.142386909924512e-05, - "loss": 0.3077, + "epoch": 1.504667171225718, + "grad_norm": 0.1996680647134781, + "learning_rate": 4.098373883672974e-05, + "loss": 0.404, "step": 41750 }, { - "epoch": 1.47, - "learning_rate": 4.142172125617666e-05, - "loss": 0.2859, + "epoch": 1.5048473708869428, + "grad_norm": 0.1679215282201767, + "learning_rate": 4.0981494927559994e-05, + "loss": 0.4324, "step": 41755 }, { - "epoch": 1.47, - "learning_rate": 4.141957319988297e-05, - "loss": 0.2824, + "epoch": 1.5050275705481675, + "grad_norm": 0.19576683640480042, + "learning_rate": 4.0979250800645465e-05, + "loss": 0.4211, "step": 41760 }, { - "epoch": 1.47, - "learning_rate": 4.1417424930391946e-05, - "loss": 0.2843, + "epoch": 1.505207770209392, + "grad_norm": 0.20221199095249176, + "learning_rate": 4.097700645601673e-05, + "loss": 0.3923, "step": 41765 }, { - "epoch": 1.47, - "learning_rate": 4.1415276447731476e-05, - "loss": 0.3122, + "epoch": 1.5053879698706165, + "grad_norm": 0.14687031507492065, + "learning_rate": 4.097476189370439e-05, + "loss": 0.3818, "step": 41770 }, { - "epoch": 1.47, - "learning_rate": 4.141312775192946e-05, - "loss": 0.2629, + "epoch": 1.5055681695318412, + "grad_norm": 0.19657109677791595, + "learning_rate": 4.097251711373901e-05, + "loss": 0.417, "step": 41775 }, { - "epoch": 1.47, - "learning_rate": 4.141097884301379e-05, - "loss": 0.2667, + "epoch": 1.505748369193066, + "grad_norm": 0.2037065178155899, + "learning_rate": 4.097027211615117e-05, + "loss": 0.3996, "step": 41780 }, { - "epoch": 1.47, - "learning_rate": 4.140882972101239e-05, - "loss": 0.27, + "epoch": 1.5059285688542907, + "grad_norm": 0.22538067400455475, + "learning_rate": 4.096802690097146e-05, + "loss": 0.4108, "step": 41785 }, { - "epoch": 1.47, - "learning_rate": 4.1406680385953134e-05, - "loss": 0.304, + "epoch": 1.5061087685155152, + "grad_norm": 0.2207726240158081, + "learning_rate": 4.096578146823048e-05, + "loss": 0.4288, "step": 41790 }, { - "epoch": 1.47, - "learning_rate": 4.140453083786396e-05, - "loss": 0.3035, + "epoch": 1.5062889681767397, + "grad_norm": 0.16942410171031952, + "learning_rate": 4.096353581795882e-05, + "loss": 0.4209, "step": 41795 }, { - "epoch": 1.47, - "learning_rate": 4.140238107677276e-05, - "loss": 0.2816, + "epoch": 1.5064691678379645, + "grad_norm": 0.22281381487846375, + "learning_rate": 4.096128995018707e-05, + "loss": 0.4037, "step": 41800 }, { - "epoch": 1.47, - "learning_rate": 4.140023110270744e-05, - "loss": 0.2717, + "epoch": 1.5066493674991892, + "grad_norm": 0.19026844203472137, + "learning_rate": 4.095904386494585e-05, + "loss": 0.407, "step": 41805 }, { - "epoch": 1.47, - "learning_rate": 4.1398080915695936e-05, - "loss": 0.2917, + "epoch": 1.5068295671604137, + "grad_norm": 0.23003491759300232, + "learning_rate": 4.095679756226573e-05, + "loss": 0.3969, "step": 41810 }, { - "epoch": 1.47, - "learning_rate": 4.139593051576616e-05, - "loss": 0.3025, + "epoch": 1.5070097668216382, + "grad_norm": 0.19548535346984863, + "learning_rate": 4.0954551042177337e-05, + "loss": 0.403, "step": 41815 }, { - "epoch": 1.47, - "learning_rate": 4.1393779902946025e-05, - "loss": 0.2898, + "epoch": 1.507189966482863, + "grad_norm": 0.19245415925979614, + "learning_rate": 4.0952304304711275e-05, + "loss": 0.398, "step": 41820 }, { - "epoch": 1.47, - "learning_rate": 4.139162907726347e-05, - "loss": 0.2947, + "epoch": 1.5073701661440877, + "grad_norm": 0.18857969343662262, + "learning_rate": 4.0950057349898165e-05, + "loss": 0.4223, "step": 41825 
}, { - "epoch": 1.47, - "learning_rate": 4.138947803874641e-05, - "loss": 0.3029, + "epoch": 1.5075503658053124, + "grad_norm": 0.19236567616462708, + "learning_rate": 4.0947810177768595e-05, + "loss": 0.4026, "step": 41830 }, { - "epoch": 1.47, - "learning_rate": 4.1387326787422774e-05, - "loss": 0.3071, + "epoch": 1.507730565466537, + "grad_norm": 0.14722810685634613, + "learning_rate": 4.094556278835321e-05, + "loss": 0.3917, "step": 41835 }, { - "epoch": 1.47, - "learning_rate": 4.138517532332051e-05, - "loss": 0.3212, + "epoch": 1.5079107651277615, + "grad_norm": 0.18496693670749664, + "learning_rate": 4.094331518168262e-05, + "loss": 0.4228, "step": 41840 }, { - "epoch": 1.47, - "learning_rate": 4.1383023646467536e-05, - "loss": 0.2873, + "epoch": 1.5080909647889862, + "grad_norm": 0.14098331332206726, + "learning_rate": 4.0941067357787445e-05, + "loss": 0.4242, "step": 41845 }, { - "epoch": 1.47, - "learning_rate": 4.138087175689179e-05, - "loss": 0.2885, + "epoch": 1.508271164450211, + "grad_norm": 0.16862405836582184, + "learning_rate": 4.093881931669831e-05, + "loss": 0.4082, "step": 41850 }, { - "epoch": 1.47, - "learning_rate": 4.137871965462124e-05, - "loss": 0.2966, + "epoch": 1.5084513641114354, + "grad_norm": 0.18131491541862488, + "learning_rate": 4.093657105844585e-05, + "loss": 0.4356, "step": 41855 }, { - "epoch": 1.47, - "learning_rate": 4.137656733968379e-05, - "loss": 0.2855, + "epoch": 1.50863156377266, + "grad_norm": 0.17331118881702423, + "learning_rate": 4.093432258306069e-05, + "loss": 0.4143, "step": 41860 }, { - "epoch": 1.47, - "learning_rate": 4.137441481210741e-05, - "loss": 0.2517, + "epoch": 1.5088117634338847, + "grad_norm": 0.17720840871334076, + "learning_rate": 4.093207389057348e-05, + "loss": 0.3942, "step": 41865 }, { - "epoch": 1.47, - "learning_rate": 4.137226207192005e-05, - "loss": 0.2752, + "epoch": 1.5089919630951094, + "grad_norm": 0.18100488185882568, + "learning_rate": 4.0929824981014845e-05, + "loss": 0.4303, "step": 41870 }, { - "epoch": 1.47, - "learning_rate": 4.137010911914965e-05, - "loss": 0.2747, + "epoch": 1.5091721627563341, + "grad_norm": 0.1756538450717926, + "learning_rate": 4.0927575854415425e-05, + "loss": 0.418, "step": 41875 }, { - "epoch": 1.47, - "learning_rate": 4.1367955953824176e-05, - "loss": 0.2791, + "epoch": 1.5093523624175587, + "grad_norm": 0.2303672730922699, + "learning_rate": 4.092532651080587e-05, + "loss": 0.4329, "step": 41880 }, { - "epoch": 1.47, - "learning_rate": 4.136580257597158e-05, - "loss": 0.2903, + "epoch": 1.5095325620787832, + "grad_norm": 0.15763244032859802, + "learning_rate": 4.092307695021682e-05, + "loss": 0.4029, "step": 41885 }, { - "epoch": 1.47, - "learning_rate": 4.136364898561982e-05, - "loss": 0.2824, + "epoch": 1.509712761740008, + "grad_norm": 0.16717827320098877, + "learning_rate": 4.092082717267893e-05, + "loss": 0.4265, "step": 41890 }, { - "epoch": 1.47, - "learning_rate": 4.1361495182796866e-05, - "loss": 0.2816, + "epoch": 1.5098929614012326, + "grad_norm": 0.2047724723815918, + "learning_rate": 4.091857717822286e-05, + "loss": 0.4463, "step": 41895 }, { - "epoch": 1.47, - "learning_rate": 4.1359341167530684e-05, - "loss": 0.2877, + "epoch": 1.5100731610624571, + "grad_norm": 0.18995271623134613, + "learning_rate": 4.091632696687925e-05, + "loss": 0.4023, "step": 41900 }, { - "epoch": 1.47, - "learning_rate": 4.1357186939849225e-05, - "loss": 0.2658, + "epoch": 1.5102533607236819, + "grad_norm": 0.19766134023666382, + "learning_rate": 4.091407653867877e-05, + "loss": 0.4182, 
"step": 41905 }, { - "epoch": 1.47, - "learning_rate": 4.135503249978049e-05, - "loss": 0.2948, + "epoch": 1.5104335603849064, + "grad_norm": 0.21319149434566498, + "learning_rate": 4.091182589365208e-05, + "loss": 0.4199, "step": 41910 }, { - "epoch": 1.47, - "learning_rate": 4.135287784735242e-05, - "loss": 0.3228, + "epoch": 1.5106137600461311, + "grad_norm": 0.17943358421325684, + "learning_rate": 4.090957503182984e-05, + "loss": 0.3911, "step": 41915 }, { - "epoch": 1.47, - "learning_rate": 4.135072298259301e-05, - "loss": 0.2755, + "epoch": 1.5107939597073559, + "grad_norm": 0.196610227227211, + "learning_rate": 4.0907323953242724e-05, + "loss": 0.4463, "step": 41920 }, { - "epoch": 1.48, - "learning_rate": 4.134856790553025e-05, - "loss": 0.2892, + "epoch": 1.5109741593685804, + "grad_norm": 0.18429423868656158, + "learning_rate": 4.0905072657921396e-05, + "loss": 0.4321, "step": 41925 }, { - "epoch": 1.48, - "learning_rate": 4.13464126161921e-05, - "loss": 0.2966, + "epoch": 1.5111543590298049, + "grad_norm": 0.16434848308563232, + "learning_rate": 4.090282114589653e-05, + "loss": 0.4202, "step": 41930 }, { - "epoch": 1.48, - "learning_rate": 4.134425711460656e-05, - "loss": 0.2837, + "epoch": 1.5113345586910296, + "grad_norm": 0.15789394080638885, + "learning_rate": 4.090056941719881e-05, + "loss": 0.3869, "step": 41935 }, { - "epoch": 1.48, - "learning_rate": 4.134210140080161e-05, - "loss": 0.2627, + "epoch": 1.5115147583522544, + "grad_norm": 0.16550783812999725, + "learning_rate": 4.08983174718589e-05, + "loss": 0.4141, "step": 41940 }, { - "epoch": 1.48, - "learning_rate": 4.1339945474805244e-05, - "loss": 0.3242, + "epoch": 1.511694958013479, + "grad_norm": 0.159341499209404, + "learning_rate": 4.08960653099075e-05, + "loss": 0.4122, "step": 41945 }, { - "epoch": 1.48, - "learning_rate": 4.133778933664546e-05, - "loss": 0.2873, + "epoch": 1.5118751576747036, + "grad_norm": 0.187036395072937, + "learning_rate": 4.089381293137529e-05, + "loss": 0.4435, "step": 41950 }, { - "epoch": 1.48, - "learning_rate": 4.1335632986350235e-05, - "loss": 0.29, + "epoch": 1.512055357335928, + "grad_norm": 0.18756411969661713, + "learning_rate": 4.0891560336292955e-05, + "loss": 0.4126, "step": 41955 }, { - "epoch": 1.48, - "learning_rate": 4.1333476423947594e-05, - "loss": 0.2837, + "epoch": 1.5122355569971528, + "grad_norm": 0.18429414927959442, + "learning_rate": 4.088930752469119e-05, + "loss": 0.4426, "step": 41960 }, { - "epoch": 1.48, - "learning_rate": 4.1331319649465515e-05, - "loss": 0.282, + "epoch": 1.5124157566583776, + "grad_norm": 0.16013193130493164, + "learning_rate": 4.0887054496600676e-05, + "loss": 0.3959, "step": 41965 }, { - "epoch": 1.48, - "learning_rate": 4.132916266293202e-05, - "loss": 0.3029, + "epoch": 1.512595956319602, + "grad_norm": 0.20077770948410034, + "learning_rate": 4.088480125205213e-05, + "loss": 0.3921, "step": 41970 }, { - "epoch": 1.48, - "learning_rate": 4.1327005464375106e-05, - "loss": 0.283, + "epoch": 1.5127761559808266, + "grad_norm": 0.20495128631591797, + "learning_rate": 4.088254779107624e-05, + "loss": 0.4392, "step": 41975 }, { - "epoch": 1.48, - "learning_rate": 4.13248480538228e-05, - "loss": 0.3077, + "epoch": 1.5129563556420513, + "grad_norm": 0.16244658827781677, + "learning_rate": 4.088074486648847e-05, + "loss": 0.4346, "step": 41980 }, { - "epoch": 1.48, - "learning_rate": 4.132269043130308e-05, - "loss": 0.286, + "epoch": 1.513136555303276, + "grad_norm": 0.2103343904018402, + "learning_rate": 4.0878491016020734e-05, + "loss": 
0.4033, "step": 41985 }, { - "epoch": 1.48, - "learning_rate": 4.1320532596843985e-05, - "loss": 0.2866, + "epoch": 1.5133167549645008, + "grad_norm": 0.17164449393749237, + "learning_rate": 4.087623694921164e-05, + "loss": 0.4303, "step": 41990 }, { - "epoch": 1.48, - "learning_rate": 4.1318374550473536e-05, - "loss": 0.3032, + "epoch": 1.5134969546257253, + "grad_norm": 0.1736312210559845, + "learning_rate": 4.087398266609188e-05, + "loss": 0.4023, "step": 41995 }, { - "epoch": 1.48, - "learning_rate": 4.1316216292219746e-05, - "loss": 0.3057, + "epoch": 1.5136771542869498, + "grad_norm": 0.17446556687355042, + "learning_rate": 4.0871728166692195e-05, + "loss": 0.4104, "step": 42000 }, { - "epoch": 1.48, - "eval_loss": 0.2798173427581787, - "eval_runtime": 10.5336, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 1.5136771542869498, + "eval_loss": 0.4447964131832123, + "eval_runtime": 3.5328, + "eval_samples_per_second": 28.306, + "eval_steps_per_second": 7.076, "step": 42000 }, { - "epoch": 1.48, - "learning_rate": 4.131405782211064e-05, - "loss": 0.2677, + "epoch": 1.5138573539481746, + "grad_norm": 0.16829872131347656, + "learning_rate": 4.0869473451043274e-05, + "loss": 0.4072, "step": 42005 }, { - "epoch": 1.48, - "learning_rate": 4.1311899140174246e-05, - "loss": 0.287, + "epoch": 1.5140375536093993, + "grad_norm": 0.1769193410873413, + "learning_rate": 4.086721851917585e-05, + "loss": 0.3981, "step": 42010 }, { - "epoch": 1.48, - "learning_rate": 4.1309740246438596e-05, - "loss": 0.2707, + "epoch": 1.5142177532706238, + "grad_norm": 0.13675101101398468, + "learning_rate": 4.086496337112064e-05, + "loss": 0.3936, "step": 42015 }, { - "epoch": 1.48, - "learning_rate": 4.1307581140931705e-05, - "loss": 0.2812, + "epoch": 1.5143979529318485, + "grad_norm": 0.16778729856014252, + "learning_rate": 4.0862708006908393e-05, + "loss": 0.3619, "step": 42020 }, { - "epoch": 1.48, - "learning_rate": 4.1305421823681635e-05, - "loss": 0.3009, + "epoch": 1.514578152593073, + "grad_norm": 0.21226206421852112, + "learning_rate": 4.086045242656982e-05, + "loss": 0.4095, "step": 42025 }, { - "epoch": 1.48, - "learning_rate": 4.13032622947164e-05, - "loss": 0.2792, + "epoch": 1.5147583522542978, + "grad_norm": 0.16903939843177795, + "learning_rate": 4.0858196630135636e-05, + "loss": 0.4046, "step": 42030 }, { - "epoch": 1.48, - "learning_rate": 4.1301102554064046e-05, - "loss": 0.2933, + "epoch": 1.5149385519155225, + "grad_norm": 0.18711115419864655, + "learning_rate": 4.0855940617636604e-05, + "loss": 0.4402, "step": 42035 }, { - "epoch": 1.48, - "learning_rate": 4.129894260175262e-05, - "loss": 0.2755, + "epoch": 1.515118751576747, + "grad_norm": 0.22342517971992493, + "learning_rate": 4.0853684389103444e-05, + "loss": 0.4173, "step": 42040 }, { - "epoch": 1.48, - "learning_rate": 4.1296782437810166e-05, - "loss": 0.2816, + "epoch": 1.5152989512379715, + "grad_norm": 0.2237049639225006, + "learning_rate": 4.0851427944566915e-05, + "loss": 0.441, "step": 42045 }, { - "epoch": 1.48, - "learning_rate": 4.129462206226473e-05, - "loss": 0.2795, + "epoch": 1.5154791508991963, + "grad_norm": 0.1923510730266571, + "learning_rate": 4.084917128405774e-05, + "loss": 0.4073, "step": 42050 }, { - "epoch": 1.48, - "learning_rate": 4.129246147514437e-05, - "loss": 0.2853, + "epoch": 1.515659350560421, + "grad_norm": 0.14314790070056915, + "learning_rate": 4.0846914407606676e-05, + "loss": 0.3933, "step": 42055 }, { - "epoch": 1.48, - "learning_rate": 4.129030067647713e-05, - 
"loss": 0.2732, + "epoch": 1.5158395502216457, + "grad_norm": 0.18931898474693298, + "learning_rate": 4.0844657315244475e-05, + "loss": 0.3878, "step": 42060 }, { - "epoch": 1.48, - "learning_rate": 4.128813966629108e-05, - "loss": 0.2671, + "epoch": 1.5160197498828702, + "grad_norm": 0.1530705988407135, + "learning_rate": 4.084240000700188e-05, + "loss": 0.4434, "step": 42065 }, { - "epoch": 1.48, - "learning_rate": 4.1285978444614266e-05, - "loss": 0.3028, + "epoch": 1.5161999495440948, + "grad_norm": 0.1672196239233017, + "learning_rate": 4.084014248290966e-05, + "loss": 0.423, "step": 42070 }, { - "epoch": 1.48, - "learning_rate": 4.1283817011474755e-05, - "loss": 0.2778, + "epoch": 1.5163801492053195, + "grad_norm": 0.17072351276874542, + "learning_rate": 4.083788474299856e-05, + "loss": 0.4333, "step": 42075 }, { - "epoch": 1.48, - "learning_rate": 4.128165536690061e-05, - "loss": 0.2685, + "epoch": 1.5165603488665442, + "grad_norm": 0.18741382658481598, + "learning_rate": 4.083562678729935e-05, + "loss": 0.4173, "step": 42080 }, { - "epoch": 1.48, - "learning_rate": 4.12794935109199e-05, - "loss": 0.289, + "epoch": 1.5167405485277687, + "grad_norm": 0.19706690311431885, + "learning_rate": 4.083336861584278e-05, + "loss": 0.4282, "step": 42085 }, { - "epoch": 1.48, - "learning_rate": 4.12773314435607e-05, - "loss": 0.2809, + "epoch": 1.5169207481889933, + "grad_norm": 0.1541031002998352, + "learning_rate": 4.0831110228659644e-05, + "loss": 0.4078, "step": 42090 }, { - "epoch": 1.48, - "learning_rate": 4.1275169164851066e-05, - "loss": 0.3002, + "epoch": 1.517100947850218, + "grad_norm": 0.16391873359680176, + "learning_rate": 4.082885162578069e-05, + "loss": 0.4351, "step": 42095 }, { - "epoch": 1.48, - "learning_rate": 4.127300667481909e-05, - "loss": 0.296, + "epoch": 1.5172811475114427, + "grad_norm": 0.1912384182214737, + "learning_rate": 4.0826592807236694e-05, + "loss": 0.3714, "step": 42100 }, { - "epoch": 1.48, - "learning_rate": 4.1270843973492854e-05, - "loss": 0.2916, + "epoch": 1.5174613471726675, + "grad_norm": 0.19222666323184967, + "learning_rate": 4.0824333773058434e-05, + "loss": 0.4148, "step": 42105 }, { - "epoch": 1.48, - "learning_rate": 4.126868106090043e-05, - "loss": 0.2953, + "epoch": 1.517641546833892, + "grad_norm": 0.18687984347343445, + "learning_rate": 4.08220745232767e-05, + "loss": 0.4279, "step": 42110 }, { - "epoch": 1.48, - "learning_rate": 4.12665179370699e-05, - "loss": 0.2817, + "epoch": 1.5178217464951165, + "grad_norm": 0.17974166572093964, + "learning_rate": 4.0819815057922254e-05, + "loss": 0.4572, "step": 42115 }, { - "epoch": 1.48, - "learning_rate": 4.1264354602029364e-05, - "loss": 0.2931, + "epoch": 1.5180019461563412, + "grad_norm": 0.18375541269779205, + "learning_rate": 4.0817555377025895e-05, + "loss": 0.4348, "step": 42120 }, { - "epoch": 1.48, - "learning_rate": 4.126219105580689e-05, - "loss": 0.2771, + "epoch": 1.518182145817566, + "grad_norm": 0.21198663115501404, + "learning_rate": 4.0815295480618395e-05, + "loss": 0.4156, "step": 42125 }, { - "epoch": 1.48, - "learning_rate": 4.1260027298430594e-05, - "loss": 0.3055, + "epoch": 1.5183623454787905, + "grad_norm": 0.18676543235778809, + "learning_rate": 4.081303536873058e-05, + "loss": 0.4357, "step": 42130 }, { - "epoch": 1.48, - "learning_rate": 4.125786332992855e-05, - "loss": 0.2824, + "epoch": 1.5185425451400152, + "grad_norm": 0.19135519862174988, + "learning_rate": 4.08107750413932e-05, + "loss": 0.3997, "step": 42135 }, { - "epoch": 1.48, - "learning_rate": 
4.125613200313527e-05, - "loss": 0.2557, + "epoch": 1.5187227448012397, + "grad_norm": 0.16404955089092255, + "learning_rate": 4.080851449863708e-05, + "loss": 0.3918, "step": 42140 }, { - "epoch": 1.48, - "learning_rate": 4.1253967654677706e-05, - "loss": 0.3008, + "epoch": 1.5189029444624644, + "grad_norm": 0.17126797139644623, + "learning_rate": 4.080625374049301e-05, + "loss": 0.4189, "step": 42145 }, { - "epoch": 1.48, - "learning_rate": 4.125180309517308e-05, - "loss": 0.2772, + "epoch": 1.5190831441236892, + "grad_norm": 0.1819930374622345, + "learning_rate": 4.0803992766991785e-05, + "loss": 0.4446, "step": 42150 }, { - "epoch": 1.48, - "learning_rate": 4.124963832464951e-05, - "loss": 0.2838, + "epoch": 1.5192633437849137, + "grad_norm": 0.18088293075561523, + "learning_rate": 4.080173157816422e-05, + "loss": 0.4154, "step": 42155 }, { - "epoch": 1.48, - "learning_rate": 4.124747334313509e-05, - "loss": 0.2992, + "epoch": 1.5194435434461382, + "grad_norm": 0.18512316048145294, + "learning_rate": 4.079947017404113e-05, + "loss": 0.43, "step": 42160 }, { - "epoch": 1.48, - "learning_rate": 4.124530815065794e-05, - "loss": 0.3114, + "epoch": 1.519623743107363, + "grad_norm": 0.17444075644016266, + "learning_rate": 4.079720855465331e-05, + "loss": 0.4072, "step": 42165 }, { - "epoch": 1.48, - "learning_rate": 4.1243142747246164e-05, - "loss": 0.3013, + "epoch": 1.5198039427685877, + "grad_norm": 0.16421203315258026, + "learning_rate": 4.079494672003159e-05, + "loss": 0.4049, "step": 42170 }, { - "epoch": 1.48, - "learning_rate": 4.1240977132927886e-05, - "loss": 0.3294, + "epoch": 1.5199841424298124, + "grad_norm": 0.24568875133991241, + "learning_rate": 4.079268467020677e-05, + "loss": 0.4327, "step": 42175 }, { - "epoch": 1.48, - "learning_rate": 4.123881130773123e-05, - "loss": 0.2592, + "epoch": 1.520164342091037, + "grad_norm": 0.2001422941684723, + "learning_rate": 4.079042240520968e-05, + "loss": 0.4125, "step": 42180 }, { - "epoch": 1.48, - "learning_rate": 4.123664527168431e-05, - "loss": 0.302, + "epoch": 1.5203445417522614, + "grad_norm": 0.16765598952770233, + "learning_rate": 4.078815992507115e-05, + "loss": 0.3885, "step": 42185 }, { - "epoch": 1.48, - "learning_rate": 4.123447902481524e-05, - "loss": 0.2871, + "epoch": 1.5205247414134861, + "grad_norm": 0.20924493670463562, + "learning_rate": 4.078589722982199e-05, + "loss": 0.4214, "step": 42190 }, { - "epoch": 1.48, - "learning_rate": 4.123231256715217e-05, - "loss": 0.276, + "epoch": 1.5207049410747109, + "grad_norm": 0.183701753616333, + "learning_rate": 4.078363431949304e-05, + "loss": 0.4176, "step": 42195 }, { - "epoch": 1.48, - "learning_rate": 4.123014589872321e-05, - "loss": 0.2924, + "epoch": 1.5208851407359354, + "grad_norm": 0.1759697049856186, + "learning_rate": 4.0781371194115126e-05, + "loss": 0.4188, "step": 42200 }, { - "epoch": 1.48, - "learning_rate": 4.122797901955651e-05, - "loss": 0.2593, + "epoch": 1.52106534039716, + "grad_norm": 0.19455984234809875, + "learning_rate": 4.077910785371909e-05, + "loss": 0.4576, "step": 42205 }, { - "epoch": 1.49, - "learning_rate": 4.122581192968019e-05, - "loss": 0.276, + "epoch": 1.5212455400583846, + "grad_norm": 0.21152974665164948, + "learning_rate": 4.077684429833576e-05, + "loss": 0.4207, "step": 42210 }, { - "epoch": 1.49, - "learning_rate": 4.1223644629122395e-05, - "loss": 0.2861, + "epoch": 1.5214257397196094, + "grad_norm": 0.18577638268470764, + "learning_rate": 4.0774580527995975e-05, + "loss": 0.4593, "step": 42215 }, { - "epoch": 1.49, - 
"learning_rate": 4.122147711791128e-05, - "loss": 0.2924, + "epoch": 1.521605939380834, + "grad_norm": 0.18051621317863464, + "learning_rate": 4.0772316542730594e-05, + "loss": 0.4016, "step": 42220 }, { - "epoch": 1.49, - "learning_rate": 4.1219309396074956e-05, - "loss": 0.309, + "epoch": 1.5217861390420586, + "grad_norm": 0.1438736468553543, + "learning_rate": 4.077005234257045e-05, + "loss": 0.4366, "step": 42225 }, { - "epoch": 1.49, - "learning_rate": 4.12171414636416e-05, - "loss": 0.2984, + "epoch": 1.5219663387032831, + "grad_norm": 0.20258064568042755, + "learning_rate": 4.0767787927546394e-05, + "loss": 0.4339, "step": 42230 }, { - "epoch": 1.49, - "learning_rate": 4.121497332063935e-05, - "loss": 0.2866, + "epoch": 1.5221465383645079, + "grad_norm": 0.19496627151966095, + "learning_rate": 4.076552329768929e-05, + "loss": 0.4543, "step": 42235 }, { - "epoch": 1.49, - "learning_rate": 4.1212804967096355e-05, - "loss": 0.284, + "epoch": 1.5223267380257326, + "grad_norm": 0.1588614284992218, + "learning_rate": 4.0763258453029976e-05, + "loss": 0.4298, "step": 42240 }, { - "epoch": 1.49, - "learning_rate": 4.121063640304077e-05, - "loss": 0.3037, + "epoch": 1.522506937686957, + "grad_norm": 0.1556130349636078, + "learning_rate": 4.076099339359931e-05, + "loss": 0.4272, "step": 42245 }, { - "epoch": 1.49, - "learning_rate": 4.120846762850076e-05, - "loss": 0.3191, + "epoch": 1.5226871373481818, + "grad_norm": 0.20325186848640442, + "learning_rate": 4.0758728119428166e-05, + "loss": 0.4189, "step": 42250 }, { - "epoch": 1.49, - "learning_rate": 4.120629864350447e-05, - "loss": 0.2827, + "epoch": 1.5228673370094064, + "grad_norm": 0.20584464073181152, + "learning_rate": 4.075646263054741e-05, + "loss": 0.4432, "step": 42255 }, { - "epoch": 1.49, - "learning_rate": 4.120412944808008e-05, - "loss": 0.3025, + "epoch": 1.523047536670631, + "grad_norm": 0.15573151409626007, + "learning_rate": 4.0754196926987897e-05, + "loss": 0.3929, "step": 42260 }, { - "epoch": 1.49, - "learning_rate": 4.120196004225574e-05, - "loss": 0.276, + "epoch": 1.5232277363318558, + "grad_norm": 0.17175668478012085, + "learning_rate": 4.0751931008780496e-05, + "loss": 0.4094, "step": 42265 }, { - "epoch": 1.49, - "learning_rate": 4.119979042605964e-05, - "loss": 0.277, + "epoch": 1.5234079359930803, + "grad_norm": 0.16942541301250458, + "learning_rate": 4.074966487595608e-05, + "loss": 0.4101, "step": 42270 }, { - "epoch": 1.49, - "learning_rate": 4.1197620599519927e-05, - "loss": 0.306, + "epoch": 1.5235881356543048, + "grad_norm": 0.1620819866657257, + "learning_rate": 4.074739852854555e-05, + "loss": 0.4122, "step": 42275 }, { - "epoch": 1.49, - "learning_rate": 4.119545056266478e-05, - "loss": 0.2725, + "epoch": 1.5237683353155296, + "grad_norm": 0.22527901828289032, + "learning_rate": 4.0745131966579744e-05, + "loss": 0.3755, "step": 42280 }, { - "epoch": 1.49, - "learning_rate": 4.11932803155224e-05, - "loss": 0.279, + "epoch": 1.5239485349767543, + "grad_norm": 0.16794419288635254, + "learning_rate": 4.074286519008957e-05, + "loss": 0.4646, "step": 42285 }, { - "epoch": 1.49, - "learning_rate": 4.119110985812093e-05, - "loss": 0.2873, + "epoch": 1.524128734637979, + "grad_norm": 0.20727737247943878, + "learning_rate": 4.0740598199105904e-05, + "loss": 0.3832, "step": 42290 }, { - "epoch": 1.49, - "learning_rate": 4.118893919048858e-05, - "loss": 0.281, + "epoch": 1.5243089342992036, + "grad_norm": 0.1709243506193161, + "learning_rate": 4.073833099365965e-05, + "loss": 0.4257, "step": 42295 }, { - "epoch": 
1.49, - "learning_rate": 4.118676831265351e-05, - "loss": 0.2841, + "epoch": 1.524489133960428, + "grad_norm": 0.16771294176578522, + "learning_rate": 4.073606357378166e-05, + "loss": 0.4417, "step": 42300 }, { - "epoch": 1.49, - "learning_rate": 4.1184597224643936e-05, - "loss": 0.2785, + "epoch": 1.5246693336216528, + "grad_norm": 0.2179926335811615, + "learning_rate": 4.073379593950286e-05, + "loss": 0.3852, "step": 42305 }, { - "epoch": 1.49, - "learning_rate": 4.118242592648802e-05, - "loss": 0.2958, + "epoch": 1.5248495332828775, + "grad_norm": 0.19868656992912292, + "learning_rate": 4.073152809085414e-05, + "loss": 0.4342, "step": 42310 }, { - "epoch": 1.49, - "learning_rate": 4.1180254418213976e-05, - "loss": 0.2794, + "epoch": 1.525029732944102, + "grad_norm": 0.14885884523391724, + "learning_rate": 4.0729260027866395e-05, + "loss": 0.4217, "step": 42315 }, { - "epoch": 1.49, - "learning_rate": 4.1178082699849984e-05, - "loss": 0.2772, + "epoch": 1.5252099326053266, + "grad_norm": 0.21947623789310455, + "learning_rate": 4.0726991750570525e-05, + "loss": 0.4223, "step": 42320 }, { - "epoch": 1.49, - "learning_rate": 4.1175910771424266e-05, - "loss": 0.2904, + "epoch": 1.5253901322665513, + "grad_norm": 0.2015712708234787, + "learning_rate": 4.072472325899743e-05, + "loss": 0.4617, "step": 42325 }, { - "epoch": 1.49, - "learning_rate": 4.1173738632964986e-05, - "loss": 0.2881, + "epoch": 1.525570331927776, + "grad_norm": 0.1568281650543213, + "learning_rate": 4.072245455317804e-05, + "loss": 0.3907, "step": 42330 }, { - "epoch": 1.49, - "learning_rate": 4.117156628450039e-05, - "loss": 0.2859, + "epoch": 1.5257505315890008, + "grad_norm": 0.16805681586265564, + "learning_rate": 4.0720185633143246e-05, + "loss": 0.421, "step": 42335 }, { - "epoch": 1.49, - "learning_rate": 4.1169393726058645e-05, - "loss": 0.2855, + "epoch": 1.5259307312502253, + "grad_norm": 0.16904395818710327, + "learning_rate": 4.071791649892396e-05, + "loss": 0.4295, "step": 42340 }, { - "epoch": 1.49, - "learning_rate": 4.116722095766798e-05, - "loss": 0.3053, + "epoch": 1.5261109309114498, + "grad_norm": 0.15109315514564514, + "learning_rate": 4.07156471505511e-05, + "loss": 0.3572, "step": 42345 }, { - "epoch": 1.49, - "learning_rate": 4.116504797935662e-05, - "loss": 0.2883, + "epoch": 1.5262911305726745, + "grad_norm": 0.1983298510313034, + "learning_rate": 4.0713377588055604e-05, + "loss": 0.4146, "step": 42350 }, { - "epoch": 1.49, - "learning_rate": 4.1162874791152755e-05, - "loss": 0.2792, + "epoch": 1.5264713302338992, + "grad_norm": 0.22432877123355865, + "learning_rate": 4.0711107811468375e-05, + "loss": 0.4239, "step": 42355 }, { - "epoch": 1.49, - "learning_rate": 4.116070139308461e-05, - "loss": 0.292, + "epoch": 1.5266515298951238, + "grad_norm": 0.18590804934501648, + "learning_rate": 4.070883782082035e-05, + "loss": 0.4095, "step": 42360 }, { - "epoch": 1.49, - "learning_rate": 4.115852778518041e-05, - "loss": 0.2843, + "epoch": 1.5268317295563483, + "grad_norm": 0.22011986374855042, + "learning_rate": 4.070656761614244e-05, + "loss": 0.4078, "step": 42365 }, { - "epoch": 1.49, - "learning_rate": 4.1156353967468384e-05, - "loss": 0.2932, + "epoch": 1.527011929217573, + "grad_norm": 0.19090569019317627, + "learning_rate": 4.0704297197465594e-05, + "loss": 0.4148, "step": 42370 }, { - "epoch": 1.49, - "learning_rate": 4.115417993997674e-05, - "loss": 0.3182, + "epoch": 1.5271921288787977, + "grad_norm": 0.14910888671875, + "learning_rate": 4.070202656482074e-05, + "loss": 0.41, "step": 42375 }, { 
- "epoch": 1.49, - "learning_rate": 4.115200570273372e-05, - "loss": 0.2978, + "epoch": 1.5273723285400225, + "grad_norm": 0.13588306307792664, + "learning_rate": 4.0699755718238806e-05, + "loss": 0.416, "step": 42380 }, { - "epoch": 1.49, - "learning_rate": 4.114983125576754e-05, - "loss": 0.3059, + "epoch": 1.527552528201247, + "grad_norm": 0.1951780468225479, + "learning_rate": 4.069748465775075e-05, + "loss": 0.4077, "step": 42385 }, { - "epoch": 1.49, - "learning_rate": 4.114765659910646e-05, - "loss": 0.2767, + "epoch": 1.5277327278624715, + "grad_norm": 0.1684001386165619, + "learning_rate": 4.06952133833875e-05, + "loss": 0.3969, "step": 42390 }, { - "epoch": 1.49, - "learning_rate": 4.1145481732778695e-05, - "loss": 0.3192, + "epoch": 1.5279129275236962, + "grad_norm": 0.15427611768245697, + "learning_rate": 4.0692941895180004e-05, + "loss": 0.4256, "step": 42395 }, { - "epoch": 1.49, - "learning_rate": 4.114330665681249e-05, - "loss": 0.2982, + "epoch": 1.528093127184921, + "grad_norm": 0.19290746748447418, + "learning_rate": 4.0690670193159214e-05, + "loss": 0.4083, "step": 42400 }, { - "epoch": 1.49, - "learning_rate": 4.114113137123608e-05, - "loss": 0.3278, + "epoch": 1.5282733268461455, + "grad_norm": 0.23477697372436523, + "learning_rate": 4.0688398277356076e-05, + "loss": 0.443, "step": 42405 }, { - "epoch": 1.49, - "learning_rate": 4.1138955876077725e-05, - "loss": 0.2876, + "epoch": 1.5284535265073702, + "grad_norm": 0.1695445477962494, + "learning_rate": 4.068612614780156e-05, + "loss": 0.4119, "step": 42410 }, { - "epoch": 1.49, - "learning_rate": 4.1136780171365656e-05, - "loss": 0.2876, + "epoch": 1.5286337261685947, + "grad_norm": 0.1924455612897873, + "learning_rate": 4.068385380452661e-05, + "loss": 0.3882, "step": 42415 }, { - "epoch": 1.49, - "learning_rate": 4.113460425712814e-05, - "loss": 0.2808, + "epoch": 1.5288139258298195, + "grad_norm": 0.17386910319328308, + "learning_rate": 4.0681581247562186e-05, + "loss": 0.4252, "step": 42420 }, { - "epoch": 1.49, - "learning_rate": 4.1132428133393414e-05, - "loss": 0.3196, + "epoch": 1.5289941254910442, + "grad_norm": 0.15284450352191925, + "learning_rate": 4.0679308476939245e-05, + "loss": 0.3943, "step": 42425 }, { - "epoch": 1.49, - "learning_rate": 4.113025180018974e-05, - "loss": 0.2973, + "epoch": 1.5291743251522687, + "grad_norm": 0.21759410202503204, + "learning_rate": 4.067703549268877e-05, + "loss": 0.4231, "step": 42430 }, { - "epoch": 1.49, - "learning_rate": 4.1128075257545384e-05, - "loss": 0.2506, + "epoch": 1.5293545248134932, + "grad_norm": 0.20288415253162384, + "learning_rate": 4.0674762294841715e-05, + "loss": 0.4339, "step": 42435 }, { - "epoch": 1.49, - "learning_rate": 4.1125898505488595e-05, - "loss": 0.3138, + "epoch": 1.529534724474718, + "grad_norm": 0.229291170835495, + "learning_rate": 4.067248888342907e-05, + "loss": 0.4271, "step": 42440 }, { - "epoch": 1.49, - "learning_rate": 4.112372154404765e-05, - "loss": 0.2755, + "epoch": 1.5297149241359427, + "grad_norm": 0.1930796056985855, + "learning_rate": 4.067021525848179e-05, + "loss": 0.4097, "step": 42445 }, { - "epoch": 1.49, - "learning_rate": 4.112154437325079e-05, - "loss": 0.2799, + "epoch": 1.5298951237971674, + "grad_norm": 0.17433331906795502, + "learning_rate": 4.066794142003086e-05, + "loss": 0.403, "step": 42450 }, { - "epoch": 1.49, - "learning_rate": 4.111936699312631e-05, - "loss": 0.3056, + "epoch": 1.530075323458392, + "grad_norm": 0.15750248730182648, + "learning_rate": 4.066566736810727e-05, + "loss": 0.3983, "step": 
42455 }, { - "epoch": 1.49, - "learning_rate": 4.111718940370247e-05, - "loss": 0.2922, + "epoch": 1.5302555231196164, + "grad_norm": 0.22471831738948822, + "learning_rate": 4.0663393102741995e-05, + "loss": 0.4662, "step": 42460 }, { - "epoch": 1.49, - "learning_rate": 4.1115011605007545e-05, - "loss": 0.2564, + "epoch": 1.5304357227808412, + "grad_norm": 0.17949113249778748, + "learning_rate": 4.066111862396601e-05, + "loss": 0.4209, "step": 42465 }, { - "epoch": 1.49, - "learning_rate": 4.111283359706982e-05, - "loss": 0.2812, + "epoch": 1.530615922442066, + "grad_norm": 0.17478035390377045, + "learning_rate": 4.065884393181032e-05, + "loss": 0.3824, "step": 42470 }, { - "epoch": 1.49, - "learning_rate": 4.111065537991757e-05, - "loss": 0.3041, + "epoch": 1.5307961221032904, + "grad_norm": 0.14620402455329895, + "learning_rate": 4.065656902630592e-05, + "loss": 0.4105, "step": 42475 }, { - "epoch": 1.49, - "learning_rate": 4.1108476953579076e-05, - "loss": 0.2833, + "epoch": 1.530976321764515, + "grad_norm": 0.2226269692182541, + "learning_rate": 4.06542939074838e-05, + "loss": 0.4242, "step": 42480 }, { - "epoch": 1.49, - "learning_rate": 4.110629831808263e-05, - "loss": 0.2885, + "epoch": 1.5311565214257397, + "grad_norm": 0.1945657581090927, + "learning_rate": 4.0652018575374945e-05, + "loss": 0.4298, "step": 42485 }, { - "epoch": 1.49, - "learning_rate": 4.11041194734565e-05, - "loss": 0.2598, + "epoch": 1.5313367210869644, + "grad_norm": 0.18082071840763092, + "learning_rate": 4.0649743030010366e-05, + "loss": 0.4119, "step": 42490 }, { - "epoch": 1.5, - "learning_rate": 4.1101940419729e-05, - "loss": 0.2646, + "epoch": 1.5315169207481891, + "grad_norm": 0.1490999460220337, + "learning_rate": 4.064746727142108e-05, + "loss": 0.3611, "step": 42495 }, { - "epoch": 1.5, - "learning_rate": 4.109976115692842e-05, - "loss": 0.2901, + "epoch": 1.5316971204094136, + "grad_norm": 0.20667734742164612, + "learning_rate": 4.064519129963807e-05, + "loss": 0.3959, "step": 42500 }, { - "epoch": 1.5, - "eval_loss": 0.28041115403175354, - "eval_runtime": 10.5442, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 1.5316971204094136, + "eval_loss": 0.4439639151096344, + "eval_runtime": 3.531, + "eval_samples_per_second": 28.32, + "eval_steps_per_second": 7.08, "step": 42500 }, { - "epoch": 1.5, - "learning_rate": 4.109758168508304e-05, - "loss": 0.285, + "epoch": 1.5318773200706381, + "grad_norm": 0.18733155727386475, + "learning_rate": 4.064291511469237e-05, + "loss": 0.4323, "step": 42505 }, { - "epoch": 1.5, - "learning_rate": 4.1095402004221184e-05, - "loss": 0.2893, + "epoch": 1.5320575197318629, + "grad_norm": 0.17432504892349243, + "learning_rate": 4.064063871661497e-05, + "loss": 0.3858, "step": 42510 }, { - "epoch": 1.5, - "learning_rate": 4.1093222114371134e-05, - "loss": 0.2786, + "epoch": 1.5322377193930876, + "grad_norm": 0.16605962812900543, + "learning_rate": 4.06383621054369e-05, + "loss": 0.4115, "step": 42515 }, { - "epoch": 1.5, - "learning_rate": 4.10910420155612e-05, - "loss": 0.2808, + "epoch": 1.5324179190543121, + "grad_norm": 0.16073060035705566, + "learning_rate": 4.0636085281189176e-05, + "loss": 0.428, "step": 42520 }, { - "epoch": 1.5, - "learning_rate": 4.108886170781969e-05, - "loss": 0.2984, + "epoch": 1.5325981187155369, + "grad_norm": 0.18236148357391357, + "learning_rate": 4.063380824390282e-05, + "loss": 0.4489, "step": 42525 }, { - "epoch": 1.5, - "learning_rate": 4.108668119117491e-05, - "loss": 0.2864, + "epoch": 
1.5327783183767614, + "grad_norm": 0.16601453721523285, + "learning_rate": 4.063153099360884e-05, + "loss": 0.4184, "step": 42530 }, { - "epoch": 1.5, - "learning_rate": 4.1084500465655186e-05, - "loss": 0.2877, + "epoch": 1.532958518037986, + "grad_norm": 0.15551328659057617, + "learning_rate": 4.0629253530338284e-05, + "loss": 0.3619, "step": 42535 }, { - "epoch": 1.5, - "learning_rate": 4.1082319531288816e-05, - "loss": 0.2936, + "epoch": 1.5331387176992108, + "grad_norm": 0.2021326869726181, + "learning_rate": 4.062697585412218e-05, + "loss": 0.4062, "step": 42540 }, { - "epoch": 1.5, - "learning_rate": 4.108013838810413e-05, - "loss": 0.2693, + "epoch": 1.5333189173604354, + "grad_norm": 0.1801024228334427, + "learning_rate": 4.062469796499155e-05, + "loss": 0.398, "step": 42545 }, { - "epoch": 1.5, - "learning_rate": 4.1077957036129436e-05, - "loss": 0.2713, + "epoch": 1.5334991170216599, + "grad_norm": 0.1799817532300949, + "learning_rate": 4.062241986297743e-05, + "loss": 0.4576, "step": 42550 }, { - "epoch": 1.5, - "learning_rate": 4.1075775475393077e-05, - "loss": 0.2775, + "epoch": 1.5336793166828846, + "grad_norm": 0.22685213387012482, + "learning_rate": 4.062014154811087e-05, + "loss": 0.4469, "step": 42555 }, { - "epoch": 1.5, - "learning_rate": 4.107359370592335e-05, - "loss": 0.2905, + "epoch": 1.5338595163441093, + "grad_norm": 0.19450637698173523, + "learning_rate": 4.061786302042291e-05, + "loss": 0.4152, "step": 42560 }, { - "epoch": 1.5, - "learning_rate": 4.107141172774862e-05, - "loss": 0.2979, + "epoch": 1.534039716005334, + "grad_norm": 0.17256684601306915, + "learning_rate": 4.0615584279944586e-05, + "loss": 0.3805, "step": 42565 }, { - "epoch": 1.5, - "learning_rate": 4.106922954089719e-05, - "loss": 0.2906, + "epoch": 1.5342199156665586, + "grad_norm": 0.17024463415145874, + "learning_rate": 4.061330532670695e-05, + "loss": 0.4086, "step": 42570 }, { - "epoch": 1.5, - "learning_rate": 4.106704714539741e-05, - "loss": 0.2724, + "epoch": 1.534400115327783, + "grad_norm": 0.16583757102489471, + "learning_rate": 4.061102616074105e-05, + "loss": 0.4145, "step": 42575 }, { - "epoch": 1.5, - "learning_rate": 4.106486454127761e-05, - "loss": 0.2972, + "epoch": 1.5345803149890078, + "grad_norm": 0.1984453946352005, + "learning_rate": 4.060874678207794e-05, + "loss": 0.426, "step": 42580 }, { - "epoch": 1.5, - "learning_rate": 4.106268172856613e-05, - "loss": 0.2951, + "epoch": 1.5347605146502326, + "grad_norm": 0.192495658993721, + "learning_rate": 4.060646719074868e-05, + "loss": 0.4444, "step": 42585 }, { - "epoch": 1.5, - "learning_rate": 4.1060498707291314e-05, - "loss": 0.3015, + "epoch": 1.534940714311457, + "grad_norm": 0.1842796951532364, + "learning_rate": 4.060418738678432e-05, + "loss": 0.4038, "step": 42590 }, { - "epoch": 1.5, - "learning_rate": 4.1058315477481506e-05, - "loss": 0.2937, + "epoch": 1.5351209139726816, + "grad_norm": 0.17399680614471436, + "learning_rate": 4.060190737021594e-05, + "loss": 0.4175, "step": 42595 }, { - "epoch": 1.5, - "learning_rate": 4.105613203916505e-05, - "loss": 0.2863, + "epoch": 1.5353011136339063, + "grad_norm": 0.20878851413726807, + "learning_rate": 4.059962714107458e-05, + "loss": 0.4576, "step": 42600 }, { - "epoch": 1.5, - "learning_rate": 4.1053948392370315e-05, - "loss": 0.2809, + "epoch": 1.535481313295131, + "grad_norm": 0.19469203054904938, + "learning_rate": 4.059734669939133e-05, + "loss": 0.4215, "step": 42605 }, { - "epoch": 1.5, - "learning_rate": 4.105176453712563e-05, - "loss": 0.2942, + "epoch": 
1.5356615129563558, + "grad_norm": 0.1677183359861374, + "learning_rate": 4.0595066045197245e-05, + "loss": 0.4248, "step": 42610 }, { - "epoch": 1.5, - "learning_rate": 4.104958047345936e-05, - "loss": 0.2738, + "epoch": 1.5358417126175803, + "grad_norm": 0.18572258949279785, + "learning_rate": 4.059278517852341e-05, + "loss": 0.395, "step": 42615 }, { - "epoch": 1.5, - "learning_rate": 4.104739620139987e-05, - "loss": 0.2744, + "epoch": 1.5360219122788048, + "grad_norm": 0.2249453216791153, + "learning_rate": 4.059050409940089e-05, + "loss": 0.4284, "step": 42620 }, { - "epoch": 1.5, - "learning_rate": 4.1045211720975506e-05, - "loss": 0.2867, + "epoch": 1.5362021119400295, + "grad_norm": 0.17428001761436462, + "learning_rate": 4.058822280786077e-05, + "loss": 0.3935, "step": 42625 }, { - "epoch": 1.5, - "learning_rate": 4.1043027032214656e-05, - "loss": 0.2808, + "epoch": 1.5363823116012543, + "grad_norm": 0.1760479211807251, + "learning_rate": 4.058594130393414e-05, + "loss": 0.4689, "step": 42630 }, { - "epoch": 1.5, - "learning_rate": 4.104084213514566e-05, - "loss": 0.2839, + "epoch": 1.5365625112624788, + "grad_norm": 0.2455560564994812, + "learning_rate": 4.058365958765207e-05, + "loss": 0.4436, "step": 42635 }, { - "epoch": 1.5, - "learning_rate": 4.1038657029796904e-05, - "loss": 0.2892, + "epoch": 1.5367427109237035, + "grad_norm": 0.18493212759494781, + "learning_rate": 4.058137765904565e-05, + "loss": 0.4125, "step": 42640 }, { - "epoch": 1.5, - "learning_rate": 4.1036471716196755e-05, - "loss": 0.293, + "epoch": 1.536922910584928, + "grad_norm": 0.16667693853378296, + "learning_rate": 4.057909551814599e-05, + "loss": 0.3845, "step": 42645 }, { - "epoch": 1.5, - "learning_rate": 4.103428619437359e-05, - "loss": 0.301, + "epoch": 1.5371031102461528, + "grad_norm": 0.1610030233860016, + "learning_rate": 4.057681316498416e-05, + "loss": 0.4306, "step": 42650 }, { - "epoch": 1.5, - "learning_rate": 4.103210046435579e-05, - "loss": 0.2923, + "epoch": 1.5372833099073775, + "grad_norm": 0.19051054120063782, + "learning_rate": 4.0574530599591264e-05, + "loss": 0.3968, "step": 42655 }, { - "epoch": 1.5, - "learning_rate": 4.1029914526171734e-05, - "loss": 0.2789, + "epoch": 1.537463509568602, + "grad_norm": 0.19822818040847778, + "learning_rate": 4.0572247821998414e-05, + "loss": 0.4072, "step": 42660 }, { - "epoch": 1.5, - "learning_rate": 4.1027728379849794e-05, - "loss": 0.3124, + "epoch": 1.5376437092298265, + "grad_norm": 0.16644228994846344, + "learning_rate": 4.056996483223669e-05, + "loss": 0.4174, "step": 42665 }, { - "epoch": 1.5, - "learning_rate": 4.1025542025418365e-05, - "loss": 0.2955, + "epoch": 1.5378239088910512, + "grad_norm": 0.14218945801258087, + "learning_rate": 4.056768163033722e-05, + "loss": 0.3884, "step": 42670 }, { - "epoch": 1.5, - "learning_rate": 4.102335546290583e-05, - "loss": 0.266, + "epoch": 1.538004108552276, + "grad_norm": 0.21810731291770935, + "learning_rate": 4.05653982163311e-05, + "loss": 0.3874, "step": 42675 }, { - "epoch": 1.5, - "learning_rate": 4.102116869234059e-05, - "loss": 0.291, + "epoch": 1.5381843082135007, + "grad_norm": 0.184495210647583, + "learning_rate": 4.056311459024944e-05, + "loss": 0.4172, "step": 42680 }, { - "epoch": 1.5, - "learning_rate": 4.1018981713751035e-05, - "loss": 0.2801, + "epoch": 1.5383645078747252, + "grad_norm": 0.23121629655361176, + "learning_rate": 4.0560830752123355e-05, + "loss": 0.4142, "step": 42685 }, { - "epoch": 1.5, - "learning_rate": 4.101679452716556e-05, - "loss": 0.2897, + "epoch": 
1.5385447075359497, + "grad_norm": 0.1597655862569809, + "learning_rate": 4.0558546701983977e-05, + "loss": 0.4112, "step": 42690 }, { - "epoch": 1.5, - "learning_rate": 4.101460713261256e-05, - "loss": 0.2788, + "epoch": 1.5387249071971745, + "grad_norm": 0.17811793088912964, + "learning_rate": 4.05562624398624e-05, + "loss": 0.3963, "step": 42695 }, { - "epoch": 1.5, - "learning_rate": 4.101241953012044e-05, - "loss": 0.2689, + "epoch": 1.5389051068583992, + "grad_norm": 0.20575061440467834, + "learning_rate": 4.055397796578976e-05, + "loss": 0.4076, "step": 42700 }, { - "epoch": 1.5, - "learning_rate": 4.101023171971761e-05, - "loss": 0.2894, + "epoch": 1.5390853065196237, + "grad_norm": 0.1952335387468338, + "learning_rate": 4.055169327979719e-05, + "loss": 0.4558, "step": 42705 }, { - "epoch": 1.5, - "learning_rate": 4.1008043701432465e-05, - "loss": 0.2889, + "epoch": 1.5392655061808482, + "grad_norm": 0.1855412870645523, + "learning_rate": 4.0549408381915796e-05, + "loss": 0.4063, "step": 42710 }, { - "epoch": 1.5, - "learning_rate": 4.1005855475293426e-05, - "loss": 0.2879, + "epoch": 1.539445705842073, + "grad_norm": 0.16599588096141815, + "learning_rate": 4.054712327217673e-05, + "loss": 0.4303, "step": 42715 }, { - "epoch": 1.5, - "learning_rate": 4.10036670413289e-05, - "loss": 0.2864, + "epoch": 1.5396259055032977, + "grad_norm": 0.15664242208003998, + "learning_rate": 4.054483795061112e-05, + "loss": 0.3982, "step": 42720 }, { - "epoch": 1.5, - "learning_rate": 4.1001478399567305e-05, - "loss": 0.2618, + "epoch": 1.5398061051645224, + "grad_norm": 0.2023800015449524, + "learning_rate": 4.05425524172501e-05, + "loss": 0.4052, "step": 42725 }, { - "epoch": 1.5, - "learning_rate": 4.099928955003706e-05, - "loss": 0.2802, + "epoch": 1.539986304825747, + "grad_norm": 0.1691524088382721, + "learning_rate": 4.0540266672124814e-05, + "loss": 0.4235, "step": 42730 }, { - "epoch": 1.5, - "learning_rate": 4.099710049276658e-05, - "loss": 0.2785, + "epoch": 1.5401665044869715, + "grad_norm": 0.2135685384273529, + "learning_rate": 4.0537980715266404e-05, + "loss": 0.4041, "step": 42735 }, { - "epoch": 1.5, - "learning_rate": 4.0994911227784296e-05, - "loss": 0.2576, + "epoch": 1.5403467041481962, + "grad_norm": 0.16403359174728394, + "learning_rate": 4.0535694546706014e-05, + "loss": 0.3921, "step": 42740 }, { - "epoch": 1.5, - "learning_rate": 4.099272175511863e-05, - "loss": 0.3038, + "epoch": 1.540526903809421, + "grad_norm": 0.1725141555070877, + "learning_rate": 4.05334081664748e-05, + "loss": 0.4409, "step": 42745 }, { - "epoch": 1.5, - "learning_rate": 4.099053207479802e-05, - "loss": 0.312, + "epoch": 1.5407071034706454, + "grad_norm": 0.16590328514575958, + "learning_rate": 4.05311215746039e-05, + "loss": 0.4197, "step": 42750 }, { - "epoch": 1.5, - "learning_rate": 4.098834218685087e-05, - "loss": 0.3024, + "epoch": 1.5408873031318702, + "grad_norm": 0.20750023424625397, + "learning_rate": 4.0528834771124474e-05, + "loss": 0.4164, "step": 42755 }, { - "epoch": 1.5, - "learning_rate": 4.0986152091305645e-05, - "loss": 0.2952, + "epoch": 1.5410675027930947, + "grad_norm": 0.18303443491458893, + "learning_rate": 4.052654775606768e-05, + "loss": 0.4091, "step": 42760 }, { - "epoch": 1.5, - "learning_rate": 4.0983961788190774e-05, - "loss": 0.3006, + "epoch": 1.5412477024543194, + "grad_norm": 0.16997192800045013, + "learning_rate": 4.0524260529464695e-05, + "loss": 0.4316, "step": 42765 }, { - "epoch": 1.5, - "learning_rate": 4.0981771277534684e-05, - "loss": 0.322, + "epoch": 
1.5414279021155441, + "grad_norm": 0.17433921992778778, + "learning_rate": 4.052197309134665e-05, + "loss": 0.3874, "step": 42770 }, { - "epoch": 1.5, - "learning_rate": 4.0979580559365835e-05, - "loss": 0.2838, + "epoch": 1.5416081017767687, + "grad_norm": 0.1725977659225464, + "learning_rate": 4.051968544174473e-05, + "loss": 0.4378, "step": 42775 }, { - "epoch": 1.51, - "learning_rate": 4.097738963371266e-05, - "loss": 0.2807, + "epoch": 1.5417883014379932, + "grad_norm": 0.23443584144115448, + "learning_rate": 4.0517397580690096e-05, + "loss": 0.4625, "step": 42780 }, { - "epoch": 1.51, - "learning_rate": 4.097519850060361e-05, - "loss": 0.2987, + "epoch": 1.541968501099218, + "grad_norm": 0.22204741835594177, + "learning_rate": 4.051510950821393e-05, + "loss": 0.4225, "step": 42785 }, { - "epoch": 1.51, - "learning_rate": 4.097300716006713e-05, - "loss": 0.2989, + "epoch": 1.5421487007604426, + "grad_norm": 0.160122349858284, + "learning_rate": 4.05128212243474e-05, + "loss": 0.3939, "step": 42790 }, { - "epoch": 1.51, - "learning_rate": 4.097081561213169e-05, - "loss": 0.2826, + "epoch": 1.5423289004216674, + "grad_norm": 0.21652203798294067, + "learning_rate": 4.0510532729121684e-05, + "loss": 0.4134, "step": 42795 }, { - "epoch": 1.51, - "learning_rate": 4.096862385682572e-05, - "loss": 0.277, + "epoch": 1.5425091000828919, + "grad_norm": 0.2179877758026123, + "learning_rate": 4.0508244022567966e-05, + "loss": 0.4251, "step": 42800 }, { - "epoch": 1.51, - "learning_rate": 4.09664318941777e-05, - "loss": 0.2774, + "epoch": 1.5426892997441164, + "grad_norm": 0.16457681357860565, + "learning_rate": 4.050595510471742e-05, + "loss": 0.4323, "step": 42805 }, { - "epoch": 1.51, - "learning_rate": 4.0964239724216094e-05, - "loss": 0.2809, + "epoch": 1.5428694994053411, + "grad_norm": 0.1653529703617096, + "learning_rate": 4.050366597560124e-05, + "loss": 0.41, "step": 42810 }, { - "epoch": 1.51, - "learning_rate": 4.096204734696935e-05, - "loss": 0.2972, + "epoch": 1.5430496990665659, + "grad_norm": 0.16849032044410706, + "learning_rate": 4.0501376635250606e-05, + "loss": 0.4431, "step": 42815 }, { - "epoch": 1.51, - "learning_rate": 4.095985476246593e-05, - "loss": 0.2467, + "epoch": 1.5432298987277904, + "grad_norm": 0.17099571228027344, + "learning_rate": 4.049908708369673e-05, + "loss": 0.3836, "step": 42820 }, { - "epoch": 1.51, - "learning_rate": 4.0957661970734326e-05, - "loss": 0.3159, + "epoch": 1.5434100983890149, + "grad_norm": 0.198577880859375, + "learning_rate": 4.049679732097079e-05, + "loss": 0.3952, "step": 42825 }, { - "epoch": 1.51, - "learning_rate": 4.095546897180299e-05, - "loss": 0.2756, + "epoch": 1.5435902980502396, + "grad_norm": 0.1876094788312912, + "learning_rate": 4.049450734710398e-05, + "loss": 0.4149, "step": 42830 }, { - "epoch": 1.51, - "learning_rate": 4.095327576570041e-05, - "loss": 0.2908, + "epoch": 1.5437704977114644, + "grad_norm": 0.14950236678123474, + "learning_rate": 4.049221716212751e-05, + "loss": 0.3671, "step": 42835 }, { - "epoch": 1.51, - "learning_rate": 4.095108235245505e-05, - "loss": 0.3008, + "epoch": 1.543950697372689, + "grad_norm": 0.19585910439491272, + "learning_rate": 4.048992676607258e-05, + "loss": 0.4084, "step": 42840 }, { - "epoch": 1.51, - "learning_rate": 4.0948888732095405e-05, - "loss": 0.2964, + "epoch": 1.5441308970339136, + "grad_norm": 0.16137710213661194, + "learning_rate": 4.04876361589704e-05, + "loss": 0.4119, "step": 42845 }, { - "epoch": 1.51, - "learning_rate": 4.0946694904649954e-05, - "loss": 0.3011, + 
"epoch": 1.544311096695138, + "grad_norm": 0.20326700806617737, + "learning_rate": 4.048534534085218e-05, + "loss": 0.4127, "step": 42850 }, { - "epoch": 1.51, - "learning_rate": 4.0944500870147176e-05, - "loss": 0.2992, + "epoch": 1.5444912963563628, + "grad_norm": 0.16420388221740723, + "learning_rate": 4.0483054311749114e-05, + "loss": 0.3519, "step": 42855 }, { - "epoch": 1.51, - "learning_rate": 4.094230662861555e-05, - "loss": 0.2891, + "epoch": 1.5446714960175876, + "grad_norm": 0.17619933187961578, + "learning_rate": 4.048076307169244e-05, + "loss": 0.4249, "step": 42860 }, { - "epoch": 1.51, - "learning_rate": 4.094011218008359e-05, - "loss": 0.2977, + "epoch": 1.544851695678812, + "grad_norm": 0.1833731085062027, + "learning_rate": 4.047847162071336e-05, + "loss": 0.4124, "step": 42865 }, { - "epoch": 1.51, - "learning_rate": 4.093791752457978e-05, - "loss": 0.2708, + "epoch": 1.5450318953400366, + "grad_norm": 0.18540017306804657, + "learning_rate": 4.0476179958843105e-05, + "loss": 0.3995, "step": 42870 }, { - "epoch": 1.51, - "learning_rate": 4.093572266213261e-05, - "loss": 0.2906, + "epoch": 1.5452120950012613, + "grad_norm": 0.19920703768730164, + "learning_rate": 4.04738880861129e-05, + "loss": 0.4219, "step": 42875 }, { - "epoch": 1.51, - "learning_rate": 4.093352759277058e-05, - "loss": 0.2713, + "epoch": 1.545392294662486, + "grad_norm": 0.19860808551311493, + "learning_rate": 4.0471596002553956e-05, + "loss": 0.4153, "step": 42880 }, { - "epoch": 1.51, - "learning_rate": 4.0931332316522206e-05, - "loss": 0.2765, + "epoch": 1.5455724943237108, + "grad_norm": 0.1732819527387619, + "learning_rate": 4.0469303708197515e-05, + "loss": 0.4229, "step": 42885 }, { - "epoch": 1.51, - "learning_rate": 4.092913683341597e-05, - "loss": 0.3059, + "epoch": 1.5457526939849353, + "grad_norm": 0.19622696936130524, + "learning_rate": 4.0467011203074815e-05, + "loss": 0.4149, "step": 42890 }, { - "epoch": 1.51, - "learning_rate": 4.0926941143480397e-05, - "loss": 0.2758, + "epoch": 1.5459328936461598, + "grad_norm": 0.1721510887145996, + "learning_rate": 4.0464718487217066e-05, + "loss": 0.4413, "step": 42895 }, { - "epoch": 1.51, - "learning_rate": 4.092474524674399e-05, - "loss": 0.267, + "epoch": 1.5461130933073846, + "grad_norm": 0.1426592469215393, + "learning_rate": 4.046242556065553e-05, + "loss": 0.4244, "step": 42900 }, { - "epoch": 1.51, - "learning_rate": 4.092254914323526e-05, - "loss": 0.2707, + "epoch": 1.5462932929686093, + "grad_norm": 0.20957881212234497, + "learning_rate": 4.046013242342144e-05, + "loss": 0.4354, "step": 42905 }, { - "epoch": 1.51, - "learning_rate": 4.0920352832982714e-05, - "loss": 0.3051, + "epoch": 1.5464734926298338, + "grad_norm": 0.15983013808727264, + "learning_rate": 4.045783907554604e-05, + "loss": 0.3761, "step": 42910 }, { - "epoch": 1.51, - "learning_rate": 4.0918156316014884e-05, - "loss": 0.3182, + "epoch": 1.5466536922910585, + "grad_norm": 0.1735716015100479, + "learning_rate": 4.045554551706057e-05, + "loss": 0.3935, "step": 42915 }, { - "epoch": 1.51, - "learning_rate": 4.091595959236029e-05, - "loss": 0.3193, + "epoch": 1.546833891952283, + "grad_norm": 0.17203421890735626, + "learning_rate": 4.045325174799629e-05, + "loss": 0.4176, "step": 42920 }, { - "epoch": 1.51, - "learning_rate": 4.091376266204744e-05, - "loss": 0.2697, + "epoch": 1.5470140916135078, + "grad_norm": 0.17394539713859558, + "learning_rate": 4.0450957768384446e-05, + "loss": 0.3769, "step": 42925 }, { - "epoch": 1.51, - "learning_rate": 4.0911565525104876e-05, - 
"loss": 0.2995, + "epoch": 1.5471942912747325, + "grad_norm": 0.18221496045589447, + "learning_rate": 4.044866357825629e-05, + "loss": 0.3931, "step": 42930 }, { - "epoch": 1.51, - "learning_rate": 4.090936818156112e-05, - "loss": 0.2751, + "epoch": 1.547374490935957, + "grad_norm": 0.24642367660999298, + "learning_rate": 4.0446369177643085e-05, + "loss": 0.4427, "step": 42935 }, { - "epoch": 1.51, - "learning_rate": 4.0907170631444703e-05, - "loss": 0.3034, + "epoch": 1.5475546905971815, + "grad_norm": 0.2149946391582489, + "learning_rate": 4.044407456657609e-05, + "loss": 0.4279, "step": 42940 }, { - "epoch": 1.51, - "learning_rate": 4.090497287478415e-05, - "loss": 0.2835, + "epoch": 1.5477348902584063, + "grad_norm": 0.147264301776886, + "learning_rate": 4.0441779745086575e-05, + "loss": 0.3768, "step": 42945 }, { - "epoch": 1.51, - "learning_rate": 4.090277491160802e-05, - "loss": 0.2839, + "epoch": 1.547915089919631, + "grad_norm": 0.17553412914276123, + "learning_rate": 4.0439484713205795e-05, + "loss": 0.3712, "step": 42950 }, { - "epoch": 1.51, - "learning_rate": 4.090057674194483e-05, - "loss": 0.3095, + "epoch": 1.5480952895808557, + "grad_norm": 0.1730952113866806, + "learning_rate": 4.0437189470965026e-05, + "loss": 0.4062, "step": 42955 }, { - "epoch": 1.51, - "learning_rate": 4.089837836582313e-05, - "loss": 0.2662, + "epoch": 1.5482754892420802, + "grad_norm": 0.21400214731693268, + "learning_rate": 4.043489401839554e-05, + "loss": 0.4106, "step": 42960 }, { - "epoch": 1.51, - "learning_rate": 4.089617978327146e-05, - "loss": 0.2959, + "epoch": 1.5484556889033048, + "grad_norm": 0.1784030795097351, + "learning_rate": 4.0432598355528606e-05, + "loss": 0.4188, "step": 42965 }, { - "epoch": 1.51, - "learning_rate": 4.089398099431838e-05, - "loss": 0.3037, + "epoch": 1.5486358885645295, + "grad_norm": 0.2129783183336258, + "learning_rate": 4.043030248239551e-05, + "loss": 0.4148, "step": 42970 }, { - "epoch": 1.51, - "learning_rate": 4.089178199899243e-05, - "loss": 0.3023, + "epoch": 1.5488160882257542, + "grad_norm": 0.23545123636722565, + "learning_rate": 4.042800639902754e-05, + "loss": 0.422, "step": 42975 }, { - "epoch": 1.51, - "learning_rate": 4.0889582797322155e-05, - "loss": 0.2804, + "epoch": 1.5489962878869787, + "grad_norm": 0.1820184886455536, + "learning_rate": 4.042571010545596e-05, + "loss": 0.371, "step": 42980 }, { - "epoch": 1.51, - "learning_rate": 4.088738338933613e-05, - "loss": 0.2966, + "epoch": 1.5491764875482033, + "grad_norm": 0.19500315189361572, + "learning_rate": 4.0423413601712065e-05, + "loss": 0.4332, "step": 42985 }, { - "epoch": 1.51, - "learning_rate": 4.0885183775062895e-05, - "loss": 0.269, + "epoch": 1.549356687209428, + "grad_norm": 0.18859097361564636, + "learning_rate": 4.042111688782715e-05, + "loss": 0.4262, "step": 42990 }, { - "epoch": 1.51, - "learning_rate": 4.088298395453102e-05, - "loss": 0.274, + "epoch": 1.5495368868706527, + "grad_norm": 0.16476643085479736, + "learning_rate": 4.04188199638325e-05, + "loss": 0.4073, "step": 42995 }, { - "epoch": 1.51, - "learning_rate": 4.088078392776906e-05, - "loss": 0.2803, + "epoch": 1.5497170865318775, + "grad_norm": 0.1720295399427414, + "learning_rate": 4.041652282975942e-05, + "loss": 0.4289, "step": 43000 }, { - "epoch": 1.51, - "eval_loss": 0.2803913950920105, - "eval_runtime": 10.5528, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 1.5497170865318775, + "eval_loss": 0.4438794255256653, + "eval_runtime": 3.5307, + "eval_samples_per_second": 28.323, 
+ "eval_steps_per_second": 7.081, "step": 43000 }, { - "epoch": 1.51, - "learning_rate": 4.08785836948056e-05, - "loss": 0.2946, + "epoch": 1.549897286193102, + "grad_norm": 0.17833761870861053, + "learning_rate": 4.041422548563919e-05, + "loss": 0.4653, "step": 43005 }, { - "epoch": 1.51, - "learning_rate": 4.0876383255669184e-05, - "loss": 0.2873, + "epoch": 1.5500774858543265, + "grad_norm": 0.1873827427625656, + "learning_rate": 4.041192793150314e-05, + "loss": 0.4042, "step": 43010 }, { - "epoch": 1.51, - "learning_rate": 4.08741826103884e-05, - "loss": 0.2766, + "epoch": 1.5502576855155512, + "grad_norm": 0.20229150354862213, + "learning_rate": 4.040963016738254e-05, + "loss": 0.446, "step": 43015 }, { - "epoch": 1.51, - "learning_rate": 4.087198175899181e-05, - "loss": 0.295, + "epoch": 1.550437885176776, + "grad_norm": 0.17710404098033905, + "learning_rate": 4.040733219330871e-05, + "loss": 0.4281, "step": 43020 }, { - "epoch": 1.51, - "learning_rate": 4.0869780701508e-05, - "loss": 0.3029, + "epoch": 1.5506180848380005, + "grad_norm": 0.15827186405658722, + "learning_rate": 4.040503400931297e-05, + "loss": 0.3829, "step": 43025 }, { - "epoch": 1.51, - "learning_rate": 4.086757943796555e-05, - "loss": 0.2777, + "epoch": 1.5507982844992252, + "grad_norm": 0.18000894784927368, + "learning_rate": 4.040273561542662e-05, + "loss": 0.4088, "step": 43030 }, { - "epoch": 1.51, - "learning_rate": 4.086537796839304e-05, - "loss": 0.3146, + "epoch": 1.5509784841604497, + "grad_norm": 0.18159720301628113, + "learning_rate": 4.0400437011680986e-05, + "loss": 0.4461, "step": 43035 }, { - "epoch": 1.51, - "learning_rate": 4.086317629281904e-05, - "loss": 0.2991, + "epoch": 1.5511586838216744, + "grad_norm": 0.19357514381408691, + "learning_rate": 4.039813819810737e-05, + "loss": 0.4084, "step": 43040 }, { - "epoch": 1.51, - "learning_rate": 4.086097441127216e-05, - "loss": 0.2706, + "epoch": 1.5513388834828992, + "grad_norm": 0.16021384298801422, + "learning_rate": 4.039583917473711e-05, + "loss": 0.4259, "step": 43045 }, { - "epoch": 1.51, - "learning_rate": 4.085877232378098e-05, - "loss": 0.2813, + "epoch": 1.5515190831441237, + "grad_norm": 0.22351530194282532, + "learning_rate": 4.039353994160152e-05, + "loss": 0.4508, "step": 43050 }, { - "epoch": 1.51, - "learning_rate": 4.085657003037409e-05, - "loss": 0.3014, + "epoch": 1.5516992828053482, + "grad_norm": 0.1790456771850586, + "learning_rate": 4.039124049873193e-05, + "loss": 0.435, "step": 43055 }, { - "epoch": 1.51, - "learning_rate": 4.0854367531080086e-05, - "loss": 0.2965, + "epoch": 1.551879482466573, + "grad_norm": 0.1445944756269455, + "learning_rate": 4.038894084615966e-05, + "loss": 0.4111, "step": 43060 }, { - "epoch": 1.52, - "learning_rate": 4.085216482592757e-05, - "loss": 0.2942, + "epoch": 1.5520596821277977, + "grad_norm": 0.1887555867433548, + "learning_rate": 4.038664098391606e-05, + "loss": 0.4236, "step": 43065 }, { - "epoch": 1.52, - "learning_rate": 4.0849961914945146e-05, - "loss": 0.291, + "epoch": 1.5522398817890224, + "grad_norm": 0.14658775925636292, + "learning_rate": 4.038434091203245e-05, + "loss": 0.4447, "step": 43070 }, { - "epoch": 1.52, - "learning_rate": 4.084775879816141e-05, - "loss": 0.2719, + "epoch": 1.552420081450247, + "grad_norm": 0.192291259765625, + "learning_rate": 4.038204063054017e-05, + "loss": 0.468, "step": 43075 }, { - "epoch": 1.52, - "learning_rate": 4.0845555475604966e-05, - "loss": 0.2727, + "epoch": 1.5526002811114714, + "grad_norm": 0.15899904072284698, + "learning_rate": 
4.037974013947058e-05, + "loss": 0.377, "step": 43080 }, { - "epoch": 1.52, - "learning_rate": 4.0843351947304425e-05, - "loss": 0.2802, + "epoch": 1.5527804807726961, + "grad_norm": 0.182462677359581, + "learning_rate": 4.037743943885499e-05, + "loss": 0.4033, "step": 43085 }, { - "epoch": 1.52, - "learning_rate": 4.084114821328841e-05, - "loss": 0.2944, + "epoch": 1.5529606804339209, + "grad_norm": 0.17787639796733856, + "learning_rate": 4.037513852872478e-05, + "loss": 0.416, "step": 43090 }, { - "epoch": 1.52, - "learning_rate": 4.083894427358551e-05, - "loss": 0.2627, + "epoch": 1.5531408800951454, + "grad_norm": 0.1610783487558365, + "learning_rate": 4.037283740911128e-05, + "loss": 0.4039, "step": 43095 }, { - "epoch": 1.52, - "learning_rate": 4.0836740128224374e-05, - "loss": 0.3068, + "epoch": 1.55332107975637, + "grad_norm": 0.15985815227031708, + "learning_rate": 4.037053608004584e-05, + "loss": 0.4483, "step": 43100 }, { - "epoch": 1.52, - "learning_rate": 4.083453577723359e-05, - "loss": 0.2858, + "epoch": 1.5535012794175946, + "grad_norm": 0.21218977868556976, + "learning_rate": 4.036823454155982e-05, + "loss": 0.4047, "step": 43105 }, { - "epoch": 1.52, - "learning_rate": 4.083233122064179e-05, - "loss": 0.304, + "epoch": 1.5536814790788194, + "grad_norm": 0.1868336945772171, + "learning_rate": 4.036593279368458e-05, + "loss": 0.4103, "step": 43110 }, { - "epoch": 1.52, - "learning_rate": 4.083012645847762e-05, - "loss": 0.3016, + "epoch": 1.553861678740044, + "grad_norm": 0.24280469119548798, + "learning_rate": 4.0363630836451496e-05, + "loss": 0.4188, "step": 43115 }, { - "epoch": 1.52, - "learning_rate": 4.0827921490769683e-05, - "loss": 0.2655, + "epoch": 1.5540418784012686, + "grad_norm": 0.1968899369239807, + "learning_rate": 4.036132866989191e-05, + "loss": 0.419, "step": 43120 }, { - "epoch": 1.52, - "learning_rate": 4.0825716317546616e-05, - "loss": 0.3103, + "epoch": 1.5542220780624931, + "grad_norm": 0.19591078162193298, + "learning_rate": 4.035902629403718e-05, + "loss": 0.4057, "step": 43125 }, { - "epoch": 1.52, - "learning_rate": 4.0823510938837055e-05, - "loss": 0.2703, + "epoch": 1.5544022777237179, + "grad_norm": 0.1963011920452118, + "learning_rate": 4.0356723708918705e-05, + "loss": 0.394, "step": 43130 }, { - "epoch": 1.52, - "learning_rate": 4.0821305354669624e-05, - "loss": 0.2977, + "epoch": 1.5545824773849426, + "grad_norm": 0.1776949167251587, + "learning_rate": 4.035442091456784e-05, + "loss": 0.3855, "step": 43135 }, { - "epoch": 1.52, - "learning_rate": 4.081909956507298e-05, - "loss": 0.2772, + "epoch": 1.554762677046167, + "grad_norm": 0.21400536596775055, + "learning_rate": 4.035211791101596e-05, + "loss": 0.4177, "step": 43140 }, { - "epoch": 1.52, - "learning_rate": 4.081689357007575e-05, - "loss": 0.2902, + "epoch": 1.5549428767073918, + "grad_norm": 0.22430282831192017, + "learning_rate": 4.034981469829445e-05, + "loss": 0.444, "step": 43145 }, { - "epoch": 1.52, - "learning_rate": 4.081468736970658e-05, - "loss": 0.2871, + "epoch": 1.5551230763686164, + "grad_norm": 0.1932390183210373, + "learning_rate": 4.034751127643468e-05, + "loss": 0.3992, "step": 43150 }, { - "epoch": 1.52, - "learning_rate": 4.081248096399412e-05, - "loss": 0.2702, + "epoch": 1.555303276029841, + "grad_norm": 0.20954085886478424, + "learning_rate": 4.034520764546805e-05, + "loss": 0.4377, "step": 43155 }, { - "epoch": 1.52, - "learning_rate": 4.081027435296701e-05, - "loss": 0.2834, + "epoch": 1.5554834756910658, + "grad_norm": 0.21870118379592896, + 
"learning_rate": 4.034290380542593e-05, + "loss": 0.4406, "step": 43160 }, { - "epoch": 1.52, - "learning_rate": 4.08080675366539e-05, - "loss": 0.2735, + "epoch": 1.5556636753522903, + "grad_norm": 0.19834262132644653, + "learning_rate": 4.0340599756339715e-05, + "loss": 0.4218, "step": 43165 }, { - "epoch": 1.52, - "learning_rate": 4.080586051508347e-05, - "loss": 0.2994, + "epoch": 1.5558438750135148, + "grad_norm": 0.1412007212638855, + "learning_rate": 4.03382954982408e-05, + "loss": 0.3994, "step": 43170 }, { - "epoch": 1.52, - "learning_rate": 4.080365328828435e-05, - "loss": 0.2786, + "epoch": 1.5560240746747396, + "grad_norm": 0.15708720684051514, + "learning_rate": 4.0335991031160584e-05, + "loss": 0.3901, "step": 43175 }, { - "epoch": 1.52, - "learning_rate": 4.0801445856285205e-05, - "loss": 0.2774, + "epoch": 1.5562042743359643, + "grad_norm": 0.13734932243824005, + "learning_rate": 4.0333686355130454e-05, + "loss": 0.4384, "step": 43180 }, { - "epoch": 1.52, - "learning_rate": 4.07992382191147e-05, - "loss": 0.2731, + "epoch": 1.556384473997189, + "grad_norm": 0.2183203250169754, + "learning_rate": 4.033138147018181e-05, + "loss": 0.4417, "step": 43185 }, { - "epoch": 1.52, - "learning_rate": 4.079703037680149e-05, - "loss": 0.2878, + "epoch": 1.5565646736584136, + "grad_norm": 0.20702525973320007, + "learning_rate": 4.0329076376346063e-05, + "loss": 0.4323, "step": 43190 }, { - "epoch": 1.52, - "learning_rate": 4.079482232937426e-05, - "loss": 0.2821, + "epoch": 1.556744873319638, + "grad_norm": 0.20246562361717224, + "learning_rate": 4.032677107365463e-05, + "loss": 0.4611, "step": 43195 }, { - "epoch": 1.52, - "learning_rate": 4.079261407686167e-05, - "loss": 0.2589, + "epoch": 1.5569250729808628, + "grad_norm": 0.15538166463375092, + "learning_rate": 4.0324465562138905e-05, + "loss": 0.4066, "step": 43200 }, { - "epoch": 1.52, - "learning_rate": 4.079040561929239e-05, - "loss": 0.2859, + "epoch": 1.5571052726420875, + "grad_norm": 0.1648663729429245, + "learning_rate": 4.03221598418303e-05, + "loss": 0.4347, "step": 43205 }, { - "epoch": 1.52, - "learning_rate": 4.078819695669511e-05, - "loss": 0.2743, + "epoch": 1.557285472303312, + "grad_norm": 0.2193061113357544, + "learning_rate": 4.031985391276023e-05, + "loss": 0.412, "step": 43210 }, { - "epoch": 1.52, - "learning_rate": 4.0785988089098494e-05, - "loss": 0.2989, + "epoch": 1.5574656719645366, + "grad_norm": 0.1736992746591568, + "learning_rate": 4.031754777496012e-05, + "loss": 0.4344, "step": 43215 }, { - "epoch": 1.52, - "learning_rate": 4.078377901653122e-05, - "loss": 0.288, + "epoch": 1.5576458716257613, + "grad_norm": 0.18772049248218536, + "learning_rate": 4.031524142846139e-05, + "loss": 0.4455, "step": 43220 }, { - "epoch": 1.52, - "learning_rate": 4.078156973902198e-05, - "loss": 0.2737, + "epoch": 1.557826071286986, + "grad_norm": 0.185786172747612, + "learning_rate": 4.031293487329546e-05, + "loss": 0.4187, "step": 43225 }, { - "epoch": 1.52, - "learning_rate": 4.077936025659945e-05, - "loss": 0.3033, + "epoch": 1.5580062709482108, + "grad_norm": 0.21457712352275848, + "learning_rate": 4.031062810949375e-05, + "loss": 0.4008, "step": 43230 }, { - "epoch": 1.52, - "learning_rate": 4.0777150569292345e-05, - "loss": 0.2956, + "epoch": 1.5581864706094353, + "grad_norm": 0.16875572502613068, + "learning_rate": 4.03083211370877e-05, + "loss": 0.4041, "step": 43235 }, { - "epoch": 1.52, - "learning_rate": 4.0774940677129316e-05, - "loss": 0.2846, + "epoch": 1.5583666702706598, + "grad_norm": 
0.1875239759683609, + "learning_rate": 4.0306013956108747e-05, + "loss": 0.4421, "step": 43240 }, { - "epoch": 1.52, - "learning_rate": 4.07727305801391e-05, - "loss": 0.3056, + "epoch": 1.5585468699318845, + "grad_norm": 0.1823228895664215, + "learning_rate": 4.030370656658831e-05, + "loss": 0.3946, "step": 43245 }, { - "epoch": 1.52, - "learning_rate": 4.0770520278350354e-05, - "loss": 0.2888, + "epoch": 1.5587270695931092, + "grad_norm": 0.1981915831565857, + "learning_rate": 4.030139896855783e-05, + "loss": 0.4127, "step": 43250 }, { - "epoch": 1.52, - "learning_rate": 4.07683097717918e-05, - "loss": 0.2872, + "epoch": 1.5589072692543338, + "grad_norm": 0.17064636945724487, + "learning_rate": 4.029909116204875e-05, + "loss": 0.3918, "step": 43255 }, { - "epoch": 1.52, - "learning_rate": 4.076609906049214e-05, - "loss": 0.2991, + "epoch": 1.5590874689155585, + "grad_norm": 0.21734385192394257, + "learning_rate": 4.0296783147092527e-05, + "loss": 0.4109, "step": 43260 }, { - "epoch": 1.52, - "learning_rate": 4.0763888144480075e-05, - "loss": 0.292, + "epoch": 1.559267668576783, + "grad_norm": 0.18934614956378937, + "learning_rate": 4.029447492372059e-05, + "loss": 0.407, "step": 43265 }, { - "epoch": 1.52, - "learning_rate": 4.076167702378431e-05, - "loss": 0.2684, + "epoch": 1.5594478682380077, + "grad_norm": 0.22457720339298248, + "learning_rate": 4.029216649196439e-05, + "loss": 0.428, "step": 43270 }, { - "epoch": 1.52, - "learning_rate": 4.0759465698433545e-05, - "loss": 0.2921, + "epoch": 1.5596280678992325, + "grad_norm": 0.16613884270191193, + "learning_rate": 4.028985785185538e-05, + "loss": 0.4337, "step": 43275 }, { - "epoch": 1.52, - "learning_rate": 4.0757254168456514e-05, - "loss": 0.2779, + "epoch": 1.559808267560457, + "grad_norm": 0.1653304249048233, + "learning_rate": 4.0287549003425026e-05, + "loss": 0.3973, "step": 43280 }, { - "epoch": 1.52, - "learning_rate": 4.075504243388192e-05, - "loss": 0.2974, + "epoch": 1.5599884672216815, + "grad_norm": 0.18372784554958344, + "learning_rate": 4.028523994670477e-05, + "loss": 0.4102, "step": 43285 }, { - "epoch": 1.52, - "learning_rate": 4.075283049473848e-05, - "loss": 0.2597, + "epoch": 1.5601686668829062, + "grad_norm": 0.1557554453611374, + "learning_rate": 4.028293068172608e-05, + "loss": 0.417, "step": 43290 }, { - "epoch": 1.52, - "learning_rate": 4.0750618351054924e-05, - "loss": 0.268, + "epoch": 1.560348866544131, + "grad_norm": 0.17378360033035278, + "learning_rate": 4.028062120852042e-05, + "loss": 0.4113, "step": 43295 }, { - "epoch": 1.52, - "learning_rate": 4.0748406002859963e-05, - "loss": 0.2818, + "epoch": 1.5605290662053557, + "grad_norm": 0.17926354706287384, + "learning_rate": 4.027831152711925e-05, + "loss": 0.402, "step": 43300 }, { - "epoch": 1.52, - "learning_rate": 4.074619345018233e-05, - "loss": 0.2867, + "epoch": 1.5607092658665802, + "grad_norm": 0.17073003947734833, + "learning_rate": 4.027600163755405e-05, + "loss": 0.4631, "step": 43305 }, { - "epoch": 1.52, - "learning_rate": 4.0743980693050753e-05, - "loss": 0.2596, + "epoch": 1.5608894655278047, + "grad_norm": 0.17934848368167877, + "learning_rate": 4.027369153985628e-05, + "loss": 0.4058, "step": 43310 }, { - "epoch": 1.52, - "learning_rate": 4.074176773149396e-05, - "loss": 0.3037, + "epoch": 1.5610696651890295, + "grad_norm": 0.16843603551387787, + "learning_rate": 4.0271381234057426e-05, + "loss": 0.4345, "step": 43315 }, { - "epoch": 1.52, - "learning_rate": 4.0739554565540697e-05, - "loss": 0.2669, + "epoch": 1.5612498648502542, + 
"grad_norm": 0.1804237961769104, + "learning_rate": 4.026907072018896e-05, + "loss": 0.4171, "step": 43320 }, { - "epoch": 1.52, - "learning_rate": 4.073734119521968e-05, - "loss": 0.2882, + "epoch": 1.5614300645114787, + "grad_norm": 0.18647703528404236, + "learning_rate": 4.0266759998282355e-05, + "loss": 0.4483, "step": 43325 }, { - "epoch": 1.52, - "learning_rate": 4.0735127620559664e-05, - "loss": 0.3036, + "epoch": 1.5616102641727032, + "grad_norm": 0.15589216351509094, + "learning_rate": 4.026444906836909e-05, + "loss": 0.3917, "step": 43330 }, { - "epoch": 1.52, - "learning_rate": 4.0732913841589385e-05, - "loss": 0.2811, + "epoch": 1.561790463833928, + "grad_norm": 0.2177332192659378, + "learning_rate": 4.026213793048068e-05, + "loss": 0.4652, "step": 43335 }, { - "epoch": 1.52, - "learning_rate": 4.073069985833758e-05, - "loss": 0.2856, + "epoch": 1.5619706634951527, + "grad_norm": 0.22540467977523804, + "learning_rate": 4.0259826584648596e-05, + "loss": 0.4222, "step": 43340 }, { - "epoch": 1.52, - "learning_rate": 4.072848567083302e-05, - "loss": 0.3084, + "epoch": 1.5621508631563774, + "grad_norm": 0.1936839520931244, + "learning_rate": 4.025751503090432e-05, + "loss": 0.4248, "step": 43345 }, { - "epoch": 1.53, - "learning_rate": 4.0726271279104435e-05, - "loss": 0.2959, + "epoch": 1.562331062817602, + "grad_norm": 0.20749831199645996, + "learning_rate": 4.025520326927936e-05, + "loss": 0.4026, "step": 43350 }, { - "epoch": 1.53, - "learning_rate": 4.072405668318057e-05, - "loss": 0.2694, + "epoch": 1.5625112624788264, + "grad_norm": 0.1811009794473648, + "learning_rate": 4.025289129980521e-05, + "loss": 0.3812, "step": 43355 }, { - "epoch": 1.53, - "learning_rate": 4.07218418830902e-05, - "loss": 0.3004, + "epoch": 1.5626914621400512, + "grad_norm": 0.1572161763906479, + "learning_rate": 4.025057912251337e-05, + "loss": 0.4306, "step": 43360 }, { - "epoch": 1.53, - "learning_rate": 4.0719626878862074e-05, - "loss": 0.246, + "epoch": 1.562871661801276, + "grad_norm": 0.18943502008914948, + "learning_rate": 4.024826673743533e-05, + "loss": 0.3895, "step": 43365 }, { - "epoch": 1.53, - "learning_rate": 4.071741167052495e-05, - "loss": 0.2825, + "epoch": 1.5630518614625004, + "grad_norm": 0.2089279443025589, + "learning_rate": 4.024595414460261e-05, + "loss": 0.4108, "step": 43370 }, { - "epoch": 1.53, - "learning_rate": 4.071519625810759e-05, - "loss": 0.2835, + "epoch": 1.563232061123725, + "grad_norm": 0.16804815828800201, + "learning_rate": 4.0243641344046725e-05, + "loss": 0.3943, "step": 43375 }, { - "epoch": 1.53, - "learning_rate": 4.0712980641638774e-05, - "loss": 0.2787, + "epoch": 1.5634122607849497, + "grad_norm": 0.19758044183254242, + "learning_rate": 4.0241328335799185e-05, + "loss": 0.4298, "step": 43380 }, { - "epoch": 1.53, - "learning_rate": 4.071076482114725e-05, - "loss": 0.27, + "epoch": 1.5635924604461744, + "grad_norm": 0.18327586352825165, + "learning_rate": 4.023901511989149e-05, + "loss": 0.4356, "step": 43385 }, { - "epoch": 1.53, - "learning_rate": 4.0708548796661805e-05, - "loss": 0.3059, + "epoch": 1.5637726601073991, + "grad_norm": 0.18740832805633545, + "learning_rate": 4.023670169635516e-05, + "loss": 0.4263, "step": 43390 }, { - "epoch": 1.53, - "learning_rate": 4.07063325682112e-05, - "loss": 0.2701, + "epoch": 1.5639528597686236, + "grad_norm": 0.18278846144676208, + "learning_rate": 4.0234388065221716e-05, + "loss": 0.3991, "step": 43395 }, { - "epoch": 1.53, - "learning_rate": 4.0704116135824226e-05, - "loss": 0.3022, + "epoch": 
1.5641330594298481, + "grad_norm": 0.137952983379364, + "learning_rate": 4.02320742265227e-05, + "loss": 0.4308, "step": 43400 }, { - "epoch": 1.53, - "learning_rate": 4.070189949952965e-05, - "loss": 0.2763, + "epoch": 1.5643132590910729, + "grad_norm": 0.16237494349479675, + "learning_rate": 4.0229760180289604e-05, + "loss": 0.3995, "step": 43405 }, { - "epoch": 1.53, - "learning_rate": 4.0699682659356254e-05, - "loss": 0.3014, + "epoch": 1.5644934587522976, + "grad_norm": 0.1925591230392456, + "learning_rate": 4.022744592655398e-05, + "loss": 0.4095, "step": 43410 }, { - "epoch": 1.53, - "learning_rate": 4.069746561533282e-05, - "loss": 0.3238, + "epoch": 1.5646736584135221, + "grad_norm": 0.17975488305091858, + "learning_rate": 4.022513146534735e-05, + "loss": 0.397, "step": 43415 }, { - "epoch": 1.53, - "learning_rate": 4.069524836748815e-05, - "loss": 0.2848, + "epoch": 1.5648538580747469, + "grad_norm": 0.19634199142456055, + "learning_rate": 4.022281679670127e-05, + "loss": 0.4361, "step": 43420 }, { - "epoch": 1.53, - "learning_rate": 4.069303091585102e-05, - "loss": 0.2805, + "epoch": 1.5650340577359714, + "grad_norm": 0.22516696155071259, + "learning_rate": 4.022050192064724e-05, + "loss": 0.4091, "step": 43425 }, { - "epoch": 1.53, - "learning_rate": 4.069081326045023e-05, - "loss": 0.2921, + "epoch": 1.565214257397196, + "grad_norm": 0.19511666893959045, + "learning_rate": 4.021818683721682e-05, + "loss": 0.4301, "step": 43430 }, { - "epoch": 1.53, - "learning_rate": 4.068859540131457e-05, - "loss": 0.277, + "epoch": 1.5653944570584208, + "grad_norm": 0.19139276444911957, + "learning_rate": 4.021587154644156e-05, + "loss": 0.4132, "step": 43435 }, { - "epoch": 1.53, - "learning_rate": 4.068637733847283e-05, - "loss": 0.2988, + "epoch": 1.5655746567196454, + "grad_norm": 0.18717695772647858, + "learning_rate": 4.021355604835299e-05, + "loss": 0.4059, "step": 43440 }, { - "epoch": 1.53, - "learning_rate": 4.068415907195382e-05, - "loss": 0.295, + "epoch": 1.5657548563808699, + "grad_norm": 0.16392220556735992, + "learning_rate": 4.0211240342982656e-05, + "loss": 0.4349, "step": 43445 }, { - "epoch": 1.53, - "learning_rate": 4.0681940601786345e-05, - "loss": 0.2782, + "epoch": 1.5659350560420946, + "grad_norm": 0.23648183047771454, + "learning_rate": 4.0208924430362126e-05, + "loss": 0.4023, "step": 43450 }, { - "epoch": 1.53, - "learning_rate": 4.06797219279992e-05, - "loss": 0.2732, + "epoch": 1.5661152557033193, + "grad_norm": 0.23210176825523376, + "learning_rate": 4.020660831052295e-05, + "loss": 0.4303, "step": 43455 }, { - "epoch": 1.53, - "learning_rate": 4.06775030506212e-05, - "loss": 0.3081, + "epoch": 1.566295455364544, + "grad_norm": 0.18653087317943573, + "learning_rate": 4.020429198349667e-05, + "loss": 0.4452, "step": 43460 }, { - "epoch": 1.53, - "learning_rate": 4.067528396968116e-05, - "loss": 0.3017, + "epoch": 1.5664756550257686, + "grad_norm": 0.17263540625572205, + "learning_rate": 4.0201975449314865e-05, + "loss": 0.4231, "step": 43465 }, { - "epoch": 1.53, - "learning_rate": 4.067306468520788e-05, - "loss": 0.2942, + "epoch": 1.566655854686993, + "grad_norm": 0.2012946456670761, + "learning_rate": 4.019965870800908e-05, + "loss": 0.4403, "step": 43470 }, { - "epoch": 1.53, - "learning_rate": 4.06708451972302e-05, - "loss": 0.3089, + "epoch": 1.5668360543482178, + "grad_norm": 0.2045126110315323, + "learning_rate": 4.019734175961089e-05, + "loss": 0.4306, "step": 43475 }, { - "epoch": 1.53, - "learning_rate": 4.0668625505776906e-05, - "loss": 0.2851, + 
"epoch": 1.5670162540094426, + "grad_norm": 0.15547120571136475, + "learning_rate": 4.019502460415186e-05, + "loss": 0.4273, "step": 43480 }, { - "epoch": 1.53, - "learning_rate": 4.066640561087684e-05, - "loss": 0.2862, + "epoch": 1.567196453670667, + "grad_norm": 0.17727886140346527, + "learning_rate": 4.0192707241663567e-05, + "loss": 0.4196, "step": 43485 }, { - "epoch": 1.53, - "learning_rate": 4.066418551255883e-05, - "loss": 0.2883, + "epoch": 1.5673766533318916, + "grad_norm": 0.16581672430038452, + "learning_rate": 4.0190389672177575e-05, + "loss": 0.3979, "step": 43490 }, { - "epoch": 1.53, - "learning_rate": 4.066196521085169e-05, - "loss": 0.2865, + "epoch": 1.5675568529931163, + "grad_norm": 0.1664031445980072, + "learning_rate": 4.0188071895725466e-05, + "loss": 0.4289, "step": 43495 }, { - "epoch": 1.53, - "learning_rate": 4.0659744705784254e-05, - "loss": 0.2946, + "epoch": 1.567737052654341, + "grad_norm": 0.17638015747070312, + "learning_rate": 4.018575391233882e-05, + "loss": 0.4044, "step": 43500 }, { - "epoch": 1.53, - "eval_loss": 0.27933764457702637, - "eval_runtime": 10.6369, - "eval_samples_per_second": 9.401, - "eval_steps_per_second": 9.401, + "epoch": 1.567737052654341, + "eval_loss": 0.44283291697502136, + "eval_runtime": 3.524, + "eval_samples_per_second": 28.377, + "eval_steps_per_second": 7.094, "step": 43500 }, { - "epoch": 1.53, - "learning_rate": 4.065752399738535e-05, - "loss": 0.2491, + "epoch": 1.5679172523155658, + "grad_norm": 0.19836536049842834, + "learning_rate": 4.018343572204921e-05, + "loss": 0.4234, "step": 43505 }, { - "epoch": 1.53, - "learning_rate": 4.065530308568381e-05, - "loss": 0.2951, + "epoch": 1.5680974519767903, + "grad_norm": 0.1390867680311203, + "learning_rate": 4.018111732488823e-05, + "loss": 0.386, "step": 43510 }, { - "epoch": 1.53, - "learning_rate": 4.0653081970708486e-05, - "loss": 0.2764, + "epoch": 1.5682776516380148, + "grad_norm": 0.2248731255531311, + "learning_rate": 4.0178798720887465e-05, + "loss": 0.4143, "step": 43515 }, { - "epoch": 1.53, - "learning_rate": 4.065086065248821e-05, - "loss": 0.2887, + "epoch": 1.5684578512992395, + "grad_norm": 0.149456188082695, + "learning_rate": 4.017647991007851e-05, + "loss": 0.3873, "step": 43520 }, { - "epoch": 1.53, - "learning_rate": 4.0648639131051815e-05, - "loss": 0.272, + "epoch": 1.5686380509604643, + "grad_norm": 0.20427654683589935, + "learning_rate": 4.017416089249296e-05, + "loss": 0.4319, "step": 43525 }, { - "epoch": 1.53, - "learning_rate": 4.064641740642815e-05, - "loss": 0.2902, + "epoch": 1.5688182506216888, + "grad_norm": 0.18678858876228333, + "learning_rate": 4.017184166816239e-05, + "loss": 0.3849, "step": 43530 }, { - "epoch": 1.53, - "learning_rate": 4.064419547864607e-05, - "loss": 0.2942, + "epoch": 1.5689984502829135, + "grad_norm": 0.14671590924263, + "learning_rate": 4.0169522237118426e-05, + "loss": 0.3873, "step": 43535 }, { - "epoch": 1.53, - "learning_rate": 4.064197334773443e-05, - "loss": 0.2664, + "epoch": 1.569178649944138, + "grad_norm": 0.1654651165008545, + "learning_rate": 4.0167202599392656e-05, + "loss": 0.4051, "step": 43540 }, { - "epoch": 1.53, - "learning_rate": 4.063975101372207e-05, - "loss": 0.2929, + "epoch": 1.5693588496053628, + "grad_norm": 0.17216157913208008, + "learning_rate": 4.0164882755016685e-05, + "loss": 0.4207, "step": 43545 }, { - "epoch": 1.53, - "learning_rate": 4.063752847663785e-05, - "loss": 0.2745, + "epoch": 1.5695390492665875, + "grad_norm": 0.1685381382703781, + "learning_rate": 
4.0162562704022124e-05, + "loss": 0.3873, "step": 43550 }, { - "epoch": 1.53, - "learning_rate": 4.063530573651063e-05, - "loss": 0.3009, + "epoch": 1.569719248927812, + "grad_norm": 0.16307340562343597, + "learning_rate": 4.0160242446440584e-05, + "loss": 0.3893, "step": 43555 }, { - "epoch": 1.53, - "learning_rate": 4.0633082793369256e-05, - "loss": 0.3144, + "epoch": 1.5698994485890365, + "grad_norm": 0.2021232396364212, + "learning_rate": 4.015792198230367e-05, + "loss": 0.4492, "step": 43560 }, { - "epoch": 1.53, - "learning_rate": 4.063085964724263e-05, - "loss": 0.2595, + "epoch": 1.5700796482502613, + "grad_norm": 0.1811605542898178, + "learning_rate": 4.0155601311643006e-05, + "loss": 0.3913, "step": 43565 }, { - "epoch": 1.53, - "learning_rate": 4.062863629815957e-05, - "loss": 0.2784, + "epoch": 1.570259847911486, + "grad_norm": 0.14910484850406647, + "learning_rate": 4.015328043449021e-05, + "loss": 0.4193, "step": 43570 }, { - "epoch": 1.53, - "learning_rate": 4.062641274614897e-05, - "loss": 0.2907, + "epoch": 1.5704400475727107, + "grad_norm": 0.24436582624912262, + "learning_rate": 4.0150959350876903e-05, + "loss": 0.432, "step": 43575 }, { - "epoch": 1.53, - "learning_rate": 4.062418899123971e-05, - "loss": 0.2899, + "epoch": 1.5706202472339352, + "grad_norm": 0.1627466231584549, + "learning_rate": 4.014863806083471e-05, + "loss": 0.4316, "step": 43580 }, { - "epoch": 1.53, - "learning_rate": 4.062196503346064e-05, - "loss": 0.2848, + "epoch": 1.5708004468951597, + "grad_norm": 0.18843139708042145, + "learning_rate": 4.0146316564395254e-05, + "loss": 0.4146, "step": 43585 }, { - "epoch": 1.53, - "learning_rate": 4.061974087284066e-05, - "loss": 0.2721, + "epoch": 1.5709806465563845, + "grad_norm": 0.17627915740013123, + "learning_rate": 4.014399486159016e-05, + "loss": 0.444, "step": 43590 }, { - "epoch": 1.53, - "learning_rate": 4.0617516509408636e-05, - "loss": 0.2915, + "epoch": 1.5711608462176092, + "grad_norm": 0.2867834270000458, + "learning_rate": 4.014167295245108e-05, + "loss": 0.4179, "step": 43595 }, { - "epoch": 1.53, - "learning_rate": 4.0615291943193446e-05, - "loss": 0.2946, + "epoch": 1.5713410458788337, + "grad_norm": 0.16591203212738037, + "learning_rate": 4.013935083700963e-05, + "loss": 0.4026, "step": 43600 }, { - "epoch": 1.53, - "learning_rate": 4.0613067174223987e-05, - "loss": 0.2876, + "epoch": 1.5715212455400582, + "grad_norm": 0.1895548552274704, + "learning_rate": 4.0137028515297456e-05, + "loss": 0.4287, "step": 43605 }, { - "epoch": 1.53, - "learning_rate": 4.0610842202529144e-05, - "loss": 0.2818, + "epoch": 1.571701445201283, + "grad_norm": 0.20505109429359436, + "learning_rate": 4.0134705987346206e-05, + "loss": 0.4474, "step": 43610 }, { - "epoch": 1.53, - "learning_rate": 4.0608617028137793e-05, - "loss": 0.2997, + "epoch": 1.5718816448625077, + "grad_norm": 0.18428920209407806, + "learning_rate": 4.013238325318751e-05, + "loss": 0.4338, "step": 43615 }, { - "epoch": 1.53, - "learning_rate": 4.060639165107885e-05, - "loss": 0.2639, + "epoch": 1.5720618445237324, + "grad_norm": 0.21533146500587463, + "learning_rate": 4.013006031285302e-05, + "loss": 0.4064, "step": 43620 }, { - "epoch": 1.53, - "learning_rate": 4.060416607138118e-05, - "loss": 0.289, + "epoch": 1.572242044184957, + "grad_norm": 0.17549021542072296, + "learning_rate": 4.012773716637439e-05, + "loss": 0.4381, "step": 43625 }, { - "epoch": 1.54, - "learning_rate": 4.06019402890737e-05, - "loss": 0.3012, + "epoch": 1.5724222438461815, + "grad_norm": 0.21111688017845154, + 
"learning_rate": 4.0125413813783275e-05, + "loss": 0.4279, "step": 43630 }, { - "epoch": 1.54, - "learning_rate": 4.0599714304185315e-05, - "loss": 0.3031, + "epoch": 1.5726024435074062, + "grad_norm": 0.23240859806537628, + "learning_rate": 4.0123090255111316e-05, + "loss": 0.435, "step": 43635 }, { - "epoch": 1.54, - "learning_rate": 4.059748811674492e-05, - "loss": 0.2745, + "epoch": 1.572782643168631, + "grad_norm": 0.1591193825006485, + "learning_rate": 4.0120766490390197e-05, + "loss": 0.4326, "step": 43640 }, { - "epoch": 1.54, - "learning_rate": 4.059526172678142e-05, - "loss": 0.2801, + "epoch": 1.5729628428298554, + "grad_norm": 0.19279992580413818, + "learning_rate": 4.011844251965154e-05, + "loss": 0.4427, "step": 43645 }, { - "epoch": 1.54, - "learning_rate": 4.059303513432372e-05, - "loss": 0.3029, + "epoch": 1.5731430424910802, + "grad_norm": 0.19141902029514313, + "learning_rate": 4.0116118342927045e-05, + "loss": 0.3833, "step": 43650 }, { - "epoch": 1.54, - "learning_rate": 4.0590808339400735e-05, - "loss": 0.2762, + "epoch": 1.5733232421523047, + "grad_norm": 0.1679369956254959, + "learning_rate": 4.0113793960248356e-05, + "loss": 0.4242, "step": 43655 }, { - "epoch": 1.54, - "learning_rate": 4.0588581342041385e-05, - "loss": 0.2592, + "epoch": 1.5735034418135294, + "grad_norm": 0.21762417256832123, + "learning_rate": 4.0111469371647156e-05, + "loss": 0.4086, "step": 43660 }, { - "epoch": 1.54, - "learning_rate": 4.0586354142274575e-05, - "loss": 0.3027, + "epoch": 1.5736836414747541, + "grad_norm": 0.1647619605064392, + "learning_rate": 4.010914457715511e-05, + "loss": 0.4345, "step": 43665 }, { - "epoch": 1.54, - "learning_rate": 4.0584126740129225e-05, - "loss": 0.2809, + "epoch": 1.5738638411359787, + "grad_norm": 0.27651816606521606, + "learning_rate": 4.01068195768039e-05, + "loss": 0.4154, "step": 43670 }, { - "epoch": 1.54, - "learning_rate": 4.0581899135634263e-05, - "loss": 0.2901, + "epoch": 1.5740440407972032, + "grad_norm": 0.20368202030658722, + "learning_rate": 4.010449437062519e-05, + "loss": 0.4557, "step": 43675 }, { - "epoch": 1.54, - "learning_rate": 4.057967132881861e-05, - "loss": 0.2773, + "epoch": 1.574224240458428, + "grad_norm": 0.17364609241485596, + "learning_rate": 4.0102168958650676e-05, + "loss": 0.3769, "step": 43680 }, { - "epoch": 1.54, - "learning_rate": 4.0577443319711197e-05, - "loss": 0.2837, + "epoch": 1.5744044401196526, + "grad_norm": 0.17485357820987701, + "learning_rate": 4.009984334091203e-05, + "loss": 0.3792, "step": 43685 }, { - "epoch": 1.54, - "learning_rate": 4.057521510834093e-05, - "loss": 0.2884, + "epoch": 1.5745846397808774, + "grad_norm": 0.20240798592567444, + "learning_rate": 4.009751751744094e-05, + "loss": 0.3928, "step": 43690 }, { - "epoch": 1.54, - "learning_rate": 4.0572986694736784e-05, - "loss": 0.2677, + "epoch": 1.5747648394421019, + "grad_norm": 0.21913383901119232, + "learning_rate": 4.009519148826909e-05, + "loss": 0.4397, "step": 43695 }, { - "epoch": 1.54, - "learning_rate": 4.057075807892765e-05, - "loss": 0.2645, + "epoch": 1.5749450391033264, + "grad_norm": 0.1732109934091568, + "learning_rate": 4.009286525342819e-05, + "loss": 0.3939, "step": 43700 }, { - "epoch": 1.54, - "learning_rate": 4.05685292609425e-05, - "loss": 0.2882, + "epoch": 1.5751252387645511, + "grad_norm": 0.20331084728240967, + "learning_rate": 4.0090538812949916e-05, + "loss": 0.4074, "step": 43705 }, { - "epoch": 1.54, - "learning_rate": 4.0566300240810245e-05, - "loss": 0.285, + "epoch": 1.5753054384257759, + "grad_norm": 
0.18988221883773804, + "learning_rate": 4.008821216686598e-05, + "loss": 0.3991, "step": 43710 }, { - "epoch": 1.54, - "learning_rate": 4.056407101855985e-05, - "loss": 0.277, + "epoch": 1.5754856380870004, + "grad_norm": 0.17192235589027405, + "learning_rate": 4.008588531520807e-05, + "loss": 0.4486, "step": 43715 }, { - "epoch": 1.54, - "learning_rate": 4.056184159422024e-05, - "loss": 0.2951, + "epoch": 1.5756658377482249, + "grad_norm": 0.19935615360736847, + "learning_rate": 4.008355825800789e-05, + "loss": 0.4195, "step": 43720 }, { - "epoch": 1.54, - "learning_rate": 4.0559611967820375e-05, - "loss": 0.2924, + "epoch": 1.5758460374094496, + "grad_norm": 0.18798862397670746, + "learning_rate": 4.0081230995297154e-05, + "loss": 0.397, "step": 43725 }, { - "epoch": 1.54, - "learning_rate": 4.05573821393892e-05, - "loss": 0.3046, + "epoch": 1.5760262370706744, + "grad_norm": 0.16383147239685059, + "learning_rate": 4.007890352710757e-05, + "loss": 0.4065, "step": 43730 }, { - "epoch": 1.54, - "learning_rate": 4.055515210895568e-05, - "loss": 0.3116, + "epoch": 1.576206436731899, + "grad_norm": 0.18491680920124054, + "learning_rate": 4.007657585347083e-05, + "loss": 0.397, "step": 43735 }, { - "epoch": 1.54, - "learning_rate": 4.055292187654875e-05, - "loss": 0.2598, + "epoch": 1.5763866363931236, + "grad_norm": 0.1732751429080963, + "learning_rate": 4.007424797441868e-05, + "loss": 0.3996, "step": 43740 }, { - "epoch": 1.54, - "learning_rate": 4.055069144219739e-05, - "loss": 0.3018, + "epoch": 1.576566836054348, + "grad_norm": 0.17933018505573273, + "learning_rate": 4.00719198899828e-05, + "loss": 0.427, "step": 43745 }, { - "epoch": 1.54, - "learning_rate": 4.054846080593054e-05, - "loss": 0.3049, + "epoch": 1.5767470357155728, + "grad_norm": 0.17597095668315887, + "learning_rate": 4.006959160019495e-05, + "loss": 0.4075, "step": 43750 }, { - "epoch": 1.54, - "learning_rate": 4.054622996777718e-05, - "loss": 0.3057, + "epoch": 1.5769272353767976, + "grad_norm": 0.16884708404541016, + "learning_rate": 4.0067263105086825e-05, + "loss": 0.4244, "step": 43755 }, { - "epoch": 1.54, - "learning_rate": 4.054399892776627e-05, - "loss": 0.2754, + "epoch": 1.577107435038022, + "grad_norm": 0.21198032796382904, + "learning_rate": 4.0064934404690146e-05, + "loss": 0.4058, "step": 43760 }, { - "epoch": 1.54, - "learning_rate": 4.054176768592678e-05, - "loss": 0.2652, + "epoch": 1.5772876346992466, + "grad_norm": 0.16649512946605682, + "learning_rate": 4.006260549903666e-05, + "loss": 0.4174, "step": 43765 }, { - "epoch": 1.54, - "learning_rate": 4.053953624228767e-05, - "loss": 0.2693, + "epoch": 1.5774678343604713, + "grad_norm": 0.16471701860427856, + "learning_rate": 4.00602763881581e-05, + "loss": 0.3756, "step": 43770 }, { - "epoch": 1.54, - "learning_rate": 4.053730459687792e-05, - "loss": 0.297, + "epoch": 1.577648034021696, + "grad_norm": 0.1989121437072754, + "learning_rate": 4.005794707208618e-05, + "loss": 0.4276, "step": 43775 }, { - "epoch": 1.54, - "learning_rate": 4.053507274972652e-05, - "loss": 0.264, + "epoch": 1.5778282336829208, + "grad_norm": 0.14594782888889313, + "learning_rate": 4.005561755085265e-05, + "loss": 0.4355, "step": 43780 }, { - "epoch": 1.54, - "learning_rate": 4.053284070086244e-05, - "loss": 0.314, + "epoch": 1.5780084333441453, + "grad_norm": 0.19597959518432617, + "learning_rate": 4.0053287824489236e-05, + "loss": 0.4134, "step": 43785 }, { - "epoch": 1.54, - "learning_rate": 4.0530608450314654e-05, - "loss": 0.2932, + "epoch": 1.5781886330053698, + 
"grad_norm": 0.17923668026924133, + "learning_rate": 4.0050957893027706e-05, + "loss": 0.4061, "step": 43790 }, { - "epoch": 1.54, - "learning_rate": 4.052837599811216e-05, - "loss": 0.3142, + "epoch": 1.5783688326665946, + "grad_norm": 0.1501871943473816, + "learning_rate": 4.004862775649978e-05, + "loss": 0.3867, "step": 43795 }, { - "epoch": 1.54, - "learning_rate": 4.0526143344283916e-05, - "loss": 0.2902, + "epoch": 1.5785490323278193, + "grad_norm": 0.18202674388885498, + "learning_rate": 4.004629741493721e-05, + "loss": 0.4072, "step": 43800 }, { - "epoch": 1.54, - "learning_rate": 4.0523910488858954e-05, - "loss": 0.294, + "epoch": 1.578729231989044, + "grad_norm": 0.13299371302127838, + "learning_rate": 4.004396686837176e-05, + "loss": 0.4034, "step": 43805 }, { - "epoch": 1.54, - "learning_rate": 4.052167743186623e-05, - "loss": 0.3107, + "epoch": 1.5789094316502685, + "grad_norm": 0.17386196553707123, + "learning_rate": 4.004163611683517e-05, + "loss": 0.4056, "step": 43810 }, { - "epoch": 1.54, - "learning_rate": 4.0519444173334766e-05, - "loss": 0.2692, + "epoch": 1.579089631311493, + "grad_norm": 0.18044842779636383, + "learning_rate": 4.0039305160359195e-05, + "loss": 0.4121, "step": 43815 }, { - "epoch": 1.54, - "learning_rate": 4.051721071329354e-05, - "loss": 0.2847, + "epoch": 1.5792698309727178, + "grad_norm": 0.15430901944637299, + "learning_rate": 4.0036973998975604e-05, + "loss": 0.3994, "step": 43820 }, { - "epoch": 1.54, - "learning_rate": 4.0514977051771566e-05, - "loss": 0.2965, + "epoch": 1.5794500306339425, + "grad_norm": 0.17539086937904358, + "learning_rate": 4.0034642632716155e-05, + "loss": 0.4177, "step": 43825 }, { - "epoch": 1.54, - "learning_rate": 4.051274318879784e-05, - "loss": 0.2907, + "epoch": 1.579630230295167, + "grad_norm": 0.19086940586566925, + "learning_rate": 4.0032311061612604e-05, + "loss": 0.4033, "step": 43830 }, { - "epoch": 1.54, - "learning_rate": 4.051050912440135e-05, - "loss": 0.2911, + "epoch": 1.5798104299563915, + "grad_norm": 0.19829294085502625, + "learning_rate": 4.0029979285696736e-05, + "loss": 0.431, "step": 43835 }, { - "epoch": 1.54, - "learning_rate": 4.050827485861114e-05, - "loss": 0.2847, + "epoch": 1.5799906296176163, + "grad_norm": 0.21436335146427155, + "learning_rate": 4.0027647305000306e-05, + "loss": 0.4564, "step": 43840 }, { - "epoch": 1.54, - "learning_rate": 4.050604039145619e-05, - "loss": 0.2794, + "epoch": 1.580170829278841, + "grad_norm": 0.21377906203269958, + "learning_rate": 4.002531511955509e-05, + "loss": 0.441, "step": 43845 }, { - "epoch": 1.54, - "learning_rate": 4.0503805722965535e-05, - "loss": 0.2913, + "epoch": 1.5803510289400657, + "grad_norm": 0.1993681788444519, + "learning_rate": 4.0022982729392855e-05, + "loss": 0.4371, "step": 43850 }, { - "epoch": 1.54, - "learning_rate": 4.0501570853168166e-05, - "loss": 0.2668, + "epoch": 1.5805312286012903, + "grad_norm": 0.16495195031166077, + "learning_rate": 4.00206501345454e-05, + "loss": 0.436, "step": 43855 }, { - "epoch": 1.54, - "learning_rate": 4.049933578209313e-05, - "loss": 0.3133, + "epoch": 1.5807114282625148, + "grad_norm": 0.16763421893119812, + "learning_rate": 4.0018317335044495e-05, + "loss": 0.4238, "step": 43860 }, { - "epoch": 1.54, - "learning_rate": 4.049710050976943e-05, - "loss": 0.2685, + "epoch": 1.5808916279237395, + "grad_norm": 0.16144713759422302, + "learning_rate": 4.0015984330921916e-05, + "loss": 0.4115, "step": 43865 }, { - "epoch": 1.54, - "learning_rate": 4.049486503622609e-05, - "loss": 0.289, + "epoch": 
1.5810718275849642, + "grad_norm": 0.20617428421974182, + "learning_rate": 4.0013651122209465e-05, + "loss": 0.4347, "step": 43870 }, { - "epoch": 1.54, - "learning_rate": 4.049262936149213e-05, - "loss": 0.2724, + "epoch": 1.5812520272461887, + "grad_norm": 0.1693793535232544, + "learning_rate": 4.0011317708938924e-05, + "loss": 0.3648, "step": 43875 }, { - "epoch": 1.54, - "learning_rate": 4.0490393485596605e-05, - "loss": 0.2723, + "epoch": 1.5814322269074133, + "grad_norm": 0.15378816425800323, + "learning_rate": 4.0008984091142086e-05, + "loss": 0.4536, "step": 43880 }, { - "epoch": 1.54, - "learning_rate": 4.0488157408568516e-05, - "loss": 0.2928, + "epoch": 1.581612426568638, + "grad_norm": 0.20745548605918884, + "learning_rate": 4.0006650268850745e-05, + "loss": 0.4071, "step": 43885 }, { - "epoch": 1.54, - "learning_rate": 4.048592113043692e-05, - "loss": 0.2693, + "epoch": 1.5817926262298627, + "grad_norm": 0.2162124514579773, + "learning_rate": 4.00043162420967e-05, + "loss": 0.4081, "step": 43890 }, { - "epoch": 1.54, - "learning_rate": 4.0483684651230835e-05, - "loss": 0.3124, + "epoch": 1.5819728258910875, + "grad_norm": 0.16745862364768982, + "learning_rate": 4.000198201091175e-05, + "loss": 0.4493, "step": 43895 }, { - "epoch": 1.54, - "learning_rate": 4.0481447970979316e-05, - "loss": 0.2454, + "epoch": 1.582153025552312, + "grad_norm": 0.16501225531101227, + "learning_rate": 3.999964757532769e-05, + "loss": 0.3873, "step": 43900 }, { - "epoch": 1.54, - "learning_rate": 4.047921108971139e-05, - "loss": 0.2742, + "epoch": 1.5823332252135365, + "grad_norm": 0.22815780341625214, + "learning_rate": 3.9997312935376346e-05, + "loss": 0.3807, "step": 43905 }, { - "epoch": 1.54, - "learning_rate": 4.0476974007456114e-05, - "loss": 0.2796, + "epoch": 1.5825134248747612, + "grad_norm": 0.17533724009990692, + "learning_rate": 3.9994978091089515e-05, + "loss": 0.4033, "step": 43910 }, { - "epoch": 1.55, - "learning_rate": 4.047473672424253e-05, - "loss": 0.2946, + "epoch": 1.582693624535986, + "grad_norm": 0.1684589833021164, + "learning_rate": 3.999264304249901e-05, + "loss": 0.4166, "step": 43915 }, { - "epoch": 1.55, - "learning_rate": 4.0472499240099695e-05, - "loss": 0.309, + "epoch": 1.5828738241972105, + "grad_norm": 0.16925841569900513, + "learning_rate": 3.999030778963665e-05, + "loss": 0.394, "step": 43920 }, { - "epoch": 1.55, - "learning_rate": 4.047026155505664e-05, - "loss": 0.288, + "epoch": 1.5830540238584352, + "grad_norm": 0.1653861552476883, + "learning_rate": 3.9987972332534246e-05, + "loss": 0.4332, "step": 43925 }, { - "epoch": 1.55, - "learning_rate": 4.046802366914244e-05, - "loss": 0.286, + "epoch": 1.5832342235196597, + "grad_norm": 0.1570800542831421, + "learning_rate": 3.998563667122362e-05, + "loss": 0.3974, "step": 43930 }, { - "epoch": 1.55, - "learning_rate": 4.046578558238615e-05, - "loss": 0.2997, + "epoch": 1.5834144231808844, + "grad_norm": 0.18440444767475128, + "learning_rate": 3.9983300805736595e-05, + "loss": 0.4235, "step": 43935 }, { - "epoch": 1.55, - "learning_rate": 4.0463547294816815e-05, - "loss": 0.2772, + "epoch": 1.5835946228421092, + "grad_norm": 0.15225686132907867, + "learning_rate": 3.9980964736104995e-05, + "loss": 0.4064, "step": 43940 }, { - "epoch": 1.55, - "learning_rate": 4.0461308806463525e-05, - "loss": 0.2691, + "epoch": 1.5837748225033337, + "grad_norm": 0.17495448887348175, + "learning_rate": 3.997862846236066e-05, + "loss": 0.4167, "step": 43945 }, { - "epoch": 1.55, - "learning_rate": 4.045907011735532e-05, - "loss": 
0.2804, + "epoch": 1.5839550221645582, + "grad_norm": 0.1724550575017929, + "learning_rate": 3.9976291984535405e-05, + "loss": 0.4093, "step": 43950 }, { - "epoch": 1.55, - "learning_rate": 4.045683122752128e-05, - "loss": 0.2867, + "epoch": 1.584135221825783, + "grad_norm": 0.19340504705905914, + "learning_rate": 3.997395530266108e-05, + "loss": 0.4351, "step": 43955 }, { - "epoch": 1.55, - "learning_rate": 4.045459213699047e-05, - "loss": 0.2909, + "epoch": 1.5843154214870077, + "grad_norm": 0.18444287776947021, + "learning_rate": 3.9971618416769495e-05, + "loss": 0.3998, "step": 43960 }, { - "epoch": 1.55, - "learning_rate": 4.045235284579198e-05, - "loss": 0.3073, + "epoch": 1.5844956211482324, + "grad_norm": 0.13938504457473755, + "learning_rate": 3.9969281326892523e-05, + "loss": 0.3954, "step": 43965 }, { - "epoch": 1.55, - "learning_rate": 4.045011335395486e-05, - "loss": 0.3136, + "epoch": 1.584675820809457, + "grad_norm": 0.15549679100513458, + "learning_rate": 3.996694403306198e-05, + "loss": 0.4072, "step": 43970 }, { - "epoch": 1.55, - "learning_rate": 4.04478736615082e-05, - "loss": 0.288, + "epoch": 1.5848560204706814, + "grad_norm": 0.16593880951404572, + "learning_rate": 3.9964606535309735e-05, + "loss": 0.4273, "step": 43975 }, { - "epoch": 1.55, - "learning_rate": 4.044563376848107e-05, - "loss": 0.2732, + "epoch": 1.5850362201319061, + "grad_norm": 0.23627829551696777, + "learning_rate": 3.9962268833667615e-05, + "loss": 0.4176, "step": 43980 }, { - "epoch": 1.55, - "learning_rate": 4.0443393674902585e-05, - "loss": 0.3115, + "epoch": 1.5852164197931309, + "grad_norm": 0.1789269745349884, + "learning_rate": 3.9959930928167474e-05, + "loss": 0.4231, "step": 43985 }, { - "epoch": 1.55, - "learning_rate": 4.04411533808018e-05, - "loss": 0.2983, + "epoch": 1.5853966194543554, + "grad_norm": 0.20784516632556915, + "learning_rate": 3.995759281884118e-05, + "loss": 0.4042, "step": 43990 }, { - "epoch": 1.55, - "learning_rate": 4.043891288620781e-05, - "loss": 0.2848, + "epoch": 1.58557681911558, + "grad_norm": 0.18998631834983826, + "learning_rate": 3.995525450572059e-05, + "loss": 0.4483, "step": 43995 }, { - "epoch": 1.55, - "learning_rate": 4.0436672191149724e-05, - "loss": 0.2671, + "epoch": 1.5857570187768046, + "grad_norm": 0.2027607262134552, + "learning_rate": 3.9952915988837534e-05, + "loss": 0.3861, "step": 44000 }, { - "epoch": 1.55, - "eval_loss": 0.279005765914917, - "eval_runtime": 10.5327, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 1.5857570187768046, + "eval_loss": 0.44330334663391113, + "eval_runtime": 3.5296, + "eval_samples_per_second": 28.332, + "eval_steps_per_second": 7.083, "step": 44000 }, { - "epoch": 1.55, - "learning_rate": 4.043443129565662e-05, - "loss": 0.2622, + "epoch": 1.5859372184380294, + "grad_norm": 0.18675218522548676, + "learning_rate": 3.99505772682239e-05, + "loss": 0.399, "step": 44005 }, { - "epoch": 1.55, - "learning_rate": 4.043219019975759e-05, - "loss": 0.2953, + "epoch": 1.586117418099254, + "grad_norm": 0.18582110106945038, + "learning_rate": 3.994823834391154e-05, + "loss": 0.3755, "step": 44010 }, { - "epoch": 1.55, - "learning_rate": 4.042994890348174e-05, - "loss": 0.2774, + "epoch": 1.5862976177604786, + "grad_norm": 0.18552209436893463, + "learning_rate": 3.994589921593233e-05, + "loss": 0.4288, "step": 44015 }, { - "epoch": 1.55, - "learning_rate": 4.042770740685818e-05, - "loss": 0.3034, + "epoch": 1.5864778174217031, + "grad_norm": 0.17207638919353485, + "learning_rate": 
3.994355988431814e-05, + "loss": 0.3947, "step": 44020 }, { - "epoch": 1.55, - "learning_rate": 4.0425465709916e-05, - "loss": 0.2754, + "epoch": 1.5866580170829279, + "grad_norm": 0.22943609952926636, + "learning_rate": 3.994122034910083e-05, + "loss": 0.4271, "step": 44025 }, { - "epoch": 1.55, - "learning_rate": 4.042322381268432e-05, - "loss": 0.2879, + "epoch": 1.5868382167441526, + "grad_norm": 0.21029749512672424, + "learning_rate": 3.9938880610312294e-05, + "loss": 0.4071, "step": 44030 }, { - "epoch": 1.55, - "learning_rate": 4.0420981715192235e-05, - "loss": 0.297, + "epoch": 1.587018416405377, + "grad_norm": 0.1644810289144516, + "learning_rate": 3.99365406679844e-05, + "loss": 0.3875, "step": 44035 }, { - "epoch": 1.55, - "learning_rate": 4.041873941746887e-05, - "loss": 0.282, + "epoch": 1.5871986160666018, + "grad_norm": 0.1785242259502411, + "learning_rate": 3.993420052214904e-05, + "loss": 0.3676, "step": 44040 }, { - "epoch": 1.55, - "learning_rate": 4.0416496919543336e-05, - "loss": 0.3116, + "epoch": 1.5873788157278264, + "grad_norm": 0.17535260319709778, + "learning_rate": 3.9931860172838076e-05, + "loss": 0.3939, "step": 44045 }, { - "epoch": 1.55, - "learning_rate": 4.041425422144475e-05, - "loss": 0.2781, + "epoch": 1.587559015389051, + "grad_norm": 0.17083501815795898, + "learning_rate": 3.992951962008341e-05, + "loss": 0.4247, "step": 44050 }, { - "epoch": 1.55, - "learning_rate": 4.041201132320223e-05, - "loss": 0.2828, + "epoch": 1.5877392150502758, + "grad_norm": 0.230384960770607, + "learning_rate": 3.992717886391693e-05, + "loss": 0.4225, "step": 44055 }, { - "epoch": 1.55, - "learning_rate": 4.040976822484489e-05, - "loss": 0.3013, + "epoch": 1.5879194147115003, + "grad_norm": 0.1916215866804123, + "learning_rate": 3.992483790437054e-05, + "loss": 0.4315, "step": 44060 }, { - "epoch": 1.55, - "learning_rate": 4.0407524926401874e-05, - "loss": 0.3005, + "epoch": 1.5880996143727248, + "grad_norm": 0.17122019827365875, + "learning_rate": 3.992249674147611e-05, + "loss": 0.409, "step": 44065 }, { - "epoch": 1.55, - "learning_rate": 4.04052814279023e-05, - "loss": 0.3076, + "epoch": 1.5882798140339496, + "grad_norm": 0.17510074377059937, + "learning_rate": 3.9920155375265555e-05, + "loss": 0.4144, "step": 44070 }, { - "epoch": 1.55, - "learning_rate": 4.04030377293753e-05, - "loss": 0.3003, + "epoch": 1.5884600136951743, + "grad_norm": 0.1663195937871933, + "learning_rate": 3.991781380577076e-05, + "loss": 0.4265, "step": 44075 }, { - "epoch": 1.55, - "learning_rate": 4.040079383085001e-05, - "loss": 0.2927, + "epoch": 1.588640213356399, + "grad_norm": 0.18738073110580444, + "learning_rate": 3.991547203302366e-05, + "loss": 0.4012, "step": 44080 }, { - "epoch": 1.55, - "learning_rate": 4.039854973235556e-05, - "loss": 0.2855, + "epoch": 1.5888204130176236, + "grad_norm": 0.1917044073343277, + "learning_rate": 3.991313005705613e-05, + "loss": 0.4378, "step": 44085 }, { - "epoch": 1.55, - "learning_rate": 4.0396305433921086e-05, - "loss": 0.2697, + "epoch": 1.589000612678848, + "grad_norm": 0.2828647792339325, + "learning_rate": 3.991078787790009e-05, + "loss": 0.3974, "step": 44090 }, { - "epoch": 1.55, - "learning_rate": 4.039406093557573e-05, - "loss": 0.2696, + "epoch": 1.5891808123400728, + "grad_norm": 0.17092807590961456, + "learning_rate": 3.990844549558745e-05, + "loss": 0.4047, "step": 44095 }, { - "epoch": 1.55, - "learning_rate": 4.0391816237348646e-05, - "loss": 0.3029, + "epoch": 1.5893610120012975, + "grad_norm": 0.1954643726348877, + "learning_rate": 
3.990610291015014e-05, + "loss": 0.4167, "step": 44100 }, { - "epoch": 1.55, - "learning_rate": 4.038957133926896e-05, - "loss": 0.2862, + "epoch": 1.589541211662522, + "grad_norm": 0.2312636822462082, + "learning_rate": 3.990376012162006e-05, + "loss": 0.4022, "step": 44105 }, { - "epoch": 1.55, - "learning_rate": 4.038732624136584e-05, - "loss": 0.2742, + "epoch": 1.5897214113237466, + "grad_norm": 0.1981632262468338, + "learning_rate": 3.990141713002912e-05, + "loss": 0.4631, "step": 44110 }, { - "epoch": 1.55, - "learning_rate": 4.038508094366843e-05, - "loss": 0.2863, + "epoch": 1.5899016109849713, + "grad_norm": 0.16944488883018494, + "learning_rate": 3.989907393540927e-05, + "loss": 0.3809, "step": 44115 }, { - "epoch": 1.55, - "learning_rate": 4.038283544620588e-05, - "loss": 0.2809, + "epoch": 1.590081810646196, + "grad_norm": 0.1728655993938446, + "learning_rate": 3.9896730537792415e-05, + "loss": 0.3891, "step": 44120 }, { - "epoch": 1.55, - "learning_rate": 4.0380589749007357e-05, - "loss": 0.2874, + "epoch": 1.5902620103074208, + "grad_norm": 0.19275811314582825, + "learning_rate": 3.989438693721049e-05, + "loss": 0.4316, "step": 44125 }, { - "epoch": 1.55, - "learning_rate": 4.037834385210201e-05, - "loss": 0.2926, + "epoch": 1.5904422099686453, + "grad_norm": 0.16822832822799683, + "learning_rate": 3.989204313369543e-05, + "loss": 0.4205, "step": 44130 }, { - "epoch": 1.55, - "learning_rate": 4.037609775551899e-05, - "loss": 0.2669, + "epoch": 1.5906224096298698, + "grad_norm": 0.16480205953121185, + "learning_rate": 3.9889699127279164e-05, + "loss": 0.435, "step": 44135 }, { - "epoch": 1.55, - "learning_rate": 4.037385145928749e-05, - "loss": 0.2645, + "epoch": 1.5908026092910945, + "grad_norm": 0.1605161726474762, + "learning_rate": 3.9887354917993635e-05, + "loss": 0.4301, "step": 44140 }, { - "epoch": 1.55, - "learning_rate": 4.0371604963436656e-05, - "loss": 0.2976, + "epoch": 1.5909828089523192, + "grad_norm": 0.16853894293308258, + "learning_rate": 3.988501050587078e-05, + "loss": 0.4255, "step": 44145 }, { - "epoch": 1.55, - "learning_rate": 4.036935826799566e-05, - "loss": 0.2779, + "epoch": 1.5911630086135438, + "grad_norm": 0.20577529072761536, + "learning_rate": 3.9882665890942526e-05, + "loss": 0.3954, "step": 44150 }, { - "epoch": 1.55, - "learning_rate": 4.0367111372993674e-05, - "loss": 0.2729, + "epoch": 1.5913432082747685, + "grad_norm": 0.15664778649806976, + "learning_rate": 3.988032107324084e-05, + "loss": 0.4088, "step": 44155 }, { - "epoch": 1.55, - "learning_rate": 4.036486427845987e-05, - "loss": 0.2845, + "epoch": 1.591523407935993, + "grad_norm": 0.22527268528938293, + "learning_rate": 3.987797605279766e-05, + "loss": 0.4338, "step": 44160 }, { - "epoch": 1.55, - "learning_rate": 4.036261698442344e-05, - "loss": 0.276, + "epoch": 1.5917036075972177, + "grad_norm": 0.15542291104793549, + "learning_rate": 3.987563082964493e-05, + "loss": 0.4333, "step": 44165 }, { - "epoch": 1.55, - "learning_rate": 4.036036949091354e-05, - "loss": 0.2767, + "epoch": 1.5918838072584425, + "grad_norm": 0.18337464332580566, + "learning_rate": 3.987328540381461e-05, + "loss": 0.4064, "step": 44170 }, { - "epoch": 1.55, - "learning_rate": 4.035812179795938e-05, - "loss": 0.2749, + "epoch": 1.592064006919667, + "grad_norm": 0.21346335113048553, + "learning_rate": 3.987093977533867e-05, + "loss": 0.4588, "step": 44175 }, { - "epoch": 1.55, - "learning_rate": 4.035587390559012e-05, - "loss": 0.2965, + "epoch": 1.5922442065808915, + "grad_norm": 0.1897256225347519, + 
"learning_rate": 3.986859394424905e-05, + "loss": 0.44, "step": 44180 }, { - "epoch": 1.55, - "learning_rate": 4.035362581383496e-05, - "loss": 0.3134, + "epoch": 1.5924244062421162, + "grad_norm": 0.1959022730588913, + "learning_rate": 3.98662479105777e-05, + "loss": 0.4221, "step": 44185 }, { - "epoch": 1.55, - "learning_rate": 4.035137752272309e-05, - "loss": 0.2744, + "epoch": 1.592604605903341, + "grad_norm": 0.20452988147735596, + "learning_rate": 3.986390167435661e-05, + "loss": 0.4131, "step": 44190 }, { - "epoch": 1.55, - "learning_rate": 4.0349129032283694e-05, - "loss": 0.2862, + "epoch": 1.5927848055645657, + "grad_norm": 0.18788976967334747, + "learning_rate": 3.9861555235617734e-05, + "loss": 0.4157, "step": 44195 }, { - "epoch": 1.56, - "learning_rate": 4.0346880342545975e-05, - "loss": 0.2865, + "epoch": 1.5929650052257902, + "grad_norm": 0.14749525487422943, + "learning_rate": 3.985920859439306e-05, + "loss": 0.371, "step": 44200 }, { - "epoch": 1.56, - "learning_rate": 4.034463145353913e-05, - "loss": 0.2837, + "epoch": 1.5931452048870147, + "grad_norm": 0.19985322654247284, + "learning_rate": 3.9856861750714535e-05, + "loss": 0.4088, "step": 44205 }, { - "epoch": 1.56, - "learning_rate": 4.034238236529235e-05, - "loss": 0.2748, + "epoch": 1.5933254045482395, + "grad_norm": 0.18450570106506348, + "learning_rate": 3.985451470461414e-05, + "loss": 0.3813, "step": 44210 }, { - "epoch": 1.56, - "learning_rate": 4.034013307783486e-05, - "loss": 0.3044, + "epoch": 1.5935056042094642, + "grad_norm": 0.26536038517951965, + "learning_rate": 3.985216745612387e-05, + "loss": 0.3779, "step": 44215 }, { - "epoch": 1.56, - "learning_rate": 4.033788359119585e-05, - "loss": 0.2856, + "epoch": 1.5936858038706887, + "grad_norm": 0.17288127541542053, + "learning_rate": 3.984982000527568e-05, + "loss": 0.4086, "step": 44220 }, { - "epoch": 1.56, - "learning_rate": 4.0335633905404526e-05, - "loss": 0.307, + "epoch": 1.5938660035319132, + "grad_norm": 0.19189390540122986, + "learning_rate": 3.984747235210158e-05, + "loss": 0.4554, "step": 44225 }, { - "epoch": 1.56, - "learning_rate": 4.033338402049009e-05, - "loss": 0.2752, + "epoch": 1.594046203193138, + "grad_norm": 0.17613013088703156, + "learning_rate": 3.984512449663353e-05, + "loss": 0.3713, "step": 44230 }, { - "epoch": 1.56, - "learning_rate": 4.033113393648178e-05, - "loss": 0.2831, + "epoch": 1.5942264028543627, + "grad_norm": 0.15496566891670227, + "learning_rate": 3.984277643890355e-05, + "loss": 0.3943, "step": 44235 }, { - "epoch": 1.56, - "learning_rate": 4.03288836534088e-05, - "loss": 0.2967, + "epoch": 1.5944066025155874, + "grad_norm": 0.16410352289676666, + "learning_rate": 3.98404281789436e-05, + "loss": 0.426, "step": 44240 }, { - "epoch": 1.56, - "learning_rate": 4.0326633171300363e-05, - "loss": 0.2885, + "epoch": 1.594586802176812, + "grad_norm": 0.19269990921020508, + "learning_rate": 3.9838079716785704e-05, + "loss": 0.3917, "step": 44245 }, { - "epoch": 1.56, - "learning_rate": 4.03243824901857e-05, - "loss": 0.2944, + "epoch": 1.5947670018380364, + "grad_norm": 0.19066692888736725, + "learning_rate": 3.983573105246183e-05, + "loss": 0.4285, "step": 44250 }, { - "epoch": 1.56, - "learning_rate": 4.032213161009403e-05, - "loss": 0.2789, + "epoch": 1.5949472014992612, + "grad_norm": 0.17595848441123962, + "learning_rate": 3.9833382186004005e-05, + "loss": 0.4524, "step": 44255 }, { - "epoch": 1.56, - "learning_rate": 4.031988053105457e-05, - "loss": 0.2865, + "epoch": 1.595127401160486, + "grad_norm": 
0.15249063074588776, + "learning_rate": 3.983103311744421e-05, + "loss": 0.4218, "step": 44260 }, { - "epoch": 1.56, - "learning_rate": 4.031762925309657e-05, - "loss": 0.2864, + "epoch": 1.5953076008217104, + "grad_norm": 0.16133977472782135, + "learning_rate": 3.982868384681446e-05, + "loss": 0.4011, "step": 44265 }, { - "epoch": 1.56, - "learning_rate": 4.0315377776249244e-05, - "loss": 0.2824, + "epoch": 1.595487800482935, + "grad_norm": 0.16677658259868622, + "learning_rate": 3.982633437414677e-05, + "loss": 0.389, "step": 44270 }, { - "epoch": 1.56, - "learning_rate": 4.031312610054182e-05, - "loss": 0.2908, + "epoch": 1.5956680001441597, + "grad_norm": 0.18336910009384155, + "learning_rate": 3.9823984699473147e-05, + "loss": 0.3929, "step": 44275 }, { - "epoch": 1.56, - "learning_rate": 4.031087422600356e-05, - "loss": 0.2705, + "epoch": 1.5958481998053844, + "grad_norm": 0.2092888355255127, + "learning_rate": 3.982163482282559e-05, + "loss": 0.4169, "step": 44280 }, { - "epoch": 1.56, - "learning_rate": 4.030862215266368e-05, - "loss": 0.2942, + "epoch": 1.5960283994666091, + "grad_norm": 0.17717908322811127, + "learning_rate": 3.981928474423614e-05, + "loss": 0.4569, "step": 44285 }, { - "epoch": 1.56, - "learning_rate": 4.030636988055143e-05, - "loss": 0.2905, + "epoch": 1.5962085991278336, + "grad_norm": 0.15611782670021057, + "learning_rate": 3.98169344637368e-05, + "loss": 0.4195, "step": 44290 }, { - "epoch": 1.56, - "learning_rate": 4.030411740969606e-05, - "loss": 0.2792, + "epoch": 1.5963887987890581, + "grad_norm": 0.18297843635082245, + "learning_rate": 3.9814583981359596e-05, + "loss": 0.3639, "step": 44295 }, { - "epoch": 1.56, - "learning_rate": 4.030186474012681e-05, - "loss": 0.2969, + "epoch": 1.5965689984502829, + "grad_norm": 0.15773242712020874, + "learning_rate": 3.981223329713655e-05, + "loss": 0.3866, "step": 44300 }, { - "epoch": 1.56, - "learning_rate": 4.029961187187293e-05, - "loss": 0.2886, + "epoch": 1.5967491981115076, + "grad_norm": 0.20912377536296844, + "learning_rate": 3.98098824110997e-05, + "loss": 0.3917, "step": 44305 }, { - "epoch": 1.56, - "learning_rate": 4.029735880496367e-05, - "loss": 0.2571, + "epoch": 1.5969293977727324, + "grad_norm": 0.18566472828388214, + "learning_rate": 3.980753132328107e-05, + "loss": 0.4331, "step": 44310 }, { - "epoch": 1.56, - "learning_rate": 4.029510553942829e-05, - "loss": 0.3031, + "epoch": 1.5971095974339569, + "grad_norm": 0.17410017549991608, + "learning_rate": 3.9805180033712685e-05, + "loss": 0.4333, "step": 44315 }, { - "epoch": 1.56, - "learning_rate": 4.029285207529604e-05, - "loss": 0.2603, + "epoch": 1.5972897970951814, + "grad_norm": 0.17320142686367035, + "learning_rate": 3.980282854242659e-05, + "loss": 0.4056, "step": 44320 }, { - "epoch": 1.56, - "learning_rate": 4.029059841259619e-05, - "loss": 0.2936, + "epoch": 1.597469996756406, + "grad_norm": 0.19781510531902313, + "learning_rate": 3.9800476849454825e-05, + "loss": 0.4087, "step": 44325 }, { - "epoch": 1.56, - "learning_rate": 4.028834455135799e-05, - "loss": 0.2819, + "epoch": 1.5976501964176308, + "grad_norm": 0.18340526521205902, + "learning_rate": 3.979812495482943e-05, + "loss": 0.4255, "step": 44330 }, { - "epoch": 1.56, - "learning_rate": 4.028609049161072e-05, - "loss": 0.3126, + "epoch": 1.5978303960788554, + "grad_norm": 0.19130225479602814, + "learning_rate": 3.9795772858582444e-05, + "loss": 0.4059, "step": 44335 }, { - "epoch": 1.56, - "learning_rate": 4.028383623338363e-05, - "loss": 0.2795, + "epoch": 1.5980105957400799, + 
"grad_norm": 0.18067646026611328, + "learning_rate": 3.979342056074592e-05, + "loss": 0.4199, "step": 44340 }, { - "epoch": 1.56, - "learning_rate": 4.028158177670601e-05, - "loss": 0.2877, + "epoch": 1.5981907954013046, + "grad_norm": 0.20790977776050568, + "learning_rate": 3.97910680613519e-05, + "loss": 0.3784, "step": 44345 }, { - "epoch": 1.56, - "learning_rate": 4.027932712160711e-05, - "loss": 0.2691, + "epoch": 1.5983709950625293, + "grad_norm": 0.22750818729400635, + "learning_rate": 3.978871536043245e-05, + "loss": 0.4147, "step": 44350 }, { - "epoch": 1.56, - "learning_rate": 4.027707226811622e-05, - "loss": 0.2824, + "epoch": 1.598551194723754, + "grad_norm": 0.19239923357963562, + "learning_rate": 3.978636245801961e-05, + "loss": 0.4602, "step": 44355 }, { - "epoch": 1.56, - "learning_rate": 4.027481721626262e-05, - "loss": 0.2895, + "epoch": 1.5987313943849786, + "grad_norm": 0.13409259915351868, + "learning_rate": 3.9784009354145446e-05, + "loss": 0.4135, "step": 44360 }, { - "epoch": 1.56, - "learning_rate": 4.0272561966075586e-05, - "loss": 0.2697, + "epoch": 1.598911594046203, + "grad_norm": 0.18741188943386078, + "learning_rate": 3.978165604884201e-05, + "loss": 0.4095, "step": 44365 }, { - "epoch": 1.56, - "learning_rate": 4.027030651758439e-05, - "loss": 0.2739, + "epoch": 1.5990917937074278, + "grad_norm": 0.1525307148694992, + "learning_rate": 3.9779302542141384e-05, + "loss": 0.4054, "step": 44370 }, { - "epoch": 1.56, - "learning_rate": 4.026805087081833e-05, - "loss": 0.3059, + "epoch": 1.5992719933686526, + "grad_norm": 0.16644379496574402, + "learning_rate": 3.977694883407561e-05, + "loss": 0.4299, "step": 44375 }, { - "epoch": 1.56, - "learning_rate": 4.02657950258067e-05, - "loss": 0.2956, + "epoch": 1.599452193029877, + "grad_norm": 0.234503835439682, + "learning_rate": 3.977459492467678e-05, + "loss": 0.4382, "step": 44380 }, { - "epoch": 1.56, - "learning_rate": 4.0263538982578774e-05, - "loss": 0.2596, + "epoch": 1.5996323926911016, + "grad_norm": 0.1683819741010666, + "learning_rate": 3.977224081397696e-05, + "loss": 0.4252, "step": 44385 }, { - "epoch": 1.56, - "learning_rate": 4.0261282741163855e-05, - "loss": 0.2915, + "epoch": 1.5998125923523263, + "grad_norm": 0.15491560101509094, + "learning_rate": 3.97698865020082e-05, + "loss": 0.4087, "step": 44390 }, { - "epoch": 1.56, - "learning_rate": 4.025902630159124e-05, - "loss": 0.2936, + "epoch": 1.599992792013551, + "grad_norm": 0.18362046778202057, + "learning_rate": 3.9767531988802606e-05, + "loss": 0.3969, "step": 44395 }, { - "epoch": 1.56, - "learning_rate": 4.025676966389021e-05, - "loss": 0.2569, + "epoch": 1.6001729916747758, + "grad_norm": 0.178349107503891, + "learning_rate": 3.9765177274392244e-05, + "loss": 0.451, "step": 44400 }, { - "epoch": 1.56, - "learning_rate": 4.02545128280901e-05, - "loss": 0.2738, + "epoch": 1.6003531913360003, + "grad_norm": 0.1806674599647522, + "learning_rate": 3.9762822358809206e-05, + "loss": 0.4001, "step": 44405 }, { - "epoch": 1.56, - "learning_rate": 4.025225579422018e-05, - "loss": 0.3099, + "epoch": 1.6005333909972248, + "grad_norm": 0.16210544109344482, + "learning_rate": 3.976046724208557e-05, + "loss": 0.3909, "step": 44410 }, { - "epoch": 1.56, - "learning_rate": 4.025045002453369e-05, - "loss": 0.2883, + "epoch": 1.6007135906584495, + "grad_norm": 0.18187157809734344, + "learning_rate": 3.975811192425342e-05, + "loss": 0.4267, "step": 44415 }, { - "epoch": 1.56, - "learning_rate": 4.0248192634212e-05, - "loss": 0.291, + "epoch": 1.6008937903196743, 
+ "grad_norm": 0.20049238204956055, + "learning_rate": 3.9755756405344855e-05, + "loss": 0.3962, "step": 44420 }, { - "epoch": 1.56, - "learning_rate": 4.0245935045902575e-05, - "loss": 0.2972, + "epoch": 1.6010739899808988, + "grad_norm": 0.1605527549982071, + "learning_rate": 3.9753400685391974e-05, + "loss": 0.4174, "step": 44425 }, { - "epoch": 1.56, - "learning_rate": 4.0243677259634734e-05, - "loss": 0.291, + "epoch": 1.6012541896421235, + "grad_norm": 0.16398051381111145, + "learning_rate": 3.975104476442686e-05, + "loss": 0.4539, "step": 44430 }, { - "epoch": 1.56, - "learning_rate": 4.0241419275437783e-05, - "loss": 0.2713, + "epoch": 1.601434389303348, + "grad_norm": 0.18974505364894867, + "learning_rate": 3.9748688642481614e-05, + "loss": 0.3968, "step": 44435 }, { - "epoch": 1.56, - "learning_rate": 4.023916109334105e-05, - "loss": 0.2927, + "epoch": 1.6016145889645728, + "grad_norm": 0.22180098295211792, + "learning_rate": 3.974633231958834e-05, + "loss": 0.4166, "step": 44440 }, { - "epoch": 1.56, - "learning_rate": 4.023690271337385e-05, - "loss": 0.3123, + "epoch": 1.6017947886257975, + "grad_norm": 0.15120428800582886, + "learning_rate": 3.974397579577914e-05, + "loss": 0.4171, "step": 44445 }, { - "epoch": 1.56, - "learning_rate": 4.0234644135565526e-05, - "loss": 0.2886, + "epoch": 1.601974988287022, + "grad_norm": 0.19408200681209564, + "learning_rate": 3.974161907108613e-05, + "loss": 0.4058, "step": 44450 }, { - "epoch": 1.56, - "learning_rate": 4.023238535994537e-05, - "loss": 0.2892, + "epoch": 1.6021551879482465, + "grad_norm": 0.1284191906452179, + "learning_rate": 3.973926214554142e-05, + "loss": 0.3842, "step": 44455 }, { - "epoch": 1.56, - "learning_rate": 4.023012638654273e-05, - "loss": 0.2858, + "epoch": 1.6023353876094713, + "grad_norm": 0.12077841907739639, + "learning_rate": 3.9736905019177106e-05, + "loss": 0.363, "step": 44460 }, { - "epoch": 1.56, - "learning_rate": 4.022786721538694e-05, - "loss": 0.2648, + "epoch": 1.602515587270696, + "grad_norm": 0.19127848744392395, + "learning_rate": 3.973454769202532e-05, + "loss": 0.3965, "step": 44465 }, { - "epoch": 1.56, - "learning_rate": 4.022560784650733e-05, - "loss": 0.2844, + "epoch": 1.6026957869319207, + "grad_norm": 0.16007506847381592, + "learning_rate": 3.9732190164118175e-05, + "loss": 0.3915, "step": 44470 }, { - "epoch": 1.56, - "learning_rate": 4.022334827993324e-05, - "loss": 0.2985, + "epoch": 1.6028759865931452, + "grad_norm": 0.16943296790122986, + "learning_rate": 3.972983243548779e-05, + "loss": 0.4289, "step": 44475 }, { - "epoch": 1.56, - "learning_rate": 4.0221088515693995e-05, - "loss": 0.2637, + "epoch": 1.6030561862543697, + "grad_norm": 0.15759488940238953, + "learning_rate": 3.972747450616629e-05, + "loss": 0.405, "step": 44480 }, { - "epoch": 1.57, - "learning_rate": 4.021882855381895e-05, - "loss": 0.2832, + "epoch": 1.6032363859155945, + "grad_norm": 0.178999662399292, + "learning_rate": 3.97251163761858e-05, + "loss": 0.3919, "step": 44485 }, { - "epoch": 1.57, - "learning_rate": 4.021656839433745e-05, - "loss": 0.3021, + "epoch": 1.6034165855768192, + "grad_norm": 0.26717543601989746, + "learning_rate": 3.9722758045578454e-05, + "loss": 0.4161, "step": 44490 }, { - "epoch": 1.57, - "learning_rate": 4.0214308037278836e-05, - "loss": 0.2737, + "epoch": 1.6035967852380437, + "grad_norm": 0.24966903030872345, + "learning_rate": 3.9720399514376374e-05, + "loss": 0.4346, "step": 44495 }, { - "epoch": 1.57, - "learning_rate": 4.021204748267246e-05, - "loss": 0.2878, + "epoch": 
1.6037769848992682, + "grad_norm": 0.17381806671619415, + "learning_rate": 3.97180407826117e-05, + "loss": 0.4227, "step": 44500 }, { - "epoch": 1.57, - "eval_loss": 0.2789715826511383, - "eval_runtime": 10.538, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 1.6037769848992682, + "eval_loss": 0.44284766912460327, + "eval_runtime": 3.5457, + "eval_samples_per_second": 28.203, + "eval_steps_per_second": 7.051, "step": 44500 }, { - "epoch": 1.57, - "learning_rate": 4.020978673054767e-05, - "loss": 0.2947, + "epoch": 1.603957184560493, + "grad_norm": 0.15787242352962494, + "learning_rate": 3.971568185031658e-05, + "loss": 0.3882, "step": 44505 }, { - "epoch": 1.57, - "learning_rate": 4.020752578093382e-05, - "loss": 0.2945, + "epoch": 1.6041373842217177, + "grad_norm": 0.168611541390419, + "learning_rate": 3.971332271752313e-05, + "loss": 0.3956, "step": 44510 }, { - "epoch": 1.57, - "learning_rate": 4.020526463386028e-05, - "loss": 0.2918, + "epoch": 1.6043175838829424, + "grad_norm": 0.17954233288764954, + "learning_rate": 3.9710963384263515e-05, + "loss": 0.4084, "step": 44515 }, { - "epoch": 1.57, - "learning_rate": 4.020300328935639e-05, - "loss": 0.3068, + "epoch": 1.604497783544167, + "grad_norm": 0.16351082921028137, + "learning_rate": 3.970860385056987e-05, + "loss": 0.4356, "step": 44520 }, { - "epoch": 1.57, - "learning_rate": 4.020074174745153e-05, - "loss": 0.2898, + "epoch": 1.6046779832053915, + "grad_norm": 0.19120116531848907, + "learning_rate": 3.9706244116474345e-05, + "loss": 0.418, "step": 44525 }, { - "epoch": 1.57, - "learning_rate": 4.0198480008175044e-05, - "loss": 0.2909, + "epoch": 1.6048581828666162, + "grad_norm": 0.15512365102767944, + "learning_rate": 3.97038841820091e-05, + "loss": 0.4185, "step": 44530 }, { - "epoch": 1.57, - "learning_rate": 4.019621807155632e-05, - "loss": 0.2707, + "epoch": 1.605038382527841, + "grad_norm": 0.1973215490579605, + "learning_rate": 3.970152404720627e-05, + "loss": 0.4046, "step": 44535 }, { - "epoch": 1.57, - "learning_rate": 4.019395593762472e-05, - "loss": 0.2816, + "epoch": 1.6052185821890654, + "grad_norm": 0.18126599490642548, + "learning_rate": 3.969916371209802e-05, + "loss": 0.4358, "step": 44540 }, { - "epoch": 1.57, - "learning_rate": 4.019169360640961e-05, - "loss": 0.2972, + "epoch": 1.6053987818502902, + "grad_norm": 0.21188373863697052, + "learning_rate": 3.9696803176716515e-05, + "loss": 0.472, "step": 44545 }, { - "epoch": 1.57, - "learning_rate": 4.018943107794038e-05, - "loss": 0.2918, + "epoch": 1.6055789815115147, + "grad_norm": 0.16535495221614838, + "learning_rate": 3.969444244109391e-05, + "loss": 0.4183, "step": 44550 }, { - "epoch": 1.57, - "learning_rate": 4.018716835224638e-05, - "loss": 0.2743, + "epoch": 1.6057591811727394, + "grad_norm": 0.21638596057891846, + "learning_rate": 3.969208150526237e-05, + "loss": 0.4231, "step": 44555 }, { - "epoch": 1.57, - "learning_rate": 4.0184905429357026e-05, - "loss": 0.2786, + "epoch": 1.6059393808339641, + "grad_norm": 0.21015167236328125, + "learning_rate": 3.968972036925407e-05, + "loss": 0.4411, "step": 44560 }, { - "epoch": 1.57, - "learning_rate": 4.018264230930167e-05, - "loss": 0.2878, + "epoch": 1.6061195804951887, + "grad_norm": 0.16333642601966858, + "learning_rate": 3.968735903310117e-05, + "loss": 0.4082, "step": 44565 }, { - "epoch": 1.57, - "learning_rate": 4.0180378992109714e-05, - "loss": 0.2979, + "epoch": 1.6062997801564132, + "grad_norm": 0.1700402796268463, + "learning_rate": 3.968499749683584e-05, + 
"loss": 0.406, "step": 44570 }, { - "epoch": 1.57, - "learning_rate": 4.017811547781054e-05, - "loss": 0.2852, + "epoch": 1.606479979817638, + "grad_norm": 0.16421562433242798, + "learning_rate": 3.968263576049027e-05, + "loss": 0.3949, "step": 44575 }, { - "epoch": 1.57, - "learning_rate": 4.0175851766433536e-05, - "loss": 0.2772, + "epoch": 1.6066601794788626, + "grad_norm": 0.18624955415725708, + "learning_rate": 3.968027382409663e-05, + "loss": 0.3847, "step": 44580 }, { - "epoch": 1.57, - "learning_rate": 4.01735878580081e-05, - "loss": 0.287, + "epoch": 1.6068403791400874, + "grad_norm": 0.23237396776676178, + "learning_rate": 3.9677911687687095e-05, + "loss": 0.4233, "step": 44585 }, { - "epoch": 1.57, - "learning_rate": 4.0171323752563636e-05, - "loss": 0.2819, + "epoch": 1.6070205788013119, + "grad_norm": 0.17745272815227509, + "learning_rate": 3.967554935129387e-05, + "loss": 0.405, "step": 44590 }, { - "epoch": 1.57, - "learning_rate": 4.016905945012952e-05, - "loss": 0.2993, + "epoch": 1.6072007784625364, + "grad_norm": 0.1729012280702591, + "learning_rate": 3.9673186814949115e-05, + "loss": 0.3969, "step": 44595 }, { - "epoch": 1.57, - "learning_rate": 4.0166794950735164e-05, - "loss": 0.2784, + "epoch": 1.6073809781237611, + "grad_norm": 0.2897031605243683, + "learning_rate": 3.967082407868503e-05, + "loss": 0.3892, "step": 44600 }, { - "epoch": 1.57, - "learning_rate": 4.016453025440997e-05, - "loss": 0.2872, + "epoch": 1.6075611777849859, + "grad_norm": 0.17418605089187622, + "learning_rate": 3.9668461142533807e-05, + "loss": 0.4135, "step": 44605 }, { - "epoch": 1.57, - "learning_rate": 4.016226536118335e-05, - "loss": 0.2724, + "epoch": 1.6077413774462104, + "grad_norm": 0.19740115106105804, + "learning_rate": 3.9666098006527653e-05, + "loss": 0.3922, "step": 44610 }, { - "epoch": 1.57, - "learning_rate": 4.016000027108471e-05, - "loss": 0.2988, + "epoch": 1.6079215771074349, + "grad_norm": 0.18826255202293396, + "learning_rate": 3.966373467069874e-05, + "loss": 0.4254, "step": 44615 }, { - "epoch": 1.57, - "learning_rate": 4.015773498414345e-05, - "loss": 0.2687, + "epoch": 1.6081017767686596, + "grad_norm": 0.18724419176578522, + "learning_rate": 3.9661371135079285e-05, + "loss": 0.4175, "step": 44620 }, { - "epoch": 1.57, - "learning_rate": 4.015546950038899e-05, - "loss": 0.2678, + "epoch": 1.6082819764298844, + "grad_norm": 0.20004336535930634, + "learning_rate": 3.9659007399701485e-05, + "loss": 0.4015, "step": 44625 }, { - "epoch": 1.57, - "learning_rate": 4.015320381985075e-05, - "loss": 0.2811, + "epoch": 1.608462176091109, + "grad_norm": 0.2576258182525635, + "learning_rate": 3.9656643464597545e-05, + "loss": 0.4176, "step": 44630 }, { - "epoch": 1.57, - "learning_rate": 4.015093794255814e-05, - "loss": 0.2692, + "epoch": 1.6086423757523336, + "grad_norm": 0.1877816915512085, + "learning_rate": 3.9654279329799684e-05, + "loss": 0.4314, "step": 44635 }, { - "epoch": 1.57, - "learning_rate": 4.0148671868540596e-05, - "loss": 0.2624, + "epoch": 1.608822575413558, + "grad_norm": 0.20604005455970764, + "learning_rate": 3.96519149953401e-05, + "loss": 0.4033, "step": 44640 }, { - "epoch": 1.57, - "learning_rate": 4.014640559782752e-05, - "loss": 0.3, + "epoch": 1.6090027750747828, + "grad_norm": 0.17732234299182892, + "learning_rate": 3.964955046125101e-05, + "loss": 0.4149, "step": 44645 }, { - "epoch": 1.57, - "learning_rate": 4.014413913044835e-05, - "loss": 0.2471, + "epoch": 1.6091829747360076, + "grad_norm": 0.22829070687294006, + "learning_rate": 
3.964718572756463e-05, + "loss": 0.3822, "step": 44650 }, { - "epoch": 1.57, - "learning_rate": 4.0141872466432515e-05, - "loss": 0.2739, + "epoch": 1.609363174397232, + "grad_norm": 0.20384085178375244, + "learning_rate": 3.964482079431319e-05, + "loss": 0.411, "step": 44655 }, { - "epoch": 1.57, - "learning_rate": 4.013960560580945e-05, - "loss": 0.2639, + "epoch": 1.6095433740584568, + "grad_norm": 0.21670548617839813, + "learning_rate": 3.9642455661528885e-05, + "loss": 0.4065, "step": 44660 }, { - "epoch": 1.57, - "learning_rate": 4.013733854860858e-05, - "loss": 0.2871, + "epoch": 1.6097235737196813, + "grad_norm": 0.18137463927268982, + "learning_rate": 3.964009032924396e-05, + "loss": 0.3918, "step": 44665 }, { - "epoch": 1.57, - "learning_rate": 4.013507129485935e-05, - "loss": 0.2904, + "epoch": 1.609903773380906, + "grad_norm": 0.16431452333927155, + "learning_rate": 3.963772479749065e-05, + "loss": 0.3946, "step": 44670 }, { - "epoch": 1.57, - "learning_rate": 4.0132803844591186e-05, - "loss": 0.2661, + "epoch": 1.6100839730421308, + "grad_norm": 0.17995165288448334, + "learning_rate": 3.963535906630117e-05, + "loss": 0.4308, "step": 44675 }, { - "epoch": 1.57, - "learning_rate": 4.013053619783355e-05, - "loss": 0.2731, + "epoch": 1.6102641727033553, + "grad_norm": 0.17555540800094604, + "learning_rate": 3.9632993135707755e-05, + "loss": 0.4113, "step": 44680 }, { - "epoch": 1.57, - "learning_rate": 4.012826835461585e-05, - "loss": 0.2718, + "epoch": 1.6104443723645798, + "grad_norm": 0.21081358194351196, + "learning_rate": 3.963062700574264e-05, + "loss": 0.4261, "step": 44685 }, { - "epoch": 1.57, - "learning_rate": 4.012600031496757e-05, - "loss": 0.2877, + "epoch": 1.6106245720258046, + "grad_norm": 0.18575575947761536, + "learning_rate": 3.9628260676438064e-05, + "loss": 0.3961, "step": 44690 }, { - "epoch": 1.57, - "learning_rate": 4.012373207891814e-05, - "loss": 0.2779, + "epoch": 1.6108047716870293, + "grad_norm": 0.16622988879680634, + "learning_rate": 3.962589414782627e-05, + "loss": 0.4321, "step": 44695 }, { - "epoch": 1.57, - "learning_rate": 4.012146364649702e-05, - "loss": 0.288, + "epoch": 1.610984971348254, + "grad_norm": 0.19155137240886688, + "learning_rate": 3.96235274199395e-05, + "loss": 0.4032, "step": 44700 }, { - "epoch": 1.57, - "learning_rate": 4.011919501773366e-05, - "loss": 0.2802, + "epoch": 1.6111651710094785, + "grad_norm": 0.15774855017662048, + "learning_rate": 3.962116049280999e-05, + "loss": 0.4115, "step": 44705 }, { - "epoch": 1.57, - "learning_rate": 4.011692619265751e-05, - "loss": 0.2772, + "epoch": 1.611345370670703, + "grad_norm": 0.17993386089801788, + "learning_rate": 3.961879336647001e-05, + "loss": 0.4065, "step": 44710 }, { - "epoch": 1.57, - "learning_rate": 4.011465717129805e-05, - "loss": 0.2637, + "epoch": 1.6115255703319278, + "grad_norm": 0.18789418041706085, + "learning_rate": 3.961642604095181e-05, + "loss": 0.391, "step": 44715 }, { - "epoch": 1.57, - "learning_rate": 4.011238795368471e-05, - "loss": 0.2746, + "epoch": 1.6117057699931525, + "grad_norm": 0.1855124831199646, + "learning_rate": 3.9614058516287624e-05, + "loss": 0.4414, "step": 44720 }, { - "epoch": 1.57, - "learning_rate": 4.0110118539846986e-05, - "loss": 0.29, + "epoch": 1.611885969654377, + "grad_norm": 0.17999990284442902, + "learning_rate": 3.961169079250971e-05, + "loss": 0.385, "step": 44725 }, { - "epoch": 1.57, - "learning_rate": 4.0107848929814325e-05, - "loss": 0.3046, + "epoch": 1.6120661693156015, + "grad_norm": 0.1700844168663025, + 
"learning_rate": 3.9609322869650354e-05, + "loss": 0.4109, "step": 44730 }, { - "epoch": 1.57, - "learning_rate": 4.01055791236162e-05, - "loss": 0.3014, + "epoch": 1.6122463689768263, + "grad_norm": 0.1721271425485611, + "learning_rate": 3.9606954747741797e-05, + "loss": 0.4331, "step": 44735 }, { - "epoch": 1.57, - "learning_rate": 4.0103309121282085e-05, - "loss": 0.2705, + "epoch": 1.612426568638051, + "grad_norm": 0.17084349691867828, + "learning_rate": 3.960458642681631e-05, + "loss": 0.3995, "step": 44740 }, { - "epoch": 1.57, - "learning_rate": 4.010103892284146e-05, - "loss": 0.3123, + "epoch": 1.6126067682992757, + "grad_norm": 0.14765715599060059, + "learning_rate": 3.960221790690616e-05, + "loss": 0.4097, "step": 44745 }, { - "epoch": 1.57, - "learning_rate": 4.009876852832379e-05, - "loss": 0.2675, + "epoch": 1.6127869679605003, + "grad_norm": 0.16474126279354095, + "learning_rate": 3.959984918804361e-05, + "loss": 0.4378, "step": 44750 }, { - "epoch": 1.57, - "learning_rate": 4.009649793775857e-05, - "loss": 0.2937, + "epoch": 1.6129671676217248, + "grad_norm": 0.2174513041973114, + "learning_rate": 3.959748027026095e-05, + "loss": 0.3976, "step": 44755 }, { - "epoch": 1.57, - "learning_rate": 4.009422715117526e-05, - "loss": 0.2638, + "epoch": 1.6131473672829495, + "grad_norm": 0.15356197953224182, + "learning_rate": 3.959511115359045e-05, + "loss": 0.4151, "step": 44760 }, { - "epoch": 1.57, - "learning_rate": 4.009195616860337e-05, - "loss": 0.3036, + "epoch": 1.6133275669441742, + "grad_norm": 0.18707029521465302, + "learning_rate": 3.959274183806438e-05, + "loss": 0.4231, "step": 44765 }, { - "epoch": 1.58, - "learning_rate": 4.008968499007237e-05, - "loss": 0.2876, + "epoch": 1.6135077666053987, + "grad_norm": 0.20073364675045013, + "learning_rate": 3.959037232371503e-05, + "loss": 0.4256, "step": 44770 }, { - "epoch": 1.58, - "learning_rate": 4.0087413615611756e-05, - "loss": 0.2855, + "epoch": 1.6136879662666233, + "grad_norm": 0.20367778837680817, + "learning_rate": 3.9588002610574694e-05, + "loss": 0.4231, "step": 44775 }, { - "epoch": 1.58, - "learning_rate": 4.008514204525101e-05, - "loss": 0.3001, + "epoch": 1.613868165927848, + "grad_norm": 0.15489740669727325, + "learning_rate": 3.958563269867563e-05, + "loss": 0.3753, "step": 44780 }, { - "epoch": 1.58, - "learning_rate": 4.008287027901965e-05, - "loss": 0.3207, + "epoch": 1.6140483655890727, + "grad_norm": 0.18717733025550842, + "learning_rate": 3.958326258805015e-05, + "loss": 0.4145, "step": 44785 }, { - "epoch": 1.58, - "learning_rate": 4.008059831694716e-05, - "loss": 0.2864, + "epoch": 1.6142285652502975, + "grad_norm": 0.1864037811756134, + "learning_rate": 3.9580892278730534e-05, + "loss": 0.3988, "step": 44790 }, { - "epoch": 1.58, - "learning_rate": 4.0078326159063026e-05, - "loss": 0.2738, + "epoch": 1.614408764911522, + "grad_norm": 0.19516822695732117, + "learning_rate": 3.95785217707491e-05, + "loss": 0.4386, "step": 44795 }, { - "epoch": 1.58, - "learning_rate": 4.007605380539677e-05, - "loss": 0.2664, + "epoch": 1.6145889645727465, + "grad_norm": 0.17213517427444458, + "learning_rate": 3.957615106413811e-05, + "loss": 0.4205, "step": 44800 }, { - "epoch": 1.58, - "learning_rate": 4.0073781255977894e-05, - "loss": 0.2699, + "epoch": 1.6147691642339712, + "grad_norm": 0.16478149592876434, + "learning_rate": 3.95737801589299e-05, + "loss": 0.3819, "step": 44805 }, { - "epoch": 1.58, - "learning_rate": 4.00715085108359e-05, - "loss": 0.2927, + "epoch": 1.614949363895196, + "grad_norm": 
0.15579113364219666, + "learning_rate": 3.957140905515674e-05, + "loss": 0.4181, "step": 44810 }, { - "epoch": 1.58, - "learning_rate": 4.00692355700003e-05, - "loss": 0.2615, + "epoch": 1.6151295635564207, + "grad_norm": 0.21928997337818146, + "learning_rate": 3.956903775285097e-05, + "loss": 0.4157, "step": 44815 }, { - "epoch": 1.58, - "learning_rate": 4.006696243350061e-05, - "loss": 0.2782, + "epoch": 1.6153097632176452, + "grad_norm": 0.14107581973075867, + "learning_rate": 3.956666625204487e-05, + "loss": 0.3837, "step": 44820 }, { - "epoch": 1.58, - "learning_rate": 4.0064689101366336e-05, - "loss": 0.2823, + "epoch": 1.6154899628788697, + "grad_norm": 0.19918832182884216, + "learning_rate": 3.956429455277077e-05, + "loss": 0.4018, "step": 44825 }, { - "epoch": 1.58, - "learning_rate": 4.006241557362701e-05, - "loss": 0.2789, + "epoch": 1.6156701625400944, + "grad_norm": 0.19882018864154816, + "learning_rate": 3.9561922655060965e-05, + "loss": 0.4323, "step": 44830 }, { - "epoch": 1.58, - "learning_rate": 4.0060141850312135e-05, - "loss": 0.2832, + "epoch": 1.6158503622013192, + "grad_norm": 0.17899902164936066, + "learning_rate": 3.95595505589478e-05, + "loss": 0.4351, "step": 44835 }, { - "epoch": 1.58, - "learning_rate": 4.005786793145126e-05, - "loss": 0.2645, + "epoch": 1.6160305618625437, + "grad_norm": 0.14967146515846252, + "learning_rate": 3.955717826446357e-05, + "loss": 0.4039, "step": 44840 }, { - "epoch": 1.58, - "learning_rate": 4.005559381707388e-05, - "loss": 0.275, + "epoch": 1.6162107615237682, + "grad_norm": 0.17827114462852478, + "learning_rate": 3.95548057716406e-05, + "loss": 0.3759, "step": 44845 }, { - "epoch": 1.58, - "learning_rate": 4.005331950720953e-05, - "loss": 0.283, + "epoch": 1.616390961184993, + "grad_norm": 0.19635485112667084, + "learning_rate": 3.955243308051122e-05, + "loss": 0.4571, "step": 44850 }, { - "epoch": 1.58, - "learning_rate": 4.0051045001887755e-05, - "loss": 0.283, + "epoch": 1.6165711608462177, + "grad_norm": 0.20336943864822388, + "learning_rate": 3.955006019110776e-05, + "loss": 0.4104, "step": 44855 }, { - "epoch": 1.58, - "learning_rate": 4.004877030113808e-05, - "loss": 0.3178, + "epoch": 1.6167513605074424, + "grad_norm": 0.16588455438613892, + "learning_rate": 3.954768710346255e-05, + "loss": 0.3808, "step": 44860 }, { - "epoch": 1.58, - "learning_rate": 4.0046495404990043e-05, - "loss": 0.2851, + "epoch": 1.616931560168667, + "grad_norm": 0.18041035532951355, + "learning_rate": 3.954531381760791e-05, + "loss": 0.3845, "step": 44865 }, { - "epoch": 1.58, - "learning_rate": 4.004422031347318e-05, - "loss": 0.2787, + "epoch": 1.6171117598298914, + "grad_norm": 0.19246827065944672, + "learning_rate": 3.95429403335762e-05, + "loss": 0.4226, "step": 44870 }, { - "epoch": 1.58, - "learning_rate": 4.0041945026617024e-05, - "loss": 0.274, + "epoch": 1.6172919594911161, + "grad_norm": 0.1802048683166504, + "learning_rate": 3.954056665139972e-05, + "loss": 0.4507, "step": 44875 }, { - "epoch": 1.58, - "learning_rate": 4.0039669544451126e-05, - "loss": 0.278, + "epoch": 1.6174721591523409, + "grad_norm": 0.1771692931652069, + "learning_rate": 3.9538192771110855e-05, + "loss": 0.3853, "step": 44880 }, { - "epoch": 1.58, - "learning_rate": 4.003739386700503e-05, - "loss": 0.2996, + "epoch": 1.6176523588135654, + "grad_norm": 0.19818015396595, + "learning_rate": 3.953581869274192e-05, + "loss": 0.4143, "step": 44885 }, { - "epoch": 1.58, - "learning_rate": 4.003511799430829e-05, - "loss": 0.3108, + "epoch": 1.61783255847479, + 
"grad_norm": 0.19741560518741608, + "learning_rate": 3.953344441632527e-05, + "loss": 0.4189, "step": 44890 }, { - "epoch": 1.58, - "learning_rate": 4.003284192639045e-05, - "loss": 0.28, + "epoch": 1.6180127581360146, + "grad_norm": 0.15774419903755188, + "learning_rate": 3.953106994189326e-05, + "loss": 0.4124, "step": 44895 }, { - "epoch": 1.58, - "learning_rate": 4.003056566328106e-05, - "loss": 0.2945, + "epoch": 1.6181929577972394, + "grad_norm": 0.1899198293685913, + "learning_rate": 3.952869526947823e-05, + "loss": 0.4436, "step": 44900 }, { - "epoch": 1.58, - "learning_rate": 4.002828920500968e-05, - "loss": 0.3199, + "epoch": 1.618373157458464, + "grad_norm": 0.19726921617984772, + "learning_rate": 3.9526320399112536e-05, + "loss": 0.3846, "step": 44905 }, { - "epoch": 1.58, - "learning_rate": 4.002601255160587e-05, - "loss": 0.2892, + "epoch": 1.6185533571196886, + "grad_norm": 0.18593871593475342, + "learning_rate": 3.952394533082855e-05, + "loss": 0.4035, "step": 44910 }, { - "epoch": 1.58, - "learning_rate": 4.00237357030992e-05, - "loss": 0.3298, + "epoch": 1.6187335567809131, + "grad_norm": 0.1598072648048401, + "learning_rate": 3.952157006465861e-05, + "loss": 0.4077, "step": 44915 }, { - "epoch": 1.58, - "learning_rate": 4.002145865951921e-05, - "loss": 0.2979, + "epoch": 1.6189137564421379, + "grad_norm": 0.2124381959438324, + "learning_rate": 3.95191946006351e-05, + "loss": 0.4266, "step": 44920 }, { - "epoch": 1.58, - "learning_rate": 4.001918142089548e-05, - "loss": 0.2916, + "epoch": 1.6190939561033626, + "grad_norm": 0.16838979721069336, + "learning_rate": 3.951681893879036e-05, + "loss": 0.4073, "step": 44925 }, { - "epoch": 1.58, - "learning_rate": 4.001690398725758e-05, - "loss": 0.2871, + "epoch": 1.619274155764587, + "grad_norm": 0.16557417809963226, + "learning_rate": 3.951444307915678e-05, + "loss": 0.4183, "step": 44930 }, { - "epoch": 1.58, - "learning_rate": 4.0014626358635084e-05, - "loss": 0.2833, + "epoch": 1.6194543554258118, + "grad_norm": 0.17037592828273773, + "learning_rate": 3.951206702176672e-05, + "loss": 0.3463, "step": 44935 }, { - "epoch": 1.58, - "learning_rate": 4.001234853505755e-05, - "loss": 0.2868, + "epoch": 1.6196345550870364, + "grad_norm": 0.16905367374420166, + "learning_rate": 3.950969076665256e-05, + "loss": 0.3978, "step": 44940 }, { - "epoch": 1.58, - "learning_rate": 4.001007051655457e-05, - "loss": 0.2803, + "epoch": 1.619814754748261, + "grad_norm": 0.16748353838920593, + "learning_rate": 3.950731431384668e-05, + "loss": 0.461, "step": 44945 }, { - "epoch": 1.58, - "learning_rate": 4.000779230315571e-05, - "loss": 0.2863, + "epoch": 1.6199949544094858, + "grad_norm": 0.1732574999332428, + "learning_rate": 3.950493766338144e-05, + "loss": 0.407, "step": 44950 }, { - "epoch": 1.58, - "learning_rate": 4.000551389489057e-05, - "loss": 0.2994, + "epoch": 1.6201751540707103, + "grad_norm": 0.12833158671855927, + "learning_rate": 3.9502560815289236e-05, + "loss": 0.3877, "step": 44955 }, { - "epoch": 1.58, - "learning_rate": 4.000323529178871e-05, - "loss": 0.3044, + "epoch": 1.6203553537319348, + "grad_norm": 0.17978282272815704, + "learning_rate": 3.950018376960245e-05, + "loss": 0.4099, "step": 44960 }, { - "epoch": 1.58, - "learning_rate": 4.000095649387973e-05, - "loss": 0.2818, + "epoch": 1.6205355533931596, + "grad_norm": 0.19126909971237183, + "learning_rate": 3.949780652635347e-05, + "loss": 0.4621, "step": 44965 }, { - "epoch": 1.58, - "learning_rate": 3.9998677501193214e-05, - "loss": 0.2852, + "epoch": 
1.6207157530543843, + "grad_norm": 0.1761917769908905, + "learning_rate": 3.9495429085574675e-05, + "loss": 0.4437, "step": 44970 }, { - "epoch": 1.58, - "learning_rate": 3.999639831375876e-05, - "loss": 0.2878, + "epoch": 1.620895952715609, + "grad_norm": 0.21770428121089935, + "learning_rate": 3.949305144729847e-05, + "loss": 0.4273, "step": 44975 }, { - "epoch": 1.58, - "learning_rate": 3.999411893160595e-05, - "loss": 0.283, + "epoch": 1.6210761523768336, + "grad_norm": 0.21799218654632568, + "learning_rate": 3.949067361155725e-05, + "loss": 0.4328, "step": 44980 }, { - "epoch": 1.58, - "learning_rate": 3.9991839354764394e-05, - "loss": 0.2982, + "epoch": 1.621256352038058, + "grad_norm": 0.21713624894618988, + "learning_rate": 3.94882955783834e-05, + "loss": 0.4416, "step": 44985 }, { - "epoch": 1.58, - "learning_rate": 3.998955958326368e-05, - "loss": 0.2903, + "epoch": 1.6214365516992828, + "grad_norm": 0.1928412914276123, + "learning_rate": 3.9485917347809334e-05, + "loss": 0.4139, "step": 44990 }, { - "epoch": 1.58, - "learning_rate": 3.998727961713342e-05, - "loss": 0.305, + "epoch": 1.6216167513605075, + "grad_norm": 0.24302361905574799, + "learning_rate": 3.948353891986743e-05, + "loss": 0.4207, "step": 44995 }, { - "epoch": 1.58, - "learning_rate": 3.99849994564032e-05, - "loss": 0.2845, + "epoch": 1.621796951021732, + "grad_norm": 0.17859376966953278, + "learning_rate": 3.948116029459014e-05, + "loss": 0.4249, "step": 45000 }, { - "epoch": 1.58, - "eval_loss": 0.27851128578186035, - "eval_runtime": 10.5579, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 1.621796951021732, + "eval_loss": 0.44367220997810364, + "eval_runtime": 3.5481, + "eval_samples_per_second": 28.184, + "eval_steps_per_second": 7.046, "step": 45000 }, { - "epoch": 1.58, - "learning_rate": 3.998271910110264e-05, - "loss": 0.3026, + "epoch": 1.6219771506829566, + "grad_norm": 0.18802441656589508, + "learning_rate": 3.947878147200983e-05, + "loss": 0.4004, "step": 45005 }, { - "epoch": 1.58, - "learning_rate": 3.998043855126135e-05, - "loss": 0.2869, + "epoch": 1.6221573503441813, + "grad_norm": 0.2054462730884552, + "learning_rate": 3.9476402452158926e-05, + "loss": 0.4113, "step": 45010 }, { - "epoch": 1.58, - "learning_rate": 3.997815780690895e-05, - "loss": 0.2622, + "epoch": 1.622337550005406, + "grad_norm": 0.17358975112438202, + "learning_rate": 3.947402323506984e-05, + "loss": 0.4125, "step": 45015 }, { - "epoch": 1.58, - "learning_rate": 3.997587686807502e-05, - "loss": 0.3, + "epoch": 1.6225177496666308, + "grad_norm": 0.2103101909160614, + "learning_rate": 3.9471643820774995e-05, + "loss": 0.4065, "step": 45020 }, { - "epoch": 1.58, - "learning_rate": 3.997359573478921e-05, - "loss": 0.2866, + "epoch": 1.6226979493278553, + "grad_norm": 0.18095333874225616, + "learning_rate": 3.94692642093068e-05, + "loss": 0.4137, "step": 45025 }, { - "epoch": 1.58, - "learning_rate": 3.997131440708112e-05, - "loss": 0.2857, + "epoch": 1.6228781489890798, + "grad_norm": 0.17532798647880554, + "learning_rate": 3.946688440069768e-05, + "loss": 0.4145, "step": 45030 }, { - "epoch": 1.58, - "learning_rate": 3.9969032884980386e-05, - "loss": 0.2889, + "epoch": 1.6230583486503045, + "grad_norm": 0.21436169743537903, + "learning_rate": 3.946450439498006e-05, + "loss": 0.432, "step": 45035 }, { - "epoch": 1.58, - "learning_rate": 3.9966751168516626e-05, - "loss": 0.2875, + "epoch": 1.6232385483115293, + "grad_norm": 0.15949729084968567, + "learning_rate": 3.946212419218638e-05, + "loss": 
0.4074, "step": 45040 }, { - "epoch": 1.58, - "learning_rate": 3.996446925771946e-05, - "loss": 0.3088, + "epoch": 1.6234187479727538, + "grad_norm": 0.17256999015808105, + "learning_rate": 3.945974379234905e-05, + "loss": 0.4371, "step": 45045 }, { - "epoch": 1.58, - "learning_rate": 3.9962187152618525e-05, - "loss": 0.3145, + "epoch": 1.6235989476339785, + "grad_norm": 0.2011367380619049, + "learning_rate": 3.945736319550051e-05, + "loss": 0.443, "step": 45050 }, { - "epoch": 1.59, - "learning_rate": 3.995990485324345e-05, - "loss": 0.291, + "epoch": 1.623779147295203, + "grad_norm": 0.14212122559547424, + "learning_rate": 3.945498240167319e-05, + "loss": 0.4378, "step": 45055 }, { - "epoch": 1.59, - "learning_rate": 3.995762235962387e-05, - "loss": 0.2996, + "epoch": 1.6239593469564277, + "grad_norm": 0.21600797772407532, + "learning_rate": 3.9452601410899544e-05, + "loss": 0.4043, "step": 45060 }, { - "epoch": 1.59, - "learning_rate": 3.995533967178941e-05, - "loss": 0.2641, + "epoch": 1.6241395466176525, + "grad_norm": 0.16741763055324554, + "learning_rate": 3.9450220223212e-05, + "loss": 0.4199, "step": 45065 }, { - "epoch": 1.59, - "learning_rate": 3.995305678976974e-05, - "loss": 0.293, + "epoch": 1.624319746278877, + "grad_norm": 0.1476602405309677, + "learning_rate": 3.9447838838643e-05, + "loss": 0.3941, "step": 45070 }, { - "epoch": 1.59, - "learning_rate": 3.995077371359448e-05, - "loss": 0.2948, + "epoch": 1.6244999459401015, + "grad_norm": 0.17911425232887268, + "learning_rate": 3.9445457257224996e-05, + "loss": 0.4306, "step": 45075 }, { - "epoch": 1.59, - "learning_rate": 3.994849044329326e-05, - "loss": 0.2961, + "epoch": 1.6246801456013262, + "grad_norm": 0.17017336189746857, + "learning_rate": 3.944307547899042e-05, + "loss": 0.4172, "step": 45080 }, { - "epoch": 1.59, - "learning_rate": 3.9946206978895754e-05, - "loss": 0.2595, + "epoch": 1.624860345262551, + "grad_norm": 0.17422279715538025, + "learning_rate": 3.944069350397175e-05, + "loss": 0.3886, "step": 45085 }, { - "epoch": 1.59, - "learning_rate": 3.99439233204316e-05, - "loss": 0.294, + "epoch": 1.6250405449237757, + "grad_norm": 0.1593707799911499, + "learning_rate": 3.9438311332201424e-05, + "loss": 0.3895, "step": 45090 }, { - "epoch": 1.59, - "learning_rate": 3.994163946793045e-05, - "loss": 0.291, + "epoch": 1.6252207445850002, + "grad_norm": 0.1855667531490326, + "learning_rate": 3.94359289637119e-05, + "loss": 0.4233, "step": 45095 }, { - "epoch": 1.59, - "learning_rate": 3.993935542142196e-05, - "loss": 0.2901, + "epoch": 1.6254009442462247, + "grad_norm": 0.16529178619384766, + "learning_rate": 3.943354639853565e-05, + "loss": 0.3903, "step": 45100 }, { - "epoch": 1.59, - "learning_rate": 3.993707118093578e-05, - "loss": 0.2719, + "epoch": 1.6255811439074495, + "grad_norm": 0.15914855897426605, + "learning_rate": 3.9431163636705114e-05, + "loss": 0.4092, "step": 45105 }, { - "epoch": 1.59, - "learning_rate": 3.993478674650158e-05, - "loss": 0.2859, + "epoch": 1.6257613435686742, + "grad_norm": 0.21924299001693726, + "learning_rate": 3.942878067825277e-05, + "loss": 0.4527, "step": 45110 }, { - "epoch": 1.59, - "learning_rate": 3.993250211814901e-05, - "loss": 0.2932, + "epoch": 1.6259415432298987, + "grad_norm": 0.16359423100948334, + "learning_rate": 3.942639752321108e-05, + "loss": 0.3986, "step": 45115 }, { - "epoch": 1.59, - "learning_rate": 3.993021729590775e-05, - "loss": 0.2849, + "epoch": 1.6261217428911232, + "grad_norm": 0.23440222442150116, + "learning_rate": 3.942401417161252e-05, + 
"loss": 0.4406, "step": 45120 }, { - "epoch": 1.59, - "learning_rate": 3.992793227980744e-05, - "loss": 0.2834, + "epoch": 1.626301942552348, + "grad_norm": 0.2375732809305191, + "learning_rate": 3.942163062348956e-05, + "loss": 0.4347, "step": 45125 }, { - "epoch": 1.59, - "learning_rate": 3.992564706987779e-05, - "loss": 0.2918, + "epoch": 1.6264821422135727, + "grad_norm": 0.19271384179592133, + "learning_rate": 3.9419246878874674e-05, + "loss": 0.4476, "step": 45130 }, { - "epoch": 1.59, - "learning_rate": 3.992336166614844e-05, - "loss": 0.2689, + "epoch": 1.6266623418747974, + "grad_norm": 0.1916663497686386, + "learning_rate": 3.9416862937800335e-05, + "loss": 0.4026, "step": 45135 }, { - "epoch": 1.59, - "learning_rate": 3.9921076068649073e-05, - "loss": 0.3011, + "epoch": 1.626842541536022, + "grad_norm": 0.18741334974765778, + "learning_rate": 3.941447880029903e-05, + "loss": 0.3952, "step": 45140 }, { - "epoch": 1.59, - "learning_rate": 3.991879027740936e-05, - "loss": 0.2934, + "epoch": 1.6270227411972464, + "grad_norm": 0.16537748277187347, + "learning_rate": 3.941209446640325e-05, + "loss": 0.4506, "step": 45145 }, { - "epoch": 1.59, - "learning_rate": 3.9916504292459003e-05, - "loss": 0.2646, + "epoch": 1.6272029408584712, + "grad_norm": 0.17888925969600677, + "learning_rate": 3.940970993614547e-05, + "loss": 0.4331, "step": 45150 }, { - "epoch": 1.59, - "learning_rate": 3.991421811382765e-05, - "loss": 0.313, + "epoch": 1.627383140519696, + "grad_norm": 0.1680968850851059, + "learning_rate": 3.940732520955818e-05, + "loss": 0.4103, "step": 45155 }, { - "epoch": 1.59, - "learning_rate": 3.9911931741545014e-05, - "loss": 0.2736, + "epoch": 1.6275633401809204, + "grad_norm": 0.22438734769821167, + "learning_rate": 3.940494028667387e-05, + "loss": 0.4185, "step": 45160 }, { - "epoch": 1.59, - "learning_rate": 3.990964517564077e-05, - "loss": 0.288, + "epoch": 1.6277435398421451, + "grad_norm": 0.1568988710641861, + "learning_rate": 3.940255516752504e-05, + "loss": 0.404, "step": 45165 }, { - "epoch": 1.59, - "learning_rate": 3.9907358416144606e-05, - "loss": 0.315, + "epoch": 1.6279237395033697, + "grad_norm": 0.18171727657318115, + "learning_rate": 3.94001698521442e-05, + "loss": 0.3919, "step": 45170 }, { - "epoch": 1.59, - "learning_rate": 3.990507146308622e-05, - "loss": 0.2904, + "epoch": 1.6281039391645944, + "grad_norm": 0.20402833819389343, + "learning_rate": 3.9397784340563813e-05, + "loss": 0.3906, "step": 45175 }, { - "epoch": 1.59, - "learning_rate": 3.99027843164953e-05, - "loss": 0.2748, + "epoch": 1.6282841388258191, + "grad_norm": 0.2102474421262741, + "learning_rate": 3.939539863281641e-05, + "loss": 0.4252, "step": 45180 }, { - "epoch": 1.59, - "learning_rate": 3.990049697640156e-05, - "loss": 0.2799, + "epoch": 1.6284643384870436, + "grad_norm": 0.15504828095436096, + "learning_rate": 3.939301272893449e-05, + "loss": 0.394, "step": 45185 }, { - "epoch": 1.59, - "learning_rate": 3.9898209442834674e-05, - "loss": 0.2663, + "epoch": 1.6286445381482682, + "grad_norm": 0.19128242135047913, + "learning_rate": 3.939062662895055e-05, + "loss": 0.3909, "step": 45190 }, { - "epoch": 1.59, - "learning_rate": 3.9895921715824355e-05, - "loss": 0.2832, + "epoch": 1.6288247378094929, + "grad_norm": 0.18570300936698914, + "learning_rate": 3.938824033289712e-05, + "loss": 0.4437, "step": 45195 }, { - "epoch": 1.59, - "learning_rate": 3.989363379540032e-05, - "loss": 0.2778, + "epoch": 1.6290049374707176, + "grad_norm": 0.1933826506137848, + "learning_rate": 
3.938585384080668e-05, + "loss": 0.4038, "step": 45200 }, { - "epoch": 1.59, - "learning_rate": 3.989134568159226e-05, - "loss": 0.3048, + "epoch": 1.6291851371319424, + "grad_norm": 0.17036275565624237, + "learning_rate": 3.93834671527118e-05, + "loss": 0.4106, "step": 45205 }, { - "epoch": 1.59, - "learning_rate": 3.988905737442989e-05, - "loss": 0.283, + "epoch": 1.6293653367931669, + "grad_norm": 0.2243850827217102, + "learning_rate": 3.9381080268644936e-05, + "loss": 0.4018, "step": 45210 }, { - "epoch": 1.59, - "learning_rate": 3.988676887394292e-05, - "loss": 0.2761, + "epoch": 1.6295455364543914, + "grad_norm": 0.2178872525691986, + "learning_rate": 3.9378693188638646e-05, + "loss": 0.4064, "step": 45215 }, { - "epoch": 1.59, - "learning_rate": 3.988448018016107e-05, - "loss": 0.2957, + "epoch": 1.629725736115616, + "grad_norm": 0.2029925286769867, + "learning_rate": 3.937630591272545e-05, + "loss": 0.4244, "step": 45220 }, { - "epoch": 1.59, - "learning_rate": 3.9882191293114055e-05, - "loss": 0.2997, + "epoch": 1.6299059357768408, + "grad_norm": 0.20879967510700226, + "learning_rate": 3.937391844093786e-05, + "loss": 0.4523, "step": 45225 }, { - "epoch": 1.59, - "learning_rate": 3.9879902212831586e-05, - "loss": 0.2857, + "epoch": 1.6300861354380654, + "grad_norm": 0.1707211285829544, + "learning_rate": 3.937153077330843e-05, + "loss": 0.4215, "step": 45230 }, { - "epoch": 1.59, - "learning_rate": 3.98776129393434e-05, - "loss": 0.2847, + "epoch": 1.6302663350992899, + "grad_norm": 0.17152579128742218, + "learning_rate": 3.936914290986966e-05, + "loss": 0.4481, "step": 45235 }, { - "epoch": 1.59, - "learning_rate": 3.987532347267922e-05, - "loss": 0.318, + "epoch": 1.6304465347605146, + "grad_norm": 0.17980226874351501, + "learning_rate": 3.9366754850654106e-05, + "loss": 0.3495, "step": 45240 }, { - "epoch": 1.59, - "learning_rate": 3.987303381286876e-05, - "loss": 0.2782, + "epoch": 1.6306267344217393, + "grad_norm": 0.18112671375274658, + "learning_rate": 3.9364366595694296e-05, + "loss": 0.3669, "step": 45245 }, { - "epoch": 1.59, - "learning_rate": 3.987074395994176e-05, - "loss": 0.2763, + "epoch": 1.630806934082964, + "grad_norm": 0.20279546082019806, + "learning_rate": 3.936197814502278e-05, + "loss": 0.3874, "step": 45250 }, { - "epoch": 1.59, - "learning_rate": 3.986845391392795e-05, - "loss": 0.2799, + "epoch": 1.6309871337441886, + "grad_norm": 0.15267014503479004, + "learning_rate": 3.9359589498672086e-05, + "loss": 0.4019, "step": 45255 }, { - "epoch": 1.59, - "learning_rate": 3.986616367485706e-05, - "loss": 0.2632, + "epoch": 1.631167333405413, + "grad_norm": 0.16568580269813538, + "learning_rate": 3.9357200656674764e-05, + "loss": 0.4233, "step": 45260 }, { - "epoch": 1.59, - "learning_rate": 3.986387324275884e-05, - "loss": 0.3109, + "epoch": 1.6313475330666378, + "grad_norm": 0.17788587510585785, + "learning_rate": 3.935481161906336e-05, + "loss": 0.3938, "step": 45265 }, { - "epoch": 1.59, - "learning_rate": 3.9861582617663014e-05, - "loss": 0.3049, + "epoch": 1.6315277327278626, + "grad_norm": 0.19289278984069824, + "learning_rate": 3.935242238587043e-05, + "loss": 0.4054, "step": 45270 }, { - "epoch": 1.59, - "learning_rate": 3.9859291799599334e-05, - "loss": 0.287, + "epoch": 1.631707932389087, + "grad_norm": 0.17799817025661469, + "learning_rate": 3.935003295712853e-05, + "loss": 0.4, "step": 45275 }, { - "epoch": 1.59, - "learning_rate": 3.985700078859754e-05, - "loss": 0.2702, + "epoch": 1.6318881320503116, + "grad_norm": 0.1564566195011139, + 
"learning_rate": 3.9347643332870206e-05, + "loss": 0.4249, "step": 45280 }, { - "epoch": 1.59, - "learning_rate": 3.985470958468739e-05, - "loss": 0.3012, + "epoch": 1.6320683317115363, + "grad_norm": 0.22281388938426971, + "learning_rate": 3.934525351312801e-05, + "loss": 0.396, "step": 45285 }, { - "epoch": 1.59, - "learning_rate": 3.985241818789862e-05, - "loss": 0.3009, + "epoch": 1.632248531372761, + "grad_norm": 0.17600180208683014, + "learning_rate": 3.934286349793452e-05, + "loss": 0.423, "step": 45290 }, { - "epoch": 1.59, - "learning_rate": 3.985012659826099e-05, - "loss": 0.3043, + "epoch": 1.6324287310339858, + "grad_norm": 0.19052568078041077, + "learning_rate": 3.93404732873223e-05, + "loss": 0.4046, "step": 45295 }, { - "epoch": 1.59, - "learning_rate": 3.984783481580425e-05, - "loss": 0.2747, + "epoch": 1.6326089306952103, + "grad_norm": 0.18254795670509338, + "learning_rate": 3.9338082881323896e-05, + "loss": 0.3924, "step": 45300 }, { - "epoch": 1.59, - "learning_rate": 3.984554284055816e-05, - "loss": 0.3035, + "epoch": 1.6327891303564348, + "grad_norm": 0.20248796045780182, + "learning_rate": 3.933569227997189e-05, + "loss": 0.4133, "step": 45305 }, { - "epoch": 1.59, - "learning_rate": 3.984325067255248e-05, - "loss": 0.2666, + "epoch": 1.6329693300176595, + "grad_norm": 0.1688736379146576, + "learning_rate": 3.9333301483298854e-05, + "loss": 0.3878, "step": 45310 }, { - "epoch": 1.59, - "learning_rate": 3.984095831181698e-05, - "loss": 0.3017, + "epoch": 1.6331495296788843, + "grad_norm": 0.17724154889583588, + "learning_rate": 3.9330910491337365e-05, + "loss": 0.3865, "step": 45315 }, { - "epoch": 1.59, - "learning_rate": 3.983866575838141e-05, - "loss": 0.2895, + "epoch": 1.633329729340109, + "grad_norm": 0.1554093062877655, + "learning_rate": 3.932851930411999e-05, + "loss": 0.4509, "step": 45320 }, { - "epoch": 1.59, - "learning_rate": 3.9836373012275543e-05, - "loss": 0.3074, + "epoch": 1.6335099290013335, + "grad_norm": 0.18391548097133636, + "learning_rate": 3.9326127921679315e-05, + "loss": 0.4208, "step": 45325 }, { - "epoch": 1.59, - "learning_rate": 3.983408007352916e-05, - "loss": 0.2672, + "epoch": 1.633690128662558, + "grad_norm": 0.18900564312934875, + "learning_rate": 3.932373634404793e-05, + "loss": 0.412, "step": 45330 }, { - "epoch": 1.6, - "learning_rate": 3.983178694217201e-05, - "loss": 0.2651, + "epoch": 1.6338703283237828, + "grad_norm": 0.17100130021572113, + "learning_rate": 3.932134457125839e-05, + "loss": 0.4065, "step": 45335 }, { - "epoch": 1.6, - "learning_rate": 3.982949361823388e-05, - "loss": 0.3096, + "epoch": 1.6340505279850075, + "grad_norm": 0.17979390919208527, + "learning_rate": 3.931895260334331e-05, + "loss": 0.3887, "step": 45340 }, { - "epoch": 1.6, - "learning_rate": 3.982720010174456e-05, - "loss": 0.2666, + "epoch": 1.634230727646232, + "grad_norm": 0.1904669851064682, + "learning_rate": 3.9316560440335275e-05, + "loss": 0.4119, "step": 45345 }, { - "epoch": 1.6, - "learning_rate": 3.982490639273381e-05, - "loss": 0.2852, + "epoch": 1.6344109273074565, + "grad_norm": 0.16896291077136993, + "learning_rate": 3.931416808226688e-05, + "loss": 0.4423, "step": 45350 }, { - "epoch": 1.6, - "learning_rate": 3.9822612491231425e-05, - "loss": 0.2856, + "epoch": 1.6345911269686813, + "grad_norm": 0.22345204651355743, + "learning_rate": 3.931177552917071e-05, + "loss": 0.4178, "step": 45355 }, { - "epoch": 1.6, - "learning_rate": 3.982031839726718e-05, - "loss": 0.2854, + "epoch": 1.634771326629906, + "grad_norm": 
0.16224905848503113, + "learning_rate": 3.9309382781079375e-05, + "loss": 0.4169, "step": 45360 }, { - "epoch": 1.6, - "learning_rate": 3.981802411087087e-05, - "loss": 0.2934, + "epoch": 1.6349515262911307, + "grad_norm": 0.20584708452224731, + "learning_rate": 3.930698983802547e-05, + "loss": 0.4187, "step": 45365 }, { - "epoch": 1.6, - "learning_rate": 3.981572963207229e-05, - "loss": 0.3026, + "epoch": 1.6351317259523552, + "grad_norm": 0.17952153086662292, + "learning_rate": 3.930459670004159e-05, + "loss": 0.4389, "step": 45370 }, { - "epoch": 1.6, - "learning_rate": 3.981343496090121e-05, - "loss": 0.3136, + "epoch": 1.6353119256135797, + "grad_norm": 0.2221333533525467, + "learning_rate": 3.930220336716036e-05, + "loss": 0.4312, "step": 45375 }, { - "epoch": 1.6, - "learning_rate": 3.9811140097387456e-05, - "loss": 0.2767, + "epoch": 1.6354921252748045, + "grad_norm": 0.16802071034908295, + "learning_rate": 3.929980983941437e-05, + "loss": 0.4034, "step": 45380 }, { - "epoch": 1.6, - "learning_rate": 3.98088450415608e-05, - "loss": 0.299, + "epoch": 1.6356723249360292, + "grad_norm": 0.20047937333583832, + "learning_rate": 3.9297416116836246e-05, + "loss": 0.3942, "step": 45385 }, { - "epoch": 1.6, - "learning_rate": 3.9806549793451046e-05, - "loss": 0.2696, + "epoch": 1.6358525245972537, + "grad_norm": 0.19346770644187927, + "learning_rate": 3.9295022199458596e-05, + "loss": 0.4662, "step": 45390 }, { - "epoch": 1.6, - "learning_rate": 3.9804254353088003e-05, - "loss": 0.2684, + "epoch": 1.6360327242584782, + "grad_norm": 0.18764521181583405, + "learning_rate": 3.9292628087314034e-05, + "loss": 0.4294, "step": 45395 }, { - "epoch": 1.6, - "learning_rate": 3.980195872050147e-05, - "loss": 0.2764, + "epoch": 1.636212923919703, + "grad_norm": 0.1746283918619156, + "learning_rate": 3.9290233780435174e-05, + "loss": 0.4132, "step": 45400 }, { - "epoch": 1.6, - "learning_rate": 3.979966289572127e-05, - "loss": 0.269, + "epoch": 1.6363931235809277, + "grad_norm": 0.18788184225559235, + "learning_rate": 3.928783927885466e-05, + "loss": 0.4366, "step": 45405 }, { - "epoch": 1.6, - "learning_rate": 3.97973668787772e-05, - "loss": 0.269, + "epoch": 1.6365733232421524, + "grad_norm": 0.16621163487434387, + "learning_rate": 3.9285444582605086e-05, + "loss": 0.4393, "step": 45410 }, { - "epoch": 1.6, - "learning_rate": 3.9795070669699064e-05, - "loss": 0.2676, + "epoch": 1.636753522903377, + "grad_norm": 0.16439075767993927, + "learning_rate": 3.9283049691719106e-05, + "loss": 0.3832, "step": 45415 }, { - "epoch": 1.6, - "learning_rate": 3.9792774268516687e-05, - "loss": 0.2857, + "epoch": 1.6369337225646015, + "grad_norm": 0.15328091382980347, + "learning_rate": 3.928065460622933e-05, + "loss": 0.4351, "step": 45420 }, { - "epoch": 1.6, - "learning_rate": 3.979047767525989e-05, - "loss": 0.3097, + "epoch": 1.6371139222258262, + "grad_norm": 0.20914670825004578, + "learning_rate": 3.927825932616841e-05, + "loss": 0.3928, "step": 45425 }, { - "epoch": 1.6, - "learning_rate": 3.978818088995848e-05, - "loss": 0.2589, + "epoch": 1.637294121887051, + "grad_norm": 0.21218664944171906, + "learning_rate": 3.9275863851568964e-05, + "loss": 0.4199, "step": 45430 }, { - "epoch": 1.6, - "learning_rate": 3.97858839126423e-05, - "loss": 0.2776, + "epoch": 1.6374743215482754, + "grad_norm": 0.19584229588508606, + "learning_rate": 3.9273468182463645e-05, + "loss": 0.4132, "step": 45435 }, { - "epoch": 1.6, - "learning_rate": 3.978358674334115e-05, - "loss": 0.2829, + "epoch": 1.6376545212095002, + "grad_norm": 
0.20541848242282867, + "learning_rate": 3.9271072318885076e-05, + "loss": 0.4287, "step": 45440 }, { - "epoch": 1.6, - "learning_rate": 3.9781289382084875e-05, - "loss": 0.3155, + "epoch": 1.6378347208707247, + "grad_norm": 0.20829689502716064, + "learning_rate": 3.9268676260865914e-05, + "loss": 0.4313, "step": 45445 }, { - "epoch": 1.6, - "learning_rate": 3.9778991828903286e-05, - "loss": 0.2622, + "epoch": 1.6380149205319494, + "grad_norm": 0.1655767560005188, + "learning_rate": 3.92662800084388e-05, + "loss": 0.4409, "step": 45450 }, { - "epoch": 1.6, - "learning_rate": 3.977669408382625e-05, - "loss": 0.2948, + "epoch": 1.6381951201931741, + "grad_norm": 0.17300541698932648, + "learning_rate": 3.9263883561636385e-05, + "loss": 0.4154, "step": 45455 }, { - "epoch": 1.6, - "learning_rate": 3.977439614688356e-05, - "loss": 0.2935, + "epoch": 1.6383753198543987, + "grad_norm": 0.220417782664299, + "learning_rate": 3.926148692049132e-05, + "loss": 0.418, "step": 45460 }, { - "epoch": 1.6, - "learning_rate": 3.9772098018105085e-05, - "loss": 0.259, + "epoch": 1.6385555195156232, + "grad_norm": 0.18571282923221588, + "learning_rate": 3.925909008503625e-05, + "loss": 0.4301, "step": 45465 }, { - "epoch": 1.6, - "learning_rate": 3.9769799697520645e-05, - "loss": 0.2842, + "epoch": 1.638735719176848, + "grad_norm": 0.18570081889629364, + "learning_rate": 3.925669305530384e-05, + "loss": 0.4454, "step": 45470 }, { - "epoch": 1.6, - "learning_rate": 3.976750118516009e-05, - "loss": 0.3044, + "epoch": 1.6389159188380726, + "grad_norm": 0.14735782146453857, + "learning_rate": 3.925429583132675e-05, + "loss": 0.4146, "step": 45475 }, { - "epoch": 1.6, - "learning_rate": 3.976520248105326e-05, - "loss": 0.2931, + "epoch": 1.6390961184992974, + "grad_norm": 0.17413799464702606, + "learning_rate": 3.925189841313764e-05, + "loss": 0.3904, "step": 45480 }, { - "epoch": 1.6, - "learning_rate": 3.976290358523002e-05, - "loss": 0.272, + "epoch": 1.6392763181605219, + "grad_norm": 0.27123942971229553, + "learning_rate": 3.924950080076917e-05, + "loss": 0.4315, "step": 45485 }, { - "epoch": 1.6, - "learning_rate": 3.9760604497720196e-05, - "loss": 0.2871, + "epoch": 1.6394565178217464, + "grad_norm": 0.1668565422296524, + "learning_rate": 3.924710299425401e-05, + "loss": 0.4216, "step": 45490 }, { - "epoch": 1.6, - "learning_rate": 3.975830521855366e-05, - "loss": 0.2799, + "epoch": 1.6396367174829711, + "grad_norm": 0.17070479691028595, + "learning_rate": 3.924470499362483e-05, + "loss": 0.423, "step": 45495 }, { - "epoch": 1.6, - "learning_rate": 3.975600574776024e-05, - "loss": 0.2895, + "epoch": 1.6398169171441959, + "grad_norm": 0.1955229640007019, + "learning_rate": 3.9242306798914305e-05, + "loss": 0.4145, "step": 45500 }, { - "epoch": 1.6, - "eval_loss": 0.2773810625076294, - "eval_runtime": 10.5526, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 1.6398169171441959, + "eval_loss": 0.4432607591152191, + "eval_runtime": 3.5416, + "eval_samples_per_second": 28.236, + "eval_steps_per_second": 7.059, "step": 45500 }, { - "epoch": 1.6, - "learning_rate": 3.975370608536982e-05, - "loss": 0.2877, + "epoch": 1.6399971168054204, + "grad_norm": 0.1931176483631134, + "learning_rate": 3.9239908410155104e-05, + "loss": 0.412, "step": 45505 }, { - "epoch": 1.6, - "learning_rate": 3.975140623141226e-05, - "loss": 0.2763, + "epoch": 1.6401773164666449, + "grad_norm": 0.216093048453331, + "learning_rate": 3.923750982737991e-05, + "loss": 0.4194, "step": 45510 }, { - "epoch": 1.6, - 
"learning_rate": 3.9749106185917394e-05, - "loss": 0.2933, + "epoch": 1.6403575161278696, + "grad_norm": 0.17901061475276947, + "learning_rate": 3.92351110506214e-05, + "loss": 0.4208, "step": 45515 }, { - "epoch": 1.6, - "learning_rate": 3.974680594891511e-05, - "loss": 0.2566, + "epoch": 1.6405377157890944, + "grad_norm": 0.16819880902767181, + "learning_rate": 3.923271207991226e-05, + "loss": 0.3989, "step": 45520 }, { - "epoch": 1.6, - "learning_rate": 3.974450552043527e-05, - "loss": 0.2687, + "epoch": 1.640717915450319, + "grad_norm": 0.20534752309322357, + "learning_rate": 3.923031291528517e-05, + "loss": 0.4011, "step": 45525 }, { - "epoch": 1.6, - "learning_rate": 3.974220490050775e-05, - "loss": 0.2882, + "epoch": 1.6408981151115436, + "grad_norm": 0.16648423671722412, + "learning_rate": 3.922791355677282e-05, + "loss": 0.4435, "step": 45530 }, { - "epoch": 1.6, - "learning_rate": 3.97399040891624e-05, - "loss": 0.2873, + "epoch": 1.641078314772768, + "grad_norm": 0.20146596431732178, + "learning_rate": 3.9225514004407916e-05, + "loss": 0.4431, "step": 45535 }, { - "epoch": 1.6, - "learning_rate": 3.973760308642912e-05, - "loss": 0.2725, + "epoch": 1.6412585144339928, + "grad_norm": 0.18600507080554962, + "learning_rate": 3.922311425822313e-05, + "loss": 0.4459, "step": 45540 }, { - "epoch": 1.6, - "learning_rate": 3.973530189233777e-05, - "loss": 0.297, + "epoch": 1.6414387140952176, + "grad_norm": 0.19291111826896667, + "learning_rate": 3.922071431825116e-05, + "loss": 0.3978, "step": 45545 }, { - "epoch": 1.6, - "learning_rate": 3.9733000506918245e-05, - "loss": 0.2573, + "epoch": 1.641618913756442, + "grad_norm": 0.18025104701519012, + "learning_rate": 3.921831418452472e-05, + "loss": 0.4321, "step": 45550 }, { - "epoch": 1.6, - "learning_rate": 3.973069893020041e-05, - "loss": 0.284, + "epoch": 1.6417991134176668, + "grad_norm": 0.18391737341880798, + "learning_rate": 3.921591385707649e-05, + "loss": 0.4303, "step": 45555 }, { - "epoch": 1.6, - "learning_rate": 3.972839716221416e-05, - "loss": 0.2903, + "epoch": 1.6419793130788913, + "grad_norm": 0.23778654634952545, + "learning_rate": 3.921351333593919e-05, + "loss": 0.4226, "step": 45560 }, { - "epoch": 1.6, - "learning_rate": 3.972609520298938e-05, - "loss": 0.2788, + "epoch": 1.642159512740116, + "grad_norm": 0.1602877527475357, + "learning_rate": 3.921111262114553e-05, + "loss": 0.3877, "step": 45565 }, { - "epoch": 1.6, - "learning_rate": 3.9723793052555957e-05, - "loss": 0.2796, + "epoch": 1.6423397124013408, + "grad_norm": 0.17459672689437866, + "learning_rate": 3.920871171272821e-05, + "loss": 0.4258, "step": 45570 }, { - "epoch": 1.6, - "learning_rate": 3.9721490710943784e-05, - "loss": 0.2792, + "epoch": 1.6425199120625653, + "grad_norm": 0.16200929880142212, + "learning_rate": 3.920631061071994e-05, + "loss": 0.379, "step": 45575 }, { - "epoch": 1.6, - "learning_rate": 3.971918817818276e-05, - "loss": 0.3132, + "epoch": 1.6427001117237898, + "grad_norm": 0.2194150686264038, + "learning_rate": 3.9203909315153445e-05, + "loss": 0.4539, "step": 45580 }, { - "epoch": 1.6, - "learning_rate": 3.971688545430277e-05, - "loss": 0.2981, + "epoch": 1.6428803113850146, + "grad_norm": 0.15917415916919708, + "learning_rate": 3.920150782606142e-05, + "loss": 0.4171, "step": 45585 }, { - "epoch": 1.6, - "learning_rate": 3.971458253933373e-05, - "loss": 0.2772, + "epoch": 1.6430605110462393, + "grad_norm": 0.16585630178451538, + "learning_rate": 3.919910614347662e-05, + "loss": 0.4325, "step": 45590 }, { - "epoch": 1.6, - 
"learning_rate": 3.971227943330553e-05, - "loss": 0.3021, + "epoch": 1.643240710707464, + "grad_norm": 0.1744535267353058, + "learning_rate": 3.9196704267431735e-05, + "loss": 0.4188, "step": 45595 }, { - "epoch": 1.6, - "learning_rate": 3.9709976136248074e-05, - "loss": 0.2925, + "epoch": 1.6434209103686885, + "grad_norm": 0.17278815805912018, + "learning_rate": 3.91943021979595e-05, + "loss": 0.4041, "step": 45600 }, { - "epoch": 1.6, - "learning_rate": 3.9707672648191274e-05, - "loss": 0.2876, + "epoch": 1.643601110029913, + "grad_norm": 0.19840632379055023, + "learning_rate": 3.919189993509265e-05, + "loss": 0.4229, "step": 45605 }, { - "epoch": 1.6, - "learning_rate": 3.9705368969165036e-05, - "loss": 0.293, + "epoch": 1.6437813096911378, + "grad_norm": 0.178060382604599, + "learning_rate": 3.918949747886391e-05, + "loss": 0.4469, "step": 45610 }, { - "epoch": 1.6, - "learning_rate": 3.9703065099199275e-05, - "loss": 0.2589, + "epoch": 1.6439615093523625, + "grad_norm": 0.1763332337141037, + "learning_rate": 3.918709482930602e-05, + "loss": 0.3798, "step": 45615 }, { - "epoch": 1.61, - "learning_rate": 3.970076103832391e-05, - "loss": 0.296, + "epoch": 1.644141709013587, + "grad_norm": 0.17714865505695343, + "learning_rate": 3.918469198645171e-05, + "loss": 0.4069, "step": 45620 }, { - "epoch": 1.61, - "learning_rate": 3.969845678656884e-05, - "loss": 0.2924, + "epoch": 1.6443219086748115, + "grad_norm": 0.18300695717334747, + "learning_rate": 3.918228895033371e-05, + "loss": 0.3987, "step": 45625 }, { - "epoch": 1.61, - "learning_rate": 3.9696152343963996e-05, - "loss": 0.3112, + "epoch": 1.6445021083360363, + "grad_norm": 0.1943272352218628, + "learning_rate": 3.9179885720984775e-05, + "loss": 0.4106, "step": 45630 }, { - "epoch": 1.61, - "learning_rate": 3.9693847710539305e-05, - "loss": 0.3033, + "epoch": 1.644682307997261, + "grad_norm": 0.20393919944763184, + "learning_rate": 3.917748229843764e-05, + "loss": 0.4568, "step": 45635 }, { - "epoch": 1.61, - "learning_rate": 3.9691542886324684e-05, - "loss": 0.2816, + "epoch": 1.6448625076584857, + "grad_norm": 0.18924960494041443, + "learning_rate": 3.917507868272506e-05, + "loss": 0.4561, "step": 45640 }, { - "epoch": 1.61, - "learning_rate": 3.968923787135006e-05, - "loss": 0.2761, + "epoch": 1.6450427073197103, + "grad_norm": 0.1751289963722229, + "learning_rate": 3.9172674873879766e-05, + "loss": 0.4143, "step": 45645 }, { - "epoch": 1.61, - "learning_rate": 3.9686932665645357e-05, - "loss": 0.2845, + "epoch": 1.6452229069809348, + "grad_norm": 0.16475655138492584, + "learning_rate": 3.917027087193453e-05, + "loss": 0.3721, "step": 45650 }, { - "epoch": 1.61, - "learning_rate": 3.968462726924052e-05, - "loss": 0.2957, + "epoch": 1.6454031066421595, + "grad_norm": 0.2124844789505005, + "learning_rate": 3.9167866676922096e-05, + "loss": 0.4227, "step": 45655 }, { - "epoch": 1.61, - "learning_rate": 3.9682321682165474e-05, - "loss": 0.2873, + "epoch": 1.6455833063033842, + "grad_norm": 0.17801690101623535, + "learning_rate": 3.916546228887521e-05, + "loss": 0.4318, "step": 45660 }, { - "epoch": 1.61, - "learning_rate": 3.968001590445016e-05, - "loss": 0.2757, + "epoch": 1.6457635059646087, + "grad_norm": 0.17565682530403137, + "learning_rate": 3.9163057707826654e-05, + "loss": 0.3954, "step": 45665 }, { - "epoch": 1.61, - "learning_rate": 3.967770993612451e-05, - "loss": 0.2825, + "epoch": 1.6459437056258335, + "grad_norm": 0.17757514119148254, + "learning_rate": 3.916065293380917e-05, + "loss": 0.4042, "step": 45670 }, { - 
"epoch": 1.61, - "learning_rate": 3.967540377721847e-05, - "loss": 0.2928, + "epoch": 1.646123905287058, + "grad_norm": 0.20188312232494354, + "learning_rate": 3.9158247966855545e-05, + "loss": 0.4265, "step": 45675 }, { - "epoch": 1.61, - "learning_rate": 3.9673097427761975e-05, - "loss": 0.2877, + "epoch": 1.6463041049482827, + "grad_norm": 0.19370879232883453, + "learning_rate": 3.915584280699853e-05, + "loss": 0.4094, "step": 45680 }, { - "epoch": 1.61, - "learning_rate": 3.967079088778499e-05, - "loss": 0.2864, + "epoch": 1.6464843046095075, + "grad_norm": 0.15388374030590057, + "learning_rate": 3.915343745427089e-05, + "loss": 0.3697, "step": 45685 }, { - "epoch": 1.61, - "learning_rate": 3.9668484157317455e-05, - "loss": 0.2722, + "epoch": 1.646664504270732, + "grad_norm": 0.19175679981708527, + "learning_rate": 3.9151031908705406e-05, + "loss": 0.4483, "step": 45690 }, { - "epoch": 1.61, - "learning_rate": 3.966617723638931e-05, - "loss": 0.281, + "epoch": 1.6468447039319565, + "grad_norm": 0.19182166457176208, + "learning_rate": 3.9148626170334854e-05, + "loss": 0.3927, "step": 45695 }, { - "epoch": 1.61, - "learning_rate": 3.966387012503053e-05, - "loss": 0.2646, + "epoch": 1.6470249035931812, + "grad_norm": 0.1689678132534027, + "learning_rate": 3.914622023919201e-05, + "loss": 0.4264, "step": 45700 }, { - "epoch": 1.61, - "learning_rate": 3.966156282327106e-05, - "loss": 0.2854, + "epoch": 1.647205103254406, + "grad_norm": 0.17200689017772675, + "learning_rate": 3.914381411530965e-05, + "loss": 0.4504, "step": 45705 }, { - "epoch": 1.61, - "learning_rate": 3.965925533114085e-05, - "loss": 0.2768, + "epoch": 1.6473853029156307, + "grad_norm": 0.17576947808265686, + "learning_rate": 3.914140779872056e-05, + "loss": 0.4333, "step": 45710 }, { - "epoch": 1.61, - "learning_rate": 3.965694764866988e-05, - "loss": 0.2905, + "epoch": 1.6475655025768552, + "grad_norm": 0.18409965932369232, + "learning_rate": 3.913900128945753e-05, + "loss": 0.3983, "step": 45715 }, { - "epoch": 1.61, - "learning_rate": 3.9654639775888093e-05, - "loss": 0.2845, + "epoch": 1.6477457022380797, + "grad_norm": 0.1899709552526474, + "learning_rate": 3.913659458755335e-05, + "loss": 0.3916, "step": 45720 }, { - "epoch": 1.61, - "learning_rate": 3.9652331712825474e-05, - "loss": 0.2775, + "epoch": 1.6479259018993044, + "grad_norm": 0.1969635933637619, + "learning_rate": 3.9134187693040806e-05, + "loss": 0.4051, "step": 45725 }, { - "epoch": 1.61, - "learning_rate": 3.9650023459511983e-05, - "loss": 0.2828, + "epoch": 1.6481061015605292, + "grad_norm": 0.15576127171516418, + "learning_rate": 3.913178060595268e-05, + "loss": 0.4128, "step": 45730 }, { - "epoch": 1.61, - "learning_rate": 3.9647715015977594e-05, - "loss": 0.3034, + "epoch": 1.6482863012217537, + "grad_norm": 0.17115731537342072, + "learning_rate": 3.9129373326321785e-05, + "loss": 0.4159, "step": 45735 }, { - "epoch": 1.61, - "learning_rate": 3.964540638225227e-05, - "loss": 0.2993, + "epoch": 1.6484665008829782, + "grad_norm": 0.1695828139781952, + "learning_rate": 3.912696585418092e-05, + "loss": 0.4314, "step": 45740 }, { - "epoch": 1.61, - "learning_rate": 3.964309755836601e-05, - "loss": 0.3023, + "epoch": 1.648646700544203, + "grad_norm": 0.18110521137714386, + "learning_rate": 3.912455818956288e-05, + "loss": 0.4301, "step": 45745 }, { - "epoch": 1.61, - "learning_rate": 3.964078854434876e-05, - "loss": 0.2665, + "epoch": 1.6488269002054277, + "grad_norm": 0.15694189071655273, + "learning_rate": 3.912215033250046e-05, + "loss": 0.3813, 
"step": 45750 }, { - "epoch": 1.61, - "learning_rate": 3.9638479340230535e-05, - "loss": 0.2827, + "epoch": 1.6490070998666524, + "grad_norm": 0.1641474962234497, + "learning_rate": 3.911974228302647e-05, + "loss": 0.3951, "step": 45755 }, { - "epoch": 1.61, - "learning_rate": 3.96361699460413e-05, - "loss": 0.2882, + "epoch": 1.649187299527877, + "grad_norm": 0.20309121906757355, + "learning_rate": 3.911733404117375e-05, + "loss": 0.4113, "step": 45760 }, { - "epoch": 1.61, - "learning_rate": 3.9633860361811036e-05, - "loss": 0.2943, + "epoch": 1.6493674991891014, + "grad_norm": 0.219468891620636, + "learning_rate": 3.9114925606975064e-05, + "loss": 0.3907, "step": 45765 }, { - "epoch": 1.61, - "learning_rate": 3.963155058756975e-05, - "loss": 0.2901, + "epoch": 1.6495476988503261, + "grad_norm": 0.17919921875, + "learning_rate": 3.9112516980463255e-05, + "loss": 0.418, "step": 45770 }, { - "epoch": 1.61, - "learning_rate": 3.962924062334742e-05, - "loss": 0.283, + "epoch": 1.6497278985115509, + "grad_norm": 0.15530207753181458, + "learning_rate": 3.911010816167113e-05, + "loss": 0.3502, "step": 45775 }, { - "epoch": 1.61, - "learning_rate": 3.962693046917404e-05, - "loss": 0.2823, + "epoch": 1.6499080981727754, + "grad_norm": 0.15395016968250275, + "learning_rate": 3.910769915063153e-05, + "loss": 0.4206, "step": 45780 }, { - "epoch": 1.61, - "learning_rate": 3.96246201250796e-05, - "loss": 0.2808, + "epoch": 1.650088297834, + "grad_norm": 0.17081965506076813, + "learning_rate": 3.910528994737725e-05, + "loss": 0.4347, "step": 45785 }, { - "epoch": 1.61, - "learning_rate": 3.962230959109411e-05, - "loss": 0.2731, + "epoch": 1.6502684974952246, + "grad_norm": 0.2129909098148346, + "learning_rate": 3.910288055194112e-05, + "loss": 0.4404, "step": 45790 }, { - "epoch": 1.61, - "learning_rate": 3.961999886724756e-05, - "loss": 0.2726, + "epoch": 1.6504486971564494, + "grad_norm": 0.203140988945961, + "learning_rate": 3.910047096435598e-05, + "loss": 0.4398, "step": 45795 }, { - "epoch": 1.61, - "learning_rate": 3.961768795356998e-05, - "loss": 0.306, + "epoch": 1.650628896817674, + "grad_norm": 0.19998563826084137, + "learning_rate": 3.909806118465466e-05, + "loss": 0.4061, "step": 45800 }, { - "epoch": 1.61, - "learning_rate": 3.961537685009134e-05, - "loss": 0.2753, + "epoch": 1.6508090964788986, + "grad_norm": 0.1874360889196396, + "learning_rate": 3.909565121286997e-05, + "loss": 0.4194, "step": 45805 }, { - "epoch": 1.61, - "learning_rate": 3.961306555684167e-05, - "loss": 0.2986, + "epoch": 1.6509892961401231, + "grad_norm": 0.19713318347930908, + "learning_rate": 3.909324104903477e-05, + "loss": 0.4072, "step": 45810 }, { - "epoch": 1.61, - "learning_rate": 3.961075407385097e-05, - "loss": 0.2763, + "epoch": 1.6511694958013479, + "grad_norm": 0.17209462821483612, + "learning_rate": 3.9090830693181885e-05, + "loss": 0.417, "step": 45815 }, { - "epoch": 1.61, - "learning_rate": 3.960844240114926e-05, - "loss": 0.2872, + "epoch": 1.6513496954625726, + "grad_norm": 0.17033915221691132, + "learning_rate": 3.9088420145344164e-05, + "loss": 0.3954, "step": 45820 }, { - "epoch": 1.61, - "learning_rate": 3.960613053876655e-05, - "loss": 0.2956, + "epoch": 1.6515298951237973, + "grad_norm": 0.19625675678253174, + "learning_rate": 3.9086009405554445e-05, + "loss": 0.3812, "step": 45825 }, { - "epoch": 1.61, - "learning_rate": 3.960381848673286e-05, - "loss": 0.306, + "epoch": 1.6517100947850218, + "grad_norm": 0.16199907660484314, + "learning_rate": 3.908359847384557e-05, + "loss": 0.429, 
"step": 45830 }, { - "epoch": 1.61, - "learning_rate": 3.9601506245078216e-05, - "loss": 0.2837, + "epoch": 1.6518902944462464, + "grad_norm": 0.2132118195295334, + "learning_rate": 3.9081187350250406e-05, + "loss": 0.4157, "step": 45835 }, { - "epoch": 1.61, - "learning_rate": 3.959919381383262e-05, - "loss": 0.2831, + "epoch": 1.652070494107471, + "grad_norm": 0.17869247496128082, + "learning_rate": 3.9078776034801775e-05, + "loss": 0.399, "step": 45840 }, { - "epoch": 1.61, - "learning_rate": 3.959688119302614e-05, - "loss": 0.2889, + "epoch": 1.6522506937686958, + "grad_norm": 0.15512903034687042, + "learning_rate": 3.907636452753256e-05, + "loss": 0.4001, "step": 45845 }, { - "epoch": 1.61, - "learning_rate": 3.9594568382688755e-05, - "loss": 0.2778, + "epoch": 1.6524308934299203, + "grad_norm": 0.15080289542675018, + "learning_rate": 3.907395282847559e-05, + "loss": 0.4167, "step": 45850 }, { - "epoch": 1.61, - "learning_rate": 3.959225538285053e-05, - "loss": 0.2599, + "epoch": 1.6526110930911448, + "grad_norm": 0.19971080124378204, + "learning_rate": 3.907154093766375e-05, + "loss": 0.4297, "step": 45855 }, { - "epoch": 1.61, - "learning_rate": 3.9589942193541486e-05, - "loss": 0.2797, + "epoch": 1.6527912927523696, + "grad_norm": 0.19148339331150055, + "learning_rate": 3.9069128855129876e-05, + "loss": 0.411, "step": 45860 }, { - "epoch": 1.61, - "learning_rate": 3.958762881479165e-05, - "loss": 0.293, + "epoch": 1.6529714924135943, + "grad_norm": 0.15302042663097382, + "learning_rate": 3.906671658090686e-05, + "loss": 0.4408, "step": 45865 }, { - "epoch": 1.61, - "learning_rate": 3.9585315246631074e-05, - "loss": 0.283, + "epoch": 1.653151692074819, + "grad_norm": 0.1940101683139801, + "learning_rate": 3.9064304115027544e-05, + "loss": 0.3945, "step": 45870 }, { - "epoch": 1.61, - "learning_rate": 3.9583001489089786e-05, - "loss": 0.2734, + "epoch": 1.6533318917360436, + "grad_norm": 0.19384680688381195, + "learning_rate": 3.9061891457524814e-05, + "loss": 0.4195, "step": 45875 }, { - "epoch": 1.61, - "learning_rate": 3.9580687542197834e-05, - "loss": 0.3061, + "epoch": 1.653512091397268, + "grad_norm": 0.2088027447462082, + "learning_rate": 3.9059478608431526e-05, + "loss": 0.3886, "step": 45880 }, { - "epoch": 1.61, - "learning_rate": 3.957837340598527e-05, - "loss": 0.2923, + "epoch": 1.6536922910584928, + "grad_norm": 0.16105620563030243, + "learning_rate": 3.905706556778057e-05, + "loss": 0.4008, "step": 45885 }, { - "epoch": 1.61, - "learning_rate": 3.957605908048213e-05, - "loss": 0.3052, + "epoch": 1.6538724907197175, + "grad_norm": 0.1838972270488739, + "learning_rate": 3.9054652335604814e-05, + "loss": 0.428, "step": 45890 }, { - "epoch": 1.61, - "learning_rate": 3.9573744565718476e-05, - "loss": 0.2921, + "epoch": 1.654052690380942, + "grad_norm": 0.18796832859516144, + "learning_rate": 3.905223891193715e-05, + "loss": 0.4167, "step": 45895 }, { - "epoch": 1.61, - "learning_rate": 3.957142986172434e-05, - "loss": 0.2828, + "epoch": 1.6542328900421666, + "grad_norm": 0.18928878009319305, + "learning_rate": 3.9049825296810436e-05, + "loss": 0.3747, "step": 45900 }, { - "epoch": 1.62, - "learning_rate": 3.95691149685298e-05, - "loss": 0.29, + "epoch": 1.6544130897033913, + "grad_norm": 0.1923985332250595, + "learning_rate": 3.904741149025759e-05, + "loss": 0.3859, "step": 45905 }, { - "epoch": 1.62, - "learning_rate": 3.956679988616489e-05, - "loss": 0.313, + "epoch": 1.654593289364616, + "grad_norm": 0.2407243549823761, + "learning_rate": 3.904499749231147e-05, + "loss": 
0.3979, "step": 45910 }, { - "epoch": 1.62, - "learning_rate": 3.9564484614659704e-05, - "loss": 0.2805, + "epoch": 1.6547734890258408, + "grad_norm": 0.1961093693971634, + "learning_rate": 3.904258330300498e-05, + "loss": 0.4073, "step": 45915 }, { - "epoch": 1.62, - "learning_rate": 3.956216915404427e-05, - "loss": 0.2935, + "epoch": 1.6549536886870653, + "grad_norm": 0.23786601424217224, + "learning_rate": 3.904016892237101e-05, + "loss": 0.4582, "step": 45920 }, { - "epoch": 1.62, - "learning_rate": 3.955985350434868e-05, - "loss": 0.3025, + "epoch": 1.6551338883482898, + "grad_norm": 0.1749258190393448, + "learning_rate": 3.903775435044246e-05, + "loss": 0.4478, "step": 45925 }, { - "epoch": 1.62, - "learning_rate": 3.955753766560297e-05, - "loss": 0.2691, + "epoch": 1.6553140880095145, + "grad_norm": 0.16315124928951263, + "learning_rate": 3.903533958725223e-05, + "loss": 0.3567, "step": 45930 }, { - "epoch": 1.62, - "learning_rate": 3.955522163783724e-05, - "loss": 0.2919, + "epoch": 1.6554942876707393, + "grad_norm": 0.14763472974300385, + "learning_rate": 3.9032924632833205e-05, + "loss": 0.3758, "step": 45935 }, { - "epoch": 1.62, - "learning_rate": 3.955290542108154e-05, - "loss": 0.289, + "epoch": 1.6556744873319638, + "grad_norm": 0.22486995160579681, + "learning_rate": 3.90305094872183e-05, + "loss": 0.4554, "step": 45940 }, { - "epoch": 1.62, - "learning_rate": 3.955058901536596e-05, - "loss": 0.2994, + "epoch": 1.6558546869931885, + "grad_norm": 0.19883042573928833, + "learning_rate": 3.902809415044043e-05, + "loss": 0.4338, "step": 45945 }, { - "epoch": 1.62, - "learning_rate": 3.9548272420720566e-05, - "loss": 0.2907, + "epoch": 1.656034886654413, + "grad_norm": 0.18573078513145447, + "learning_rate": 3.902567862253248e-05, + "loss": 0.4199, "step": 45950 }, { - "epoch": 1.62, - "learning_rate": 3.9545955637175446e-05, - "loss": 0.2685, + "epoch": 1.6562150863156377, + "grad_norm": 0.18856869637966156, + "learning_rate": 3.902326290352738e-05, + "loss": 0.3974, "step": 45955 }, { - "epoch": 1.62, - "learning_rate": 3.9543638664760675e-05, - "loss": 0.2935, + "epoch": 1.6563952859768625, + "grad_norm": 0.18239809572696686, + "learning_rate": 3.902084699345804e-05, + "loss": 0.433, "step": 45960 }, { - "epoch": 1.62, - "learning_rate": 3.9541321503506346e-05, - "loss": 0.279, + "epoch": 1.656575485638087, + "grad_norm": 0.23285090923309326, + "learning_rate": 3.9018430892357376e-05, + "loss": 0.4054, "step": 45965 }, { - "epoch": 1.62, - "learning_rate": 3.953900415344254e-05, - "loss": 0.3079, + "epoch": 1.6567556852993115, + "grad_norm": 0.16723239421844482, + "learning_rate": 3.90160146002583e-05, + "loss": 0.4016, "step": 45970 }, { - "epoch": 1.62, - "learning_rate": 3.9536686614599346e-05, - "loss": 0.2906, + "epoch": 1.6569358849605362, + "grad_norm": 0.14857235550880432, + "learning_rate": 3.901359811719374e-05, + "loss": 0.4149, "step": 45975 }, { - "epoch": 1.62, - "learning_rate": 3.953436888700684e-05, - "loss": 0.2907, + "epoch": 1.657116084621761, + "grad_norm": 0.1981445550918579, + "learning_rate": 3.901118144319662e-05, + "loss": 0.403, "step": 45980 }, { - "epoch": 1.62, - "learning_rate": 3.9532050970695153e-05, - "loss": 0.2623, + "epoch": 1.6572962842829857, + "grad_norm": 0.1867840737104416, + "learning_rate": 3.9008764578299866e-05, + "loss": 0.4106, "step": 45985 }, { - "epoch": 1.62, - "learning_rate": 3.952973286569436e-05, - "loss": 0.2741, + "epoch": 1.6574764839442102, + "grad_norm": 0.21596737205982208, + "learning_rate": 3.900634752253641e-05, 
+ "loss": 0.4186, "step": 45990 }, { - "epoch": 1.62, - "learning_rate": 3.9527414572034555e-05, - "loss": 0.2883, + "epoch": 1.6576566836054347, + "grad_norm": 0.20319004356861115, + "learning_rate": 3.900393027593917e-05, + "loss": 0.4474, "step": 45995 }, { - "epoch": 1.62, - "learning_rate": 3.952509608974585e-05, - "loss": 0.2835, + "epoch": 1.6578368832666595, + "grad_norm": 0.16582396626472473, + "learning_rate": 3.90015128385411e-05, + "loss": 0.4064, "step": 46000 }, { - "epoch": 1.62, - "eval_loss": 0.2772733271121979, - "eval_runtime": 10.5485, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 1.6578368832666595, + "eval_loss": 0.4423971474170685, + "eval_runtime": 3.531, + "eval_samples_per_second": 28.32, + "eval_steps_per_second": 7.08, "step": 46000 }, { - "epoch": 1.62, - "learning_rate": 3.952277741885834e-05, - "loss": 0.2889, + "epoch": 1.6580170829278842, + "grad_norm": 0.15383246541023254, + "learning_rate": 3.8999095210375124e-05, + "loss": 0.3594, "step": 46005 }, { - "epoch": 1.62, - "learning_rate": 3.9520458559402135e-05, - "loss": 0.298, + "epoch": 1.6581972825891087, + "grad_norm": 0.18127766251564026, + "learning_rate": 3.899667739147419e-05, + "loss": 0.431, "step": 46010 }, { - "epoch": 1.62, - "learning_rate": 3.951813951140735e-05, - "loss": 0.2769, + "epoch": 1.6583774822503332, + "grad_norm": 0.19410793483257294, + "learning_rate": 3.899425938187123e-05, + "loss": 0.3563, "step": 46015 }, { - "epoch": 1.62, - "learning_rate": 3.951582027490409e-05, - "loss": 0.2866, + "epoch": 1.658557681911558, + "grad_norm": 0.23459197580814362, + "learning_rate": 3.89918411815992e-05, + "loss": 0.4236, "step": 46020 }, { - "epoch": 1.62, - "learning_rate": 3.951350084992246e-05, - "loss": 0.3019, + "epoch": 1.6587378815727827, + "grad_norm": 0.1808062195777893, + "learning_rate": 3.898942279069104e-05, + "loss": 0.3991, "step": 46025 }, { - "epoch": 1.62, - "learning_rate": 3.95111812364926e-05, - "loss": 0.3141, + "epoch": 1.6589180812340074, + "grad_norm": 0.14679357409477234, + "learning_rate": 3.8987004209179715e-05, + "loss": 0.3997, "step": 46030 }, { - "epoch": 1.62, - "learning_rate": 3.950886143464461e-05, - "loss": 0.2843, + "epoch": 1.659098280895232, + "grad_norm": 0.18178507685661316, + "learning_rate": 3.898458543709815e-05, + "loss": 0.4321, "step": 46035 }, { - "epoch": 1.62, - "learning_rate": 3.950654144440862e-05, - "loss": 0.2648, + "epoch": 1.6592784805564564, + "grad_norm": 0.20688877999782562, + "learning_rate": 3.8982166474479323e-05, + "loss": 0.4243, "step": 46040 }, { - "epoch": 1.62, - "learning_rate": 3.9504221265814746e-05, - "loss": 0.2859, + "epoch": 1.6594586802176812, + "grad_norm": 0.21331140398979187, + "learning_rate": 3.8979747321356186e-05, + "loss": 0.4117, "step": 46045 }, { - "epoch": 1.62, - "learning_rate": 3.950190089889311e-05, - "loss": 0.3057, + "epoch": 1.659638879878906, + "grad_norm": 0.20621632039546967, + "learning_rate": 3.89773279777617e-05, + "loss": 0.4367, "step": 46050 }, { - "epoch": 1.62, - "learning_rate": 3.9499580343673856e-05, - "loss": 0.2912, + "epoch": 1.6598190795401304, + "grad_norm": 0.20351921021938324, + "learning_rate": 3.897490844372882e-05, + "loss": 0.4259, "step": 46055 }, { - "epoch": 1.62, - "learning_rate": 3.949725960018711e-05, - "loss": 0.2906, + "epoch": 1.6599992792013551, + "grad_norm": 0.17361485958099365, + "learning_rate": 3.897248871929052e-05, + "loss": 0.4149, "step": 46060 }, { - "epoch": 1.62, - "learning_rate": 3.9494938668463e-05, - "loss": 
0.2849, + "epoch": 1.6601794788625797, + "grad_norm": 0.1759531944990158, + "learning_rate": 3.897006880447977e-05, + "loss": 0.3962, "step": 46065 }, { - "epoch": 1.62, - "learning_rate": 3.949261754853166e-05, - "loss": 0.2912, + "epoch": 1.6603596785238044, + "grad_norm": 0.17056173086166382, + "learning_rate": 3.896764869932953e-05, + "loss": 0.3861, "step": 46070 }, { - "epoch": 1.62, - "learning_rate": 3.949029624042323e-05, - "loss": 0.3034, + "epoch": 1.6605398781850291, + "grad_norm": 0.16253900527954102, + "learning_rate": 3.8965228403872784e-05, + "loss": 0.4117, "step": 46075 }, { - "epoch": 1.62, - "learning_rate": 3.948797474416786e-05, - "loss": 0.3001, + "epoch": 1.6607200778462536, + "grad_norm": 0.17955611646175385, + "learning_rate": 3.8962807918142507e-05, + "loss": 0.4292, "step": 46080 }, { - "epoch": 1.62, - "learning_rate": 3.948565305979568e-05, - "loss": 0.2599, + "epoch": 1.6609002775074782, + "grad_norm": 0.15855169296264648, + "learning_rate": 3.896038724217167e-05, + "loss": 0.3961, "step": 46085 }, { - "epoch": 1.62, - "learning_rate": 3.948333118733685e-05, - "loss": 0.3083, + "epoch": 1.6610804771687029, + "grad_norm": 0.192918598651886, + "learning_rate": 3.8957966375993266e-05, + "loss": 0.3705, "step": 46090 }, { - "epoch": 1.62, - "learning_rate": 3.94810091268215e-05, - "loss": 0.2711, + "epoch": 1.6612606768299276, + "grad_norm": Infinity, + "learning_rate": 3.895602954612325e-05, + "loss": 0.399, "step": 46095 }, { - "epoch": 1.62, - "learning_rate": 3.947868687827979e-05, - "loss": 0.2913, + "epoch": 1.6614408764911524, + "grad_norm": 0.15826788544654846, + "learning_rate": 3.895360833765434e-05, + "loss": 0.422, "step": 46100 }, { - "epoch": 1.62, - "learning_rate": 3.9476364441741873e-05, - "loss": 0.2639, + "epoch": 1.6616210761523769, + "grad_norm": 0.1578192263841629, + "learning_rate": 3.895118693907021e-05, + "loss": 0.4362, "step": 46105 }, { - "epoch": 1.62, - "learning_rate": 3.947404181723791e-05, - "loss": 0.2943, + "epoch": 1.6618012758136014, + "grad_norm": 0.18749912083148956, + "learning_rate": 3.8948765350403856e-05, + "loss": 0.4229, "step": 46110 }, { - "epoch": 1.62, - "learning_rate": 3.947171900479805e-05, - "loss": 0.293, + "epoch": 1.661981475474826, + "grad_norm": 0.2128848135471344, + "learning_rate": 3.8946343571688273e-05, + "loss": 0.3845, "step": 46115 }, { - "epoch": 1.62, - "learning_rate": 3.9469396004452453e-05, - "loss": 0.2659, + "epoch": 1.6621616751360508, + "grad_norm": 0.19334283471107483, + "learning_rate": 3.894392160295647e-05, + "loss": 0.4214, "step": 46120 }, { - "epoch": 1.62, - "learning_rate": 3.946707281623129e-05, - "loss": 0.271, + "epoch": 1.6623418747972754, + "grad_norm": 0.17643995583057404, + "learning_rate": 3.894149944424142e-05, + "loss": 0.3705, "step": 46125 }, { - "epoch": 1.62, - "learning_rate": 3.946474944016472e-05, - "loss": 0.2975, + "epoch": 1.6625220744584999, + "grad_norm": 0.17659886181354523, + "learning_rate": 3.8939077095576154e-05, + "loss": 0.3882, "step": 46130 }, { - "epoch": 1.62, - "learning_rate": 3.9462425876282904e-05, - "loss": 0.2955, + "epoch": 1.6627022741197246, + "grad_norm": 0.1546994149684906, + "learning_rate": 3.893665455699366e-05, + "loss": 0.3932, "step": 46135 }, { - "epoch": 1.62, - "learning_rate": 3.9460102124616027e-05, - "loss": 0.2788, + "epoch": 1.6628824737809493, + "grad_norm": 0.18664954602718353, + "learning_rate": 3.893423182852694e-05, + "loss": 0.43, "step": 46140 }, { - "epoch": 1.62, - "learning_rate": 3.945777818519425e-05, - "loss": 
0.2802, + "epoch": 1.663062673442174, + "grad_norm": 0.2018563598394394, + "learning_rate": 3.893180891020901e-05, + "loss": 0.4061, "step": 46145 }, { - "epoch": 1.62, - "learning_rate": 3.9455454058047753e-05, - "loss": 0.2871, + "epoch": 1.6632428731033986, + "grad_norm": 0.17747381329536438, + "learning_rate": 3.8929385802072885e-05, + "loss": 0.3716, "step": 46150 }, { - "epoch": 1.62, - "learning_rate": 3.9453129743206714e-05, - "loss": 0.2792, + "epoch": 1.663423072764623, + "grad_norm": 0.1724482625722885, + "learning_rate": 3.892696250415156e-05, + "loss": 0.4176, "step": 46155 }, { - "epoch": 1.62, - "learning_rate": 3.9450805240701304e-05, - "loss": 0.301, + "epoch": 1.6636032724258478, + "grad_norm": 0.18025627732276917, + "learning_rate": 3.892453901647809e-05, + "loss": 0.4606, "step": 46160 }, { - "epoch": 1.62, - "learning_rate": 3.944848055056172e-05, - "loss": 0.2779, + "epoch": 1.6637834720870726, + "grad_norm": 0.18692156672477722, + "learning_rate": 3.8922115339085455e-05, + "loss": 0.3861, "step": 46165 }, { - "epoch": 1.62, - "learning_rate": 3.944615567281812e-05, - "loss": 0.2917, + "epoch": 1.663963671748297, + "grad_norm": 0.16012603044509888, + "learning_rate": 3.89196914720067e-05, + "loss": 0.3892, "step": 46170 }, { - "epoch": 1.62, - "learning_rate": 3.944383060750072e-05, - "loss": 0.2802, + "epoch": 1.6641438714095218, + "grad_norm": 0.20383629202842712, + "learning_rate": 3.891726741527484e-05, + "loss": 0.4128, "step": 46175 }, { - "epoch": 1.62, - "learning_rate": 3.944150535463971e-05, - "loss": 0.308, + "epoch": 1.6643240710707463, + "grad_norm": 0.2038664072751999, + "learning_rate": 3.891484316892291e-05, + "loss": 0.4766, "step": 46180 }, { - "epoch": 1.62, - "learning_rate": 3.9439179914265255e-05, - "loss": 0.2955, + "epoch": 1.664504270731971, + "grad_norm": 0.18376675248146057, + "learning_rate": 3.891241873298394e-05, + "loss": 0.3883, "step": 46185 }, { - "epoch": 1.63, - "learning_rate": 3.9436854286407565e-05, - "loss": 0.2685, + "epoch": 1.6646844703931958, + "grad_norm": 0.17446856200695038, + "learning_rate": 3.890999410749095e-05, + "loss": 0.4284, "step": 46190 }, { - "epoch": 1.63, - "learning_rate": 3.943452847109684e-05, - "loss": 0.277, + "epoch": 1.6648646700544203, + "grad_norm": 0.20632441341876984, + "learning_rate": 3.890756929247699e-05, + "loss": 0.4319, "step": 46195 }, { - "epoch": 1.63, - "learning_rate": 3.943220246836328e-05, - "loss": 0.3088, + "epoch": 1.6650448697156448, + "grad_norm": 0.1419346034526825, + "learning_rate": 3.890514428797508e-05, + "loss": 0.384, "step": 46200 }, { - "epoch": 1.63, - "learning_rate": 3.9429876278237083e-05, - "loss": 0.265, + "epoch": 1.6652250693768695, + "grad_norm": 0.17198680341243744, + "learning_rate": 3.890271909401828e-05, + "loss": 0.4073, "step": 46205 }, { - "epoch": 1.63, - "learning_rate": 3.9427549900748445e-05, - "loss": 0.2596, + "epoch": 1.6654052690380943, + "grad_norm": 0.17511259019374847, + "learning_rate": 3.890029371063962e-05, + "loss": 0.4082, "step": 46210 }, { - "epoch": 1.63, - "learning_rate": 3.942522333592759e-05, - "loss": 0.2739, + "epoch": 1.665585468699319, + "grad_norm": 0.16610261797904968, + "learning_rate": 3.8897868137872154e-05, + "loss": 0.4148, "step": 46215 }, { - "epoch": 1.63, - "learning_rate": 3.94228965838047e-05, - "loss": 0.2872, + "epoch": 1.6657656683605435, + "grad_norm": 0.15712276101112366, + "learning_rate": 3.889544237574893e-05, + "loss": 0.3822, "step": 46220 }, { - "epoch": 1.63, - "learning_rate": 3.942056964441001e-05, - 
"loss": 0.2797, + "epoch": 1.665945868021768, + "grad_norm": 0.15209558606147766, + "learning_rate": 3.889301642430299e-05, + "loss": 0.4053, "step": 46225 }, { - "epoch": 1.63, - "learning_rate": 3.941824251777373e-05, - "loss": 0.278, + "epoch": 1.6661260676829928, + "grad_norm": 0.1979793757200241, + "learning_rate": 3.889059028356738e-05, + "loss": 0.4097, "step": 46230 }, { - "epoch": 1.63, - "learning_rate": 3.9415915203926066e-05, - "loss": 0.291, + "epoch": 1.6663062673442175, + "grad_norm": 0.17566531896591187, + "learning_rate": 3.8888163953575174e-05, + "loss": 0.3878, "step": 46235 }, { - "epoch": 1.63, - "learning_rate": 3.941358770289725e-05, - "loss": 0.2806, + "epoch": 1.666486467005442, + "grad_norm": 0.23031200468540192, + "learning_rate": 3.8885737434359424e-05, + "loss": 0.4245, "step": 46240 }, { - "epoch": 1.63, - "learning_rate": 3.94112600147175e-05, - "loss": 0.2903, + "epoch": 1.6666666666666665, + "grad_norm": 0.16964511573314667, + "learning_rate": 3.8883310725953194e-05, + "loss": 0.3682, "step": 46245 }, { - "epoch": 1.63, - "learning_rate": 3.940893213941703e-05, - "loss": 0.2905, + "epoch": 1.6668468663278913, + "grad_norm": 0.20693713426589966, + "learning_rate": 3.888088382838954e-05, + "loss": 0.4083, "step": 46250 }, { - "epoch": 1.63, - "learning_rate": 3.9406604077026073e-05, - "loss": 0.2838, + "epoch": 1.667027065989116, + "grad_norm": 0.18024256825447083, + "learning_rate": 3.8878456741701524e-05, + "loss": 0.4068, "step": 46255 }, { - "epoch": 1.63, - "learning_rate": 3.9404275827574855e-05, - "loss": 0.2762, + "epoch": 1.6672072656503407, + "grad_norm": 0.1766664981842041, + "learning_rate": 3.887602946592223e-05, + "loss": 0.3993, "step": 46260 }, { - "epoch": 1.63, - "learning_rate": 3.9401947391093606e-05, - "loss": 0.2821, + "epoch": 1.6673874653115652, + "grad_norm": 0.18678483366966248, + "learning_rate": 3.8873602001084716e-05, + "loss": 0.4381, "step": 46265 }, { - "epoch": 1.63, - "learning_rate": 3.939961876761257e-05, - "loss": 0.3049, + "epoch": 1.6675676649727897, + "grad_norm": 0.13964805006980896, + "learning_rate": 3.887117434722206e-05, + "loss": 0.4004, "step": 46270 }, { - "epoch": 1.63, - "learning_rate": 3.939728995716196e-05, - "loss": 0.2822, + "epoch": 1.6677478646340145, + "grad_norm": 0.15428251028060913, + "learning_rate": 3.886874650436735e-05, + "loss": 0.3547, "step": 46275 }, { - "epoch": 1.63, - "learning_rate": 3.9394960959772045e-05, - "loss": 0.2667, + "epoch": 1.6679280642952392, + "grad_norm": 0.1429775059223175, + "learning_rate": 3.8866318472553644e-05, + "loss": 0.394, "step": 46280 }, { - "epoch": 1.63, - "learning_rate": 3.939263177547304e-05, - "loss": 0.2875, + "epoch": 1.6681082639564637, + "grad_norm": 0.16724984347820282, + "learning_rate": 3.886389025181404e-05, + "loss": 0.3977, "step": 46285 }, { - "epoch": 1.63, - "learning_rate": 3.9390302404295195e-05, - "loss": 0.2996, + "epoch": 1.6682884636176882, + "grad_norm": 0.1835627257823944, + "learning_rate": 3.886146184218161e-05, + "loss": 0.4373, "step": 46290 }, { - "epoch": 1.63, - "learning_rate": 3.938797284626876e-05, - "loss": 0.2871, + "epoch": 1.668468663278913, + "grad_norm": 0.17956386506557465, + "learning_rate": 3.8859033243689446e-05, + "loss": 0.4227, "step": 46295 }, { - "epoch": 1.63, - "learning_rate": 3.9385643101423985e-05, - "loss": 0.2833, + "epoch": 1.6686488629401377, + "grad_norm": 0.19547246396541595, + "learning_rate": 3.885660445637064e-05, + "loss": 0.383, "step": 46300 }, { - "epoch": 1.63, - "learning_rate": 
3.9383313169791104e-05, - "loss": 0.2626, + "epoch": 1.6688290626013624, + "grad_norm": 0.2311650663614273, + "learning_rate": 3.885417548025828e-05, + "loss": 0.3988, "step": 46305 }, { - "epoch": 1.63, - "learning_rate": 3.9380983051400387e-05, - "loss": 0.2652, + "epoch": 1.669009262262587, + "grad_norm": 0.15516655147075653, + "learning_rate": 3.885174631538546e-05, + "loss": 0.4264, "step": 46310 }, { - "epoch": 1.63, - "learning_rate": 3.937865274628208e-05, - "loss": 0.2765, + "epoch": 1.6691894619238115, + "grad_norm": 0.19897285103797913, + "learning_rate": 3.8849316961785276e-05, + "loss": 0.4051, "step": 46315 }, { - "epoch": 1.63, - "learning_rate": 3.937632225446645e-05, - "loss": 0.2799, + "epoch": 1.6693696615850362, + "grad_norm": 0.22610118985176086, + "learning_rate": 3.884688741949084e-05, + "loss": 0.4286, "step": 46320 }, { - "epoch": 1.63, - "learning_rate": 3.937399157598374e-05, - "loss": 0.2789, + "epoch": 1.669549861246261, + "grad_norm": 0.16517026722431183, + "learning_rate": 3.884445768853524e-05, + "loss": 0.4167, "step": 46325 }, { - "epoch": 1.63, - "learning_rate": 3.937166071086422e-05, - "loss": 0.2795, + "epoch": 1.6697300609074857, + "grad_norm": 0.21267640590667725, + "learning_rate": 3.884202776895158e-05, + "loss": 0.3828, "step": 46330 }, { - "epoch": 1.63, - "learning_rate": 3.9369329659138165e-05, - "loss": 0.2931, + "epoch": 1.6699102605687102, + "grad_norm": 0.20657142996788025, + "learning_rate": 3.883959766077297e-05, + "loss": 0.4069, "step": 46335 }, { - "epoch": 1.63, - "learning_rate": 3.936699842083583e-05, - "loss": 0.2742, + "epoch": 1.6700904602299347, + "grad_norm": 0.16115951538085938, + "learning_rate": 3.8837167364032526e-05, + "loss": 0.4248, "step": 46340 }, { - "epoch": 1.63, - "learning_rate": 3.936466699598749e-05, - "loss": 0.2872, + "epoch": 1.6702706598911594, + "grad_norm": 0.2205209583044052, + "learning_rate": 3.883473687876336e-05, + "loss": 0.4162, "step": 46345 }, { - "epoch": 1.63, - "learning_rate": 3.9362335384623414e-05, - "loss": 0.2813, + "epoch": 1.6704508595523841, + "grad_norm": 0.18244054913520813, + "learning_rate": 3.883230620499857e-05, + "loss": 0.4127, "step": 46350 }, { - "epoch": 1.63, - "learning_rate": 3.9360003586773886e-05, - "loss": 0.2804, + "epoch": 1.6706310592136087, + "grad_norm": 0.14126257598400116, + "learning_rate": 3.8829875342771287e-05, + "loss": 0.4314, "step": 46355 }, { - "epoch": 1.63, - "learning_rate": 3.935767160246917e-05, - "loss": 0.2565, + "epoch": 1.6708112588748332, + "grad_norm": 0.1783919334411621, + "learning_rate": 3.8827444292114634e-05, + "loss": 0.4063, "step": 46360 }, { - "epoch": 1.63, - "learning_rate": 3.935533943173954e-05, - "loss": 0.2673, + "epoch": 1.670991458536058, + "grad_norm": 0.1652853786945343, + "learning_rate": 3.882501305306174e-05, + "loss": 0.4104, "step": 46365 }, { - "epoch": 1.63, - "learning_rate": 3.93530070746153e-05, - "loss": 0.2911, + "epoch": 1.6711716581972826, + "grad_norm": 0.16857433319091797, + "learning_rate": 3.8822581625645706e-05, + "loss": 0.3828, "step": 46370 }, { - "epoch": 1.63, - "learning_rate": 3.935067453112673e-05, - "loss": 0.2806, + "epoch": 1.6713518578585074, + "grad_norm": 0.2388414889574051, + "learning_rate": 3.882015000989968e-05, + "loss": 0.4411, "step": 46375 }, { - "epoch": 1.63, - "learning_rate": 3.934834180130409e-05, - "loss": 0.2451, + "epoch": 1.6715320575197319, + "grad_norm": 0.16864396631717682, + "learning_rate": 3.881771820585678e-05, + "loss": 0.4467, "step": 46380 }, { - "epoch": 1.63, - 
"learning_rate": 3.93460088851777e-05, - "loss": 0.2758, + "epoch": 1.6717122571809564, + "grad_norm": 0.1579165756702423, + "learning_rate": 3.881528621355015e-05, + "loss": 0.4215, "step": 46385 }, { - "epoch": 1.63, - "learning_rate": 3.934367578277783e-05, - "loss": 0.2824, + "epoch": 1.6718924568421811, + "grad_norm": 0.17890191078186035, + "learning_rate": 3.8812854033012916e-05, + "loss": 0.4252, "step": 46390 }, { - "epoch": 1.63, - "learning_rate": 3.934134249413479e-05, - "loss": 0.2827, + "epoch": 1.6720726565034059, + "grad_norm": 0.1696099191904068, + "learning_rate": 3.881042166427823e-05, + "loss": 0.3783, "step": 46395 }, { - "epoch": 1.63, - "learning_rate": 3.933900901927887e-05, - "loss": 0.2792, + "epoch": 1.6722528561646304, + "grad_norm": 0.13848702609539032, + "learning_rate": 3.880798910737921e-05, + "loss": 0.3871, "step": 46400 }, { - "epoch": 1.63, - "learning_rate": 3.933667535824036e-05, - "loss": 0.2811, + "epoch": 1.6724330558258549, + "grad_norm": 0.17733387649059296, + "learning_rate": 3.880555636234902e-05, + "loss": 0.4035, "step": 46405 }, { - "epoch": 1.63, - "learning_rate": 3.9334341511049576e-05, - "loss": 0.283, + "epoch": 1.6726132554870796, + "grad_norm": 0.21084022521972656, + "learning_rate": 3.880312342922079e-05, + "loss": 0.4408, "step": 46410 }, { - "epoch": 1.63, - "learning_rate": 3.9332007477736816e-05, - "loss": 0.2761, + "epoch": 1.6727934551483044, + "grad_norm": 0.21797522902488708, + "learning_rate": 3.880069030802768e-05, + "loss": 0.4113, "step": 46415 }, { - "epoch": 1.63, - "learning_rate": 3.932967325833238e-05, - "loss": 0.2941, + "epoch": 1.672973654809529, + "grad_norm": 0.19593912363052368, + "learning_rate": 3.879825699880284e-05, + "loss": 0.4092, "step": 46420 }, { - "epoch": 1.63, - "learning_rate": 3.932733885286658e-05, - "loss": 0.2831, + "epoch": 1.6731538544707536, + "grad_norm": 0.2192002683877945, + "learning_rate": 3.879582350157942e-05, + "loss": 0.4616, "step": 46425 }, { - "epoch": 1.63, - "learning_rate": 3.932500426136973e-05, - "loss": 0.2813, + "epoch": 1.673334054131978, + "grad_norm": 0.1574559360742569, + "learning_rate": 3.879338981639057e-05, + "loss": 0.3955, "step": 46430 }, { - "epoch": 1.63, - "learning_rate": 3.932266948387213e-05, - "loss": 0.2816, + "epoch": 1.6735142537932028, + "grad_norm": 0.20337316393852234, + "learning_rate": 3.8790955943269455e-05, + "loss": 0.4303, "step": 46435 }, { - "epoch": 1.63, - "learning_rate": 3.932033452040411e-05, - "loss": 0.3044, + "epoch": 1.6736944534544276, + "grad_norm": 0.15265347063541412, + "learning_rate": 3.878852188224924e-05, + "loss": 0.4054, "step": 46440 }, { - "epoch": 1.63, - "learning_rate": 3.931799937099599e-05, - "loss": 0.2882, + "epoch": 1.673874653115652, + "grad_norm": 0.1718795895576477, + "learning_rate": 3.8786087633363075e-05, + "loss": 0.3766, "step": 46445 }, { - "epoch": 1.63, - "learning_rate": 3.931566403567808e-05, - "loss": 0.2827, + "epoch": 1.6740548527768768, + "grad_norm": 0.18566109240055084, + "learning_rate": 3.8783653196644144e-05, + "loss": 0.3856, "step": 46450 }, { - "epoch": 1.63, - "learning_rate": 3.931332851448071e-05, - "loss": 0.2983, + "epoch": 1.6742350524381013, + "grad_norm": 0.18366526067256927, + "learning_rate": 3.87812185721256e-05, + "loss": 0.4811, "step": 46455 }, { - "epoch": 1.63, - "learning_rate": 3.931099280743419e-05, - "loss": 0.2995, + "epoch": 1.674415252099326, + "grad_norm": 0.1583663374185562, + "learning_rate": 3.8778783759840625e-05, + "loss": 0.437, "step": 46460 }, { - "epoch": 
1.63, - "learning_rate": 3.9308656914568875e-05, - "loss": 0.2944, + "epoch": 1.6745954517605508, + "grad_norm": 0.16400283575057983, + "learning_rate": 3.877634875982239e-05, + "loss": 0.397, "step": 46465 }, { - "epoch": 1.63, - "learning_rate": 3.930632083591508e-05, - "loss": 0.2639, + "epoch": 1.6747756514217753, + "grad_norm": 0.18252485990524292, + "learning_rate": 3.877391357210407e-05, + "loss": 0.431, "step": 46470 }, { - "epoch": 1.64, - "learning_rate": 3.930398457150313e-05, - "loss": 0.2724, + "epoch": 1.6749558510829998, + "grad_norm": 0.22396881878376007, + "learning_rate": 3.877147819671884e-05, + "loss": 0.424, "step": 46475 }, { - "epoch": 1.64, - "learning_rate": 3.930164812136336e-05, - "loss": 0.2594, + "epoch": 1.6751360507442246, + "grad_norm": 0.16650210320949554, + "learning_rate": 3.876904263369989e-05, + "loss": 0.399, "step": 46480 }, { - "epoch": 1.64, - "learning_rate": 3.9299311485526124e-05, - "loss": 0.2702, + "epoch": 1.6753162504054493, + "grad_norm": 0.17203296720981598, + "learning_rate": 3.87666068830804e-05, + "loss": 0.4241, "step": 46485 }, { - "epoch": 1.64, - "learning_rate": 3.929697466402175e-05, - "loss": 0.2883, + "epoch": 1.675496450066674, + "grad_norm": 0.21028588712215424, + "learning_rate": 3.876417094489355e-05, + "loss": 0.3976, "step": 46490 }, { - "epoch": 1.64, - "learning_rate": 3.9294637656880586e-05, - "loss": 0.2708, + "epoch": 1.6756766497278985, + "grad_norm": 0.20283226668834686, + "learning_rate": 3.876173481917255e-05, + "loss": 0.4196, "step": 46495 }, { - "epoch": 1.64, - "learning_rate": 3.929230046413297e-05, - "loss": 0.2786, + "epoch": 1.675856849389123, + "grad_norm": 0.1598893254995346, + "learning_rate": 3.875929850595056e-05, + "loss": 0.3846, "step": 46500 }, { - "epoch": 1.64, - "eval_loss": 0.2775975167751312, - "eval_runtime": 10.5364, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 1.675856849389123, + "eval_loss": 0.44327858090400696, + "eval_runtime": 3.5351, + "eval_samples_per_second": 28.288, + "eval_steps_per_second": 7.072, "step": 46500 }, { - "epoch": 1.64, - "learning_rate": 3.9289963085809246e-05, - "loss": 0.2771, + "epoch": 1.6760370490503478, + "grad_norm": 0.18217889964580536, + "learning_rate": 3.8756862005260804e-05, + "loss": 0.4492, "step": 46505 }, { - "epoch": 1.64, - "learning_rate": 3.928762552193977e-05, - "loss": 0.2936, + "epoch": 1.6762172487115725, + "grad_norm": 0.19530221819877625, + "learning_rate": 3.8754425317136465e-05, + "loss": 0.4021, "step": 46510 }, { - "epoch": 1.64, - "learning_rate": 3.9285287772554905e-05, - "loss": 0.2759, + "epoch": 1.676397448372797, + "grad_norm": 0.21097613871097565, + "learning_rate": 3.875198844161074e-05, + "loss": 0.4219, "step": 46515 }, { - "epoch": 1.64, - "learning_rate": 3.928294983768498e-05, - "loss": 0.2953, + "epoch": 1.6765776480340215, + "grad_norm": 0.14871633052825928, + "learning_rate": 3.874955137871684e-05, + "loss": 0.3881, "step": 46520 }, { - "epoch": 1.64, - "learning_rate": 3.928061171736036e-05, - "loss": 0.297, + "epoch": 1.6767578476952463, + "grad_norm": 0.1719123125076294, + "learning_rate": 3.874711412848796e-05, + "loss": 0.4257, "step": 46525 }, { - "epoch": 1.64, - "learning_rate": 3.9278273411611425e-05, - "loss": 0.2788, + "epoch": 1.676938047356471, + "grad_norm": 0.2002047300338745, + "learning_rate": 3.874467669095731e-05, + "loss": 0.3852, "step": 46530 }, { - "epoch": 1.64, - "learning_rate": 3.92759349204685e-05, - "loss": 0.2689, + "epoch": 1.6771182470176957, + 
"grad_norm": 0.18990401923656464, + "learning_rate": 3.87422390661581e-05, + "loss": 0.4062, "step": 46535 }, { - "epoch": 1.64, - "learning_rate": 3.927359624396199e-05, - "loss": 0.2782, + "epoch": 1.6772984466789203, + "grad_norm": 0.2010718584060669, + "learning_rate": 3.873980125412355e-05, + "loss": 0.4468, "step": 46540 }, { - "epoch": 1.64, - "learning_rate": 3.927125738212222e-05, - "loss": 0.2997, + "epoch": 1.6774786463401448, + "grad_norm": 0.20043112337589264, + "learning_rate": 3.873736325488687e-05, + "loss": 0.4265, "step": 46545 }, { - "epoch": 1.64, - "learning_rate": 3.9268918334979584e-05, - "loss": 0.2866, + "epoch": 1.6776588460013695, + "grad_norm": 0.208219975233078, + "learning_rate": 3.873492506848127e-05, + "loss": 0.4229, "step": 46550 }, { - "epoch": 1.64, - "learning_rate": 3.926657910256445e-05, - "loss": 0.2718, + "epoch": 1.6778390456625942, + "grad_norm": 0.1497962474822998, + "learning_rate": 3.873248669493997e-05, + "loss": 0.3987, "step": 46555 }, { - "epoch": 1.64, - "learning_rate": 3.926423968490719e-05, - "loss": 0.273, + "epoch": 1.6780192453238187, + "grad_norm": 0.1851639449596405, + "learning_rate": 3.87300481342962e-05, + "loss": 0.4378, "step": 46560 }, { - "epoch": 1.64, - "learning_rate": 3.9261900082038174e-05, - "loss": 0.2883, + "epoch": 1.6781994449850435, + "grad_norm": 0.18497171998023987, + "learning_rate": 3.8727609386583184e-05, + "loss": 0.3855, "step": 46565 }, { - "epoch": 1.64, - "learning_rate": 3.9259560293987785e-05, - "loss": 0.2827, + "epoch": 1.678379644646268, + "grad_norm": 0.19032762944698334, + "learning_rate": 3.872517045183416e-05, + "loss": 0.4086, "step": 46570 }, { - "epoch": 1.64, - "learning_rate": 3.92572203207864e-05, - "loss": 0.2716, + "epoch": 1.6785598443074927, + "grad_norm": 0.1807955503463745, + "learning_rate": 3.872273133008233e-05, + "loss": 0.4104, "step": 46575 }, { - "epoch": 1.64, - "learning_rate": 3.925488016246441e-05, - "loss": 0.3, + "epoch": 1.6787400439687175, + "grad_norm": 0.18649344146251678, + "learning_rate": 3.872029202136095e-05, + "loss": 0.4241, "step": 46580 }, { - "epoch": 1.64, - "learning_rate": 3.92525398190522e-05, - "loss": 0.3001, + "epoch": 1.678920243629942, + "grad_norm": 0.17632882297039032, + "learning_rate": 3.8717852525703246e-05, + "loss": 0.4656, "step": 46585 }, { - "epoch": 1.64, - "learning_rate": 3.9250199290580144e-05, - "loss": 0.2725, + "epoch": 1.6791004432911665, + "grad_norm": 0.1870754510164261, + "learning_rate": 3.871541284314245e-05, + "loss": 0.3766, "step": 46590 }, { - "epoch": 1.64, - "learning_rate": 3.924785857707865e-05, - "loss": 0.2713, + "epoch": 1.6792806429523912, + "grad_norm": 0.18941162526607513, + "learning_rate": 3.871297297371182e-05, + "loss": 0.4175, "step": 46595 }, { - "epoch": 1.64, - "learning_rate": 3.924551767857809e-05, - "loss": 0.2979, + "epoch": 1.679460842613616, + "grad_norm": 0.17592564225196838, + "learning_rate": 3.871053291744459e-05, + "loss": 0.3781, "step": 46600 }, { - "epoch": 1.64, - "learning_rate": 3.924317659510888e-05, - "loss": 0.2895, + "epoch": 1.6796410422748407, + "grad_norm": 0.17591671645641327, + "learning_rate": 3.870809267437398e-05, + "loss": 0.3759, "step": 46605 }, { - "epoch": 1.64, - "learning_rate": 3.924083532670141e-05, - "loss": 0.285, + "epoch": 1.6798212419360652, + "grad_norm": 0.17050381004810333, + "learning_rate": 3.870565224453329e-05, + "loss": 0.3951, "step": 46610 }, { - "epoch": 1.64, - "learning_rate": 3.9238493873386075e-05, - "loss": 0.2717, + "epoch": 1.6800014415972897, 
+ "grad_norm": 0.20477713644504547, + "learning_rate": 3.870321162795573e-05, + "loss": 0.4295, "step": 46615 }, { - "epoch": 1.64, - "learning_rate": 3.923615223519328e-05, - "loss": 0.2959, + "epoch": 1.6801816412585144, + "grad_norm": 0.20674197375774384, + "learning_rate": 3.870077082467456e-05, + "loss": 0.4307, "step": 46620 }, { - "epoch": 1.64, - "learning_rate": 3.923381041215343e-05, - "loss": 0.2938, + "epoch": 1.6803618409197392, + "grad_norm": 0.1541663408279419, + "learning_rate": 3.8698329834723046e-05, + "loss": 0.4447, "step": 46625 }, { - "epoch": 1.64, - "learning_rate": 3.923146840429693e-05, - "loss": 0.2913, + "epoch": 1.6805420405809637, + "grad_norm": 0.1866021603345871, + "learning_rate": 3.8695888658134446e-05, + "loss": 0.3986, "step": 46630 }, { - "epoch": 1.64, - "learning_rate": 3.92291262116542e-05, - "loss": 0.2844, + "epoch": 1.6807222402421882, + "grad_norm": 0.17100781202316284, + "learning_rate": 3.8693447294942e-05, + "loss": 0.4129, "step": 46635 }, { - "epoch": 1.64, - "learning_rate": 3.9226783834255634e-05, - "loss": 0.2926, + "epoch": 1.680902439903413, + "grad_norm": 0.16829904913902283, + "learning_rate": 3.8691005745179e-05, + "loss": 0.436, "step": 46640 }, { - "epoch": 1.64, - "learning_rate": 3.9224441272131664e-05, - "loss": 0.2879, + "epoch": 1.6810826395646377, + "grad_norm": 0.17675045132637024, + "learning_rate": 3.868856400887868e-05, + "loss": 0.4185, "step": 46645 }, { - "epoch": 1.64, - "learning_rate": 3.922209852531269e-05, - "loss": 0.2757, + "epoch": 1.6812628392258624, + "grad_norm": 0.1742866337299347, + "learning_rate": 3.868612208607434e-05, + "loss": 0.4504, "step": 46650 }, { - "epoch": 1.64, - "learning_rate": 3.921975559382914e-05, - "loss": 0.2858, + "epoch": 1.681443038887087, + "grad_norm": 0.21476224064826965, + "learning_rate": 3.8683679976799235e-05, + "loss": 0.4158, "step": 46655 }, { - "epoch": 1.64, - "learning_rate": 3.921741247771144e-05, - "loss": 0.2691, + "epoch": 1.6816232385483114, + "grad_norm": 0.19867900013923645, + "learning_rate": 3.868123768108664e-05, + "loss": 0.3975, "step": 46660 }, { - "epoch": 1.64, - "learning_rate": 3.9215069176990006e-05, - "loss": 0.2943, + "epoch": 1.6818034382095362, + "grad_norm": 0.15411974489688873, + "learning_rate": 3.867879519896983e-05, + "loss": 0.4003, "step": 46665 }, { - "epoch": 1.64, - "learning_rate": 3.921272569169527e-05, - "loss": 0.2809, + "epoch": 1.6819836378707609, + "grad_norm": 0.20566970109939575, + "learning_rate": 3.8676352530482074e-05, + "loss": 0.4074, "step": 46670 }, { - "epoch": 1.64, - "learning_rate": 3.9210382021857655e-05, - "loss": 0.2826, + "epoch": 1.6821638375319854, + "grad_norm": 0.17975671589374542, + "learning_rate": 3.8673909675656675e-05, + "loss": 0.4181, "step": 46675 }, { - "epoch": 1.64, - "learning_rate": 3.920803816750759e-05, - "loss": 0.2968, + "epoch": 1.6823440371932101, + "grad_norm": 0.16022039949893951, + "learning_rate": 3.86714666345269e-05, + "loss": 0.4104, "step": 46680 }, { - "epoch": 1.64, - "learning_rate": 3.920569412867552e-05, - "loss": 0.2786, + "epoch": 1.6825242368544346, + "grad_norm": 0.22043001651763916, + "learning_rate": 3.8669023407126035e-05, + "loss": 0.4131, "step": 46685 }, { - "epoch": 1.64, - "learning_rate": 3.9203349905391876e-05, - "loss": 0.3007, + "epoch": 1.6827044365156594, + "grad_norm": 0.14950603246688843, + "learning_rate": 3.866657999348737e-05, + "loss": 0.3596, "step": 46690 }, { - "epoch": 1.64, - "learning_rate": 3.920100549768708e-05, - "loss": 0.3029, + "epoch": 
1.682884636176884, + "grad_norm": 0.1892220824956894, + "learning_rate": 3.866413639364421e-05, + "loss": 0.3928, "step": 46695 }, { - "epoch": 1.64, - "learning_rate": 3.91986609055916e-05, - "loss": 0.2705, + "epoch": 1.6830648358381086, + "grad_norm": 0.15742996335029602, + "learning_rate": 3.8661692607629826e-05, + "loss": 0.41, "step": 46700 }, { - "epoch": 1.64, - "learning_rate": 3.919631612913586e-05, - "loss": 0.3101, + "epoch": 1.6832450354993331, + "grad_norm": 0.18481925129890442, + "learning_rate": 3.865924863547753e-05, + "loss": 0.4061, "step": 46705 }, { - "epoch": 1.64, - "learning_rate": 3.919397116835031e-05, - "loss": 0.2752, + "epoch": 1.6834252351605579, + "grad_norm": 0.2112767994403839, + "learning_rate": 3.865680447722062e-05, + "loss": 0.4467, "step": 46710 }, { - "epoch": 1.64, - "learning_rate": 3.91916260232654e-05, - "loss": 0.3007, + "epoch": 1.6836054348217826, + "grad_norm": 0.19676971435546875, + "learning_rate": 3.865436013289239e-05, + "loss": 0.4542, "step": 46715 }, { - "epoch": 1.64, - "learning_rate": 3.9189280693911576e-05, - "loss": 0.268, + "epoch": 1.6837856344830073, + "grad_norm": 0.18890570104122162, + "learning_rate": 3.865191560252614e-05, + "loss": 0.4311, "step": 46720 }, { - "epoch": 1.64, - "learning_rate": 3.918693518031929e-05, - "loss": 0.2829, + "epoch": 1.6839658341442318, + "grad_norm": 0.17850306630134583, + "learning_rate": 3.864947088615519e-05, + "loss": 0.424, "step": 46725 }, { - "epoch": 1.64, - "learning_rate": 3.918458948251902e-05, - "loss": 0.2913, + "epoch": 1.6841460338054564, + "grad_norm": 0.13823658227920532, + "learning_rate": 3.8647025983812844e-05, + "loss": 0.4182, "step": 46730 }, { - "epoch": 1.64, - "learning_rate": 3.9182243600541186e-05, - "loss": 0.2815, + "epoch": 1.684326233466681, + "grad_norm": 0.17909295856952667, + "learning_rate": 3.864458089553241e-05, + "loss": 0.4016, "step": 46735 }, { - "epoch": 1.64, - "learning_rate": 3.917989753441627e-05, - "loss": 0.2865, + "epoch": 1.6845064331279058, + "grad_norm": 0.20117956399917603, + "learning_rate": 3.8642135621347195e-05, + "loss": 0.3977, "step": 46740 }, { - "epoch": 1.64, - "learning_rate": 3.9177551284174724e-05, - "loss": 0.2695, + "epoch": 1.6846866327891303, + "grad_norm": 0.19415679574012756, + "learning_rate": 3.863969016129053e-05, + "loss": 0.3985, "step": 46745 }, { - "epoch": 1.64, - "learning_rate": 3.917520484984702e-05, - "loss": 0.2711, + "epoch": 1.6848668324503548, + "grad_norm": 0.2219560146331787, + "learning_rate": 3.8637244515395734e-05, + "loss": 0.4205, "step": 46750 }, { - "epoch": 1.64, - "learning_rate": 3.9173327569863296e-05, - "loss": 0.2899, + "epoch": 1.6850470321115796, + "grad_norm": 0.1750744730234146, + "learning_rate": 3.8634798683696114e-05, + "loss": 0.428, "step": 46755 }, { - "epoch": 1.65, - "learning_rate": 3.917098080425729e-05, - "loss": 0.2844, + "epoch": 1.6852272317728043, + "grad_norm": 0.19463662803173065, + "learning_rate": 3.8632352666225005e-05, + "loss": 0.4025, "step": 46760 }, { - "epoch": 1.65, - "learning_rate": 3.916863385465043e-05, - "loss": 0.2959, + "epoch": 1.685407431434029, + "grad_norm": 0.18877053260803223, + "learning_rate": 3.862990646301572e-05, + "loss": 0.4338, "step": 46765 }, { - "epoch": 1.65, - "learning_rate": 3.91662867210732e-05, - "loss": 0.276, + "epoch": 1.6855876310952536, + "grad_norm": 0.17238645255565643, + "learning_rate": 3.8627460074101606e-05, + "loss": 0.4193, "step": 46770 }, { - "epoch": 1.65, - "learning_rate": 3.9163939403556074e-05, - "loss": 0.2636, + 
"epoch": 1.685767830756478, + "grad_norm": 0.20663443207740784, + "learning_rate": 3.862501349951599e-05, + "loss": 0.4149, "step": 46775 }, { - "epoch": 1.65, - "learning_rate": 3.916159190212952e-05, - "loss": 0.2678, + "epoch": 1.6859480304177028, + "grad_norm": 0.17570826411247253, + "learning_rate": 3.86225667392922e-05, + "loss": 0.4042, "step": 46780 }, { - "epoch": 1.65, - "learning_rate": 3.915924421682404e-05, - "loss": 0.2807, + "epoch": 1.6861282300789275, + "grad_norm": 0.1705494225025177, + "learning_rate": 3.8620119793463573e-05, + "loss": 0.432, "step": 46785 }, { - "epoch": 1.65, - "learning_rate": 3.91568963476701e-05, - "loss": 0.299, + "epoch": 1.686308429740152, + "grad_norm": 0.18460239470005035, + "learning_rate": 3.861767266206345e-05, + "loss": 0.3972, "step": 46790 }, { - "epoch": 1.65, - "learning_rate": 3.915454829469819e-05, - "loss": 0.2929, + "epoch": 1.6864886294013766, + "grad_norm": 0.1540331393480301, + "learning_rate": 3.861522534512518e-05, + "loss": 0.4156, "step": 46795 }, { - "epoch": 1.65, - "learning_rate": 3.9152200057938796e-05, - "loss": 0.301, + "epoch": 1.6866688290626013, + "grad_norm": 0.2118932008743286, + "learning_rate": 3.861277784268209e-05, + "loss": 0.4159, "step": 46800 }, { - "epoch": 1.65, - "learning_rate": 3.9149851637422415e-05, - "loss": 0.2898, + "epoch": 1.686849028723826, + "grad_norm": 0.18069863319396973, + "learning_rate": 3.861033015476755e-05, + "loss": 0.4068, "step": 46805 }, { - "epoch": 1.65, - "learning_rate": 3.914750303317953e-05, - "loss": 0.3064, + "epoch": 1.6870292283850508, + "grad_norm": 0.2565334439277649, + "learning_rate": 3.860788228141489e-05, + "loss": 0.4314, "step": 46810 }, { - "epoch": 1.65, - "learning_rate": 3.914515424524065e-05, - "loss": 0.2903, + "epoch": 1.6872094280462753, + "grad_norm": 0.21841110289096832, + "learning_rate": 3.8605434222657465e-05, + "loss": 0.4269, "step": 46815 }, { - "epoch": 1.65, - "learning_rate": 3.9142805273636265e-05, - "loss": 0.2944, + "epoch": 1.6873896277074998, + "grad_norm": 0.1737854778766632, + "learning_rate": 3.860298597852864e-05, + "loss": 0.426, "step": 46820 }, { - "epoch": 1.65, - "learning_rate": 3.9140456118396865e-05, - "loss": 0.2811, + "epoch": 1.6875698273687245, + "grad_norm": 0.17643529176712036, + "learning_rate": 3.860053754906176e-05, + "loss": 0.4376, "step": 46825 }, { - "epoch": 1.65, - "learning_rate": 3.913810677955297e-05, - "loss": 0.3244, + "epoch": 1.6877500270299493, + "grad_norm": 0.18308429419994354, + "learning_rate": 3.859808893429019e-05, + "loss": 0.4082, "step": 46830 }, { - "epoch": 1.65, - "learning_rate": 3.913575725713507e-05, - "loss": 0.3122, + "epoch": 1.687930226691174, + "grad_norm": 0.1854483038187027, + "learning_rate": 3.859564013424729e-05, + "loss": 0.4091, "step": 46835 }, { - "epoch": 1.65, - "learning_rate": 3.9133407551173676e-05, - "loss": 0.2729, + "epoch": 1.6881104263523985, + "grad_norm": 0.2083449810743332, + "learning_rate": 3.859319114896643e-05, + "loss": 0.4431, "step": 46840 }, { - "epoch": 1.65, - "learning_rate": 3.91310576616993e-05, - "loss": 0.2804, + "epoch": 1.688290626013623, + "grad_norm": 0.15931734442710876, + "learning_rate": 3.859074197848097e-05, + "loss": 0.3969, "step": 46845 }, { - "epoch": 1.65, - "learning_rate": 3.912870758874246e-05, - "loss": 0.2933, + "epoch": 1.6884708256748477, + "grad_norm": 0.20284508168697357, + "learning_rate": 3.858829262282428e-05, + "loss": 0.4014, "step": 46850 }, { - "epoch": 1.65, - "learning_rate": 3.9126357332333656e-05, - "loss": 0.2887, 
+ "epoch": 1.6886510253360725, + "grad_norm": 0.1814059317111969, + "learning_rate": 3.858584308202973e-05, + "loss": 0.4241, "step": 46855 }, { - "epoch": 1.65, - "learning_rate": 3.912400689250342e-05, - "loss": 0.2587, + "epoch": 1.688831224997297, + "grad_norm": 0.24571998417377472, + "learning_rate": 3.858339335613071e-05, + "loss": 0.4337, "step": 46860 }, { - "epoch": 1.65, - "learning_rate": 3.9121656269282245e-05, - "loss": 0.3161, + "epoch": 1.6890114246585215, + "grad_norm": 0.19344958662986755, + "learning_rate": 3.858094344516058e-05, + "loss": 0.4392, "step": 46865 }, { - "epoch": 1.65, - "learning_rate": 3.9119305462700696e-05, - "loss": 0.3236, + "epoch": 1.6891916243197462, + "grad_norm": 0.1718379110097885, + "learning_rate": 3.8578493349152725e-05, + "loss": 0.3957, "step": 46870 }, { - "epoch": 1.65, - "learning_rate": 3.911695447278925e-05, - "loss": 0.2894, + "epoch": 1.689371823980971, + "grad_norm": 0.18230155110359192, + "learning_rate": 3.857604306814052e-05, + "loss": 0.4316, "step": 46875 }, { - "epoch": 1.65, - "learning_rate": 3.911460329957846e-05, - "loss": 0.273, + "epoch": 1.6895520236421957, + "grad_norm": 0.2310900092124939, + "learning_rate": 3.857359260215736e-05, + "loss": 0.4239, "step": 46880 }, { - "epoch": 1.65, - "learning_rate": 3.9112251943098844e-05, - "loss": 0.2766, + "epoch": 1.6897322233034202, + "grad_norm": 0.14590436220169067, + "learning_rate": 3.8571141951236634e-05, + "loss": 0.3717, "step": 46885 }, { - "epoch": 1.65, - "learning_rate": 3.910990040338094e-05, - "loss": 0.2898, + "epoch": 1.6899124229646447, + "grad_norm": 0.20509690046310425, + "learning_rate": 3.8568691115411726e-05, + "loss": 0.4504, "step": 46890 }, { - "epoch": 1.65, - "learning_rate": 3.910754868045528e-05, - "loss": 0.2967, + "epoch": 1.6900926226258695, + "grad_norm": 0.22596745193004608, + "learning_rate": 3.856624009471602e-05, + "loss": 0.4144, "step": 46895 }, { - "epoch": 1.65, - "learning_rate": 3.910519677435239e-05, - "loss": 0.3025, + "epoch": 1.6902728222870942, + "grad_norm": 0.21356570720672607, + "learning_rate": 3.8563788889182925e-05, + "loss": 0.4174, "step": 46900 }, { - "epoch": 1.65, - "learning_rate": 3.910284468510283e-05, - "loss": 0.2868, + "epoch": 1.6904530219483187, + "grad_norm": 0.16195428371429443, + "learning_rate": 3.856133749884584e-05, + "loss": 0.4089, "step": 46905 }, { - "epoch": 1.65, - "learning_rate": 3.910049241273712e-05, - "loss": 0.3248, + "epoch": 1.6906332216095432, + "grad_norm": 0.1678587794303894, + "learning_rate": 3.8558885923738144e-05, + "loss": 0.4158, "step": 46910 }, { - "epoch": 1.65, - "learning_rate": 3.909813995728581e-05, - "loss": 0.2966, + "epoch": 1.690813421270768, + "grad_norm": 0.16259028017520905, + "learning_rate": 3.8556434163893254e-05, + "loss": 0.3828, "step": 46915 }, { - "epoch": 1.65, - "learning_rate": 3.909578731877943e-05, - "loss": 0.276, + "epoch": 1.6909936209319927, + "grad_norm": 0.16952233016490936, + "learning_rate": 3.8553982219344584e-05, + "loss": 0.4048, "step": 46920 }, { - "epoch": 1.65, - "learning_rate": 3.909343449724856e-05, - "loss": 0.2901, + "epoch": 1.6911738205932174, + "grad_norm": 0.20961052179336548, + "learning_rate": 3.855153009012552e-05, + "loss": 0.4091, "step": 46925 }, { - "epoch": 1.65, - "learning_rate": 3.909108149272371e-05, - "loss": 0.2952, + "epoch": 1.691354020254442, + "grad_norm": 0.17797677218914032, + "learning_rate": 3.854907777626948e-05, + "loss": 0.4102, "step": 46930 }, { - "epoch": 1.65, - "learning_rate": 3.908872830523547e-05, - 
"loss": 0.2676, + "epoch": 1.6915342199156664, + "grad_norm": 0.23650279641151428, + "learning_rate": 3.854662527780989e-05, + "loss": 0.4334, "step": 46935 }, { - "epoch": 1.65, - "learning_rate": 3.9086374934814375e-05, - "loss": 0.2738, + "epoch": 1.6917144195768912, + "grad_norm": 0.1639034003019333, + "learning_rate": 3.8544172594780145e-05, + "loss": 0.3668, "step": 46940 }, { - "epoch": 1.65, - "learning_rate": 3.9084021381490986e-05, - "loss": 0.2668, + "epoch": 1.691894619238116, + "grad_norm": 0.14335285127162933, + "learning_rate": 3.8541719727213675e-05, + "loss": 0.38, "step": 46945 }, { - "epoch": 1.65, - "learning_rate": 3.908166764529585e-05, - "loss": 0.2843, + "epoch": 1.6920748188993404, + "grad_norm": 0.2120550274848938, + "learning_rate": 3.853926667514389e-05, + "loss": 0.4064, "step": 46950 }, { - "epoch": 1.65, - "learning_rate": 3.907931372625955e-05, - "loss": 0.2964, + "epoch": 1.6922550185605651, + "grad_norm": 0.1797804832458496, + "learning_rate": 3.853681343860423e-05, + "loss": 0.4465, "step": 46955 }, { - "epoch": 1.65, - "learning_rate": 3.907695962441264e-05, - "loss": 0.2733, + "epoch": 1.6924352182217897, + "grad_norm": 0.17130543291568756, + "learning_rate": 3.8534360017628096e-05, + "loss": 0.3994, "step": 46960 }, { - "epoch": 1.65, - "learning_rate": 3.907460533978569e-05, - "loss": 0.2851, + "epoch": 1.6926154178830144, + "grad_norm": 0.17090466618537903, + "learning_rate": 3.853190641224893e-05, + "loss": 0.4236, "step": 46965 }, { - "epoch": 1.65, - "learning_rate": 3.907225087240925e-05, - "loss": 0.2835, + "epoch": 1.6927956175442391, + "grad_norm": 0.1600097268819809, + "learning_rate": 3.852945262250016e-05, + "loss": 0.4157, "step": 46970 }, { - "epoch": 1.65, - "learning_rate": 3.906989622231392e-05, - "loss": 0.2729, + "epoch": 1.6929758172054636, + "grad_norm": 0.16842542588710785, + "learning_rate": 3.8526998648415214e-05, + "loss": 0.4017, "step": 46975 }, { - "epoch": 1.65, - "learning_rate": 3.9067541389530256e-05, - "loss": 0.3028, + "epoch": 1.6931560168666882, + "grad_norm": 0.2014317363500595, + "learning_rate": 3.8524544490027534e-05, + "loss": 0.4108, "step": 46980 }, { - "epoch": 1.65, - "learning_rate": 3.906518637408884e-05, - "loss": 0.3031, + "epoch": 1.6933362165279129, + "grad_norm": 0.18398632109165192, + "learning_rate": 3.852209014737055e-05, + "loss": 0.3708, "step": 46985 }, { - "epoch": 1.65, - "learning_rate": 3.906283117602024e-05, - "loss": 0.3047, + "epoch": 1.6935164161891376, + "grad_norm": 0.217530757188797, + "learning_rate": 3.8519635620477714e-05, + "loss": 0.4193, "step": 46990 }, { - "epoch": 1.65, - "learning_rate": 3.906047579535504e-05, - "loss": 0.2737, + "epoch": 1.6936966158503624, + "grad_norm": 0.15751570463180542, + "learning_rate": 3.851718090938245e-05, + "loss": 0.4189, "step": 46995 }, { - "epoch": 1.65, - "learning_rate": 3.905812023212385e-05, - "loss": 0.3082, + "epoch": 1.6938768155115869, + "grad_norm": 0.21628709137439728, + "learning_rate": 3.851472601411822e-05, + "loss": 0.4166, "step": 47000 }, { - "epoch": 1.65, - "eval_loss": 0.27732065320014954, - "eval_runtime": 10.5336, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 1.6938768155115869, + "eval_loss": 0.44244661927223206, + "eval_runtime": 3.5403, + "eval_samples_per_second": 28.246, + "eval_steps_per_second": 7.062, "step": 47000 }, { - "epoch": 1.65, - "learning_rate": 3.905576448635722e-05, - "loss": 0.3076, + "epoch": 1.6940570151728114, + "grad_norm": 0.19788464903831482, + 
"learning_rate": 3.851227093471847e-05, + "loss": 0.424, "step": 47005 }, { - "epoch": 1.65, - "learning_rate": 3.905340855808576e-05, - "loss": 0.2795, + "epoch": 1.694237214834036, + "grad_norm": 0.1924174278974533, + "learning_rate": 3.850981567121663e-05, + "loss": 0.4527, "step": 47010 }, { - "epoch": 1.65, - "learning_rate": 3.905105244734004e-05, - "loss": 0.2857, + "epoch": 1.6944174144952608, + "grad_norm": 0.16324442625045776, + "learning_rate": 3.850736022364617e-05, + "loss": 0.4156, "step": 47015 }, { - "epoch": 1.65, - "learning_rate": 3.9048696154150666e-05, - "loss": 0.2797, + "epoch": 1.6945976141564854, + "grad_norm": 0.23435580730438232, + "learning_rate": 3.8504904592040545e-05, + "loss": 0.4468, "step": 47020 }, { - "epoch": 1.65, - "learning_rate": 3.9046339678548234e-05, - "loss": 0.2599, + "epoch": 1.6947778138177099, + "grad_norm": 0.15442748367786407, + "learning_rate": 3.8502448776433216e-05, + "loss": 0.3801, "step": 47025 }, { - "epoch": 1.65, - "learning_rate": 3.904398302056333e-05, - "loss": 0.2587, + "epoch": 1.6949580134789346, + "grad_norm": 0.18875345587730408, + "learning_rate": 3.849999277685763e-05, + "loss": 0.3887, "step": 47030 }, { - "epoch": 1.65, - "learning_rate": 3.9041626180226575e-05, - "loss": 0.2786, + "epoch": 1.6951382131401593, + "grad_norm": 0.18448412418365479, + "learning_rate": 3.849753659334725e-05, + "loss": 0.4143, "step": 47035 }, { - "epoch": 1.65, - "learning_rate": 3.903926915756856e-05, - "loss": 0.2823, + "epoch": 1.695318412801384, + "grad_norm": 0.20887787640094757, + "learning_rate": 3.849508022593556e-05, + "loss": 0.4555, "step": 47040 }, { - "epoch": 1.66, - "learning_rate": 3.903691195261987e-05, - "loss": 0.2879, + "epoch": 1.6954986124626086, + "grad_norm": 0.16566239297389984, + "learning_rate": 3.849262367465601e-05, + "loss": 0.3988, "step": 47045 }, { - "epoch": 1.66, - "learning_rate": 3.903455456541114e-05, - "loss": 0.2644, + "epoch": 1.695678812123833, + "grad_norm": 0.2598322033882141, + "learning_rate": 3.849016693954207e-05, + "loss": 0.431, "step": 47050 }, { - "epoch": 1.66, - "learning_rate": 3.903219699597297e-05, - "loss": 0.2788, + "epoch": 1.6958590117850578, + "grad_norm": 0.1612170934677124, + "learning_rate": 3.848771002062722e-05, + "loss": 0.3811, "step": 47055 }, { - "epoch": 1.66, - "learning_rate": 3.902983924433596e-05, - "loss": 0.2764, + "epoch": 1.6960392114462826, + "grad_norm": 0.18904070556163788, + "learning_rate": 3.848525291794494e-05, + "loss": 0.4372, "step": 47060 }, { - "epoch": 1.66, - "learning_rate": 3.902748131053074e-05, - "loss": 0.2835, + "epoch": 1.696219411107507, + "grad_norm": 0.1836709976196289, + "learning_rate": 3.848279563152869e-05, + "loss": 0.3892, "step": 47065 }, { - "epoch": 1.66, - "learning_rate": 3.9025123194587927e-05, - "loss": 0.2733, + "epoch": 1.6963996107687318, + "grad_norm": 0.18471242487430573, + "learning_rate": 3.848033816141196e-05, + "loss": 0.3994, "step": 47070 }, { - "epoch": 1.66, - "learning_rate": 3.902276489653812e-05, - "loss": 0.2824, + "epoch": 1.6965798104299563, + "grad_norm": 0.18308402597904205, + "learning_rate": 3.847788050762824e-05, + "loss": 0.4139, "step": 47075 }, { - "epoch": 1.66, - "learning_rate": 3.902040641641196e-05, - "loss": 0.2928, + "epoch": 1.696760010091181, + "grad_norm": 0.1916336864233017, + "learning_rate": 3.8475422670211e-05, + "loss": 0.4577, "step": 47080 }, { - "epoch": 1.66, - "learning_rate": 3.9018047754240054e-05, - "loss": 0.2955, + "epoch": 1.6969402097524058, + "grad_norm": 
0.17841115593910217, + "learning_rate": 3.8472964649193736e-05, + "loss": 0.4185, "step": 47085 }, { - "epoch": 1.66, - "learning_rate": 3.901568891005305e-05, - "loss": 0.284, + "epoch": 1.6971204094136303, + "grad_norm": 0.16689583659172058, + "learning_rate": 3.8470506444609946e-05, + "loss": 0.3826, "step": 47090 }, { - "epoch": 1.66, - "learning_rate": 3.9013329883881544e-05, - "loss": 0.316, + "epoch": 1.6973006090748548, + "grad_norm": 0.21197299659252167, + "learning_rate": 3.84680480564931e-05, + "loss": 0.4296, "step": 47095 }, { - "epoch": 1.66, - "learning_rate": 3.90109706757562e-05, - "loss": 0.297, + "epoch": 1.6974808087360795, + "grad_norm": 0.14996369183063507, + "learning_rate": 3.8465589484876716e-05, + "loss": 0.4002, "step": 47100 }, { - "epoch": 1.66, - "learning_rate": 3.900861128570763e-05, - "loss": 0.2845, + "epoch": 1.6976610083973043, + "grad_norm": 0.15397225320339203, + "learning_rate": 3.846313072979428e-05, + "loss": 0.3986, "step": 47105 }, { - "epoch": 1.66, - "learning_rate": 3.9006251713766474e-05, - "loss": 0.2836, + "epoch": 1.697841208058529, + "grad_norm": 0.1733284294605255, + "learning_rate": 3.846067179127929e-05, + "loss": 0.4098, "step": 47110 }, { - "epoch": 1.66, - "learning_rate": 3.900389195996337e-05, - "loss": 0.3025, + "epoch": 1.6980214077197535, + "grad_norm": 0.16923348605632782, + "learning_rate": 3.8458212669365256e-05, + "loss": 0.3978, "step": 47115 }, { - "epoch": 1.66, - "learning_rate": 3.900153202432895e-05, - "loss": 0.2968, + "epoch": 1.698201607380978, + "grad_norm": 0.16483382880687714, + "learning_rate": 3.845575336408568e-05, + "loss": 0.4234, "step": 47120 }, { - "epoch": 1.66, - "learning_rate": 3.8999171906893875e-05, - "loss": 0.2915, + "epoch": 1.6983818070422028, + "grad_norm": 0.18694746494293213, + "learning_rate": 3.845329387547407e-05, + "loss": 0.4383, "step": 47125 }, { - "epoch": 1.66, - "learning_rate": 3.8996811607688774e-05, - "loss": 0.2617, + "epoch": 1.6985620067034275, + "grad_norm": 0.20622876286506653, + "learning_rate": 3.845083420356393e-05, + "loss": 0.4094, "step": 47130 }, { - "epoch": 1.66, - "learning_rate": 3.8994451126744305e-05, - "loss": 0.2753, + "epoch": 1.698742206364652, + "grad_norm": 0.177239328622818, + "learning_rate": 3.8448374348388796e-05, + "loss": 0.4394, "step": 47135 }, { - "epoch": 1.66, - "learning_rate": 3.89920904640911e-05, - "loss": 0.3031, + "epoch": 1.6989224060258765, + "grad_norm": 0.21104155480861664, + "learning_rate": 3.8445914309982145e-05, + "loss": 0.4234, "step": 47140 }, { - "epoch": 1.66, - "learning_rate": 3.898972961975983e-05, - "loss": 0.3031, + "epoch": 1.6991026056871013, + "grad_norm": 0.19302360713481903, + "learning_rate": 3.844345408837753e-05, + "loss": 0.3983, "step": 47145 }, { - "epoch": 1.66, - "learning_rate": 3.898736859378114e-05, - "loss": 0.2961, + "epoch": 1.699282805348326, + "grad_norm": 0.18512195348739624, + "learning_rate": 3.844099368360845e-05, + "loss": 0.4173, "step": 47150 }, { - "epoch": 1.66, - "learning_rate": 3.898500738618568e-05, - "loss": 0.2889, + "epoch": 1.6994630050095507, + "grad_norm": 0.17956486344337463, + "learning_rate": 3.8438533095708426e-05, + "loss": 0.4372, "step": 47155 }, { - "epoch": 1.66, - "learning_rate": 3.898264599700412e-05, - "loss": 0.2899, + "epoch": 1.6996432046707752, + "grad_norm": 0.19222790002822876, + "learning_rate": 3.8436072324710995e-05, + "loss": 0.3728, "step": 47160 }, { - "epoch": 1.66, - "learning_rate": 3.898028442626712e-05, - "loss": 0.2805, + "epoch": 1.6998234043319997, + 
"grad_norm": 0.15799042582511902, + "learning_rate": 3.8433611370649686e-05, + "loss": 0.4133, "step": 47165 }, { - "epoch": 1.66, - "learning_rate": 3.897792267400533e-05, - "loss": 0.2881, + "epoch": 1.7000036039932245, + "grad_norm": 0.1787501573562622, + "learning_rate": 3.843115023355802e-05, + "loss": 0.4264, "step": 47170 }, { - "epoch": 1.66, - "learning_rate": 3.897556074024944e-05, - "loss": 0.2914, + "epoch": 1.7001838036544492, + "grad_norm": 0.17307919263839722, + "learning_rate": 3.842868891346952e-05, + "loss": 0.4198, "step": 47175 }, { - "epoch": 1.66, - "learning_rate": 3.8973198625030094e-05, - "loss": 0.3109, + "epoch": 1.7003640033156737, + "grad_norm": 0.1564173549413681, + "learning_rate": 3.8426227410417755e-05, + "loss": 0.3832, "step": 47180 }, { - "epoch": 1.66, - "learning_rate": 3.897083632837797e-05, - "loss": 0.2974, + "epoch": 1.7005442029768985, + "grad_norm": 0.21778932213783264, + "learning_rate": 3.842376572443623e-05, + "loss": 0.4172, "step": 47185 }, { - "epoch": 1.66, - "learning_rate": 3.896847385032375e-05, - "loss": 0.2844, + "epoch": 1.700724402638123, + "grad_norm": 0.16438071429729462, + "learning_rate": 3.8421303855558496e-05, + "loss": 0.4138, "step": 47190 }, { - "epoch": 1.66, - "learning_rate": 3.89661111908981e-05, - "loss": 0.2585, + "epoch": 1.7009046022993477, + "grad_norm": 0.2048167735338211, + "learning_rate": 3.8418841803818096e-05, + "loss": 0.4055, "step": 47195 }, { - "epoch": 1.66, - "learning_rate": 3.89637483501317e-05, - "loss": 0.2966, + "epoch": 1.7010848019605724, + "grad_norm": 0.15939681231975555, + "learning_rate": 3.841637956924857e-05, + "loss": 0.4179, "step": 47200 }, { - "epoch": 1.66, - "learning_rate": 3.896138532805523e-05, - "loss": 0.2913, + "epoch": 1.701265001621797, + "grad_norm": 0.16138513386249542, + "learning_rate": 3.841391715188348e-05, + "loss": 0.416, "step": 47205 }, { - "epoch": 1.66, - "learning_rate": 3.8959022124699375e-05, - "loss": 0.2973, + "epoch": 1.7014452012830215, + "grad_norm": 0.186210036277771, + "learning_rate": 3.841145455175635e-05, + "loss": 0.4287, "step": 47210 }, { - "epoch": 1.66, - "learning_rate": 3.895665874009482e-05, - "loss": 0.3064, + "epoch": 1.7016254009442462, + "grad_norm": 0.17103976011276245, + "learning_rate": 3.840899176890076e-05, + "loss": 0.3943, "step": 47215 }, { - "epoch": 1.66, - "learning_rate": 3.8954295174272245e-05, - "loss": 0.2975, + "epoch": 1.701805600605471, + "grad_norm": 0.15319930016994476, + "learning_rate": 3.840652880335025e-05, + "loss": 0.4086, "step": 47220 }, { - "epoch": 1.66, - "learning_rate": 3.895193142726234e-05, - "loss": 0.2924, + "epoch": 1.7019858002666957, + "grad_norm": 0.1645641177892685, + "learning_rate": 3.840406565513838e-05, + "loss": 0.3827, "step": 47225 }, { - "epoch": 1.66, - "learning_rate": 3.894956749909581e-05, - "loss": 0.2741, + "epoch": 1.7021659999279202, + "grad_norm": 0.1592930108308792, + "learning_rate": 3.840160232429872e-05, + "loss": 0.3865, "step": 47230 }, { - "epoch": 1.66, - "learning_rate": 3.894720338980332e-05, - "loss": 0.2908, + "epoch": 1.7023461995891447, + "grad_norm": 0.18125946819782257, + "learning_rate": 3.839913881086481e-05, + "loss": 0.428, "step": 47235 }, { - "epoch": 1.66, - "learning_rate": 3.89448390994156e-05, - "loss": 0.3165, + "epoch": 1.7025263992503694, + "grad_norm": 0.17202188074588776, + "learning_rate": 3.8396675114870234e-05, + "loss": 0.4201, "step": 47240 }, { - "epoch": 1.66, - "learning_rate": 3.894247462796333e-05, - "loss": 0.2858, + "epoch": 
1.7027065989115941, + "grad_norm": 0.16342946887016296, + "learning_rate": 3.839421123634855e-05, + "loss": 0.389, "step": 47245 }, { - "epoch": 1.66, - "learning_rate": 3.894010997547722e-05, - "loss": 0.268, + "epoch": 1.7028867985728187, + "grad_norm": 0.17867057025432587, + "learning_rate": 3.8391747175333336e-05, + "loss": 0.4173, "step": 47250 }, { - "epoch": 1.66, - "learning_rate": 3.893774514198797e-05, - "loss": 0.2751, + "epoch": 1.7030669982340432, + "grad_norm": 0.20603960752487183, + "learning_rate": 3.838928293185815e-05, + "loss": 0.4412, "step": 47255 }, { - "epoch": 1.66, - "learning_rate": 3.8935380127526284e-05, - "loss": 0.2815, + "epoch": 1.703247197895268, + "grad_norm": 0.19826002418994904, + "learning_rate": 3.838681850595659e-05, + "loss": 0.3956, "step": 47260 }, { - "epoch": 1.66, - "learning_rate": 3.893301493212286e-05, - "loss": 0.3071, + "epoch": 1.7034273975564926, + "grad_norm": 0.18762384355068207, + "learning_rate": 3.83843538976622e-05, + "loss": 0.4009, "step": 47265 }, { - "epoch": 1.66, - "learning_rate": 3.8930649555808434e-05, - "loss": 0.2742, + "epoch": 1.7036075972177174, + "grad_norm": 0.21648670732975006, + "learning_rate": 3.838188910700861e-05, + "loss": 0.4453, "step": 47270 }, { - "epoch": 1.66, - "learning_rate": 3.89282839986137e-05, - "loss": 0.3003, + "epoch": 1.7037877968789419, + "grad_norm": 0.20147664844989777, + "learning_rate": 3.837942413402935e-05, + "loss": 0.4143, "step": 47275 }, { - "epoch": 1.66, - "learning_rate": 3.892591826056937e-05, - "loss": 0.2946, + "epoch": 1.7039679965401664, + "grad_norm": 0.16938365995883942, + "learning_rate": 3.837695897875803e-05, + "loss": 0.4284, "step": 47280 }, { - "epoch": 1.66, - "learning_rate": 3.892355234170617e-05, - "loss": 0.3135, + "epoch": 1.7041481962013911, + "grad_norm": 0.16605401039123535, + "learning_rate": 3.837449364122823e-05, + "loss": 0.4519, "step": 47285 }, { - "epoch": 1.66, - "learning_rate": 3.892118624205483e-05, - "loss": 0.2809, + "epoch": 1.7043283958626159, + "grad_norm": 0.15813526511192322, + "learning_rate": 3.837202812147355e-05, + "loss": 0.4502, "step": 47290 }, { - "epoch": 1.66, - "learning_rate": 3.891881996164605e-05, - "loss": 0.2887, + "epoch": 1.7045085955238404, + "grad_norm": 0.1578982025384903, + "learning_rate": 3.8369562419527574e-05, + "loss": 0.407, "step": 47295 }, { - "epoch": 1.66, - "learning_rate": 3.891645350051057e-05, - "loss": 0.2931, + "epoch": 1.7046887951850649, + "grad_norm": 0.20871727168560028, + "learning_rate": 3.8367096535423895e-05, + "loss": 0.4251, "step": 47300 }, { - "epoch": 1.66, - "learning_rate": 3.891408685867911e-05, - "loss": 0.3101, + "epoch": 1.7048689948462896, + "grad_norm": 0.1831165850162506, + "learning_rate": 3.836463046919612e-05, + "loss": 0.4263, "step": 47305 }, { - "epoch": 1.66, - "learning_rate": 3.891172003618241e-05, - "loss": 0.3029, + "epoch": 1.7050491945075144, + "grad_norm": 0.2124657928943634, + "learning_rate": 3.836216422087784e-05, + "loss": 0.3903, "step": 47310 }, { - "epoch": 1.66, - "learning_rate": 3.890935303305118e-05, - "loss": 0.279, + "epoch": 1.705229394168739, + "grad_norm": 0.2060711830854416, + "learning_rate": 3.8359697790502656e-05, + "loss": 0.4089, "step": 47315 }, { - "epoch": 1.66, - "learning_rate": 3.890698584931618e-05, - "loss": 0.2807, + "epoch": 1.7054095938299636, + "grad_norm": 0.1653200387954712, + "learning_rate": 3.835723117810418e-05, + "loss": 0.376, "step": 47320 }, { - "epoch": 1.67, - "learning_rate": 3.890461848500813e-05, - "loss": 0.2789, + 
"epoch": 1.705589793491188, + "grad_norm": 0.15278410911560059, + "learning_rate": 3.835476438371601e-05, + "loss": 0.4144, "step": 47325 }, { - "epoch": 1.67, - "learning_rate": 3.890225094015777e-05, - "loss": 0.2878, + "epoch": 1.7057699931524128, + "grad_norm": 0.21837785840034485, + "learning_rate": 3.835229740737176e-05, + "loss": 0.4179, "step": 47330 }, { - "epoch": 1.67, - "learning_rate": 3.889988321479584e-05, - "loss": 0.2847, + "epoch": 1.7059501928136376, + "grad_norm": 0.17335210740566254, + "learning_rate": 3.8349830249105057e-05, + "loss": 0.3892, "step": 47335 }, { - "epoch": 1.67, - "learning_rate": 3.889751530895309e-05, - "loss": 0.2574, + "epoch": 1.7061303924748623, + "grad_norm": 0.17480070888996124, + "learning_rate": 3.8347362908949484e-05, + "loss": 0.4117, "step": 47340 }, { - "epoch": 1.67, - "learning_rate": 3.8895147222660264e-05, - "loss": 0.2951, + "epoch": 1.7063105921360868, + "grad_norm": 0.2083815336227417, + "learning_rate": 3.834489538693868e-05, + "loss": 0.4644, "step": 47345 }, { - "epoch": 1.67, - "learning_rate": 3.8892778955948104e-05, - "loss": 0.3026, + "epoch": 1.7064907917973113, + "grad_norm": 0.1669875979423523, + "learning_rate": 3.8342427683106276e-05, + "loss": 0.3913, "step": 47350 }, { - "epoch": 1.67, - "learning_rate": 3.8890410508847376e-05, - "loss": 0.2767, + "epoch": 1.706670991458536, + "grad_norm": 0.16881945729255676, + "learning_rate": 3.833995979748587e-05, + "loss": 0.4141, "step": 47355 }, { - "epoch": 1.67, - "learning_rate": 3.888804188138882e-05, - "loss": 0.303, + "epoch": 1.7068511911197608, + "grad_norm": 0.17470820248126984, + "learning_rate": 3.833749173011108e-05, + "loss": 0.4245, "step": 47360 }, { - "epoch": 1.67, - "learning_rate": 3.888567307360318e-05, - "loss": 0.2889, + "epoch": 1.7070313907809853, + "grad_norm": 0.18530166149139404, + "learning_rate": 3.833502348101556e-05, + "loss": 0.4304, "step": 47365 }, { - "epoch": 1.67, - "learning_rate": 3.8883304085521236e-05, - "loss": 0.2902, + "epoch": 1.7072115904422098, + "grad_norm": 0.206429585814476, + "learning_rate": 3.833255505023292e-05, + "loss": 0.4014, "step": 47370 }, { - "epoch": 1.67, - "learning_rate": 3.8880934917173735e-05, - "loss": 0.2759, + "epoch": 1.7073917901034346, + "grad_norm": 0.172637477517128, + "learning_rate": 3.83300864377968e-05, + "loss": 0.3989, "step": 47375 }, { - "epoch": 1.67, - "learning_rate": 3.887856556859144e-05, - "loss": 0.3042, + "epoch": 1.7075719897646593, + "grad_norm": 0.18102556467056274, + "learning_rate": 3.832761764374084e-05, + "loss": 0.4066, "step": 47380 }, { - "epoch": 1.67, - "learning_rate": 3.887619603980512e-05, - "loss": 0.3202, + "epoch": 1.707752189425884, + "grad_norm": 0.16687656939029694, + "learning_rate": 3.832514866809866e-05, + "loss": 0.4241, "step": 47385 }, { - "epoch": 1.67, - "learning_rate": 3.8873826330845545e-05, - "loss": 0.2898, + "epoch": 1.7079323890871085, + "grad_norm": 0.17765361070632935, + "learning_rate": 3.832267951090392e-05, + "loss": 0.404, "step": 47390 }, { - "epoch": 1.67, - "learning_rate": 3.887145644174347e-05, - "loss": 0.2714, + "epoch": 1.708112588748333, + "grad_norm": 0.20449407398700714, + "learning_rate": 3.832021017219025e-05, + "loss": 0.4424, "step": 47395 }, { - "epoch": 1.67, - "learning_rate": 3.886908637252967e-05, - "loss": 0.2699, + "epoch": 1.7082927884095578, + "grad_norm": 0.20661450922489166, + "learning_rate": 3.831774065199128e-05, + "loss": 0.444, "step": 47400 }, { - "epoch": 1.67, - "learning_rate": 3.8866716123234934e-05, - "loss": 
0.269, + "epoch": 1.7084729880707825, + "grad_norm": 0.1424778401851654, + "learning_rate": 3.8315270950340684e-05, + "loss": 0.3854, "step": 47405 }, { - "epoch": 1.67, - "learning_rate": 3.8864345693890024e-05, - "loss": 0.2756, + "epoch": 1.708653187732007, + "grad_norm": 0.1613854020833969, + "learning_rate": 3.831280106727211e-05, + "loss": 0.4014, "step": 47410 }, { - "epoch": 1.67, - "learning_rate": 3.886197508452572e-05, - "loss": 0.2592, + "epoch": 1.7088333873932315, + "grad_norm": 0.19576317071914673, + "learning_rate": 3.8310331002819186e-05, + "loss": 0.428, "step": 47415 }, { - "epoch": 1.67, - "learning_rate": 3.8859604295172816e-05, - "loss": 0.273, + "epoch": 1.7090135870544563, + "grad_norm": 0.19375436007976532, + "learning_rate": 3.830786075701558e-05, + "loss": 0.4368, "step": 47420 }, { - "epoch": 1.67, - "learning_rate": 3.8857233325862074e-05, - "loss": 0.2826, + "epoch": 1.709193786715681, + "grad_norm": 0.24213196337223053, + "learning_rate": 3.8305390329894945e-05, + "loss": 0.4112, "step": 47425 }, { - "epoch": 1.67, - "learning_rate": 3.885486217662428e-05, - "loss": 0.3095, + "epoch": 1.7093739863769057, + "grad_norm": 0.15626302361488342, + "learning_rate": 3.830291972149095e-05, + "loss": 0.4208, "step": 47430 }, { - "epoch": 1.67, - "learning_rate": 3.885249084749025e-05, - "loss": 0.2889, + "epoch": 1.7095541860381303, + "grad_norm": 0.18249692022800446, + "learning_rate": 3.8300448931837244e-05, + "loss": 0.3906, "step": 47435 }, { - "epoch": 1.67, - "learning_rate": 3.8850119338490745e-05, - "loss": 0.2691, + "epoch": 1.7097343856993548, + "grad_norm": 0.15607425570487976, + "learning_rate": 3.829797796096749e-05, + "loss": 0.4015, "step": 47440 }, { - "epoch": 1.67, - "learning_rate": 3.884774764965658e-05, - "loss": 0.279, + "epoch": 1.7099145853605795, + "grad_norm": 0.1599874198436737, + "learning_rate": 3.829550680891537e-05, + "loss": 0.4124, "step": 47445 }, { - "epoch": 1.67, - "learning_rate": 3.8845375781018524e-05, - "loss": 0.3029, + "epoch": 1.7100947850218042, + "grad_norm": 0.2017892599105835, + "learning_rate": 3.8293035475714543e-05, + "loss": 0.4204, "step": 47450 }, { - "epoch": 1.67, - "learning_rate": 3.884300373260739e-05, - "loss": 0.2701, + "epoch": 1.7102749846830287, + "grad_norm": 0.17167171835899353, + "learning_rate": 3.8290563961398686e-05, + "loss": 0.3933, "step": 47455 }, { - "epoch": 1.67, - "learning_rate": 3.8840631504453975e-05, - "loss": 0.2512, + "epoch": 1.7104551843442535, + "grad_norm": 0.18356946110725403, + "learning_rate": 3.828809226600146e-05, + "loss": 0.4294, "step": 47460 }, { - "epoch": 1.67, - "learning_rate": 3.883825909658908e-05, - "loss": 0.2756, + "epoch": 1.710635384005478, + "grad_norm": 0.20430004596710205, + "learning_rate": 3.828562038955655e-05, + "loss": 0.4215, "step": 47465 }, { - "epoch": 1.67, - "learning_rate": 3.883588650904352e-05, - "loss": 0.2821, + "epoch": 1.7108155836667027, + "grad_norm": 0.19897079467773438, + "learning_rate": 3.828314833209764e-05, + "loss": 0.4301, "step": 47470 }, { - "epoch": 1.67, - "learning_rate": 3.883351374184808e-05, - "loss": 0.2952, + "epoch": 1.7109957833279275, + "grad_norm": 0.17606034874916077, + "learning_rate": 3.828067609365841e-05, + "loss": 0.4146, "step": 47475 }, { - "epoch": 1.67, - "learning_rate": 3.883114079503359e-05, - "loss": 0.2917, + "epoch": 1.711175982989152, + "grad_norm": 0.18910571932792664, + "learning_rate": 3.827869817262381e-05, + "loss": 0.4191, "step": 47480 }, { - "epoch": 1.67, - "learning_rate": 
3.882876766863085e-05, - "loss": 0.2885, + "epoch": 1.7113561826503765, + "grad_norm": 0.16573849320411682, + "learning_rate": 3.827622560850488e-05, + "loss": 0.4207, "step": 47485 }, { - "epoch": 1.67, - "learning_rate": 3.8826394362670684e-05, - "loss": 0.3046, + "epoch": 1.7115363823116012, + "grad_norm": 0.18701373040676117, + "learning_rate": 3.827375286349993e-05, + "loss": 0.4094, "step": 47490 }, { - "epoch": 1.67, - "learning_rate": 3.8824020877183886e-05, - "loss": 0.2945, + "epoch": 1.711716581972826, + "grad_norm": 0.16349554061889648, + "learning_rate": 3.827127993764269e-05, + "loss": 0.4338, "step": 47495 }, { - "epoch": 1.67, - "learning_rate": 3.8821647212201286e-05, - "loss": 0.275, + "epoch": 1.7118967816340507, + "grad_norm": 0.17863896489143372, + "learning_rate": 3.826880683096681e-05, + "loss": 0.4307, "step": 47500 }, { - "epoch": 1.67, - "eval_loss": 0.2766549289226532, - "eval_runtime": 10.5338, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 1.7118967816340507, + "eval_loss": 0.4411923587322235, + "eval_runtime": 3.5372, + "eval_samples_per_second": 28.271, + "eval_steps_per_second": 7.068, "step": 47500 }, { - "epoch": 1.67, - "learning_rate": 3.881927336775371e-05, - "loss": 0.3027, + "epoch": 1.7120769812952752, + "grad_norm": 0.2336321473121643, + "learning_rate": 3.8266333543506016e-05, + "loss": 0.4195, "step": 47505 }, { - "epoch": 1.67, - "learning_rate": 3.881689934387198e-05, - "loss": 0.2904, + "epoch": 1.7122571809564997, + "grad_norm": 0.22593377530574799, + "learning_rate": 3.826386007529399e-05, + "loss": 0.4147, "step": 47510 }, { - "epoch": 1.67, - "learning_rate": 3.881452514058691e-05, - "loss": 0.2958, + "epoch": 1.7124373806177244, + "grad_norm": 0.23086853325366974, + "learning_rate": 3.826138642636444e-05, + "loss": 0.4112, "step": 47515 }, { - "epoch": 1.67, - "learning_rate": 3.881215075792934e-05, - "loss": 0.2804, + "epoch": 1.7126175802789492, + "grad_norm": 0.2238939106464386, + "learning_rate": 3.8258912596751076e-05, + "loss": 0.3977, "step": 47520 }, { - "epoch": 1.67, - "learning_rate": 3.8809776195930095e-05, - "loss": 0.2713, + "epoch": 1.7127977799401737, + "grad_norm": 0.1743045598268509, + "learning_rate": 3.8256438586487584e-05, + "loss": 0.429, "step": 47525 }, { - "epoch": 1.67, - "learning_rate": 3.8807401454620004e-05, - "loss": 0.2963, + "epoch": 1.7129779796013982, + "grad_norm": 0.1635396033525467, + "learning_rate": 3.825396439560769e-05, + "loss": 0.3928, "step": 47530 }, { - "epoch": 1.67, - "learning_rate": 3.880502653402991e-05, - "loss": 0.2641, + "epoch": 1.713158179262623, + "grad_norm": 0.1982748806476593, + "learning_rate": 3.8251490024145085e-05, + "loss": 0.4473, "step": 47535 }, { - "epoch": 1.67, - "learning_rate": 3.880265143419065e-05, - "loss": 0.2932, + "epoch": 1.7133383789238477, + "grad_norm": 0.17309851944446564, + "learning_rate": 3.824901547213351e-05, + "loss": 0.3952, "step": 47540 }, { - "epoch": 1.67, - "learning_rate": 3.8800276155133044e-05, - "loss": 0.2958, + "epoch": 1.7135185785850724, + "grad_norm": 0.17767395079135895, + "learning_rate": 3.824654073960665e-05, + "loss": 0.379, "step": 47545 }, { - "epoch": 1.67, - "learning_rate": 3.879790069688795e-05, - "loss": 0.2934, + "epoch": 1.713698778246297, + "grad_norm": 0.19817490875720978, + "learning_rate": 3.824406582659824e-05, + "loss": 0.4156, "step": 47550 }, { - "epoch": 1.67, - "learning_rate": 3.8795525059486216e-05, - "loss": 0.284, + "epoch": 1.7138789779075214, + "grad_norm": 
0.20314614474773407, + "learning_rate": 3.8241590733142004e-05, + "loss": 0.3943, "step": 47555 }, { - "epoch": 1.67, - "learning_rate": 3.879314924295867e-05, - "loss": 0.2561, + "epoch": 1.7140591775687462, + "grad_norm": 0.17916688323020935, + "learning_rate": 3.823911545927166e-05, + "loss": 0.3994, "step": 47560 }, { - "epoch": 1.67, - "learning_rate": 3.8790773247336176e-05, - "loss": 0.2807, + "epoch": 1.7142393772299709, + "grad_norm": 0.1822926551103592, + "learning_rate": 3.823664000502093e-05, + "loss": 0.4236, "step": 47565 }, { - "epoch": 1.67, - "learning_rate": 3.8788397072649584e-05, - "loss": 0.2702, + "epoch": 1.7144195768911954, + "grad_norm": 0.1775011420249939, + "learning_rate": 3.823416437042353e-05, + "loss": 0.4446, "step": 47570 }, { - "epoch": 1.67, - "learning_rate": 3.8786020718929737e-05, - "loss": 0.2759, + "epoch": 1.7145997765524201, + "grad_norm": 0.1689702570438385, + "learning_rate": 3.823168855551321e-05, + "loss": 0.3789, "step": 47575 }, { - "epoch": 1.67, - "learning_rate": 3.8783644186207494e-05, - "loss": 0.2952, + "epoch": 1.7147799762136446, + "grad_norm": 0.17035454511642456, + "learning_rate": 3.82292125603237e-05, + "loss": 0.4214, "step": 47580 }, { - "epoch": 1.67, - "learning_rate": 3.878126747451373e-05, - "loss": 0.2847, + "epoch": 1.7149601758748694, + "grad_norm": 0.17879173159599304, + "learning_rate": 3.822673638488873e-05, + "loss": 0.4425, "step": 47585 }, { - "epoch": 1.67, - "learning_rate": 3.877889058387927e-05, - "loss": 0.2939, + "epoch": 1.715140375536094, + "grad_norm": 0.2069813311100006, + "learning_rate": 3.8224260029242034e-05, + "loss": 0.4517, "step": 47590 }, { - "epoch": 1.67, - "learning_rate": 3.8776513514335014e-05, - "loss": 0.3119, + "epoch": 1.7153205751973186, + "grad_norm": 0.16887204349040985, + "learning_rate": 3.822178349341735e-05, + "loss": 0.4043, "step": 47595 }, { - "epoch": 1.67, - "learning_rate": 3.8774136265911795e-05, - "loss": 0.2785, + "epoch": 1.7155007748585431, + "grad_norm": 0.16069333255290985, + "learning_rate": 3.821930677744843e-05, + "loss": 0.4107, "step": 47600 }, { - "epoch": 1.67, - "learning_rate": 3.87717588386405e-05, - "loss": 0.3148, + "epoch": 1.7156809745197679, + "grad_norm": 0.1842743456363678, + "learning_rate": 3.821682988136902e-05, + "loss": 0.4391, "step": 47605 }, { - "epoch": 1.68, - "learning_rate": 3.876938123255199e-05, - "loss": 0.2824, + "epoch": 1.7158611741809926, + "grad_norm": 0.2043282687664032, + "learning_rate": 3.821435280521286e-05, + "loss": 0.4216, "step": 47610 }, { - "epoch": 1.68, - "learning_rate": 3.8767003447677145e-05, - "loss": 0.3029, + "epoch": 1.7160413738422173, + "grad_norm": 0.14843593537807465, + "learning_rate": 3.82118755490137e-05, + "loss": 0.3988, "step": 47615 }, { - "epoch": 1.68, - "learning_rate": 3.876462548404683e-05, - "loss": 0.2829, + "epoch": 1.7162215735034418, + "grad_norm": 0.17393402755260468, + "learning_rate": 3.820939811280528e-05, + "loss": 0.4165, "step": 47620 }, { - "epoch": 1.68, - "learning_rate": 3.876224734169193e-05, - "loss": 0.2836, + "epoch": 1.7164017731646664, + "grad_norm": 0.20571184158325195, + "learning_rate": 3.820692049662139e-05, + "loss": 0.3767, "step": 47625 }, { - "epoch": 1.68, - "learning_rate": 3.87598690206433e-05, - "loss": 0.2844, + "epoch": 1.716581972825891, + "grad_norm": 0.18904875218868256, + "learning_rate": 3.820444270049576e-05, + "loss": 0.4407, "step": 47630 }, { - "epoch": 1.68, - "learning_rate": 3.875749052093186e-05, - "loss": 0.2848, + "epoch": 1.7167621724871158, + 
"grad_norm": 0.16259777545928955, + "learning_rate": 3.820196472446215e-05, + "loss": 0.4126, "step": 47635 }, { - "epoch": 1.68, - "learning_rate": 3.875511184258845e-05, - "loss": 0.287, + "epoch": 1.7169423721483403, + "grad_norm": 0.19046054780483246, + "learning_rate": 3.819948656855432e-05, + "loss": 0.3814, "step": 47640 }, { - "epoch": 1.68, - "learning_rate": 3.8752732985644e-05, - "loss": 0.2968, + "epoch": 1.7171225718095648, + "grad_norm": 0.17224107682704926, + "learning_rate": 3.819700823280605e-05, + "loss": 0.4453, "step": 47645 }, { - "epoch": 1.68, - "learning_rate": 3.875035395012936e-05, - "loss": 0.3014, + "epoch": 1.7173027714707896, + "grad_norm": 0.22785985469818115, + "learning_rate": 3.8194529717251095e-05, + "loss": 0.3798, "step": 47650 }, { - "epoch": 1.68, - "learning_rate": 3.8747974736075444e-05, - "loss": 0.2758, + "epoch": 1.7174829711320143, + "grad_norm": 0.17245115339756012, + "learning_rate": 3.819205102192323e-05, + "loss": 0.3987, "step": 47655 }, { - "epoch": 1.68, - "learning_rate": 3.874559534351314e-05, - "loss": 0.2948, + "epoch": 1.717663170793239, + "grad_norm": 0.17877335846424103, + "learning_rate": 3.818957214685622e-05, + "loss": 0.3945, "step": 47660 }, { - "epoch": 1.68, - "learning_rate": 3.8743215772473326e-05, - "loss": 0.3045, + "epoch": 1.7178433704544636, + "grad_norm": 0.20267999172210693, + "learning_rate": 3.8187093092083845e-05, + "loss": 0.4068, "step": 47665 }, { - "epoch": 1.68, - "learning_rate": 3.874083602298693e-05, - "loss": 0.2855, + "epoch": 1.718023570115688, + "grad_norm": 0.1948103904724121, + "learning_rate": 3.818461385763988e-05, + "loss": 0.4051, "step": 47670 }, { - "epoch": 1.68, - "learning_rate": 3.873845609508482e-05, - "loss": 0.2766, + "epoch": 1.7182037697769128, + "grad_norm": 0.1381908804178238, + "learning_rate": 3.81821344435581e-05, + "loss": 0.4079, "step": 47675 }, { - "epoch": 1.68, - "learning_rate": 3.873607598879791e-05, - "loss": 0.3062, + "epoch": 1.7183839694381375, + "grad_norm": 0.16724207997322083, + "learning_rate": 3.81796548498723e-05, + "loss": 0.387, "step": 47680 }, { - "epoch": 1.68, - "learning_rate": 3.8733695704157115e-05, - "loss": 0.2835, + "epoch": 1.718564169099362, + "grad_norm": 0.19141383469104767, + "learning_rate": 3.817717507661625e-05, + "loss": 0.4073, "step": 47685 }, { - "epoch": 1.68, - "learning_rate": 3.8731315241193324e-05, - "loss": 0.3016, + "epoch": 1.7187443687605868, + "grad_norm": 0.21919883787631989, + "learning_rate": 3.8174695123823734e-05, + "loss": 0.4621, "step": 47690 }, { - "epoch": 1.68, - "learning_rate": 3.872893459993745e-05, - "loss": 0.2987, + "epoch": 1.7189245684218113, + "grad_norm": 0.1990518867969513, + "learning_rate": 3.8172214991528554e-05, + "loss": 0.4047, "step": 47695 }, { - "epoch": 1.68, - "learning_rate": 3.872655378042043e-05, - "loss": 0.2697, + "epoch": 1.719104768083036, + "grad_norm": 0.18987984955310822, + "learning_rate": 3.8169734679764494e-05, + "loss": 0.4128, "step": 47700 }, { - "epoch": 1.68, - "learning_rate": 3.872417278267313e-05, - "loss": 0.2787, + "epoch": 1.7192849677442608, + "grad_norm": 0.18330170214176178, + "learning_rate": 3.816725418856535e-05, + "loss": 0.4213, "step": 47705 }, { - "epoch": 1.68, - "learning_rate": 3.87217916067265e-05, - "loss": 0.2992, + "epoch": 1.7194651674054853, + "grad_norm": 0.1505930870771408, + "learning_rate": 3.816477351796491e-05, + "loss": 0.4264, "step": 47710 }, { - "epoch": 1.68, - "learning_rate": 3.871941025261146e-05, - "loss": 0.2912, + "epoch": 
1.7196453670667098, + "grad_norm": 0.18332305550575256, + "learning_rate": 3.8162292667996986e-05, + "loss": 0.4012, "step": 47715 }, { - "epoch": 1.68, - "learning_rate": 3.8717028720358904e-05, - "loss": 0.2638, + "epoch": 1.7198255667279345, + "grad_norm": 0.21521466970443726, + "learning_rate": 3.815981163869537e-05, + "loss": 0.4301, "step": 47720 }, { - "epoch": 1.68, - "learning_rate": 3.871464700999978e-05, - "loss": 0.3052, + "epoch": 1.7200057663891593, + "grad_norm": 0.21533189713954926, + "learning_rate": 3.815733043009387e-05, + "loss": 0.4403, "step": 47725 }, { - "epoch": 1.68, - "learning_rate": 3.871226512156499e-05, - "loss": 0.2839, + "epoch": 1.720185966050384, + "grad_norm": 0.22986893355846405, + "learning_rate": 3.815484904222629e-05, + "loss": 0.4242, "step": 47730 }, { - "epoch": 1.68, - "learning_rate": 3.870988305508549e-05, - "loss": 0.291, + "epoch": 1.7203661657116085, + "grad_norm": 0.1726514995098114, + "learning_rate": 3.8152367475126436e-05, + "loss": 0.4429, "step": 47735 }, { - "epoch": 1.68, - "learning_rate": 3.870750081059219e-05, - "loss": 0.2973, + "epoch": 1.720546365372833, + "grad_norm": 0.18250474333763123, + "learning_rate": 3.814988572882813e-05, + "loss": 0.4109, "step": 47740 }, { - "epoch": 1.68, - "learning_rate": 3.870511838811601e-05, - "loss": 0.2885, + "epoch": 1.7207265650340577, + "grad_norm": 0.15575775504112244, + "learning_rate": 3.814740380336517e-05, + "loss": 0.3745, "step": 47745 }, { - "epoch": 1.68, - "learning_rate": 3.870273578768792e-05, - "loss": 0.2954, + "epoch": 1.7209067646952825, + "grad_norm": 0.23164531588554382, + "learning_rate": 3.814492169877138e-05, + "loss": 0.4285, "step": 47750 }, { - "epoch": 1.68, - "learning_rate": 3.870035300933883e-05, - "loss": 0.2994, + "epoch": 1.721086964356507, + "grad_norm": 0.18944543600082397, + "learning_rate": 3.814243941508058e-05, + "loss": 0.4425, "step": 47755 }, { - "epoch": 1.68, - "learning_rate": 3.869797005309969e-05, - "loss": 0.3017, + "epoch": 1.7212671640177315, + "grad_norm": 0.17386077344417572, + "learning_rate": 3.813995695232658e-05, + "loss": 0.4277, "step": 47760 }, { - "epoch": 1.68, - "learning_rate": 3.869558691900142e-05, - "loss": 0.3055, + "epoch": 1.7214473636789562, + "grad_norm": 0.16808000206947327, + "learning_rate": 3.813747431054321e-05, + "loss": 0.4155, "step": 47765 }, { - "epoch": 1.68, - "learning_rate": 3.869320360707499e-05, - "loss": 0.2589, + "epoch": 1.721627563340181, + "grad_norm": 0.21575501561164856, + "learning_rate": 3.8134991489764305e-05, + "loss": 0.4243, "step": 47770 }, { - "epoch": 1.68, - "learning_rate": 3.869082011735133e-05, - "loss": 0.2804, + "epoch": 1.7218077630014057, + "grad_norm": 0.16547565162181854, + "learning_rate": 3.8132508490023674e-05, + "loss": 0.4241, "step": 47775 }, { - "epoch": 1.68, - "learning_rate": 3.8688436449861406e-05, - "loss": 0.259, + "epoch": 1.7219879626626302, + "grad_norm": 0.17828123271465302, + "learning_rate": 3.813002531135517e-05, + "loss": 0.4356, "step": 47780 }, { - "epoch": 1.68, - "learning_rate": 3.8686052604636144e-05, - "loss": 0.2794, + "epoch": 1.7221681623238547, + "grad_norm": 0.21467016637325287, + "learning_rate": 3.81275419537926e-05, + "loss": 0.399, "step": 47785 }, { - "epoch": 1.68, - "learning_rate": 3.868366858170651e-05, - "loss": 0.285, + "epoch": 1.7223483619850795, + "grad_norm": 0.16447187960147858, + "learning_rate": 3.8125058417369824e-05, + "loss": 0.4181, "step": 47790 }, { - "epoch": 1.68, - "learning_rate": 3.868128438110346e-05, - "loss": 0.3079, 
+ "epoch": 1.7225285616463042, + "grad_norm": 0.17198550701141357, + "learning_rate": 3.812257470212066e-05, + "loss": 0.3742, "step": 47795 }, { - "epoch": 1.68, - "learning_rate": 3.867890000285795e-05, - "loss": 0.2741, + "epoch": 1.7227087613075287, + "grad_norm": 0.1887580305337906, + "learning_rate": 3.812009080807896e-05, + "loss": 0.3888, "step": 47800 }, { - "epoch": 1.68, - "learning_rate": 3.867651544700093e-05, - "loss": 0.2971, + "epoch": 1.7228889609687532, + "grad_norm": 0.18961556255817413, + "learning_rate": 3.8117606735278556e-05, + "loss": 0.44, "step": 47805 }, { - "epoch": 1.68, - "learning_rate": 3.8674130713563376e-05, - "loss": 0.285, + "epoch": 1.723069160629978, + "grad_norm": 0.18733921647071838, + "learning_rate": 3.811512248375332e-05, + "loss": 0.4275, "step": 47810 }, { - "epoch": 1.68, - "learning_rate": 3.8671745802576254e-05, - "loss": 0.2907, + "epoch": 1.7232493602912027, + "grad_norm": 0.17053326964378357, + "learning_rate": 3.811263805353705e-05, + "loss": 0.4142, "step": 47815 }, { - "epoch": 1.68, - "learning_rate": 3.866936071407051e-05, - "loss": 0.2883, + "epoch": 1.7234295599524274, + "grad_norm": 0.2137068808078766, + "learning_rate": 3.811015344466365e-05, + "loss": 0.4018, "step": 47820 }, { - "epoch": 1.68, - "learning_rate": 3.8666975448077135e-05, - "loss": 0.2868, + "epoch": 1.723609759613652, + "grad_norm": 0.21045798063278198, + "learning_rate": 3.810766865716693e-05, + "loss": 0.4379, "step": 47825 }, { - "epoch": 1.68, - "learning_rate": 3.866459000462708e-05, - "loss": 0.3158, + "epoch": 1.7237899592748764, + "grad_norm": 0.2026238739490509, + "learning_rate": 3.810518369108077e-05, + "loss": 0.4311, "step": 47830 }, { - "epoch": 1.68, - "learning_rate": 3.866220438375134e-05, - "loss": 0.3045, + "epoch": 1.7239701589361012, + "grad_norm": 0.1852739304304123, + "learning_rate": 3.8102698546439025e-05, + "loss": 0.397, "step": 47835 }, { - "epoch": 1.68, - "learning_rate": 3.865981858548087e-05, - "loss": 0.2977, + "epoch": 1.724150358597326, + "grad_norm": 0.20483826100826263, + "learning_rate": 3.810021322327554e-05, + "loss": 0.4485, "step": 47840 }, { - "epoch": 1.68, - "learning_rate": 3.865743260984667e-05, - "loss": 0.2843, + "epoch": 1.7243305582585506, + "grad_norm": 0.18066450953483582, + "learning_rate": 3.809772772162419e-05, + "loss": 0.4222, "step": 47845 }, { - "epoch": 1.68, - "learning_rate": 3.865504645687969e-05, - "loss": 0.3085, + "epoch": 1.7245107579197752, + "grad_norm": 0.1802929937839508, + "learning_rate": 3.809524204151883e-05, + "loss": 0.4147, "step": 47850 }, { - "epoch": 1.68, - "learning_rate": 3.865266012661095e-05, - "loss": 0.3038, + "epoch": 1.7246909575809997, + "grad_norm": 0.177862748503685, + "learning_rate": 3.809275618299335e-05, + "loss": 0.445, "step": 47855 }, { - "epoch": 1.68, - "learning_rate": 3.865027361907141e-05, - "loss": 0.2652, + "epoch": 1.7248711572422244, + "grad_norm": 0.1601111888885498, + "learning_rate": 3.809027014608159e-05, + "loss": 0.424, "step": 47860 }, { - "epoch": 1.68, - "learning_rate": 3.864788693429206e-05, - "loss": 0.2831, + "epoch": 1.7250513569034491, + "grad_norm": 0.1917547881603241, + "learning_rate": 3.808778393081742e-05, + "loss": 0.4559, "step": 47865 }, { - "epoch": 1.68, - "learning_rate": 3.8645500072303886e-05, - "loss": 0.2692, + "epoch": 1.7252315565646736, + "grad_norm": 0.1599884331226349, + "learning_rate": 3.808529753723475e-05, + "loss": 0.4261, "step": 47870 }, { - "epoch": 1.68, - "learning_rate": 3.86431130331379e-05, - "loss": 0.2932, 
+ "epoch": 1.7254117562258982, + "grad_norm": 0.1725659966468811, + "learning_rate": 3.808281096536742e-05, + "loss": 0.4202, "step": 47875 }, { - "epoch": 1.68, - "learning_rate": 3.8640725816825075e-05, - "loss": 0.2795, + "epoch": 1.7255919558871229, + "grad_norm": 0.177949458360672, + "learning_rate": 3.808032421524933e-05, + "loss": 0.431, "step": 47880 }, { - "epoch": 1.68, - "learning_rate": 3.8638338423396415e-05, - "loss": 0.2969, + "epoch": 1.7257721555483476, + "grad_norm": 0.20164379477500916, + "learning_rate": 3.807783728691435e-05, + "loss": 0.3792, "step": 47885 }, { - "epoch": 1.68, - "learning_rate": 3.863595085288292e-05, - "loss": 0.2695, + "epoch": 1.7259523552095724, + "grad_norm": 0.16568000614643097, + "learning_rate": 3.8075350180396376e-05, + "loss": 0.4351, "step": 47890 }, { - "epoch": 1.69, - "learning_rate": 3.8633563105315585e-05, - "loss": 0.2813, + "epoch": 1.7261325548707969, + "grad_norm": 0.1772766262292862, + "learning_rate": 3.807286289572929e-05, + "loss": 0.4139, "step": 47895 }, { - "epoch": 1.69, - "learning_rate": 3.863117518072542e-05, - "loss": 0.2802, + "epoch": 1.7263127545320214, + "grad_norm": 0.20613396167755127, + "learning_rate": 3.8070375432946965e-05, + "loss": 0.4108, "step": 47900 }, { - "epoch": 1.69, - "learning_rate": 3.8628787079143434e-05, - "loss": 0.2588, + "epoch": 1.726492954193246, + "grad_norm": 0.1459551453590393, + "learning_rate": 3.806788779208331e-05, + "loss": 0.4489, "step": 47905 }, { - "epoch": 1.69, - "learning_rate": 3.862639880060063e-05, - "loss": 0.298, + "epoch": 1.7266731538544708, + "grad_norm": 0.20022889971733093, + "learning_rate": 3.806539997317221e-05, + "loss": 0.438, "step": 47910 }, { - "epoch": 1.69, - "learning_rate": 3.8624010345128015e-05, - "loss": 0.2723, + "epoch": 1.7268533535156954, + "grad_norm": 0.1898205578327179, + "learning_rate": 3.806291197624758e-05, + "loss": 0.4088, "step": 47915 }, { - "epoch": 1.69, - "learning_rate": 3.8621621712756596e-05, - "loss": 0.2967, + "epoch": 1.7270335531769199, + "grad_norm": 0.18108853697776794, + "learning_rate": 3.8060423801343294e-05, + "loss": 0.4036, "step": 47920 }, { - "epoch": 1.69, - "learning_rate": 3.86192329035174e-05, - "loss": 0.2764, + "epoch": 1.7272137528381446, + "grad_norm": 0.21101535856723785, + "learning_rate": 3.805793544849326e-05, + "loss": 0.4368, "step": 47925 }, { - "epoch": 1.69, - "learning_rate": 3.8616843917441447e-05, - "loss": 0.2625, + "epoch": 1.7273939524993693, + "grad_norm": 0.16363751888275146, + "learning_rate": 3.8055446917731386e-05, + "loss": 0.4097, "step": 47930 }, { - "epoch": 1.69, - "learning_rate": 3.861445475455974e-05, - "loss": 0.3208, + "epoch": 1.727574152160594, + "grad_norm": 0.21080221235752106, + "learning_rate": 3.805295820909158e-05, + "loss": 0.4326, "step": 47935 }, { - "epoch": 1.69, - "learning_rate": 3.861206541490332e-05, - "loss": 0.2718, + "epoch": 1.7277543518218186, + "grad_norm": 0.18741890788078308, + "learning_rate": 3.805046932260774e-05, + "loss": 0.4329, "step": 47940 }, { - "epoch": 1.69, - "learning_rate": 3.86096758985032e-05, - "loss": 0.2794, + "epoch": 1.727934551483043, + "grad_norm": 0.20189730823040009, + "learning_rate": 3.804798025831379e-05, + "loss": 0.4618, "step": 47945 }, { - "epoch": 1.69, - "learning_rate": 3.86072862053904e-05, - "loss": 0.2741, + "epoch": 1.7281147511442678, + "grad_norm": 0.16234025359153748, + "learning_rate": 3.804549101624362e-05, + "loss": 0.3968, "step": 47950 }, { - "epoch": 1.69, - "learning_rate": 3.860489633559596e-05, - 
"loss": 0.2908, + "epoch": 1.7282949508054926, + "grad_norm": 0.17763473093509674, + "learning_rate": 3.804300159643117e-05, + "loss": 0.3923, "step": 47955 }, { - "epoch": 1.69, - "learning_rate": 3.860250628915091e-05, - "loss": 0.2994, + "epoch": 1.728475150466717, + "grad_norm": 0.20635025203227997, + "learning_rate": 3.804051199891035e-05, + "loss": 0.3992, "step": 47960 }, { - "epoch": 1.69, - "learning_rate": 3.860011606608628e-05, - "loss": 0.3038, + "epoch": 1.7286553501279418, + "grad_norm": 0.1754833310842514, + "learning_rate": 3.803802222371507e-05, + "loss": 0.4374, "step": 47965 }, { - "epoch": 1.69, - "learning_rate": 3.85977256664331e-05, - "loss": 0.2753, + "epoch": 1.7288355497891663, + "grad_norm": 0.17620554566383362, + "learning_rate": 3.803553227087928e-05, + "loss": 0.409, "step": 47970 }, { - "epoch": 1.69, - "learning_rate": 3.8595335090222414e-05, - "loss": 0.286, + "epoch": 1.729015749450391, + "grad_norm": 0.1946227103471756, + "learning_rate": 3.803304214043687e-05, + "loss": 0.4026, "step": 47975 }, { - "epoch": 1.69, - "learning_rate": 3.859294433748527e-05, - "loss": 0.297, + "epoch": 1.7291959491116158, + "grad_norm": 0.22340406477451324, + "learning_rate": 3.803055183242179e-05, + "loss": 0.3881, "step": 47980 }, { - "epoch": 1.69, - "learning_rate": 3.85905534082527e-05, - "loss": 0.2689, + "epoch": 1.7293761487728403, + "grad_norm": 0.16771641373634338, + "learning_rate": 3.8028061346867963e-05, + "loss": 0.4304, "step": 47985 }, { - "epoch": 1.69, - "learning_rate": 3.8588162302555744e-05, - "loss": 0.2753, + "epoch": 1.7295563484340648, + "grad_norm": 0.16595138609409332, + "learning_rate": 3.802557068380932e-05, + "loss": 0.4088, "step": 47990 }, { - "epoch": 1.69, - "learning_rate": 3.8585771020425454e-05, - "loss": 0.2754, + "epoch": 1.7297365480952895, + "grad_norm": 0.22411175072193146, + "learning_rate": 3.80230798432798e-05, + "loss": 0.4289, "step": 47995 }, { - "epoch": 1.69, - "learning_rate": 3.858337956189289e-05, - "loss": 0.2888, + "epoch": 1.7299167477565143, + "grad_norm": 0.2403663694858551, + "learning_rate": 3.802058882531334e-05, + "loss": 0.4267, "step": 48000 }, { - "epoch": 1.69, - "eval_loss": 0.2754231095314026, - "eval_runtime": 10.5474, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 1.7299167477565143, + "eval_loss": 0.44221970438957214, + "eval_runtime": 3.5221, + "eval_samples_per_second": 28.392, + "eval_steps_per_second": 7.098, "step": 48000 }, { - "epoch": 1.69, - "learning_rate": 3.858098792698909e-05, - "loss": 0.2768, + "epoch": 1.730096947417739, + "grad_norm": 0.1950840801000595, + "learning_rate": 3.801809762994387e-05, + "loss": 0.4326, "step": 48005 }, { - "epoch": 1.69, - "learning_rate": 3.8578596115745114e-05, - "loss": 0.2832, + "epoch": 1.7302771470789635, + "grad_norm": 0.18309806287288666, + "learning_rate": 3.801560625720535e-05, + "loss": 0.4377, "step": 48010 }, { - "epoch": 1.69, - "learning_rate": 3.857620412819201e-05, - "loss": 0.2764, + "epoch": 1.730457346740188, + "grad_norm": 0.17351049184799194, + "learning_rate": 3.8013114707131716e-05, + "loss": 0.3877, "step": 48015 }, { - "epoch": 1.69, - "learning_rate": 3.8573811964360844e-05, - "loss": 0.2743, + "epoch": 1.7306375464014128, + "grad_norm": 0.18052922189235687, + "learning_rate": 3.80106229797569e-05, + "loss": 0.3962, "step": 48020 }, { - "epoch": 1.69, - "learning_rate": 3.8571419624282676e-05, - "loss": 0.2823, + "epoch": 1.7308177460626375, + "grad_norm": 0.17976753413677216, + "learning_rate": 
3.8008131075114886e-05, + "loss": 0.3876, "step": 48025 }, { - "epoch": 1.69, - "learning_rate": 3.856902710798857e-05, - "loss": 0.2791, + "epoch": 1.730997945723862, + "grad_norm": 0.1913975179195404, + "learning_rate": 3.800563899323959e-05, + "loss": 0.3877, "step": 48030 }, { - "epoch": 1.69, - "learning_rate": 3.8566634415509584e-05, - "loss": 0.2706, + "epoch": 1.7311781453850865, + "grad_norm": 0.19166803359985352, + "learning_rate": 3.800314673416498e-05, + "loss": 0.4104, "step": 48035 }, { - "epoch": 1.69, - "learning_rate": 3.85642415468768e-05, - "loss": 0.2822, + "epoch": 1.7313583450463113, + "grad_norm": 0.19192728400230408, + "learning_rate": 3.800065429792501e-05, + "loss": 0.3646, "step": 48040 }, { - "epoch": 1.69, - "learning_rate": 3.856184850212128e-05, - "loss": 0.2911, + "epoch": 1.731538544707536, + "grad_norm": 0.21600832045078278, + "learning_rate": 3.7998161684553656e-05, + "loss": 0.4372, "step": 48045 }, { - "epoch": 1.69, - "learning_rate": 3.855945528127408e-05, - "loss": 0.26, + "epoch": 1.7317187443687607, + "grad_norm": 0.20406126976013184, + "learning_rate": 3.799566889408486e-05, + "loss": 0.4218, "step": 48050 }, { - "epoch": 1.69, - "learning_rate": 3.8557061884366294e-05, - "loss": 0.2653, + "epoch": 1.7318989440299852, + "grad_norm": 0.17205806076526642, + "learning_rate": 3.7993175926552596e-05, + "loss": 0.3918, "step": 48055 }, { - "epoch": 1.69, - "learning_rate": 3.8554668311429e-05, - "loss": 0.3004, + "epoch": 1.7320791436912097, + "grad_norm": 0.15462535619735718, + "learning_rate": 3.799068278199081e-05, + "loss": 0.3719, "step": 48060 }, { - "epoch": 1.69, - "learning_rate": 3.855227456249327e-05, - "loss": 0.2668, + "epoch": 1.7322593433524345, + "grad_norm": 0.2172221541404724, + "learning_rate": 3.7988189460433496e-05, + "loss": 0.4315, "step": 48065 }, { - "epoch": 1.69, - "learning_rate": 3.854988063759019e-05, - "loss": 0.2933, + "epoch": 1.7324395430136592, + "grad_norm": 0.23806485533714294, + "learning_rate": 3.7985695961914614e-05, + "loss": 0.4457, "step": 48070 }, { - "epoch": 1.69, - "learning_rate": 3.854748653675083e-05, - "loss": 0.3116, + "epoch": 1.7326197426748837, + "grad_norm": 0.19144880771636963, + "learning_rate": 3.798320228646814e-05, + "loss": 0.4054, "step": 48075 }, { - "epoch": 1.69, - "learning_rate": 3.8545092260006287e-05, - "loss": 0.2593, + "epoch": 1.7327999423361085, + "grad_norm": 0.13344308733940125, + "learning_rate": 3.798070843412805e-05, + "loss": 0.4203, "step": 48080 }, { - "epoch": 1.69, - "learning_rate": 3.8542697807387656e-05, - "loss": 0.2967, + "epoch": 1.732980141997333, + "grad_norm": 0.17505653202533722, + "learning_rate": 3.7978214404928315e-05, + "loss": 0.4313, "step": 48085 }, { - "epoch": 1.69, - "learning_rate": 3.854030317892601e-05, - "loss": 0.2844, + "epoch": 1.7331603416585577, + "grad_norm": 0.15916766226291656, + "learning_rate": 3.7975720198902924e-05, + "loss": 0.3778, "step": 48090 }, { - "epoch": 1.69, - "learning_rate": 3.8537908374652455e-05, - "loss": 0.3032, + "epoch": 1.7333405413197824, + "grad_norm": 0.2202104777097702, + "learning_rate": 3.797322581608585e-05, + "loss": 0.4379, "step": 48095 }, { - "epoch": 1.69, - "learning_rate": 3.853551339459808e-05, - "loss": 0.2862, + "epoch": 1.733520740981007, + "grad_norm": 0.17521396279335022, + "learning_rate": 3.7970731256511104e-05, + "loss": 0.4141, "step": 48100 }, { - "epoch": 1.69, - "learning_rate": 3.8533118238793985e-05, - "loss": 0.2812, + "epoch": 1.7337009406422315, + "grad_norm": 0.1417665183544159, + 
"learning_rate": 3.796823652021265e-05, + "loss": 0.3653, "step": 48105 }, { - "epoch": 1.69, - "learning_rate": 3.853072290727127e-05, - "loss": 0.2666, + "epoch": 1.7338811403034562, + "grad_norm": 0.20344464480876923, + "learning_rate": 3.796574160722448e-05, + "loss": 0.4285, "step": 48110 }, { - "epoch": 1.69, - "learning_rate": 3.852832740006103e-05, - "loss": 0.2676, + "epoch": 1.734061339964681, + "grad_norm": 0.1791670322418213, + "learning_rate": 3.796324651758059e-05, + "loss": 0.4186, "step": 48115 }, { - "epoch": 1.69, - "learning_rate": 3.852593171719437e-05, - "loss": 0.2995, + "epoch": 1.7342415396259057, + "grad_norm": 0.19157084822654724, + "learning_rate": 3.796075125131498e-05, + "loss": 0.4068, "step": 48120 }, { - "epoch": 1.69, - "learning_rate": 3.852353585870242e-05, - "loss": 0.297, + "epoch": 1.7344217392871302, + "grad_norm": 0.19911465048789978, + "learning_rate": 3.795825580846164e-05, + "loss": 0.4383, "step": 48125 }, { - "epoch": 1.69, - "learning_rate": 3.852113982461625e-05, - "loss": 0.2822, + "epoch": 1.7346019389483547, + "grad_norm": 0.1901884824037552, + "learning_rate": 3.7955760189054566e-05, + "loss": 0.4327, "step": 48130 }, { - "epoch": 1.69, - "learning_rate": 3.8518743614967e-05, - "loss": 0.2708, + "epoch": 1.7347821386095794, + "grad_norm": 0.17567721009254456, + "learning_rate": 3.7953264393127774e-05, + "loss": 0.406, "step": 48135 }, { - "epoch": 1.69, - "learning_rate": 3.8516347229785765e-05, - "loss": 0.2885, + "epoch": 1.7349623382708041, + "grad_norm": 0.20942769944667816, + "learning_rate": 3.795076842071526e-05, + "loss": 0.4276, "step": 48140 }, { - "epoch": 1.69, - "learning_rate": 3.8513950669103676e-05, - "loss": 0.2671, + "epoch": 1.7351425379320287, + "grad_norm": 0.18025130033493042, + "learning_rate": 3.794827227185105e-05, + "loss": 0.4079, "step": 48145 }, { - "epoch": 1.69, - "learning_rate": 3.851155393295184e-05, - "loss": 0.2828, + "epoch": 1.7353227375932532, + "grad_norm": 0.21213695406913757, + "learning_rate": 3.794577594656912e-05, + "loss": 0.4179, "step": 48150 }, { - "epoch": 1.69, - "learning_rate": 3.850915702136139e-05, - "loss": 0.2785, + "epoch": 1.735502937254478, + "grad_norm": 0.18721318244934082, + "learning_rate": 3.79432794449035e-05, + "loss": 0.4373, "step": 48155 }, { - "epoch": 1.69, - "learning_rate": 3.850675993436342e-05, - "loss": 0.266, + "epoch": 1.7356831369157026, + "grad_norm": 0.2057793140411377, + "learning_rate": 3.794078276688822e-05, + "loss": 0.3952, "step": 48160 }, { - "epoch": 1.69, - "learning_rate": 3.850436267198909e-05, - "loss": 0.28, + "epoch": 1.7358633365769274, + "grad_norm": 0.1834101676940918, + "learning_rate": 3.7938285912557256e-05, + "loss": 0.4069, "step": 48165 }, { - "epoch": 1.69, - "learning_rate": 3.85019652342695e-05, - "loss": 0.2823, + "epoch": 1.7360435362381519, + "grad_norm": 0.21136592328548431, + "learning_rate": 3.793578888194467e-05, + "loss": 0.4132, "step": 48170 }, { - "epoch": 1.69, - "learning_rate": 3.84995676212358e-05, - "loss": 0.299, + "epoch": 1.7362237358993764, + "grad_norm": 0.1626768410205841, + "learning_rate": 3.793329167508445e-05, + "loss": 0.359, "step": 48175 }, { - "epoch": 1.7, - "learning_rate": 3.8497169832919105e-05, - "loss": 0.2796, + "epoch": 1.7364039355606011, + "grad_norm": 0.1921762079000473, + "learning_rate": 3.7930794292010654e-05, + "loss": 0.4221, "step": 48180 }, { - "epoch": 1.7, - "learning_rate": 3.8494771869350555e-05, - "loss": 0.2853, + "epoch": 1.7365841352218259, + "grad_norm": 0.17365401983261108, + 
"learning_rate": 3.7928296732757276e-05, + "loss": 0.4299, "step": 48185 }, { - "epoch": 1.7, - "learning_rate": 3.849237373056128e-05, - "loss": 0.2868, + "epoch": 1.7367643348830504, + "grad_norm": 0.18681730329990387, + "learning_rate": 3.7925798997358354e-05, + "loss": 0.4318, "step": 48190 }, { - "epoch": 1.7, - "learning_rate": 3.8489975416582436e-05, - "loss": 0.2774, + "epoch": 1.736944534544275, + "grad_norm": 0.15825678408145905, + "learning_rate": 3.792330108584793e-05, + "loss": 0.3875, "step": 48195 }, { - "epoch": 1.7, - "learning_rate": 3.848757692744515e-05, - "loss": 0.2843, + "epoch": 1.7371247342054996, + "grad_norm": 0.17392268776893616, + "learning_rate": 3.792080299826003e-05, + "loss": 0.4145, "step": 48200 }, { - "epoch": 1.7, - "learning_rate": 3.848517826318055e-05, - "loss": 0.2581, + "epoch": 1.7373049338667244, + "grad_norm": 0.17445778846740723, + "learning_rate": 3.79183047346287e-05, + "loss": 0.3871, "step": 48205 }, { - "epoch": 1.7, - "learning_rate": 3.848277942381981e-05, - "loss": 0.3146, + "epoch": 1.737485133527949, + "grad_norm": 0.18119192123413086, + "learning_rate": 3.7915806294987955e-05, + "loss": 0.3867, "step": 48210 }, { - "epoch": 1.7, - "learning_rate": 3.8480380409394066e-05, - "loss": 0.298, + "epoch": 1.7376653331891736, + "grad_norm": 0.20309992134571075, + "learning_rate": 3.7913307679371856e-05, + "loss": 0.4133, "step": 48215 }, { - "epoch": 1.7, - "learning_rate": 3.8477981219934464e-05, - "loss": 0.2889, + "epoch": 1.7378455328503981, + "grad_norm": 0.1670549362897873, + "learning_rate": 3.791080888781444e-05, + "loss": 0.405, "step": 48220 }, { - "epoch": 1.7, - "learning_rate": 3.8475581855472156e-05, - "loss": 0.2909, + "epoch": 1.7380257325116228, + "grad_norm": 0.162832111120224, + "learning_rate": 3.790830992034974e-05, + "loss": 0.3856, "step": 48225 }, { - "epoch": 1.7, - "learning_rate": 3.8473182316038306e-05, - "loss": 0.3038, + "epoch": 1.7382059321728476, + "grad_norm": 0.16759029030799866, + "learning_rate": 3.7905810777011837e-05, + "loss": 0.3806, "step": 48230 }, { - "epoch": 1.7, - "learning_rate": 3.847078260166406e-05, - "loss": 0.3052, + "epoch": 1.7383861318340723, + "grad_norm": 0.18382543325424194, + "learning_rate": 3.790331145783474e-05, + "loss": 0.3809, "step": 48235 }, { - "epoch": 1.7, - "learning_rate": 3.846838271238058e-05, - "loss": 0.2841, + "epoch": 1.7385663314952968, + "grad_norm": 0.18206432461738586, + "learning_rate": 3.7900811962852544e-05, + "loss": 0.4206, "step": 48240 }, { - "epoch": 1.7, - "learning_rate": 3.846598264821901e-05, - "loss": 0.2738, + "epoch": 1.7387465311565213, + "grad_norm": 0.17950773239135742, + "learning_rate": 3.789831229209927e-05, + "loss": 0.436, "step": 48245 }, { - "epoch": 1.7, - "learning_rate": 3.8463582409210554e-05, - "loss": 0.3019, + "epoch": 1.738926730817746, + "grad_norm": 0.16286823153495789, + "learning_rate": 3.7895812445608994e-05, + "loss": 0.4264, "step": 48250 }, { - "epoch": 1.7, - "learning_rate": 3.8461181995386335e-05, - "loss": 0.2845, + "epoch": 1.7391069304789708, + "grad_norm": 0.18310612440109253, + "learning_rate": 3.789331242341576e-05, + "loss": 0.4107, "step": 48255 }, { - "epoch": 1.7, - "learning_rate": 3.845878140677756e-05, - "loss": 0.3173, + "epoch": 1.7392871301401953, + "grad_norm": 0.17169633507728577, + "learning_rate": 3.789081222555365e-05, + "loss": 0.4303, "step": 48260 }, { - "epoch": 1.7, - "learning_rate": 3.845638064341536e-05, - "loss": 0.269, + "epoch": 1.7394673298014198, + "grad_norm": 0.20822210609912872, + 
"learning_rate": 3.7888311852056725e-05, + "loss": 0.409, "step": 48265 }, { - "epoch": 1.7, - "learning_rate": 3.8453979705330926e-05, - "loss": 0.2784, + "epoch": 1.7396475294626446, + "grad_norm": 0.18613308668136597, + "learning_rate": 3.788581130295903e-05, + "loss": 0.3899, "step": 48270 }, { - "epoch": 1.7, - "learning_rate": 3.845157859255543e-05, - "loss": 0.2945, + "epoch": 1.7398277291238693, + "grad_norm": 0.1604495495557785, + "learning_rate": 3.788331057829466e-05, + "loss": 0.413, "step": 48275 }, { - "epoch": 1.7, - "learning_rate": 3.844917730512005e-05, - "loss": 0.3041, + "epoch": 1.740007928785094, + "grad_norm": 0.18513786792755127, + "learning_rate": 3.788080967809767e-05, + "loss": 0.4236, "step": 48280 }, { - "epoch": 1.7, - "learning_rate": 3.844677584305598e-05, - "loss": 0.2842, + "epoch": 1.7401881284463185, + "grad_norm": 0.20122095942497253, + "learning_rate": 3.7878308602402156e-05, + "loss": 0.3951, "step": 48285 }, { - "epoch": 1.7, - "learning_rate": 3.844437420639439e-05, - "loss": 0.2791, + "epoch": 1.740368328107543, + "grad_norm": 0.22590294480323792, + "learning_rate": 3.787580735124217e-05, + "loss": 0.4287, "step": 48290 }, { - "epoch": 1.7, - "learning_rate": 3.844197239516645e-05, - "loss": 0.2651, + "epoch": 1.7405485277687678, + "grad_norm": 0.15910333395004272, + "learning_rate": 3.78733059246518e-05, + "loss": 0.4064, "step": 48295 }, { - "epoch": 1.7, - "learning_rate": 3.8439570409403355e-05, - "loss": 0.2748, + "epoch": 1.7407287274299925, + "grad_norm": 0.20325346291065216, + "learning_rate": 3.787080432266514e-05, + "loss": 0.415, "step": 48300 }, { - "epoch": 1.7, - "learning_rate": 3.8437168249136294e-05, - "loss": 0.302, + "epoch": 1.740908927091217, + "grad_norm": 0.16542761027812958, + "learning_rate": 3.786830254531626e-05, + "loss": 0.4409, "step": 48305 }, { - "epoch": 1.7, - "learning_rate": 3.8434765914396465e-05, - "loss": 0.259, + "epoch": 1.7410891267524415, + "grad_norm": 0.19882872700691223, + "learning_rate": 3.7865800592639245e-05, + "loss": 0.4755, "step": 48310 }, { - "epoch": 1.7, - "learning_rate": 3.843236340521505e-05, - "loss": 0.2918, + "epoch": 1.7412693264136663, + "grad_norm": 0.17635419964790344, + "learning_rate": 3.786329846466818e-05, + "loss": 0.3941, "step": 48315 }, { - "epoch": 1.7, - "learning_rate": 3.8429960721623255e-05, - "loss": 0.2759, + "epoch": 1.741449526074891, + "grad_norm": 0.22185000777244568, + "learning_rate": 3.786079616143718e-05, + "loss": 0.4116, "step": 48320 }, { - "epoch": 1.7, - "learning_rate": 3.842755786365227e-05, - "loss": 0.3081, + "epoch": 1.7416297257361157, + "grad_norm": 0.21445868909358978, + "learning_rate": 3.7858293682980315e-05, + "loss": 0.4251, "step": 48325 }, { - "epoch": 1.7, - "learning_rate": 3.8425154831333287e-05, - "loss": 0.2736, + "epoch": 1.7418099253973403, + "grad_norm": 0.18230614066123962, + "learning_rate": 3.785579102933168e-05, + "loss": 0.4337, "step": 48330 }, { - "epoch": 1.7, - "learning_rate": 3.8422751624697526e-05, - "loss": 0.2834, + "epoch": 1.7419901250585648, + "grad_norm": 0.18611697852611542, + "learning_rate": 3.7853288200525394e-05, + "loss": 0.4349, "step": 48335 }, { - "epoch": 1.7, - "learning_rate": 3.842034824377617e-05, - "loss": 0.2849, + "epoch": 1.7421703247197895, + "grad_norm": 0.17176128923892975, + "learning_rate": 3.785078519659554e-05, + "loss": 0.4123, "step": 48340 }, { - "epoch": 1.7, - "learning_rate": 3.8417944688600446e-05, - "loss": 0.2805, + "epoch": 1.7423505243810142, + "grad_norm": 0.18628238141536713, + 
"learning_rate": 3.784828201757623e-05, + "loss": 0.417, "step": 48345 }, { - "epoch": 1.7, - "learning_rate": 3.8415540959201544e-05, - "loss": 0.308, + "epoch": 1.742530724042239, + "grad_norm": 0.17633478343486786, + "learning_rate": 3.784577866350155e-05, + "loss": 0.4247, "step": 48350 }, { - "epoch": 1.7, - "learning_rate": 3.8413137055610696e-05, - "loss": 0.259, + "epoch": 1.7427109237034635, + "grad_norm": 0.17602220177650452, + "learning_rate": 3.7843275134405645e-05, + "loss": 0.4179, "step": 48355 }, { - "epoch": 1.7, - "learning_rate": 3.8410732977859086e-05, - "loss": 0.2816, + "epoch": 1.742891123364688, + "grad_norm": 0.15746374428272247, + "learning_rate": 3.7840771430322586e-05, + "loss": 0.4164, "step": 48360 }, { - "epoch": 1.7, - "learning_rate": 3.8408328725977966e-05, - "loss": 0.2959, + "epoch": 1.7430713230259127, + "grad_norm": 0.22847017645835876, + "learning_rate": 3.7838267551286504e-05, + "loss": 0.4151, "step": 48365 }, { - "epoch": 1.7, - "learning_rate": 3.840592429999852e-05, - "loss": 0.2828, + "epoch": 1.7432515226871375, + "grad_norm": 0.19369705021381378, + "learning_rate": 3.783576349733152e-05, + "loss": 0.4186, "step": 48370 }, { - "epoch": 1.7, - "learning_rate": 3.8403519699951996e-05, - "loss": 0.3202, + "epoch": 1.743431722348362, + "grad_norm": 0.1731320321559906, + "learning_rate": 3.7833259268491735e-05, + "loss": 0.3926, "step": 48375 }, { - "epoch": 1.7, - "learning_rate": 3.840111492586959e-05, - "loss": 0.2739, + "epoch": 1.7436119220095865, + "grad_norm": 0.17335784435272217, + "learning_rate": 3.7830754864801284e-05, + "loss": 0.4057, "step": 48380 }, { - "epoch": 1.7, - "learning_rate": 3.839870997778253e-05, - "loss": 0.2863, + "epoch": 1.7437921216708112, + "grad_norm": 0.1906140148639679, + "learning_rate": 3.782825028629428e-05, + "loss": 0.438, "step": 48385 }, { - "epoch": 1.7, - "learning_rate": 3.839630485572206e-05, - "loss": 0.2788, + "epoch": 1.743972321332036, + "grad_norm": 0.21084898710250854, + "learning_rate": 3.782574553300485e-05, + "loss": 0.4079, "step": 48390 }, { - "epoch": 1.7, - "learning_rate": 3.83938995597194e-05, - "loss": 0.2885, + "epoch": 1.7441525209932607, + "grad_norm": 0.2448464035987854, + "learning_rate": 3.7823240604967116e-05, + "loss": 0.4115, "step": 48395 }, { - "epoch": 1.7, - "learning_rate": 3.839149408980579e-05, - "loss": 0.2836, + "epoch": 1.7443327206544852, + "grad_norm": 0.185360386967659, + "learning_rate": 3.782073550221521e-05, + "loss": 0.4011, "step": 48400 }, { - "epoch": 1.7, - "learning_rate": 3.8389088446012444e-05, - "loss": 0.284, + "epoch": 1.7445129203157097, + "grad_norm": 0.16345396637916565, + "learning_rate": 3.781823022478327e-05, + "loss": 0.4431, "step": 48405 }, { - "epoch": 1.7, - "learning_rate": 3.838668262837061e-05, - "loss": 0.3085, + "epoch": 1.7446931199769344, + "grad_norm": 0.13991603255271912, + "learning_rate": 3.7815724772705423e-05, + "loss": 0.4147, "step": 48410 }, { - "epoch": 1.7, - "learning_rate": 3.838427663691153e-05, - "loss": 0.2612, + "epoch": 1.7448733196381592, + "grad_norm": 0.15714098513126373, + "learning_rate": 3.781321914601581e-05, + "loss": 0.3805, "step": 48415 }, { - "epoch": 1.7, - "learning_rate": 3.838187047166644e-05, - "loss": 0.2742, + "epoch": 1.7450535192993837, + "grad_norm": 0.20374760031700134, + "learning_rate": 3.781071334474856e-05, + "loss": 0.4347, "step": 48420 }, { - "epoch": 1.7, - "learning_rate": 3.837946413266657e-05, - "loss": 0.3048, + "epoch": 1.7452337189606082, + "grad_norm": 0.22393597662448883, + 
"learning_rate": 3.780820736893783e-05, + "loss": 0.441, "step": 48425 }, { - "epoch": 1.7, - "learning_rate": 3.837705761994318e-05, - "loss": 0.2909, + "epoch": 1.745413918621833, + "grad_norm": 0.23388418555259705, + "learning_rate": 3.780570121861775e-05, + "loss": 0.4262, "step": 48430 }, { - "epoch": 1.7, - "learning_rate": 3.8374650933527514e-05, - "loss": 0.2658, + "epoch": 1.7455941182830577, + "grad_norm": 0.23228850960731506, + "learning_rate": 3.7803194893822466e-05, + "loss": 0.4326, "step": 48435 }, { - "epoch": 1.7, - "learning_rate": 3.837224407345082e-05, - "loss": 0.2941, + "epoch": 1.7457743179442824, + "grad_norm": 0.24053829908370972, + "learning_rate": 3.780068839458614e-05, + "loss": 0.4341, "step": 48440 }, { - "epoch": 1.7, - "learning_rate": 3.836983703974435e-05, - "loss": 0.3045, + "epoch": 1.745954517605507, + "grad_norm": 0.1622568517923355, + "learning_rate": 3.779818172094291e-05, + "loss": 0.3959, "step": 48445 }, { - "epoch": 1.7, - "learning_rate": 3.8367429832439356e-05, - "loss": 0.277, + "epoch": 1.7461347172667314, + "grad_norm": 0.20720703899860382, + "learning_rate": 3.779567487292693e-05, + "loss": 0.4108, "step": 48450 }, { - "epoch": 1.7, - "learning_rate": 3.836502245156709e-05, - "loss": 0.2902, + "epoch": 1.7463149169279562, + "grad_norm": 0.20429673790931702, + "learning_rate": 3.779316785057235e-05, + "loss": 0.434, "step": 48455 }, { - "epoch": 1.7, - "learning_rate": 3.836261489715882e-05, - "loss": 0.2796, + "epoch": 1.7464951165891809, + "grad_norm": 0.23048081994056702, + "learning_rate": 3.7790660653913346e-05, + "loss": 0.4158, "step": 48460 }, { - "epoch": 1.71, - "learning_rate": 3.83602071692458e-05, - "loss": 0.2701, + "epoch": 1.7466753162504054, + "grad_norm": 0.19923274219036102, + "learning_rate": 3.778815328298406e-05, + "loss": 0.4146, "step": 48465 }, { - "epoch": 1.71, - "learning_rate": 3.83577992678593e-05, - "loss": 0.2802, + "epoch": 1.7468555159116301, + "grad_norm": 0.18411320447921753, + "learning_rate": 3.778564573781866e-05, + "loss": 0.4568, "step": 48470 }, { - "epoch": 1.71, - "learning_rate": 3.835539119303058e-05, - "loss": 0.2777, + "epoch": 1.7470357155728546, + "grad_norm": 0.20448732376098633, + "learning_rate": 3.778313801845132e-05, + "loss": 0.4085, "step": 48475 }, { - "epoch": 1.71, - "learning_rate": 3.8352982944790905e-05, - "loss": 0.3027, + "epoch": 1.7472159152340794, + "grad_norm": 0.18641623854637146, + "learning_rate": 3.7780630124916195e-05, + "loss": 0.41, "step": 48480 }, { - "epoch": 1.71, - "learning_rate": 3.835057452317154e-05, - "loss": 0.2844, + "epoch": 1.747396114895304, + "grad_norm": 0.1633073091506958, + "learning_rate": 3.7778122057247464e-05, + "loss": 0.4043, "step": 48485 }, { - "epoch": 1.71, - "learning_rate": 3.8348165928203766e-05, - "loss": 0.2777, + "epoch": 1.7475763145565286, + "grad_norm": 0.15964628756046295, + "learning_rate": 3.777561381547929e-05, + "loss": 0.3637, "step": 48490 }, { - "epoch": 1.71, - "learning_rate": 3.8345757159918855e-05, - "loss": 0.3131, + "epoch": 1.7477565142177531, + "grad_norm": 0.15613767504692078, + "learning_rate": 3.7773105399645845e-05, + "loss": 0.402, "step": 48495 }, { - "epoch": 1.71, - "learning_rate": 3.8343348218348086e-05, - "loss": 0.2874, + "epoch": 1.7479367138789779, + "grad_norm": 0.1990021914243698, + "learning_rate": 3.777059680978132e-05, + "loss": 0.4074, "step": 48500 }, { - "epoch": 1.71, - "eval_loss": 0.27594712376594543, - "eval_runtime": 10.5531, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 
9.476, + "epoch": 1.7479367138789779, + "eval_loss": 0.4423659145832062, + "eval_runtime": 3.5418, + "eval_samples_per_second": 28.234, + "eval_steps_per_second": 7.059, "step": 48500 }, { - "epoch": 1.71, - "learning_rate": 3.834093910352273e-05, - "loss": 0.2925, + "epoch": 1.7481169135402026, + "grad_norm": 0.20178571343421936, + "learning_rate": 3.776808804591989e-05, + "loss": 0.4374, "step": 48505 }, { - "epoch": 1.71, - "learning_rate": 3.833852981547407e-05, - "loss": 0.2861, + "epoch": 1.7482971132014273, + "grad_norm": 0.16582556068897247, + "learning_rate": 3.776557910809572e-05, + "loss": 0.4201, "step": 48510 }, { - "epoch": 1.71, - "learning_rate": 3.8336120354233395e-05, - "loss": 0.2601, + "epoch": 1.7484773128626518, + "grad_norm": 0.2059725672006607, + "learning_rate": 3.7763069996343015e-05, + "loss": 0.4146, "step": 48515 }, { - "epoch": 1.71, - "learning_rate": 3.8333710719831986e-05, - "loss": 0.2842, + "epoch": 1.7486575125238764, + "grad_norm": 0.24231673777103424, + "learning_rate": 3.776056071069595e-05, + "loss": 0.4654, "step": 48520 }, { - "epoch": 1.71, - "learning_rate": 3.833130091230113e-05, - "loss": 0.2735, + "epoch": 1.748837712185101, + "grad_norm": 0.16822253167629242, + "learning_rate": 3.775805125118871e-05, + "loss": 0.4132, "step": 48525 }, { - "epoch": 1.71, - "learning_rate": 3.832889093167211e-05, - "loss": 0.2881, + "epoch": 1.7490179118463258, + "grad_norm": 0.21261481940746307, + "learning_rate": 3.7755541617855505e-05, + "loss": 0.4319, "step": 48530 }, { - "epoch": 1.71, - "learning_rate": 3.832648077797624e-05, - "loss": 0.3089, + "epoch": 1.7491981115075503, + "grad_norm": 0.15500931441783905, + "learning_rate": 3.77530318107305e-05, + "loss": 0.3957, "step": 48535 }, { - "epoch": 1.71, - "learning_rate": 3.832407045124479e-05, - "loss": 0.272, + "epoch": 1.7493783111687748, + "grad_norm": 0.24944768846035004, + "learning_rate": 3.775052182984792e-05, + "loss": 0.4093, "step": 48540 }, { - "epoch": 1.71, - "learning_rate": 3.832165995150907e-05, - "loss": 0.2908, + "epoch": 1.7495585108299996, + "grad_norm": 0.17092691361904144, + "learning_rate": 3.7748011675241935e-05, + "loss": 0.4218, "step": 48545 }, { - "epoch": 1.71, - "learning_rate": 3.831924927880037e-05, - "loss": 0.2877, + "epoch": 1.7497387104912243, + "grad_norm": 0.20408369600772858, + "learning_rate": 3.7745501346946764e-05, + "loss": 0.4123, "step": 48550 }, { - "epoch": 1.71, - "learning_rate": 3.831683843315e-05, - "loss": 0.2693, + "epoch": 1.749918910152449, + "grad_norm": 0.25046631693840027, + "learning_rate": 3.7742990844996606e-05, + "loss": 0.4413, "step": 48555 }, { - "epoch": 1.71, - "learning_rate": 3.8314427414589255e-05, - "loss": 0.2799, + "epoch": 1.7500991098136736, + "grad_norm": 0.1823837161064148, + "learning_rate": 3.7740480169425666e-05, + "loss": 0.4226, "step": 48560 }, { - "epoch": 1.71, - "learning_rate": 3.8312016223149446e-05, - "loss": 0.3004, + "epoch": 1.750279309474898, + "grad_norm": 0.19407956302165985, + "learning_rate": 3.7737969320268143e-05, + "loss": 0.4308, "step": 48565 }, { - "epoch": 1.71, - "learning_rate": 3.830960485886188e-05, - "loss": 0.2897, + "epoch": 1.7504595091361228, + "grad_norm": 0.23119215667247772, + "learning_rate": 3.7735458297558266e-05, + "loss": 0.4491, "step": 48570 }, { - "epoch": 1.71, - "learning_rate": 3.8307193321757865e-05, - "loss": 0.2824, + "epoch": 1.7506397087973475, + "grad_norm": 0.2448967546224594, + "learning_rate": 3.773294710133023e-05, + "loss": 0.4618, "step": 48575 }, { - "epoch": 1.71, - 
"learning_rate": 3.830478161186872e-05, - "loss": 0.3045, + "epoch": 1.750819908458572, + "grad_norm": 0.16788001358509064, + "learning_rate": 3.773043573161825e-05, + "loss": 0.3828, "step": 48580 }, { - "epoch": 1.71, - "learning_rate": 3.830236972922575e-05, - "loss": 0.281, + "epoch": 1.7510001081197968, + "grad_norm": 0.1867685317993164, + "learning_rate": 3.772792418845655e-05, + "loss": 0.3888, "step": 48585 }, { - "epoch": 1.71, - "learning_rate": 3.829995767386028e-05, - "loss": 0.2744, + "epoch": 1.7511803077810213, + "grad_norm": 0.2027750462293625, + "learning_rate": 3.7725412471879354e-05, + "loss": 0.4142, "step": 48590 }, { - "epoch": 1.71, - "learning_rate": 3.829754544580362e-05, - "loss": 0.3124, + "epoch": 1.751360507442246, + "grad_norm": 0.19157275557518005, + "learning_rate": 3.7722900581920875e-05, + "loss": 0.3886, "step": 48595 }, { - "epoch": 1.71, - "learning_rate": 3.829513304508709e-05, - "loss": 0.287, + "epoch": 1.7515407071034708, + "grad_norm": 0.19362173974514008, + "learning_rate": 3.7720388518615335e-05, + "loss": 0.4182, "step": 48600 }, { - "epoch": 1.71, - "learning_rate": 3.8292720471742025e-05, - "loss": 0.2996, + "epoch": 1.7517209067646953, + "grad_norm": 0.1794130504131317, + "learning_rate": 3.771787628199696e-05, + "loss": 0.3702, "step": 48605 }, { - "epoch": 1.71, - "learning_rate": 3.8290307725799745e-05, - "loss": 0.2858, + "epoch": 1.7519011064259198, + "grad_norm": 0.1916467696428299, + "learning_rate": 3.771536387209999e-05, + "loss": 0.3976, "step": 48610 }, { - "epoch": 1.71, - "learning_rate": 3.828789480729158e-05, - "loss": 0.2869, + "epoch": 1.7520813060871445, + "grad_norm": 0.2182328701019287, + "learning_rate": 3.771285128895865e-05, + "loss": 0.4402, "step": 48615 }, { - "epoch": 1.71, - "learning_rate": 3.828548171624886e-05, - "loss": 0.2766, + "epoch": 1.7522615057483693, + "grad_norm": 0.1866350769996643, + "learning_rate": 3.771033853260717e-05, + "loss": 0.4152, "step": 48620 }, { - "epoch": 1.71, - "learning_rate": 3.828306845270291e-05, - "loss": 0.2747, + "epoch": 1.752441705409594, + "grad_norm": 0.15760236978530884, + "learning_rate": 3.770782560307978e-05, + "loss": 0.4226, "step": 48625 }, { - "epoch": 1.71, - "learning_rate": 3.828065501668506e-05, - "loss": 0.2949, + "epoch": 1.7526219050708185, + "grad_norm": 0.2243875414133072, + "learning_rate": 3.770531250041074e-05, + "loss": 0.4394, "step": 48630 }, { - "epoch": 1.71, - "learning_rate": 3.827824140822667e-05, - "loss": 0.2672, + "epoch": 1.752802104732043, + "grad_norm": 0.16506066918373108, + "learning_rate": 3.770279922463428e-05, + "loss": 0.3997, "step": 48635 }, { - "epoch": 1.71, - "learning_rate": 3.827582762735907e-05, - "loss": 0.2871, + "epoch": 1.7529823043932677, + "grad_norm": 0.17547442018985748, + "learning_rate": 3.770028577578462e-05, + "loss": 0.4006, "step": 48640 }, { - "epoch": 1.71, - "learning_rate": 3.827341367411359e-05, - "loss": 0.2998, + "epoch": 1.7531625040544925, + "grad_norm": 0.18571364879608154, + "learning_rate": 3.769777215389604e-05, + "loss": 0.4179, "step": 48645 }, { - "epoch": 1.71, - "learning_rate": 3.8270999548521586e-05, - "loss": 0.2506, + "epoch": 1.753342703715717, + "grad_norm": 0.17581316828727722, + "learning_rate": 3.7695258359002775e-05, + "loss": 0.4329, "step": 48650 }, { - "epoch": 1.71, - "learning_rate": 3.8268585250614397e-05, - "loss": 0.3298, + "epoch": 1.7535229033769415, + "grad_norm": 0.22021692991256714, + "learning_rate": 3.769274439113906e-05, + "loss": 0.4256, "step": 48655 }, { - "epoch": 
1.71, - "learning_rate": 3.826617078042337e-05, - "loss": 0.2921, + "epoch": 1.7537031030381662, + "grad_norm": 0.16692475974559784, + "learning_rate": 3.769023025033917e-05, + "loss": 0.4274, "step": 48660 }, { - "epoch": 1.71, - "learning_rate": 3.8263756137979855e-05, - "loss": 0.2699, + "epoch": 1.753883302699391, + "grad_norm": 0.16848334670066833, + "learning_rate": 3.768771593663735e-05, + "loss": 0.4117, "step": 48665 }, { - "epoch": 1.71, - "learning_rate": 3.826134132331521e-05, - "loss": 0.2849, + "epoch": 1.7540635023606157, + "grad_norm": 0.2044806033372879, + "learning_rate": 3.7685201450067845e-05, + "loss": 0.4221, "step": 48670 }, { - "epoch": 1.71, - "learning_rate": 3.825892633646079e-05, - "loss": 0.2732, + "epoch": 1.7542437020218402, + "grad_norm": 0.16209609806537628, + "learning_rate": 3.768268679066494e-05, + "loss": 0.389, "step": 48675 }, { - "epoch": 1.71, - "learning_rate": 3.825651117744796e-05, - "loss": 0.2994, + "epoch": 1.7544239016830647, + "grad_norm": 0.18497797846794128, + "learning_rate": 3.7680171958462875e-05, + "loss": 0.4079, "step": 48680 }, { - "epoch": 1.71, - "learning_rate": 3.8254095846308046e-05, - "loss": 0.2828, + "epoch": 1.7546041013442895, + "grad_norm": 0.15725673735141754, + "learning_rate": 3.767765695349592e-05, + "loss": 0.3912, "step": 48685 }, { - "epoch": 1.71, - "learning_rate": 3.825168034307245e-05, - "loss": 0.2586, + "epoch": 1.7547843010055142, + "grad_norm": 0.19795434176921844, + "learning_rate": 3.767514177579836e-05, + "loss": 0.4147, "step": 48690 }, { - "epoch": 1.71, - "learning_rate": 3.824926466777251e-05, - "loss": 0.2924, + "epoch": 1.7549645006667387, + "grad_norm": 0.17479757964611053, + "learning_rate": 3.7672626425404436e-05, + "loss": 0.4329, "step": 48695 }, { - "epoch": 1.71, - "learning_rate": 3.824684882043961e-05, - "loss": 0.2748, + "epoch": 1.7551447003279634, + "grad_norm": 0.20223133265972137, + "learning_rate": 3.767011090234842e-05, + "loss": 0.3739, "step": 48700 }, { - "epoch": 1.71, - "learning_rate": 3.824443280110509e-05, - "loss": 0.2621, + "epoch": 1.755324899989188, + "grad_norm": 0.2165326178073883, + "learning_rate": 3.7667595206664606e-05, + "loss": 0.4324, "step": 48705 }, { - "epoch": 1.71, - "learning_rate": 3.8242016609800345e-05, - "loss": 0.2377, + "epoch": 1.7555050996504127, + "grad_norm": 0.2068873792886734, + "learning_rate": 3.766507933838726e-05, + "loss": 0.3831, "step": 48710 }, { - "epoch": 1.71, - "learning_rate": 3.823960024655675e-05, - "loss": 0.2684, + "epoch": 1.7556852993116374, + "grad_norm": 0.1555756777524948, + "learning_rate": 3.7662563297550666e-05, + "loss": 0.4153, "step": 48715 }, { - "epoch": 1.71, - "learning_rate": 3.8237183711405666e-05, - "loss": 0.2967, + "epoch": 1.755865498972862, + "grad_norm": 0.21677540242671967, + "learning_rate": 3.766004708418909e-05, + "loss": 0.4148, "step": 48720 }, { - "epoch": 1.71, - "learning_rate": 3.823476700437847e-05, - "loss": 0.3064, + "epoch": 1.7560456986340864, + "grad_norm": 0.18430842459201813, + "learning_rate": 3.765753069833683e-05, + "loss": 0.4374, "step": 48725 }, { - "epoch": 1.71, - "learning_rate": 3.823235012550655e-05, - "loss": 0.2617, + "epoch": 1.7562258982953112, + "grad_norm": 0.16822421550750732, + "learning_rate": 3.7655014140028156e-05, + "loss": 0.3953, "step": 48730 }, { - "epoch": 1.71, - "learning_rate": 3.822993307482129e-05, - "loss": 0.2875, + "epoch": 1.756406097956536, + "grad_norm": 0.18837454915046692, + "learning_rate": 3.765249740929737e-05, + "loss": 0.3994, "step": 48735 
}, { - "epoch": 1.71, - "learning_rate": 3.822751585235406e-05, - "loss": 0.2954, + "epoch": 1.7565862976177606, + "grad_norm": 0.21585826575756073, + "learning_rate": 3.764998050617876e-05, + "loss": 0.4374, "step": 48740 }, { - "epoch": 1.71, - "learning_rate": 3.822509845813626e-05, - "loss": 0.2939, + "epoch": 1.7567664972789852, + "grad_norm": 0.18385829031467438, + "learning_rate": 3.7647463430706605e-05, + "loss": 0.4148, "step": 48745 }, { - "epoch": 1.72, - "learning_rate": 3.8222680892199266e-05, - "loss": 0.2767, + "epoch": 1.7569466969402097, + "grad_norm": 0.17298060655593872, + "learning_rate": 3.7644946182915215e-05, + "loss": 0.4257, "step": 48750 }, { - "epoch": 1.72, - "learning_rate": 3.822026315457448e-05, - "loss": 0.2901, + "epoch": 1.7571268966014344, + "grad_norm": 0.17734064161777496, + "learning_rate": 3.764242876283888e-05, + "loss": 0.3634, "step": 48755 }, { - "epoch": 1.72, - "learning_rate": 3.821784524529328e-05, - "loss": 0.2757, + "epoch": 1.7573070962626591, + "grad_norm": 0.19183120131492615, + "learning_rate": 3.76399111705119e-05, + "loss": 0.3898, "step": 48760 }, { - "epoch": 1.72, - "learning_rate": 3.8215427164387086e-05, - "loss": 0.2656, + "epoch": 1.7574872959238836, + "grad_norm": 0.19240742921829224, + "learning_rate": 3.763739340596858e-05, + "loss": 0.4226, "step": 48765 }, { - "epoch": 1.72, - "learning_rate": 3.821300891188727e-05, - "loss": 0.2772, + "epoch": 1.7576674955851082, + "grad_norm": 0.19744504988193512, + "learning_rate": 3.763487546924322e-05, + "loss": 0.4182, "step": 48770 }, { - "epoch": 1.72, - "learning_rate": 3.821059048782524e-05, - "loss": 0.3002, + "epoch": 1.7578476952463329, + "grad_norm": 0.15184414386749268, + "learning_rate": 3.763235736037014e-05, + "loss": 0.368, "step": 48775 }, { - "epoch": 1.72, - "learning_rate": 3.8208171892232406e-05, - "loss": 0.2943, + "epoch": 1.7580278949075576, + "grad_norm": 0.1860179305076599, + "learning_rate": 3.762983907938362e-05, + "loss": 0.3983, "step": 48780 }, { - "epoch": 1.72, - "learning_rate": 3.820575312514015e-05, - "loss": 0.2894, + "epoch": 1.7582080945687824, + "grad_norm": 0.1840306669473648, + "learning_rate": 3.7627320626317994e-05, + "loss": 0.4329, "step": 48785 }, { - "epoch": 1.72, - "learning_rate": 3.820333418657991e-05, - "loss": 0.3017, + "epoch": 1.7583882942300069, + "grad_norm": 0.20998184382915497, + "learning_rate": 3.762480200120756e-05, + "loss": 0.3834, "step": 48790 }, { - "epoch": 1.72, - "learning_rate": 3.8200915076583064e-05, - "loss": 0.2886, + "epoch": 1.7585684938912314, + "grad_norm": 0.22181224822998047, + "learning_rate": 3.7622283204086653e-05, + "loss": 0.4068, "step": 48795 }, { - "epoch": 1.72, - "learning_rate": 3.8198495795181036e-05, - "loss": 0.2696, + "epoch": 1.758748693552456, + "grad_norm": 0.16627205908298492, + "learning_rate": 3.761976423498958e-05, + "loss": 0.3937, "step": 48800 }, { - "epoch": 1.72, - "learning_rate": 3.819607634240525e-05, - "loss": 0.2835, + "epoch": 1.7589288932136808, + "grad_norm": 0.15934346616268158, + "learning_rate": 3.761724509395066e-05, + "loss": 0.4018, "step": 48805 }, { - "epoch": 1.72, - "learning_rate": 3.8193656718287097e-05, - "loss": 0.2748, + "epoch": 1.7591090928749054, + "grad_norm": 0.1558726578950882, + "learning_rate": 3.761472578100422e-05, + "loss": 0.4015, "step": 48810 }, { - "epoch": 1.72, - "learning_rate": 3.819123692285802e-05, - "loss": 0.2899, + "epoch": 1.7592892925361299, + "grad_norm": 0.17845207452774048, + "learning_rate": 3.7612206296184594e-05, + "loss": 
0.3892, "step": 48815 }, { - "epoch": 1.72, - "learning_rate": 3.818881695614941e-05, - "loss": 0.2798, + "epoch": 1.7594694921973546, + "grad_norm": 0.1628832072019577, + "learning_rate": 3.7609686639526086e-05, + "loss": 0.394, "step": 48820 }, { - "epoch": 1.72, - "learning_rate": 3.818639681819271e-05, - "loss": 0.286, + "epoch": 1.7596496918585793, + "grad_norm": 0.21811261773109436, + "learning_rate": 3.760716681106304e-05, + "loss": 0.4561, "step": 48825 }, { - "epoch": 1.72, - "learning_rate": 3.8183976509019334e-05, - "loss": 0.2705, + "epoch": 1.759829891519804, + "grad_norm": 0.2100425362586975, + "learning_rate": 3.760464681082979e-05, + "loss": 0.4025, "step": 48830 }, { - "epoch": 1.72, - "learning_rate": 3.818155602866072e-05, - "loss": 0.2901, + "epoch": 1.7600100911810286, + "grad_norm": 0.25296419858932495, + "learning_rate": 3.760212663886067e-05, + "loss": 0.4405, "step": 48835 }, { - "epoch": 1.72, - "learning_rate": 3.817913537714828e-05, - "loss": 0.285, + "epoch": 1.760190290842253, + "grad_norm": 0.22018037736415863, + "learning_rate": 3.759960629519e-05, + "loss": 0.4108, "step": 48840 }, { - "epoch": 1.72, - "learning_rate": 3.817671455451346e-05, - "loss": 0.2788, + "epoch": 1.7603704905034778, + "grad_norm": 0.17130319774150848, + "learning_rate": 3.759708577985215e-05, + "loss": 0.4407, "step": 48845 }, { - "epoch": 1.72, - "learning_rate": 3.8174293560787676e-05, - "loss": 0.281, + "epoch": 1.7605506901647026, + "grad_norm": 0.2034503072500229, + "learning_rate": 3.759456509288144e-05, + "loss": 0.4196, "step": 48850 }, { - "epoch": 1.72, - "learning_rate": 3.817187239600238e-05, - "loss": 0.2748, + "epoch": 1.7607308898259273, + "grad_norm": 0.17805613577365875, + "learning_rate": 3.759204423431222e-05, + "loss": 0.4007, "step": 48855 }, { - "epoch": 1.72, - "learning_rate": 3.8169451060188996e-05, - "loss": 0.2743, + "epoch": 1.7609110894871518, + "grad_norm": 0.1614164412021637, + "learning_rate": 3.7589523204178836e-05, + "loss": 0.4025, "step": 48860 }, { - "epoch": 1.72, - "learning_rate": 3.8167029553378966e-05, - "loss": 0.2853, + "epoch": 1.7610912891483763, + "grad_norm": 0.17002156376838684, + "learning_rate": 3.7587002002515623e-05, + "loss": 0.4165, "step": 48865 }, { - "epoch": 1.72, - "learning_rate": 3.8164607875603744e-05, - "loss": 0.2706, + "epoch": 1.761271488809601, + "grad_norm": 0.18788275122642517, + "learning_rate": 3.758448062935696e-05, + "loss": 0.3999, "step": 48870 }, { - "epoch": 1.72, - "learning_rate": 3.816218602689477e-05, - "loss": 0.2811, + "epoch": 1.7614516884708258, + "grad_norm": 0.2487161010503769, + "learning_rate": 3.758195908473717e-05, + "loss": 0.4245, "step": 48875 }, { - "epoch": 1.72, - "learning_rate": 3.815976400728346e-05, - "loss": 0.2945, + "epoch": 1.7616318881320503, + "grad_norm": 0.18604964017868042, + "learning_rate": 3.757943736869064e-05, + "loss": 0.4501, "step": 48880 }, { - "epoch": 1.72, - "learning_rate": 3.8157341816801305e-05, - "loss": 0.3165, + "epoch": 1.7618120877932748, + "grad_norm": 0.15470033884048462, + "learning_rate": 3.757691548125171e-05, + "loss": 0.3926, "step": 48885 }, { - "epoch": 1.72, - "learning_rate": 3.815491945547974e-05, - "loss": 0.2618, + "epoch": 1.7619922874544995, + "grad_norm": 0.1927955448627472, + "learning_rate": 3.757439342245473e-05, + "loss": 0.4354, "step": 48890 }, { - "epoch": 1.72, - "learning_rate": 3.815249692335021e-05, - "loss": 0.296, + "epoch": 1.7621724871157243, + "grad_norm": 0.20000748336315155, + "learning_rate": 3.757187119233408e-05, + 
"loss": 0.4443, "step": 48895 }, { - "epoch": 1.72, - "learning_rate": 3.8150074220444175e-05, - "loss": 0.288, + "epoch": 1.762352686776949, + "grad_norm": 0.18299151957035065, + "learning_rate": 3.756934879092412e-05, + "loss": 0.3823, "step": 48900 }, { - "epoch": 1.72, - "learning_rate": 3.814765134679308e-05, - "loss": 0.29, + "epoch": 1.7625328864381735, + "grad_norm": 0.17722856998443604, + "learning_rate": 3.756682621825922e-05, + "loss": 0.3988, "step": 48905 }, { - "epoch": 1.72, - "learning_rate": 3.8145228302428416e-05, - "loss": 0.2876, + "epoch": 1.762713086099398, + "grad_norm": 0.20240166783332825, + "learning_rate": 3.756430347437374e-05, + "loss": 0.4445, "step": 48910 }, { - "epoch": 1.72, - "learning_rate": 3.814280508738162e-05, - "loss": 0.2828, + "epoch": 1.7628932857606228, + "grad_norm": 0.17536598443984985, + "learning_rate": 3.7561780559302064e-05, + "loss": 0.4041, "step": 48915 }, { - "epoch": 1.72, - "learning_rate": 3.814038170168417e-05, - "loss": 0.2716, + "epoch": 1.7630734854218475, + "grad_norm": 0.19098421931266785, + "learning_rate": 3.7559257473078554e-05, + "loss": 0.4033, "step": 48920 }, { - "epoch": 1.72, - "learning_rate": 3.81379581453675e-05, - "loss": 0.275, + "epoch": 1.763253685083072, + "grad_norm": 0.17195981740951538, + "learning_rate": 3.755673421573759e-05, + "loss": 0.4251, "step": 48925 }, { - "epoch": 1.72, - "learning_rate": 3.813553441846312e-05, - "loss": 0.2987, + "epoch": 1.7634338847442965, + "grad_norm": 0.15965524315834045, + "learning_rate": 3.7554210787313554e-05, + "loss": 0.378, "step": 48930 }, { - "epoch": 1.72, - "learning_rate": 3.813311052100248e-05, - "loss": 0.2832, + "epoch": 1.7636140844055213, + "grad_norm": 0.20123085379600525, + "learning_rate": 3.755168718784083e-05, + "loss": 0.4222, "step": 48935 }, { - "epoch": 1.72, - "learning_rate": 3.813068645301705e-05, - "loss": 0.2787, + "epoch": 1.763794284066746, + "grad_norm": 0.1748918741941452, + "learning_rate": 3.75491634173538e-05, + "loss": 0.4014, "step": 48940 }, { - "epoch": 1.72, - "learning_rate": 3.8128262214538316e-05, - "loss": 0.2945, + "epoch": 1.7639744837279707, + "grad_norm": 0.1668645590543747, + "learning_rate": 3.7546639475886844e-05, + "loss": 0.4385, "step": 48945 }, { - "epoch": 1.72, - "learning_rate": 3.8125837805597743e-05, - "loss": 0.28, + "epoch": 1.7641546833891952, + "grad_norm": 0.17461763322353363, + "learning_rate": 3.754411536347435e-05, + "loss": 0.3705, "step": 48950 }, { - "epoch": 1.72, - "learning_rate": 3.8123413226226824e-05, - "loss": 0.2776, + "epoch": 1.7643348830504197, + "grad_norm": 0.19829685986042023, + "learning_rate": 3.7541591080150725e-05, + "loss": 0.4261, "step": 48955 }, { - "epoch": 1.72, - "learning_rate": 3.812098847645702e-05, - "loss": 0.2849, + "epoch": 1.7645150827116445, + "grad_norm": 0.16521485149860382, + "learning_rate": 3.753906662595035e-05, + "loss": 0.3963, "step": 48960 }, { - "epoch": 1.72, - "learning_rate": 3.811856355631984e-05, - "loss": 0.2916, + "epoch": 1.7646952823728692, + "grad_norm": 0.19236351549625397, + "learning_rate": 3.75365420009076e-05, + "loss": 0.454, "step": 48965 }, { - "epoch": 1.72, - "learning_rate": 3.811613846584675e-05, - "loss": 0.2675, + "epoch": 1.7648754820340937, + "grad_norm": 0.19585488736629486, + "learning_rate": 3.7534017205056915e-05, + "loss": 0.4209, "step": 48970 }, { - "epoch": 1.72, - "learning_rate": 3.8113713205069254e-05, - "loss": 0.3024, + "epoch": 1.7650556816953185, + "grad_norm": 0.13935211300849915, + "learning_rate": 
3.7531492238432656e-05, + "loss": 0.386, "step": 48975 }, { - "epoch": 1.72, - "learning_rate": 3.811128777401882e-05, - "loss": 0.2783, + "epoch": 1.765235881356543, + "grad_norm": 0.182199165225029, + "learning_rate": 3.7528967101069254e-05, + "loss": 0.4283, "step": 48980 }, { - "epoch": 1.72, - "learning_rate": 3.810886217272697e-05, - "loss": 0.2888, + "epoch": 1.7654160810177677, + "grad_norm": 0.23854130506515503, + "learning_rate": 3.7526441793001094e-05, + "loss": 0.4379, "step": 48985 }, { - "epoch": 1.72, - "learning_rate": 3.810643640122518e-05, - "loss": 0.3066, + "epoch": 1.7655962806789924, + "grad_norm": 0.20375332236289978, + "learning_rate": 3.7523916314262585e-05, + "loss": 0.3951, "step": 48990 }, { - "epoch": 1.72, - "learning_rate": 3.8104010459544946e-05, - "loss": 0.3046, + "epoch": 1.765776480340217, + "grad_norm": 0.20095662772655487, + "learning_rate": 3.752139066488815e-05, + "loss": 0.4075, "step": 48995 }, { - "epoch": 1.72, - "learning_rate": 3.810158434771777e-05, - "loss": 0.311, + "epoch": 1.7659566800014415, + "grad_norm": 0.21261079609394073, + "learning_rate": 3.751886484491219e-05, + "loss": 0.4467, "step": 49000 }, { - "epoch": 1.72, - "eval_loss": 0.2752542495727539, - "eval_runtime": 10.5525, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 1.7659566800014415, + "eval_loss": 0.4405106008052826, + "eval_runtime": 3.5203, + "eval_samples_per_second": 28.407, + "eval_steps_per_second": 7.102, "step": 49000 }, { - "epoch": 1.72, - "learning_rate": 3.8099158065775166e-05, - "loss": 0.2867, + "epoch": 1.7661368796626662, + "grad_norm": 0.13927486538887024, + "learning_rate": 3.751633885436912e-05, + "loss": 0.4111, "step": 49005 }, { - "epoch": 1.72, - "learning_rate": 3.8096731613748624e-05, - "loss": 0.2566, + "epoch": 1.766317079323891, + "grad_norm": 0.18607810139656067, + "learning_rate": 3.751381269329335e-05, + "loss": 0.4157, "step": 49010 }, { - "epoch": 1.72, - "learning_rate": 3.809430499166965e-05, - "loss": 0.2895, + "epoch": 1.7664972789851157, + "grad_norm": 0.20816226303577423, + "learning_rate": 3.751128636171931e-05, + "loss": 0.4413, "step": 49015 }, { - "epoch": 1.72, - "learning_rate": 3.809187819956975e-05, - "loss": 0.2768, + "epoch": 1.7666774786463402, + "grad_norm": 0.17857316136360168, + "learning_rate": 3.7508759859681416e-05, + "loss": 0.4119, "step": 49020 }, { - "epoch": 1.72, - "learning_rate": 3.808945123748045e-05, - "loss": 0.3111, + "epoch": 1.7668576783075647, + "grad_norm": 0.24799132347106934, + "learning_rate": 3.750623318721409e-05, + "loss": 0.3855, "step": 49025 }, { - "epoch": 1.73, - "learning_rate": 3.808702410543324e-05, - "loss": 0.2982, + "epoch": 1.7670378779687894, + "grad_norm": 0.15814100205898285, + "learning_rate": 3.7503706344351766e-05, + "loss": 0.3871, "step": 49030 }, { - "epoch": 1.73, - "learning_rate": 3.808459680345966e-05, - "loss": 0.2851, + "epoch": 1.7672180776300142, + "grad_norm": 0.19214874505996704, + "learning_rate": 3.7501179331128844e-05, + "loss": 0.391, "step": 49035 }, { - "epoch": 1.73, - "learning_rate": 3.808216933159121e-05, - "loss": 0.2602, + "epoch": 1.7673982772912387, + "grad_norm": 0.17339515686035156, + "learning_rate": 3.7498652147579786e-05, + "loss": 0.3669, "step": 49040 }, { - "epoch": 1.73, - "learning_rate": 3.8079741689859404e-05, - "loss": 0.2768, + "epoch": 1.7675784769524632, + "grad_norm": 0.1755550056695938, + "learning_rate": 3.749612479373902e-05, + "loss": 0.4258, "step": 49045 }, { - "epoch": 1.73, - "learning_rate": 
3.807731387829578e-05, - "loss": 0.2631, + "epoch": 1.767758676613688, + "grad_norm": 0.18907277286052704, + "learning_rate": 3.749359726964096e-05, + "loss": 0.398, "step": 49050 }, { - "epoch": 1.73, - "learning_rate": 3.807488589693186e-05, - "loss": 0.2645, + "epoch": 1.7679388762749126, + "grad_norm": 0.1572108417749405, + "learning_rate": 3.749106957532006e-05, + "loss": 0.4311, "step": 49055 }, { - "epoch": 1.73, - "learning_rate": 3.807245774579915e-05, - "loss": 0.2747, + "epoch": 1.7681190759361374, + "grad_norm": 0.17797333002090454, + "learning_rate": 3.748854171081076e-05, + "loss": 0.3821, "step": 49060 }, { - "epoch": 1.73, - "learning_rate": 3.80700294249292e-05, - "loss": 0.293, + "epoch": 1.7682992755973619, + "grad_norm": 0.18352951109409332, + "learning_rate": 3.7486013676147495e-05, + "loss": 0.3894, "step": 49065 }, { - "epoch": 1.73, - "learning_rate": 3.806760093435352e-05, - "loss": 0.2619, + "epoch": 1.7684794752585864, + "grad_norm": 0.22129513323307037, + "learning_rate": 3.748348547136471e-05, + "loss": 0.4171, "step": 49070 }, { - "epoch": 1.73, - "learning_rate": 3.806517227410367e-05, - "loss": 0.2908, + "epoch": 1.7686596749198111, + "grad_norm": 0.17449241876602173, + "learning_rate": 3.748095709649685e-05, + "loss": 0.3955, "step": 49075 }, { - "epoch": 1.73, - "learning_rate": 3.806274344421115e-05, - "loss": 0.2813, + "epoch": 1.7688398745810359, + "grad_norm": 0.18049213290214539, + "learning_rate": 3.747842855157836e-05, + "loss": 0.3908, "step": 49080 }, { - "epoch": 1.73, - "learning_rate": 3.806031444470753e-05, - "loss": 0.2782, + "epoch": 1.7690200742422604, + "grad_norm": 0.22616390883922577, + "learning_rate": 3.747589983664371e-05, + "loss": 0.4427, "step": 49085 }, { - "epoch": 1.73, - "learning_rate": 3.805788527562433e-05, - "loss": 0.2899, + "epoch": 1.769200273903485, + "grad_norm": 0.17422033846378326, + "learning_rate": 3.7473370951727335e-05, + "loss": 0.4336, "step": 49090 }, { - "epoch": 1.73, - "learning_rate": 3.805545593699309e-05, - "loss": 0.2656, + "epoch": 1.7693804735647096, + "grad_norm": 0.20560961961746216, + "learning_rate": 3.74708418968637e-05, + "loss": 0.4182, "step": 49095 }, { - "epoch": 1.73, - "learning_rate": 3.805302642884536e-05, - "loss": 0.2963, + "epoch": 1.7695606732259344, + "grad_norm": 0.208149254322052, + "learning_rate": 3.7468312672087245e-05, + "loss": 0.3874, "step": 49100 }, { - "epoch": 1.73, - "learning_rate": 3.8050596751212684e-05, - "loss": 0.2814, + "epoch": 1.769740872887159, + "grad_norm": 0.18670162558555603, + "learning_rate": 3.746578327743246e-05, + "loss": 0.4345, "step": 49105 }, { - "epoch": 1.73, - "learning_rate": 3.804816690412661e-05, - "loss": 0.2822, + "epoch": 1.7699210725483836, + "grad_norm": 0.17205697298049927, + "learning_rate": 3.746325371293379e-05, + "loss": 0.4163, "step": 49110 }, { - "epoch": 1.73, - "learning_rate": 3.80457368876187e-05, - "loss": 0.2573, + "epoch": 1.7701012722096081, + "grad_norm": 0.16193123161792755, + "learning_rate": 3.746072397862569e-05, + "loss": 0.4363, "step": 49115 }, { - "epoch": 1.73, - "learning_rate": 3.804330670172048e-05, - "loss": 0.2755, + "epoch": 1.7702814718708328, + "grad_norm": 0.16992096602916718, + "learning_rate": 3.7458194074542643e-05, + "loss": 0.4117, "step": 49120 }, { - "epoch": 1.73, - "learning_rate": 3.804087634646352e-05, - "loss": 0.28, + "epoch": 1.7704616715320576, + "grad_norm": 0.19616004824638367, + "learning_rate": 3.7455664000719113e-05, + "loss": 0.4117, "step": 49125 }, { - "epoch": 1.73, - 
"learning_rate": 3.803844582187938e-05, - "loss": 0.2762, + "epoch": 1.7706418711932823, + "grad_norm": 0.21857887506484985, + "learning_rate": 3.745313375718957e-05, + "loss": 0.3996, "step": 49130 }, { - "epoch": 1.73, - "learning_rate": 3.8036015127999615e-05, - "loss": 0.2853, + "epoch": 1.7708220708545068, + "grad_norm": 0.2159646898508072, + "learning_rate": 3.74506033439885e-05, + "loss": 0.414, "step": 49135 }, { - "epoch": 1.73, - "learning_rate": 3.803358426485578e-05, - "loss": 0.2931, + "epoch": 1.7710022705157313, + "grad_norm": 0.18626536428928375, + "learning_rate": 3.744807276115036e-05, + "loss": 0.3965, "step": 49140 }, { - "epoch": 1.73, - "learning_rate": 3.803115323247944e-05, - "loss": 0.2888, + "epoch": 1.771182470176956, + "grad_norm": 0.19798725843429565, + "learning_rate": 3.744554200870965e-05, + "loss": 0.421, "step": 49145 }, { - "epoch": 1.73, - "learning_rate": 3.8028722030902165e-05, - "loss": 0.2877, + "epoch": 1.7713626698381808, + "grad_norm": 0.170378640294075, + "learning_rate": 3.744301108670083e-05, + "loss": 0.3878, "step": 49150 }, { - "epoch": 1.73, - "learning_rate": 3.802629066015552e-05, - "loss": 0.2623, + "epoch": 1.7715428694994053, + "grad_norm": 0.1625005006790161, + "learning_rate": 3.744047999515839e-05, + "loss": 0.3824, "step": 49155 }, { - "epoch": 1.73, - "learning_rate": 3.8023859120271084e-05, - "loss": 0.2574, + "epoch": 1.7717230691606298, + "grad_norm": 0.21197515726089478, + "learning_rate": 3.743794873411682e-05, + "loss": 0.4288, "step": 49160 }, { - "epoch": 1.73, - "learning_rate": 3.802142741128041e-05, - "loss": 0.2714, + "epoch": 1.7719032688218546, + "grad_norm": 0.21333040297031403, + "learning_rate": 3.743541730361062e-05, + "loss": 0.4542, "step": 49165 }, { - "epoch": 1.73, - "learning_rate": 3.801899553321509e-05, - "loss": 0.2799, + "epoch": 1.7720834684830793, + "grad_norm": 0.18313553929328918, + "learning_rate": 3.743288570367426e-05, + "loss": 0.4284, "step": 49170 }, { - "epoch": 1.73, - "learning_rate": 3.801656348610669e-05, - "loss": 0.2804, + "epoch": 1.772263668144304, + "grad_norm": 0.1629858762025833, + "learning_rate": 3.7430353934342235e-05, + "loss": 0.4091, "step": 49175 }, { - "epoch": 1.73, - "learning_rate": 3.801413126998679e-05, - "loss": 0.2926, + "epoch": 1.7724438678055285, + "grad_norm": 0.1397615224123001, + "learning_rate": 3.7427821995649044e-05, + "loss": 0.4355, "step": 49180 }, { - "epoch": 1.73, - "learning_rate": 3.8011698884886973e-05, - "loss": 0.2721, + "epoch": 1.772624067466753, + "grad_norm": 0.18151454627513885, + "learning_rate": 3.742528988762919e-05, + "loss": 0.4128, "step": 49185 }, { - "epoch": 1.73, - "learning_rate": 3.800926633083882e-05, - "loss": 0.2686, + "epoch": 1.7728042671279778, + "grad_norm": 0.17477920651435852, + "learning_rate": 3.742275761031716e-05, + "loss": 0.4322, "step": 49190 }, { - "epoch": 1.73, - "learning_rate": 3.800683360787391e-05, - "loss": 0.2771, + "epoch": 1.7729844667892025, + "grad_norm": 0.15377044677734375, + "learning_rate": 3.742022516374747e-05, + "loss": 0.3747, "step": 49195 }, { - "epoch": 1.73, - "learning_rate": 3.800440071602386e-05, - "loss": 0.2874, + "epoch": 1.773164666450427, + "grad_norm": 0.22250111401081085, + "learning_rate": 3.741769254795461e-05, + "loss": 0.4256, "step": 49200 }, { - "epoch": 1.73, - "learning_rate": 3.8001967655320216e-05, - "loss": 0.2632, + "epoch": 1.7733448661116518, + "grad_norm": 0.20984488725662231, + "learning_rate": 3.7415159762973094e-05, + "loss": 0.4101, "step": 49205 }, { - "epoch": 
1.73, - "learning_rate": 3.799953442579459e-05, - "loss": 0.2797, + "epoch": 1.7735250657728763, + "grad_norm": 0.15702414512634277, + "learning_rate": 3.741262680883743e-05, + "loss": 0.4372, "step": 49210 }, { - "epoch": 1.73, - "learning_rate": 3.799710102747859e-05, - "loss": 0.3042, + "epoch": 1.773705265434101, + "grad_norm": 0.170258030295372, + "learning_rate": 3.7410093685582135e-05, + "loss": 0.3799, "step": 49215 }, { - "epoch": 1.73, - "learning_rate": 3.799466746040379e-05, - "loss": 0.2858, + "epoch": 1.7738854650953257, + "grad_norm": 0.15923821926116943, + "learning_rate": 3.740756039324171e-05, + "loss": 0.414, "step": 49220 }, { - "epoch": 1.73, - "learning_rate": 3.79922337246018e-05, - "loss": 0.2786, + "epoch": 1.7740656647565503, + "grad_norm": 0.15464822947978973, + "learning_rate": 3.7405026931850676e-05, + "loss": 0.4139, "step": 49225 }, { - "epoch": 1.73, - "learning_rate": 3.7989799820104215e-05, - "loss": 0.2892, + "epoch": 1.7742458644177748, + "grad_norm": 0.16088750958442688, + "learning_rate": 3.7402493301443556e-05, + "loss": 0.3763, "step": 49230 }, { - "epoch": 1.73, - "learning_rate": 3.798736574694264e-05, - "loss": 0.2848, + "epoch": 1.7744260640789995, + "grad_norm": 0.1694214642047882, + "learning_rate": 3.739995950205487e-05, + "loss": 0.3844, "step": 49235 }, { - "epoch": 1.73, - "learning_rate": 3.798493150514868e-05, - "loss": 0.2859, + "epoch": 1.7746062637402242, + "grad_norm": 0.16688519716262817, + "learning_rate": 3.739742553371913e-05, + "loss": 0.4343, "step": 49240 }, { - "epoch": 1.73, - "learning_rate": 3.7982497094753947e-05, - "loss": 0.3, + "epoch": 1.774786463401449, + "grad_norm": 0.1897389143705368, + "learning_rate": 3.7394891396470866e-05, + "loss": 0.4178, "step": 49245 }, { - "epoch": 1.73, - "learning_rate": 3.7980062515790036e-05, - "loss": 0.2706, + "epoch": 1.7749666630626735, + "grad_norm": 0.19561418890953064, + "learning_rate": 3.739235709034461e-05, + "loss": 0.3847, "step": 49250 }, { - "epoch": 1.73, - "learning_rate": 3.797762776828858e-05, - "loss": 0.2738, + "epoch": 1.775146862723898, + "grad_norm": 0.17393264174461365, + "learning_rate": 3.7389822615374884e-05, + "loss": 0.4522, "step": 49255 }, { - "epoch": 1.73, - "learning_rate": 3.797519285228117e-05, - "loss": 0.2571, + "epoch": 1.7753270623851227, + "grad_norm": 0.18216289579868317, + "learning_rate": 3.738728797159623e-05, + "loss": 0.4269, "step": 49260 }, { - "epoch": 1.73, - "learning_rate": 3.7972757767799424e-05, - "loss": 0.2982, + "epoch": 1.7755072620463475, + "grad_norm": 0.1977429836988449, + "learning_rate": 3.738475315904317e-05, + "loss": 0.3856, "step": 49265 }, { - "epoch": 1.73, - "learning_rate": 3.797032251487498e-05, - "loss": 0.2826, + "epoch": 1.775687461707572, + "grad_norm": 0.21220578253269196, + "learning_rate": 3.738221817775025e-05, + "loss": 0.4363, "step": 49270 }, { - "epoch": 1.73, - "learning_rate": 3.796788709353945e-05, - "loss": 0.2615, + "epoch": 1.7758676613687965, + "grad_norm": 0.1830170750617981, + "learning_rate": 3.7379683027752e-05, + "loss": 0.4057, "step": 49275 }, { - "epoch": 1.73, - "learning_rate": 3.796545150382445e-05, - "loss": 0.3103, + "epoch": 1.7760478610300212, + "grad_norm": 0.1913861781358719, + "learning_rate": 3.7377147709082966e-05, + "loss": 0.4363, "step": 49280 }, { - "epoch": 1.73, - "learning_rate": 3.7963015745761597e-05, - "loss": 0.2841, + "epoch": 1.776228060691246, + "grad_norm": 0.16212737560272217, + "learning_rate": 3.7374612221777694e-05, + "loss": 0.3936, "step": 49285 }, { - 
"epoch": 1.73, - "learning_rate": 3.796057981938253e-05, - "loss": 0.2699, + "epoch": 1.7764082603524707, + "grad_norm": 0.2292328029870987, + "learning_rate": 3.737207656587073e-05, + "loss": 0.417, "step": 49290 }, { - "epoch": 1.73, - "learning_rate": 3.795814372471888e-05, - "loss": 0.2865, + "epoch": 1.7765884600136952, + "grad_norm": 0.19917726516723633, + "learning_rate": 3.7369540741396614e-05, + "loss": 0.4154, "step": 49295 }, { - "epoch": 1.73, - "learning_rate": 3.795570746180227e-05, - "loss": 0.2647, + "epoch": 1.7767686596749197, + "grad_norm": 0.18198366463184357, + "learning_rate": 3.7367004748389897e-05, + "loss": 0.4043, "step": 49300 }, { - "epoch": 1.73, - "learning_rate": 3.795327103066434e-05, - "loss": 0.2766, + "epoch": 1.7769488593361444, + "grad_norm": 0.2522968053817749, + "learning_rate": 3.736446858688513e-05, + "loss": 0.4596, "step": 49305 }, { - "epoch": 1.73, - "learning_rate": 3.7950834431336725e-05, - "loss": 0.2839, + "epoch": 1.7771290589973692, + "grad_norm": 0.21617339551448822, + "learning_rate": 3.736193225691689e-05, + "loss": 0.4162, "step": 49310 }, { - "epoch": 1.74, - "learning_rate": 3.794839766385106e-05, - "loss": 0.3372, + "epoch": 1.7773092586585937, + "grad_norm": 0.1531769037246704, + "learning_rate": 3.73593957585197e-05, + "loss": 0.4273, "step": 49315 }, { - "epoch": 1.74, - "learning_rate": 3.794596072823897e-05, - "loss": 0.2916, + "epoch": 1.7774894583198182, + "grad_norm": 0.16418246924877167, + "learning_rate": 3.735685909172815e-05, + "loss": 0.3954, "step": 49320 }, { - "epoch": 1.74, - "learning_rate": 3.794352362453213e-05, - "loss": 0.2933, + "epoch": 1.777669657981043, + "grad_norm": 0.1850021332502365, + "learning_rate": 3.7354322256576765e-05, + "loss": 0.4432, "step": 49325 }, { - "epoch": 1.74, - "learning_rate": 3.7941086352762155e-05, - "loss": 0.3035, + "epoch": 1.7778498576422677, + "grad_norm": 0.14618489146232605, + "learning_rate": 3.735178525310015e-05, + "loss": 0.3764, "step": 49330 }, { - "epoch": 1.74, - "learning_rate": 3.79386489129607e-05, - "loss": 0.2965, + "epoch": 1.7780300573034924, + "grad_norm": 0.22677135467529297, + "learning_rate": 3.734924808133285e-05, + "loss": 0.4131, "step": 49335 }, { - "epoch": 1.74, - "learning_rate": 3.793621130515942e-05, - "loss": 0.2873, + "epoch": 1.778210256964717, + "grad_norm": 0.1776285022497177, + "learning_rate": 3.734671074130943e-05, + "loss": 0.4312, "step": 49340 }, { - "epoch": 1.74, - "learning_rate": 3.793377352938996e-05, - "loss": 0.2836, + "epoch": 1.7783904566259414, + "grad_norm": 0.1667148470878601, + "learning_rate": 3.734417323306447e-05, + "loss": 0.3951, "step": 49345 }, { - "epoch": 1.74, - "learning_rate": 3.793133558568398e-05, - "loss": 0.2836, + "epoch": 1.7785706562871662, + "grad_norm": 0.18254049122333527, + "learning_rate": 3.7341635556632544e-05, + "loss": 0.4046, "step": 49350 }, { - "epoch": 1.74, - "learning_rate": 3.792889747407312e-05, - "loss": 0.2956, + "epoch": 1.7787508559483909, + "grad_norm": 0.18212547898292542, + "learning_rate": 3.733909771204821e-05, + "loss": 0.4332, "step": 49355 }, { - "epoch": 1.74, - "learning_rate": 3.792645919458906e-05, - "loss": 0.2759, + "epoch": 1.7789310556096156, + "grad_norm": 0.19516201317310333, + "learning_rate": 3.733655969934607e-05, + "loss": 0.4102, "step": 49360 }, { - "epoch": 1.74, - "learning_rate": 3.7924020747263436e-05, - "loss": 0.2733, + "epoch": 1.7791112552708401, + "grad_norm": 0.17682771384716034, + "learning_rate": 3.733402151856069e-05, + "loss": 0.4214, "step": 
49365 }, { - "epoch": 1.74, - "learning_rate": 3.792158213212793e-05, - "loss": 0.2935, + "epoch": 1.7792914549320646, + "grad_norm": 0.1816743165254593, + "learning_rate": 3.733148316972665e-05, + "loss": 0.4073, "step": 49370 }, { - "epoch": 1.74, - "learning_rate": 3.791914334921418e-05, - "loss": 0.2866, + "epoch": 1.7794716545932894, + "grad_norm": 0.20628765225410461, + "learning_rate": 3.732894465287854e-05, + "loss": 0.4256, "step": 49375 }, { - "epoch": 1.74, - "learning_rate": 3.7916704398553884e-05, - "loss": 0.2597, + "epoch": 1.779651854254514, + "grad_norm": 0.1907653659582138, + "learning_rate": 3.732640596805096e-05, + "loss": 0.4183, "step": 49380 }, { - "epoch": 1.74, - "learning_rate": 3.791426528017869e-05, - "loss": 0.281, + "epoch": 1.7798320539157386, + "grad_norm": 0.18405283987522125, + "learning_rate": 3.732386711527847e-05, + "loss": 0.4181, "step": 49385 }, { - "epoch": 1.74, - "learning_rate": 3.7911825994120266e-05, - "loss": 0.2893, + "epoch": 1.7800122535769631, + "grad_norm": 0.24257473647594452, + "learning_rate": 3.7321328094595685e-05, + "loss": 0.4125, "step": 49390 }, { - "epoch": 1.74, - "learning_rate": 3.790938654041029e-05, - "loss": 0.2988, + "epoch": 1.7801924532381879, + "grad_norm": 0.19096507132053375, + "learning_rate": 3.731878890603718e-05, + "loss": 0.3772, "step": 49395 }, { - "epoch": 1.74, - "learning_rate": 3.790694691908044e-05, - "loss": 0.2972, + "epoch": 1.7803726528994126, + "grad_norm": 0.19265110790729523, + "learning_rate": 3.731624954963757e-05, + "loss": 0.4049, "step": 49400 }, { - "epoch": 1.74, - "learning_rate": 3.790450713016239e-05, - "loss": 0.2926, + "epoch": 1.7805528525606373, + "grad_norm": 0.24484902620315552, + "learning_rate": 3.731371002543144e-05, + "loss": 0.4594, "step": 49405 }, { - "epoch": 1.74, - "learning_rate": 3.790206717368782e-05, - "loss": 0.3084, + "epoch": 1.7807330522218618, + "grad_norm": 0.19407442212104797, + "learning_rate": 3.73111703334534e-05, + "loss": 0.3982, "step": 49410 }, { - "epoch": 1.74, - "learning_rate": 3.7899627049688416e-05, - "loss": 0.2983, + "epoch": 1.7809132518830864, + "grad_norm": 0.18388943374156952, + "learning_rate": 3.7308630473738046e-05, + "loss": 0.3976, "step": 49415 }, { - "epoch": 1.74, - "learning_rate": 3.789718675819585e-05, - "loss": 0.2852, + "epoch": 1.781093451544311, + "grad_norm": 0.15484626591205597, + "learning_rate": 3.730609044631998e-05, + "loss": 0.4456, "step": 49420 }, { - "epoch": 1.74, - "learning_rate": 3.789474629924181e-05, - "loss": 0.278, + "epoch": 1.7812736512055358, + "grad_norm": 0.18442393839359283, + "learning_rate": 3.7303550251233824e-05, + "loss": 0.3943, "step": 49425 }, { - "epoch": 1.74, - "learning_rate": 3.7892305672857986e-05, - "loss": 0.3003, + "epoch": 1.7814538508667603, + "grad_norm": 0.18085482716560364, + "learning_rate": 3.730100988851417e-05, + "loss": 0.4069, "step": 49430 }, { - "epoch": 1.74, - "learning_rate": 3.7889864879076074e-05, - "loss": 0.2931, + "epoch": 1.7816340505279848, + "grad_norm": 0.21936087310314178, + "learning_rate": 3.7298469358195635e-05, + "loss": 0.4314, "step": 49435 }, { - "epoch": 1.74, - "learning_rate": 3.7887423917927764e-05, - "loss": 0.2539, + "epoch": 1.7818142501892096, + "grad_norm": 0.18822318315505981, + "learning_rate": 3.729592866031284e-05, + "loss": 0.4145, "step": 49440 }, { - "epoch": 1.74, - "learning_rate": 3.788498278944475e-05, - "loss": 0.2906, + "epoch": 1.7819944498504343, + "grad_norm": 0.2043982595205307, + "learning_rate": 3.729338779490039e-05, + "loss": 
0.429, "step": 49445 }, { - "epoch": 1.74, - "learning_rate": 3.788254149365872e-05, - "loss": 0.2911, + "epoch": 1.782174649511659, + "grad_norm": 0.1583716869354248, + "learning_rate": 3.7290846761992924e-05, + "loss": 0.4125, "step": 49450 }, { - "epoch": 1.74, - "learning_rate": 3.788010003060137e-05, - "loss": 0.2989, + "epoch": 1.7823548491728836, + "grad_norm": 0.20296910405158997, + "learning_rate": 3.728830556162505e-05, + "loss": 0.4143, "step": 49455 }, { - "epoch": 1.74, - "learning_rate": 3.787765840030442e-05, - "loss": 0.2723, + "epoch": 1.782535048834108, + "grad_norm": 0.20263749361038208, + "learning_rate": 3.7285764193831384e-05, + "loss": 0.3639, "step": 49460 }, { - "epoch": 1.74, - "learning_rate": 3.7875216602799565e-05, - "loss": 0.3076, + "epoch": 1.7827152484953328, + "grad_norm": 0.15118259191513062, + "learning_rate": 3.728322265864656e-05, + "loss": 0.4173, "step": 49465 }, { - "epoch": 1.74, - "learning_rate": 3.78727746381185e-05, - "loss": 0.2934, + "epoch": 1.7828954481565575, + "grad_norm": 0.15944749116897583, + "learning_rate": 3.7280680956105206e-05, + "loss": 0.3725, "step": 49470 }, { - "epoch": 1.74, - "learning_rate": 3.787033250629295e-05, - "loss": 0.2802, + "epoch": 1.783075647817782, + "grad_norm": 0.1850617229938507, + "learning_rate": 3.727813908624196e-05, + "loss": 0.4237, "step": 49475 }, { - "epoch": 1.74, - "learning_rate": 3.7867890207354604e-05, - "loss": 0.2747, + "epoch": 1.7832558474790068, + "grad_norm": 0.17814558744430542, + "learning_rate": 3.727559704909144e-05, + "loss": 0.4384, "step": 49480 }, { - "epoch": 1.74, - "learning_rate": 3.786544774133519e-05, - "loss": 0.2802, + "epoch": 1.7834360471402313, + "grad_norm": 0.18063758313655853, + "learning_rate": 3.727305484468828e-05, + "loss": 0.4244, "step": 49485 }, { - "epoch": 1.74, - "learning_rate": 3.7863005108266417e-05, - "loss": 0.2939, + "epoch": 1.783616246801456, + "grad_norm": 0.21019330620765686, + "learning_rate": 3.727051247306713e-05, + "loss": 0.3935, "step": 49490 }, { - "epoch": 1.74, - "learning_rate": 3.786056230817999e-05, - "loss": 0.2919, + "epoch": 1.7837964464626808, + "grad_norm": 0.17506244778633118, + "learning_rate": 3.726796993426263e-05, + "loss": 0.4377, "step": 49495 }, { - "epoch": 1.74, - "learning_rate": 3.785811934110764e-05, - "loss": 0.2557, + "epoch": 1.7839766461239053, + "grad_norm": 0.17135731875896454, + "learning_rate": 3.726542722830941e-05, + "loss": 0.3957, "step": 49500 }, { - "epoch": 1.74, - "eval_loss": 0.27525606751441956, - "eval_runtime": 10.5363, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 1.7839766461239053, + "eval_loss": 0.4405691623687744, + "eval_runtime": 3.5268, + "eval_samples_per_second": 28.354, + "eval_steps_per_second": 7.089, "step": 49500 }, { - "epoch": 1.74, - "learning_rate": 3.7855676207081095e-05, - "loss": 0.2857, + "epoch": 1.7841568457851298, + "grad_norm": 0.17667745053768158, + "learning_rate": 3.7262884355242116e-05, + "loss": 0.4162, "step": 49505 }, { - "epoch": 1.74, - "learning_rate": 3.7853232906132055e-05, - "loss": 0.2636, + "epoch": 1.7843370454463545, + "grad_norm": 0.2062585949897766, + "learning_rate": 3.7260341315095394e-05, + "loss": 0.3877, "step": 49510 }, { - "epoch": 1.74, - "learning_rate": 3.785078943829227e-05, - "loss": 0.2768, + "epoch": 1.7845172451075793, + "grad_norm": 0.1851799339056015, + "learning_rate": 3.72577981079039e-05, + "loss": 0.4212, "step": 49515 }, { - "epoch": 1.74, - "learning_rate": 3.7848345803593436e-05, - "loss": 
0.3081, + "epoch": 1.784697444768804, + "grad_norm": 0.1553850919008255, + "learning_rate": 3.725525473370228e-05, + "loss": 0.4183, "step": 49520 }, { - "epoch": 1.74, - "learning_rate": 3.784590200206731e-05, - "loss": 0.3179, + "epoch": 1.7848776444300285, + "grad_norm": 0.18418540060520172, + "learning_rate": 3.725271119252519e-05, + "loss": 0.4034, "step": 49525 }, { - "epoch": 1.74, - "learning_rate": 3.7843458033745606e-05, - "loss": 0.2832, + "epoch": 1.785057844091253, + "grad_norm": 0.21145012974739075, + "learning_rate": 3.7250167484407274e-05, + "loss": 0.4491, "step": 49530 }, { - "epoch": 1.74, - "learning_rate": 3.784101389866007e-05, - "loss": 0.3037, + "epoch": 1.7852380437524777, + "grad_norm": 0.18339501321315765, + "learning_rate": 3.7247623609383206e-05, + "loss": 0.4113, "step": 49535 }, { - "epoch": 1.74, - "learning_rate": 3.783856959684242e-05, - "loss": 0.2964, + "epoch": 1.7854182434137025, + "grad_norm": 0.1943238079547882, + "learning_rate": 3.7245079567487635e-05, + "loss": 0.3917, "step": 49540 }, { - "epoch": 1.74, - "learning_rate": 3.783612512832442e-05, - "loss": 0.2684, + "epoch": 1.785598443074927, + "grad_norm": 0.15589359402656555, + "learning_rate": 3.724253535875522e-05, + "loss": 0.4486, "step": 49545 }, { - "epoch": 1.74, - "learning_rate": 3.7833680493137786e-05, - "loss": 0.2917, + "epoch": 1.7857786427361515, + "grad_norm": 0.25339236855506897, + "learning_rate": 3.723999098322064e-05, + "loss": 0.3768, "step": 49550 }, { - "epoch": 1.74, - "learning_rate": 3.783123569131427e-05, - "loss": 0.2968, + "epoch": 1.7859588423973762, + "grad_norm": 0.2046034038066864, + "learning_rate": 3.7237446440918545e-05, + "loss": 0.4172, "step": 49555 }, { - "epoch": 1.74, - "learning_rate": 3.782879072288561e-05, - "loss": 0.3113, + "epoch": 1.786139042058601, + "grad_norm": 0.19488854706287384, + "learning_rate": 3.723490173188362e-05, + "loss": 0.4283, "step": 49560 }, { - "epoch": 1.74, - "learning_rate": 3.782634558788356e-05, - "loss": 0.291, + "epoch": 1.7863192417198257, + "grad_norm": 0.16177228093147278, + "learning_rate": 3.723235685615052e-05, + "loss": 0.3981, "step": 49565 }, { - "epoch": 1.74, - "learning_rate": 3.782390028633986e-05, - "loss": 0.2844, + "epoch": 1.7864994413810502, + "grad_norm": 0.15859396755695343, + "learning_rate": 3.722981181375392e-05, + "loss": 0.399, "step": 49570 }, { - "epoch": 1.74, - "learning_rate": 3.782145481828628e-05, - "loss": 0.2514, + "epoch": 1.7866796410422747, + "grad_norm": 0.17841722071170807, + "learning_rate": 3.7227266604728516e-05, + "loss": 0.4186, "step": 49575 }, { - "epoch": 1.74, - "learning_rate": 3.781900918375455e-05, - "loss": 0.291, + "epoch": 1.7868598407034995, + "grad_norm": 0.22205208241939545, + "learning_rate": 3.722472122910896e-05, + "loss": 0.4336, "step": 49580 }, { - "epoch": 1.74, - "learning_rate": 3.781656338277644e-05, - "loss": 0.3004, + "epoch": 1.7870400403647242, + "grad_norm": 0.17837925255298615, + "learning_rate": 3.7222175686929947e-05, + "loss": 0.4137, "step": 49585 }, { - "epoch": 1.74, - "learning_rate": 3.781411741538369e-05, - "loss": 0.2854, + "epoch": 1.7872202400259487, + "grad_norm": 0.19479599595069885, + "learning_rate": 3.7219629978226165e-05, + "loss": 0.4216, "step": 49590 }, { - "epoch": 1.74, - "learning_rate": 3.7811671281608073e-05, - "loss": 0.2908, + "epoch": 1.7874004396871734, + "grad_norm": 0.19044643640518188, + "learning_rate": 3.7217084103032284e-05, + "loss": 0.4043, "step": 49595 }, { - "epoch": 1.75, - "learning_rate": 
3.780922498148136e-05, - "loss": 0.3035, + "epoch": 1.787580639348398, + "grad_norm": 0.17032626271247864, + "learning_rate": 3.7214538061383e-05, + "loss": 0.4044, "step": 49600 }, { - "epoch": 1.75, - "learning_rate": 3.780677851503529e-05, - "loss": 0.291, + "epoch": 1.7877608390096227, + "grad_norm": 0.19500906765460968, + "learning_rate": 3.7211991853312996e-05, + "loss": 0.3888, "step": 49605 }, { - "epoch": 1.75, - "learning_rate": 3.780433188230165e-05, - "loss": 0.2509, + "epoch": 1.7879410386708474, + "grad_norm": 0.20601950585842133, + "learning_rate": 3.720944547885697e-05, + "loss": 0.3977, "step": 49610 }, { - "epoch": 1.75, - "learning_rate": 3.780188508331219e-05, - "loss": 0.2715, + "epoch": 1.788121238332072, + "grad_norm": 0.15395066142082214, + "learning_rate": 3.720689893804962e-05, + "loss": 0.4093, "step": 49615 }, { - "epoch": 1.75, - "learning_rate": 3.779943811809869e-05, - "loss": 0.2831, + "epoch": 1.7883014379932964, + "grad_norm": 0.21535900235176086, + "learning_rate": 3.720435223092562e-05, + "loss": 0.4189, "step": 49620 }, { - "epoch": 1.75, - "learning_rate": 3.779699098669292e-05, - "loss": 0.2855, + "epoch": 1.7884816376545212, + "grad_norm": 0.1768050193786621, + "learning_rate": 3.720180535751969e-05, + "loss": 0.4222, "step": 49625 }, { - "epoch": 1.75, - "learning_rate": 3.779454368912666e-05, - "loss": 0.2876, + "epoch": 1.788661837315746, + "grad_norm": 0.1698492169380188, + "learning_rate": 3.719925831786653e-05, + "loss": 0.3944, "step": 49630 }, { - "epoch": 1.75, - "learning_rate": 3.779209622543168e-05, - "loss": 0.2821, + "epoch": 1.7888420369769706, + "grad_norm": 0.1947326809167862, + "learning_rate": 3.7196711112000835e-05, + "loss": 0.43, "step": 49635 }, { - "epoch": 1.75, - "learning_rate": 3.778964859563977e-05, - "loss": 0.2485, + "epoch": 1.7890222366381952, + "grad_norm": 0.15801461040973663, + "learning_rate": 3.7194163739957306e-05, + "loss": 0.4035, "step": 49640 }, { - "epoch": 1.75, - "learning_rate": 3.778720079978269e-05, - "loss": 0.2964, + "epoch": 1.7892024362994197, + "grad_norm": 0.2287774682044983, + "learning_rate": 3.719161620177066e-05, + "loss": 0.4457, "step": 49645 }, { - "epoch": 1.75, - "learning_rate": 3.7784752837892235e-05, - "loss": 0.2808, + "epoch": 1.7893826359606444, + "grad_norm": 0.19257451593875885, + "learning_rate": 3.71890684974756e-05, + "loss": 0.4043, "step": 49650 }, { - "epoch": 1.75, - "learning_rate": 3.778230471000019e-05, - "loss": 0.2695, + "epoch": 1.7895628356218691, + "grad_norm": 0.22990871965885162, + "learning_rate": 3.7186520627106855e-05, + "loss": 0.4, "step": 49655 }, { - "epoch": 1.75, - "learning_rate": 3.777985641613835e-05, - "loss": 0.2931, + "epoch": 1.7897430352830936, + "grad_norm": 0.17866025865077972, + "learning_rate": 3.718397259069911e-05, + "loss": 0.418, "step": 49660 }, { - "epoch": 1.75, - "learning_rate": 3.7777407956338484e-05, - "loss": 0.2756, + "epoch": 1.7899232349443182, + "grad_norm": 0.18929140269756317, + "learning_rate": 3.718142438828711e-05, + "loss": 0.408, "step": 49665 }, { - "epoch": 1.75, - "learning_rate": 3.777495933063241e-05, - "loss": 0.2654, + "epoch": 1.7901034346055429, + "grad_norm": 0.19318966567516327, + "learning_rate": 3.717887601990555e-05, + "loss": 0.4305, "step": 49670 }, { - "epoch": 1.75, - "learning_rate": 3.7772510539051895e-05, - "loss": 0.269, + "epoch": 1.7902836342667676, + "grad_norm": 0.21049553155899048, + "learning_rate": 3.717632748558917e-05, + "loss": 0.4117, "step": 49675 }, { - "epoch": 1.75, - 
"learning_rate": 3.777006158162876e-05, - "loss": 0.2727, + "epoch": 1.7904638339279924, + "grad_norm": 0.20448461174964905, + "learning_rate": 3.717377878537269e-05, + "loss": 0.4479, "step": 49680 }, { - "epoch": 1.75, - "learning_rate": 3.776761245839478e-05, - "loss": 0.2853, + "epoch": 1.7906440335892169, + "grad_norm": 0.15858854353427887, + "learning_rate": 3.717122991929082e-05, + "loss": 0.3854, "step": 49685 }, { - "epoch": 1.75, - "learning_rate": 3.7765163169381776e-05, - "loss": 0.27, + "epoch": 1.7908242332504414, + "grad_norm": 0.18554039299488068, + "learning_rate": 3.7168680887378306e-05, + "loss": 0.4105, "step": 49690 }, { - "epoch": 1.75, - "learning_rate": 3.7762713714621544e-05, - "loss": 0.301, + "epoch": 1.7910044329116661, + "grad_norm": 0.21041366457939148, + "learning_rate": 3.716613168966986e-05, + "loss": 0.4343, "step": 49695 }, { - "epoch": 1.75, - "learning_rate": 3.7760264094145876e-05, - "loss": 0.3008, + "epoch": 1.7911846325728908, + "grad_norm": 0.18072649836540222, + "learning_rate": 3.716358232620024e-05, + "loss": 0.4132, "step": 49700 }, { - "epoch": 1.75, - "learning_rate": 3.775781430798659e-05, - "loss": 0.2869, + "epoch": 1.7913648322341154, + "grad_norm": 0.16237974166870117, + "learning_rate": 3.716103279700416e-05, + "loss": 0.4437, "step": 49705 }, { - "epoch": 1.75, - "learning_rate": 3.7755364356175493e-05, - "loss": 0.304, + "epoch": 1.79154503189534, + "grad_norm": 0.21563240885734558, + "learning_rate": 3.7158483102116354e-05, + "loss": 0.4425, "step": 49710 }, { - "epoch": 1.75, - "learning_rate": 3.7752914238744395e-05, - "loss": 0.257, + "epoch": 1.7917252315565646, + "grad_norm": 0.18444405496120453, + "learning_rate": 3.715593324157158e-05, + "loss": 0.4327, "step": 49715 }, { - "epoch": 1.75, - "learning_rate": 3.775046395572511e-05, - "loss": 0.2746, + "epoch": 1.7919054312177893, + "grad_norm": 0.20448465645313263, + "learning_rate": 3.715338321540457e-05, + "loss": 0.414, "step": 49720 }, { - "epoch": 1.75, - "learning_rate": 3.774801350714946e-05, - "loss": 0.2908, + "epoch": 1.792085630879014, + "grad_norm": 0.1820807009935379, + "learning_rate": 3.715083302365006e-05, + "loss": 0.407, "step": 49725 }, { - "epoch": 1.75, - "learning_rate": 3.774556289304925e-05, - "loss": 0.2751, + "epoch": 1.7922658305402386, + "grad_norm": 0.17193655669689178, + "learning_rate": 3.7148282666342804e-05, + "loss": 0.4177, "step": 49730 }, { - "epoch": 1.75, - "learning_rate": 3.77431121134563e-05, - "loss": 0.2916, + "epoch": 1.792446030201463, + "grad_norm": 0.1984802633523941, + "learning_rate": 3.714573214351754e-05, + "loss": 0.4338, "step": 49735 }, { - "epoch": 1.75, - "learning_rate": 3.7740661168402445e-05, - "loss": 0.2611, + "epoch": 1.7926262298626878, + "grad_norm": 0.18184858560562134, + "learning_rate": 3.714318145520905e-05, + "loss": 0.4176, "step": 49740 }, { - "epoch": 1.75, - "learning_rate": 3.773821005791951e-05, - "loss": 0.3029, + "epoch": 1.7928064295239126, + "grad_norm": 0.16971564292907715, + "learning_rate": 3.7140630601452045e-05, + "loss": 0.3892, "step": 49745 }, { - "epoch": 1.75, - "learning_rate": 3.77357587820393e-05, - "loss": 0.2782, + "epoch": 1.7929866291851373, + "grad_norm": 0.15988574922084808, + "learning_rate": 3.7138079582281306e-05, + "loss": 0.4246, "step": 49750 }, { - "epoch": 1.75, - "learning_rate": 3.7733307340793656e-05, - "loss": 0.2969, + "epoch": 1.7931668288463618, + "grad_norm": 0.1885288655757904, + "learning_rate": 3.713552839773158e-05, + "loss": 0.4083, "step": 49755 }, { - 
"epoch": 1.75, - "learning_rate": 3.7730855734214405e-05, - "loss": 0.2977, + "epoch": 1.7933470285075863, + "grad_norm": 0.15891823172569275, + "learning_rate": 3.713297704783763e-05, + "loss": 0.3774, "step": 49760 }, { - "epoch": 1.75, - "learning_rate": 3.7728403962333394e-05, - "loss": 0.3303, + "epoch": 1.793527228168811, + "grad_norm": 0.21999859809875488, + "learning_rate": 3.7130425532634214e-05, + "loss": 0.4091, "step": 49765 }, { - "epoch": 1.75, - "learning_rate": 3.772595202518243e-05, - "loss": 0.2923, + "epoch": 1.7937074278300358, + "grad_norm": 0.21735869348049164, + "learning_rate": 3.71278738521561e-05, + "loss": 0.4353, "step": 49770 }, { - "epoch": 1.75, - "learning_rate": 3.772349992279338e-05, - "loss": 0.2865, + "epoch": 1.7938876274912603, + "grad_norm": 0.17658278346061707, + "learning_rate": 3.7125322006438055e-05, + "loss": 0.3801, "step": 49775 }, { - "epoch": 1.75, - "learning_rate": 3.772104765519806e-05, - "loss": 0.2976, + "epoch": 1.7940678271524848, + "grad_norm": 0.17942824959754944, + "learning_rate": 3.712276999551485e-05, + "loss": 0.4222, "step": 49780 }, { - "epoch": 1.75, - "learning_rate": 3.771859522242832e-05, - "loss": 0.2694, + "epoch": 1.7942480268137095, + "grad_norm": 0.17095422744750977, + "learning_rate": 3.712021781942124e-05, + "loss": 0.3908, "step": 49785 }, { - "epoch": 1.75, - "learning_rate": 3.771614262451601e-05, - "loss": 0.3007, + "epoch": 1.7944282264749343, + "grad_norm": 0.20140354335308075, + "learning_rate": 3.711766547819201e-05, + "loss": 0.3921, "step": 49790 }, { - "epoch": 1.75, - "learning_rate": 3.771368986149296e-05, - "loss": 0.2718, + "epoch": 1.794608426136159, + "grad_norm": 0.1793317049741745, + "learning_rate": 3.711511297186194e-05, + "loss": 0.4273, "step": 49795 }, { - "epoch": 1.75, - "learning_rate": 3.771123693339103e-05, - "loss": 0.2804, + "epoch": 1.7947886257973835, + "grad_norm": 0.1982317417860031, + "learning_rate": 3.711256030046581e-05, + "loss": 0.3903, "step": 49800 }, { - "epoch": 1.75, - "learning_rate": 3.770878384024206e-05, - "loss": 0.2479, + "epoch": 1.794968825458608, + "grad_norm": 0.18103168904781342, + "learning_rate": 3.7110007464038375e-05, + "loss": 0.4436, "step": 49805 }, { - "epoch": 1.75, - "learning_rate": 3.7706330582077915e-05, - "loss": 0.2908, + "epoch": 1.7951490251198328, + "grad_norm": 0.20677992701530457, + "learning_rate": 3.710745446261444e-05, + "loss": 0.3946, "step": 49810 }, { - "epoch": 1.75, - "learning_rate": 3.770387715893043e-05, - "loss": 0.2954, + "epoch": 1.7953292247810575, + "grad_norm": 0.17719873785972595, + "learning_rate": 3.710490129622878e-05, + "loss": 0.4143, "step": 49815 }, { - "epoch": 1.75, - "learning_rate": 3.770142357083149e-05, - "loss": 0.3016, + "epoch": 1.795509424442282, + "grad_norm": 0.15315468609333038, + "learning_rate": 3.7102347964916183e-05, + "loss": 0.3798, "step": 49820 }, { - "epoch": 1.75, - "learning_rate": 3.769896981781292e-05, - "loss": 0.3029, + "epoch": 1.7956896241035065, + "grad_norm": 0.15976719558238983, + "learning_rate": 3.709979446871144e-05, + "loss": 0.4025, "step": 49825 }, { - "epoch": 1.75, - "learning_rate": 3.769651589990661e-05, - "loss": 0.2911, + "epoch": 1.7958698237647313, + "grad_norm": 0.19692429900169373, + "learning_rate": 3.7097240807649334e-05, + "loss": 0.4113, "step": 49830 }, { - "epoch": 1.75, - "learning_rate": 3.76940618171444e-05, - "loss": 0.2742, + "epoch": 1.796050023425956, + "grad_norm": 0.2119201272726059, + "learning_rate": 3.709468698176467e-05, + "loss": 0.4152, "step": 
49835 }, { - "epoch": 1.75, - "learning_rate": 3.769160756955816e-05, - "loss": 0.2924, + "epoch": 1.7962302230871807, + "grad_norm": 0.17838400602340698, + "learning_rate": 3.7092132991092236e-05, + "loss": 0.4198, "step": 49840 }, { - "epoch": 1.75, - "learning_rate": 3.768915315717976e-05, - "loss": 0.2557, + "epoch": 1.7964104227484052, + "grad_norm": 0.17005471885204315, + "learning_rate": 3.7089578835666834e-05, + "loss": 0.3886, "step": 49845 }, { - "epoch": 1.75, - "learning_rate": 3.7686698580041075e-05, - "loss": 0.2791, + "epoch": 1.7965906224096297, + "grad_norm": 0.18709275126457214, + "learning_rate": 3.708702451552326e-05, + "loss": 0.4055, "step": 49850 }, { - "epoch": 1.75, - "learning_rate": 3.7684243838173964e-05, - "loss": 0.2808, + "epoch": 1.7967708220708545, + "grad_norm": 0.21572504937648773, + "learning_rate": 3.708447003069631e-05, + "loss": 0.4095, "step": 49855 }, { - "epoch": 1.75, - "learning_rate": 3.7681788931610315e-05, - "loss": 0.2843, + "epoch": 1.7969510217320792, + "grad_norm": 0.17971207201480865, + "learning_rate": 3.70819153812208e-05, + "loss": 0.4189, "step": 49860 }, { - "epoch": 1.75, - "learning_rate": 3.7679333860381984e-05, - "loss": 0.2857, + "epoch": 1.797131221393304, + "grad_norm": 0.20288041234016418, + "learning_rate": 3.707936056713154e-05, + "loss": 0.4218, "step": 49865 }, { - "epoch": 1.75, - "learning_rate": 3.767687862452086e-05, - "loss": 0.2519, + "epoch": 1.7973114210545285, + "grad_norm": 0.18527214229106903, + "learning_rate": 3.7076805588463324e-05, + "loss": 0.4071, "step": 49870 }, { - "epoch": 1.75, - "learning_rate": 3.7674423224058825e-05, - "loss": 0.2961, + "epoch": 1.797491620715753, + "grad_norm": 0.17355002462863922, + "learning_rate": 3.7074250445250954e-05, + "loss": 0.408, "step": 49875 }, { - "epoch": 1.75, - "learning_rate": 3.7671967659027755e-05, - "loss": 0.2871, + "epoch": 1.7976718203769777, + "grad_norm": 0.16921040415763855, + "learning_rate": 3.707169513752928e-05, + "loss": 0.4111, "step": 49880 }, { - "epoch": 1.76, - "learning_rate": 3.766951192945954e-05, - "loss": 0.3087, + "epoch": 1.7978520200382024, + "grad_norm": 0.1597539633512497, + "learning_rate": 3.70691396653331e-05, + "loss": 0.4486, "step": 49885 }, { - "epoch": 1.76, - "learning_rate": 3.766705603538606e-05, - "loss": 0.2804, + "epoch": 1.798032219699427, + "grad_norm": 0.1976759433746338, + "learning_rate": 3.7066584028697224e-05, + "loss": 0.3784, "step": 49890 }, { - "epoch": 1.76, - "learning_rate": 3.76645999768392e-05, - "loss": 0.2909, + "epoch": 1.7982124193606515, + "grad_norm": 0.1847647726535797, + "learning_rate": 3.706402822765647e-05, + "loss": 0.4264, "step": 49895 }, { - "epoch": 1.76, - "learning_rate": 3.7662143753850864e-05, - "loss": 0.2842, + "epoch": 1.7983926190218762, + "grad_norm": 0.2059379369020462, + "learning_rate": 3.7061472262245664e-05, + "loss": 0.3677, "step": 49900 }, { - "epoch": 1.76, - "learning_rate": 3.7659687366452934e-05, - "loss": 0.2629, + "epoch": 1.798572818683101, + "grad_norm": 0.18027707934379578, + "learning_rate": 3.7058916132499645e-05, + "loss": 0.4286, "step": 49905 }, { - "epoch": 1.76, - "learning_rate": 3.76572308146773e-05, - "loss": 0.2873, + "epoch": 1.7987530183443257, + "grad_norm": 0.1781684011220932, + "learning_rate": 3.705635983845322e-05, + "loss": 0.447, "step": 49910 }, { - "epoch": 1.76, - "learning_rate": 3.7654774098555874e-05, - "loss": 0.289, + "epoch": 1.7989332180055502, + "grad_norm": 0.14217472076416016, + "learning_rate": 3.7053803380141233e-05, + "loss": 
0.3952, "step": 49915 }, { - "epoch": 1.76, - "learning_rate": 3.765231721812054e-05, - "loss": 0.2851, + "epoch": 1.7991134176667747, + "grad_norm": 0.17906373739242554, + "learning_rate": 3.70512467575985e-05, + "loss": 0.4536, "step": 49920 }, { - "epoch": 1.76, - "learning_rate": 3.764986017340321e-05, - "loss": 0.2801, + "epoch": 1.7992936173279994, + "grad_norm": 0.18036003410816193, + "learning_rate": 3.7048689970859874e-05, + "loss": 0.3784, "step": 49925 }, { - "epoch": 1.76, - "learning_rate": 3.764740296443577e-05, - "loss": 0.3052, + "epoch": 1.7994738169892242, + "grad_norm": 0.15445788204669952, + "learning_rate": 3.704613301996017e-05, + "loss": 0.4119, "step": 49930 }, { - "epoch": 1.76, - "learning_rate": 3.764494559125016e-05, - "loss": 0.2867, + "epoch": 1.7996540166504487, + "grad_norm": 0.20820528268814087, + "learning_rate": 3.7043575904934246e-05, + "loss": 0.4091, "step": 49935 }, { - "epoch": 1.76, - "learning_rate": 3.7642488053878246e-05, - "loss": 0.2873, + "epoch": 1.7998342163116732, + "grad_norm": 0.1830679327249527, + "learning_rate": 3.7041018625816926e-05, + "loss": 0.4069, "step": 49940 }, { - "epoch": 1.76, - "learning_rate": 3.7640030352351965e-05, - "loss": 0.3017, + "epoch": 1.800014415972898, + "grad_norm": 0.18425950407981873, + "learning_rate": 3.703846118264306e-05, + "loss": 0.4254, "step": 49945 }, { - "epoch": 1.76, - "learning_rate": 3.763757248670321e-05, - "loss": 0.2837, + "epoch": 1.8001946156341226, + "grad_norm": 0.19316412508487701, + "learning_rate": 3.703590357544749e-05, + "loss": 0.4153, "step": 49950 }, { - "epoch": 1.76, - "learning_rate": 3.7635114456963914e-05, - "loss": 0.2532, + "epoch": 1.8003748152953474, + "grad_norm": 0.17735867202281952, + "learning_rate": 3.7033345804265054e-05, + "loss": 0.3632, "step": 49955 }, { - "epoch": 1.76, - "learning_rate": 3.7632656263165984e-05, - "loss": 0.3039, + "epoch": 1.8005550149565719, + "grad_norm": 0.22728107869625092, + "learning_rate": 3.703078786913063e-05, + "loss": 0.3849, "step": 49960 }, { - "epoch": 1.76, - "learning_rate": 3.7630197905341336e-05, - "loss": 0.2665, + "epoch": 1.8007352146177964, + "grad_norm": 0.16942258179187775, + "learning_rate": 3.702822977007904e-05, + "loss": 0.3961, "step": 49965 }, { - "epoch": 1.76, - "learning_rate": 3.762773938352189e-05, - "loss": 0.2835, + "epoch": 1.8009154142790211, + "grad_norm": 0.1619909405708313, + "learning_rate": 3.7025671507145156e-05, + "loss": 0.4082, "step": 49970 }, { - "epoch": 1.76, - "learning_rate": 3.7625280697739574e-05, - "loss": 0.2875, + "epoch": 1.8010956139402459, + "grad_norm": 0.17971543967723846, + "learning_rate": 3.702311308036381e-05, + "loss": 0.4318, "step": 49975 }, { - "epoch": 1.76, - "learning_rate": 3.76228218480263e-05, - "loss": 0.2812, + "epoch": 1.8012758136014704, + "grad_norm": 0.19710752367973328, + "learning_rate": 3.702055448976989e-05, + "loss": 0.4109, "step": 49980 }, { - "epoch": 1.76, - "learning_rate": 3.762036283441401e-05, - "loss": 0.2623, + "epoch": 1.801456013262695, + "grad_norm": 0.21794439852237701, + "learning_rate": 3.7017995735398237e-05, + "loss": 0.3768, "step": 49985 }, { - "epoch": 1.76, - "learning_rate": 3.761790365693463e-05, - "loss": 0.2721, + "epoch": 1.8016362129239196, + "grad_norm": 0.21101748943328857, + "learning_rate": 3.7015436817283724e-05, + "loss": 0.4516, "step": 49990 }, { - "epoch": 1.76, - "learning_rate": 3.761544431562008e-05, - "loss": 0.2883, + "epoch": 1.8018164125851444, + "grad_norm": 0.21973469853401184, + "learning_rate": 
3.70128777354612e-05, + "loss": 0.4238, "step": 49995 }, { - "epoch": 1.76, - "learning_rate": 3.76129848105023e-05, - "loss": 0.2767, + "epoch": 1.801996612246369, + "grad_norm": 0.1689206063747406, + "learning_rate": 3.701031848996555e-05, + "loss": 0.4313, "step": 50000 }, { - "epoch": 1.76, - "eval_loss": 0.2745961546897888, - "eval_runtime": 10.5442, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 1.801996612246369, + "eval_loss": 0.4403511881828308, + "eval_runtime": 3.5415, + "eval_samples_per_second": 28.237, + "eval_steps_per_second": 7.059, "step": 50000 }, { - "epoch": 1.76, - "learning_rate": 3.7610525141613226e-05, - "loss": 0.2897, + "epoch": 1.8021768119075936, + "grad_norm": 0.17889846861362457, + "learning_rate": 3.700775908083164e-05, + "loss": 0.4117, "step": 50005 }, { - "epoch": 1.76, - "learning_rate": 3.7608065308984786e-05, - "loss": 0.3061, + "epoch": 1.8023570115688181, + "grad_norm": 0.18117192387580872, + "learning_rate": 3.7005199508094326e-05, + "loss": 0.4065, "step": 50010 }, { - "epoch": 1.76, - "learning_rate": 3.760560531264894e-05, - "loss": 0.3162, + "epoch": 1.8025372112300428, + "grad_norm": 0.18847104907035828, + "learning_rate": 3.700263977178851e-05, + "loss": 0.4004, "step": 50015 }, { - "epoch": 1.76, - "learning_rate": 3.760314515263761e-05, - "loss": 0.2929, + "epoch": 1.8027174108912676, + "grad_norm": 0.19416354596614838, + "learning_rate": 3.700007987194903e-05, + "loss": 0.4118, "step": 50020 }, { - "epoch": 1.76, - "learning_rate": 3.7600684828982747e-05, - "loss": 0.2934, + "epoch": 1.8028976105524923, + "grad_norm": 0.19143450260162354, + "learning_rate": 3.69975198086108e-05, + "loss": 0.4157, "step": 50025 }, { - "epoch": 1.76, - "learning_rate": 3.759822434171629e-05, - "loss": 0.2888, + "epoch": 1.8030778102137168, + "grad_norm": 0.17741017043590546, + "learning_rate": 3.699495958180868e-05, + "loss": 0.3936, "step": 50030 }, { - "epoch": 1.76, - "learning_rate": 3.759576369087019e-05, - "loss": 0.3187, + "epoch": 1.8032580098749413, + "grad_norm": 0.18034659326076508, + "learning_rate": 3.6992399191577554e-05, + "loss": 0.4407, "step": 50035 }, { - "epoch": 1.76, - "learning_rate": 3.759330287647641e-05, - "loss": 0.2742, + "epoch": 1.803438209536166, + "grad_norm": 0.17005987465381622, + "learning_rate": 3.698983863795232e-05, + "loss": 0.4033, "step": 50040 }, { - "epoch": 1.76, - "learning_rate": 3.7590841898566884e-05, - "loss": 0.2864, + "epoch": 1.8036184091973908, + "grad_norm": 0.19476930797100067, + "learning_rate": 3.698727792096785e-05, + "loss": 0.4351, "step": 50045 }, { - "epoch": 1.76, - "learning_rate": 3.7588380757173586e-05, - "loss": 0.2757, + "epoch": 1.8037986088586153, + "grad_norm": 0.2065165936946869, + "learning_rate": 3.698471704065904e-05, + "loss": 0.4046, "step": 50050 }, { - "epoch": 1.76, - "learning_rate": 3.758591945232845e-05, - "loss": 0.2854, + "epoch": 1.8039788085198398, + "grad_norm": 0.21584558486938477, + "learning_rate": 3.698215599706078e-05, + "loss": 0.3787, "step": 50055 }, { - "epoch": 1.76, - "learning_rate": 3.758345798406344e-05, - "loss": 0.2772, + "epoch": 1.8041590081810646, + "grad_norm": 0.19947922229766846, + "learning_rate": 3.6979594790207964e-05, + "loss": 0.4095, "step": 50060 }, { - "epoch": 1.76, - "learning_rate": 3.758099635241053e-05, - "loss": 0.3134, + "epoch": 1.8043392078422893, + "grad_norm": 0.1538824588060379, + "learning_rate": 3.697703342013549e-05, + "loss": 0.39, "step": 50065 }, { - "epoch": 1.76, - "learning_rate": 
3.757853455740167e-05, - "loss": 0.2818, + "epoch": 1.804519407503514, + "grad_norm": 0.19563712179660797, + "learning_rate": 3.6974471886878255e-05, + "loss": 0.4194, "step": 50070 }, { - "epoch": 1.76, - "learning_rate": 3.7576072599068834e-05, - "loss": 0.2701, + "epoch": 1.8046996071647385, + "grad_norm": 0.18470360338687897, + "learning_rate": 3.697191019047116e-05, + "loss": 0.3901, "step": 50075 }, { - "epoch": 1.76, - "learning_rate": 3.7573610477443974e-05, - "loss": 0.2831, + "epoch": 1.804879806825963, + "grad_norm": 0.24419862031936646, + "learning_rate": 3.69693483309491e-05, + "loss": 0.4253, "step": 50080 }, { - "epoch": 1.76, - "learning_rate": 3.757114819255908e-05, - "loss": 0.2732, + "epoch": 1.8050600064871878, + "grad_norm": 0.1856248527765274, + "learning_rate": 3.696678630834699e-05, + "loss": 0.4468, "step": 50085 }, { - "epoch": 1.76, - "learning_rate": 3.7568685744446094e-05, - "loss": 0.2824, + "epoch": 1.8052402061484125, + "grad_norm": 0.2281470149755478, + "learning_rate": 3.696422412269974e-05, + "loss": 0.3804, "step": 50090 }, { - "epoch": 1.76, - "learning_rate": 3.756622313313702e-05, - "loss": 0.2878, + "epoch": 1.805420405809637, + "grad_norm": 0.17991898953914642, + "learning_rate": 3.696166177404224e-05, + "loss": 0.3965, "step": 50095 }, { - "epoch": 1.76, - "learning_rate": 3.7563760358663814e-05, - "loss": 0.287, + "epoch": 1.8056006054708618, + "grad_norm": 0.15711066126823425, + "learning_rate": 3.6959099262409425e-05, + "loss": 0.3968, "step": 50100 }, { - "epoch": 1.76, - "learning_rate": 3.756129742105846e-05, - "loss": 0.2736, + "epoch": 1.8057808051320863, + "grad_norm": 0.18689851462841034, + "learning_rate": 3.6956536587836186e-05, + "loss": 0.3862, "step": 50105 }, { - "epoch": 1.76, - "learning_rate": 3.755883432035294e-05, - "loss": 0.309, + "epoch": 1.805961004793311, + "grad_norm": 0.13950034976005554, + "learning_rate": 3.695397375035747e-05, + "loss": 0.4016, "step": 50110 }, { - "epoch": 1.76, - "learning_rate": 3.755637105657923e-05, - "loss": 0.2891, + "epoch": 1.8061412044545357, + "grad_norm": 0.19982732832431793, + "learning_rate": 3.695141075000816e-05, + "loss": 0.4065, "step": 50115 }, { - "epoch": 1.76, - "learning_rate": 3.755390762976933e-05, - "loss": 0.2989, + "epoch": 1.8063214041157603, + "grad_norm": 0.1736806035041809, + "learning_rate": 3.6948847586823196e-05, + "loss": 0.4303, "step": 50120 }, { - "epoch": 1.76, - "learning_rate": 3.7551444039955196e-05, - "loss": 0.29, + "epoch": 1.8065016037769848, + "grad_norm": 0.18712177872657776, + "learning_rate": 3.69462842608375e-05, + "loss": 0.4068, "step": 50125 }, { - "epoch": 1.76, - "learning_rate": 3.7548980287168836e-05, - "loss": 0.2877, + "epoch": 1.8066818034382095, + "grad_norm": 0.2055433690547943, + "learning_rate": 3.694372077208599e-05, + "loss": 0.4056, "step": 50130 }, { - "epoch": 1.76, - "learning_rate": 3.7546516371442244e-05, - "loss": 0.2558, + "epoch": 1.8068620030994342, + "grad_norm": 0.1870342344045639, + "learning_rate": 3.69411571206036e-05, + "loss": 0.4669, "step": 50135 }, { - "epoch": 1.76, - "learning_rate": 3.75440522928074e-05, - "loss": 0.286, + "epoch": 1.807042202760659, + "grad_norm": 0.224535271525383, + "learning_rate": 3.6938593306425255e-05, + "loss": 0.4317, "step": 50140 }, { - "epoch": 1.76, - "learning_rate": 3.75415880512963e-05, - "loss": 0.2894, + "epoch": 1.8072224024218835, + "grad_norm": 0.2587486505508423, + "learning_rate": 3.693602932958589e-05, + "loss": 0.4924, "step": 50145 }, { - "epoch": 1.76, - 
"learning_rate": 3.753912364694095e-05, - "loss": 0.2904, + "epoch": 1.807402602083108, + "grad_norm": 0.22826701402664185, + "learning_rate": 3.6933465190120434e-05, + "loss": 0.4257, "step": 50150 }, { - "epoch": 1.76, - "learning_rate": 3.7536659079773336e-05, - "loss": 0.2695, + "epoch": 1.8075828017443327, + "grad_norm": 0.1683250367641449, + "learning_rate": 3.693090088806383e-05, + "loss": 0.4344, "step": 50155 }, { - "epoch": 1.76, - "learning_rate": 3.753419434982548e-05, - "loss": 0.2806, + "epoch": 1.8077630014055575, + "grad_norm": 0.1752661168575287, + "learning_rate": 3.6928336423451e-05, + "loss": 0.4172, "step": 50160 }, { - "epoch": 1.76, - "learning_rate": 3.753172945712936e-05, - "loss": 0.2857, + "epoch": 1.807943201066782, + "grad_norm": 0.16361607611179352, + "learning_rate": 3.69257717963169e-05, + "loss": 0.4046, "step": 50165 }, { - "epoch": 1.77, - "learning_rate": 3.752926440171699e-05, - "loss": 0.3019, + "epoch": 1.8081234007280065, + "grad_norm": 0.18544571101665497, + "learning_rate": 3.692320700669648e-05, + "loss": 0.4402, "step": 50170 }, { - "epoch": 1.77, - "learning_rate": 3.7526799183620384e-05, - "loss": 0.2934, + "epoch": 1.8083036003892312, + "grad_norm": 0.16442377865314484, + "learning_rate": 3.6920642054624655e-05, + "loss": 0.3902, "step": 50175 }, { - "epoch": 1.77, - "learning_rate": 3.752433380287155e-05, - "loss": 0.2921, + "epoch": 1.808483800050456, + "grad_norm": 0.19460293650627136, + "learning_rate": 3.6918076940136406e-05, + "loss": 0.4246, "step": 50180 }, { - "epoch": 1.77, - "learning_rate": 3.752186825950249e-05, - "loss": 0.2819, + "epoch": 1.8086639997116807, + "grad_norm": 0.1653093844652176, + "learning_rate": 3.691551166326665e-05, + "loss": 0.4109, "step": 50185 }, { - "epoch": 1.77, - "learning_rate": 3.7519402553545224e-05, - "loss": 0.2756, + "epoch": 1.8088441993729052, + "grad_norm": 0.17277853190898895, + "learning_rate": 3.691294622405037e-05, + "loss": 0.3898, "step": 50190 }, { - "epoch": 1.77, - "learning_rate": 3.751693668503177e-05, - "loss": 0.2899, + "epoch": 1.8090243990341297, + "grad_norm": 0.19189266860485077, + "learning_rate": 3.69103806225225e-05, + "loss": 0.4178, "step": 50195 }, { - "epoch": 1.77, - "learning_rate": 3.7514470653994124e-05, - "loss": 0.3066, + "epoch": 1.8092045986953544, + "grad_norm": 0.18027593195438385, + "learning_rate": 3.690781485871799e-05, + "loss": 0.4507, "step": 50200 }, { - "epoch": 1.77, - "learning_rate": 3.751200446046434e-05, - "loss": 0.3111, + "epoch": 1.8093847983565792, + "grad_norm": 0.17248685657978058, + "learning_rate": 3.690524893267181e-05, + "loss": 0.3912, "step": 50205 }, { - "epoch": 1.77, - "learning_rate": 3.750953810447443e-05, - "loss": 0.2624, + "epoch": 1.8095649980178037, + "grad_norm": 0.21308831870555878, + "learning_rate": 3.690268284441893e-05, + "loss": 0.4291, "step": 50210 }, { - "epoch": 1.77, - "learning_rate": 3.750707158605641e-05, - "loss": 0.267, + "epoch": 1.8097451976790284, + "grad_norm": 0.19350473582744598, + "learning_rate": 3.6900116593994295e-05, + "loss": 0.4422, "step": 50215 }, { - "epoch": 1.77, - "learning_rate": 3.7504604905242296e-05, - "loss": 0.2832, + "epoch": 1.809925397340253, + "grad_norm": 0.17806485295295715, + "learning_rate": 3.6897550181432865e-05, + "loss": 0.3978, "step": 50220 }, { - "epoch": 1.77, - "learning_rate": 3.750213806206413e-05, - "loss": 0.271, + "epoch": 1.8101055970014777, + "grad_norm": 0.17208920419216156, + "learning_rate": 3.689498360676963e-05, + "loss": 0.3817, "step": 50225 }, { - 
"epoch": 1.77, - "learning_rate": 3.749967105655394e-05, - "loss": 0.3105, + "epoch": 1.8102857966627024, + "grad_norm": 0.21627385914325714, + "learning_rate": 3.689241687003955e-05, + "loss": 0.3997, "step": 50230 }, { - "epoch": 1.77, - "learning_rate": 3.749720388874377e-05, - "loss": 0.2642, + "epoch": 1.810465996323927, + "grad_norm": 0.16115298867225647, + "learning_rate": 3.688984997127758e-05, + "loss": 0.4023, "step": 50235 }, { - "epoch": 1.77, - "learning_rate": 3.749473655866563e-05, - "loss": 0.2783, + "epoch": 1.8106461959851514, + "grad_norm": 0.19665555655956268, + "learning_rate": 3.688728291051871e-05, + "loss": 0.4401, "step": 50240 }, { - "epoch": 1.77, - "learning_rate": 3.749226906635159e-05, - "loss": 0.2739, + "epoch": 1.8108263956463762, + "grad_norm": 0.1811586618423462, + "learning_rate": 3.688471568779791e-05, + "loss": 0.4217, "step": 50245 }, { - "epoch": 1.77, - "learning_rate": 3.7489801411833644e-05, - "loss": 0.2714, + "epoch": 1.8110065953076009, + "grad_norm": 0.19190897047519684, + "learning_rate": 3.6882148303150166e-05, + "loss": 0.4226, "step": 50250 }, { - "epoch": 1.77, - "learning_rate": 3.7487333595143875e-05, - "loss": 0.2904, + "epoch": 1.8111867949688256, + "grad_norm": 0.16638103127479553, + "learning_rate": 3.687958075661045e-05, + "loss": 0.3893, "step": 50255 }, { - "epoch": 1.77, - "learning_rate": 3.74848656163143e-05, - "loss": 0.293, + "epoch": 1.8113669946300501, + "grad_norm": 0.18631191551685333, + "learning_rate": 3.687701304821374e-05, + "loss": 0.3919, "step": 50260 }, { - "epoch": 1.77, - "learning_rate": 3.748239747537697e-05, - "loss": 0.2806, + "epoch": 1.8115471942912746, + "grad_norm": 0.16074827313423157, + "learning_rate": 3.687444517799503e-05, + "loss": 0.3798, "step": 50265 }, { - "epoch": 1.77, - "learning_rate": 3.747992917236395e-05, - "loss": 0.2623, + "epoch": 1.8117273939524994, + "grad_norm": 0.18698133528232574, + "learning_rate": 3.6871877145989305e-05, + "loss": 0.3859, "step": 50270 }, { - "epoch": 1.77, - "learning_rate": 3.7477460707307265e-05, - "loss": 0.298, + "epoch": 1.811907593613724, + "grad_norm": 0.1874270737171173, + "learning_rate": 3.686930895223156e-05, + "loss": 0.3785, "step": 50275 }, { - "epoch": 1.77, - "learning_rate": 3.747499208023897e-05, - "loss": 0.2629, + "epoch": 1.8120877932749486, + "grad_norm": 0.17781777679920197, + "learning_rate": 3.686674059675677e-05, + "loss": 0.3977, "step": 50280 }, { - "epoch": 1.77, - "learning_rate": 3.747252329119113e-05, - "loss": 0.2498, + "epoch": 1.8122679929361731, + "grad_norm": 0.17600201070308685, + "learning_rate": 3.686417207959994e-05, + "loss": 0.4425, "step": 50285 }, { - "epoch": 1.77, - "learning_rate": 3.74700543401958e-05, - "loss": 0.2924, + "epoch": 1.8124481925973979, + "grad_norm": 0.17503990232944489, + "learning_rate": 3.686160340079605e-05, + "loss": 0.3825, "step": 50290 }, { - "epoch": 1.77, - "learning_rate": 3.746758522728502e-05, - "loss": 0.2857, + "epoch": 1.8126283922586226, + "grad_norm": 0.18147842586040497, + "learning_rate": 3.6859034560380125e-05, + "loss": 0.4118, "step": 50295 }, { - "epoch": 1.77, - "learning_rate": 3.7465115952490876e-05, - "loss": 0.2857, + "epoch": 1.8128085919198473, + "grad_norm": 0.19389557838439941, + "learning_rate": 3.6856465558387144e-05, + "loss": 0.3737, "step": 50300 }, { - "epoch": 1.77, - "learning_rate": 3.746264651584541e-05, - "loss": 0.2751, + "epoch": 1.8129887915810718, + "grad_norm": 0.17562417685985565, + "learning_rate": 3.685389639485211e-05, + "loss": 0.3936, "step": 
50305 }, { - "epoch": 1.77, - "learning_rate": 3.746017691738069e-05, - "loss": 0.2849, + "epoch": 1.8131689912422964, + "grad_norm": 0.16235031187534332, + "learning_rate": 3.685132706981004e-05, + "loss": 0.4133, "step": 50310 }, { - "epoch": 1.77, - "learning_rate": 3.7457707157128784e-05, - "loss": 0.2778, + "epoch": 1.813349190903521, + "grad_norm": 0.16100172698497772, + "learning_rate": 3.684875758329593e-05, + "loss": 0.401, "step": 50315 }, { - "epoch": 1.77, - "learning_rate": 3.7455237235121765e-05, - "loss": 0.2708, + "epoch": 1.8135293905647458, + "grad_norm": 0.1928766369819641, + "learning_rate": 3.68461879353448e-05, + "loss": 0.4323, "step": 50320 }, { - "epoch": 1.77, - "learning_rate": 3.74527671513917e-05, - "loss": 0.276, + "epoch": 1.8137095902259703, + "grad_norm": 0.1594439446926117, + "learning_rate": 3.684361812599164e-05, + "loss": 0.4094, "step": 50325 }, { - "epoch": 1.77, - "learning_rate": 3.745029690597065e-05, - "loss": 0.314, + "epoch": 1.8138897898871948, + "grad_norm": 0.24857677519321442, + "learning_rate": 3.684104815527149e-05, + "loss": 0.3897, "step": 50330 }, { - "epoch": 1.77, - "learning_rate": 3.7447826498890705e-05, - "loss": 0.2613, + "epoch": 1.8140699895484196, + "grad_norm": 0.17496556043624878, + "learning_rate": 3.6838478023219344e-05, + "loss": 0.3763, "step": 50335 }, { - "epoch": 1.77, - "learning_rate": 3.7445355930183945e-05, - "loss": 0.2961, + "epoch": 1.8142501892096443, + "grad_norm": 0.2180209904909134, + "learning_rate": 3.683590772987022e-05, + "loss": 0.3818, "step": 50340 }, { - "epoch": 1.77, - "learning_rate": 3.744288519988243e-05, - "loss": 0.297, + "epoch": 1.814430388870869, + "grad_norm": 0.18124771118164062, + "learning_rate": 3.683333727525916e-05, + "loss": 0.4156, "step": 50345 }, { - "epoch": 1.77, - "learning_rate": 3.744041430801826e-05, - "loss": 0.2742, + "epoch": 1.8146105885320936, + "grad_norm": 0.2114446610212326, + "learning_rate": 3.683076665942115e-05, + "loss": 0.4311, "step": 50350 }, { - "epoch": 1.77, - "learning_rate": 3.74379432546235e-05, - "loss": 0.2893, + "epoch": 1.814790788193318, + "grad_norm": 0.1923656463623047, + "learning_rate": 3.682819588239126e-05, + "loss": 0.4145, "step": 50355 }, { - "epoch": 1.77, - "learning_rate": 3.743547203973024e-05, - "loss": 0.2973, + "epoch": 1.8149709878545428, + "grad_norm": 0.16528750956058502, + "learning_rate": 3.682562494420447e-05, + "loss": 0.3791, "step": 50360 }, { - "epoch": 1.77, - "learning_rate": 3.7433000663370575e-05, - "loss": 0.2927, + "epoch": 1.8151511875157675, + "grad_norm": 0.18421389162540436, + "learning_rate": 3.682305384489585e-05, + "loss": 0.473, "step": 50365 }, { - "epoch": 1.77, - "learning_rate": 3.74305291255766e-05, - "loss": 0.2889, + "epoch": 1.8153313871769923, + "grad_norm": 0.18511919677257538, + "learning_rate": 3.682048258450039e-05, + "loss": 0.4415, "step": 50370 }, { - "epoch": 1.77, - "learning_rate": 3.742805742638038e-05, - "loss": 0.2961, + "epoch": 1.8155115868382168, + "grad_norm": 0.1795085072517395, + "learning_rate": 3.681791116305315e-05, + "loss": 0.4432, "step": 50375 }, { - "epoch": 1.77, - "learning_rate": 3.742558556581403e-05, - "loss": 0.3069, + "epoch": 1.8156917864994413, + "grad_norm": 0.16460351645946503, + "learning_rate": 3.6815339580589165e-05, + "loss": 0.3958, "step": 50380 }, { - "epoch": 1.77, - "learning_rate": 3.742311354390965e-05, - "loss": 0.2718, + "epoch": 1.815871986160666, + "grad_norm": 0.18292105197906494, + "learning_rate": 3.6812767837143455e-05, + "loss": 0.4311, 
"step": 50385 }, { - "epoch": 1.77, - "learning_rate": 3.7420641360699315e-05, - "loss": 0.2959, + "epoch": 1.8160521858218908, + "grad_norm": 0.19992749392986298, + "learning_rate": 3.681019593275108e-05, + "loss": 0.4179, "step": 50390 }, { - "epoch": 1.77, - "learning_rate": 3.741816901621515e-05, - "loss": 0.2958, + "epoch": 1.8162323854831153, + "grad_norm": 0.1999056190252304, + "learning_rate": 3.680762386744707e-05, + "loss": 0.4146, "step": 50395 }, { - "epoch": 1.77, - "learning_rate": 3.741569651048923e-05, - "loss": 0.2924, + "epoch": 1.8164125851443398, + "grad_norm": 0.16766919195652008, + "learning_rate": 3.6805051641266476e-05, + "loss": 0.3939, "step": 50400 }, { - "epoch": 1.77, - "learning_rate": 3.7413223843553694e-05, - "loss": 0.2873, + "epoch": 1.8165927848055645, + "grad_norm": 0.1673685610294342, + "learning_rate": 3.6802479254244327e-05, + "loss": 0.416, "step": 50405 }, { - "epoch": 1.77, - "learning_rate": 3.74107510154406e-05, - "loss": 0.2992, + "epoch": 1.8167729844667893, + "grad_norm": 0.2412656992673874, + "learning_rate": 3.679990670641569e-05, + "loss": 0.4164, "step": 50410 }, { - "epoch": 1.77, - "learning_rate": 3.7408278026182106e-05, - "loss": 0.2784, + "epoch": 1.816953184128014, + "grad_norm": 0.18288861215114594, + "learning_rate": 3.679733399781561e-05, + "loss": 0.4088, "step": 50415 }, { - "epoch": 1.77, - "learning_rate": 3.7405804875810286e-05, - "loss": 0.2861, + "epoch": 1.8171333837892385, + "grad_norm": 0.15355655550956726, + "learning_rate": 3.6794761128479125e-05, + "loss": 0.4283, "step": 50420 }, { - "epoch": 1.77, - "learning_rate": 3.740333156435727e-05, - "loss": 0.2779, + "epoch": 1.817313583450463, + "grad_norm": 0.1338133066892624, + "learning_rate": 3.679218809844132e-05, + "loss": 0.3979, "step": 50425 }, { - "epoch": 1.77, - "learning_rate": 3.740085809185517e-05, - "loss": 0.2882, + "epoch": 1.8174937831116877, + "grad_norm": 0.2555032968521118, + "learning_rate": 3.6789614907737226e-05, + "loss": 0.4351, "step": 50430 }, { - "epoch": 1.77, - "learning_rate": 3.73983844583361e-05, - "loss": 0.2531, + "epoch": 1.8176739827729125, + "grad_norm": 0.1840798258781433, + "learning_rate": 3.6787041556401914e-05, + "loss": 0.3992, "step": 50435 }, { - "epoch": 1.77, - "learning_rate": 3.7395910663832175e-05, - "loss": 0.2711, + "epoch": 1.817854182434137, + "grad_norm": 0.18239086866378784, + "learning_rate": 3.678446804447044e-05, + "loss": 0.3901, "step": 50440 }, { - "epoch": 1.77, - "learning_rate": 3.7393436708375516e-05, - "loss": 0.2866, + "epoch": 1.8180343820953615, + "grad_norm": 0.2021910846233368, + "learning_rate": 3.678189437197788e-05, + "loss": 0.4164, "step": 50445 }, { - "epoch": 1.77, - "learning_rate": 3.739096259199825e-05, - "loss": 0.2892, + "epoch": 1.8182145817565862, + "grad_norm": 0.22721827030181885, + "learning_rate": 3.6779320538959275e-05, + "loss": 0.4761, "step": 50450 }, { - "epoch": 1.78, - "learning_rate": 3.73884883147325e-05, - "loss": 0.2926, + "epoch": 1.818394781417811, + "grad_norm": 0.148165762424469, + "learning_rate": 3.6776746545449715e-05, + "loss": 0.3898, "step": 50455 }, { - "epoch": 1.78, - "learning_rate": 3.73860138766104e-05, - "loss": 0.3057, + "epoch": 1.8185749810790357, + "grad_norm": 0.1854783147573471, + "learning_rate": 3.677417239148428e-05, + "loss": 0.3955, "step": 50460 }, { - "epoch": 1.78, - "learning_rate": 3.7383539277664074e-05, - "loss": 0.2761, + "epoch": 1.8187551807402602, + "grad_norm": 0.18385660648345947, + "learning_rate": 3.6771598077098e-05, + "loss": 
0.4202, "step": 50465 }, { - "epoch": 1.78, - "learning_rate": 3.738106451792564e-05, - "loss": 0.2693, + "epoch": 1.8189353804014847, + "grad_norm": 0.19717253744602203, + "learning_rate": 3.6769023602325985e-05, + "loss": 0.3799, "step": 50470 }, { - "epoch": 1.78, - "learning_rate": 3.737858959742726e-05, - "loss": 0.2797, + "epoch": 1.8191155800627095, + "grad_norm": 0.20641092956066132, + "learning_rate": 3.67664489672033e-05, + "loss": 0.4063, "step": 50475 }, { - "epoch": 1.78, - "learning_rate": 3.7376114516201036e-05, - "loss": 0.2759, + "epoch": 1.8192957797239342, + "grad_norm": 0.18371793627738953, + "learning_rate": 3.676387417176503e-05, + "loss": 0.4177, "step": 50480 }, { - "epoch": 1.78, - "learning_rate": 3.737363927427913e-05, - "loss": 0.2778, + "epoch": 1.8194759793851587, + "grad_norm": 0.16387097537517548, + "learning_rate": 3.676129921604625e-05, + "loss": 0.4037, "step": 50485 }, { - "epoch": 1.78, - "learning_rate": 3.737116387169367e-05, - "loss": 0.3123, + "epoch": 1.8196561790463834, + "grad_norm": 0.2013285756111145, + "learning_rate": 3.675872410008204e-05, + "loss": 0.4411, "step": 50490 }, { - "epoch": 1.78, - "learning_rate": 3.7368688308476796e-05, - "loss": 0.2828, + "epoch": 1.819836378707608, + "grad_norm": 0.17570237815380096, + "learning_rate": 3.675614882390751e-05, + "loss": 0.4142, "step": 50495 }, { - "epoch": 1.78, - "learning_rate": 3.736621258466066e-05, - "loss": 0.2872, + "epoch": 1.8200165783688327, + "grad_norm": 0.17556174099445343, + "learning_rate": 3.675357338755771e-05, + "loss": 0.4109, "step": 50500 }, { - "epoch": 1.78, - "eval_loss": 0.27529093623161316, - "eval_runtime": 10.5518, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 1.8200165783688327, + "eval_loss": 0.4410496950149536, + "eval_runtime": 3.5389, + "eval_samples_per_second": 28.257, + "eval_steps_per_second": 7.064, "step": 50500 }, { - "epoch": 1.78, - "learning_rate": 3.73637367002774e-05, - "loss": 0.27, + "epoch": 1.8201967780300574, + "grad_norm": 0.18107643723487854, + "learning_rate": 3.675099779106775e-05, + "loss": 0.3993, "step": 50505 }, { - "epoch": 1.78, - "learning_rate": 3.736126065535918e-05, - "loss": 0.2908, + "epoch": 1.820376977691282, + "grad_norm": 0.20702654123306274, + "learning_rate": 3.6748422034472725e-05, + "loss": 0.4126, "step": 50510 }, { - "epoch": 1.78, - "learning_rate": 3.735878444993812e-05, - "loss": 0.3056, + "epoch": 1.8205571773525064, + "grad_norm": 0.15964628756046295, + "learning_rate": 3.674584611780772e-05, + "loss": 0.406, "step": 50515 }, { - "epoch": 1.78, - "learning_rate": 3.73563080840464e-05, - "loss": 0.2857, + "epoch": 1.8207373770137312, + "grad_norm": 0.16199252009391785, + "learning_rate": 3.6743270041107846e-05, + "loss": 0.4537, "step": 50520 }, { - "epoch": 1.78, - "learning_rate": 3.735383155771616e-05, - "loss": 0.3068, + "epoch": 1.820917576674956, + "grad_norm": 0.23481187224388123, + "learning_rate": 3.6740693804408175e-05, + "loss": 0.4111, "step": 50525 }, { - "epoch": 1.78, - "learning_rate": 3.735135487097955e-05, - "loss": 0.2963, + "epoch": 1.8210977763361806, + "grad_norm": 0.1678016185760498, + "learning_rate": 3.673811740774384e-05, + "loss": 0.4329, "step": 50530 }, { - "epoch": 1.78, - "learning_rate": 3.734887802386874e-05, - "loss": 0.3042, + "epoch": 1.8212779759974052, + "grad_norm": 0.16724498569965363, + "learning_rate": 3.673554085114991e-05, + "loss": 0.4037, "step": 50535 }, { - "epoch": 1.78, - "learning_rate": 3.734640101641589e-05, - "loss": 0.2815, + 
"epoch": 1.8214581756586297, + "grad_norm": 0.17332807183265686, + "learning_rate": 3.673296413466153e-05, + "loss": 0.3876, "step": 50540 }, { - "epoch": 1.78, - "learning_rate": 3.734392384865316e-05, - "loss": 0.2768, + "epoch": 1.8216383753198544, + "grad_norm": 0.20103994011878967, + "learning_rate": 3.673038725831377e-05, + "loss": 0.4242, "step": 50545 }, { - "epoch": 1.78, - "learning_rate": 3.734144652061271e-05, - "loss": 0.2734, + "epoch": 1.8218185749810791, + "grad_norm": 0.1513690948486328, + "learning_rate": 3.672781022214176e-05, + "loss": 0.3812, "step": 50550 }, { - "epoch": 1.78, - "learning_rate": 3.733896903232671e-05, - "loss": 0.2725, + "epoch": 1.8219987746423036, + "grad_norm": 0.14303380250930786, + "learning_rate": 3.67252330261806e-05, + "loss": 0.4238, "step": 50555 }, { - "epoch": 1.78, - "learning_rate": 3.733649138382734e-05, - "loss": 0.3065, + "epoch": 1.8221789743035282, + "grad_norm": 0.17810964584350586, + "learning_rate": 3.6722655670465416e-05, + "loss": 0.4074, "step": 50560 }, { - "epoch": 1.78, - "learning_rate": 3.7334013575146756e-05, - "loss": 0.2916, + "epoch": 1.8223591739647529, + "grad_norm": 0.15270282328128815, + "learning_rate": 3.672007815503132e-05, + "loss": 0.4043, "step": 50565 }, { - "epoch": 1.78, - "learning_rate": 3.733153560631712e-05, - "loss": 0.2769, + "epoch": 1.8225393736259776, + "grad_norm": 0.2379712015390396, + "learning_rate": 3.671750047991343e-05, + "loss": 0.4042, "step": 50570 }, { - "epoch": 1.78, - "learning_rate": 3.7329057477370636e-05, - "loss": 0.2781, + "epoch": 1.8227195732872024, + "grad_norm": 0.19908632338047028, + "learning_rate": 3.6714922645146856e-05, + "loss": 0.4034, "step": 50575 }, { - "epoch": 1.78, - "learning_rate": 3.732657918833946e-05, - "loss": 0.2919, + "epoch": 1.8228997729484269, + "grad_norm": 0.16394317150115967, + "learning_rate": 3.671234465076673e-05, + "loss": 0.4139, "step": 50580 }, { - "epoch": 1.78, - "learning_rate": 3.7324100739255774e-05, - "loss": 0.2488, + "epoch": 1.8230799726096514, + "grad_norm": 0.18769471347332, + "learning_rate": 3.670976649680819e-05, + "loss": 0.4194, "step": 50585 }, { - "epoch": 1.78, - "learning_rate": 3.7321622130151775e-05, - "loss": 0.2713, + "epoch": 1.8232601722708761, + "grad_norm": 0.18766231834888458, + "learning_rate": 3.6707188183306326e-05, + "loss": 0.4128, "step": 50590 }, { - "epoch": 1.78, - "learning_rate": 3.7319143361059616e-05, - "loss": 0.2853, + "epoch": 1.8234403719321008, + "grad_norm": 0.15433186292648315, + "learning_rate": 3.6704609710296295e-05, + "loss": 0.4008, "step": 50595 }, { - "epoch": 1.78, - "learning_rate": 3.731666443201151e-05, - "loss": 0.2938, + "epoch": 1.8236205715933254, + "grad_norm": 0.20321685075759888, + "learning_rate": 3.670203107781324e-05, + "loss": 0.3944, "step": 50600 }, { - "epoch": 1.78, - "learning_rate": 3.731418534303963e-05, - "loss": 0.2751, + "epoch": 1.82380077125455, + "grad_norm": 0.21775129437446594, + "learning_rate": 3.669945228589225e-05, + "loss": 0.4413, "step": 50605 }, { - "epoch": 1.78, - "learning_rate": 3.7311706094176166e-05, - "loss": 0.2899, + "epoch": 1.8239809709157746, + "grad_norm": 0.2147761881351471, + "learning_rate": 3.669687333456852e-05, + "loss": 0.418, "step": 50610 }, { - "epoch": 1.78, - "learning_rate": 3.730922668545331e-05, - "loss": 0.2622, + "epoch": 1.8241611705769993, + "grad_norm": 0.20212721824645996, + "learning_rate": 3.669429422387713e-05, + "loss": 0.3872, "step": 50615 }, { - "epoch": 1.78, - "learning_rate": 3.730674711690326e-05, - 
"loss": 0.2915, + "epoch": 1.824341370238224, + "grad_norm": 0.1724279671907425, + "learning_rate": 3.669171495385325e-05, + "loss": 0.3979, "step": 50620 }, { - "epoch": 1.78, - "learning_rate": 3.7304267388558215e-05, - "loss": 0.2604, + "epoch": 1.8245215698994486, + "grad_norm": 0.1798325926065445, + "learning_rate": 3.668913552453203e-05, + "loss": 0.404, "step": 50625 }, { - "epoch": 1.78, - "learning_rate": 3.730178750045036e-05, - "loss": 0.2886, + "epoch": 1.824701769560673, + "grad_norm": 0.1844148486852646, + "learning_rate": 3.668655593594858e-05, + "loss": 0.4004, "step": 50630 }, { - "epoch": 1.78, - "learning_rate": 3.729930745261191e-05, - "loss": 0.275, + "epoch": 1.8248819692218978, + "grad_norm": 0.22137939929962158, + "learning_rate": 3.6683976188138084e-05, + "loss": 0.3921, "step": 50635 }, { - "epoch": 1.78, - "learning_rate": 3.729682724507505e-05, - "loss": 0.3038, + "epoch": 1.8250621688831226, + "grad_norm": 0.1671324074268341, + "learning_rate": 3.6681396281135676e-05, + "loss": 0.3937, "step": 50640 }, { - "epoch": 1.78, - "learning_rate": 3.7294346877872e-05, - "loss": 0.2918, + "epoch": 1.8252423685443473, + "grad_norm": 0.21605345606803894, + "learning_rate": 3.6678816214976504e-05, + "loss": 0.4299, "step": 50645 }, { - "epoch": 1.78, - "learning_rate": 3.729186635103495e-05, - "loss": 0.2943, + "epoch": 1.8254225682055718, + "grad_norm": 0.16101548075675964, + "learning_rate": 3.667623598969572e-05, + "loss": 0.3997, "step": 50650 }, { - "epoch": 1.78, - "learning_rate": 3.728938566459612e-05, - "loss": 0.2683, + "epoch": 1.8256027678667963, + "grad_norm": 0.1385546624660492, + "learning_rate": 3.6673655605328475e-05, + "loss": 0.3785, "step": 50655 }, { - "epoch": 1.78, - "learning_rate": 3.728690481858771e-05, - "loss": 0.2774, + "epoch": 1.825782967528021, + "grad_norm": 0.1622258722782135, + "learning_rate": 3.667107506190993e-05, + "loss": 0.4387, "step": 50660 }, { - "epoch": 1.78, - "learning_rate": 3.728442381304194e-05, - "loss": 0.2812, + "epoch": 1.8259631671892458, + "grad_norm": 0.14885656535625458, + "learning_rate": 3.666849435947526e-05, + "loss": 0.4072, "step": 50665 }, { - "epoch": 1.78, - "learning_rate": 3.728194264799103e-05, - "loss": 0.2777, + "epoch": 1.8261433668504703, + "grad_norm": 0.20128627121448517, + "learning_rate": 3.6665913498059615e-05, + "loss": 0.4241, "step": 50670 }, { - "epoch": 1.78, - "learning_rate": 3.727946132346719e-05, - "loss": 0.3065, + "epoch": 1.8263235665116948, + "grad_norm": 0.17123094201087952, + "learning_rate": 3.666333247769814e-05, + "loss": 0.4182, "step": 50675 }, { - "epoch": 1.78, - "learning_rate": 3.7276979839502626e-05, - "loss": 0.2789, + "epoch": 1.8265037661729195, + "grad_norm": 0.19727930426597595, + "learning_rate": 3.666075129842603e-05, + "loss": 0.3987, "step": 50680 }, { - "epoch": 1.78, - "learning_rate": 3.727449819612958e-05, - "loss": 0.2794, + "epoch": 1.8266839658341443, + "grad_norm": 0.2271791249513626, + "learning_rate": 3.665816996027844e-05, + "loss": 0.4111, "step": 50685 }, { - "epoch": 1.78, - "learning_rate": 3.7272016393380256e-05, - "loss": 0.2922, + "epoch": 1.826864165495369, + "grad_norm": 0.18186520040035248, + "learning_rate": 3.665558846329055e-05, + "loss": 0.4242, "step": 50690 }, { - "epoch": 1.78, - "learning_rate": 3.726953443128689e-05, - "loss": 0.2913, + "epoch": 1.8270443651565935, + "grad_norm": 0.18122968077659607, + "learning_rate": 3.66530068074975e-05, + "loss": 0.4055, "step": 50695 }, { - "epoch": 1.78, - "learning_rate": 
3.726705230988171e-05, - "loss": 0.2764, + "epoch": 1.827224564817818, + "grad_norm": 0.20953094959259033, + "learning_rate": 3.6650424992934504e-05, + "loss": 0.4515, "step": 50700 }, { - "epoch": 1.78, - "learning_rate": 3.7264570029196935e-05, - "loss": 0.3015, + "epoch": 1.8274047644790428, + "grad_norm": 0.1821168065071106, + "learning_rate": 3.664784301963673e-05, + "loss": 0.4363, "step": 50705 }, { - "epoch": 1.78, - "learning_rate": 3.72620875892648e-05, - "loss": 0.2876, + "epoch": 1.8275849641402675, + "grad_norm": 0.16025103628635406, + "learning_rate": 3.664526088763934e-05, + "loss": 0.3871, "step": 50710 }, { - "epoch": 1.78, - "learning_rate": 3.7259604990117536e-05, - "loss": 0.2714, + "epoch": 1.827765163801492, + "grad_norm": 0.1833401620388031, + "learning_rate": 3.6642678596977526e-05, + "loss": 0.4348, "step": 50715 }, { - "epoch": 1.78, - "learning_rate": 3.725712223178738e-05, - "loss": 0.2936, + "epoch": 1.8279453634627167, + "grad_norm": 0.14645916223526, + "learning_rate": 3.6640096147686467e-05, + "loss": 0.4141, "step": 50720 }, { - "epoch": 1.78, - "learning_rate": 3.7254639314306585e-05, - "loss": 0.2801, + "epoch": 1.8281255631239413, + "grad_norm": 0.2093980610370636, + "learning_rate": 3.663751353980136e-05, + "loss": 0.4121, "step": 50725 }, { - "epoch": 1.78, - "learning_rate": 3.725215623770736e-05, - "loss": 0.2894, + "epoch": 1.828305762785166, + "grad_norm": 0.19935870170593262, + "learning_rate": 3.663493077335738e-05, + "loss": 0.4305, "step": 50730 }, { - "epoch": 1.78, - "learning_rate": 3.724967300202196e-05, - "loss": 0.2929, + "epoch": 1.8284859624463907, + "grad_norm": 0.17722731828689575, + "learning_rate": 3.663234784838972e-05, + "loss": 0.4046, "step": 50735 }, { - "epoch": 1.79, - "learning_rate": 3.724718960728264e-05, - "loss": 0.2622, + "epoch": 1.8286661621076152, + "grad_norm": 0.1792575865983963, + "learning_rate": 3.662976476493357e-05, + "loss": 0.433, "step": 50740 }, { - "epoch": 1.79, - "learning_rate": 3.724470605352163e-05, - "loss": 0.2968, + "epoch": 1.8288463617688397, + "grad_norm": 0.15851470828056335, + "learning_rate": 3.662718152302413e-05, + "loss": 0.417, "step": 50745 }, { - "epoch": 1.79, - "learning_rate": 3.7242222340771174e-05, - "loss": 0.283, + "epoch": 1.8290265614300645, + "grad_norm": 0.15749147534370422, + "learning_rate": 3.6624598122696595e-05, + "loss": 0.4008, "step": 50750 }, { - "epoch": 1.79, - "learning_rate": 3.7239738469063535e-05, - "loss": 0.2829, + "epoch": 1.8292067610912892, + "grad_norm": 0.16477638483047485, + "learning_rate": 3.6622014563986155e-05, + "loss": 0.403, "step": 50755 }, { - "epoch": 1.79, - "learning_rate": 3.723725443843096e-05, - "loss": 0.3006, + "epoch": 1.829386960752514, + "grad_norm": 0.16740094125270844, + "learning_rate": 3.661943084692802e-05, + "loss": 0.4044, "step": 50760 }, { - "epoch": 1.79, - "learning_rate": 3.723477024890569e-05, - "loss": 0.2904, + "epoch": 1.8295671604137385, + "grad_norm": 0.2223920226097107, + "learning_rate": 3.661684697155739e-05, + "loss": 0.4402, "step": 50765 }, { - "epoch": 1.79, - "learning_rate": 3.7232285900520006e-05, - "loss": 0.2768, + "epoch": 1.829747360074963, + "grad_norm": 0.21728269755840302, + "learning_rate": 3.6614262937909474e-05, + "loss": 0.4326, "step": 50770 }, { - "epoch": 1.79, - "learning_rate": 3.722980139330614e-05, - "loss": 0.2764, + "epoch": 1.8299275597361877, + "grad_norm": 0.1693524718284607, + "learning_rate": 3.6611678746019464e-05, + "loss": 0.3812, "step": 50775 }, { - "epoch": 1.79, - 
"learning_rate": 3.722731672729637e-05, - "loss": 0.2917, + "epoch": 1.8301077593974124, + "grad_norm": 0.2092827707529068, + "learning_rate": 3.6609094395922585e-05, + "loss": 0.4101, "step": 50780 }, { - "epoch": 1.79, - "learning_rate": 3.7224831902522944e-05, - "loss": 0.2634, + "epoch": 1.830287959058637, + "grad_norm": 0.19062267243862152, + "learning_rate": 3.6606509887654046e-05, + "loss": 0.4504, "step": 50785 }, { - "epoch": 1.79, - "learning_rate": 3.722234691901814e-05, - "loss": 0.2761, + "epoch": 1.8304681587198615, + "grad_norm": 0.4250420928001404, + "learning_rate": 3.660392522124905e-05, + "loss": 0.4012, "step": 50790 }, { - "epoch": 1.79, - "learning_rate": 3.721986177681421e-05, - "loss": 0.2781, + "epoch": 1.8306483583810862, + "grad_norm": 0.18966785073280334, + "learning_rate": 3.660134039674282e-05, + "loss": 0.4067, "step": 50795 }, { - "epoch": 1.79, - "learning_rate": 3.721737647594343e-05, - "loss": 0.2898, + "epoch": 1.830828558042311, + "grad_norm": 0.17551647126674652, + "learning_rate": 3.659875541417057e-05, + "loss": 0.408, "step": 50800 }, { - "epoch": 1.79, - "learning_rate": 3.721489101643806e-05, - "loss": 0.2843, + "epoch": 1.8310087577035357, + "grad_norm": 0.16543635725975037, + "learning_rate": 3.659617027356753e-05, + "loss": 0.4234, "step": 50805 }, { - "epoch": 1.79, - "learning_rate": 3.72124053983304e-05, - "loss": 0.2757, + "epoch": 1.8311889573647602, + "grad_norm": 0.20097027719020844, + "learning_rate": 3.6593584974968916e-05, + "loss": 0.4096, "step": 50810 }, { - "epoch": 1.79, - "learning_rate": 3.720991962165269e-05, - "loss": 0.2847, + "epoch": 1.8313691570259847, + "grad_norm": 0.1641996055841446, + "learning_rate": 3.659099951840995e-05, + "loss": 0.4007, "step": 50815 }, { - "epoch": 1.79, - "learning_rate": 3.7207433686437224e-05, - "loss": 0.2749, + "epoch": 1.8315493566872094, + "grad_norm": 0.20720233023166656, + "learning_rate": 3.658841390392585e-05, + "loss": 0.4468, "step": 50820 }, { - "epoch": 1.79, - "learning_rate": 3.7204947592716274e-05, - "loss": 0.2849, + "epoch": 1.8317295563484342, + "grad_norm": 0.1876114308834076, + "learning_rate": 3.658582813155187e-05, + "loss": 0.404, "step": 50825 }, { - "epoch": 1.79, - "learning_rate": 3.720295860363726e-05, - "loss": 0.2891, + "epoch": 1.8319097560096587, + "grad_norm": 0.18697507679462433, + "learning_rate": 3.658324220132322e-05, + "loss": 0.3958, "step": 50830 }, { - "epoch": 1.79, - "learning_rate": 3.720047222468779e-05, - "loss": 0.2837, + "epoch": 1.8320899556708832, + "grad_norm": 0.1502731889486313, + "learning_rate": 3.658065611327513e-05, + "loss": 0.4068, "step": 50835 }, { - "epoch": 1.79, - "learning_rate": 3.719798568732323e-05, - "loss": 0.2903, + "epoch": 1.832270155332108, + "grad_norm": 0.18511810898780823, + "learning_rate": 3.6578069867442846e-05, + "loss": 0.426, "step": 50840 }, { - "epoch": 1.79, - "learning_rate": 3.719549899157586e-05, - "loss": 0.3028, + "epoch": 1.8324503549933326, + "grad_norm": 0.1467100828886032, + "learning_rate": 3.6575483463861604e-05, + "loss": 0.4337, "step": 50845 }, { - "epoch": 1.79, - "learning_rate": 3.719301213747798e-05, - "loss": 0.2797, + "epoch": 1.8326305546545574, + "grad_norm": 0.147383913397789, + "learning_rate": 3.657289690256664e-05, + "loss": 0.3969, "step": 50850 }, { - "epoch": 1.79, - "learning_rate": 3.719052512506187e-05, - "loss": 0.2833, + "epoch": 1.8328107543157819, + "grad_norm": 0.16182565689086914, + "learning_rate": 3.65703101835932e-05, + "loss": 0.4109, "step": 50855 }, { - "epoch": 
1.79, - "learning_rate": 3.7188037954359826e-05, - "loss": 0.2882, + "epoch": 1.8329909539770064, + "grad_norm": 0.21511298418045044, + "learning_rate": 3.656772330697651e-05, + "loss": 0.3946, "step": 50860 }, { - "epoch": 1.79, - "learning_rate": 3.718555062540414e-05, - "loss": 0.2785, + "epoch": 1.8331711536382311, + "grad_norm": 0.2106020152568817, + "learning_rate": 3.6565136272751844e-05, + "loss": 0.3559, "step": 50865 }, { - "epoch": 1.79, - "learning_rate": 3.7183063138227115e-05, - "loss": 0.2619, + "epoch": 1.8333513532994559, + "grad_norm": 0.22178612649440765, + "learning_rate": 3.656254908095443e-05, + "loss": 0.4217, "step": 50870 }, { - "epoch": 1.79, - "learning_rate": 3.7180575492861034e-05, - "loss": 0.2815, + "epoch": 1.8335315529606806, + "grad_norm": 0.1671254187822342, + "learning_rate": 3.655996173161953e-05, + "loss": 0.4105, "step": 50875 }, { - "epoch": 1.79, - "learning_rate": 3.717808768933823e-05, - "loss": 0.2977, + "epoch": 1.8337117526219051, + "grad_norm": 0.18210411071777344, + "learning_rate": 3.6557374224782384e-05, + "loss": 0.4154, "step": 50880 }, { - "epoch": 1.79, - "learning_rate": 3.717559972769096e-05, - "loss": 0.2808, + "epoch": 1.8338919522831296, + "grad_norm": 0.19609753787517548, + "learning_rate": 3.6554786560478245e-05, + "loss": 0.4294, "step": 50885 }, { - "epoch": 1.79, - "learning_rate": 3.717311160795156e-05, - "loss": 0.2925, + "epoch": 1.8340721519443544, + "grad_norm": 0.16133646667003632, + "learning_rate": 3.655219873874238e-05, + "loss": 0.425, "step": 50890 }, { - "epoch": 1.79, - "learning_rate": 3.7170623330152324e-05, - "loss": 0.2829, + "epoch": 1.834252351605579, + "grad_norm": 0.19089598953723907, + "learning_rate": 3.654961075961005e-05, + "loss": 0.4355, "step": 50895 }, { - "epoch": 1.79, - "learning_rate": 3.716813489432557e-05, - "loss": 0.2828, + "epoch": 1.8344325512668036, + "grad_norm": 0.176603302359581, + "learning_rate": 3.654702262311651e-05, + "loss": 0.3786, "step": 50900 }, { - "epoch": 1.79, - "learning_rate": 3.71656463005036e-05, - "loss": 0.2682, + "epoch": 1.8346127509280281, + "grad_norm": 0.18047241866588593, + "learning_rate": 3.654443432929701e-05, + "loss": 0.3985, "step": 50905 }, { - "epoch": 1.79, - "learning_rate": 3.716315754871873e-05, - "loss": 0.3047, + "epoch": 1.8347929505892528, + "grad_norm": 0.17832064628601074, + "learning_rate": 3.654184587818684e-05, + "loss": 0.3826, "step": 50910 }, { - "epoch": 1.79, - "learning_rate": 3.716066863900327e-05, - "loss": 0.2797, + "epoch": 1.8349731502504776, + "grad_norm": 0.1874881535768509, + "learning_rate": 3.653925726982125e-05, + "loss": 0.4242, "step": 50915 }, { - "epoch": 1.79, - "learning_rate": 3.715817957138954e-05, - "loss": 0.2997, + "epoch": 1.8351533499117023, + "grad_norm": 0.16851532459259033, + "learning_rate": 3.653666850423551e-05, + "loss": 0.4039, "step": 50920 }, { - "epoch": 1.79, - "learning_rate": 3.715569034590988e-05, - "loss": 0.2846, + "epoch": 1.8353335495729268, + "grad_norm": 0.19694097340106964, + "learning_rate": 3.65340795814649e-05, + "loss": 0.429, "step": 50925 }, { - "epoch": 1.79, - "learning_rate": 3.715320096259658e-05, - "loss": 0.2702, + "epoch": 1.8355137492341513, + "grad_norm": 0.1746322214603424, + "learning_rate": 3.653149050154469e-05, + "loss": 0.406, "step": 50930 }, { - "epoch": 1.79, - "learning_rate": 3.7150711421481975e-05, - "loss": 0.3049, + "epoch": 1.835693948895376, + "grad_norm": 0.21489328145980835, + "learning_rate": 3.652890126451015e-05, + "loss": 0.4699, "step": 50935 }, { - 
"epoch": 1.79, - "learning_rate": 3.714822172259839e-05, - "loss": 0.2774, + "epoch": 1.8358741485566008, + "grad_norm": 0.2216479480266571, + "learning_rate": 3.652631187039657e-05, + "loss": 0.4144, "step": 50940 }, { - "epoch": 1.79, - "learning_rate": 3.714573186597815e-05, - "loss": 0.276, + "epoch": 1.8360543482178253, + "grad_norm": 0.21755515038967133, + "learning_rate": 3.6523722319239214e-05, + "loss": 0.43, "step": 50945 }, { - "epoch": 1.79, - "learning_rate": 3.7143241851653586e-05, - "loss": 0.2749, + "epoch": 1.8362345478790498, + "grad_norm": 0.22237977385520935, + "learning_rate": 3.652113261107338e-05, + "loss": 0.4274, "step": 50950 }, { - "epoch": 1.79, - "learning_rate": 3.714075167965703e-05, - "loss": 0.3144, + "epoch": 1.8364147475402746, + "grad_norm": 0.1792164444923401, + "learning_rate": 3.651854274593433e-05, + "loss": 0.4141, "step": 50955 }, { - "epoch": 1.79, - "learning_rate": 3.713826135002082e-05, - "loss": 0.2635, + "epoch": 1.8365949472014993, + "grad_norm": 0.16846583783626556, + "learning_rate": 3.651595272385738e-05, + "loss": 0.3788, "step": 50960 }, { - "epoch": 1.79, - "learning_rate": 3.7135770862777286e-05, - "loss": 0.2974, + "epoch": 1.836775146862724, + "grad_norm": 0.23637813329696655, + "learning_rate": 3.6513362544877794e-05, + "loss": 0.4256, "step": 50965 }, { - "epoch": 1.79, - "learning_rate": 3.713328021795875e-05, - "loss": 0.2894, + "epoch": 1.8369553465239485, + "grad_norm": 0.21190734207630157, + "learning_rate": 3.6510772209030885e-05, + "loss": 0.4456, "step": 50970 }, { - "epoch": 1.79, - "learning_rate": 3.7130789415597587e-05, - "loss": 0.2923, + "epoch": 1.837135546185173, + "grad_norm": 0.1966804713010788, + "learning_rate": 3.650818171635192e-05, + "loss": 0.3872, "step": 50975 }, { - "epoch": 1.79, - "learning_rate": 3.7128298455726104e-05, - "loss": 0.3008, + "epoch": 1.8373157458463978, + "grad_norm": 0.24481886625289917, + "learning_rate": 3.650559106687621e-05, + "loss": 0.4143, "step": 50980 }, { - "epoch": 1.79, - "learning_rate": 3.712580733837666e-05, - "loss": 0.2606, + "epoch": 1.8374959455076225, + "grad_norm": 0.2108885794878006, + "learning_rate": 3.650300026063905e-05, + "loss": 0.3876, "step": 50985 }, { - "epoch": 1.79, - "learning_rate": 3.71233160635816e-05, - "loss": 0.3075, + "epoch": 1.837676145168847, + "grad_norm": 0.17803071439266205, + "learning_rate": 3.650040929767575e-05, + "loss": 0.3957, "step": 50990 }, { - "epoch": 1.79, - "learning_rate": 3.7120824631373274e-05, - "loss": 0.2828, + "epoch": 1.8378563448300718, + "grad_norm": 0.1737845540046692, + "learning_rate": 3.649781817802159e-05, + "loss": 0.3848, "step": 50995 }, { - "epoch": 1.79, - "learning_rate": 3.7118333041784024e-05, - "loss": 0.2842, + "epoch": 1.8380365444912963, + "grad_norm": 0.16461852192878723, + "learning_rate": 3.649522690171188e-05, + "loss": 0.41, "step": 51000 }, { - "epoch": 1.79, - "eval_loss": 0.2750702500343323, - "eval_runtime": 10.538, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 1.8380365444912963, + "eval_loss": 0.4396840035915375, + "eval_runtime": 3.5402, + "eval_samples_per_second": 28.247, + "eval_steps_per_second": 7.062, "step": 51000 }, { - "epoch": 1.79, - "learning_rate": 3.711584129484621e-05, - "loss": 0.2738, + "epoch": 1.838216744152521, + "grad_norm": 0.19908668100833893, + "learning_rate": 3.6492635468781934e-05, + "loss": 0.411, "step": 51005 }, { - "epoch": 1.79, - "learning_rate": 3.711334939059218e-05, - "loss": 0.2937, + "epoch": 1.8383969438137457, + 
"grad_norm": 0.23171748220920563, + "learning_rate": 3.649004387926705e-05, + "loss": 0.3974, "step": 51010 }, { - "epoch": 1.79, - "learning_rate": 3.7110857329054295e-05, - "loss": 0.29, + "epoch": 1.8385771434749703, + "grad_norm": 0.19724468886852264, + "learning_rate": 3.6487452133202546e-05, + "loss": 0.4076, "step": 51015 }, { - "epoch": 1.8, - "learning_rate": 3.7108365110264895e-05, - "loss": 0.2818, + "epoch": 1.8387573431361948, + "grad_norm": 0.1985427588224411, + "learning_rate": 3.648486023062373e-05, + "loss": 0.45, "step": 51020 }, { - "epoch": 1.8, - "learning_rate": 3.710587273425636e-05, - "loss": 0.2837, + "epoch": 1.8389375427974195, + "grad_norm": 0.16115103662014008, + "learning_rate": 3.6482268171565914e-05, + "loss": 0.3991, "step": 51025 }, { - "epoch": 1.8, - "learning_rate": 3.710338020106106e-05, - "loss": 0.2928, + "epoch": 1.8391177424586442, + "grad_norm": 0.20082534849643707, + "learning_rate": 3.647967595606443e-05, + "loss": 0.4553, "step": 51030 }, { - "epoch": 1.8, - "learning_rate": 3.7100887510711326e-05, - "loss": 0.3059, + "epoch": 1.839297942119869, + "grad_norm": 0.1648685485124588, + "learning_rate": 3.647708358415457e-05, + "loss": 0.4469, "step": 51035 }, { - "epoch": 1.8, - "learning_rate": 3.709839466323956e-05, - "loss": 0.288, + "epoch": 1.8394781417810935, + "grad_norm": 0.16533087193965912, + "learning_rate": 3.6474491055871675e-05, + "loss": 0.3725, "step": 51040 }, { - "epoch": 1.8, - "learning_rate": 3.709590165867809e-05, - "loss": 0.2917, + "epoch": 1.839658341442318, + "grad_norm": 0.18278469145298004, + "learning_rate": 3.647189837125106e-05, + "loss": 0.427, "step": 51045 }, { - "epoch": 1.8, - "learning_rate": 3.7093408497059325e-05, - "loss": 0.2823, + "epoch": 1.8398385411035427, + "grad_norm": 0.18159344792366028, + "learning_rate": 3.646930553032805e-05, + "loss": 0.4254, "step": 51050 }, { - "epoch": 1.8, - "learning_rate": 3.709091517841561e-05, - "loss": 0.2821, + "epoch": 1.8400187407647675, + "grad_norm": 0.16814720630645752, + "learning_rate": 3.646671253313797e-05, + "loss": 0.4013, "step": 51055 }, { - "epoch": 1.8, - "learning_rate": 3.708842170277934e-05, - "loss": 0.29, + "epoch": 1.840198940425992, + "grad_norm": 0.15060807764530182, + "learning_rate": 3.646411937971615e-05, + "loss": 0.3897, "step": 51060 }, { - "epoch": 1.8, - "learning_rate": 3.708592807018287e-05, - "loss": 0.2864, + "epoch": 1.8403791400872165, + "grad_norm": 0.19889427721500397, + "learning_rate": 3.646152607009793e-05, + "loss": 0.436, "step": 51065 }, { - "epoch": 1.8, - "learning_rate": 3.7083434280658605e-05, - "loss": 0.3002, + "epoch": 1.8405593397484412, + "grad_norm": 0.22724641859531403, + "learning_rate": 3.645893260431863e-05, + "loss": 0.4595, "step": 51070 }, { - "epoch": 1.8, - "learning_rate": 3.70809403342389e-05, - "loss": 0.2825, + "epoch": 1.840739539409666, + "grad_norm": 0.19296084344387054, + "learning_rate": 3.64563389824136e-05, + "loss": 0.3611, "step": 51075 }, { - "epoch": 1.8, - "learning_rate": 3.7078446230956143e-05, - "loss": 0.2741, + "epoch": 1.8409197390708907, + "grad_norm": 0.16096292436122894, + "learning_rate": 3.6453745204418164e-05, + "loss": 0.4039, "step": 51080 }, { - "epoch": 1.8, - "learning_rate": 3.707595197084272e-05, - "loss": 0.2888, + "epoch": 1.8410999387321152, + "grad_norm": 0.18182432651519775, + "learning_rate": 3.6451151270367665e-05, + "loss": 0.3959, "step": 51085 }, { - "epoch": 1.8, - "learning_rate": 3.707345755393103e-05, - "loss": 0.3029, + "epoch": 1.8412801383933397, + 
"grad_norm": 0.14909334480762482, + "learning_rate": 3.644855718029745e-05, + "loss": 0.4084, "step": 51090 }, { - "epoch": 1.8, - "learning_rate": 3.707096298025344e-05, - "loss": 0.2536, + "epoch": 1.8414603380545644, + "grad_norm": 0.2044646441936493, + "learning_rate": 3.644596293424286e-05, + "loss": 0.4397, "step": 51095 }, { - "epoch": 1.8, - "learning_rate": 3.706846824984236e-05, - "loss": 0.2956, + "epoch": 1.8416405377157892, + "grad_norm": 0.20742791891098022, + "learning_rate": 3.644336853223924e-05, + "loss": 0.4362, "step": 51100 }, { - "epoch": 1.8, - "learning_rate": 3.7065973362730164e-05, - "loss": 0.2647, + "epoch": 1.8418207373770137, + "grad_norm": 0.17310570180416107, + "learning_rate": 3.644077397432194e-05, + "loss": 0.4036, "step": 51105 }, { - "epoch": 1.8, - "learning_rate": 3.7063478318949255e-05, - "loss": 0.3026, + "epoch": 1.8420009370382384, + "grad_norm": 0.17185135185718536, + "learning_rate": 3.64381792605263e-05, + "loss": 0.4234, "step": 51110 }, { - "epoch": 1.8, - "learning_rate": 3.706098311853204e-05, - "loss": 0.2727, + "epoch": 1.842181136699463, + "grad_norm": 0.18162250518798828, + "learning_rate": 3.64355843908877e-05, + "loss": 0.396, "step": 51115 }, { - "epoch": 1.8, - "learning_rate": 3.70584877615109e-05, - "loss": 0.2703, + "epoch": 1.8423613363606877, + "grad_norm": 0.18723055720329285, + "learning_rate": 3.6432989365441474e-05, + "loss": 0.3928, "step": 51120 }, { - "epoch": 1.8, - "learning_rate": 3.7055992247918247e-05, - "loss": 0.2926, + "epoch": 1.8425415360219124, + "grad_norm": 0.21605227887630463, + "learning_rate": 3.643039418422297e-05, + "loss": 0.3746, "step": 51125 }, { - "epoch": 1.8, - "learning_rate": 3.7053496577786485e-05, - "loss": 0.3351, + "epoch": 1.842721735683137, + "grad_norm": 0.15051497519016266, + "learning_rate": 3.642779884726757e-05, + "loss": 0.4126, "step": 51130 }, { - "epoch": 1.8, - "learning_rate": 3.7051000751148e-05, - "loss": 0.2617, + "epoch": 1.8429019353443614, + "grad_norm": 0.19938229024410248, + "learning_rate": 3.642520335461061e-05, + "loss": 0.4506, "step": 51135 }, { - "epoch": 1.8, - "learning_rate": 3.704850476803522e-05, - "loss": 0.2807, + "epoch": 1.8430821350055862, + "grad_norm": 0.17573609948158264, + "learning_rate": 3.642260770628747e-05, + "loss": 0.4178, "step": 51140 }, { - "epoch": 1.8, - "learning_rate": 3.704600862848054e-05, - "loss": 0.2904, + "epoch": 1.8432623346668109, + "grad_norm": 0.16785641014575958, + "learning_rate": 3.6420011902333516e-05, + "loss": 0.4336, "step": 51145 }, { - "epoch": 1.8, - "learning_rate": 3.704351233251638e-05, - "loss": 0.3106, + "epoch": 1.8434425343280356, + "grad_norm": 0.21680709719657898, + "learning_rate": 3.641741594278411e-05, + "loss": 0.4317, "step": 51150 }, { - "epoch": 1.8, - "learning_rate": 3.704101588017516e-05, - "loss": 0.2817, + "epoch": 1.8436227339892601, + "grad_norm": 0.18228915333747864, + "learning_rate": 3.641481982767463e-05, + "loss": 0.4118, "step": 51155 }, { - "epoch": 1.8, - "learning_rate": 3.703851927148927e-05, - "loss": 0.2982, + "epoch": 1.8438029336504846, + "grad_norm": 0.2226298749446869, + "learning_rate": 3.641222355704042e-05, + "loss": 0.4553, "step": 51160 }, { - "epoch": 1.8, - "learning_rate": 3.7036022506491146e-05, - "loss": 0.2926, + "epoch": 1.8439831333117094, + "grad_norm": 0.20143426954746246, + "learning_rate": 3.640962713091689e-05, + "loss": 0.43, "step": 51165 }, { - "epoch": 1.8, - "learning_rate": 3.7033525585213205e-05, - "loss": 0.2752, + "epoch": 1.844163332972934, + 
"grad_norm": 0.1602238118648529, + "learning_rate": 3.64070305493394e-05, + "loss": 0.4138, "step": 51170 }, { - "epoch": 1.8, - "learning_rate": 3.703102850768786e-05, - "loss": 0.2834, + "epoch": 1.8443435326341586, + "grad_norm": 0.19529132544994354, + "learning_rate": 3.640443381234331e-05, + "loss": 0.4168, "step": 51175 }, { - "epoch": 1.8, - "learning_rate": 3.7028531273947536e-05, - "loss": 0.3048, + "epoch": 1.8445237322953831, + "grad_norm": 0.19434118270874023, + "learning_rate": 3.6401836919964035e-05, + "loss": 0.4032, "step": 51180 }, { - "epoch": 1.8, - "learning_rate": 3.702603388402467e-05, - "loss": 0.2871, + "epoch": 1.8447039319566079, + "grad_norm": 0.1953050196170807, + "learning_rate": 3.639923987223693e-05, + "loss": 0.4151, "step": 51185 }, { - "epoch": 1.8, - "learning_rate": 3.702353633795167e-05, - "loss": 0.28, + "epoch": 1.8448841316178326, + "grad_norm": 0.16159437596797943, + "learning_rate": 3.639664266919739e-05, + "loss": 0.4209, "step": 51190 }, { - "epoch": 1.8, - "learning_rate": 3.702103863576098e-05, - "loss": 0.2954, + "epoch": 1.8450643312790573, + "grad_norm": 0.22107087075710297, + "learning_rate": 3.639404531088081e-05, + "loss": 0.4232, "step": 51195 }, { - "epoch": 1.8, - "learning_rate": 3.7018540777485024e-05, - "loss": 0.2968, + "epoch": 1.8452445309402818, + "grad_norm": 0.1914900243282318, + "learning_rate": 3.6391447797322556e-05, + "loss": 0.4235, "step": 51200 }, { - "epoch": 1.8, - "learning_rate": 3.7016042763156236e-05, - "loss": 0.2944, + "epoch": 1.8454247306015064, + "grad_norm": 0.1916929930448532, + "learning_rate": 3.6388850128558036e-05, + "loss": 0.4633, "step": 51205 }, { - "epoch": 1.8, - "learning_rate": 3.7013544592807054e-05, - "loss": 0.2831, + "epoch": 1.845604930262731, + "grad_norm": 0.16584375500679016, + "learning_rate": 3.6386252304622636e-05, + "loss": 0.4198, "step": 51210 }, { - "epoch": 1.8, - "learning_rate": 3.7011046266469905e-05, - "loss": 0.2764, + "epoch": 1.8457851299239558, + "grad_norm": 0.19661644101142883, + "learning_rate": 3.6383654325551756e-05, + "loss": 0.3869, "step": 51215 }, { - "epoch": 1.8, - "learning_rate": 3.7008547784177246e-05, - "loss": 0.2949, + "epoch": 1.8459653295851803, + "grad_norm": 0.17381379008293152, + "learning_rate": 3.638105619138079e-05, + "loss": 0.3688, "step": 51220 }, { - "epoch": 1.8, - "learning_rate": 3.70060491459615e-05, - "loss": 0.2917, + "epoch": 1.846145529246405, + "grad_norm": 0.18140283226966858, + "learning_rate": 3.637845790214513e-05, + "loss": 0.464, "step": 51225 }, { - "epoch": 1.8, - "learning_rate": 3.7003550351855124e-05, - "loss": 0.2698, + "epoch": 1.8463257289076296, + "grad_norm": 0.18589642643928528, + "learning_rate": 3.637585945788019e-05, + "loss": 0.4818, "step": 51230 }, { - "epoch": 1.8, - "learning_rate": 3.700105140189056e-05, - "loss": 0.2764, + "epoch": 1.8465059285688543, + "grad_norm": 0.1755005270242691, + "learning_rate": 3.6373260858621364e-05, + "loss": 0.4101, "step": 51235 }, { - "epoch": 1.8, - "learning_rate": 3.699855229610024e-05, - "loss": 0.3163, + "epoch": 1.846686128230079, + "grad_norm": 0.14478670060634613, + "learning_rate": 3.637066210440407e-05, + "loss": 0.3988, "step": 51240 }, { - "epoch": 1.8, - "learning_rate": 3.6996053034516634e-05, - "loss": 0.2672, + "epoch": 1.8468663278913036, + "grad_norm": 0.1898811310529709, + "learning_rate": 3.6368063195263694e-05, + "loss": 0.3998, "step": 51245 }, { - "epoch": 1.8, - "learning_rate": 3.699355361717218e-05, - "loss": 0.2975, + "epoch": 1.847046527552528, + 
"grad_norm": 0.2270967662334442, + "learning_rate": 3.6365464131235664e-05, + "loss": 0.4444, "step": 51250 }, { - "epoch": 1.8, - "learning_rate": 3.699105404409935e-05, - "loss": 0.3058, + "epoch": 1.8472267272137528, + "grad_norm": 0.17820177972316742, + "learning_rate": 3.636286491235538e-05, + "loss": 0.4097, "step": 51255 }, { - "epoch": 1.8, - "learning_rate": 3.698855431533057e-05, - "loss": 0.2785, + "epoch": 1.8474069268749775, + "grad_norm": 0.19704335927963257, + "learning_rate": 3.636026553865828e-05, + "loss": 0.4587, "step": 51260 }, { - "epoch": 1.8, - "learning_rate": 3.698605443089833e-05, - "loss": 0.2886, + "epoch": 1.8475871265362023, + "grad_norm": 0.18560896813869476, + "learning_rate": 3.635766601017974e-05, + "loss": 0.4076, "step": 51265 }, { - "epoch": 1.8, - "learning_rate": 3.698355439083506e-05, - "loss": 0.2679, + "epoch": 1.8477673261974268, + "grad_norm": 0.19222392141819, + "learning_rate": 3.635506632695521e-05, + "loss": 0.4196, "step": 51270 }, { - "epoch": 1.8, - "learning_rate": 3.698105419517323e-05, - "loss": 0.2909, + "epoch": 1.8479475258586513, + "grad_norm": 0.1405927538871765, + "learning_rate": 3.6352466489020095e-05, + "loss": 0.4081, "step": 51275 }, { - "epoch": 1.8, - "learning_rate": 3.6978553843945306e-05, - "loss": 0.2745, + "epoch": 1.848127725519876, + "grad_norm": 0.16948674619197845, + "learning_rate": 3.634986649640982e-05, + "loss": 0.3944, "step": 51280 }, { - "epoch": 1.8, - "learning_rate": 3.697605333718376e-05, - "loss": 0.2649, + "epoch": 1.8483079251811008, + "grad_norm": 0.16560353338718414, + "learning_rate": 3.6347266349159826e-05, + "loss": 0.4439, "step": 51285 }, { - "epoch": 1.8, - "learning_rate": 3.6973552674921067e-05, - "loss": 0.2923, + "epoch": 1.8484881248423253, + "grad_norm": 0.15636521577835083, + "learning_rate": 3.634466604730551e-05, + "loss": 0.4157, "step": 51290 }, { - "epoch": 1.8, - "learning_rate": 3.697105185718967e-05, - "loss": 0.2965, + "epoch": 1.8486683245035498, + "grad_norm": 0.19085612893104553, + "learning_rate": 3.634206559088232e-05, + "loss": 0.4038, "step": 51295 }, { - "epoch": 1.8, - "learning_rate": 3.696855088402204e-05, - "loss": 0.2692, + "epoch": 1.8488485241647745, + "grad_norm": 0.1767541766166687, + "learning_rate": 3.633946497992568e-05, + "loss": 0.4058, "step": 51300 }, { - "epoch": 1.81, - "learning_rate": 3.696604975545068e-05, - "loss": 0.2756, + "epoch": 1.8490287238259993, + "grad_norm": 0.1703364998102188, + "learning_rate": 3.6336864214471035e-05, + "loss": 0.3893, "step": 51305 }, { - "epoch": 1.81, - "learning_rate": 3.696354847150806e-05, - "loss": 0.3012, + "epoch": 1.849208923487224, + "grad_norm": 0.19773104786872864, + "learning_rate": 3.63342632945538e-05, + "loss": 0.4547, "step": 51310 }, { - "epoch": 1.81, - "learning_rate": 3.696104703222663e-05, - "loss": 0.267, + "epoch": 1.8493891231484485, + "grad_norm": 0.20999063551425934, + "learning_rate": 3.6331662220209416e-05, + "loss": 0.4344, "step": 51315 }, { - "epoch": 1.81, - "learning_rate": 3.6958545437638896e-05, - "loss": 0.2901, + "epoch": 1.849569322809673, + "grad_norm": 0.16823193430900574, + "learning_rate": 3.6329060991473344e-05, + "loss": 0.4112, "step": 51320 }, { - "epoch": 1.81, - "learning_rate": 3.695604368777733e-05, - "loss": 0.2705, + "epoch": 1.8497495224708977, + "grad_norm": 0.21957112848758698, + "learning_rate": 3.6326459608380994e-05, + "loss": 0.3883, "step": 51325 }, { - "epoch": 1.81, - "learning_rate": 3.695354178267441e-05, - "loss": 0.2869, + "epoch": 1.8499297221321225, 
+ "grad_norm": 0.17172305285930634, + "learning_rate": 3.6323858070967834e-05, + "loss": 0.3975, "step": 51330 }, { - "epoch": 1.81, - "learning_rate": 3.695103972236263e-05, - "loss": 0.2809, + "epoch": 1.850109921793347, + "grad_norm": 0.23369702696800232, + "learning_rate": 3.6321256379269296e-05, + "loss": 0.4123, "step": 51335 }, { - "epoch": 1.81, - "learning_rate": 3.694853750687448e-05, - "loss": 0.2787, + "epoch": 1.8502901214545715, + "grad_norm": 0.19112427532672882, + "learning_rate": 3.631865453332084e-05, + "loss": 0.4109, "step": 51340 }, { - "epoch": 1.81, - "learning_rate": 3.694603513624244e-05, - "loss": 0.2472, + "epoch": 1.8504703211157962, + "grad_norm": 0.19734065234661102, + "learning_rate": 3.63160525331579e-05, + "loss": 0.4006, "step": 51345 }, { - "epoch": 1.81, - "learning_rate": 3.6943532610499014e-05, - "loss": 0.2754, + "epoch": 1.850650520777021, + "grad_norm": 0.15451741218566895, + "learning_rate": 3.631345037881593e-05, + "loss": 0.4063, "step": 51350 }, { - "epoch": 1.81, - "learning_rate": 3.6941029929676675e-05, - "loss": 0.2844, + "epoch": 1.8508307204382457, + "grad_norm": 0.1700822114944458, + "learning_rate": 3.631084807033041e-05, + "loss": 0.4052, "step": 51355 }, { - "epoch": 1.81, - "learning_rate": 3.693852709380794e-05, - "loss": 0.2816, + "epoch": 1.8510109200994702, + "grad_norm": 0.1707632839679718, + "learning_rate": 3.630824560773676e-05, + "loss": 0.3775, "step": 51360 }, { - "epoch": 1.81, - "learning_rate": 3.69360241029253e-05, - "loss": 0.2789, + "epoch": 1.8511911197606947, + "grad_norm": 0.16627883911132812, + "learning_rate": 3.630564299107045e-05, + "loss": 0.4075, "step": 51365 }, { - "epoch": 1.81, - "learning_rate": 3.693352095706125e-05, - "loss": 0.3022, + "epoch": 1.8513713194219195, + "grad_norm": 0.24305926263332367, + "learning_rate": 3.630304022036694e-05, + "loss": 0.4367, "step": 51370 }, { - "epoch": 1.81, - "learning_rate": 3.69310176562483e-05, - "loss": 0.2998, + "epoch": 1.8515515190831442, + "grad_norm": 0.15088029205799103, + "learning_rate": 3.6300437295661706e-05, + "loss": 0.398, "step": 51375 }, { - "epoch": 1.81, - "learning_rate": 3.692851420051894e-05, - "loss": 0.2976, + "epoch": 1.851731718744369, + "grad_norm": 0.16519147157669067, + "learning_rate": 3.62978342169902e-05, + "loss": 0.4295, "step": 51380 }, { - "epoch": 1.81, - "learning_rate": 3.6926010589905694e-05, - "loss": 0.2608, + "epoch": 1.8519119184055934, + "grad_norm": 0.19771799445152283, + "learning_rate": 3.6295230984387884e-05, + "loss": 0.3928, "step": 51385 }, { - "epoch": 1.81, - "learning_rate": 3.692350682444104e-05, - "loss": 0.2994, + "epoch": 1.852092118066818, + "grad_norm": 0.164763942360878, + "learning_rate": 3.629262759789024e-05, + "loss": 0.4141, "step": 51390 }, { - "epoch": 1.81, - "learning_rate": 3.6921002904157533e-05, - "loss": 0.28, + "epoch": 1.8522723177280427, + "grad_norm": 0.18047907948493958, + "learning_rate": 3.6290024057532726e-05, + "loss": 0.4259, "step": 51395 }, { - "epoch": 1.81, - "learning_rate": 3.691849882908764e-05, - "loss": 0.2896, + "epoch": 1.8524525173892674, + "grad_norm": 0.21674178540706635, + "learning_rate": 3.6287420363350824e-05, + "loss": 0.3962, "step": 51400 }, { - "epoch": 1.81, - "learning_rate": 3.691599459926391e-05, - "loss": 0.2934, + "epoch": 1.852632717050492, + "grad_norm": 0.1720811426639557, + "learning_rate": 3.628481651538e-05, + "loss": 0.4129, "step": 51405 }, { - "epoch": 1.81, - "learning_rate": 3.691349021471884e-05, - "loss": 0.286, + "epoch": 
1.8528129167117164, + "grad_norm": 0.201126828789711, + "learning_rate": 3.628221251365574e-05, + "loss": 0.4334, "step": 51410 }, { - "epoch": 1.81, - "learning_rate": 3.691098567548494e-05, - "loss": 0.2833, + "epoch": 1.8529931163729412, + "grad_norm": 0.17763501405715942, + "learning_rate": 3.627960835821351e-05, + "loss": 0.4179, "step": 51415 }, { - "epoch": 1.81, - "learning_rate": 3.690848098159475e-05, - "loss": 0.283, + "epoch": 1.853173316034166, + "grad_norm": 0.16252952814102173, + "learning_rate": 3.6277004049088815e-05, + "loss": 0.3806, "step": 51420 }, { - "epoch": 1.81, - "learning_rate": 3.6905976133080785e-05, - "loss": 0.266, + "epoch": 1.8533535156953906, + "grad_norm": 0.17400288581848145, + "learning_rate": 3.627439958631712e-05, + "loss": 0.3974, "step": 51425 }, { - "epoch": 1.81, - "learning_rate": 3.6903471129975556e-05, - "loss": 0.2611, + "epoch": 1.8535337153566152, + "grad_norm": 0.1547224074602127, + "learning_rate": 3.6271794969933895e-05, + "loss": 0.373, "step": 51430 }, { - "epoch": 1.81, - "learning_rate": 3.69009659723116e-05, - "loss": 0.2934, + "epoch": 1.8537139150178397, + "grad_norm": 0.17776109278202057, + "learning_rate": 3.626919019997467e-05, + "loss": 0.3768, "step": 51435 }, { - "epoch": 1.81, - "learning_rate": 3.689846066012145e-05, - "loss": 0.2724, + "epoch": 1.8538941146790644, + "grad_norm": 0.188429594039917, + "learning_rate": 3.6266585276474896e-05, + "loss": 0.4463, "step": 51440 }, { - "epoch": 1.81, - "learning_rate": 3.689595519343763e-05, - "loss": 0.2909, + "epoch": 1.8540743143402891, + "grad_norm": 0.18767085671424866, + "learning_rate": 3.626398019947008e-05, + "loss": 0.4271, "step": 51445 }, { - "epoch": 1.81, - "learning_rate": 3.689344957229267e-05, - "loss": 0.2729, + "epoch": 1.8542545140015136, + "grad_norm": 0.19436992704868317, + "learning_rate": 3.626137496899572e-05, + "loss": 0.399, "step": 51450 }, { - "epoch": 1.81, - "learning_rate": 3.689094379671911e-05, - "loss": 0.2945, + "epoch": 1.8544347136627382, + "grad_norm": 0.2516634166240692, + "learning_rate": 3.62587695850873e-05, + "loss": 0.4383, "step": 51455 }, { - "epoch": 1.81, - "learning_rate": 3.688843786674947e-05, - "loss": 0.2714, + "epoch": 1.8546149133239629, + "grad_norm": 0.18735727667808533, + "learning_rate": 3.625616404778033e-05, + "loss": 0.4246, "step": 51460 }, { - "epoch": 1.81, - "learning_rate": 3.6885931782416304e-05, - "loss": 0.2573, + "epoch": 1.8547951129851876, + "grad_norm": 0.19105082750320435, + "learning_rate": 3.62535583571103e-05, + "loss": 0.4179, "step": 51465 }, { - "epoch": 1.81, - "learning_rate": 3.6883425543752145e-05, - "loss": 0.2973, + "epoch": 1.8549753126464124, + "grad_norm": 0.20437583327293396, + "learning_rate": 3.625095251311272e-05, + "loss": 0.4275, "step": 51470 }, { - "epoch": 1.81, - "learning_rate": 3.688091915078955e-05, - "loss": 0.2655, + "epoch": 1.8551555123076369, + "grad_norm": 0.1884140968322754, + "learning_rate": 3.6248346515823084e-05, + "loss": 0.3885, "step": 51475 }, { - "epoch": 1.81, - "learning_rate": 3.687841260356103e-05, - "loss": 0.3057, + "epoch": 1.8553357119688614, + "grad_norm": 0.22424067556858063, + "learning_rate": 3.6245740365276914e-05, + "loss": 0.4377, "step": 51480 }, { - "epoch": 1.81, - "learning_rate": 3.6875905902099166e-05, - "loss": 0.2944, + "epoch": 1.8555159116300861, + "grad_norm": 0.1814982295036316, + "learning_rate": 3.62431340615097e-05, + "loss": 0.4373, "step": 51485 }, { - "epoch": 1.81, - "learning_rate": 3.687339904643648e-05, - "loss": 0.2956, + 
"epoch": 1.8556961112913108, + "grad_norm": 0.18222209811210632, + "learning_rate": 3.624052760455696e-05, + "loss": 0.3891, "step": 51490 }, { - "epoch": 1.81, - "learning_rate": 3.687089203660554e-05, - "loss": 0.2672, + "epoch": 1.8558763109525354, + "grad_norm": 0.21884121000766754, + "learning_rate": 3.6237920994454216e-05, + "loss": 0.436, "step": 51495 }, { - "epoch": 1.81, - "learning_rate": 3.6868384872638886e-05, - "loss": 0.3063, + "epoch": 1.85605651061376, + "grad_norm": 0.16417382657527924, + "learning_rate": 3.623531423123697e-05, + "loss": 0.4172, "step": 51500 }, { - "epoch": 1.81, - "eval_loss": 0.274717777967453, - "eval_runtime": 10.5391, - "eval_samples_per_second": 9.488, - "eval_steps_per_second": 9.488, + "epoch": 1.85605651061376, + "eval_loss": 0.43941619992256165, + "eval_runtime": 3.5304, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 7.081, "step": 51500 }, { - "epoch": 1.81, - "learning_rate": 3.686587755456907e-05, - "loss": 0.2998, + "epoch": 1.8562367102749846, + "grad_norm": 0.20107519626617432, + "learning_rate": 3.623270731494075e-05, + "loss": 0.4246, "step": 51505 }, { - "epoch": 1.81, - "learning_rate": 3.6863370082428674e-05, - "loss": 0.2863, + "epoch": 1.8564169099362093, + "grad_norm": 0.18021084368228912, + "learning_rate": 3.6230100245601056e-05, + "loss": 0.4206, "step": 51510 }, { - "epoch": 1.81, - "learning_rate": 3.686086245625022e-05, - "loss": 0.2824, + "epoch": 1.856597109597434, + "grad_norm": 0.16509023308753967, + "learning_rate": 3.6227493023253425e-05, + "loss": 0.4185, "step": 51515 }, { - "epoch": 1.81, - "learning_rate": 3.685835467606628e-05, - "loss": 0.3065, + "epoch": 1.8567773092586586, + "grad_norm": 0.2372467815876007, + "learning_rate": 3.622488564793337e-05, + "loss": 0.4057, "step": 51520 }, { - "epoch": 1.81, - "learning_rate": 3.685584674190942e-05, - "loss": 0.287, + "epoch": 1.856957508919883, + "grad_norm": 0.2038879245519638, + "learning_rate": 3.622227811967643e-05, + "loss": 0.3873, "step": 51525 }, { - "epoch": 1.81, - "learning_rate": 3.685333865381222e-05, - "loss": 0.2882, + "epoch": 1.8571377085811078, + "grad_norm": 0.1562618464231491, + "learning_rate": 3.6219670438518125e-05, + "loss": 0.3674, "step": 51530 }, { - "epoch": 1.81, - "learning_rate": 3.685133207251927e-05, - "loss": 0.2919, + "epoch": 1.8573179082423326, + "grad_norm": 0.20719584822654724, + "learning_rate": 3.621706260449397e-05, + "loss": 0.4226, "step": 51535 }, { - "epoch": 1.81, - "learning_rate": 3.684882370741149e-05, - "loss": 0.292, + "epoch": 1.8574981079035573, + "grad_norm": 0.15713489055633545, + "learning_rate": 3.621445461763952e-05, + "loss": 0.4243, "step": 51540 }, { - "epoch": 1.81, - "learning_rate": 3.684631518845454e-05, - "loss": 0.294, + "epoch": 1.8576783075647818, + "grad_norm": 0.19836826622486115, + "learning_rate": 3.62118464779903e-05, + "loss": 0.4348, "step": 51545 }, { - "epoch": 1.81, - "learning_rate": 3.684380651568099e-05, - "loss": 0.2997, + "epoch": 1.8578585072260063, + "grad_norm": 0.17970958352088928, + "learning_rate": 3.620923818558183e-05, + "loss": 0.4002, "step": 51550 }, { - "epoch": 1.81, - "learning_rate": 3.6841297689123415e-05, - "loss": 0.2699, + "epoch": 1.858038706887231, + "grad_norm": 0.2058597356081009, + "learning_rate": 3.6206629740449666e-05, + "loss": 0.3783, "step": 51555 }, { - "epoch": 1.81, - "learning_rate": 3.68387887088144e-05, - "loss": 0.3079, + "epoch": 1.8582189065484558, + "grad_norm": 0.17197424173355103, + "learning_rate": 3.620402114262934e-05, + 
"loss": 0.423, "step": 51560 }, { - "epoch": 1.81, - "learning_rate": 3.6836279574786516e-05, - "loss": 0.2889, + "epoch": 1.8583991062096803, + "grad_norm": 0.2183474451303482, + "learning_rate": 3.6201412392156395e-05, + "loss": 0.4269, "step": 51565 }, { - "epoch": 1.81, - "learning_rate": 3.683377028707233e-05, - "loss": 0.289, + "epoch": 1.8585793058709048, + "grad_norm": 0.1597500592470169, + "learning_rate": 3.619880348906638e-05, + "loss": 0.3744, "step": 51570 }, { - "epoch": 1.81, - "learning_rate": 3.6831260845704445e-05, - "loss": 0.3018, + "epoch": 1.8587595055321295, + "grad_norm": 0.14877550303936005, + "learning_rate": 3.619619443339483e-05, + "loss": 0.3873, "step": 51575 }, { - "epoch": 1.81, - "learning_rate": 3.682875125071544e-05, - "loss": 0.2832, + "epoch": 1.8589397051933543, + "grad_norm": 0.21616920828819275, + "learning_rate": 3.6193585225177296e-05, + "loss": 0.4411, "step": 51580 }, { - "epoch": 1.81, - "learning_rate": 3.6826241502137884e-05, - "loss": 0.2844, + "epoch": 1.859119904854579, + "grad_norm": 0.20081162452697754, + "learning_rate": 3.619097586444934e-05, + "loss": 0.4237, "step": 51585 }, { - "epoch": 1.82, - "learning_rate": 3.682373160000438e-05, - "loss": 0.2827, + "epoch": 1.8593001045158035, + "grad_norm": 0.1473800092935562, + "learning_rate": 3.618836635124649e-05, + "loss": 0.3905, "step": 51590 }, { - "epoch": 1.82, - "learning_rate": 3.6821221544347516e-05, - "loss": 0.291, + "epoch": 1.859480304177028, + "grad_norm": 0.18639175593852997, + "learning_rate": 3.618575668560433e-05, + "loss": 0.4168, "step": 51595 }, { - "epoch": 1.82, - "learning_rate": 3.681871133519988e-05, - "loss": 0.2522, + "epoch": 1.8596605038382528, + "grad_norm": 0.17644469439983368, + "learning_rate": 3.6183146867558394e-05, + "loss": 0.4074, "step": 51600 }, { - "epoch": 1.82, - "learning_rate": 3.681620097259407e-05, - "loss": 0.2753, + "epoch": 1.8598407034994775, + "grad_norm": 0.15397772192955017, + "learning_rate": 3.6180536897144245e-05, + "loss": 0.4132, "step": 51605 }, { - "epoch": 1.82, - "learning_rate": 3.6813690456562674e-05, - "loss": 0.2762, + "epoch": 1.860020903160702, + "grad_norm": 0.17601633071899414, + "learning_rate": 3.6177926774397455e-05, + "loss": 0.4003, "step": 51610 }, { - "epoch": 1.82, - "learning_rate": 3.681117978713829e-05, - "loss": 0.2705, + "epoch": 1.8602011028219267, + "grad_norm": 0.20962753891944885, + "learning_rate": 3.617531649935356e-05, + "loss": 0.431, "step": 51615 }, { - "epoch": 1.82, - "learning_rate": 3.680866896435353e-05, - "loss": 0.2775, + "epoch": 1.8603813024831513, + "grad_norm": 0.18711386620998383, + "learning_rate": 3.617270607204816e-05, + "loss": 0.4172, "step": 51620 }, { - "epoch": 1.82, - "learning_rate": 3.680615798824097e-05, - "loss": 0.3353, + "epoch": 1.860561502144376, + "grad_norm": 0.19060426950454712, + "learning_rate": 3.6170095492516795e-05, + "loss": 0.4218, "step": 51625 }, { - "epoch": 1.82, - "learning_rate": 3.680364685883325e-05, - "loss": 0.2734, + "epoch": 1.8607417018056007, + "grad_norm": 0.24624811112880707, + "learning_rate": 3.616748476079504e-05, + "loss": 0.4342, "step": 51630 }, { - "epoch": 1.82, - "learning_rate": 3.6801135576162935e-05, - "loss": 0.277, + "epoch": 1.8609219014668252, + "grad_norm": 0.20159181952476501, + "learning_rate": 3.616487387691847e-05, + "loss": 0.3987, "step": 51635 }, { - "epoch": 1.82, - "learning_rate": 3.679862414026266e-05, - "loss": 0.2991, + "epoch": 1.8611021011280497, + "grad_norm": 0.18336619436740875, + "learning_rate": 
3.616226284092264e-05, + "loss": 0.4065, "step": 51640 }, { - "epoch": 1.82, - "learning_rate": 3.679611255116502e-05, - "loss": 0.3159, + "epoch": 1.8612823007892745, + "grad_norm": 0.21609511971473694, + "learning_rate": 3.615965165284316e-05, + "loss": 0.4113, "step": 51645 }, { - "epoch": 1.82, - "learning_rate": 3.679360080890264e-05, - "loss": 0.2819, + "epoch": 1.8614625004504992, + "grad_norm": 0.22596147656440735, + "learning_rate": 3.615704031271558e-05, + "loss": 0.4017, "step": 51650 }, { - "epoch": 1.82, - "learning_rate": 3.679108891350813e-05, - "loss": 0.2818, + "epoch": 1.861642700111724, + "grad_norm": 0.15656688809394836, + "learning_rate": 3.6154428820575484e-05, + "loss": 0.4422, "step": 51655 }, { - "epoch": 1.82, - "learning_rate": 3.6788576865014084e-05, - "loss": 0.264, + "epoch": 1.8618228997729485, + "grad_norm": 0.19237256050109863, + "learning_rate": 3.615181717645845e-05, + "loss": 0.4242, "step": 51660 }, { - "epoch": 1.82, - "learning_rate": 3.6786064663453143e-05, - "loss": 0.2892, + "epoch": 1.862003099434173, + "grad_norm": 0.18129973113536835, + "learning_rate": 3.6149205380400074e-05, + "loss": 0.4334, "step": 51665 }, { - "epoch": 1.82, - "learning_rate": 3.678355230885793e-05, - "loss": 0.2763, + "epoch": 1.8621832990953977, + "grad_norm": 0.19113698601722717, + "learning_rate": 3.614659343243594e-05, + "loss": 0.4155, "step": 51670 }, { - "epoch": 1.82, - "learning_rate": 3.6781039801261054e-05, - "loss": 0.2865, + "epoch": 1.8623634987566224, + "grad_norm": 0.20505818724632263, + "learning_rate": 3.6143981332601615e-05, + "loss": 0.4176, "step": 51675 }, { - "epoch": 1.82, - "learning_rate": 3.6778527140695135e-05, - "loss": 0.2993, + "epoch": 1.862543698417847, + "grad_norm": 0.16761788725852966, + "learning_rate": 3.6141369080932705e-05, + "loss": 0.422, "step": 51680 }, { - "epoch": 1.82, - "learning_rate": 3.67760143271928e-05, - "loss": 0.266, + "epoch": 1.8627238980790715, + "grad_norm": 0.18111896514892578, + "learning_rate": 3.61387566774648e-05, + "loss": 0.4167, "step": 51685 }, { - "epoch": 1.82, - "learning_rate": 3.677350136078668e-05, - "loss": 0.2778, + "epoch": 1.8629040977402962, + "grad_norm": 0.15977542102336884, + "learning_rate": 3.6136144122233497e-05, + "loss": 0.3934, "step": 51690 }, { - "epoch": 1.82, - "learning_rate": 3.6770988241509406e-05, - "loss": 0.2742, + "epoch": 1.863084297401521, + "grad_norm": 0.17929469048976898, + "learning_rate": 3.6133531415274376e-05, + "loss": 0.4395, "step": 51695 }, { - "epoch": 1.82, - "learning_rate": 3.6768474969393607e-05, - "loss": 0.2973, + "epoch": 1.8632644970627457, + "grad_norm": 0.19982290267944336, + "learning_rate": 3.613091855662305e-05, + "loss": 0.3916, "step": 51700 }, { - "epoch": 1.82, - "learning_rate": 3.6765961544471925e-05, - "loss": 0.3009, + "epoch": 1.8634446967239702, + "grad_norm": 0.16737203299999237, + "learning_rate": 3.6128305546315114e-05, + "loss": 0.4053, "step": 51705 }, { - "epoch": 1.82, - "learning_rate": 3.6763447966776975e-05, - "loss": 0.2739, + "epoch": 1.8636248963851947, + "grad_norm": 0.1759185642004013, + "learning_rate": 3.6125692384386164e-05, + "loss": 0.3961, "step": 51710 }, { - "epoch": 1.82, - "learning_rate": 3.676093423634142e-05, - "loss": 0.2892, + "epoch": 1.8638050960464194, + "grad_norm": 0.1947273313999176, + "learning_rate": 3.612307907087182e-05, + "loss": 0.4288, "step": 51715 }, { - "epoch": 1.82, - "learning_rate": 3.6758420353197863e-05, - "loss": 0.3095, + "epoch": 1.8639852957076442, + "grad_norm": 0.14398063719272614, 
+ "learning_rate": 3.6120465605807666e-05, + "loss": 0.3918, "step": 51720 }, { - "epoch": 1.82, - "learning_rate": 3.675590631737898e-05, - "loss": 0.2716, + "epoch": 1.8641654953688687, + "grad_norm": 0.18036890029907227, + "learning_rate": 3.611785198922933e-05, + "loss": 0.422, "step": 51725 }, { - "epoch": 1.82, - "learning_rate": 3.6753392128917396e-05, - "loss": 0.2891, + "epoch": 1.8643456950300934, + "grad_norm": 0.18519078195095062, + "learning_rate": 3.611523822117241e-05, + "loss": 0.4418, "step": 51730 }, { - "epoch": 1.82, - "learning_rate": 3.675087778784577e-05, - "loss": 0.2994, + "epoch": 1.864525894691318, + "grad_norm": 0.20603454113006592, + "learning_rate": 3.611262430167253e-05, + "loss": 0.4232, "step": 51735 }, { - "epoch": 1.82, - "learning_rate": 3.674836329419673e-05, - "loss": 0.2867, + "epoch": 1.8647060943525426, + "grad_norm": 0.18835075199604034, + "learning_rate": 3.6110010230765276e-05, + "loss": 0.4057, "step": 51740 }, { - "epoch": 1.82, - "learning_rate": 3.674584864800294e-05, - "loss": 0.2896, + "epoch": 1.8648862940137674, + "grad_norm": 0.18818290531635284, + "learning_rate": 3.6107396008486296e-05, + "loss": 0.4408, "step": 51745 }, { - "epoch": 1.82, - "learning_rate": 3.674333384929705e-05, - "loss": 0.301, + "epoch": 1.8650664936749919, + "grad_norm": 0.16613999009132385, + "learning_rate": 3.61047816348712e-05, + "loss": 0.3983, "step": 51750 }, { - "epoch": 1.82, - "learning_rate": 3.67408188981117e-05, - "loss": 0.2757, + "epoch": 1.8652466933362164, + "grad_norm": 0.19603601098060608, + "learning_rate": 3.6102167109955594e-05, + "loss": 0.433, "step": 51755 }, { - "epoch": 1.82, - "learning_rate": 3.6738303794479556e-05, - "loss": 0.2815, + "epoch": 1.8654268929974411, + "grad_norm": 0.1884978711605072, + "learning_rate": 3.609955243377511e-05, + "loss": 0.4285, "step": 51760 }, { - "epoch": 1.82, - "learning_rate": 3.673578853843328e-05, - "loss": 0.3074, + "epoch": 1.8656070926586659, + "grad_norm": 0.19464312493801117, + "learning_rate": 3.609693760636538e-05, + "loss": 0.3915, "step": 51765 }, { - "epoch": 1.82, - "learning_rate": 3.673327313000552e-05, - "loss": 0.2867, + "epoch": 1.8657872923198906, + "grad_norm": 0.20593151450157166, + "learning_rate": 3.609432262776202e-05, + "loss": 0.4311, "step": 51770 }, { - "epoch": 1.82, - "learning_rate": 3.673075756922894e-05, - "loss": 0.2663, + "epoch": 1.8659674919811151, + "grad_norm": 0.1879161298274994, + "learning_rate": 3.6091707498000666e-05, + "loss": 0.4121, "step": 51775 }, { - "epoch": 1.82, - "learning_rate": 3.6728241856136204e-05, - "loss": 0.2919, + "epoch": 1.8661476916423396, + "grad_norm": 0.19726385176181793, + "learning_rate": 3.608909221711694e-05, + "loss": 0.3844, "step": 51780 }, { - "epoch": 1.82, - "learning_rate": 3.672572599075998e-05, - "loss": 0.3072, + "epoch": 1.8663278913035644, + "grad_norm": 0.1846427619457245, + "learning_rate": 3.6086476785146486e-05, + "loss": 0.4014, "step": 51785 }, { - "epoch": 1.82, - "learning_rate": 3.672320997313292e-05, - "loss": 0.2797, + "epoch": 1.866508090964789, + "grad_norm": 0.1984597146511078, + "learning_rate": 3.6083861202124926e-05, + "loss": 0.4117, "step": 51790 }, { - "epoch": 1.82, - "learning_rate": 3.672069380328772e-05, - "loss": 0.2797, + "epoch": 1.8666882906260136, + "grad_norm": 0.14796842634677887, + "learning_rate": 3.60812454680879e-05, + "loss": 0.3917, "step": 51795 }, { - "epoch": 1.82, - "learning_rate": 3.671817748125702e-05, - "loss": 0.3024, + "epoch": 1.8668684902872381, + "grad_norm": 
0.21340517699718475, + "learning_rate": 3.607862958307106e-05, + "loss": 0.3993, "step": 51800 }, { - "epoch": 1.82, - "learning_rate": 3.671566100707352e-05, - "loss": 0.2841, + "epoch": 1.8670486899484628, + "grad_norm": 0.17566043138504028, + "learning_rate": 3.607601354711003e-05, + "loss": 0.4282, "step": 51805 }, { - "epoch": 1.82, - "learning_rate": 3.6713144380769885e-05, - "loss": 0.3027, + "epoch": 1.8672288896096876, + "grad_norm": 0.19569726288318634, + "learning_rate": 3.607339736024046e-05, + "loss": 0.3953, "step": 51810 }, { - "epoch": 1.82, - "learning_rate": 3.671062760237878e-05, - "loss": 0.2932, + "epoch": 1.8674090892709123, + "grad_norm": 0.18134260177612305, + "learning_rate": 3.6070781022497996e-05, + "loss": 0.4215, "step": 51815 }, { - "epoch": 1.82, - "learning_rate": 3.6708110671932896e-05, - "loss": 0.2705, + "epoch": 1.8675892889321368, + "grad_norm": 0.18229928612709045, + "learning_rate": 3.606816453391828e-05, + "loss": 0.3922, "step": 51820 }, { - "epoch": 1.82, - "learning_rate": 3.670559358946491e-05, - "loss": 0.2865, + "epoch": 1.8677694885933613, + "grad_norm": 0.16680172085762024, + "learning_rate": 3.606554789453697e-05, + "loss": 0.3905, "step": 51825 }, { - "epoch": 1.82, - "learning_rate": 3.6703076355007506e-05, - "loss": 0.2862, + "epoch": 1.867949688254586, + "grad_norm": 0.20481085777282715, + "learning_rate": 3.606293110438972e-05, + "loss": 0.4282, "step": 51830 }, { - "epoch": 1.82, - "learning_rate": 3.670055896859337e-05, - "loss": 0.3049, + "epoch": 1.8681298879158108, + "grad_norm": 0.17302840948104858, + "learning_rate": 3.6060314163512164e-05, + "loss": 0.4102, "step": 51835 }, { - "epoch": 1.82, - "learning_rate": 3.669804143025519e-05, - "loss": 0.2863, + "epoch": 1.8683100875770353, + "grad_norm": 0.1852729618549347, + "learning_rate": 3.605769707193997e-05, + "loss": 0.4104, "step": 51840 }, { - "epoch": 1.82, - "learning_rate": 3.669552374002564e-05, - "loss": 0.2776, + "epoch": 1.8684902872382598, + "grad_norm": 0.1932128518819809, + "learning_rate": 3.6055079829708795e-05, + "loss": 0.4389, "step": 51845 }, { - "epoch": 1.82, - "learning_rate": 3.6693005897937435e-05, - "loss": 0.2851, + "epoch": 1.8686704868994846, + "grad_norm": 0.20994549989700317, + "learning_rate": 3.60524624368543e-05, + "loss": 0.4512, "step": 51850 }, { - "epoch": 1.82, - "learning_rate": 3.669048790402324e-05, - "loss": 0.2902, + "epoch": 1.8688506865607093, + "grad_norm": 0.2091553956270218, + "learning_rate": 3.6049844893412143e-05, + "loss": 0.4099, "step": 51855 }, { - "epoch": 1.82, - "learning_rate": 3.6687969758315774e-05, - "loss": 0.2835, + "epoch": 1.869030886221934, + "grad_norm": 0.16755080223083496, + "learning_rate": 3.6047227199417987e-05, + "loss": 0.4143, "step": 51860 }, { - "epoch": 1.82, - "learning_rate": 3.668545146084772e-05, - "loss": 0.2973, + "epoch": 1.8692110858831585, + "grad_norm": 0.23165898025035858, + "learning_rate": 3.6044609354907505e-05, + "loss": 0.4135, "step": 51865 }, { - "epoch": 1.82, - "learning_rate": 3.668293301165179e-05, - "loss": 0.2554, + "epoch": 1.869391285544383, + "grad_norm": 0.20910170674324036, + "learning_rate": 3.604199135991635e-05, + "loss": 0.4071, "step": 51870 }, { - "epoch": 1.83, - "learning_rate": 3.668041441076066e-05, - "loss": 0.2768, + "epoch": 1.8695714852056078, + "grad_norm": 0.17225544154644012, + "learning_rate": 3.603937321448021e-05, + "loss": 0.4114, "step": 51875 }, { - "epoch": 1.83, - "learning_rate": 3.667789565820705e-05, - "loss": 0.2787, + "epoch": 
1.8697516848668325, + "grad_norm": 0.17193418741226196, + "learning_rate": 3.6036754918634744e-05, + "loss": 0.4112, "step": 51880 }, { - "epoch": 1.83, - "learning_rate": 3.667537675402366e-05, - "loss": 0.2967, + "epoch": 1.8699318845280573, + "grad_norm": 0.17545709013938904, + "learning_rate": 3.6034136472415624e-05, + "loss": 0.4146, "step": 51885 }, { - "epoch": 1.83, - "learning_rate": 3.667285769824321e-05, - "loss": 0.2772, + "epoch": 1.8701120841892818, + "grad_norm": 0.17598995566368103, + "learning_rate": 3.603151787585853e-05, + "loss": 0.3975, "step": 51890 }, { - "epoch": 1.83, - "learning_rate": 3.667033849089838e-05, - "loss": 0.3012, + "epoch": 1.8702922838505063, + "grad_norm": 0.19214802980422974, + "learning_rate": 3.602889912899915e-05, + "loss": 0.4483, "step": 51895 }, { - "epoch": 1.83, - "learning_rate": 3.666781913202191e-05, - "loss": 0.2883, + "epoch": 1.870472483511731, + "grad_norm": 0.19266071915626526, + "learning_rate": 3.602628023187315e-05, + "loss": 0.4325, "step": 51900 }, { - "epoch": 1.83, - "learning_rate": 3.666529962164648e-05, - "loss": 0.2899, + "epoch": 1.8706526831729557, + "grad_norm": 0.2069164216518402, + "learning_rate": 3.602366118451621e-05, + "loss": 0.4078, "step": 51905 }, { - "epoch": 1.83, - "learning_rate": 3.6662779959804833e-05, - "loss": 0.2894, + "epoch": 1.8708328828341803, + "grad_norm": 0.18193595111370087, + "learning_rate": 3.6021041986964035e-05, + "loss": 0.4199, "step": 51910 }, { - "epoch": 1.83, - "learning_rate": 3.6660260146529676e-05, - "loss": 0.2798, + "epoch": 1.8710130824954048, + "grad_norm": 0.15143196284770966, + "learning_rate": 3.601842263925228e-05, + "loss": 0.3955, "step": 51915 }, { - "epoch": 1.83, - "learning_rate": 3.665774018185372e-05, - "loss": 0.2764, + "epoch": 1.8711932821566295, + "grad_norm": 0.1509236842393875, + "learning_rate": 3.601580314141666e-05, + "loss": 0.3925, "step": 51920 }, { - "epoch": 1.83, - "learning_rate": 3.665522006580969e-05, - "loss": 0.2734, + "epoch": 1.8713734818178542, + "grad_norm": 0.20824167132377625, + "learning_rate": 3.601318349349285e-05, + "loss": 0.4159, "step": 51925 }, { - "epoch": 1.83, - "learning_rate": 3.665269979843031e-05, - "loss": 0.3093, + "epoch": 1.871553681479079, + "grad_norm": 0.1683613657951355, + "learning_rate": 3.601056369551655e-05, + "loss": 0.3946, "step": 51930 }, { - "epoch": 1.83, - "learning_rate": 3.665017937974831e-05, - "loss": 0.2899, + "epoch": 1.8717338811403035, + "grad_norm": 0.20717251300811768, + "learning_rate": 3.600794374752346e-05, + "loss": 0.4327, "step": 51935 }, { - "epoch": 1.83, - "learning_rate": 3.664765880979639e-05, - "loss": 0.2641, + "epoch": 1.871914080801528, + "grad_norm": 0.1377881020307541, + "learning_rate": 3.600532364954926e-05, + "loss": 0.4057, "step": 51940 }, { - "epoch": 1.83, - "learning_rate": 3.664513808860732e-05, - "loss": 0.2931, + "epoch": 1.8720942804627527, + "grad_norm": 0.20057149231433868, + "learning_rate": 3.600270340162966e-05, + "loss": 0.4513, "step": 51945 }, { - "epoch": 1.83, - "learning_rate": 3.664261721621379e-05, - "loss": 0.2899, + "epoch": 1.8722744801239775, + "grad_norm": 0.1850452721118927, + "learning_rate": 3.600008300380035e-05, + "loss": 0.3977, "step": 51950 }, { - "epoch": 1.83, - "learning_rate": 3.6640096192648554e-05, - "loss": 0.3063, + "epoch": 1.872454679785202, + "grad_norm": 0.21717745065689087, + "learning_rate": 3.599746245609703e-05, + "loss": 0.3927, "step": 51955 }, { - "epoch": 1.83, - "learning_rate": 3.663757501794434e-05, - "loss": 0.2803, 
+ "epoch": 1.8726348794464265, + "grad_norm": 0.17817428708076477, + "learning_rate": 3.599484175855543e-05, + "loss": 0.4003, "step": 51960 }, { - "epoch": 1.83, - "learning_rate": 3.663505369213388e-05, - "loss": 0.2783, + "epoch": 1.8728150791076512, + "grad_norm": 0.1645549237728119, + "learning_rate": 3.599222091121123e-05, + "loss": 0.4091, "step": 51965 }, { - "epoch": 1.83, - "learning_rate": 3.663253221524992e-05, - "loss": 0.2873, + "epoch": 1.872995278768876, + "grad_norm": 0.26613032817840576, + "learning_rate": 3.598959991410015e-05, + "loss": 0.4515, "step": 51970 }, { - "epoch": 1.83, - "learning_rate": 3.66300105873252e-05, - "loss": 0.306, + "epoch": 1.8731754784301007, + "grad_norm": 0.15055564045906067, + "learning_rate": 3.5986978767257906e-05, + "loss": 0.4026, "step": 51975 }, { - "epoch": 1.83, - "learning_rate": 3.6627488808392444e-05, - "loss": 0.2775, + "epoch": 1.8733556780913252, + "grad_norm": 0.19910356402397156, + "learning_rate": 3.59843574707202e-05, + "loss": 0.4153, "step": 51980 }, { - "epoch": 1.83, - "learning_rate": 3.662496687848442e-05, - "loss": 0.2764, + "epoch": 1.8735358777525497, + "grad_norm": 0.17660050094127655, + "learning_rate": 3.598173602452274e-05, + "loss": 0.4038, "step": 51985 }, { - "epoch": 1.83, - "learning_rate": 3.662244479763386e-05, - "loss": 0.3088, + "epoch": 1.8737160774137744, + "grad_norm": 0.19887585937976837, + "learning_rate": 3.5979114428701265e-05, + "loss": 0.4239, "step": 51990 }, { - "epoch": 1.83, - "learning_rate": 3.661992256587351e-05, - "loss": 0.2778, + "epoch": 1.8738962770749992, + "grad_norm": 0.1948709338903427, + "learning_rate": 3.597649268329148e-05, + "loss": 0.4, "step": 51995 }, { - "epoch": 1.83, - "learning_rate": 3.661740018323612e-05, - "loss": 0.3253, + "epoch": 1.8740764767362237, + "grad_norm": 0.2058577537536621, + "learning_rate": 3.59738707883291e-05, + "loss": 0.4223, "step": 52000 }, { - "epoch": 1.83, - "eval_loss": 0.2752087414264679, - "eval_runtime": 10.5359, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 1.8740764767362237, + "eval_loss": 0.4389624297618866, + "eval_runtime": 3.5417, + "eval_samples_per_second": 28.235, + "eval_steps_per_second": 7.059, "step": 52000 }, { - "epoch": 1.83, - "learning_rate": 3.661487764975445e-05, - "loss": 0.2783, + "epoch": 1.8742566763974484, + "grad_norm": 0.19060571491718292, + "learning_rate": 3.5971248743849864e-05, + "loss": 0.4135, "step": 52005 }, { - "epoch": 1.83, - "learning_rate": 3.661235496546125e-05, - "loss": 0.2796, + "epoch": 1.874436876058673, + "grad_norm": 0.14245115220546722, + "learning_rate": 3.596862654988948e-05, + "loss": 0.3737, "step": 52010 }, { - "epoch": 1.83, - "learning_rate": 3.6609832130389274e-05, - "loss": 0.2624, + "epoch": 1.8746170757198977, + "grad_norm": 0.1959165632724762, + "learning_rate": 3.5966004206483687e-05, + "loss": 0.4575, "step": 52015 }, { - "epoch": 1.83, - "learning_rate": 3.6607309144571274e-05, - "loss": 0.307, + "epoch": 1.8747972753811224, + "grad_norm": 0.17014524340629578, + "learning_rate": 3.596338171366821e-05, + "loss": 0.3932, "step": 52020 }, { - "epoch": 1.83, - "learning_rate": 3.6604786008040024e-05, - "loss": 0.2823, + "epoch": 1.874977475042347, + "grad_norm": 0.18894478678703308, + "learning_rate": 3.596075907147878e-05, + "loss": 0.387, "step": 52025 }, { - "epoch": 1.83, - "learning_rate": 3.660226272082826e-05, - "loss": 0.2917, + "epoch": 1.8751576747035714, + "grad_norm": 0.1582806408405304, + "learning_rate": 3.595813627995113e-05, + 
"loss": 0.39, "step": 52030 }, { - "epoch": 1.83, - "learning_rate": 3.659973928296878e-05, - "loss": 0.2749, + "epoch": 1.8753378743647962, + "grad_norm": 0.20237891376018524, + "learning_rate": 3.595551333912099e-05, + "loss": 0.4171, "step": 52035 }, { - "epoch": 1.83, - "learning_rate": 3.659721569449431e-05, - "loss": 0.2948, + "epoch": 1.8755180740260209, + "grad_norm": 0.1860736459493637, + "learning_rate": 3.595289024902411e-05, + "loss": 0.406, "step": 52040 }, { - "epoch": 1.83, - "learning_rate": 3.6594691955437646e-05, - "loss": 0.2705, + "epoch": 1.8756982736872456, + "grad_norm": 0.191467747092247, + "learning_rate": 3.5950267009696206e-05, + "loss": 0.4538, "step": 52045 }, { - "epoch": 1.83, - "learning_rate": 3.659216806583155e-05, - "loss": 0.2612, + "epoch": 1.8758784733484701, + "grad_norm": 0.1970950961112976, + "learning_rate": 3.594764362117305e-05, + "loss": 0.3997, "step": 52050 }, { - "epoch": 1.83, - "learning_rate": 3.6589644025708785e-05, - "loss": 0.2771, + "epoch": 1.8760586730096946, + "grad_norm": 0.2063102424144745, + "learning_rate": 3.594502008349036e-05, + "loss": 0.4225, "step": 52055 }, { - "epoch": 1.83, - "learning_rate": 3.6587119835102135e-05, - "loss": 0.2768, + "epoch": 1.8762388726709194, + "grad_norm": 0.18709443509578705, + "learning_rate": 3.59423963966839e-05, + "loss": 0.3833, "step": 52060 }, { - "epoch": 1.83, - "learning_rate": 3.658459549404437e-05, - "loss": 0.2863, + "epoch": 1.8764190723321441, + "grad_norm": 0.1764218658208847, + "learning_rate": 3.59397725607894e-05, + "loss": 0.4111, "step": 52065 }, { - "epoch": 1.83, - "learning_rate": 3.658207100256827e-05, - "loss": 0.2957, + "epoch": 1.8765992719933686, + "grad_norm": 0.1787770837545395, + "learning_rate": 3.593714857584261e-05, + "loss": 0.4018, "step": 52070 }, { - "epoch": 1.83, - "learning_rate": 3.6579546360706606e-05, - "loss": 0.2587, + "epoch": 1.8767794716545931, + "grad_norm": 0.1761235147714615, + "learning_rate": 3.59345244418793e-05, + "loss": 0.4015, "step": 52075 }, { - "epoch": 1.83, - "learning_rate": 3.657702156849216e-05, - "loss": 0.2844, + "epoch": 1.8769596713158179, + "grad_norm": 0.1807931810617447, + "learning_rate": 3.593190015893521e-05, + "loss": 0.4127, "step": 52080 }, { - "epoch": 1.83, - "learning_rate": 3.657449662595773e-05, - "loss": 0.2673, + "epoch": 1.8771398709770426, + "grad_norm": 0.16054491698741913, + "learning_rate": 3.5929275727046095e-05, + "loss": 0.4037, "step": 52085 }, { - "epoch": 1.83, - "learning_rate": 3.6571971533136085e-05, - "loss": 0.2779, + "epoch": 1.8773200706382673, + "grad_norm": 0.23387478291988373, + "learning_rate": 3.5926651146247715e-05, + "loss": 0.429, "step": 52090 }, { - "epoch": 1.83, - "learning_rate": 3.656944629006001e-05, - "loss": 0.2795, + "epoch": 1.8775002702994918, + "grad_norm": 0.17151391506195068, + "learning_rate": 3.5924026416575826e-05, + "loss": 0.432, "step": 52095 }, { - "epoch": 1.83, - "learning_rate": 3.6566920896762304e-05, - "loss": 0.2798, + "epoch": 1.8776804699607164, + "grad_norm": 0.20386628806591034, + "learning_rate": 3.5921401538066195e-05, + "loss": 0.419, "step": 52100 }, { - "epoch": 1.83, - "learning_rate": 3.656439535327575e-05, - "loss": 0.281, + "epoch": 1.877860669621941, + "grad_norm": 0.16705821454524994, + "learning_rate": 3.591877651075458e-05, + "loss": 0.4308, "step": 52105 }, { - "epoch": 1.83, - "learning_rate": 3.656186965963315e-05, - "loss": 0.2881, + "epoch": 1.8780408692831658, + "grad_norm": 0.16681896150112152, + "learning_rate": 
3.591615133467675e-05, + "loss": 0.4394, "step": 52110 }, { - "epoch": 1.83, - "learning_rate": 3.655934381586728e-05, - "loss": 0.2843, + "epoch": 1.8782210689443903, + "grad_norm": 0.16043955087661743, + "learning_rate": 3.591352600986847e-05, + "loss": 0.3904, "step": 52115 }, { - "epoch": 1.83, - "learning_rate": 3.6556817822010966e-05, - "loss": 0.2842, + "epoch": 1.878401268605615, + "grad_norm": 0.24545589089393616, + "learning_rate": 3.5910900536365517e-05, + "loss": 0.4429, "step": 52120 }, { - "epoch": 1.83, - "learning_rate": 3.6554291678096975e-05, - "loss": 0.2903, + "epoch": 1.8785814682668396, + "grad_norm": 0.18531683087348938, + "learning_rate": 3.590827491420365e-05, + "loss": 0.3819, "step": 52125 }, { - "epoch": 1.83, - "learning_rate": 3.655176538415813e-05, - "loss": 0.2705, + "epoch": 1.8787616679280643, + "grad_norm": 0.21406356990337372, + "learning_rate": 3.5905649143418654e-05, + "loss": 0.4126, "step": 52130 }, { - "epoch": 1.83, - "learning_rate": 3.6549238940227215e-05, - "loss": 0.2743, + "epoch": 1.878941867589289, + "grad_norm": 0.2111523151397705, + "learning_rate": 3.590302322404629e-05, + "loss": 0.4129, "step": 52135 }, { - "epoch": 1.83, - "learning_rate": 3.6546712346337054e-05, - "loss": 0.283, + "epoch": 1.8791220672505136, + "grad_norm": 0.19074268639087677, + "learning_rate": 3.590039715612236e-05, + "loss": 0.3669, "step": 52140 }, { - "epoch": 1.83, - "learning_rate": 3.654418560252043e-05, - "loss": 0.3023, + "epoch": 1.879302266911738, + "grad_norm": 0.1679174304008484, + "learning_rate": 3.5897770939682616e-05, + "loss": 0.4018, "step": 52145 }, { - "epoch": 1.83, - "learning_rate": 3.6541658708810175e-05, - "loss": 0.2607, + "epoch": 1.8794824665729628, + "grad_norm": 0.28113430738449097, + "learning_rate": 3.5895144574762855e-05, + "loss": 0.4003, "step": 52150 }, { - "epoch": 1.83, - "learning_rate": 3.6539131665239086e-05, - "loss": 0.2968, + "epoch": 1.8796626662341875, + "grad_norm": 0.1835562139749527, + "learning_rate": 3.589251806139887e-05, + "loss": 0.3801, "step": 52155 }, { - "epoch": 1.84, - "learning_rate": 3.6536604471839964e-05, - "loss": 0.2697, + "epoch": 1.8798428658954123, + "grad_norm": 0.21328282356262207, + "learning_rate": 3.588989139962642e-05, + "loss": 0.4041, "step": 52160 }, { - "epoch": 1.84, - "learning_rate": 3.6534077128645646e-05, - "loss": 0.2761, + "epoch": 1.8800230655566368, + "grad_norm": 0.15331655740737915, + "learning_rate": 3.5887264589481324e-05, + "loss": 0.399, "step": 52165 }, { - "epoch": 1.84, - "learning_rate": 3.6531549635688945e-05, - "loss": 0.2768, + "epoch": 1.8802032652178613, + "grad_norm": 0.20487786829471588, + "learning_rate": 3.588463763099934e-05, + "loss": 0.4275, "step": 52170 }, { - "epoch": 1.84, - "learning_rate": 3.652902199300266e-05, - "loss": 0.3106, + "epoch": 1.880383464879086, + "grad_norm": 0.17890450358390808, + "learning_rate": 3.5882010524216284e-05, + "loss": 0.4195, "step": 52175 }, { - "epoch": 1.84, - "learning_rate": 3.652649420061963e-05, - "loss": 0.2749, + "epoch": 1.8805636645403108, + "grad_norm": 0.1512042135000229, + "learning_rate": 3.587938326916794e-05, + "loss": 0.398, "step": 52180 }, { - "epoch": 1.84, - "learning_rate": 3.652396625857267e-05, - "loss": 0.2913, + "epoch": 1.8807438642015353, + "grad_norm": 0.17925597727298737, + "learning_rate": 3.587675586589011e-05, + "loss": 0.4313, "step": 52185 }, { - "epoch": 1.84, - "learning_rate": 3.65214381668946e-05, - "loss": 0.2954, + "epoch": 1.8809240638627598, + "grad_norm": 0.20325055718421936, + 
"learning_rate": 3.587412831441858e-05, + "loss": 0.3756, "step": 52190 }, { - "epoch": 1.84, - "learning_rate": 3.651890992561824e-05, - "loss": 0.2702, + "epoch": 1.8811042635239845, + "grad_norm": 0.1322261095046997, + "learning_rate": 3.5871500614789155e-05, + "loss": 0.3894, "step": 52195 }, { - "epoch": 1.84, - "learning_rate": 3.6516381534776436e-05, - "loss": 0.2934, + "epoch": 1.8812844631852093, + "grad_norm": 0.1917523741722107, + "learning_rate": 3.5868872767037646e-05, + "loss": 0.4009, "step": 52200 }, { - "epoch": 1.84, - "learning_rate": 3.651385299440201e-05, - "loss": 0.2758, + "epoch": 1.881464662846434, + "grad_norm": 0.174628347158432, + "learning_rate": 3.5866244771199855e-05, + "loss": 0.395, "step": 52205 }, { - "epoch": 1.84, - "learning_rate": 3.651132430452778e-05, - "loss": 0.2816, + "epoch": 1.8816448625076585, + "grad_norm": 0.17742396891117096, + "learning_rate": 3.586361662731157e-05, + "loss": 0.4242, "step": 52210 }, { - "epoch": 1.84, - "learning_rate": 3.6508795465186596e-05, - "loss": 0.2823, + "epoch": 1.881825062168883, + "grad_norm": 0.1793578863143921, + "learning_rate": 3.5860988335408616e-05, + "loss": 0.4331, "step": 52215 }, { - "epoch": 1.84, - "learning_rate": 3.650626647641129e-05, - "loss": 0.2805, + "epoch": 1.8820052618301077, + "grad_norm": 0.18306861817836761, + "learning_rate": 3.5858359895526807e-05, + "loss": 0.4215, "step": 52220 }, { - "epoch": 1.84, - "learning_rate": 3.65037373382347e-05, - "loss": 0.3046, + "epoch": 1.8821854614913325, + "grad_norm": 0.16969892382621765, + "learning_rate": 3.585573130770193e-05, + "loss": 0.4109, "step": 52225 }, { - "epoch": 1.84, - "learning_rate": 3.650120805068965e-05, - "loss": 0.2818, + "epoch": 1.882365661152557, + "grad_norm": 0.1748346984386444, + "learning_rate": 3.585310257196983e-05, + "loss": 0.3951, "step": 52230 }, { - "epoch": 1.84, - "learning_rate": 3.6498678613809e-05, - "loss": 0.2896, + "epoch": 1.8825458608137817, + "grad_norm": 0.20251350104808807, + "learning_rate": 3.5850473688366306e-05, + "loss": 0.4113, "step": 52235 }, { - "epoch": 1.84, - "learning_rate": 3.649614902762558e-05, - "loss": 0.2778, + "epoch": 1.8827260604750062, + "grad_norm": 0.19220057129859924, + "learning_rate": 3.5847844656927176e-05, + "loss": 0.3578, "step": 52240 }, { - "epoch": 1.84, - "learning_rate": 3.649361929217224e-05, - "loss": 0.2767, + "epoch": 1.882906260136231, + "grad_norm": 0.17758119106292725, + "learning_rate": 3.584521547768826e-05, + "loss": 0.3993, "step": 52245 }, { - "epoch": 1.84, - "learning_rate": 3.649108940748184e-05, - "loss": 0.2824, + "epoch": 1.8830864597974557, + "grad_norm": 0.17641225457191467, + "learning_rate": 3.584258615068539e-05, + "loss": 0.4343, "step": 52250 }, { - "epoch": 1.84, - "learning_rate": 3.648855937358721e-05, - "loss": 0.2869, + "epoch": 1.8832666594586802, + "grad_norm": 0.1818428784608841, + "learning_rate": 3.583995667595437e-05, + "loss": 0.3988, "step": 52255 }, { - "epoch": 1.84, - "learning_rate": 3.648602919052121e-05, - "loss": 0.3008, + "epoch": 1.8834468591199047, + "grad_norm": 0.16791364550590515, + "learning_rate": 3.583732705353105e-05, + "loss": 0.4238, "step": 52260 }, { - "epoch": 1.84, - "learning_rate": 3.648349885831669e-05, - "loss": 0.2764, + "epoch": 1.8836270587811295, + "grad_norm": 0.17646898329257965, + "learning_rate": 3.5834697283451244e-05, + "loss": 0.4154, "step": 52265 }, { - "epoch": 1.84, - "learning_rate": 3.6480968377006495e-05, - "loss": 0.2816, + "epoch": 1.8838072584423542, + "grad_norm": 
0.1645815521478653, + "learning_rate": 3.5832067365750794e-05, + "loss": 0.4345, "step": 52270 }, { - "epoch": 1.84, - "learning_rate": 3.647843774662351e-05, - "loss": 0.2847, + "epoch": 1.883987458103579, + "grad_norm": 0.1957109421491623, + "learning_rate": 3.5829437300465504e-05, + "loss": 0.3884, "step": 52275 }, { - "epoch": 1.84, - "learning_rate": 3.647590696720057e-05, - "loss": 0.2593, + "epoch": 1.8841676577648034, + "grad_norm": 0.20072638988494873, + "learning_rate": 3.5826807087631243e-05, + "loss": 0.3996, "step": 52280 }, { - "epoch": 1.84, - "learning_rate": 3.647337603877054e-05, - "loss": 0.2581, + "epoch": 1.884347857426028, + "grad_norm": 0.1894032508134842, + "learning_rate": 3.582417672728383e-05, + "loss": 0.3905, "step": 52285 }, { - "epoch": 1.84, - "learning_rate": 3.647084496136627e-05, - "loss": 0.2778, + "epoch": 1.8845280570872527, + "grad_norm": 0.20433971285820007, + "learning_rate": 3.58215462194591e-05, + "loss": 0.4112, "step": 52290 }, { - "epoch": 1.84, - "learning_rate": 3.646831373502065e-05, - "loss": 0.2942, + "epoch": 1.8847082567484774, + "grad_norm": 0.19983576238155365, + "learning_rate": 3.58189155641929e-05, + "loss": 0.4133, "step": 52295 }, { - "epoch": 1.84, - "learning_rate": 3.6465782359766534e-05, - "loss": 0.3032, + "epoch": 1.884888456409702, + "grad_norm": 0.1698193997144699, + "learning_rate": 3.581628476152107e-05, + "loss": 0.4222, "step": 52300 }, { - "epoch": 1.84, - "learning_rate": 3.646325083563679e-05, - "loss": 0.2676, + "epoch": 1.8850686560709264, + "grad_norm": 0.18257997930049896, + "learning_rate": 3.581365381147946e-05, + "loss": 0.4192, "step": 52305 }, { - "epoch": 1.84, - "learning_rate": 3.646071916266429e-05, - "loss": 0.2821, + "epoch": 1.8852488557321512, + "grad_norm": 0.22537992894649506, + "learning_rate": 3.5811022714103906e-05, + "loss": 0.4401, "step": 52310 }, { - "epoch": 1.84, - "learning_rate": 3.64581873408819e-05, - "loss": 0.3013, + "epoch": 1.885429055393376, + "grad_norm": 0.19952908158302307, + "learning_rate": 3.580839146943026e-05, + "loss": 0.3866, "step": 52315 }, { - "epoch": 1.84, - "learning_rate": 3.64556553703225e-05, - "loss": 0.2605, + "epoch": 1.8856092550546006, + "grad_norm": 0.25064241886138916, + "learning_rate": 3.5805760077494366e-05, + "loss": 0.4678, "step": 52320 }, { - "epoch": 1.84, - "learning_rate": 3.645312325101896e-05, - "loss": 0.2636, + "epoch": 1.8857894547158252, + "grad_norm": 0.19262924790382385, + "learning_rate": 3.580312853833209e-05, + "loss": 0.4435, "step": 52325 }, { - "epoch": 1.84, - "learning_rate": 3.645059098300417e-05, - "loss": 0.2734, + "epoch": 1.8859696543770497, + "grad_norm": 0.12994927167892456, + "learning_rate": 3.580049685197928e-05, + "loss": 0.3827, "step": 52330 }, { - "epoch": 1.84, - "learning_rate": 3.644805856631101e-05, - "loss": 0.3045, + "epoch": 1.8861498540382744, + "grad_norm": 0.12715496122837067, + "learning_rate": 3.5797865018471785e-05, + "loss": 0.4211, "step": 52335 }, { - "epoch": 1.84, - "learning_rate": 3.644552600097234e-05, - "loss": 0.2675, + "epoch": 1.8863300536994991, + "grad_norm": 0.18410317599773407, + "learning_rate": 3.5795233037845475e-05, + "loss": 0.4057, "step": 52340 }, { - "epoch": 1.84, - "learning_rate": 3.644299328702106e-05, - "loss": 0.2913, + "epoch": 1.8865102533607236, + "grad_norm": 0.2159508764743805, + "learning_rate": 3.579260091013621e-05, + "loss": 0.4253, "step": 52345 }, { - "epoch": 1.84, - "learning_rate": 3.6440460424490055e-05, - "loss": 0.2898, + "epoch": 1.8866904530219482, + 
"grad_norm": 0.23478345572948456, + "learning_rate": 3.578996863537983e-05, + "loss": 0.4077, "step": 52350 }, { - "epoch": 1.84, - "learning_rate": 3.643792741341221e-05, - "loss": 0.2949, + "epoch": 1.8868706526831729, + "grad_norm": 0.13263140618801117, + "learning_rate": 3.578733621361223e-05, + "loss": 0.381, "step": 52355 }, { - "epoch": 1.84, - "learning_rate": 3.643539425382042e-05, - "loss": 0.2776, + "epoch": 1.8870508523443976, + "grad_norm": 0.19846735894680023, + "learning_rate": 3.578470364486926e-05, + "loss": 0.4142, "step": 52360 }, { - "epoch": 1.84, - "learning_rate": 3.643286094574757e-05, - "loss": 0.2794, + "epoch": 1.8872310520056224, + "grad_norm": 0.1773809790611267, + "learning_rate": 3.578259748407673e-05, + "loss": 0.4116, "step": 52365 }, { - "epoch": 1.84, - "learning_rate": 3.643032748922655e-05, - "loss": 0.3072, + "epoch": 1.8874112516668469, + "grad_norm": 0.1703425943851471, + "learning_rate": 3.577996465086848e-05, + "loss": 0.4306, "step": 52370 }, { - "epoch": 1.84, - "learning_rate": 3.6427793884290265e-05, - "loss": 0.2788, + "epoch": 1.8875914513280714, + "grad_norm": 0.1680893748998642, + "learning_rate": 3.5777331670785305e-05, + "loss": 0.4214, "step": 52375 }, { - "epoch": 1.84, - "learning_rate": 3.6425260130971604e-05, - "loss": 0.2757, + "epoch": 1.8877716509892961, + "grad_norm": 0.2334313541650772, + "learning_rate": 3.577469854386308e-05, + "loss": 0.4327, "step": 52380 }, { - "epoch": 1.84, - "learning_rate": 3.642272622930347e-05, - "loss": 0.3123, + "epoch": 1.8879518506505208, + "grad_norm": 0.19660332798957825, + "learning_rate": 3.5772065270137665e-05, + "loss": 0.4319, "step": 52385 }, { - "epoch": 1.84, - "learning_rate": 3.642019217931877e-05, - "loss": 0.2864, + "epoch": 1.8881320503117456, + "grad_norm": 0.20360225439071655, + "learning_rate": 3.5769431849644955e-05, + "loss": 0.4157, "step": 52390 }, { - "epoch": 1.84, - "learning_rate": 3.6417657981050396e-05, - "loss": 0.2776, + "epoch": 1.88831224997297, + "grad_norm": 0.1686132401227951, + "learning_rate": 3.5766798282420814e-05, + "loss": 0.3861, "step": 52395 }, { - "epoch": 1.84, - "learning_rate": 3.641512363453126e-05, - "loss": 0.2893, + "epoch": 1.8884924496341946, + "grad_norm": 0.2118513137102127, + "learning_rate": 3.576416456850113e-05, + "loss": 0.4282, "step": 52400 }, { - "epoch": 1.84, - "learning_rate": 3.6412589139794264e-05, - "loss": 0.303, + "epoch": 1.8886726492954193, + "grad_norm": 0.16424524784088135, + "learning_rate": 3.5761530707921794e-05, + "loss": 0.4134, "step": 52405 }, { - "epoch": 1.84, - "learning_rate": 3.641005449687231e-05, - "loss": 0.2639, + "epoch": 1.888852848956644, + "grad_norm": 0.17415526509284973, + "learning_rate": 3.575889670071868e-05, + "loss": 0.4505, "step": 52410 }, { - "epoch": 1.84, - "learning_rate": 3.640751970579834e-05, - "loss": 0.3054, + "epoch": 1.8890330486178686, + "grad_norm": 0.21902436017990112, + "learning_rate": 3.575626254692768e-05, + "loss": 0.3874, "step": 52415 }, { - "epoch": 1.84, - "learning_rate": 3.6404984766605235e-05, - "loss": 0.2825, + "epoch": 1.889213248279093, + "grad_norm": 0.16111133992671967, + "learning_rate": 3.5753628246584694e-05, + "loss": 0.4127, "step": 52420 }, { - "epoch": 1.84, - "learning_rate": 3.640244967932591e-05, - "loss": 0.2827, + "epoch": 1.8893934479403178, + "grad_norm": 0.1768888384103775, + "learning_rate": 3.57509937997256e-05, + "loss": 0.4033, "step": 52425 }, { - "epoch": 1.84, - "learning_rate": 3.63999144439933e-05, - "loss": 0.2741, + "epoch": 
1.8895736476015426, + "grad_norm": 0.18009909987449646, + "learning_rate": 3.574835920638629e-05, + "loss": 0.4052, "step": 52430 }, { - "epoch": 1.84, - "learning_rate": 3.639737906064031e-05, - "loss": 0.2635, + "epoch": 1.8897538472627673, + "grad_norm": 0.21755820512771606, + "learning_rate": 3.574572446660268e-05, + "loss": 0.4571, "step": 52435 }, { - "epoch": 1.84, - "learning_rate": 3.639484352929986e-05, - "loss": 0.2647, + "epoch": 1.8899340469239918, + "grad_norm": 0.21082909405231476, + "learning_rate": 3.574308958041064e-05, + "loss": 0.3826, "step": 52440 }, { - "epoch": 1.85, - "learning_rate": 3.639230785000489e-05, - "loss": 0.268, + "epoch": 1.8901142465852163, + "grad_norm": 0.23044735193252563, + "learning_rate": 3.57404545478461e-05, + "loss": 0.4161, "step": 52445 }, { - "epoch": 1.85, - "learning_rate": 3.6389772022788295e-05, - "loss": 0.2776, + "epoch": 1.890294446246441, + "grad_norm": 0.21793107688426971, + "learning_rate": 3.573781936894493e-05, + "loss": 0.4393, "step": 52450 }, { - "epoch": 1.85, - "learning_rate": 3.638723604768302e-05, - "loss": 0.2842, + "epoch": 1.8904746459076658, + "grad_norm": 0.15808548033237457, + "learning_rate": 3.573518404374306e-05, + "loss": 0.3924, "step": 52455 }, { - "epoch": 1.85, - "learning_rate": 3.638469992472199e-05, - "loss": 0.2796, + "epoch": 1.8906548455688903, + "grad_norm": 0.18602894246578217, + "learning_rate": 3.5732548572276386e-05, + "loss": 0.4046, "step": 52460 }, { - "epoch": 1.85, - "learning_rate": 3.638216365393814e-05, - "loss": 0.2851, + "epoch": 1.8908350452301148, + "grad_norm": 0.22336095571517944, + "learning_rate": 3.572991295458081e-05, + "loss": 0.4219, "step": 52465 }, { - "epoch": 1.85, - "learning_rate": 3.6379627235364395e-05, - "loss": 0.3005, + "epoch": 1.8910152448913395, + "grad_norm": 0.1639654040336609, + "learning_rate": 3.572727719069225e-05, + "loss": 0.4163, "step": 52470 }, { - "epoch": 1.85, - "learning_rate": 3.637709066903368e-05, - "loss": 0.2855, + "epoch": 1.8911954445525643, + "grad_norm": 0.14199529588222504, + "learning_rate": 3.572464128064662e-05, + "loss": 0.4071, "step": 52475 }, { - "epoch": 1.85, - "learning_rate": 3.6374553954978955e-05, - "loss": 0.2723, + "epoch": 1.891375644213789, + "grad_norm": 0.22127693891525269, + "learning_rate": 3.5722005224479826e-05, + "loss": 0.3902, "step": 52480 }, { - "epoch": 1.85, - "learning_rate": 3.637201709323314e-05, - "loss": 0.2943, + "epoch": 1.8915558438750135, + "grad_norm": 0.19781959056854248, + "learning_rate": 3.571936902222778e-05, + "loss": 0.4002, "step": 52485 }, { - "epoch": 1.85, - "learning_rate": 3.636948008382917e-05, - "loss": 0.2891, + "epoch": 1.891736043536238, + "grad_norm": 0.19771304726600647, + "learning_rate": 3.571673267392642e-05, + "loss": 0.4017, "step": 52490 }, { - "epoch": 1.85, - "learning_rate": 3.63669429268e-05, - "loss": 0.2916, + "epoch": 1.8919162431974628, + "grad_norm": 0.22566063702106476, + "learning_rate": 3.571409617961164e-05, + "loss": 0.3925, "step": 52495 }, { - "epoch": 1.85, - "learning_rate": 3.636440562217856e-05, - "loss": 0.2823, + "epoch": 1.8920964428586875, + "grad_norm": 0.1664046347141266, + "learning_rate": 3.571145953931938e-05, + "loss": 0.3956, "step": 52500 }, { - "epoch": 1.85, - "eval_loss": 0.27478304505348206, - "eval_runtime": 10.5283, - "eval_samples_per_second": 9.498, - "eval_steps_per_second": 9.498, + "epoch": 1.8920964428586875, + "eval_loss": 0.4388228952884674, + "eval_runtime": 3.5346, + "eval_samples_per_second": 28.292, + 
"eval_steps_per_second": 7.073, "step": 52500 }, { - "epoch": 1.85, - "learning_rate": 3.636186816999781e-05, - "loss": 0.2729, + "epoch": 1.892276642519912, + "grad_norm": 0.18760746717453003, + "learning_rate": 3.5708822753085555e-05, + "loss": 0.407, "step": 52505 }, { - "epoch": 1.85, - "learning_rate": 3.6359330570290686e-05, - "loss": 0.2818, + "epoch": 1.8924568421811367, + "grad_norm": 0.19066941738128662, + "learning_rate": 3.5706185820946094e-05, + "loss": 0.3779, "step": 52510 }, { - "epoch": 1.85, - "learning_rate": 3.635679282309013e-05, - "loss": 0.2786, + "epoch": 1.8926370418423613, + "grad_norm": 0.18899337947368622, + "learning_rate": 3.570354874293692e-05, + "loss": 0.355, "step": 52515 }, { - "epoch": 1.85, - "learning_rate": 3.635425492842912e-05, - "loss": 0.2766, + "epoch": 1.892817241503586, + "grad_norm": 0.22324331104755402, + "learning_rate": 3.570091151909397e-05, + "loss": 0.406, "step": 52520 }, { - "epoch": 1.85, - "learning_rate": 3.6351716886340585e-05, - "loss": 0.287, + "epoch": 1.8929974411648107, + "grad_norm": 0.18958257138729095, + "learning_rate": 3.569827414945317e-05, + "loss": 0.426, "step": 52525 }, { - "epoch": 1.85, - "learning_rate": 3.6349178696857496e-05, - "loss": 0.2887, + "epoch": 1.8931776408260352, + "grad_norm": 0.20323504507541656, + "learning_rate": 3.5695636634050466e-05, + "loss": 0.4198, "step": 52530 }, { - "epoch": 1.85, - "learning_rate": 3.634664036001279e-05, - "loss": 0.2625, + "epoch": 1.8933578404872597, + "grad_norm": 0.2171027809381485, + "learning_rate": 3.569299897292177e-05, + "loss": 0.4075, "step": 52535 }, { - "epoch": 1.85, - "learning_rate": 3.634410187583944e-05, - "loss": 0.2751, + "epoch": 1.8935380401484845, + "grad_norm": 0.17089755833148956, + "learning_rate": 3.5690361166103045e-05, + "loss": 0.3932, "step": 52540 }, { - "epoch": 1.85, - "learning_rate": 3.6341563244370405e-05, - "loss": 0.303, + "epoch": 1.8937182398097092, + "grad_norm": 0.21072880923748016, + "learning_rate": 3.568772321363021e-05, + "loss": 0.4229, "step": 52545 }, { - "epoch": 1.85, - "learning_rate": 3.633902446563865e-05, - "loss": 0.2807, + "epoch": 1.893898439470934, + "grad_norm": 0.20696797966957092, + "learning_rate": 3.5685085115539225e-05, + "loss": 0.4261, "step": 52550 }, { - "epoch": 1.85, - "learning_rate": 3.633648553967712e-05, - "loss": 0.2919, + "epoch": 1.8940786391321585, + "grad_norm": 0.1554139405488968, + "learning_rate": 3.568244687186602e-05, + "loss": 0.388, "step": 52555 }, { - "epoch": 1.85, - "learning_rate": 3.633394646651881e-05, - "loss": 0.2839, + "epoch": 1.894258838793383, + "grad_norm": 0.19356495141983032, + "learning_rate": 3.5679808482646535e-05, + "loss": 0.4374, "step": 52560 }, { - "epoch": 1.85, - "learning_rate": 3.6331407246196674e-05, - "loss": 0.2871, + "epoch": 1.8944390384546077, + "grad_norm": 0.14796990156173706, + "learning_rate": 3.567716994791674e-05, + "loss": 0.3883, "step": 52565 }, { - "epoch": 1.85, - "learning_rate": 3.632886787874368e-05, - "loss": 0.2677, + "epoch": 1.8946192381158324, + "grad_norm": 0.21466509997844696, + "learning_rate": 3.5674531267712566e-05, + "loss": 0.435, "step": 52570 }, { - "epoch": 1.85, - "learning_rate": 3.6326328364192795e-05, - "loss": 0.2702, + "epoch": 1.894799437777057, + "grad_norm": 0.16786347329616547, + "learning_rate": 3.567189244206996e-05, + "loss": 0.3936, "step": 52575 }, { - "epoch": 1.85, - "learning_rate": 3.632378870257701e-05, - "loss": 0.28, + "epoch": 1.8949796374382815, + "grad_norm": 0.2334979772567749, + "learning_rate": 
3.5669253471024904e-05, + "loss": 0.4226, "step": 52580 }, { - "epoch": 1.85, - "learning_rate": 3.632124889392929e-05, - "loss": 0.2936, + "epoch": 1.8951598370995062, + "grad_norm": 0.14717628061771393, + "learning_rate": 3.5666614354613325e-05, + "loss": 0.3976, "step": 52585 }, { - "epoch": 1.85, - "learning_rate": 3.631870893828262e-05, - "loss": 0.2671, + "epoch": 1.895340036760731, + "grad_norm": 0.17818215489387512, + "learning_rate": 3.5663975092871194e-05, + "loss": 0.3997, "step": 52590 }, { - "epoch": 1.85, - "learning_rate": 3.631616883566996e-05, - "loss": 0.2628, + "epoch": 1.8955202364219557, + "grad_norm": 0.20797117054462433, + "learning_rate": 3.5661335685834466e-05, + "loss": 0.4407, "step": 52595 }, { - "epoch": 1.85, - "learning_rate": 3.6313628586124317e-05, - "loss": 0.284, + "epoch": 1.8957004360831802, + "grad_norm": 0.17417512834072113, + "learning_rate": 3.56586961335391e-05, + "loss": 0.4273, "step": 52600 }, { - "epoch": 1.85, - "learning_rate": 3.631108818967865e-05, - "loss": 0.2837, + "epoch": 1.8958806357444047, + "grad_norm": 0.22176715731620789, + "learning_rate": 3.565605643602107e-05, + "loss": 0.4448, "step": 52605 }, { - "epoch": 1.85, - "learning_rate": 3.630854764636597e-05, - "loss": 0.2802, + "epoch": 1.8960608354056294, + "grad_norm": 0.1701037585735321, + "learning_rate": 3.565341659331633e-05, + "loss": 0.3807, "step": 52610 }, { - "epoch": 1.85, - "learning_rate": 3.630600695621924e-05, - "loss": 0.2782, + "epoch": 1.8962410350668542, + "grad_norm": 0.17541612684726715, + "learning_rate": 3.565077660546085e-05, + "loss": 0.4121, "step": 52615 }, { - "epoch": 1.85, - "learning_rate": 3.630346611927147e-05, - "loss": 0.2628, + "epoch": 1.8964212347280787, + "grad_norm": 0.17228418588638306, + "learning_rate": 3.5648136472490604e-05, + "loss": 0.4114, "step": 52620 }, { - "epoch": 1.85, - "learning_rate": 3.6300925135555634e-05, - "loss": 0.286, + "epoch": 1.8966014343893034, + "grad_norm": 0.1823887676000595, + "learning_rate": 3.5645496194441555e-05, + "loss": 0.422, "step": 52625 }, { - "epoch": 1.85, - "learning_rate": 3.6298384005104736e-05, - "loss": 0.2759, + "epoch": 1.896781634050528, + "grad_norm": 0.181901216506958, + "learning_rate": 3.564285577134969e-05, + "loss": 0.4025, "step": 52630 }, { - "epoch": 1.85, - "learning_rate": 3.629584272795177e-05, - "loss": 0.2929, + "epoch": 1.8969618337117526, + "grad_norm": 0.19372449815273285, + "learning_rate": 3.564021520325096e-05, + "loss": 0.4179, "step": 52635 }, { - "epoch": 1.85, - "learning_rate": 3.629330130412974e-05, - "loss": 0.285, + "epoch": 1.8971420333729774, + "grad_norm": 0.2167021781206131, + "learning_rate": 3.5637574490181376e-05, + "loss": 0.4143, "step": 52640 }, { - "epoch": 1.85, - "learning_rate": 3.629075973367162e-05, - "loss": 0.2624, + "epoch": 1.8973222330342019, + "grad_norm": 0.20177282392978668, + "learning_rate": 3.563493363217689e-05, + "loss": 0.3899, "step": 52645 }, { - "epoch": 1.85, - "learning_rate": 3.628821801661043e-05, - "loss": 0.2926, + "epoch": 1.8975024326954264, + "grad_norm": 0.18920306861400604, + "learning_rate": 3.563229262927349e-05, + "loss": 0.4006, "step": 52650 }, { - "epoch": 1.85, - "learning_rate": 3.628567615297916e-05, - "loss": 0.2976, + "epoch": 1.8976826323566511, + "grad_norm": 0.14058130979537964, + "learning_rate": 3.562965148150716e-05, + "loss": 0.3701, "step": 52655 }, { - "epoch": 1.85, - "learning_rate": 3.628313414281084e-05, - "loss": 0.3159, + "epoch": 1.8978628320178759, + "grad_norm": 0.20638848841190338, + 
"learning_rate": 3.56270101889139e-05, + "loss": 0.4196, "step": 52660 }, { - "epoch": 1.85, - "learning_rate": 3.6280591986138446e-05, - "loss": 0.2822, + "epoch": 1.8980430316791006, + "grad_norm": 0.22467444837093353, + "learning_rate": 3.562436875152967e-05, + "loss": 0.4454, "step": 52665 }, { - "epoch": 1.85, - "learning_rate": 3.6278049682995e-05, - "loss": 0.2769, + "epoch": 1.8982232313403251, + "grad_norm": 0.17210647463798523, + "learning_rate": 3.562172716939048e-05, + "loss": 0.4351, "step": 52670 }, { - "epoch": 1.85, - "learning_rate": 3.6275507233413516e-05, - "loss": 0.2939, + "epoch": 1.8984034310015496, + "grad_norm": 0.1578112095594406, + "learning_rate": 3.561908544253231e-05, + "loss": 0.4413, "step": 52675 }, { - "epoch": 1.85, - "learning_rate": 3.6272964637427e-05, - "loss": 0.2559, + "epoch": 1.8985836306627744, + "grad_norm": 0.15398284792900085, + "learning_rate": 3.561644357099116e-05, + "loss": 0.3976, "step": 52680 }, { - "epoch": 1.85, - "learning_rate": 3.6270421895068465e-05, - "loss": 0.2857, + "epoch": 1.898763830323999, + "grad_norm": 0.18252988159656525, + "learning_rate": 3.561380155480302e-05, + "loss": 0.4166, "step": 52685 }, { - "epoch": 1.85, - "learning_rate": 3.6267879006370936e-05, - "loss": 0.2951, + "epoch": 1.8989440299852236, + "grad_norm": 0.16655157506465912, + "learning_rate": 3.561115939400389e-05, + "loss": 0.4013, "step": 52690 }, { - "epoch": 1.85, - "learning_rate": 3.626533597136742e-05, - "loss": 0.2767, + "epoch": 1.8991242296464481, + "grad_norm": 0.1663331389427185, + "learning_rate": 3.560851708862977e-05, + "loss": 0.3884, "step": 52695 }, { - "epoch": 1.85, - "learning_rate": 3.626279279009094e-05, - "loss": 0.2571, + "epoch": 1.8993044293076728, + "grad_norm": 0.16247087717056274, + "learning_rate": 3.560587463871665e-05, + "loss": 0.3918, "step": 52700 }, { - "epoch": 1.85, - "learning_rate": 3.626024946257452e-05, - "loss": 0.3061, + "epoch": 1.8994846289688976, + "grad_norm": 0.18449072539806366, + "learning_rate": 3.560323204430055e-05, + "loss": 0.4069, "step": 52705 }, { - "epoch": 1.85, - "learning_rate": 3.6257705988851176e-05, - "loss": 0.2548, + "epoch": 1.8996648286301223, + "grad_norm": 0.19125069677829742, + "learning_rate": 3.560058930541746e-05, + "loss": 0.4051, "step": 52710 }, { - "epoch": 1.85, - "learning_rate": 3.6255162368953946e-05, - "loss": 0.2965, + "epoch": 1.8998450282913468, + "grad_norm": 0.15312950313091278, + "learning_rate": 3.55979464221034e-05, + "loss": 0.4207, "step": 52715 }, { - "epoch": 1.85, - "learning_rate": 3.6252618602915845e-05, - "loss": 0.307, + "epoch": 1.9000252279525713, + "grad_norm": 0.2004600614309311, + "learning_rate": 3.559530339439436e-05, + "loss": 0.4074, "step": 52720 }, { - "epoch": 1.86, - "learning_rate": 3.6250074690769906e-05, - "loss": 0.2867, + "epoch": 1.900205427613796, + "grad_norm": 0.1745137721300125, + "learning_rate": 3.5592660222326375e-05, + "loss": 0.3963, "step": 52725 }, { - "epoch": 1.86, - "learning_rate": 3.6247530632549165e-05, - "loss": 0.2846, + "epoch": 1.9003856272750208, + "grad_norm": 0.15934212505817413, + "learning_rate": 3.5590016905935436e-05, + "loss": 0.3571, "step": 52730 }, { - "epoch": 1.86, - "learning_rate": 3.6244986428286656e-05, - "loss": 0.2995, + "epoch": 1.9005658269362453, + "grad_norm": 0.20348936319351196, + "learning_rate": 3.558737344525758e-05, + "loss": 0.4435, "step": 52735 }, { - "epoch": 1.86, - "learning_rate": 3.62424420780154e-05, - "loss": 0.2915, + "epoch": 1.90074602659747, + "grad_norm": 
0.20140963792800903, + "learning_rate": 3.558472984032879e-05, + "loss": 0.4502, "step": 52740 }, { - "epoch": 1.86, - "learning_rate": 3.623989758176844e-05, - "loss": 0.2876, + "epoch": 1.9009262262586946, + "grad_norm": 0.16998355090618134, + "learning_rate": 3.558208609118512e-05, + "loss": 0.4119, "step": 52745 }, { - "epoch": 1.86, - "learning_rate": 3.623735293957883e-05, - "loss": 0.2866, + "epoch": 1.9011064259199193, + "grad_norm": 0.17498774826526642, + "learning_rate": 3.5579442197862575e-05, + "loss": 0.3913, "step": 52750 }, { - "epoch": 1.86, - "learning_rate": 3.623480815147959e-05, - "loss": 0.299, + "epoch": 1.901286625581144, + "grad_norm": 0.1859028935432434, + "learning_rate": 3.557679816039717e-05, + "loss": 0.4298, "step": 52755 }, { - "epoch": 1.86, - "learning_rate": 3.623226321750377e-05, - "loss": 0.3092, + "epoch": 1.9014668252423685, + "grad_norm": 0.21767857670783997, + "learning_rate": 3.5574153978824945e-05, + "loss": 0.4364, "step": 52760 }, { - "epoch": 1.86, - "learning_rate": 3.622971813768441e-05, - "loss": 0.3142, + "epoch": 1.901647024903593, + "grad_norm": 0.1995837390422821, + "learning_rate": 3.557150965318192e-05, + "loss": 0.408, "step": 52765 }, { - "epoch": 1.86, - "learning_rate": 3.622717291205457e-05, - "loss": 0.2707, + "epoch": 1.9018272245648178, + "grad_norm": 0.22697186470031738, + "learning_rate": 3.5568865183504127e-05, + "loss": 0.4431, "step": 52770 }, { - "epoch": 1.86, - "learning_rate": 3.6224627540647286e-05, - "loss": 0.2993, + "epoch": 1.9020074242260425, + "grad_norm": 0.17042651772499084, + "learning_rate": 3.556622056982758e-05, + "loss": 0.3834, "step": 52775 }, { - "epoch": 1.86, - "learning_rate": 3.622208202349561e-05, - "loss": 0.2707, + "epoch": 1.9021876238872673, + "grad_norm": 0.16556668281555176, + "learning_rate": 3.556357581218833e-05, + "loss": 0.4083, "step": 52780 }, { - "epoch": 1.86, - "learning_rate": 3.621953636063259e-05, - "loss": 0.2887, + "epoch": 1.9023678235484918, + "grad_norm": 0.16349823772907257, + "learning_rate": 3.556093091062241e-05, + "loss": 0.3985, "step": 52785 }, { - "epoch": 1.86, - "learning_rate": 3.621699055209129e-05, - "loss": 0.2877, + "epoch": 1.9025480232097163, + "grad_norm": 0.18236073851585388, + "learning_rate": 3.555828586516584e-05, + "loss": 0.4089, "step": 52790 }, { - "epoch": 1.86, - "learning_rate": 3.621444459790476e-05, - "loss": 0.2692, + "epoch": 1.902728222870941, + "grad_norm": 0.15176883339881897, + "learning_rate": 3.5555640675854675e-05, + "loss": 0.4, "step": 52795 }, { - "epoch": 1.86, - "learning_rate": 3.621189849810606e-05, - "loss": 0.2881, + "epoch": 1.9029084225321657, + "grad_norm": 0.16588926315307617, + "learning_rate": 3.555299534272495e-05, + "loss": 0.4054, "step": 52800 }, { - "epoch": 1.86, - "learning_rate": 3.620935225272823e-05, - "loss": 0.2931, + "epoch": 1.9030886221933903, + "grad_norm": 0.17433229088783264, + "learning_rate": 3.55503498658127e-05, + "loss": 0.4082, "step": 52805 }, { - "epoch": 1.86, - "learning_rate": 3.620680586180437e-05, - "loss": 0.2877, + "epoch": 1.9032688218546148, + "grad_norm": 0.178829625248909, + "learning_rate": 3.5547704245153984e-05, + "loss": 0.4344, "step": 52810 }, { - "epoch": 1.86, - "learning_rate": 3.620425932536751e-05, - "loss": 0.2648, + "epoch": 1.9034490215158395, + "grad_norm": 0.259390652179718, + "learning_rate": 3.554505848078483e-05, + "loss": 0.407, "step": 52815 }, { - "epoch": 1.86, - "learning_rate": 3.620171264345073e-05, - "loss": 0.2814, + "epoch": 1.9036292211770642, + 
"grad_norm": 0.20816822350025177, + "learning_rate": 3.554241257274131e-05, + "loss": 0.4278, "step": 52820 }, { - "epoch": 1.86, - "learning_rate": 3.619916581608709e-05, - "loss": 0.2752, + "epoch": 1.903809420838289, + "grad_norm": 0.22725282609462738, + "learning_rate": 3.5539766521059455e-05, + "loss": 0.414, "step": 52825 }, { - "epoch": 1.86, - "learning_rate": 3.619661884330967e-05, - "loss": 0.2623, + "epoch": 1.9039896204995135, + "grad_norm": 0.20979876816272736, + "learning_rate": 3.553712032577532e-05, + "loss": 0.4301, "step": 52830 }, { - "epoch": 1.86, - "learning_rate": 3.619407172515153e-05, - "loss": 0.2936, + "epoch": 1.904169820160738, + "grad_norm": 0.16655777394771576, + "learning_rate": 3.5534473986924954e-05, + "loss": 0.4324, "step": 52835 }, { - "epoch": 1.86, - "learning_rate": 3.6191524461645736e-05, - "loss": 0.2566, + "epoch": 1.9043500198219627, + "grad_norm": 0.165932297706604, + "learning_rate": 3.553182750454442e-05, + "loss": 0.4295, "step": 52840 }, { - "epoch": 1.86, - "learning_rate": 3.618897705282538e-05, - "loss": 0.2904, + "epoch": 1.9045302194831875, + "grad_norm": 0.1914936900138855, + "learning_rate": 3.552918087866979e-05, + "loss": 0.4105, "step": 52845 }, { - "epoch": 1.86, - "learning_rate": 3.6186429498723535e-05, - "loss": 0.2571, + "epoch": 1.904710419144412, + "grad_norm": 0.15866640210151672, + "learning_rate": 3.552653410933709e-05, + "loss": 0.3874, "step": 52850 }, { - "epoch": 1.86, - "learning_rate": 3.618388179937327e-05, - "loss": 0.2748, + "epoch": 1.9048906188056365, + "grad_norm": 0.15294025838375092, + "learning_rate": 3.552388719658242e-05, + "loss": 0.3981, "step": 52855 }, { - "epoch": 1.86, - "learning_rate": 3.6181333954807674e-05, - "loss": 0.3076, + "epoch": 1.9050708184668612, + "grad_norm": 0.19104799628257751, + "learning_rate": 3.552124014044181e-05, + "loss": 0.4481, "step": 52860 }, { - "epoch": 1.86, - "learning_rate": 3.617878596505982e-05, - "loss": 0.2754, + "epoch": 1.905251018128086, + "grad_norm": 0.1697293370962143, + "learning_rate": 3.551859294095135e-05, + "loss": 0.4, "step": 52865 }, { - "epoch": 1.86, - "learning_rate": 3.6176237830162804e-05, - "loss": 0.2669, + "epoch": 1.9054312177893107, + "grad_norm": 0.17631365358829498, + "learning_rate": 3.551594559814709e-05, + "loss": 0.3584, "step": 52870 }, { - "epoch": 1.86, - "learning_rate": 3.61736895501497e-05, - "loss": 0.2724, + "epoch": 1.9056114174505352, + "grad_norm": 0.20345862209796906, + "learning_rate": 3.551329811206511e-05, + "loss": 0.4251, "step": 52875 }, { - "epoch": 1.86, - "learning_rate": 3.6171141125053595e-05, - "loss": 0.2857, + "epoch": 1.9057916171117597, + "grad_norm": 0.1694919317960739, + "learning_rate": 3.551065048274147e-05, + "loss": 0.4144, "step": 52880 }, { - "epoch": 1.86, - "learning_rate": 3.61685925549076e-05, - "loss": 0.2971, + "epoch": 1.9059718167729844, + "grad_norm": 0.1483502835035324, + "learning_rate": 3.550800271021226e-05, + "loss": 0.3912, "step": 52885 }, { - "epoch": 1.86, - "learning_rate": 3.616604383974478e-05, - "loss": 0.2753, + "epoch": 1.9061520164342092, + "grad_norm": 0.169077530503273, + "learning_rate": 3.550535479451356e-05, + "loss": 0.4053, "step": 52890 }, { - "epoch": 1.86, - "learning_rate": 3.616349497959824e-05, - "loss": 0.3067, + "epoch": 1.906332216095434, + "grad_norm": 0.20600876212120056, + "learning_rate": 3.550270673568141e-05, + "loss": 0.4135, "step": 52895 }, { - "epoch": 1.86, - "learning_rate": 3.6160945974501077e-05, - "loss": 0.2854, + "epoch": 1.9065124157566584, 
+ "grad_norm": 0.2073904126882553, + "learning_rate": 3.550005853375193e-05, + "loss": 0.4526, "step": 52900 }, { - "epoch": 1.86, - "learning_rate": 3.6158396824486385e-05, - "loss": 0.287, + "epoch": 1.906692615417883, + "grad_norm": 0.17720499634742737, + "learning_rate": 3.549741018876118e-05, + "loss": 0.4018, "step": 52905 }, { - "epoch": 1.86, - "learning_rate": 3.6155847529587264e-05, - "loss": 0.3051, + "epoch": 1.9068728150791077, + "grad_norm": 0.1841294914484024, + "learning_rate": 3.549476170074526e-05, + "loss": 0.4054, "step": 52910 }, { - "epoch": 1.86, - "learning_rate": 3.6153298089836815e-05, - "loss": 0.2939, + "epoch": 1.9070530147403324, + "grad_norm": 0.20570293068885803, + "learning_rate": 3.549211306974023e-05, + "loss": 0.4207, "step": 52915 }, { - "epoch": 1.86, - "learning_rate": 3.615074850526815e-05, - "loss": 0.2884, + "epoch": 1.907233214401557, + "grad_norm": 0.1706700176000595, + "learning_rate": 3.548946429578219e-05, + "loss": 0.4166, "step": 52920 }, { - "epoch": 1.86, - "learning_rate": 3.614819877591435e-05, - "loss": 0.3121, + "epoch": 1.9074134140627814, + "grad_norm": 0.19486333429813385, + "learning_rate": 3.5486815378907234e-05, + "loss": 0.4408, "step": 52925 }, { - "epoch": 1.86, - "learning_rate": 3.6145648901808535e-05, - "loss": 0.2799, + "epoch": 1.9075936137240062, + "grad_norm": 0.15442302823066711, + "learning_rate": 3.548416631915146e-05, + "loss": 0.4376, "step": 52930 }, { - "epoch": 1.86, - "learning_rate": 3.614309888298383e-05, - "loss": 0.3066, + "epoch": 1.9077738133852309, + "grad_norm": 0.1942632794380188, + "learning_rate": 3.5481517116550936e-05, + "loss": 0.3904, "step": 52935 }, { - "epoch": 1.86, - "learning_rate": 3.614054871947332e-05, - "loss": 0.2771, + "epoch": 1.9079540130464556, + "grad_norm": 0.16237981617450714, + "learning_rate": 3.547886777114177e-05, + "loss": 0.353, "step": 52940 }, { - "epoch": 1.86, - "learning_rate": 3.6137998411310126e-05, - "loss": 0.2908, + "epoch": 1.9081342127076801, + "grad_norm": 0.2059374898672104, + "learning_rate": 3.5476218282960064e-05, + "loss": 0.4277, "step": 52945 }, { - "epoch": 1.86, - "learning_rate": 3.613544795852736e-05, - "loss": 0.2769, + "epoch": 1.9083144123689046, + "grad_norm": 0.1902727484703064, + "learning_rate": 3.547356865204191e-05, + "loss": 0.4187, "step": 52950 }, { - "epoch": 1.86, - "learning_rate": 3.6132897361158145e-05, - "loss": 0.2981, + "epoch": 1.9084946120301294, + "grad_norm": 0.19110523164272308, + "learning_rate": 3.5470918878423414e-05, + "loss": 0.4122, "step": 52955 }, { - "epoch": 1.86, - "learning_rate": 3.613034661923559e-05, - "loss": 0.2712, + "epoch": 1.9086748116913541, + "grad_norm": 0.19622482359409332, + "learning_rate": 3.546826896214067e-05, + "loss": 0.3987, "step": 52960 }, { - "epoch": 1.86, - "learning_rate": 3.612779573279283e-05, - "loss": 0.2836, + "epoch": 1.9088550113525786, + "grad_norm": 0.19886255264282227, + "learning_rate": 3.546561890322979e-05, + "loss": 0.4114, "step": 52965 }, { - "epoch": 1.86, - "learning_rate": 3.612524470186297e-05, - "loss": 0.2873, + "epoch": 1.9090352110138031, + "grad_norm": 0.1465664505958557, + "learning_rate": 3.546296870172689e-05, + "loss": 0.3839, "step": 52970 }, { - "epoch": 1.86, - "learning_rate": 3.6122693526479135e-05, - "loss": 0.2933, + "epoch": 1.9092154106750279, + "grad_norm": 0.19967186450958252, + "learning_rate": 3.546031835766806e-05, + "loss": 0.4433, "step": 52975 }, { - "epoch": 1.86, - "learning_rate": 3.6120142206674455e-05, - "loss": 0.2809, + "epoch": 
1.9093956103362526, + "grad_norm": 0.17404790222644806, + "learning_rate": 3.545766787108941e-05, + "loss": 0.4215, "step": 52980 }, { - "epoch": 1.86, - "learning_rate": 3.611759074248206e-05, - "loss": 0.2955, + "epoch": 1.9095758099974773, + "grad_norm": 0.15910834074020386, + "learning_rate": 3.545501724202706e-05, + "loss": 0.3836, "step": 52985 }, { - "epoch": 1.86, - "learning_rate": 3.611503913393508e-05, - "loss": 0.2707, + "epoch": 1.9097560096587018, + "grad_norm": 0.17526118457317352, + "learning_rate": 3.545236647051713e-05, + "loss": 0.3867, "step": 52990 }, { - "epoch": 1.86, - "learning_rate": 3.611248738106663e-05, - "loss": 0.2893, + "epoch": 1.9099362093199264, + "grad_norm": 0.1866360753774643, + "learning_rate": 3.544971555659574e-05, + "loss": 0.3812, "step": 52995 }, { - "epoch": 1.86, - "learning_rate": 3.6109935483909876e-05, - "loss": 0.3082, + "epoch": 1.910116408981151, + "grad_norm": 0.1662186235189438, + "learning_rate": 3.544706450029898e-05, + "loss": 0.4114, "step": 53000 }, { - "epoch": 1.86, - "eval_loss": 0.27502691745758057, - "eval_runtime": 10.5394, - "eval_samples_per_second": 9.488, - "eval_steps_per_second": 9.488, + "epoch": 1.910116408981151, + "eval_loss": 0.4382114112377167, + "eval_runtime": 3.5422, + "eval_samples_per_second": 28.231, + "eval_steps_per_second": 7.058, "step": 53000 }, { - "epoch": 1.86, - "learning_rate": 3.610738344249791e-05, - "loss": 0.2816, + "epoch": 1.9102966086423758, + "grad_norm": 0.1859690397977829, + "learning_rate": 3.5444413301662996e-05, + "loss": 0.4291, "step": 53005 }, { - "epoch": 1.87, - "learning_rate": 3.6104831256863905e-05, - "loss": 0.2889, + "epoch": 1.9104768083036003, + "grad_norm": 0.2037961333990097, + "learning_rate": 3.544176196072391e-05, + "loss": 0.4178, "step": 53010 }, { - "epoch": 1.87, - "learning_rate": 3.610227892704097e-05, - "loss": 0.2816, + "epoch": 1.910657007964825, + "grad_norm": 0.19950434565544128, + "learning_rate": 3.543911047751783e-05, + "loss": 0.3977, "step": 53015 }, { - "epoch": 1.87, - "learning_rate": 3.609972645306227e-05, - "loss": 0.2639, + "epoch": 1.9108372076260496, + "grad_norm": 0.169523224234581, + "learning_rate": 3.5436458852080895e-05, + "loss": 0.4187, "step": 53020 }, { - "epoch": 1.87, - "learning_rate": 3.6097173834960935e-05, - "loss": 0.2738, + "epoch": 1.9110174072872743, + "grad_norm": 0.19487643241882324, + "learning_rate": 3.543380708444923e-05, + "loss": 0.4207, "step": 53025 }, { - "epoch": 1.87, - "learning_rate": 3.609462107277012e-05, - "loss": 0.2803, + "epoch": 1.911197606948499, + "grad_norm": 0.1871878057718277, + "learning_rate": 3.543115517465896e-05, + "loss": 0.4376, "step": 53030 }, { - "epoch": 1.87, - "learning_rate": 3.609206816652295e-05, - "loss": 0.2838, + "epoch": 1.9113778066097236, + "grad_norm": 0.17613734304904938, + "learning_rate": 3.542850312274622e-05, + "loss": 0.4136, "step": 53035 }, { - "epoch": 1.87, - "learning_rate": 3.6089515116252584e-05, - "loss": 0.2923, + "epoch": 1.911558006270948, + "grad_norm": 0.17650562524795532, + "learning_rate": 3.542585092874714e-05, + "loss": 0.3665, "step": 53040 }, { - "epoch": 1.87, - "learning_rate": 3.6086961921992173e-05, - "loss": 0.283, + "epoch": 1.9117382059321728, + "grad_norm": 0.16049924492835999, + "learning_rate": 3.542319859269787e-05, + "loss": 0.3824, "step": 53045 }, { - "epoch": 1.87, - "learning_rate": 3.6084408583774875e-05, - "loss": 0.2805, + "epoch": 1.9119184055933975, + "grad_norm": 0.1577320694923401, + "learning_rate": 3.5420546114634535e-05, + 
"loss": 0.3998, "step": 53050 }, { - "epoch": 1.87, - "learning_rate": 3.608185510163383e-05, - "loss": 0.2876, + "epoch": 1.9120986052546223, + "grad_norm": 0.22599272429943085, + "learning_rate": 3.541789349459327e-05, + "loss": 0.4059, "step": 53055 }, { - "epoch": 1.87, - "learning_rate": 3.60793014756022e-05, - "loss": 0.2843, + "epoch": 1.9122788049158468, + "grad_norm": 0.16319070756435394, + "learning_rate": 3.541524073261023e-05, + "loss": 0.3643, "step": 53060 }, { - "epoch": 1.87, - "learning_rate": 3.6076747705713145e-05, - "loss": 0.2778, + "epoch": 1.9124590045770713, + "grad_norm": 0.20491214096546173, + "learning_rate": 3.541258782872154e-05, + "loss": 0.412, "step": 53065 }, { - "epoch": 1.87, - "learning_rate": 3.607419379199982e-05, - "loss": 0.3115, + "epoch": 1.912639204238296, + "grad_norm": 0.14605240523815155, + "learning_rate": 3.540993478296337e-05, + "loss": 0.3939, "step": 53070 }, { - "epoch": 1.87, - "learning_rate": 3.6071639734495386e-05, - "loss": 0.28, + "epoch": 1.9128194038995208, + "grad_norm": 0.18107111752033234, + "learning_rate": 3.540728159537185e-05, + "loss": 0.4318, "step": 53075 }, { - "epoch": 1.87, - "learning_rate": 3.606908553323301e-05, - "loss": 0.2613, + "epoch": 1.9129996035607453, + "grad_norm": 0.16898123919963837, + "learning_rate": 3.540462826598313e-05, + "loss": 0.4414, "step": 53080 }, { - "epoch": 1.87, - "learning_rate": 3.606653118824586e-05, - "loss": 0.2805, + "epoch": 1.9131798032219698, + "grad_norm": 0.1803431212902069, + "learning_rate": 3.540197479483337e-05, + "loss": 0.4014, "step": 53085 }, { - "epoch": 1.87, - "learning_rate": 3.606397669956707e-05, - "loss": 0.2614, + "epoch": 1.9133600028831945, + "grad_norm": 0.17735476791858673, + "learning_rate": 3.539932118195871e-05, + "loss": 0.3945, "step": 53090 }, { - "epoch": 1.87, - "learning_rate": 3.606142206722985e-05, - "loss": 0.293, + "epoch": 1.9135402025444193, + "grad_norm": 0.1508413851261139, + "learning_rate": 3.539666742739532e-05, + "loss": 0.4031, "step": 53095 }, { - "epoch": 1.87, - "learning_rate": 3.6058867291267354e-05, - "loss": 0.2816, + "epoch": 1.913720402205644, + "grad_norm": 0.21895846724510193, + "learning_rate": 3.539401353117935e-05, + "loss": 0.4405, "step": 53100 }, { - "epoch": 1.87, - "learning_rate": 3.605631237171277e-05, - "loss": 0.287, + "epoch": 1.9139006018668685, + "grad_norm": 0.22758044302463531, + "learning_rate": 3.539135949334695e-05, + "loss": 0.4415, "step": 53105 }, { - "epoch": 1.87, - "learning_rate": 3.6053757308599235e-05, - "loss": 0.2713, + "epoch": 1.914080801528093, + "grad_norm": 0.17309550940990448, + "learning_rate": 3.538870531393429e-05, + "loss": 0.4228, "step": 53110 }, { - "epoch": 1.87, - "learning_rate": 3.6051202101959955e-05, - "loss": 0.3059, + "epoch": 1.9142610011893177, + "grad_norm": 0.2273998111486435, + "learning_rate": 3.538605099297754e-05, + "loss": 0.4158, "step": 53115 }, { - "epoch": 1.87, - "learning_rate": 3.604864675182809e-05, - "loss": 0.2723, + "epoch": 1.9144412008505425, + "grad_norm": 0.17294365167617798, + "learning_rate": 3.538339653051285e-05, + "loss": 0.3855, "step": 53120 }, { - "epoch": 1.87, - "learning_rate": 3.6046091258236834e-05, - "loss": 0.2759, + "epoch": 1.914621400511767, + "grad_norm": 0.2204235941171646, + "learning_rate": 3.5380741926576385e-05, + "loss": 0.4159, "step": 53125 }, { - "epoch": 1.87, - "learning_rate": 3.604353562121936e-05, - "loss": 0.2998, + "epoch": 1.9148016001729917, + "grad_norm": 0.18429414927959442, + "learning_rate": 
3.537808718120433e-05, + "loss": 0.4255, "step": 53130 }, { - "epoch": 1.87, - "learning_rate": 3.6040979840808866e-05, - "loss": 0.2741, + "epoch": 1.9149817998342162, + "grad_norm": 0.18683667480945587, + "learning_rate": 3.537543229443285e-05, + "loss": 0.4042, "step": 53135 }, { - "epoch": 1.87, - "learning_rate": 3.603842391703851e-05, - "loss": 0.2959, + "epoch": 1.915161999495441, + "grad_norm": 0.20948028564453125, + "learning_rate": 3.537330828323237e-05, + "loss": 0.3947, "step": 53140 }, { - "epoch": 1.87, - "learning_rate": 3.60358678499415e-05, - "loss": 0.2795, + "epoch": 1.9153421991566657, + "grad_norm": 0.17518873512744904, + "learning_rate": 3.5370653142033075e-05, + "loss": 0.4003, "step": 53145 }, { - "epoch": 1.87, - "learning_rate": 3.603331163955102e-05, - "loss": 0.2791, + "epoch": 1.9155223988178902, + "grad_norm": 0.13344982266426086, + "learning_rate": 3.536799785953563e-05, + "loss": 0.4035, "step": 53150 }, { - "epoch": 1.87, - "learning_rate": 3.6030755285900255e-05, - "loss": 0.2649, + "epoch": 1.9157025984791147, + "grad_norm": 0.1912095695734024, + "learning_rate": 3.5365342435776225e-05, + "loss": 0.4021, "step": 53155 }, { - "epoch": 1.87, - "learning_rate": 3.602819878902241e-05, - "loss": 0.291, + "epoch": 1.9158827981403395, + "grad_norm": 0.19049333035945892, + "learning_rate": 3.536268687079104e-05, + "loss": 0.3966, "step": 53160 }, { - "epoch": 1.87, - "learning_rate": 3.602564214895067e-05, - "loss": 0.2913, + "epoch": 1.9160629978015642, + "grad_norm": 0.14792177081108093, + "learning_rate": 3.5360031164616244e-05, + "loss": 0.4132, "step": 53165 }, { - "epoch": 1.87, - "learning_rate": 3.602308536571822e-05, - "loss": 0.3051, + "epoch": 1.916243197462789, + "grad_norm": 0.1720864176750183, + "learning_rate": 3.535737531728803e-05, + "loss": 0.4254, "step": 53170 }, { - "epoch": 1.87, - "learning_rate": 3.602052843935828e-05, - "loss": 0.2615, + "epoch": 1.9164233971240134, + "grad_norm": 0.2024461328983307, + "learning_rate": 3.5354719328842585e-05, + "loss": 0.4462, "step": 53175 }, { - "epoch": 1.87, - "learning_rate": 3.601797136990404e-05, - "loss": 0.2853, + "epoch": 1.916603596785238, + "grad_norm": 0.19676236808300018, + "learning_rate": 3.53520631993161e-05, + "loss": 0.4637, "step": 53180 }, { - "epoch": 1.87, - "learning_rate": 3.601541415738871e-05, - "loss": 0.3029, + "epoch": 1.9167837964464627, + "grad_norm": 0.19073833525180817, + "learning_rate": 3.534940692874475e-05, + "loss": 0.4156, "step": 53185 }, { - "epoch": 1.87, - "learning_rate": 3.6012856801845476e-05, - "loss": 0.2798, + "epoch": 1.9169639961076874, + "grad_norm": 0.18771030008792877, + "learning_rate": 3.5346750517164736e-05, + "loss": 0.3749, "step": 53190 }, { - "epoch": 1.87, - "learning_rate": 3.6010299303307564e-05, - "loss": 0.2792, + "epoch": 1.917144195768912, + "grad_norm": 0.20612174272537231, + "learning_rate": 3.534409396461225e-05, + "loss": 0.4305, "step": 53195 }, { - "epoch": 1.87, - "learning_rate": 3.6007741661808156e-05, - "loss": 0.2701, + "epoch": 1.9173243954301364, + "grad_norm": 0.18494312465190887, + "learning_rate": 3.534143727112349e-05, + "loss": 0.4385, "step": 53200 }, { - "epoch": 1.87, - "learning_rate": 3.60051838773805e-05, - "loss": 0.2634, + "epoch": 1.9175045950913612, + "grad_norm": 0.22901670634746552, + "learning_rate": 3.533878043673464e-05, + "loss": 0.4131, "step": 53205 }, { - "epoch": 1.87, - "learning_rate": 3.600262595005777e-05, - "loss": 0.274, + "epoch": 1.917684794752586, + "grad_norm": 0.18775056302547455, + 
"learning_rate": 3.533612346148192e-05, + "loss": 0.4204, "step": 53210 }, { - "epoch": 1.87, - "learning_rate": 3.600006787987319e-05, - "loss": 0.276, + "epoch": 1.9178649944138106, + "grad_norm": 0.2237263172864914, + "learning_rate": 3.533346634540151e-05, + "loss": 0.4032, "step": 53215 }, { - "epoch": 1.87, - "learning_rate": 3.5997509666859986e-05, - "loss": 0.254, + "epoch": 1.9180451940750352, + "grad_norm": 0.24082402884960175, + "learning_rate": 3.533080908852962e-05, + "loss": 0.3854, "step": 53220 }, { - "epoch": 1.87, - "learning_rate": 3.599495131105136e-05, - "loss": 0.2867, + "epoch": 1.9182253937362597, + "grad_norm": 0.18689344823360443, + "learning_rate": 3.5328151690902465e-05, + "loss": 0.3829, "step": 53225 }, { - "epoch": 1.87, - "learning_rate": 3.599239281248054e-05, - "loss": 0.2812, + "epoch": 1.9184055933974844, + "grad_norm": 0.20460011065006256, + "learning_rate": 3.532549415255624e-05, + "loss": 0.4378, "step": 53230 }, { - "epoch": 1.87, - "learning_rate": 3.598983417118075e-05, - "loss": 0.2513, + "epoch": 1.9185857930587091, + "grad_norm": 0.18038150668144226, + "learning_rate": 3.5322836473527154e-05, + "loss": 0.4306, "step": 53235 }, { - "epoch": 1.87, - "learning_rate": 3.5987275387185196e-05, - "loss": 0.3058, + "epoch": 1.9187659927199336, + "grad_norm": 0.18723049759864807, + "learning_rate": 3.5320178653851425e-05, + "loss": 0.384, "step": 53240 }, { - "epoch": 1.87, - "learning_rate": 3.5984716460527116e-05, - "loss": 0.2823, + "epoch": 1.9189461923811584, + "grad_norm": 0.21049338579177856, + "learning_rate": 3.531752069356525e-05, + "loss": 0.4149, "step": 53245 }, { - "epoch": 1.87, - "learning_rate": 3.598215739123973e-05, - "loss": 0.2615, + "epoch": 1.919126392042383, + "grad_norm": 0.17135761678218842, + "learning_rate": 3.531486259270486e-05, + "loss": 0.4073, "step": 53250 }, { - "epoch": 1.87, - "learning_rate": 3.597959817935628e-05, - "loss": 0.2778, + "epoch": 1.9193065917036076, + "grad_norm": 0.1982400119304657, + "learning_rate": 3.531220435130646e-05, + "loss": 0.3981, "step": 53255 }, { - "epoch": 1.87, - "learning_rate": 3.5977038824909966e-05, - "loss": 0.272, + "epoch": 1.9194867913648324, + "grad_norm": 0.18633022904396057, + "learning_rate": 3.530954596940628e-05, + "loss": 0.3868, "step": 53260 }, { - "epoch": 1.87, - "learning_rate": 3.597447932793404e-05, - "loss": 0.2803, + "epoch": 1.9196669910260569, + "grad_norm": 0.20298829674720764, + "learning_rate": 3.530688744704053e-05, + "loss": 0.4442, "step": 53265 }, { - "epoch": 1.87, - "learning_rate": 3.5971919688461744e-05, - "loss": 0.2661, + "epoch": 1.9198471906872814, + "grad_norm": 0.17261956632137299, + "learning_rate": 3.530422878424543e-05, + "loss": 0.3916, "step": 53270 }, { - "epoch": 1.87, - "learning_rate": 3.596935990652629e-05, - "loss": 0.2634, + "epoch": 1.9200273903485061, + "grad_norm": 0.2012072056531906, + "learning_rate": 3.5301569981057205e-05, + "loss": 0.4336, "step": 53275 }, { - "epoch": 1.87, - "learning_rate": 3.596679998216093e-05, - "loss": 0.2915, + "epoch": 1.9202075900097308, + "grad_norm": 0.17168846726417542, + "learning_rate": 3.5298911037512086e-05, + "loss": 0.3852, "step": 53280 }, { - "epoch": 1.87, - "learning_rate": 3.59642399153989e-05, - "loss": 0.2551, + "epoch": 1.9203877896709556, + "grad_norm": 0.21444140374660492, + "learning_rate": 3.52962519536463e-05, + "loss": 0.4031, "step": 53285 }, { - "epoch": 1.87, - "learning_rate": 3.5961679706273447e-05, - "loss": 0.2735, + "epoch": 1.92056798933218, + "grad_norm": 
0.206976979970932, + "learning_rate": 3.5293592729496076e-05, + "loss": 0.3876, "step": 53290 }, { - "epoch": 1.88, - "learning_rate": 3.59591193548178e-05, - "loss": 0.2879, + "epoch": 1.9207481889934046, + "grad_norm": 0.17099922895431519, + "learning_rate": 3.529093336509764e-05, + "loss": 0.4347, "step": 53295 }, { - "epoch": 1.88, - "learning_rate": 3.595655886106521e-05, - "loss": 0.3059, + "epoch": 1.9209283886546293, + "grad_norm": 0.19171853363513947, + "learning_rate": 3.528827386048723e-05, + "loss": 0.4066, "step": 53300 }, { - "epoch": 1.88, - "learning_rate": 3.5953998225048925e-05, - "loss": 0.2938, + "epoch": 1.921108588315854, + "grad_norm": 0.24429461359977722, + "learning_rate": 3.528561421570108e-05, + "loss": 0.4134, "step": 53305 }, { - "epoch": 1.88, - "learning_rate": 3.595143744680219e-05, - "loss": 0.2826, + "epoch": 1.9212887879770786, + "grad_norm": 0.1823347806930542, + "learning_rate": 3.528295443077543e-05, + "loss": 0.4363, "step": 53310 }, { - "epoch": 1.88, - "learning_rate": 3.594887652635826e-05, - "loss": 0.2837, + "epoch": 1.921468987638303, + "grad_norm": 0.22016625106334686, + "learning_rate": 3.528029450574651e-05, + "loss": 0.4154, "step": 53315 }, { - "epoch": 1.88, - "learning_rate": 3.594631546375038e-05, - "loss": 0.2787, + "epoch": 1.9216491872995278, + "grad_norm": 0.18516206741333008, + "learning_rate": 3.5277634440650584e-05, + "loss": 0.4497, "step": 53320 }, { - "epoch": 1.88, - "learning_rate": 3.5943754259011813e-05, - "loss": 0.2896, + "epoch": 1.9218293869607526, + "grad_norm": 0.18409641087055206, + "learning_rate": 3.527497423552386e-05, + "loss": 0.4074, "step": 53325 }, { - "epoch": 1.88, - "learning_rate": 3.594119291217581e-05, - "loss": 0.2584, + "epoch": 1.9220095866219773, + "grad_norm": 0.18460574746131897, + "learning_rate": 3.527231389040262e-05, + "loss": 0.385, "step": 53330 }, { - "epoch": 1.88, - "learning_rate": 3.5938631423275625e-05, - "loss": 0.2586, + "epoch": 1.9221897862832018, + "grad_norm": 0.18730106949806213, + "learning_rate": 3.526965340532308e-05, + "loss": 0.4215, "step": 53335 }, { - "epoch": 1.88, - "learning_rate": 3.5936069792344514e-05, - "loss": 0.2713, + "epoch": 1.9223699859444263, + "grad_norm": 0.20932596921920776, + "learning_rate": 3.52669927803215e-05, + "loss": 0.38, "step": 53340 }, { - "epoch": 1.88, - "learning_rate": 3.593350801941575e-05, - "loss": 0.286, + "epoch": 1.922550185605651, + "grad_norm": 0.20945709943771362, + "learning_rate": 3.5264332015434134e-05, + "loss": 0.386, "step": 53345 }, { - "epoch": 1.88, - "learning_rate": 3.593094610452259e-05, - "loss": 0.2882, + "epoch": 1.9227303852668758, + "grad_norm": 0.19625355303287506, + "learning_rate": 3.5261671110697234e-05, + "loss": 0.4177, "step": 53350 }, { - "epoch": 1.88, - "learning_rate": 3.59283840476983e-05, - "loss": 0.2778, + "epoch": 1.9229105849281003, + "grad_norm": 0.16801773011684418, + "learning_rate": 3.525901006614705e-05, + "loss": 0.4078, "step": 53355 }, { - "epoch": 1.88, - "learning_rate": 3.592582184897614e-05, - "loss": 0.2799, + "epoch": 1.9230907845893248, + "grad_norm": 0.23051504790782928, + "learning_rate": 3.525634888181983e-05, + "loss": 0.4157, "step": 53360 }, { - "epoch": 1.88, - "learning_rate": 3.592325950838938e-05, - "loss": 0.3066, + "epoch": 1.9232709842505495, + "grad_norm": 0.17896528542041779, + "learning_rate": 3.525368755775186e-05, + "loss": 0.4013, "step": 53365 }, { - "epoch": 1.88, - "learning_rate": 3.5920697025971306e-05, - "loss": 0.2768, + "epoch": 1.9234511839117743, + 
"grad_norm": 0.24281032383441925, + "learning_rate": 3.525102609397937e-05, + "loss": 0.4071, "step": 53370 }, { - "epoch": 1.88, - "learning_rate": 3.591813440175518e-05, - "loss": 0.2807, + "epoch": 1.923631383572999, + "grad_norm": 0.19194422662258148, + "learning_rate": 3.524836449053864e-05, + "loss": 0.4225, "step": 53375 }, { - "epoch": 1.88, - "learning_rate": 3.5915571635774256e-05, - "loss": 0.2799, + "epoch": 1.9238115832342235, + "grad_norm": 0.18676306307315826, + "learning_rate": 3.5245702747465924e-05, + "loss": 0.4121, "step": 53380 }, { - "epoch": 1.88, - "learning_rate": 3.5913008728061845e-05, - "loss": 0.2797, + "epoch": 1.923991782895448, + "grad_norm": 0.1551360785961151, + "learning_rate": 3.524304086479749e-05, + "loss": 0.4243, "step": 53385 }, { - "epoch": 1.88, - "learning_rate": 3.59104456786512e-05, - "loss": 0.2818, + "epoch": 1.9241719825566728, + "grad_norm": 0.1882885843515396, + "learning_rate": 3.5240378842569614e-05, + "loss": 0.3898, "step": 53390 }, { - "epoch": 1.88, - "learning_rate": 3.59078824875756e-05, - "loss": 0.2837, + "epoch": 1.9243521822178975, + "grad_norm": 0.22999207675457, + "learning_rate": 3.5237716680818554e-05, + "loss": 0.4386, "step": 53395 }, { - "epoch": 1.88, - "learning_rate": 3.590531915486835e-05, - "loss": 0.2849, + "epoch": 1.9245323818791222, + "grad_norm": 0.18411701917648315, + "learning_rate": 3.523505437958059e-05, + "loss": 0.3995, "step": 53400 }, { - "epoch": 1.88, - "learning_rate": 3.590275568056272e-05, - "loss": 0.2767, + "epoch": 1.9247125815403467, + "grad_norm": 0.1591692715883255, + "learning_rate": 3.5232391938891983e-05, + "loss": 0.3968, "step": 53405 }, { - "epoch": 1.88, - "learning_rate": 3.5900192064691976e-05, - "loss": 0.2593, + "epoch": 1.9248927812015713, + "grad_norm": 0.1875472217798233, + "learning_rate": 3.5229729358789026e-05, + "loss": 0.4125, "step": 53410 }, { - "epoch": 1.88, - "learning_rate": 3.5897628307289435e-05, - "loss": 0.2942, + "epoch": 1.925072980862796, + "grad_norm": 0.19325365126132965, + "learning_rate": 3.522706663930799e-05, + "loss": 0.404, "step": 53415 }, { - "epoch": 1.88, - "learning_rate": 3.589506440838836e-05, - "loss": 0.2855, + "epoch": 1.9252531805240207, + "grad_norm": 0.17323115468025208, + "learning_rate": 3.5224403780485136e-05, + "loss": 0.4047, "step": 53420 }, { - "epoch": 1.88, - "learning_rate": 3.589250036802206e-05, - "loss": 0.2643, + "epoch": 1.9254333801852452, + "grad_norm": 0.15711285173892975, + "learning_rate": 3.522174078235677e-05, + "loss": 0.3875, "step": 53425 }, { - "epoch": 1.88, - "learning_rate": 3.588993618622383e-05, - "loss": 0.3168, + "epoch": 1.9256135798464697, + "grad_norm": 0.1933702826499939, + "learning_rate": 3.521907764495917e-05, + "loss": 0.4531, "step": 53430 }, { - "epoch": 1.88, - "learning_rate": 3.588737186302695e-05, - "loss": 0.2947, + "epoch": 1.9257937795076945, + "grad_norm": 0.20678189396858215, + "learning_rate": 3.5216414368328607e-05, + "loss": 0.4352, "step": 53435 }, { - "epoch": 1.88, - "learning_rate": 3.5884807398464713e-05, - "loss": 0.271, + "epoch": 1.9259739791689192, + "grad_norm": 0.17905567586421967, + "learning_rate": 3.521375095250138e-05, + "loss": 0.4394, "step": 53440 }, { - "epoch": 1.88, - "learning_rate": 3.588224279257043e-05, - "loss": 0.2879, + "epoch": 1.926154178830144, + "grad_norm": 0.19163614511489868, + "learning_rate": 3.521108739751377e-05, + "loss": 0.3998, "step": 53445 }, { - "epoch": 1.88, - "learning_rate": 3.58796780453774e-05, - "loss": 0.2801, + "epoch": 
1.9263343784913685, + "grad_norm": 0.1524534672498703, + "learning_rate": 3.5208423703402075e-05, + "loss": 0.3911, "step": 53450 }, { - "epoch": 1.88, - "learning_rate": 3.587711315691891e-05, - "loss": 0.288, + "epoch": 1.926514578152593, + "grad_norm": 0.15856072306632996, + "learning_rate": 3.520575987020258e-05, + "loss": 0.4365, "step": 53455 }, { - "epoch": 1.88, - "learning_rate": 3.587454812722829e-05, - "loss": 0.2895, + "epoch": 1.9266947778138177, + "grad_norm": 0.20642130076885223, + "learning_rate": 3.520309589795159e-05, + "loss": 0.4418, "step": 53460 }, { - "epoch": 1.88, - "learning_rate": 3.587198295633882e-05, - "loss": 0.2864, + "epoch": 1.9268749774750424, + "grad_norm": 0.1950474977493286, + "learning_rate": 3.520043178668538e-05, + "loss": 0.4325, "step": 53465 }, { - "epoch": 1.88, - "learning_rate": 3.586941764428381e-05, - "loss": 0.2726, + "epoch": 1.927055177136267, + "grad_norm": 0.1779826283454895, + "learning_rate": 3.519776753644028e-05, + "loss": 0.4068, "step": 53470 }, { - "epoch": 1.88, - "learning_rate": 3.586685219109659e-05, - "loss": 0.2633, + "epoch": 1.9272353767974915, + "grad_norm": 0.19303326308727264, + "learning_rate": 3.5195103147252564e-05, + "loss": 0.4287, "step": 53475 }, { - "epoch": 1.88, - "learning_rate": 3.586428659681044e-05, - "loss": 0.274, + "epoch": 1.9274155764587162, + "grad_norm": 0.20207709074020386, + "learning_rate": 3.5192438619158536e-05, + "loss": 0.4266, "step": 53480 }, { - "epoch": 1.88, - "learning_rate": 3.5861720861458695e-05, - "loss": 0.2746, + "epoch": 1.927595776119941, + "grad_norm": 0.16578246653079987, + "learning_rate": 3.5189773952194506e-05, + "loss": 0.377, "step": 53485 }, { - "epoch": 1.88, - "learning_rate": 3.585915498507467e-05, - "loss": 0.2978, + "epoch": 1.9277759757811657, + "grad_norm": 0.17136074602603912, + "learning_rate": 3.518710914639678e-05, + "loss": 0.3963, "step": 53490 }, { - "epoch": 1.88, - "learning_rate": 3.5856588967691664e-05, - "loss": 0.258, + "epoch": 1.9279561754423902, + "grad_norm": 0.1691044121980667, + "learning_rate": 3.518444420180167e-05, + "loss": 0.4504, "step": 53495 }, { - "epoch": 1.88, - "learning_rate": 3.585402280934301e-05, - "loss": 0.3014, + "epoch": 1.9281363751036147, + "grad_norm": 0.1666347235441208, + "learning_rate": 3.518177911844547e-05, + "loss": 0.4177, "step": 53500 }, { - "epoch": 1.88, - "eval_loss": 0.27490025758743286, - "eval_runtime": 10.529, - "eval_samples_per_second": 9.498, - "eval_steps_per_second": 9.498, + "epoch": 1.9281363751036147, + "eval_loss": 0.43853840231895447, + "eval_runtime": 3.5305, + "eval_samples_per_second": 28.325, + "eval_steps_per_second": 7.081, "step": 53500 }, { - "epoch": 1.88, - "learning_rate": 3.585145651006202e-05, - "loss": 0.2879, + "epoch": 1.9283165747648394, + "grad_norm": 0.18548129498958588, + "learning_rate": 3.5179113896364504e-05, + "loss": 0.4455, "step": 53505 }, { - "epoch": 1.88, - "learning_rate": 3.584889006988202e-05, - "loss": 0.2526, + "epoch": 1.9284967744260642, + "grad_norm": 0.16688363254070282, + "learning_rate": 3.517644853559509e-05, + "loss": 0.3915, "step": 53510 }, { - "epoch": 1.88, - "learning_rate": 3.584632348883633e-05, - "loss": 0.2781, + "epoch": 1.9286769740872887, + "grad_norm": 0.14306481182575226, + "learning_rate": 3.517378303617353e-05, + "loss": 0.425, "step": 53515 }, { - "epoch": 1.88, - "learning_rate": 3.5843756766958285e-05, - "loss": 0.3115, + "epoch": 1.9288571737485134, + "grad_norm": 0.13732267916202545, + "learning_rate": 3.517111739813615e-05, + 
"loss": 0.4324, "step": 53520 }, { - "epoch": 1.88, - "learning_rate": 3.58411899042812e-05, - "loss": 0.2705, + "epoch": 1.929037373409738, + "grad_norm": 0.1663437783718109, + "learning_rate": 3.516845162151925e-05, + "loss": 0.4314, "step": 53525 }, { - "epoch": 1.88, - "learning_rate": 3.583862290083841e-05, - "loss": 0.2759, + "epoch": 1.9292175730709626, + "grad_norm": 0.18951812386512756, + "learning_rate": 3.516578570635917e-05, + "loss": 0.4099, "step": 53530 }, { - "epoch": 1.88, - "learning_rate": 3.583605575666324e-05, - "loss": 0.3126, + "epoch": 1.9293977727321874, + "grad_norm": 0.17902812361717224, + "learning_rate": 3.5163119652692236e-05, + "loss": 0.4343, "step": 53535 }, { - "epoch": 1.88, - "learning_rate": 3.583348847178903e-05, - "loss": 0.3062, + "epoch": 1.9295779723934119, + "grad_norm": 0.1834252029657364, + "learning_rate": 3.5160453460554766e-05, + "loss": 0.3859, "step": 53540 }, { - "epoch": 1.88, - "learning_rate": 3.583092104624911e-05, - "loss": 0.28, + "epoch": 1.9297581720546364, + "grad_norm": 0.21351586282253265, + "learning_rate": 3.515778712998307e-05, + "loss": 0.4282, "step": 53545 }, { - "epoch": 1.88, - "learning_rate": 3.582835348007682e-05, - "loss": 0.3002, + "epoch": 1.9299383717158611, + "grad_norm": 0.18002468347549438, + "learning_rate": 3.515512066101351e-05, + "loss": 0.4183, "step": 53550 }, { - "epoch": 1.88, - "learning_rate": 3.5825785773305496e-05, - "loss": 0.2674, + "epoch": 1.9301185713770859, + "grad_norm": 0.18808430433273315, + "learning_rate": 3.515245405368238e-05, + "loss": 0.3664, "step": 53555 }, { - "epoch": 1.88, - "learning_rate": 3.582373150667954e-05, - "loss": 0.2849, + "epoch": 1.9302987710383106, + "grad_norm": 0.17705509066581726, + "learning_rate": 3.5149787308026036e-05, + "loss": 0.3765, "step": 53560 }, { - "epoch": 1.88, - "learning_rate": 3.582116354691397e-05, - "loss": 0.2912, + "epoch": 1.9304789706995351, + "grad_norm": 0.17073272168636322, + "learning_rate": 3.514712042408081e-05, + "loss": 0.4367, "step": 53565 }, { - "epoch": 1.88, - "learning_rate": 3.5818595446642725e-05, - "loss": 0.2874, + "epoch": 1.9306591703607596, + "grad_norm": 0.19067902863025665, + "learning_rate": 3.514445340188303e-05, + "loss": 0.4082, "step": 53570 }, { - "epoch": 1.88, - "learning_rate": 3.581602720589915e-05, - "loss": 0.2762, + "epoch": 1.9308393700219844, + "grad_norm": 0.19914193451404572, + "learning_rate": 3.5141786241469033e-05, + "loss": 0.4077, "step": 53575 }, { - "epoch": 1.89, - "learning_rate": 3.5813458824716594e-05, - "loss": 0.296, + "epoch": 1.931019569683209, + "grad_norm": 0.15693803131580353, + "learning_rate": 3.513911894287516e-05, + "loss": 0.3914, "step": 53580 }, { - "epoch": 1.89, - "learning_rate": 3.58108903031284e-05, - "loss": 0.2853, + "epoch": 1.9311997693444336, + "grad_norm": 0.18516100943088531, + "learning_rate": 3.5136451506137766e-05, + "loss": 0.4613, "step": 53585 }, { - "epoch": 1.89, - "learning_rate": 3.580832164116791e-05, - "loss": 0.2781, + "epoch": 1.9313799690056581, + "grad_norm": 0.18044835329055786, + "learning_rate": 3.513378393129317e-05, + "loss": 0.4471, "step": 53590 }, { - "epoch": 1.89, - "learning_rate": 3.580575283886849e-05, - "loss": 0.2912, + "epoch": 1.9315601686668828, + "grad_norm": 0.19149909913539886, + "learning_rate": 3.5131116218377735e-05, + "loss": 0.4116, "step": 53595 }, { - "epoch": 1.89, - "learning_rate": 3.5803183896263495e-05, - "loss": 0.2779, + "epoch": 1.9317403683281076, + "grad_norm": 0.1986377239227295, + "learning_rate": 
3.51284483674278e-05, + "loss": 0.4369, "step": 53600 }, { - "epoch": 1.89, - "learning_rate": 3.5800614813386273e-05, - "loss": 0.2694, + "epoch": 1.9319205679893323, + "grad_norm": 0.16995567083358765, + "learning_rate": 3.5125780378479725e-05, + "loss": 0.4208, "step": 53605 }, { - "epoch": 1.89, - "learning_rate": 3.579804559027019e-05, - "loss": 0.2939, + "epoch": 1.9321007676505568, + "grad_norm": 0.16595984995365143, + "learning_rate": 3.5123112251569844e-05, + "loss": 0.3805, "step": 53610 }, { - "epoch": 1.89, - "learning_rate": 3.57954762269486e-05, - "loss": 0.2649, + "epoch": 1.9322809673117813, + "grad_norm": 0.2101103663444519, + "learning_rate": 3.5120443986734526e-05, + "loss": 0.3964, "step": 53615 }, { - "epoch": 1.89, - "learning_rate": 3.579290672345486e-05, - "loss": 0.2454, + "epoch": 1.932461166973006, + "grad_norm": 0.15714338421821594, + "learning_rate": 3.511777558401012e-05, + "loss": 0.3843, "step": 53620 }, { - "epoch": 1.89, - "learning_rate": 3.579033707982234e-05, - "loss": 0.3101, + "epoch": 1.9326413666342308, + "grad_norm": 0.14750663936138153, + "learning_rate": 3.511510704343297e-05, + "loss": 0.4092, "step": 53625 }, { - "epoch": 1.89, - "learning_rate": 3.578776729608441e-05, - "loss": 0.2627, + "epoch": 1.9328215662954553, + "grad_norm": 0.16553977131843567, + "learning_rate": 3.511243836503944e-05, + "loss": 0.419, "step": 53630 }, { - "epoch": 1.89, - "learning_rate": 3.578519737227443e-05, - "loss": 0.2664, + "epoch": 1.93300176595668, + "grad_norm": 0.23781326413154602, + "learning_rate": 3.5109769548865914e-05, + "loss": 0.3938, "step": 53635 }, { - "epoch": 1.89, - "learning_rate": 3.578262730842577e-05, - "loss": 0.2712, + "epoch": 1.9331819656179046, + "grad_norm": 0.19198031723499298, + "learning_rate": 3.510710059494871e-05, + "loss": 0.4057, "step": 53640 }, { - "epoch": 1.89, - "learning_rate": 3.5780057104571796e-05, - "loss": 0.3049, + "epoch": 1.9333621652791293, + "grad_norm": 0.1985791027545929, + "learning_rate": 3.5104431503324244e-05, + "loss": 0.4248, "step": 53645 }, { - "epoch": 1.89, - "learning_rate": 3.5777486760745887e-05, - "loss": 0.2892, + "epoch": 1.933542364940354, + "grad_norm": 0.18147054314613342, + "learning_rate": 3.510176227402884e-05, + "loss": 0.3811, "step": 53650 }, { - "epoch": 1.89, - "learning_rate": 3.577491627698142e-05, - "loss": 0.2862, + "epoch": 1.9337225646015785, + "grad_norm": 0.20852439105510712, + "learning_rate": 3.509909290709889e-05, + "loss": 0.4246, "step": 53655 }, { - "epoch": 1.89, - "learning_rate": 3.5772345653311755e-05, - "loss": 0.2894, + "epoch": 1.933902764262803, + "grad_norm": 0.19854693114757538, + "learning_rate": 3.5096423402570746e-05, + "loss": 0.4155, "step": 53660 }, { - "epoch": 1.89, - "learning_rate": 3.57697748897703e-05, - "loss": 0.2764, + "epoch": 1.9340829639240278, + "grad_norm": 0.15732385218143463, + "learning_rate": 3.5093753760480794e-05, + "loss": 0.4039, "step": 53665 }, { - "epoch": 1.89, - "learning_rate": 3.57672039863904e-05, - "loss": 0.2925, + "epoch": 1.9342631635852525, + "grad_norm": 0.17299677431583405, + "learning_rate": 3.509108398086539e-05, + "loss": 0.4189, "step": 53670 }, { - "epoch": 1.89, - "learning_rate": 3.576463294320545e-05, - "loss": 0.2872, + "epoch": 1.9344433632464773, + "grad_norm": 0.17880766093730927, + "learning_rate": 3.508841406376093e-05, + "loss": 0.4369, "step": 53675 }, { - "epoch": 1.89, - "learning_rate": 3.576206176024884e-05, - "loss": 0.2931, + "epoch": 1.9346235629077018, + "grad_norm": 0.23910261690616608, + 
"learning_rate": 3.508574400920379e-05, + "loss": 0.377, "step": 53680 }, { - "epoch": 1.89, - "learning_rate": 3.575949043755395e-05, - "loss": 0.2835, + "epoch": 1.9348037625689263, + "grad_norm": 0.165726900100708, + "learning_rate": 3.508307381723032e-05, + "loss": 0.4052, "step": 53685 }, { - "epoch": 1.89, - "learning_rate": 3.5756918975154174e-05, - "loss": 0.2891, + "epoch": 1.934983962230151, + "grad_norm": 0.18893593549728394, + "learning_rate": 3.508040348787694e-05, + "loss": 0.3806, "step": 53690 }, { - "epoch": 1.89, - "learning_rate": 3.575434737308289e-05, - "loss": 0.2525, + "epoch": 1.9351641618913757, + "grad_norm": 0.22258023917675018, + "learning_rate": 3.5077733021180006e-05, + "loss": 0.402, "step": 53695 }, { - "epoch": 1.89, - "learning_rate": 3.575177563137349e-05, - "loss": 0.3179, + "epoch": 1.9353443615526003, + "grad_norm": 0.17295654118061066, + "learning_rate": 3.5075062417175905e-05, + "loss": 0.4317, "step": 53700 }, { - "epoch": 1.89, - "learning_rate": 3.5749203750059365e-05, - "loss": 0.3078, + "epoch": 1.9355245612138248, + "grad_norm": 0.1404528021812439, + "learning_rate": 3.507239167590104e-05, + "loss": 0.4074, "step": 53705 }, { - "epoch": 1.89, - "learning_rate": 3.574663172917392e-05, - "loss": 0.2811, + "epoch": 1.9357047608750495, + "grad_norm": 0.16644835472106934, + "learning_rate": 3.5069720797391784e-05, + "loss": 0.4192, "step": 53710 }, { - "epoch": 1.89, - "learning_rate": 3.574405956875055e-05, - "loss": 0.2791, + "epoch": 1.9358849605362742, + "grad_norm": 0.21839553117752075, + "learning_rate": 3.506704978168453e-05, + "loss": 0.4101, "step": 53715 }, { - "epoch": 1.89, - "learning_rate": 3.574148726882264e-05, - "loss": 0.3034, + "epoch": 1.936065160197499, + "grad_norm": 0.21238647401332855, + "learning_rate": 3.506437862881567e-05, + "loss": 0.4314, "step": 53720 }, { - "epoch": 1.89, - "learning_rate": 3.5738914829423596e-05, - "loss": 0.2864, + "epoch": 1.9362453598587235, + "grad_norm": 0.19997538626194, + "learning_rate": 3.506170733882161e-05, + "loss": 0.371, "step": 53725 }, { - "epoch": 1.89, - "learning_rate": 3.573634225058682e-05, - "loss": 0.2926, + "epoch": 1.936425559519948, + "grad_norm": 0.18178488314151764, + "learning_rate": 3.505903591173872e-05, + "loss": 0.4023, "step": 53730 }, { - "epoch": 1.89, - "learning_rate": 3.5733769532345715e-05, - "loss": 0.2712, + "epoch": 1.9366057591811727, + "grad_norm": 0.161050945520401, + "learning_rate": 3.505636434760343e-05, + "loss": 0.4041, "step": 53735 }, { - "epoch": 1.89, - "learning_rate": 3.573119667473369e-05, - "loss": 0.2971, + "epoch": 1.9367859588423975, + "grad_norm": 0.22217293083667755, + "learning_rate": 3.505369264645211e-05, + "loss": 0.4249, "step": 53740 }, { - "epoch": 1.89, - "learning_rate": 3.572862367778415e-05, - "loss": 0.2853, + "epoch": 1.936966158503622, + "grad_norm": 0.1790328472852707, + "learning_rate": 3.505102080832118e-05, + "loss": 0.405, "step": 53745 }, { - "epoch": 1.89, - "learning_rate": 3.57260505415305e-05, - "loss": 0.2959, + "epoch": 1.9371463581648467, + "grad_norm": 0.17486228048801422, + "learning_rate": 3.504834883324704e-05, + "loss": 0.3947, "step": 53750 }, { - "epoch": 1.89, - "learning_rate": 3.5723477266006145e-05, - "loss": 0.2991, + "epoch": 1.9373265578260712, + "grad_norm": 0.1511772871017456, + "learning_rate": 3.504567672126608e-05, + "loss": 0.3633, "step": 53755 }, { - "epoch": 1.89, - "learning_rate": 3.572090385124451e-05, - "loss": 0.3107, + "epoch": 1.937506757487296, + "grad_norm": 0.18033269047737122, 
+ "learning_rate": 3.504300447241473e-05, + "loss": 0.393, "step": 53760 }, { - "epoch": 1.89, - "learning_rate": 3.5718330297279e-05, - "loss": 0.2728, + "epoch": 1.9376869571485207, + "grad_norm": 0.20827585458755493, + "learning_rate": 3.504033208672939e-05, + "loss": 0.4061, "step": 53765 }, { - "epoch": 1.89, - "learning_rate": 3.5715756604143045e-05, - "loss": 0.2799, + "epoch": 1.9378671568097452, + "grad_norm": 0.22322550415992737, + "learning_rate": 3.5037659564246464e-05, + "loss": 0.4297, "step": 53770 }, { - "epoch": 1.89, - "learning_rate": 3.571318277187005e-05, - "loss": 0.2949, + "epoch": 1.9380473564709697, + "grad_norm": 0.22495147585868835, + "learning_rate": 3.5034986905002365e-05, + "loss": 0.396, "step": 53775 }, { - "epoch": 1.89, - "learning_rate": 3.5710608800493425e-05, - "loss": 0.2918, + "epoch": 1.9382275561321944, + "grad_norm": 0.1754370629787445, + "learning_rate": 3.503231410903352e-05, + "loss": 0.4142, "step": 53780 }, { - "epoch": 1.89, - "learning_rate": 3.570803469004661e-05, - "loss": 0.3069, + "epoch": 1.9384077557934192, + "grad_norm": 0.168187215924263, + "learning_rate": 3.5029641176376335e-05, + "loss": 0.413, "step": 53785 }, { - "epoch": 1.89, - "learning_rate": 3.570546044056303e-05, - "loss": 0.3087, + "epoch": 1.938587955454644, + "grad_norm": 0.17752215266227722, + "learning_rate": 3.502696810706723e-05, + "loss": 0.4108, "step": 53790 }, { - "epoch": 1.89, - "learning_rate": 3.570288605207609e-05, - "loss": 0.2924, + "epoch": 1.9387681551158684, + "grad_norm": 0.19909372925758362, + "learning_rate": 3.502429490114263e-05, + "loss": 0.4196, "step": 53795 }, { - "epoch": 1.89, - "learning_rate": 3.570031152461923e-05, - "loss": 0.2727, + "epoch": 1.938948354777093, + "grad_norm": 0.16391627490520477, + "learning_rate": 3.502162155863896e-05, + "loss": 0.429, "step": 53800 }, { - "epoch": 1.89, - "learning_rate": 3.5697736858225885e-05, - "loss": 0.3101, + "epoch": 1.9391285544383177, + "grad_norm": 0.19088739156723022, + "learning_rate": 3.5018948079592626e-05, + "loss": 0.356, "step": 53805 }, { - "epoch": 1.89, - "learning_rate": 3.569516205292947e-05, - "loss": 0.2856, + "epoch": 1.9393087540995424, + "grad_norm": 0.16824400424957275, + "learning_rate": 3.501627446404006e-05, + "loss": 0.3539, "step": 53810 }, { - "epoch": 1.89, - "learning_rate": 3.5692587108763424e-05, - "loss": 0.2822, + "epoch": 1.939488953760767, + "grad_norm": 0.19399607181549072, + "learning_rate": 3.5013600712017704e-05, + "loss": 0.3882, "step": 53815 }, { - "epoch": 1.89, - "learning_rate": 3.569001202576119e-05, - "loss": 0.289, + "epoch": 1.9396691534219914, + "grad_norm": 0.159646674990654, + "learning_rate": 3.501092682356197e-05, + "loss": 0.3816, "step": 53820 }, { - "epoch": 1.89, - "learning_rate": 3.5687436803956184e-05, - "loss": 0.2755, + "epoch": 1.9398493530832162, + "grad_norm": 0.22390936315059662, + "learning_rate": 3.5008252798709294e-05, + "loss": 0.4189, "step": 53825 }, { - "epoch": 1.89, - "learning_rate": 3.568486144338186e-05, - "loss": 0.2801, + "epoch": 1.9400295527444409, + "grad_norm": 0.18720601499080658, + "learning_rate": 3.5005578637496114e-05, + "loss": 0.3893, "step": 53830 }, { - "epoch": 1.89, - "learning_rate": 3.5682285944071645e-05, - "loss": 0.2956, + "epoch": 1.9402097524056656, + "grad_norm": 0.2625311613082886, + "learning_rate": 3.500290433995886e-05, + "loss": 0.386, "step": 53835 }, { - "epoch": 1.89, - "learning_rate": 3.567971030605899e-05, - "loss": 0.2879, + "epoch": 1.9403899520668901, + "grad_norm": 
0.19546449184417725, + "learning_rate": 3.500022990613398e-05, + "loss": 0.4227, "step": 53840 }, { - "epoch": 1.89, - "learning_rate": 3.5677134529377334e-05, - "loss": 0.2941, + "epoch": 1.9405701517281146, + "grad_norm": 0.19765974581241608, + "learning_rate": 3.49975553360579e-05, + "loss": 0.3913, "step": 53845 }, { - "epoch": 1.89, - "learning_rate": 3.567455861406013e-05, - "loss": 0.2889, + "epoch": 1.9407503513893394, + "grad_norm": 0.17598837614059448, + "learning_rate": 3.4994880629767056e-05, + "loss": 0.3854, "step": 53850 }, { - "epoch": 1.89, - "learning_rate": 3.567198256014081e-05, - "loss": 0.2969, + "epoch": 1.9409305510505641, + "grad_norm": 0.17875751852989197, + "learning_rate": 3.499220578729791e-05, + "loss": 0.407, "step": 53855 }, { - "epoch": 1.89, - "learning_rate": 3.566940636765284e-05, - "loss": 0.2961, + "epoch": 1.9411107507117886, + "grad_norm": 0.20579689741134644, + "learning_rate": 3.498953080868689e-05, + "loss": 0.3687, "step": 53860 }, { - "epoch": 1.9, - "learning_rate": 3.566683003662964e-05, - "loss": 0.2817, + "epoch": 1.9412909503730131, + "grad_norm": 0.20091566443443298, + "learning_rate": 3.498685569397045e-05, + "loss": 0.4192, "step": 53865 }, { - "epoch": 1.9, - "learning_rate": 3.5664253567104694e-05, - "loss": 0.2542, + "epoch": 1.9414711500342379, + "grad_norm": 0.17543958127498627, + "learning_rate": 3.498418044318502e-05, + "loss": 0.3872, "step": 53870 }, { - "epoch": 1.9, - "learning_rate": 3.5661676959111444e-05, - "loss": 0.2545, + "epoch": 1.9416513496954626, + "grad_norm": 0.1831250786781311, + "learning_rate": 3.498150505636708e-05, + "loss": 0.3941, "step": 53875 }, { - "epoch": 1.9, - "learning_rate": 3.565910021268334e-05, - "loss": 0.2826, + "epoch": 1.9418315493566873, + "grad_norm": 0.18950533866882324, + "learning_rate": 3.4978829533553064e-05, + "loss": 0.4167, "step": 53880 }, { - "epoch": 1.9, - "learning_rate": 3.565652332785385e-05, - "loss": 0.2631, + "epoch": 1.9420117490179118, + "grad_norm": 0.2181074172258377, + "learning_rate": 3.497615387477942e-05, + "loss": 0.3822, "step": 53885 }, { - "epoch": 1.9, - "learning_rate": 3.5653946304656426e-05, - "loss": 0.2975, + "epoch": 1.9421919486791364, + "grad_norm": 0.20885571837425232, + "learning_rate": 3.497347808008262e-05, + "loss": 0.4137, "step": 53890 }, { - "epoch": 1.9, - "learning_rate": 3.565136914312453e-05, - "loss": 0.29, + "epoch": 1.942372148340361, + "grad_norm": 0.18355844914913177, + "learning_rate": 3.4970802149499106e-05, + "loss": 0.439, "step": 53895 }, { - "epoch": 1.9, - "learning_rate": 3.564879184329162e-05, - "loss": 0.277, + "epoch": 1.9425523480015858, + "grad_norm": 0.17857640981674194, + "learning_rate": 3.496812608306535e-05, + "loss": 0.4262, "step": 53900 }, { - "epoch": 1.9, - "learning_rate": 3.5646214405191176e-05, - "loss": 0.2721, + "epoch": 1.9427325476628103, + "grad_norm": 0.21502956748008728, + "learning_rate": 3.4965449880817795e-05, + "loss": 0.438, "step": 53905 }, { - "epoch": 1.9, - "learning_rate": 3.564363682885665e-05, - "loss": 0.283, + "epoch": 1.942912747324035, + "grad_norm": 0.17269867658615112, + "learning_rate": 3.4962773542792925e-05, + "loss": 0.3961, "step": 53910 }, { - "epoch": 1.9, - "learning_rate": 3.564105911432151e-05, - "loss": 0.2778, + "epoch": 1.9430929469852596, + "grad_norm": 0.1496947705745697, + "learning_rate": 3.4960097069027184e-05, + "loss": 0.4213, "step": 53915 }, { - "epoch": 1.9, - "learning_rate": 3.563848126161924e-05, - "loss": 0.2981, + "epoch": 1.9432731466464843, + "grad_norm": 
0.19583889842033386, + "learning_rate": 3.495742045955706e-05, + "loss": 0.4293, "step": 53920 }, { - "epoch": 1.9, - "learning_rate": 3.563590327078329e-05, - "loss": 0.3012, + "epoch": 1.943453346307709, + "grad_norm": 0.1784418672323227, + "learning_rate": 3.4954743714419006e-05, + "loss": 0.3941, "step": 53925 }, { - "epoch": 1.9, - "learning_rate": 3.5633325141847144e-05, - "loss": 0.2836, + "epoch": 1.9436335459689336, + "grad_norm": 0.2068331241607666, + "learning_rate": 3.4952066833649495e-05, + "loss": 0.4088, "step": 53930 }, { - "epoch": 1.9, - "learning_rate": 3.563074687484429e-05, - "loss": 0.2967, + "epoch": 1.943813745630158, + "grad_norm": 0.18008601665496826, + "learning_rate": 3.4949389817285e-05, + "loss": 0.3759, "step": 53935 }, { - "epoch": 1.9, - "learning_rate": 3.5628168469808184e-05, - "loss": 0.2996, + "epoch": 1.9439939452913828, + "grad_norm": 0.17084579169750214, + "learning_rate": 3.494671266536199e-05, + "loss": 0.3866, "step": 53940 }, { - "epoch": 1.9, - "learning_rate": 3.562558992677233e-05, - "loss": 0.2953, + "epoch": 1.9441741449526075, + "grad_norm": 0.1742352992296219, + "learning_rate": 3.494403537791696e-05, + "loss": 0.4175, "step": 53945 }, { - "epoch": 1.9, - "learning_rate": 3.562301124577018e-05, - "loss": 0.2628, + "epoch": 1.9443543446138323, + "grad_norm": 0.2424996942281723, + "learning_rate": 3.494135795498636e-05, + "loss": 0.416, "step": 53950 }, { - "epoch": 1.9, - "learning_rate": 3.5620432426835226e-05, - "loss": 0.2767, + "epoch": 1.9445345442750568, + "grad_norm": 0.17355568706989288, + "learning_rate": 3.493868039660669e-05, + "loss": 0.4145, "step": 53955 }, { - "epoch": 1.9, - "learning_rate": 3.561785347000096e-05, - "loss": 0.2936, + "epoch": 1.9447147439362813, + "grad_norm": 0.15699560940265656, + "learning_rate": 3.493600270281442e-05, + "loss": 0.4125, "step": 53960 }, { - "epoch": 1.9, - "learning_rate": 3.561527437530088e-05, - "loss": 0.295, + "epoch": 1.944894943597506, + "grad_norm": 0.16016830503940582, + "learning_rate": 3.493332487364604e-05, + "loss": 0.4297, "step": 53965 }, { - "epoch": 1.9, - "learning_rate": 3.561269514276844e-05, - "loss": 0.2892, + "epoch": 1.9450751432587308, + "grad_norm": 0.23133453726768494, + "learning_rate": 3.4930646909138026e-05, + "loss": 0.4289, "step": 53970 }, { - "epoch": 1.9, - "learning_rate": 3.561011577243715e-05, - "loss": 0.2718, + "epoch": 1.9452553429199553, + "grad_norm": 0.23153449594974518, + "learning_rate": 3.492796880932687e-05, + "loss": 0.4518, "step": 53975 }, { - "epoch": 1.9, - "learning_rate": 3.5607536264340497e-05, - "loss": 0.2776, + "epoch": 1.9454355425811798, + "grad_norm": 0.1748327612876892, + "learning_rate": 3.492529057424907e-05, + "loss": 0.41, "step": 53980 }, { - "epoch": 1.9, - "learning_rate": 3.560495661851198e-05, - "loss": 0.2718, + "epoch": 1.9456157422424045, + "grad_norm": 0.2023671418428421, + "learning_rate": 3.49226122039411e-05, + "loss": 0.4217, "step": 53985 }, { - "epoch": 1.9, - "learning_rate": 3.5602376834985094e-05, - "loss": 0.2969, + "epoch": 1.9457959419036293, + "grad_norm": 0.15689332783222198, + "learning_rate": 3.491993369843946e-05, + "loss": 0.3638, "step": 53990 }, { - "epoch": 1.9, - "learning_rate": 3.5599796913793326e-05, - "loss": 0.2846, + "epoch": 1.945976141564854, + "grad_norm": 0.19829018414020538, + "learning_rate": 3.4917255057780646e-05, + "loss": 0.4338, "step": 53995 }, { - "epoch": 1.9, - "learning_rate": 3.559721685497019e-05, - "loss": 0.2723, + "epoch": 1.9461563412260785, + "grad_norm": 
0.19208745658397675, + "learning_rate": 3.491457628200115e-05, + "loss": 0.4192, "step": 54000 }, { - "epoch": 1.9, - "eval_loss": 0.27377253770828247, - "eval_runtime": 10.5385, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 1.9461563412260785, + "eval_loss": 0.4374231994152069, + "eval_runtime": 3.5248, + "eval_samples_per_second": 28.37, + "eval_steps_per_second": 7.093, "step": 54000 }, { - "epoch": 1.9, - "learning_rate": 3.559463665854916e-05, - "loss": 0.2901, + "epoch": 1.946336540887303, + "grad_norm": 0.20834042131900787, + "learning_rate": 3.491189737113748e-05, + "loss": 0.4054, "step": 54005 }, { - "epoch": 1.9, - "learning_rate": 3.5592056324563775e-05, - "loss": 0.2751, + "epoch": 1.9465167405485277, + "grad_norm": 0.2228306233882904, + "learning_rate": 3.490921832522612e-05, + "loss": 0.394, "step": 54010 }, { - "epoch": 1.9, - "learning_rate": 3.5589475853047505e-05, - "loss": 0.2912, + "epoch": 1.9466969402097525, + "grad_norm": 0.1861565113067627, + "learning_rate": 3.490653914430358e-05, + "loss": 0.4106, "step": 54015 }, { - "epoch": 1.9, - "learning_rate": 3.5586895244033874e-05, - "loss": 0.2607, + "epoch": 1.946877139870977, + "grad_norm": 0.1798921376466751, + "learning_rate": 3.490385982840636e-05, + "loss": 0.4095, "step": 54020 }, { - "epoch": 1.9, - "learning_rate": 3.5584314497556385e-05, - "loss": 0.2925, + "epoch": 1.9470573395322017, + "grad_norm": 0.15824037790298462, + "learning_rate": 3.490118037757097e-05, + "loss": 0.4057, "step": 54025 }, { - "epoch": 1.9, - "learning_rate": 3.558173361364855e-05, - "loss": 0.2835, + "epoch": 1.9472375391934262, + "grad_norm": 0.18199415504932404, + "learning_rate": 3.489850079183391e-05, + "loss": 0.3894, "step": 54030 }, { - "epoch": 1.9, - "learning_rate": 3.557915259234387e-05, - "loss": 0.275, + "epoch": 1.947417738854651, + "grad_norm": 0.18391664326190948, + "learning_rate": 3.489582107123169e-05, + "loss": 0.4136, "step": 54035 }, { - "epoch": 1.9, - "learning_rate": 3.5576571433675876e-05, - "loss": 0.2868, + "epoch": 1.9475979385158757, + "grad_norm": 0.1818101406097412, + "learning_rate": 3.489314121580084e-05, + "loss": 0.3785, "step": 54040 }, { - "epoch": 1.9, - "learning_rate": 3.5573990137678065e-05, - "loss": 0.2768, + "epoch": 1.9477781381771002, + "grad_norm": 0.2508971095085144, + "learning_rate": 3.4890461225577844e-05, + "loss": 0.4335, "step": 54045 }, { - "epoch": 1.9, - "learning_rate": 3.557140870438397e-05, - "loss": 0.2995, + "epoch": 1.9479583378383247, + "grad_norm": 0.17361201345920563, + "learning_rate": 3.488778110059924e-05, + "loss": 0.3918, "step": 54050 }, { - "epoch": 1.9, - "learning_rate": 3.556882713382709e-05, - "loss": 0.2717, + "epoch": 1.9481385374995495, + "grad_norm": 0.19530875980854034, + "learning_rate": 3.488510084090152e-05, + "loss": 0.4348, "step": 54055 }, { - "epoch": 1.9, - "learning_rate": 3.556624542604096e-05, - "loss": 0.2686, + "epoch": 1.9483187371607742, + "grad_norm": 0.19267144799232483, + "learning_rate": 3.488242044652122e-05, + "loss": 0.4229, "step": 54060 }, { - "epoch": 1.9, - "learning_rate": 3.5563663581059084e-05, - "loss": 0.2718, + "epoch": 1.948498936821999, + "grad_norm": 0.1983078569173813, + "learning_rate": 3.487973991749485e-05, + "loss": 0.4049, "step": 54065 }, { - "epoch": 1.9, - "learning_rate": 3.556108159891502e-05, - "loss": 0.3123, + "epoch": 1.9486791364832234, + "grad_norm": 0.156129390001297, + "learning_rate": 3.487705925385894e-05, + "loss": 0.4582, "step": 54070 }, { - "epoch": 1.9, - 
"learning_rate": 3.555849947964225e-05, - "loss": 0.2785, + "epoch": 1.948859336144448, + "grad_norm": 0.17622871696949005, + "learning_rate": 3.4874378455650016e-05, + "loss": 0.3908, "step": 54075 }, { - "epoch": 1.9, - "learning_rate": 3.555591722327434e-05, - "loss": 0.2834, + "epoch": 1.9490395358056727, + "grad_norm": 0.21170586347579956, + "learning_rate": 3.487169752290458e-05, + "loss": 0.4382, "step": 54080 }, { - "epoch": 1.9, - "learning_rate": 3.555333482984479e-05, - "loss": 0.277, + "epoch": 1.9492197354668974, + "grad_norm": 0.2054653912782669, + "learning_rate": 3.48690164556592e-05, + "loss": 0.4048, "step": 54085 }, { - "epoch": 1.9, - "learning_rate": 3.555075229938715e-05, - "loss": 0.3086, + "epoch": 1.949399935128122, + "grad_norm": 0.17421171069145203, + "learning_rate": 3.486633525395037e-05, + "loss": 0.3885, "step": 54090 }, { - "epoch": 1.9, - "learning_rate": 3.5548169631934936e-05, - "loss": 0.3118, + "epoch": 1.9495801347893464, + "grad_norm": 0.21200191974639893, + "learning_rate": 3.4863653917814626e-05, + "loss": 0.439, "step": 54095 }, { - "epoch": 1.9, - "learning_rate": 3.5545586827521695e-05, - "loss": 0.2825, + "epoch": 1.9497603344505712, + "grad_norm": 0.17257794737815857, + "learning_rate": 3.486097244728851e-05, + "loss": 0.3801, "step": 54100 }, { - "epoch": 1.9, - "learning_rate": 3.554300388618096e-05, - "loss": 0.3147, + "epoch": 1.949940534111796, + "grad_norm": 0.1847560554742813, + "learning_rate": 3.485829084240856e-05, + "loss": 0.3753, "step": 54105 }, { - "epoch": 1.9, - "learning_rate": 3.5540420807946264e-05, - "loss": 0.2986, + "epoch": 1.9501207337730206, + "grad_norm": 0.2439274787902832, + "learning_rate": 3.48556091032113e-05, + "loss": 0.4313, "step": 54110 }, { - "epoch": 1.9, - "learning_rate": 3.553783759285115e-05, - "loss": 0.2992, + "epoch": 1.9503009334342452, + "grad_norm": 0.1944105327129364, + "learning_rate": 3.485292722973327e-05, + "loss": 0.4109, "step": 54115 }, { - "epoch": 1.9, - "learning_rate": 3.553525424092916e-05, - "loss": 0.2876, + "epoch": 1.9504811330954697, + "grad_norm": 0.189129039645195, + "learning_rate": 3.4850245222011025e-05, + "loss": 0.3661, "step": 54120 }, { - "epoch": 1.9, - "learning_rate": 3.553267075221385e-05, - "loss": 0.2788, + "epoch": 1.9506613327566944, + "grad_norm": 0.2111881524324417, + "learning_rate": 3.484756308008109e-05, + "loss": 0.4375, "step": 54125 }, { - "epoch": 1.9, - "learning_rate": 3.553008712673873e-05, - "loss": 0.2929, + "epoch": 1.9508415324179191, + "grad_norm": 0.14287078380584717, + "learning_rate": 3.484488080398002e-05, + "loss": 0.399, "step": 54130 }, { - "epoch": 1.9, - "learning_rate": 3.552750336453738e-05, - "loss": 0.3044, + "epoch": 1.9510217320791436, + "grad_norm": 0.1772930771112442, + "learning_rate": 3.4842198393744354e-05, + "loss": 0.4539, "step": 54135 }, { - "epoch": 1.9, - "learning_rate": 3.552491946564334e-05, - "loss": 0.2799, + "epoch": 1.9512019317403684, + "grad_norm": 0.143804669380188, + "learning_rate": 3.483951584941063e-05, + "loss": 0.4164, "step": 54140 }, { - "epoch": 1.9, - "learning_rate": 3.5522335430090153e-05, - "loss": 0.2941, + "epoch": 1.951382131401593, + "grad_norm": 0.2153077870607376, + "learning_rate": 3.483683317101541e-05, + "loss": 0.4466, "step": 54145 }, { - "epoch": 1.91, - "learning_rate": 3.551975125791137e-05, - "loss": 0.2747, + "epoch": 1.9515623310628176, + "grad_norm": 0.2091062068939209, + "learning_rate": 3.483415035859525e-05, + "loss": 0.4344, "step": 54150 }, { - "epoch": 1.91, - 
"learning_rate": 3.551716694914056e-05, - "loss": 0.286, + "epoch": 1.9517425307240424, + "grad_norm": 0.19160053133964539, + "learning_rate": 3.48314674121867e-05, + "loss": 0.3617, "step": 54155 }, { - "epoch": 1.91, - "learning_rate": 3.551458250381126e-05, - "loss": 0.2917, + "epoch": 1.9519227303852669, + "grad_norm": 0.1973278522491455, + "learning_rate": 3.48287843318263e-05, + "loss": 0.3868, "step": 54160 }, { - "epoch": 1.91, - "learning_rate": 3.551199792195704e-05, - "loss": 0.3029, + "epoch": 1.9521029300464914, + "grad_norm": 0.24400056898593903, + "learning_rate": 3.482610111755062e-05, + "loss": 0.4246, "step": 54165 }, { - "epoch": 1.91, - "learning_rate": 3.5509413203611445e-05, - "loss": 0.2845, + "epoch": 1.9522831297077161, + "grad_norm": 0.18288040161132812, + "learning_rate": 3.4823417769396214e-05, + "loss": 0.4284, "step": 54170 }, { - "epoch": 1.91, - "learning_rate": 3.550682834880806e-05, - "loss": 0.2963, + "epoch": 1.9524633293689408, + "grad_norm": 0.18425007164478302, + "learning_rate": 3.482073428739964e-05, + "loss": 0.4324, "step": 54175 }, { - "epoch": 1.91, - "learning_rate": 3.550424335758042e-05, - "loss": 0.2871, + "epoch": 1.9526435290301656, + "grad_norm": 0.1836194396018982, + "learning_rate": 3.481805067159747e-05, + "loss": 0.4368, "step": 54180 }, { - "epoch": 1.91, - "learning_rate": 3.550165822996212e-05, - "loss": 0.2846, + "epoch": 1.95282372869139, + "grad_norm": 0.1682935655117035, + "learning_rate": 3.481536692202625e-05, + "loss": 0.4079, "step": 54185 }, { - "epoch": 1.91, - "learning_rate": 3.5499072965986694e-05, - "loss": 0.2951, + "epoch": 1.9530039283526146, + "grad_norm": 0.18854603171348572, + "learning_rate": 3.4812683038722565e-05, + "loss": 0.4287, "step": 54190 }, { - "epoch": 1.91, - "learning_rate": 3.549648756568773e-05, - "loss": 0.2784, + "epoch": 1.9531841280138393, + "grad_norm": 0.1703491359949112, + "learning_rate": 3.4809999021722964e-05, + "loss": 0.3948, "step": 54195 }, { - "epoch": 1.91, - "learning_rate": 3.549390202909879e-05, - "loss": 0.278, + "epoch": 1.953364327675064, + "grad_norm": 0.18725082278251648, + "learning_rate": 3.480731487106404e-05, + "loss": 0.4472, "step": 54200 }, { - "epoch": 1.91, - "learning_rate": 3.5491316356253446e-05, - "loss": 0.2768, + "epoch": 1.9535445273362886, + "grad_norm": 0.17395414412021637, + "learning_rate": 3.4804630586782336e-05, + "loss": 0.4191, "step": 54205 }, { - "epoch": 1.91, - "learning_rate": 3.5488730547185276e-05, - "loss": 0.2931, + "epoch": 1.953724726997513, + "grad_norm": 0.20272298157215118, + "learning_rate": 3.4801946168914445e-05, + "loss": 0.4106, "step": 54210 }, { - "epoch": 1.91, - "learning_rate": 3.548614460192785e-05, - "loss": 0.2693, + "epoch": 1.9539049266587378, + "grad_norm": 0.2384205460548401, + "learning_rate": 3.4799261617496936e-05, + "loss": 0.3815, "step": 54215 }, { - "epoch": 1.91, - "learning_rate": 3.5483558520514755e-05, - "loss": 0.2674, + "epoch": 1.9540851263199626, + "grad_norm": 0.1440509706735611, + "learning_rate": 3.4796576932566374e-05, + "loss": 0.4261, "step": 54220 }, { - "epoch": 1.91, - "learning_rate": 3.548097230297954e-05, - "loss": 0.2962, + "epoch": 1.9542653259811873, + "grad_norm": 0.20167045295238495, + "learning_rate": 3.4793892114159364e-05, + "loss": 0.4292, "step": 54225 }, { - "epoch": 1.91, - "learning_rate": 3.547838594935583e-05, - "loss": 0.2953, + "epoch": 1.9544455256424118, + "grad_norm": 0.179216668009758, + "learning_rate": 3.4791207162312464e-05, + "loss": 0.399, "step": 54230 }, { - 
"epoch": 1.91, - "learning_rate": 3.547579945967718e-05, - "loss": 0.2914, + "epoch": 1.9546257253036363, + "grad_norm": 0.18981371819972992, + "learning_rate": 3.478852207706226e-05, + "loss": 0.4152, "step": 54235 }, { - "epoch": 1.91, - "learning_rate": 3.547321283397718e-05, - "loss": 0.3063, + "epoch": 1.954805924964861, + "grad_norm": 0.1702110916376114, + "learning_rate": 3.4785836858445334e-05, + "loss": 0.4375, "step": 54240 }, { - "epoch": 1.91, - "learning_rate": 3.54706260722894e-05, - "loss": 0.2954, + "epoch": 1.9549861246260858, + "grad_norm": 0.17524947226047516, + "learning_rate": 3.478315150649829e-05, + "loss": 0.4262, "step": 54245 }, { - "epoch": 1.91, - "learning_rate": 3.5468039174647446e-05, - "loss": 0.298, + "epoch": 1.9551663242873103, + "grad_norm": 0.20712916553020477, + "learning_rate": 3.4780466021257685e-05, + "loss": 0.4056, "step": 54250 }, { - "epoch": 1.91, - "learning_rate": 3.54654521410849e-05, - "loss": 0.2783, + "epoch": 1.955346523948535, + "grad_norm": 0.171823188662529, + "learning_rate": 3.477778040276013e-05, + "loss": 0.3729, "step": 54255 }, { - "epoch": 1.91, - "learning_rate": 3.546286497163537e-05, - "loss": 0.2827, + "epoch": 1.9555267236097595, + "grad_norm": 0.16928203403949738, + "learning_rate": 3.4775094651042204e-05, + "loss": 0.4196, "step": 54260 }, { - "epoch": 1.91, - "learning_rate": 3.546027766633242e-05, - "loss": 0.2954, + "epoch": 1.9557069232709843, + "grad_norm": 0.19515909254550934, + "learning_rate": 3.4772408766140516e-05, + "loss": 0.4314, "step": 54265 }, { - "epoch": 1.91, - "learning_rate": 3.545769022520966e-05, - "loss": 0.2752, + "epoch": 1.955887122932209, + "grad_norm": 0.1939311921596527, + "learning_rate": 3.4769722748091646e-05, + "loss": 0.4058, "step": 54270 }, { - "epoch": 1.91, - "learning_rate": 3.5455102648300686e-05, - "loss": 0.2836, + "epoch": 1.9560673225934335, + "grad_norm": 0.2215987592935562, + "learning_rate": 3.47670365969322e-05, + "loss": 0.3656, "step": 54275 }, { - "epoch": 1.91, - "learning_rate": 3.545251493563909e-05, - "loss": 0.2787, + "epoch": 1.956247522254658, + "grad_norm": 0.18643298745155334, + "learning_rate": 3.476435031269876e-05, + "loss": 0.4374, "step": 54280 }, { - "epoch": 1.91, - "learning_rate": 3.544992708725848e-05, - "loss": 0.2801, + "epoch": 1.9564277219158828, + "grad_norm": 0.20477153360843658, + "learning_rate": 3.4761663895427946e-05, + "loss": 0.419, "step": 54285 }, { - "epoch": 1.91, - "learning_rate": 3.5447339103192454e-05, - "loss": 0.2571, + "epoch": 1.9566079215771075, + "grad_norm": 0.1482023149728775, + "learning_rate": 3.475897734515635e-05, + "loss": 0.4041, "step": 54290 }, { - "epoch": 1.91, - "learning_rate": 3.544475098347462e-05, - "loss": 0.2798, + "epoch": 1.9567881212383322, + "grad_norm": 0.17524316906929016, + "learning_rate": 3.4756290661920575e-05, + "loss": 0.4322, "step": 54295 }, { - "epoch": 1.91, - "learning_rate": 3.544216272813857e-05, - "loss": 0.2982, + "epoch": 1.9569683208995567, + "grad_norm": 0.21270814538002014, + "learning_rate": 3.4753603845757235e-05, + "loss": 0.373, "step": 54300 }, { - "epoch": 1.91, - "learning_rate": 3.543957433721792e-05, - "loss": 0.276, + "epoch": 1.9571485205607813, + "grad_norm": 0.1944366991519928, + "learning_rate": 3.475091689670292e-05, + "loss": 0.4083, "step": 54305 }, { - "epoch": 1.91, - "learning_rate": 3.543698581074628e-05, - "loss": 0.2727, + "epoch": 1.957328720222006, + "grad_norm": 0.2040632963180542, + "learning_rate": 3.4748229814794256e-05, + "loss": 0.386, "step": 54310 }, 
{ - "epoch": 1.91, - "learning_rate": 3.543439714875725e-05, - "loss": 0.2706, + "epoch": 1.9575089198832307, + "grad_norm": 0.19412831962108612, + "learning_rate": 3.474554260006785e-05, + "loss": 0.4259, "step": 54315 }, { - "epoch": 1.91, - "learning_rate": 3.5431808351284455e-05, - "loss": 0.2784, + "epoch": 1.9576891195444552, + "grad_norm": 0.17978821694850922, + "learning_rate": 3.4742855252560315e-05, + "loss": 0.404, "step": 54320 }, { - "epoch": 1.91, - "learning_rate": 3.54292194183615e-05, - "loss": 0.295, + "epoch": 1.9578693192056797, + "grad_norm": 0.19575315713882446, + "learning_rate": 3.474016777230825e-05, + "loss": 0.4097, "step": 54325 }, { - "epoch": 1.91, - "learning_rate": 3.542663035002201e-05, - "loss": 0.2611, + "epoch": 1.9580495188669045, + "grad_norm": 0.16315071284770966, + "learning_rate": 3.473748015934829e-05, + "loss": 0.4187, "step": 54330 }, { - "epoch": 1.91, - "learning_rate": 3.542404114629959e-05, - "loss": 0.2928, + "epoch": 1.9582297185281292, + "grad_norm": 0.20219777524471283, + "learning_rate": 3.473479241371706e-05, + "loss": 0.4193, "step": 54335 }, { - "epoch": 1.91, - "learning_rate": 3.5421451807227866e-05, - "loss": 0.2922, + "epoch": 1.958409918189354, + "grad_norm": 0.17881831526756287, + "learning_rate": 3.473210453545116e-05, + "loss": 0.3867, "step": 54340 }, { - "epoch": 1.91, - "learning_rate": 3.541886233284047e-05, - "loss": 0.277, + "epoch": 1.9585901178505785, + "grad_norm": 0.17221704125404358, + "learning_rate": 3.472941652458722e-05, + "loss": 0.4314, "step": 54345 }, { - "epoch": 1.91, - "learning_rate": 3.5416272723171e-05, - "loss": 0.2538, + "epoch": 1.958770317511803, + "grad_norm": 0.15523254871368408, + "learning_rate": 3.472672838116187e-05, + "loss": 0.3914, "step": 54350 }, { - "epoch": 1.91, - "learning_rate": 3.5413682978253096e-05, - "loss": 0.2852, + "epoch": 1.9589505171730277, + "grad_norm": 0.19016127288341522, + "learning_rate": 3.472404010521172e-05, + "loss": 0.4245, "step": 54355 }, { - "epoch": 1.91, - "learning_rate": 3.541109309812038e-05, - "loss": 0.3012, + "epoch": 1.9591307168342524, + "grad_norm": 0.17222009599208832, + "learning_rate": 3.472135169677341e-05, + "loss": 0.4452, "step": 54360 }, { - "epoch": 1.91, - "learning_rate": 3.540850308280649e-05, - "loss": 0.2755, + "epoch": 1.959310916495477, + "grad_norm": 0.205901101231575, + "learning_rate": 3.471866315588356e-05, + "loss": 0.4402, "step": 54365 }, { - "epoch": 1.91, - "learning_rate": 3.5405912932345034e-05, - "loss": 0.2756, + "epoch": 1.9594911161567015, + "grad_norm": 0.21535156667232513, + "learning_rate": 3.4715974482578814e-05, + "loss": 0.4408, "step": 54370 }, { - "epoch": 1.91, - "learning_rate": 3.540332264676967e-05, - "loss": 0.2885, + "epoch": 1.9596713158179262, + "grad_norm": 0.18163524568080902, + "learning_rate": 3.471328567689579e-05, + "loss": 0.4052, "step": 54375 }, { - "epoch": 1.91, - "learning_rate": 3.540073222611401e-05, - "loss": 0.2794, + "epoch": 1.959851515479151, + "grad_norm": 0.20107874274253845, + "learning_rate": 3.471059673887114e-05, + "loss": 0.3852, "step": 54380 }, { - "epoch": 1.91, - "learning_rate": 3.53981416704117e-05, - "loss": 0.2606, + "epoch": 1.9600317151403757, + "grad_norm": 0.19046540558338165, + "learning_rate": 3.470790766854148e-05, + "loss": 0.4287, "step": 54385 }, { - "epoch": 1.91, - "learning_rate": 3.5395550979696366e-05, - "loss": 0.2811, + "epoch": 1.9602119148016002, + "grad_norm": 0.16740958392620087, + "learning_rate": 3.4705218465943456e-05, + "loss": 0.4202, "step": 
54390 }, { - "epoch": 1.91, - "learning_rate": 3.539296015400166e-05, - "loss": 0.2918, + "epoch": 1.9603921144628247, + "grad_norm": 0.21119292080402374, + "learning_rate": 3.4702529131113715e-05, + "loss": 0.4048, "step": 54395 }, { - "epoch": 1.91, - "learning_rate": 3.5390369193361204e-05, - "loss": 0.2801, + "epoch": 1.9605723141240494, + "grad_norm": 0.20554935932159424, + "learning_rate": 3.469983966408889e-05, + "loss": 0.4192, "step": 54400 }, { - "epoch": 1.91, - "learning_rate": 3.538829632771053e-05, - "loss": 0.2677, + "epoch": 1.9607525137852742, + "grad_norm": 0.2031758427619934, + "learning_rate": 3.469715006490563e-05, + "loss": 0.4199, "step": 54405 }, { - "epoch": 1.91, - "learning_rate": 3.5385705124252526e-05, - "loss": 0.2817, + "epoch": 1.9609327134464987, + "grad_norm": 0.18050605058670044, + "learning_rate": 3.4694460333600574e-05, + "loss": 0.3987, "step": 54410 }, { - "epoch": 1.91, - "learning_rate": 3.5383113785943e-05, - "loss": 0.2995, + "epoch": 1.9611129131077234, + "grad_norm": 0.19206275045871735, + "learning_rate": 3.4691770470210374e-05, + "loss": 0.4103, "step": 54415 }, { - "epoch": 1.91, - "learning_rate": 3.5380522312815574e-05, - "loss": 0.264, + "epoch": 1.961293112768948, + "grad_norm": 0.16883911192417145, + "learning_rate": 3.4689080474771676e-05, + "loss": 0.4011, "step": 54420 }, { - "epoch": 1.91, - "learning_rate": 3.53779307049039e-05, - "loss": 0.2944, + "epoch": 1.9614733124301726, + "grad_norm": 0.1800912320613861, + "learning_rate": 3.4686390347321144e-05, + "loss": 0.3789, "step": 54425 }, { - "epoch": 1.91, - "learning_rate": 3.537533896224165e-05, - "loss": 0.2951, + "epoch": 1.9616535120913974, + "grad_norm": 0.1670406460762024, + "learning_rate": 3.46837000878954e-05, + "loss": 0.4023, "step": 54430 }, { - "epoch": 1.92, - "learning_rate": 3.537274708486246e-05, - "loss": 0.2744, + "epoch": 1.961833711752622, + "grad_norm": 0.14875911176204681, + "learning_rate": 3.468100969653114e-05, + "loss": 0.3879, "step": 54435 }, { - "epoch": 1.92, - "learning_rate": 3.5370155072799974e-05, - "loss": 0.3025, + "epoch": 1.9620139114138464, + "grad_norm": 0.1597927361726761, + "learning_rate": 3.4678319173264975e-05, + "loss": 0.4384, "step": 54440 }, { - "epoch": 1.92, - "learning_rate": 3.536756292608787e-05, - "loss": 0.3011, + "epoch": 1.9621941110750711, + "grad_norm": 0.15800374746322632, + "learning_rate": 3.46756285181336e-05, + "loss": 0.378, "step": 54445 }, { - "epoch": 1.92, - "learning_rate": 3.536497064475979e-05, - "loss": 0.2933, + "epoch": 1.9623743107362959, + "grad_norm": 0.1771484762430191, + "learning_rate": 3.467293773117365e-05, + "loss": 0.4328, "step": 54450 }, { - "epoch": 1.92, - "learning_rate": 3.536237822884939e-05, - "loss": 0.2763, + "epoch": 1.9625545103975206, + "grad_norm": 0.16408397257328033, + "learning_rate": 3.46702468124218e-05, + "loss": 0.3782, "step": 54455 }, { - "epoch": 1.92, - "learning_rate": 3.5359785678390346e-05, - "loss": 0.3083, + "epoch": 1.9627347100587451, + "grad_norm": 0.17288720607757568, + "learning_rate": 3.466755576191471e-05, + "loss": 0.4191, "step": 54460 }, { - "epoch": 1.92, - "learning_rate": 3.535719299341631e-05, - "loss": 0.289, + "epoch": 1.9629149097199696, + "grad_norm": 0.1751626580953598, + "learning_rate": 3.4664864579689036e-05, + "loss": 0.4097, "step": 54465 }, { - "epoch": 1.92, - "learning_rate": 3.535460017396095e-05, - "loss": 0.2785, + "epoch": 1.9630951093811944, + "grad_norm": 0.19084684550762177, + "learning_rate": 3.4662173265781464e-05, + "loss": 
0.4035, "step": 54470 }, { - "epoch": 1.92, - "learning_rate": 3.535200722005793e-05, - "loss": 0.2717, + "epoch": 1.963275309042419, + "grad_norm": 0.1722564846277237, + "learning_rate": 3.4659481820228654e-05, + "loss": 0.4273, "step": 54475 }, { - "epoch": 1.92, - "learning_rate": 3.5349414131740914e-05, - "loss": 0.2753, + "epoch": 1.9634555087036436, + "grad_norm": 0.17883461713790894, + "learning_rate": 3.4656790243067274e-05, + "loss": 0.397, "step": 54480 }, { - "epoch": 1.92, - "learning_rate": 3.5346820909043585e-05, - "loss": 0.2864, + "epoch": 1.9636357083648681, + "grad_norm": 0.1756594032049179, + "learning_rate": 3.465409853433399e-05, + "loss": 0.39, "step": 54485 }, { - "epoch": 1.92, - "learning_rate": 3.53442275519996e-05, - "loss": 0.289, + "epoch": 1.9638159080260928, + "grad_norm": 0.1727340817451477, + "learning_rate": 3.465140669406548e-05, + "loss": 0.4083, "step": 54490 }, { - "epoch": 1.92, - "learning_rate": 3.534163406064264e-05, - "loss": 0.3057, + "epoch": 1.9639961076873176, + "grad_norm": 0.16905559599399567, + "learning_rate": 3.464871472229843e-05, + "loss": 0.4025, "step": 54495 }, { - "epoch": 1.92, - "learning_rate": 3.533904043500637e-05, - "loss": 0.293, + "epoch": 1.9641763073485423, + "grad_norm": 0.1733274906873703, + "learning_rate": 3.464602261906951e-05, + "loss": 0.4237, "step": 54500 }, { - "epoch": 1.92, - "eval_loss": 0.27375850081443787, - "eval_runtime": 10.5429, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 1.9641763073485423, + "eval_loss": 0.4376771152019501, + "eval_runtime": 3.5433, + "eval_samples_per_second": 28.223, + "eval_steps_per_second": 7.056, "step": 54500 }, { - "epoch": 1.92, - "learning_rate": 3.533644667512448e-05, - "loss": 0.3125, + "epoch": 1.9643565070097668, + "grad_norm": 0.19322708249092102, + "learning_rate": 3.4643330384415396e-05, + "loss": 0.4047, "step": 54505 }, { - "epoch": 1.92, - "learning_rate": 3.5333852781030635e-05, - "loss": 0.3063, + "epoch": 1.9645367066709913, + "grad_norm": 0.18007676303386688, + "learning_rate": 3.464063801837277e-05, + "loss": 0.4018, "step": 54510 }, { - "epoch": 1.92, - "learning_rate": 3.5331258752758536e-05, - "loss": 0.2904, + "epoch": 1.964716906332216, + "grad_norm": 0.17371296882629395, + "learning_rate": 3.463794552097832e-05, + "loss": 0.3826, "step": 54515 }, { - "epoch": 1.92, - "learning_rate": 3.5328664590341836e-05, - "loss": 0.3004, + "epoch": 1.9648971059934408, + "grad_norm": 0.1630249321460724, + "learning_rate": 3.463525289226873e-05, + "loss": 0.3994, "step": 54520 }, { - "epoch": 1.92, - "learning_rate": 3.532607029381424e-05, - "loss": 0.2887, + "epoch": 1.9650773056546653, + "grad_norm": 0.15053841471672058, + "learning_rate": 3.463256013228068e-05, + "loss": 0.385, "step": 54525 }, { - "epoch": 1.92, - "learning_rate": 3.532347586320942e-05, - "loss": 0.2682, + "epoch": 1.96525750531589, + "grad_norm": 0.19676600396633148, + "learning_rate": 3.462986724105087e-05, + "loss": 0.3725, "step": 54530 }, { - "epoch": 1.92, - "learning_rate": 3.532088129856107e-05, - "loss": 0.2665, + "epoch": 1.9654377049771146, + "grad_norm": 0.1895761638879776, + "learning_rate": 3.462717421861597e-05, + "loss": 0.4309, "step": 54535 }, { - "epoch": 1.92, - "learning_rate": 3.531828659990289e-05, - "loss": 0.3099, + "epoch": 1.9656179046383393, + "grad_norm": 0.19437263906002045, + "learning_rate": 3.46244810650127e-05, + "loss": 0.3823, "step": 54540 }, { - "epoch": 1.92, - "learning_rate": 3.531569176726855e-05, - "loss": 0.2926, + 
"epoch": 1.965798104299564, + "grad_norm": 0.19000405073165894, + "learning_rate": 3.462178778027774e-05, + "loss": 0.4287, "step": 54545 }, { - "epoch": 1.92, - "learning_rate": 3.5313096800691756e-05, - "loss": 0.2811, + "epoch": 1.9659783039607885, + "grad_norm": 0.22047989070415497, + "learning_rate": 3.4619094364447776e-05, + "loss": 0.4186, "step": 54550 }, { - "epoch": 1.92, - "learning_rate": 3.5310501700206186e-05, - "loss": 0.2583, + "epoch": 1.966158503622013, + "grad_norm": 0.22895020246505737, + "learning_rate": 3.461640081755951e-05, + "loss": 0.4308, "step": 54555 }, { - "epoch": 1.92, - "learning_rate": 3.5307906465845554e-05, - "loss": 0.314, + "epoch": 1.9663387032832378, + "grad_norm": 0.1648799479007721, + "learning_rate": 3.4613707139649654e-05, + "loss": 0.4343, "step": 54560 }, { - "epoch": 1.92, - "learning_rate": 3.5305311097643545e-05, - "loss": 0.3002, + "epoch": 1.9665189029444625, + "grad_norm": 0.22859375178813934, + "learning_rate": 3.46110133307549e-05, + "loss": 0.4118, "step": 54565 }, { - "epoch": 1.92, - "learning_rate": 3.530271559563387e-05, - "loss": 0.32, + "epoch": 1.9666991026056873, + "grad_norm": 0.14714932441711426, + "learning_rate": 3.4608319390911937e-05, + "loss": 0.4129, "step": 54570 }, { - "epoch": 1.92, - "learning_rate": 3.530011995985022e-05, - "loss": 0.2996, + "epoch": 1.9668793022669118, + "grad_norm": 0.16554409265518188, + "learning_rate": 3.4605625320157496e-05, + "loss": 0.4315, "step": 54575 }, { - "epoch": 1.92, - "learning_rate": 3.52975241903263e-05, - "loss": 0.2864, + "epoch": 1.9670595019281363, + "grad_norm": 0.1864306479692459, + "learning_rate": 3.460293111852827e-05, + "loss": 0.4123, "step": 54580 }, { - "epoch": 1.92, - "learning_rate": 3.529492828709582e-05, - "loss": 0.2945, + "epoch": 1.967239701589361, + "grad_norm": 0.22947263717651367, + "learning_rate": 3.460023678606096e-05, + "loss": 0.3806, "step": 54585 }, { - "epoch": 1.92, - "learning_rate": 3.5292332250192474e-05, - "loss": 0.2948, + "epoch": 1.9674199012505857, + "grad_norm": 0.19999592006206512, + "learning_rate": 3.459754232279228e-05, + "loss": 0.3781, "step": 54590 }, { - "epoch": 1.92, - "learning_rate": 3.528973607964997e-05, - "loss": 0.3156, + "epoch": 1.9676001009118103, + "grad_norm": 0.15868158638477325, + "learning_rate": 3.459484772875895e-05, + "loss": 0.4076, "step": 54595 }, { - "epoch": 1.92, - "learning_rate": 3.5287139775502034e-05, - "loss": 0.2755, + "epoch": 1.9677803005730348, + "grad_norm": 0.18088065087795258, + "learning_rate": 3.459215300399768e-05, + "loss": 0.3592, "step": 54600 }, { - "epoch": 1.92, - "learning_rate": 3.528454333778237e-05, - "loss": 0.2495, + "epoch": 1.9679605002342595, + "grad_norm": 0.19539353251457214, + "learning_rate": 3.4589458148545174e-05, + "loss": 0.4784, "step": 54605 }, { - "epoch": 1.92, - "learning_rate": 3.528194676652468e-05, - "loss": 0.2742, + "epoch": 1.9681406998954842, + "grad_norm": 0.1806805580854416, + "learning_rate": 3.458676316243816e-05, + "loss": 0.3996, "step": 54610 }, { - "epoch": 1.92, - "learning_rate": 3.527935006176269e-05, - "loss": 0.2778, + "epoch": 1.968320899556709, + "grad_norm": 0.22320657968521118, + "learning_rate": 3.458406804571335e-05, + "loss": 0.3974, "step": 54615 }, { - "epoch": 1.92, - "learning_rate": 3.527675322353011e-05, - "loss": 0.2977, + "epoch": 1.9685010992179335, + "grad_norm": 0.21854592859745026, + "learning_rate": 3.458137279840749e-05, + "loss": 0.3921, "step": 54620 }, { - "epoch": 1.92, - "learning_rate": 3.527415625186067e-05, - 
"loss": 0.2769, + "epoch": 1.968681298879158, + "grad_norm": 0.20072756707668304, + "learning_rate": 3.457867742055725e-05, + "loss": 0.3974, "step": 54625 }, { - "epoch": 1.92, - "learning_rate": 3.527155914678808e-05, - "loss": 0.3208, + "epoch": 1.9688614985403827, + "grad_norm": 0.19224333763122559, + "learning_rate": 3.4575981912199404e-05, + "loss": 0.436, "step": 54630 }, { - "epoch": 1.92, - "learning_rate": 3.526896190834607e-05, - "loss": 0.2947, + "epoch": 1.9690416982016075, + "grad_norm": 0.1824558526277542, + "learning_rate": 3.457328627337065e-05, + "loss": 0.4161, "step": 54635 }, { - "epoch": 1.92, - "learning_rate": 3.526636453656834e-05, - "loss": 0.283, + "epoch": 1.969221897862832, + "grad_norm": 0.16836610436439514, + "learning_rate": 3.457059050410773e-05, + "loss": 0.4122, "step": 54640 }, { - "epoch": 1.92, - "learning_rate": 3.526376703148864e-05, - "loss": 0.2712, + "epoch": 1.9694020975240567, + "grad_norm": 0.17469918727874756, + "learning_rate": 3.456789460444737e-05, + "loss": 0.4447, "step": 54645 }, { - "epoch": 1.92, - "learning_rate": 3.526116939314069e-05, - "loss": 0.2911, + "epoch": 1.9695822971852812, + "grad_norm": 0.18369092047214508, + "learning_rate": 3.456519857442629e-05, + "loss": 0.4229, "step": 54650 }, { - "epoch": 1.92, - "learning_rate": 3.525857162155822e-05, - "loss": 0.2716, + "epoch": 1.969762496846506, + "grad_norm": 0.20481358468532562, + "learning_rate": 3.456250241408123e-05, + "loss": 0.3953, "step": 54655 }, { - "epoch": 1.92, - "learning_rate": 3.525597371677496e-05, - "loss": 0.274, + "epoch": 1.9699426965077307, + "grad_norm": 0.277506023645401, + "learning_rate": 3.455980612344894e-05, + "loss": 0.4109, "step": 54660 }, { - "epoch": 1.92, - "learning_rate": 3.525337567882464e-05, - "loss": 0.2763, + "epoch": 1.9701228961689552, + "grad_norm": 0.18882182240486145, + "learning_rate": 3.455710970256613e-05, + "loss": 0.3827, "step": 54665 }, { - "epoch": 1.92, - "learning_rate": 3.5250777507740986e-05, - "loss": 0.3088, + "epoch": 1.9703030958301797, + "grad_norm": 0.19233731925487518, + "learning_rate": 3.4554413151469553e-05, + "loss": 0.4228, "step": 54670 }, { - "epoch": 1.92, - "learning_rate": 3.524817920355775e-05, - "loss": 0.257, + "epoch": 1.9704832954914044, + "grad_norm": 0.19764070212841034, + "learning_rate": 3.455171647019595e-05, + "loss": 0.401, "step": 54675 }, { - "epoch": 1.92, - "learning_rate": 3.524558076630865e-05, - "loss": 0.2821, + "epoch": 1.9706634951526292, + "grad_norm": 0.17623180150985718, + "learning_rate": 3.454901965878205e-05, + "loss": 0.4398, "step": 54680 }, { - "epoch": 1.92, - "learning_rate": 3.524298219602745e-05, - "loss": 0.2752, + "epoch": 1.970843694813854, + "grad_norm": 0.16016143560409546, + "learning_rate": 3.454632271726461e-05, + "loss": 0.4032, "step": 54685 }, { - "epoch": 1.92, - "learning_rate": 3.5240383492747874e-05, - "loss": 0.293, + "epoch": 1.9710238944750784, + "grad_norm": 0.1639336496591568, + "learning_rate": 3.4543625645680375e-05, + "loss": 0.4031, "step": 54690 }, { - "epoch": 1.92, - "learning_rate": 3.523778465650366e-05, - "loss": 0.2954, + "epoch": 1.971204094136303, + "grad_norm": 0.1621045023202896, + "learning_rate": 3.454092844406609e-05, + "loss": 0.3798, "step": 54695 }, { - "epoch": 1.92, - "learning_rate": 3.523518568732856e-05, - "loss": 0.2765, + "epoch": 1.9713842937975277, + "grad_norm": 0.19391906261444092, + "learning_rate": 3.4538231112458497e-05, + "loss": 0.4085, "step": 54700 }, { - "epoch": 1.92, - "learning_rate": 
3.5232586585256324e-05, - "loss": 0.2841, + "epoch": 1.9715644934587524, + "grad_norm": 0.2154162973165512, + "learning_rate": 3.4535533650894356e-05, + "loss": 0.4131, "step": 54705 }, { - "epoch": 1.92, - "learning_rate": 3.5229987350320695e-05, - "loss": 0.2696, + "epoch": 1.971744693119977, + "grad_norm": 0.16932529211044312, + "learning_rate": 3.453283605941041e-05, + "loss": 0.3887, "step": 54710 }, { - "epoch": 1.93, - "learning_rate": 3.522738798255542e-05, - "loss": 0.291, + "epoch": 1.9719248927812014, + "grad_norm": 0.18449749052524567, + "learning_rate": 3.453013833804342e-05, + "loss": 0.4192, "step": 54715 }, { - "epoch": 1.93, - "learning_rate": 3.522478848199425e-05, - "loss": 0.2888, + "epoch": 1.9721050924424262, + "grad_norm": 0.1401815116405487, + "learning_rate": 3.452744048683014e-05, + "loss": 0.4199, "step": 54720 }, { - "epoch": 1.93, - "learning_rate": 3.5222188848670936e-05, - "loss": 0.2965, + "epoch": 1.9722852921036509, + "grad_norm": 0.17515966296195984, + "learning_rate": 3.452474250580734e-05, + "loss": 0.3881, "step": 54725 }, { - "epoch": 1.93, - "learning_rate": 3.521958908261924e-05, - "loss": 0.2867, + "epoch": 1.9724654917648756, + "grad_norm": 0.19562271237373352, + "learning_rate": 3.452204439501175e-05, + "loss": 0.4005, "step": 54730 }, { - "epoch": 1.93, - "learning_rate": 3.5216989183872925e-05, - "loss": 0.2465, + "epoch": 1.9726456914261001, + "grad_norm": 0.16657692193984985, + "learning_rate": 3.451934615448016e-05, + "loss": 0.3684, "step": 54735 }, { - "epoch": 1.93, - "learning_rate": 3.521438915246572e-05, - "loss": 0.2976, + "epoch": 1.9728258910873246, + "grad_norm": 0.2561399042606354, + "learning_rate": 3.451664778424931e-05, + "loss": 0.4216, "step": 54740 }, { - "epoch": 1.93, - "learning_rate": 3.5211788988431424e-05, - "loss": 0.2708, + "epoch": 1.9730060907485494, + "grad_norm": 0.18860171735286713, + "learning_rate": 3.451394928435598e-05, + "loss": 0.4218, "step": 54745 }, { - "epoch": 1.93, - "learning_rate": 3.520918869180376e-05, - "loss": 0.2836, + "epoch": 1.9731862904097741, + "grad_norm": 0.17932887375354767, + "learning_rate": 3.451125065483695e-05, + "loss": 0.3872, "step": 54750 }, { - "epoch": 1.93, - "learning_rate": 3.520658826261651e-05, - "loss": 0.2953, + "epoch": 1.9733664900709986, + "grad_norm": 0.19358305633068085, + "learning_rate": 3.450855189572895e-05, + "loss": 0.4098, "step": 54755 }, { - "epoch": 1.93, - "learning_rate": 3.520398770090345e-05, - "loss": 0.2719, + "epoch": 1.9735466897322234, + "grad_norm": 0.1724853366613388, + "learning_rate": 3.450585300706878e-05, + "loss": 0.4266, "step": 54760 }, { - "epoch": 1.93, - "learning_rate": 3.5201387006698326e-05, - "loss": 0.3359, + "epoch": 1.9737268893934479, + "grad_norm": 0.17911700904369354, + "learning_rate": 3.45031539888932e-05, + "loss": 0.417, "step": 54765 }, { - "epoch": 1.93, - "learning_rate": 3.519878618003493e-05, - "loss": 0.2837, + "epoch": 1.9739070890546726, + "grad_norm": 0.16080361604690552, + "learning_rate": 3.450045484123899e-05, + "loss": 0.4101, "step": 54770 }, { - "epoch": 1.93, - "learning_rate": 3.5196185220947e-05, - "loss": 0.2681, + "epoch": 1.9740872887158973, + "grad_norm": 0.19026781618595123, + "learning_rate": 3.449775556414292e-05, + "loss": 0.4418, "step": 54775 }, { - "epoch": 1.93, - "learning_rate": 3.5193584129468326e-05, - "loss": 0.2896, + "epoch": 1.9742674883771218, + "grad_norm": 0.16302798688411713, + "learning_rate": 3.449505615764177e-05, + "loss": 0.4291, "step": 54780 }, { - "epoch": 1.93, - 
"learning_rate": 3.519098290563267e-05, - "loss": 0.3084, + "epoch": 1.9744476880383464, + "grad_norm": 0.13728991150856018, + "learning_rate": 3.449235662177233e-05, + "loss": 0.4054, "step": 54785 }, { - "epoch": 1.93, - "learning_rate": 3.518838154947384e-05, - "loss": 0.2752, + "epoch": 1.974627887699571, + "grad_norm": 0.16925320029258728, + "learning_rate": 3.4489656956571345e-05, + "loss": 0.3945, "step": 54790 }, { - "epoch": 1.93, - "learning_rate": 3.518578006102558e-05, - "loss": 0.2897, + "epoch": 1.9748080873607958, + "grad_norm": 0.20769056677818298, + "learning_rate": 3.448695716207564e-05, + "loss": 0.416, "step": 54795 }, { - "epoch": 1.93, - "learning_rate": 3.518317844032168e-05, - "loss": 0.2674, + "epoch": 1.9749882870220206, + "grad_norm": 0.20202518999576569, + "learning_rate": 3.448425723832197e-05, + "loss": 0.3943, "step": 54800 }, { - "epoch": 1.93, - "learning_rate": 3.5180576687395915e-05, - "loss": 0.2786, + "epoch": 1.975168486683245, + "grad_norm": 0.19320270419120789, + "learning_rate": 3.448155718534714e-05, + "loss": 0.4378, "step": 54805 }, { - "epoch": 1.93, - "learning_rate": 3.517797480228207e-05, - "loss": 0.2788, + "epoch": 1.9753486863444696, + "grad_norm": 0.15490759909152985, + "learning_rate": 3.447885700318792e-05, + "loss": 0.3863, "step": 54810 }, { - "epoch": 1.93, - "learning_rate": 3.517537278501393e-05, - "loss": 0.2804, + "epoch": 1.9755288860056943, + "grad_norm": 0.1758003681898117, + "learning_rate": 3.447615669188111e-05, + "loss": 0.4029, "step": 54815 }, { - "epoch": 1.93, - "learning_rate": 3.517277063562529e-05, - "loss": 0.2814, + "epoch": 1.975709085666919, + "grad_norm": 0.1570878028869629, + "learning_rate": 3.44734562514635e-05, + "loss": 0.386, "step": 54820 }, { - "epoch": 1.93, - "learning_rate": 3.517016835414992e-05, - "loss": 0.2579, + "epoch": 1.9758892853281436, + "grad_norm": 0.15345117449760437, + "learning_rate": 3.4470755681971886e-05, + "loss": 0.401, "step": 54825 }, { - "epoch": 1.93, - "learning_rate": 3.5167565940621614e-05, - "loss": 0.2763, + "epoch": 1.976069484989368, + "grad_norm": 0.201069176197052, + "learning_rate": 3.446805498344307e-05, + "loss": 0.4219, "step": 54830 }, { - "epoch": 1.93, - "learning_rate": 3.516496339507416e-05, - "loss": 0.2942, + "epoch": 1.9762496846505928, + "grad_norm": 0.16482049226760864, + "learning_rate": 3.446535415591382e-05, + "loss": 0.4091, "step": 54835 }, { - "epoch": 1.93, - "learning_rate": 3.5162360717541374e-05, - "loss": 0.2744, + "epoch": 1.9764298843118175, + "grad_norm": 0.19689258933067322, + "learning_rate": 3.446265319942096e-05, + "loss": 0.3863, "step": 54840 }, { - "epoch": 1.93, - "learning_rate": 3.515975790805701e-05, - "loss": 0.28, + "epoch": 1.9766100839730423, + "grad_norm": 0.14975933730602264, + "learning_rate": 3.4459952114001284e-05, + "loss": 0.426, "step": 54845 }, { - "epoch": 1.93, - "learning_rate": 3.51571549666549e-05, - "loss": 0.2785, + "epoch": 1.9767902836342668, + "grad_norm": 0.17771407961845398, + "learning_rate": 3.4457250899691586e-05, + "loss": 0.4053, "step": 54850 }, { - "epoch": 1.93, - "learning_rate": 3.5154551893368824e-05, - "loss": 0.2837, + "epoch": 1.9769704832954913, + "grad_norm": 0.16967308521270752, + "learning_rate": 3.4454549556528674e-05, + "loss": 0.3714, "step": 54855 }, { - "epoch": 1.93, - "learning_rate": 3.5151948688232575e-05, - "loss": 0.2921, + "epoch": 1.977150682956716, + "grad_norm": 0.19800063967704773, + "learning_rate": 3.445184808454936e-05, + "loss": 0.4534, "step": 54860 }, { - "epoch": 
1.93, - "learning_rate": 3.5149345351279975e-05, - "loss": 0.2606, + "epoch": 1.9773308826179408, + "grad_norm": 0.19260716438293457, + "learning_rate": 3.444914648379045e-05, + "loss": 0.4158, "step": 54865 }, { - "epoch": 1.93, - "learning_rate": 3.51467418825448e-05, - "loss": 0.2747, + "epoch": 1.9775110822791653, + "grad_norm": 0.18546351790428162, + "learning_rate": 3.4446444754288745e-05, + "loss": 0.3974, "step": 54870 }, { - "epoch": 1.93, - "learning_rate": 3.5144138282060886e-05, - "loss": 0.2833, + "epoch": 1.9776912819403898, + "grad_norm": 0.17617270350456238, + "learning_rate": 3.444374289608105e-05, + "loss": 0.3895, "step": 54875 }, { - "epoch": 1.93, - "learning_rate": 3.514153454986201e-05, - "loss": 0.302, + "epoch": 1.9778714816016145, + "grad_norm": 0.20239382982254028, + "learning_rate": 3.444104090920419e-05, + "loss": 0.417, "step": 54880 }, { - "epoch": 1.93, - "learning_rate": 3.5138930685982e-05, - "loss": 0.2866, + "epoch": 1.9780516812628393, + "grad_norm": 0.15842540562152863, + "learning_rate": 3.443833879369499e-05, + "loss": 0.4095, "step": 54885 }, { - "epoch": 1.93, - "learning_rate": 3.513632669045464e-05, - "loss": 0.2823, + "epoch": 1.978231880924064, + "grad_norm": 0.1709548830986023, + "learning_rate": 3.443563654959024e-05, + "loss": 0.4075, "step": 54890 }, { - "epoch": 1.93, - "learning_rate": 3.513372256331378e-05, - "loss": 0.2745, + "epoch": 1.9784120805852885, + "grad_norm": 0.20768848061561584, + "learning_rate": 3.4432934176926765e-05, + "loss": 0.427, "step": 54895 }, { - "epoch": 1.93, - "learning_rate": 3.513111830459319e-05, - "loss": 0.2888, + "epoch": 1.978592280246513, + "grad_norm": 0.16096894443035126, + "learning_rate": 3.443023167574139e-05, + "loss": 0.4047, "step": 54900 }, { - "epoch": 1.93, - "learning_rate": 3.512851391432672e-05, - "loss": 0.2857, + "epoch": 1.9787724799077377, + "grad_norm": 0.17708556354045868, + "learning_rate": 3.4427529046070936e-05, + "loss": 0.4069, "step": 54905 }, { - "epoch": 1.93, - "learning_rate": 3.512590939254816e-05, - "loss": 0.2964, + "epoch": 1.9789526795689625, + "grad_norm": 0.1786879301071167, + "learning_rate": 3.442482628795223e-05, + "loss": 0.3877, "step": 54910 }, { - "epoch": 1.93, - "learning_rate": 3.512330473929135e-05, - "loss": 0.2619, + "epoch": 1.979132879230187, + "grad_norm": 0.2042560875415802, + "learning_rate": 3.442212340142209e-05, + "loss": 0.4307, "step": 54915 }, { - "epoch": 1.93, - "learning_rate": 3.51206999545901e-05, - "loss": 0.2906, + "epoch": 1.9793130788914117, + "grad_norm": 0.16689689457416534, + "learning_rate": 3.441942038651733e-05, + "loss": 0.4006, "step": 54920 }, { - "epoch": 1.93, - "learning_rate": 3.5118095038478225e-05, - "loss": 0.2607, + "epoch": 1.9794932785526362, + "grad_norm": 0.16733959317207336, + "learning_rate": 3.44167172432748e-05, + "loss": 0.4051, "step": 54925 }, { - "epoch": 1.93, - "learning_rate": 3.511548999098956e-05, - "loss": 0.2805, + "epoch": 1.979673478213861, + "grad_norm": 0.18891684710979462, + "learning_rate": 3.4414013971731323e-05, + "loss": 0.4374, "step": 54930 }, { - "epoch": 1.93, - "learning_rate": 3.511288481215792e-05, - "loss": 0.263, + "epoch": 1.9798536778750857, + "grad_norm": 0.18657353520393372, + "learning_rate": 3.441131057192373e-05, + "loss": 0.4465, "step": 54935 }, { - "epoch": 1.93, - "learning_rate": 3.5110279502017126e-05, - "loss": 0.2726, + "epoch": 1.9800338775363102, + "grad_norm": 0.1951577365398407, + "learning_rate": 3.440860704388884e-05, + "loss": 0.3765, "step": 54940 }, { - 
"epoch": 1.93, - "learning_rate": 3.510767406060103e-05, - "loss": 0.285, + "epoch": 1.9802140771975347, + "grad_norm": 0.21163040399551392, + "learning_rate": 3.440590338766351e-05, + "loss": 0.4099, "step": 54945 }, { - "epoch": 1.93, - "learning_rate": 3.5105068487943436e-05, - "loss": 0.2792, + "epoch": 1.9803942768587595, + "grad_norm": 0.19105908274650574, + "learning_rate": 3.4403199603284567e-05, + "loss": 0.4014, "step": 54950 }, { - "epoch": 1.93, - "learning_rate": 3.510246278407821e-05, - "loss": 0.2855, + "epoch": 1.9805744765199842, + "grad_norm": 0.18041996657848358, + "learning_rate": 3.440049569078885e-05, + "loss": 0.4271, "step": 54955 }, { - "epoch": 1.93, - "learning_rate": 3.509985694903913e-05, - "loss": 0.287, + "epoch": 1.980754676181209, + "grad_norm": 0.19663773477077484, + "learning_rate": 3.4397791650213193e-05, + "loss": 0.4073, "step": 54960 }, { - "epoch": 1.93, - "learning_rate": 3.509725098286009e-05, - "loss": 0.2961, + "epoch": 1.9809348758424334, + "grad_norm": 0.17666995525360107, + "learning_rate": 3.439508748159445e-05, + "loss": 0.3892, "step": 54965 }, { - "epoch": 1.93, - "learning_rate": 3.5094644885574886e-05, - "loss": 0.2824, + "epoch": 1.981115075503658, + "grad_norm": 0.14386975765228271, + "learning_rate": 3.4392383184969464e-05, + "loss": 0.3762, "step": 54970 }, { - "epoch": 1.93, - "learning_rate": 3.5092038657217376e-05, - "loss": 0.2817, + "epoch": 1.9812952751648827, + "grad_norm": 0.19409483671188354, + "learning_rate": 3.438967876037507e-05, + "loss": 0.375, "step": 54975 }, { - "epoch": 1.93, - "learning_rate": 3.508943229782139e-05, - "loss": 0.2871, + "epoch": 1.9814754748261074, + "grad_norm": 0.22416892647743225, + "learning_rate": 3.438697420784812e-05, + "loss": 0.4125, "step": 54980 }, { - "epoch": 1.93, - "learning_rate": 3.508682580742079e-05, - "loss": 0.2864, + "epoch": 1.981655674487332, + "grad_norm": 0.20359428226947784, + "learning_rate": 3.438426952742546e-05, + "loss": 0.4154, "step": 54985 }, { - "epoch": 1.93, - "learning_rate": 3.508421918604938e-05, - "loss": 0.2935, + "epoch": 1.9818358741485564, + "grad_norm": 0.20030748844146729, + "learning_rate": 3.4381564719143947e-05, + "loss": 0.3902, "step": 54990 }, { - "epoch": 1.93, - "learning_rate": 3.508161243374105e-05, - "loss": 0.2825, + "epoch": 1.9820160738097812, + "grad_norm": 0.15663334727287292, + "learning_rate": 3.437885978304043e-05, + "loss": 0.4205, "step": 54995 }, { - "epoch": 1.94, - "learning_rate": 3.5079005550529614e-05, - "loss": 0.2652, + "epoch": 1.982196273471006, + "grad_norm": 0.17842374742031097, + "learning_rate": 3.437615471915177e-05, + "loss": 0.4141, "step": 55000 }, { - "epoch": 1.94, - "eval_loss": 0.2729741930961609, - "eval_runtime": 10.5572, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 1.982196273471006, + "eval_loss": 0.43720561265945435, + "eval_runtime": 3.5352, + "eval_samples_per_second": 28.287, + "eval_steps_per_second": 7.072, "step": 55000 }, { - "epoch": 1.94, - "learning_rate": 3.507639853644893e-05, - "loss": 0.2845, + "epoch": 1.9823764731322306, + "grad_norm": 0.155527725815773, + "learning_rate": 3.437344952751481e-05, + "loss": 0.4354, "step": 55005 }, { - "epoch": 1.94, - "learning_rate": 3.507379139153287e-05, - "loss": 0.2745, + "epoch": 1.9825566727934552, + "grad_norm": 0.16779151558876038, + "learning_rate": 3.4370744208166424e-05, + "loss": 0.4442, "step": 55010 }, { - "epoch": 1.94, - "learning_rate": 3.5071184115815255e-05, - "loss": 0.2683, + "epoch": 
1.9827368724546797, + "grad_norm": 0.25572511553764343, + "learning_rate": 3.436803876114346e-05, + "loss": 0.4068, "step": 55015 }, { - "epoch": 1.94, - "learning_rate": 3.506857670932996e-05, - "loss": 0.3008, + "epoch": 1.9829170721159044, + "grad_norm": 0.1537223905324936, + "learning_rate": 3.436533318648277e-05, + "loss": 0.4256, "step": 55020 }, { - "epoch": 1.94, - "learning_rate": 3.506596917211082e-05, - "loss": 0.2836, + "epoch": 1.9830972717771291, + "grad_norm": 0.17157311737537384, + "learning_rate": 3.436262748422124e-05, + "loss": 0.3851, "step": 55025 }, { - "epoch": 1.94, - "learning_rate": 3.5063361504191714e-05, - "loss": 0.2907, + "epoch": 1.9832774714383536, + "grad_norm": 0.21722643077373505, + "learning_rate": 3.435992165439572e-05, + "loss": 0.4225, "step": 55030 }, { - "epoch": 1.94, - "learning_rate": 3.506075370560648e-05, - "loss": 0.2794, + "epoch": 1.9834576710995784, + "grad_norm": 0.20483043789863586, + "learning_rate": 3.435721569704308e-05, + "loss": 0.4113, "step": 55035 }, { - "epoch": 1.94, - "learning_rate": 3.5058145776389e-05, - "loss": 0.2697, + "epoch": 1.983637870760803, + "grad_norm": 0.2097228616476059, + "learning_rate": 3.4354509612200194e-05, + "loss": 0.407, "step": 55040 }, { - "epoch": 1.94, - "learning_rate": 3.505553771657312e-05, - "loss": 0.2841, + "epoch": 1.9838180704220276, + "grad_norm": 0.1592007428407669, + "learning_rate": 3.435180339990391e-05, + "loss": 0.3818, "step": 55045 }, { - "epoch": 1.94, - "learning_rate": 3.505292952619272e-05, - "loss": 0.2808, + "epoch": 1.9839982700832524, + "grad_norm": 0.1988215148448944, + "learning_rate": 3.4349097060191135e-05, + "loss": 0.3992, "step": 55050 }, { - "epoch": 1.94, - "learning_rate": 3.5050321205281636e-05, - "loss": 0.2969, + "epoch": 1.9841784697444769, + "grad_norm": 0.19926537573337555, + "learning_rate": 3.434639059309871e-05, + "loss": 0.3907, "step": 55055 }, { - "epoch": 1.94, - "learning_rate": 3.504771275387377e-05, - "loss": 0.2951, + "epoch": 1.9843586694057014, + "grad_norm": 0.1831171214580536, + "learning_rate": 3.434368399866352e-05, + "loss": 0.4176, "step": 55060 }, { - "epoch": 1.94, - "learning_rate": 3.504510417200297e-05, - "loss": 0.2714, + "epoch": 1.9845388690669261, + "grad_norm": 0.17820048332214355, + "learning_rate": 3.434097727692245e-05, + "loss": 0.4282, "step": 55065 }, { - "epoch": 1.94, - "learning_rate": 3.504249545970312e-05, - "loss": 0.2777, + "epoch": 1.9847190687281508, + "grad_norm": 0.21258282661437988, + "learning_rate": 3.433827042791238e-05, + "loss": 0.4548, "step": 55070 }, { - "epoch": 1.94, - "learning_rate": 3.503988661700807e-05, - "loss": 0.2983, + "epoch": 1.9848992683893756, + "grad_norm": 0.17607107758522034, + "learning_rate": 3.433556345167017e-05, + "loss": 0.4106, "step": 55075 }, { - "epoch": 1.94, - "learning_rate": 3.503727764395172e-05, - "loss": 0.2766, + "epoch": 1.9850794680506, + "grad_norm": 0.18194575607776642, + "learning_rate": 3.4332856348232725e-05, + "loss": 0.4119, "step": 55080 }, { - "epoch": 1.94, - "learning_rate": 3.503466854056793e-05, - "loss": 0.2603, + "epoch": 1.9852596677118246, + "grad_norm": 0.16195902228355408, + "learning_rate": 3.433014911763691e-05, + "loss": 0.3892, "step": 55085 }, { - "epoch": 1.94, - "learning_rate": 3.503205930689059e-05, - "loss": 0.3038, + "epoch": 1.9854398673730493, + "grad_norm": 0.17873816192150116, + "learning_rate": 3.432744175991963e-05, + "loss": 0.406, "step": 55090 }, { - "epoch": 1.94, - "learning_rate": 3.5029449942953555e-05, - "loss": 0.3096, + 
"epoch": 1.985620067034274, + "grad_norm": 0.17679697275161743, + "learning_rate": 3.432473427511776e-05, + "loss": 0.4588, "step": 55095 }, { - "epoch": 1.94, - "learning_rate": 3.5026840448790735e-05, - "loss": 0.2746, + "epoch": 1.9858002666954986, + "grad_norm": 0.1970539093017578, + "learning_rate": 3.4322026663268186e-05, + "loss": 0.3898, "step": 55100 }, { - "epoch": 1.94, - "learning_rate": 3.5024230824436e-05, - "loss": 0.2508, + "epoch": 1.985980466356723, + "grad_norm": 0.2121034860610962, + "learning_rate": 3.4319318924407806e-05, + "loss": 0.4556, "step": 55105 }, { - "epoch": 1.94, - "learning_rate": 3.502162106992322e-05, - "loss": 0.2831, + "epoch": 1.9861606660179478, + "grad_norm": 0.1846599280834198, + "learning_rate": 3.431661105857351e-05, + "loss": 0.4054, "step": 55110 }, { - "epoch": 1.94, - "learning_rate": 3.501901118528631e-05, - "loss": 0.2803, + "epoch": 1.9863408656791726, + "grad_norm": 0.17753110826015472, + "learning_rate": 3.4313903065802187e-05, + "loss": 0.3984, "step": 55115 }, { - "epoch": 1.94, - "learning_rate": 3.501640117055913e-05, - "loss": 0.2564, + "epoch": 1.9865210653403973, + "grad_norm": 0.1930995136499405, + "learning_rate": 3.431119494613075e-05, + "loss": 0.4278, "step": 55120 }, { - "epoch": 1.94, - "learning_rate": 3.50137910257756e-05, - "loss": 0.262, + "epoch": 1.9867012650016218, + "grad_norm": 0.17854326963424683, + "learning_rate": 3.430848669959607e-05, + "loss": 0.406, "step": 55125 }, { - "epoch": 1.94, - "learning_rate": 3.501118075096958e-05, - "loss": 0.2641, + "epoch": 1.9868814646628463, + "grad_norm": 0.16902780532836914, + "learning_rate": 3.430577832623507e-05, + "loss": 0.3918, "step": 55130 }, { - "epoch": 1.94, - "learning_rate": 3.500857034617497e-05, - "loss": 0.2845, + "epoch": 1.987061664324071, + "grad_norm": 0.23383301496505737, + "learning_rate": 3.4303069826084646e-05, + "loss": 0.3994, "step": 55135 }, { - "epoch": 1.94, - "learning_rate": 3.500595981142568e-05, - "loss": 0.2981, + "epoch": 1.9872418639852958, + "grad_norm": 0.19351869821548462, + "learning_rate": 3.430036119918168e-05, + "loss": 0.4034, "step": 55140 }, { - "epoch": 1.94, - "learning_rate": 3.50033491467556e-05, - "loss": 0.2878, + "epoch": 1.9874220636465203, + "grad_norm": 0.19689901173114777, + "learning_rate": 3.429765244556311e-05, + "loss": 0.4055, "step": 55145 }, { - "epoch": 1.94, - "learning_rate": 3.500073835219861e-05, - "loss": 0.2686, + "epoch": 1.987602263307745, + "grad_norm": 0.19208216667175293, + "learning_rate": 3.429494356526581e-05, + "loss": 0.3989, "step": 55150 }, { - "epoch": 1.94, - "learning_rate": 3.4998127427788644e-05, - "loss": 0.28, + "epoch": 1.9877824629689695, + "grad_norm": 0.19520142674446106, + "learning_rate": 3.4292234558326724e-05, + "loss": 0.3776, "step": 55155 }, { - "epoch": 1.94, - "learning_rate": 3.4995516373559566e-05, - "loss": 0.2787, + "epoch": 1.9879626626301943, + "grad_norm": 0.22114428877830505, + "learning_rate": 3.4289525424782726e-05, + "loss": 0.422, "step": 55160 }, { - "epoch": 1.94, - "learning_rate": 3.49929051895453e-05, - "loss": 0.2695, + "epoch": 1.988142862291419, + "grad_norm": 0.16735602915287018, + "learning_rate": 3.428681616467075e-05, + "loss": 0.4248, "step": 55165 }, { - "epoch": 1.94, - "learning_rate": 3.499029387577974e-05, - "loss": 0.2726, + "epoch": 1.9883230619526435, + "grad_norm": 0.18746241927146912, + "learning_rate": 3.4284106778027696e-05, + "loss": 0.4177, "step": 55170 }, { - "epoch": 1.94, - "learning_rate": 3.4987682432296805e-05, - "loss": 
0.2867, + "epoch": 1.988503261613868, + "grad_norm": 0.24923117458820343, + "learning_rate": 3.4281397264890484e-05, + "loss": 0.4006, "step": 55175 }, { - "epoch": 1.94, - "learning_rate": 3.498507085913039e-05, - "loss": 0.2906, + "epoch": 1.9886834612750928, + "grad_norm": 0.147854283452034, + "learning_rate": 3.427868762529604e-05, + "loss": 0.4182, "step": 55180 }, { - "epoch": 1.94, - "learning_rate": 3.4982459156314415e-05, - "loss": 0.2819, + "epoch": 1.9888636609363175, + "grad_norm": 0.14005811512470245, + "learning_rate": 3.427597785928126e-05, + "loss": 0.3873, "step": 55185 }, { - "epoch": 1.94, - "learning_rate": 3.497984732388278e-05, - "loss": 0.2734, + "epoch": 1.9890438605975422, + "grad_norm": 0.18232989311218262, + "learning_rate": 3.4273267966883094e-05, + "loss": 0.374, "step": 55190 }, { - "epoch": 1.94, - "learning_rate": 3.497723536186941e-05, - "loss": 0.2846, + "epoch": 1.9892240602587667, + "grad_norm": 0.18193449079990387, + "learning_rate": 3.427055794813844e-05, + "loss": 0.3967, "step": 55195 }, { - "epoch": 1.94, - "learning_rate": 3.4974623270308215e-05, - "loss": 0.3048, + "epoch": 1.9894042599199913, + "grad_norm": 0.1848289668560028, + "learning_rate": 3.426784780308423e-05, + "loss": 0.3852, "step": 55200 }, { - "epoch": 1.94, - "learning_rate": 3.497201104923311e-05, - "loss": 0.3095, + "epoch": 1.989584459581216, + "grad_norm": 0.2115050107240677, + "learning_rate": 3.426513753175738e-05, + "loss": 0.4434, "step": 55205 }, { - "epoch": 1.94, - "learning_rate": 3.496939869867801e-05, - "loss": 0.2599, + "epoch": 1.9897646592424407, + "grad_norm": 0.20298829674720764, + "learning_rate": 3.426242713419483e-05, + "loss": 0.4076, "step": 55210 }, { - "epoch": 1.94, - "learning_rate": 3.4966786218676836e-05, - "loss": 0.2875, + "epoch": 1.9899448589036652, + "grad_norm": 0.20630566775798798, + "learning_rate": 3.425971661043351e-05, + "loss": 0.434, "step": 55215 }, { - "epoch": 1.94, - "learning_rate": 3.496417360926352e-05, - "loss": 0.2867, + "epoch": 1.9901250585648897, + "grad_norm": 0.1815614402294159, + "learning_rate": 3.425700596051033e-05, + "loss": 0.4045, "step": 55220 }, { - "epoch": 1.94, - "learning_rate": 3.496156087047197e-05, - "loss": 0.2812, + "epoch": 1.9903052582261145, + "grad_norm": 0.1584119349718094, + "learning_rate": 3.425429518446225e-05, + "loss": 0.4018, "step": 55225 }, { - "epoch": 1.94, - "learning_rate": 3.495894800233612e-05, - "loss": 0.2859, + "epoch": 1.9904854578873392, + "grad_norm": 0.17491492629051208, + "learning_rate": 3.425158428232618e-05, + "loss": 0.4208, "step": 55230 }, { - "epoch": 1.94, - "learning_rate": 3.495633500488989e-05, - "loss": 0.2807, + "epoch": 1.990665657548564, + "grad_norm": 0.16318289935588837, + "learning_rate": 3.4248873254139066e-05, + "loss": 0.4531, "step": 55235 }, { - "epoch": 1.94, - "learning_rate": 3.495372187816722e-05, - "loss": 0.2775, + "epoch": 1.9908458572097885, + "grad_norm": 0.18993204832077026, + "learning_rate": 3.424616209993785e-05, + "loss": 0.3925, "step": 55240 }, { - "epoch": 1.94, - "learning_rate": 3.4951108622202024e-05, - "loss": 0.2604, + "epoch": 1.991026056871013, + "grad_norm": 0.20490875840187073, + "learning_rate": 3.4243450819759464e-05, + "loss": 0.3857, "step": 55245 }, { - "epoch": 1.94, - "learning_rate": 3.494849523702824e-05, - "loss": 0.2818, + "epoch": 1.9912062565322377, + "grad_norm": 0.22841264307498932, + "learning_rate": 3.424073941364085e-05, + "loss": 0.4436, "step": 55250 }, { - "epoch": 1.94, - "learning_rate": 3.49458817226798e-05, - 
"loss": 0.2879, + "epoch": 1.9913864561934624, + "grad_norm": 0.18176591396331787, + "learning_rate": 3.423802788161895e-05, + "loss": 0.4367, "step": 55255 }, { - "epoch": 1.94, - "learning_rate": 3.494326807919065e-05, - "loss": 0.281, + "epoch": 1.991566655854687, + "grad_norm": 0.20801039040088654, + "learning_rate": 3.4235316223730706e-05, + "loss": 0.4058, "step": 55260 }, { - "epoch": 1.94, - "learning_rate": 3.4940654306594705e-05, - "loss": 0.2658, + "epoch": 1.9917468555159117, + "grad_norm": 0.19156023859977722, + "learning_rate": 3.423260444001307e-05, + "loss": 0.4368, "step": 55265 }, { - "epoch": 1.94, - "learning_rate": 3.493804040492592e-05, - "loss": 0.3026, + "epoch": 1.9919270551771362, + "grad_norm": 0.1914074718952179, + "learning_rate": 3.422989253050298e-05, + "loss": 0.4253, "step": 55270 }, { - "epoch": 1.94, - "learning_rate": 3.493542637421823e-05, - "loss": 0.2766, + "epoch": 1.992107254838361, + "grad_norm": 0.15868034958839417, + "learning_rate": 3.42271804952374e-05, + "loss": 0.3822, "step": 55275 }, { - "epoch": 1.94, - "learning_rate": 3.4932812214505576e-05, - "loss": 0.3027, + "epoch": 1.9922874544995857, + "grad_norm": 0.20103418827056885, + "learning_rate": 3.4224468334253265e-05, + "loss": 0.4241, "step": 55280 }, { - "epoch": 1.95, - "learning_rate": 3.49301979258219e-05, - "loss": 0.2719, + "epoch": 1.9924676541608102, + "grad_norm": 0.22683289647102356, + "learning_rate": 3.422175604758754e-05, + "loss": 0.4063, "step": 55285 }, { - "epoch": 1.95, - "learning_rate": 3.4927583508201155e-05, - "loss": 0.2899, + "epoch": 1.9926478538220347, + "grad_norm": 0.20226216316223145, + "learning_rate": 3.4219043635277173e-05, + "loss": 0.417, "step": 55290 }, { - "epoch": 1.95, - "learning_rate": 3.492496896167727e-05, - "loss": 0.2836, + "epoch": 1.9928280534832594, + "grad_norm": 0.20145370066165924, + "learning_rate": 3.4216331097359123e-05, + "loss": 0.4072, "step": 55295 }, { - "epoch": 1.95, - "learning_rate": 3.492235428628421e-05, - "loss": 0.2778, + "epoch": 1.9930082531444842, + "grad_norm": 0.1815030723810196, + "learning_rate": 3.421361843387034e-05, + "loss": 0.4481, "step": 55300 }, { - "epoch": 1.95, - "learning_rate": 3.4919739482055914e-05, - "loss": 0.3118, + "epoch": 1.9931884528057089, + "grad_norm": 0.16565322875976562, + "learning_rate": 3.42109056448478e-05, + "loss": 0.4235, "step": 55305 }, { - "epoch": 1.95, - "learning_rate": 3.491712454902635e-05, - "loss": 0.2832, + "epoch": 1.9933686524669334, + "grad_norm": 0.19166086614131927, + "learning_rate": 3.420819273032844e-05, + "loss": 0.4324, "step": 55310 }, { - "epoch": 1.95, - "learning_rate": 3.491450948722945e-05, - "loss": 0.2933, + "epoch": 1.993548852128158, + "grad_norm": 0.1797381490468979, + "learning_rate": 3.4205479690349246e-05, + "loss": 0.4356, "step": 55315 }, { - "epoch": 1.95, - "learning_rate": 3.491189429669918e-05, - "loss": 0.2857, + "epoch": 1.9937290517893826, + "grad_norm": 0.19711998105049133, + "learning_rate": 3.420276652494717e-05, + "loss": 0.3925, "step": 55320 }, { - "epoch": 1.95, - "learning_rate": 3.49092789774695e-05, - "loss": 0.2761, + "epoch": 1.9939092514506074, + "grad_norm": 0.157618910074234, + "learning_rate": 3.4200053234159185e-05, + "loss": 0.386, "step": 55325 }, { - "epoch": 1.95, - "learning_rate": 3.4906663529574344e-05, - "loss": 0.2938, + "epoch": 1.994089451111832, + "grad_norm": 0.2216421216726303, + "learning_rate": 3.419733981802226e-05, + "loss": 0.4304, "step": 55330 }, { - "epoch": 1.95, - "learning_rate": 
3.490404795304771e-05, - "loss": 0.2615, + "epoch": 1.9942696507730564, + "grad_norm": 0.16847671568393707, + "learning_rate": 3.419462627657335e-05, + "loss": 0.4074, "step": 55335 }, { - "epoch": 1.95, - "learning_rate": 3.490143224792353e-05, - "loss": 0.2918, + "epoch": 1.9944498504342811, + "grad_norm": 0.19874538481235504, + "learning_rate": 3.4191912609849444e-05, + "loss": 0.386, "step": 55340 }, { - "epoch": 1.95, - "learning_rate": 3.489881641423579e-05, - "loss": 0.3023, + "epoch": 1.9946300500955059, + "grad_norm": 0.19660434126853943, + "learning_rate": 3.41891988178875e-05, + "loss": 0.3841, "step": 55345 }, { - "epoch": 1.95, - "learning_rate": 3.489620045201843e-05, - "loss": 0.304, + "epoch": 1.9948102497567306, + "grad_norm": 0.2128075808286667, + "learning_rate": 3.4186484900724514e-05, + "loss": 0.4227, "step": 55350 }, { - "epoch": 1.95, - "learning_rate": 3.489358436130543e-05, - "loss": 0.296, + "epoch": 1.9949904494179551, + "grad_norm": 0.1658129245042801, + "learning_rate": 3.418377085839744e-05, + "loss": 0.4055, "step": 55355 }, { - "epoch": 1.95, - "learning_rate": 3.489096814213076e-05, - "loss": 0.287, + "epoch": 1.9951706490791796, + "grad_norm": 0.1386236697435379, + "learning_rate": 3.418105669094327e-05, + "loss": 0.3855, "step": 55360 }, { - "epoch": 1.95, - "learning_rate": 3.488835179452839e-05, - "loss": 0.2749, + "epoch": 1.9953508487404044, + "grad_norm": 0.16813713312149048, + "learning_rate": 3.4178342398398985e-05, + "loss": 0.4049, "step": 55365 }, { - "epoch": 1.95, - "learning_rate": 3.488573531853229e-05, - "loss": 0.2931, + "epoch": 1.995531048401629, + "grad_norm": 0.19493988156318665, + "learning_rate": 3.417562798080155e-05, + "loss": 0.3933, "step": 55370 }, { - "epoch": 1.95, - "learning_rate": 3.4883118714176427e-05, - "loss": 0.2849, + "epoch": 1.9957112480628536, + "grad_norm": 0.19702892005443573, + "learning_rate": 3.417291343818797e-05, + "loss": 0.453, "step": 55375 }, { - "epoch": 1.95, - "learning_rate": 3.488050198149478e-05, - "loss": 0.2739, + "epoch": 1.9958914477240781, + "grad_norm": 0.18380559980869293, + "learning_rate": 3.417019877059522e-05, + "loss": 0.3968, "step": 55380 }, { - "epoch": 1.95, - "learning_rate": 3.487788512052132e-05, - "loss": 0.2894, + "epoch": 1.9960716473853028, + "grad_norm": 0.2419026494026184, + "learning_rate": 3.4167483978060285e-05, + "loss": 0.4219, "step": 55385 }, { - "epoch": 1.95, - "learning_rate": 3.4875268131290046e-05, - "loss": 0.2739, + "epoch": 1.9962518470465276, + "grad_norm": 0.19167737662792206, + "learning_rate": 3.416476906062015e-05, + "loss": 0.4485, "step": 55390 }, { - "epoch": 1.95, - "learning_rate": 3.4872651013834916e-05, - "loss": 0.2682, + "epoch": 1.9964320467077523, + "grad_norm": 0.18639305233955383, + "learning_rate": 3.416205401831182e-05, + "loss": 0.4201, "step": 55395 }, { - "epoch": 1.95, - "learning_rate": 3.487003376818992e-05, - "loss": 0.2523, + "epoch": 1.9966122463689768, + "grad_norm": 0.2547195851802826, + "learning_rate": 3.4159338851172276e-05, + "loss": 0.4503, "step": 55400 }, { - "epoch": 1.95, - "learning_rate": 3.486741639438904e-05, - "loss": 0.3135, + "epoch": 1.9967924460302013, + "grad_norm": 0.19804906845092773, + "learning_rate": 3.41566235592385e-05, + "loss": 0.4057, "step": 55405 }, { - "epoch": 1.95, - "learning_rate": 3.4864798892466256e-05, - "loss": 0.2938, + "epoch": 1.996972645691426, + "grad_norm": 0.21647123992443085, + "learning_rate": 3.415390814254752e-05, + "loss": 0.4265, "step": 55410 }, { - "epoch": 1.95, - 
"learning_rate": 3.4862181262455565e-05, - "loss": 0.263, + "epoch": 1.9971528453526508, + "grad_norm": 0.13523387908935547, + "learning_rate": 3.41511926011363e-05, + "loss": 0.4253, "step": 55415 }, { - "epoch": 1.95, - "learning_rate": 3.485956350439095e-05, - "loss": 0.2899, + "epoch": 1.9973330450138753, + "grad_norm": 0.16237081587314606, + "learning_rate": 3.414847693504186e-05, + "loss": 0.3982, "step": 55420 }, { - "epoch": 1.95, - "learning_rate": 3.48569456183064e-05, - "loss": 0.2736, + "epoch": 1.9975132446751, + "grad_norm": 0.12674693763256073, + "learning_rate": 3.414576114430119e-05, + "loss": 0.4324, "step": 55425 }, { - "epoch": 1.95, - "learning_rate": 3.485432760423591e-05, - "loss": 0.2928, + "epoch": 1.9976934443363246, + "grad_norm": 0.18315039575099945, + "learning_rate": 3.4143045228951296e-05, + "loss": 0.4158, "step": 55430 }, { - "epoch": 1.95, - "learning_rate": 3.485170946221347e-05, - "loss": 0.2833, + "epoch": 1.9978736439975493, + "grad_norm": 0.1764761209487915, + "learning_rate": 3.414032918902918e-05, + "loss": 0.3859, "step": 55435 }, { - "epoch": 1.95, - "learning_rate": 3.4849091192273064e-05, - "loss": 0.2792, + "epoch": 1.998053843658774, + "grad_norm": 0.19060806930065155, + "learning_rate": 3.413761302457185e-05, + "loss": 0.4238, "step": 55440 }, { - "epoch": 1.95, - "learning_rate": 3.484647279444872e-05, - "loss": 0.2718, + "epoch": 1.9982340433199985, + "grad_norm": 0.14990010857582092, + "learning_rate": 3.413489673561632e-05, + "loss": 0.4259, "step": 55445 }, { - "epoch": 1.95, - "learning_rate": 3.48438542687744e-05, - "loss": 0.2887, + "epoch": 1.998414242981223, + "grad_norm": 0.16765926778316498, + "learning_rate": 3.413218032219957e-05, + "loss": 0.4103, "step": 55450 }, { - "epoch": 1.95, - "learning_rate": 3.484123561528412e-05, - "loss": 0.274, + "epoch": 1.9985944426424478, + "grad_norm": 0.1568935662508011, + "learning_rate": 3.412946378435865e-05, + "loss": 0.4334, "step": 55455 }, { - "epoch": 1.95, - "learning_rate": 3.483861683401188e-05, - "loss": 0.2888, + "epoch": 1.9987746423036725, + "grad_norm": 0.1629842072725296, + "learning_rate": 3.4126747122130545e-05, + "loss": 0.4099, "step": 55460 }, { - "epoch": 1.95, - "learning_rate": 3.483599792499169e-05, - "loss": 0.2933, + "epoch": 1.9989548419648973, + "grad_norm": 0.1970033496618271, + "learning_rate": 3.4124030335552274e-05, + "loss": 0.4265, "step": 55465 }, { - "epoch": 1.95, - "learning_rate": 3.483337888825755e-05, - "loss": 0.2908, + "epoch": 1.9991350416261218, + "grad_norm": 0.2183963507413864, + "learning_rate": 3.4121313424660866e-05, + "loss": 0.4178, "step": 55470 }, { - "epoch": 1.95, - "learning_rate": 3.4830759723843464e-05, - "loss": 0.2973, + "epoch": 1.9993152412873463, + "grad_norm": 0.15539808571338654, + "learning_rate": 3.411859638949332e-05, + "loss": 0.4105, "step": 55475 }, { - "epoch": 1.95, - "learning_rate": 3.482814043178344e-05, - "loss": 0.2742, + "epoch": 1.999495440948571, + "grad_norm": 0.22591222822666168, + "learning_rate": 3.411642267190535e-05, + "loss": 0.4401, "step": 55480 }, { - "epoch": 1.95, - "learning_rate": 3.482552101211149e-05, - "loss": 0.285, + "epoch": 1.9996756406097957, + "grad_norm": 0.13716307282447815, + "learning_rate": 3.4113705413134064e-05, + "loss": 0.3836, "step": 55485 }, { - "epoch": 1.95, - "learning_rate": 3.4822901464861633e-05, - "loss": 0.287, + "epoch": 1.9998558402710203, + "grad_norm": 0.20352575182914734, + "learning_rate": 3.411098803019031e-05, + "loss": 0.4013, "step": 55490 }, { - "epoch": 
1.95, - "learning_rate": 3.482028179006786e-05, - "loss": 0.2726, + "epoch": 2.0000360399322448, + "grad_norm": 0.2378479391336441, + "learning_rate": 3.41082705231111e-05, + "loss": 0.4204, "step": 55495 }, { - "epoch": 1.95, - "learning_rate": 3.48176619877642e-05, - "loss": 0.2736, + "epoch": 2.0002162395934695, + "grad_norm": 0.17885233461856842, + "learning_rate": 3.410555289193347e-05, + "loss": 0.3819, "step": 55500 }, { - "epoch": 1.95, - "eval_loss": 0.27318820357322693, - "eval_runtime": 10.5335, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 2.0002162395934695, + "eval_loss": 0.43618687987327576, + "eval_runtime": 3.5358, + "eval_samples_per_second": 28.282, + "eval_steps_per_second": 7.071, "step": 55500 }, { - "epoch": 1.95, - "learning_rate": 3.4815042057984683e-05, - "loss": 0.2924, + "epoch": 2.0003964392546942, + "grad_norm": 0.21153418719768524, + "learning_rate": 3.4102835136694445e-05, + "loss": 0.4348, "step": 55505 }, { - "epoch": 1.95, - "learning_rate": 3.481242200076331e-05, - "loss": 0.2876, + "epoch": 2.000576638915919, + "grad_norm": 0.18446187674999237, + "learning_rate": 3.4100117257431055e-05, + "loss": 0.387, "step": 55510 }, { - "epoch": 1.95, - "learning_rate": 3.4809801816134096e-05, - "loss": 0.2819, + "epoch": 2.0007568385771433, + "grad_norm": 0.16465526819229126, + "learning_rate": 3.409739925418034e-05, + "loss": 0.3821, "step": 55515 }, { - "epoch": 1.95, - "learning_rate": 3.4807181504131075e-05, - "loss": 0.2526, + "epoch": 2.000937038238368, + "grad_norm": 0.19307419657707214, + "learning_rate": 3.40946811269793e-05, + "loss": 0.4331, "step": 55520 }, { - "epoch": 1.95, - "learning_rate": 3.480456106478826e-05, - "loss": 0.277, + "epoch": 2.0011172378995927, + "grad_norm": 0.18775279819965363, + "learning_rate": 3.4091962875865e-05, + "loss": 0.3975, "step": 55525 }, { - "epoch": 1.95, - "learning_rate": 3.480194049813969e-05, - "loss": 0.2633, + "epoch": 2.0012974375608175, + "grad_norm": 0.1970471441745758, + "learning_rate": 3.4089244500874474e-05, + "loss": 0.4205, "step": 55530 }, { - "epoch": 1.95, - "learning_rate": 3.479931980421938e-05, - "loss": 0.2715, + "epoch": 2.001477637222042, + "grad_norm": 0.15047413110733032, + "learning_rate": 3.408652600204474e-05, + "loss": 0.4004, "step": 55535 }, { - "epoch": 1.95, - "learning_rate": 3.479669898306136e-05, - "loss": 0.2846, + "epoch": 2.0016578368832665, + "grad_norm": 0.18766699731349945, + "learning_rate": 3.408380737941285e-05, + "loss": 0.3961, "step": 55540 }, { - "epoch": 1.95, - "learning_rate": 3.479407803469966e-05, - "loss": 0.2837, + "epoch": 2.001838036544491, + "grad_norm": 0.16125249862670898, + "learning_rate": 3.4081088633015845e-05, + "loss": 0.3797, "step": 55545 }, { - "epoch": 1.95, - "learning_rate": 3.47914569591683e-05, - "loss": 0.2764, + "epoch": 2.002018236205716, + "grad_norm": 0.23382239043712616, + "learning_rate": 3.407836976289077e-05, + "loss": 0.414, "step": 55550 }, { - "epoch": 1.95, - "learning_rate": 3.478883575650134e-05, - "loss": 0.2934, + "epoch": 2.0021984358669407, + "grad_norm": 0.17505022883415222, + "learning_rate": 3.4075650769074664e-05, + "loss": 0.3889, "step": 55555 }, { - "epoch": 1.95, - "learning_rate": 3.478621442673279e-05, - "loss": 0.2913, + "epoch": 2.0023786355281654, + "grad_norm": 0.21252861618995667, + "learning_rate": 3.4072931651604566e-05, + "loss": 0.4107, "step": 55560 }, { - "epoch": 1.95, - "learning_rate": 3.47835929698967e-05, - "loss": 0.2786, + "epoch": 2.0025588351893897, + 
"grad_norm": 0.1734471619129181, + "learning_rate": 3.407021241051754e-05, + "loss": 0.3442, "step": 55565 }, { - "epoch": 1.96, - "learning_rate": 3.47809713860271e-05, - "loss": 0.2932, + "epoch": 2.0027390348506144, + "grad_norm": 0.21779297292232513, + "learning_rate": 3.406749304585062e-05, + "loss": 0.3749, "step": 55570 }, { - "epoch": 1.96, - "learning_rate": 3.477834967515803e-05, - "loss": 0.2587, + "epoch": 2.002919234511839, + "grad_norm": 0.17229339480400085, + "learning_rate": 3.406477355764087e-05, + "loss": 0.3806, "step": 55575 }, { - "epoch": 1.96, - "learning_rate": 3.477572783732354e-05, - "loss": 0.303, + "epoch": 2.003099434173064, + "grad_norm": 0.18596251308918, + "learning_rate": 3.406205394592532e-05, + "loss": 0.3808, "step": 55580 }, { - "epoch": 1.96, - "learning_rate": 3.477310587255766e-05, - "loss": 0.2909, + "epoch": 2.003279633834288, + "grad_norm": 0.20308445394039154, + "learning_rate": 3.405933421074105e-05, + "loss": 0.395, "step": 55585 }, { - "epoch": 1.96, - "learning_rate": 3.477048378089444e-05, - "loss": 0.3092, + "epoch": 2.003459833495513, + "grad_norm": 0.26746681332588196, + "learning_rate": 3.40566143521251e-05, + "loss": 0.3969, "step": 55590 }, { - "epoch": 1.96, - "learning_rate": 3.476786156236794e-05, - "loss": 0.2843, + "epoch": 2.0036400331567377, + "grad_norm": 0.20009200274944305, + "learning_rate": 3.405389437011454e-05, + "loss": 0.3977, "step": 55595 }, { - "epoch": 1.96, - "learning_rate": 3.476523921701218e-05, - "loss": 0.2992, + "epoch": 2.0038202328179624, + "grad_norm": 0.17441947758197784, + "learning_rate": 3.405117426474642e-05, + "loss": 0.3884, "step": 55600 }, { - "epoch": 1.96, - "learning_rate": 3.4762616744861226e-05, - "loss": 0.2587, + "epoch": 2.004000432479187, + "grad_norm": 0.19016499817371368, + "learning_rate": 3.40484540360578e-05, + "loss": 0.3964, "step": 55605 }, { - "epoch": 1.96, - "learning_rate": 3.475999414594913e-05, - "loss": 0.273, + "epoch": 2.0041806321404114, + "grad_norm": 0.16852574050426483, + "learning_rate": 3.4045733684085745e-05, + "loss": 0.4337, "step": 55610 }, { - "epoch": 1.96, - "learning_rate": 3.475737142030995e-05, - "loss": 0.2752, + "epoch": 2.004360831801636, + "grad_norm": 0.20671193301677704, + "learning_rate": 3.404301320886732e-05, + "loss": 0.4207, "step": 55615 }, { - "epoch": 1.96, - "learning_rate": 3.4754748567977725e-05, - "loss": 0.2721, + "epoch": 2.004541031462861, + "grad_norm": 0.17593398690223694, + "learning_rate": 3.404029261043961e-05, + "loss": 0.3864, "step": 55620 }, { - "epoch": 1.96, - "learning_rate": 3.4752125588986525e-05, - "loss": 0.2776, + "epoch": 2.0047212311240856, + "grad_norm": 0.18823134899139404, + "learning_rate": 3.4037571888839635e-05, + "loss": 0.3806, "step": 55625 }, { - "epoch": 1.96, - "learning_rate": 3.474950248337039e-05, - "loss": 0.2845, + "epoch": 2.00490143078531, + "grad_norm": 0.16644109785556793, + "learning_rate": 3.403485104410451e-05, + "loss": 0.3839, "step": 55630 }, { - "epoch": 1.96, - "learning_rate": 3.4746879251163396e-05, - "loss": 0.2974, + "epoch": 2.0050816304465346, + "grad_norm": 0.16421851515769958, + "learning_rate": 3.403213007627128e-05, + "loss": 0.3643, "step": 55635 }, { - "epoch": 1.96, - "learning_rate": 3.4744255892399594e-05, - "loss": 0.2803, + "epoch": 2.0052618301077594, + "grad_norm": 0.14267808198928833, + "learning_rate": 3.402940898537703e-05, + "loss": 0.4057, "step": 55640 }, { - "epoch": 1.96, - "learning_rate": 3.4741632407113064e-05, - "loss": 0.2813, + "epoch": 2.005442029768984, 
+ "grad_norm": 0.19570884108543396, + "learning_rate": 3.402668777145883e-05, + "loss": 0.3704, "step": 55645 }, { - "epoch": 1.96, - "learning_rate": 3.4739008795337845e-05, - "loss": 0.289, + "epoch": 2.005622229430209, + "grad_norm": 0.20084281265735626, + "learning_rate": 3.402396643455376e-05, + "loss": 0.43, "step": 55650 }, { - "epoch": 1.96, - "learning_rate": 3.473638505710802e-05, - "loss": 0.2605, + "epoch": 2.005802429091433, + "grad_norm": 0.17763961851596832, + "learning_rate": 3.402124497469889e-05, + "loss": 0.4088, "step": 55655 }, { - "epoch": 1.96, - "learning_rate": 3.4733761192457645e-05, - "loss": 0.2777, + "epoch": 2.005982628752658, + "grad_norm": 0.19127719104290009, + "learning_rate": 3.40185233919313e-05, + "loss": 0.3433, "step": 55660 }, { - "epoch": 1.96, - "learning_rate": 3.473113720142081e-05, - "loss": 0.2803, + "epoch": 2.0061628284138826, + "grad_norm": 0.20189210772514343, + "learning_rate": 3.4015801686288086e-05, + "loss": 0.3741, "step": 55665 }, { - "epoch": 1.96, - "learning_rate": 3.472851308403156e-05, - "loss": 0.2543, + "epoch": 2.0063430280751073, + "grad_norm": 0.16015605628490448, + "learning_rate": 3.401307985780631e-05, + "loss": 0.3657, "step": 55670 }, { - "epoch": 1.96, - "learning_rate": 3.472588884032398e-05, - "loss": 0.2648, + "epoch": 2.006523227736332, + "grad_norm": 0.16415055096149445, + "learning_rate": 3.401035790652307e-05, + "loss": 0.3841, "step": 55675 }, { - "epoch": 1.96, - "learning_rate": 3.472326447033214e-05, - "loss": 0.2898, + "epoch": 2.0067034273975564, + "grad_norm": 0.169295534491539, + "learning_rate": 3.400763583247545e-05, + "loss": 0.3687, "step": 55680 }, { - "epoch": 1.96, - "learning_rate": 3.4720639974090126e-05, - "loss": 0.2609, + "epoch": 2.006883627058781, + "grad_norm": 0.17803806066513062, + "learning_rate": 3.400491363570053e-05, + "loss": 0.3908, "step": 55685 }, { - "epoch": 1.96, - "learning_rate": 3.4718015351632e-05, - "loss": 0.2882, + "epoch": 2.007063826720006, + "grad_norm": 0.1836196929216385, + "learning_rate": 3.400219131623541e-05, + "loss": 0.4002, "step": 55690 }, { - "epoch": 1.96, - "learning_rate": 3.4715390602991855e-05, - "loss": 0.2933, + "epoch": 2.0072440263812306, + "grad_norm": 0.21328406035900116, + "learning_rate": 3.3999468874117176e-05, + "loss": 0.3604, "step": 55695 }, { - "epoch": 1.96, - "learning_rate": 3.471276572820377e-05, - "loss": 0.3122, + "epoch": 2.007424226042455, + "grad_norm": 0.1810711771249771, + "learning_rate": 3.399674630938292e-05, + "loss": 0.4015, "step": 55700 }, { - "epoch": 1.96, - "learning_rate": 3.471014072730181e-05, - "loss": 0.2917, + "epoch": 2.0076044257036796, + "grad_norm": 0.21469327807426453, + "learning_rate": 3.3994023622069734e-05, + "loss": 0.4499, "step": 55705 }, { - "epoch": 1.96, - "learning_rate": 3.470751560032009e-05, - "loss": 0.2758, + "epoch": 2.0077846253649043, + "grad_norm": 0.20458552241325378, + "learning_rate": 3.3991300812214724e-05, + "loss": 0.4261, "step": 55710 }, { - "epoch": 1.96, - "learning_rate": 3.4704890347292656e-05, - "loss": 0.2865, + "epoch": 2.007964825026129, + "grad_norm": 0.13773076236248016, + "learning_rate": 3.398857787985498e-05, + "loss": 0.3517, "step": 55715 }, { - "epoch": 1.96, - "learning_rate": 3.470226496825363e-05, - "loss": 0.2942, + "epoch": 2.008145024687354, + "grad_norm": 0.17209510505199432, + "learning_rate": 3.39858548250276e-05, + "loss": 0.3871, "step": 55720 }, { - "epoch": 1.96, - "learning_rate": 3.469963946323709e-05, - "loss": 0.2814, + "epoch": 
2.008325224348578, + "grad_norm": 0.18736940622329712, + "learning_rate": 3.398313164776968e-05, + "loss": 0.3906, "step": 55725 }, { - "epoch": 1.96, - "learning_rate": 3.469701383227712e-05, - "loss": 0.2787, + "epoch": 2.008505424009803, + "grad_norm": 0.17324689030647278, + "learning_rate": 3.3980408348118337e-05, + "loss": 0.4141, "step": 55730 }, { - "epoch": 1.96, - "learning_rate": 3.469438807540781e-05, - "loss": 0.2709, + "epoch": 2.0086856236710275, + "grad_norm": 0.21437758207321167, + "learning_rate": 3.397768492611068e-05, + "loss": 0.3743, "step": 55735 }, { - "epoch": 1.96, - "learning_rate": 3.469176219266326e-05, - "loss": 0.2839, + "epoch": 2.0088658233322523, + "grad_norm": 0.20862148702144623, + "learning_rate": 3.397496138178379e-05, + "loss": 0.3897, "step": 55740 }, { - "epoch": 1.96, - "learning_rate": 3.468913618407758e-05, - "loss": 0.2847, + "epoch": 2.0090460229934766, + "grad_norm": 0.2141301929950714, + "learning_rate": 3.397223771517479e-05, + "loss": 0.4016, "step": 55745 }, { - "epoch": 1.96, - "learning_rate": 3.468651004968484e-05, - "loss": 0.2982, + "epoch": 2.0092262226547013, + "grad_norm": 0.21781469881534576, + "learning_rate": 3.39695139263208e-05, + "loss": 0.4502, "step": 55750 }, { - "epoch": 1.96, - "learning_rate": 3.468388378951915e-05, - "loss": 0.3017, + "epoch": 2.009406422315926, + "grad_norm": 0.1999913901090622, + "learning_rate": 3.39667900152589e-05, + "loss": 0.4362, "step": 55755 }, { - "epoch": 1.96, - "learning_rate": 3.468125740361461e-05, - "loss": 0.3006, + "epoch": 2.0095866219771508, + "grad_norm": 0.17094168066978455, + "learning_rate": 3.396406598202624e-05, + "loss": 0.4162, "step": 55760 }, { - "epoch": 1.96, - "learning_rate": 3.4678630892005334e-05, - "loss": 0.2897, + "epoch": 2.0097668216383755, + "grad_norm": 0.19397181272506714, + "learning_rate": 3.39613418266599e-05, + "loss": 0.4285, "step": 55765 }, { - "epoch": 1.96, - "learning_rate": 3.46760042547254e-05, - "loss": 0.3039, + "epoch": 2.0099470212996, + "grad_norm": 0.1974741369485855, + "learning_rate": 3.3958617549197024e-05, + "loss": 0.3795, "step": 55770 }, { - "epoch": 1.96, - "learning_rate": 3.4673377491808935e-05, - "loss": 0.2681, + "epoch": 2.0101272209608245, + "grad_norm": 0.16916494071483612, + "learning_rate": 3.395589314967472e-05, + "loss": 0.4219, "step": 55775 }, { - "epoch": 1.96, - "learning_rate": 3.467075060329004e-05, - "loss": 0.2855, + "epoch": 2.0103074206220493, + "grad_norm": 0.1723833829164505, + "learning_rate": 3.39531686281301e-05, + "loss": 0.3858, "step": 55780 }, { - "epoch": 1.96, - "learning_rate": 3.466812358920282e-05, - "loss": 0.2613, + "epoch": 2.010487620283274, + "grad_norm": 0.19679874181747437, + "learning_rate": 3.39504439846003e-05, + "loss": 0.4087, "step": 55785 }, { - "epoch": 1.96, - "learning_rate": 3.466549644958138e-05, - "loss": 0.2904, + "epoch": 2.0106678199444987, + "grad_norm": 0.18012060225009918, + "learning_rate": 3.3947719219122415e-05, + "loss": 0.4004, "step": 55790 }, { - "epoch": 1.96, - "learning_rate": 3.466286918445985e-05, - "loss": 0.2838, + "epoch": 2.010848019605723, + "grad_norm": 0.190474733710289, + "learning_rate": 3.39449943317336e-05, + "loss": 0.3732, "step": 55795 }, { - "epoch": 1.96, - "learning_rate": 3.466024179387232e-05, - "loss": 0.2562, + "epoch": 2.0110282192669477, + "grad_norm": 0.19163033366203308, + "learning_rate": 3.394226932247097e-05, + "loss": 0.4261, "step": 55800 }, { - "epoch": 1.96, - "learning_rate": 3.4657614277852926e-05, - "loss": 0.2805, + "epoch": 
2.0112084189281725, + "grad_norm": 0.18268360197544098, + "learning_rate": 3.3939544191371654e-05, + "loss": 0.3969, "step": 55805 }, { - "epoch": 1.96, - "learning_rate": 3.465498663643576e-05, - "loss": 0.2851, + "epoch": 2.011388618589397, + "grad_norm": 0.1998191624879837, + "learning_rate": 3.393681893847277e-05, + "loss": 0.3728, "step": 55810 }, { - "epoch": 1.96, - "learning_rate": 3.465235886965498e-05, - "loss": 0.279, + "epoch": 2.0115688182506215, + "grad_norm": 0.20111285150051117, + "learning_rate": 3.393409356381147e-05, + "loss": 0.3964, "step": 55815 }, { - "epoch": 1.96, - "learning_rate": 3.464973097754466e-05, - "loss": 0.2746, + "epoch": 2.0117490179118462, + "grad_norm": 0.18400463461875916, + "learning_rate": 3.393136806742487e-05, + "loss": 0.3896, "step": 55820 }, { - "epoch": 1.96, - "learning_rate": 3.464710296013895e-05, - "loss": 0.2792, + "epoch": 2.011929217573071, + "grad_norm": 0.16936840116977692, + "learning_rate": 3.39286424493501e-05, + "loss": 0.3907, "step": 55825 }, { - "epoch": 1.96, - "learning_rate": 3.4644474817471964e-05, - "loss": 0.2919, + "epoch": 2.0121094172342957, + "grad_norm": 0.1799679696559906, + "learning_rate": 3.392591670962432e-05, + "loss": 0.3636, "step": 55830 }, { - "epoch": 1.96, - "learning_rate": 3.4641846549577836e-05, - "loss": 0.2743, + "epoch": 2.0122896168955204, + "grad_norm": 0.17350183427333832, + "learning_rate": 3.392319084828464e-05, + "loss": 0.3845, "step": 55835 }, { - "epoch": 1.96, - "learning_rate": 3.463921815649068e-05, - "loss": 0.2919, + "epoch": 2.0124698165567447, + "grad_norm": 0.18203559517860413, + "learning_rate": 3.3920464865368225e-05, + "loss": 0.4245, "step": 55840 }, { - "epoch": 1.96, - "learning_rate": 3.4636589638244635e-05, - "loss": 0.263, + "epoch": 2.0126500162179695, + "grad_norm": 0.18975740671157837, + "learning_rate": 3.3917738760912186e-05, + "loss": 0.3997, "step": 55845 }, { - "epoch": 1.96, - "learning_rate": 3.463396099487381e-05, - "loss": 0.3008, + "epoch": 2.012830215879194, + "grad_norm": 0.2037028819322586, + "learning_rate": 3.39150125349537e-05, + "loss": 0.3931, "step": 55850 }, { - "epoch": 1.97, - "learning_rate": 3.463133222641236e-05, - "loss": 0.2541, + "epoch": 2.013010415540419, + "grad_norm": 0.2006373107433319, + "learning_rate": 3.391228618752988e-05, + "loss": 0.3418, "step": 55855 }, { - "epoch": 1.97, - "learning_rate": 3.462870333289441e-05, - "loss": 0.2645, + "epoch": 2.013190615201643, + "grad_norm": 0.2143949270248413, + "learning_rate": 3.39095597186779e-05, + "loss": 0.3607, "step": 55860 }, { - "epoch": 1.97, - "learning_rate": 3.462607431435409e-05, - "loss": 0.283, + "epoch": 2.013370814862868, + "grad_norm": 0.18995679914951324, + "learning_rate": 3.390683312843489e-05, + "loss": 0.3774, "step": 55865 }, { - "epoch": 1.97, - "learning_rate": 3.462344517082553e-05, - "loss": 0.3032, + "epoch": 2.0135510145240927, + "grad_norm": 0.2137516885995865, + "learning_rate": 3.3904106416837994e-05, + "loss": 0.3916, "step": 55870 }, { - "epoch": 1.97, - "learning_rate": 3.462081590234289e-05, - "loss": 0.2615, + "epoch": 2.0137312141853174, + "grad_norm": 0.18241751194000244, + "learning_rate": 3.390137958392438e-05, + "loss": 0.3773, "step": 55875 }, { - "epoch": 1.97, - "learning_rate": 3.4618186508940283e-05, - "loss": 0.2786, + "epoch": 2.013911413846542, + "grad_norm": 0.19162051379680634, + "learning_rate": 3.3898652629731195e-05, + "loss": 0.3743, "step": 55880 }, { - "epoch": 1.97, - "learning_rate": 3.461555699065188e-05, - "loss": 0.2817, + 
"epoch": 2.0140916135077664, + "grad_norm": 0.17561465501785278, + "learning_rate": 3.389592555429558e-05, + "loss": 0.3987, "step": 55885 }, { - "epoch": 1.97, - "learning_rate": 3.461292734751179e-05, - "loss": 0.2885, + "epoch": 2.014271813168991, + "grad_norm": 0.20353040099143982, + "learning_rate": 3.389319835765471e-05, + "loss": 0.3807, "step": 55890 }, { - "epoch": 1.97, - "learning_rate": 3.461029757955419e-05, - "loss": 0.3076, + "epoch": 2.014452012830216, + "grad_norm": 0.1713477373123169, + "learning_rate": 3.389047103984573e-05, + "loss": 0.4177, "step": 55895 }, { - "epoch": 1.97, - "learning_rate": 3.4607667686813196e-05, - "loss": 0.2632, + "epoch": 2.0146322124914406, + "grad_norm": 0.18497739732265472, + "learning_rate": 3.388774360090581e-05, + "loss": 0.3734, "step": 55900 }, { - "epoch": 1.97, - "learning_rate": 3.460503766932297e-05, - "loss": 0.2721, + "epoch": 2.014812412152665, + "grad_norm": 0.1842721551656723, + "learning_rate": 3.38850160408721e-05, + "loss": 0.367, "step": 55905 }, { - "epoch": 1.97, - "learning_rate": 3.460240752711766e-05, - "loss": 0.2715, + "epoch": 2.0149926118138897, + "grad_norm": 0.14854677021503448, + "learning_rate": 3.3882288359781764e-05, + "loss": 0.4289, "step": 55910 }, { - "epoch": 1.97, - "learning_rate": 3.459977726023142e-05, - "loss": 0.2827, + "epoch": 2.0151728114751144, + "grad_norm": 0.2103215456008911, + "learning_rate": 3.387956055767197e-05, + "loss": 0.3974, "step": 55915 }, { - "epoch": 1.97, - "learning_rate": 3.45971468686984e-05, - "loss": 0.3024, + "epoch": 2.015353011136339, + "grad_norm": 0.2002989947795868, + "learning_rate": 3.387683263457989e-05, + "loss": 0.3743, "step": 55920 }, { - "epoch": 1.97, - "learning_rate": 3.459451635255275e-05, - "loss": 0.2799, + "epoch": 2.015533210797564, + "grad_norm": 0.1742120236158371, + "learning_rate": 3.3874104590542676e-05, + "loss": 0.386, "step": 55925 }, { - "epoch": 1.97, - "learning_rate": 3.459188571182863e-05, - "loss": 0.2953, + "epoch": 2.015713410458788, + "grad_norm": 0.19794981181621552, + "learning_rate": 3.38713764255975e-05, + "loss": 0.4048, "step": 55930 }, { - "epoch": 1.97, - "learning_rate": 3.458925494656019e-05, - "loss": 0.2639, + "epoch": 2.015893610120013, + "grad_norm": 0.21913039684295654, + "learning_rate": 3.3868648139781545e-05, + "loss": 0.4255, "step": 55935 }, { - "epoch": 1.97, - "learning_rate": 3.45866240567816e-05, - "loss": 0.2732, + "epoch": 2.0160738097812376, + "grad_norm": 0.19685392081737518, + "learning_rate": 3.3865919733131975e-05, + "loss": 0.3708, "step": 55940 }, { - "epoch": 1.97, - "learning_rate": 3.458399304252701e-05, - "loss": 0.2735, + "epoch": 2.0162540094424624, + "grad_norm": 0.23018737137317657, + "learning_rate": 3.386319120568597e-05, + "loss": 0.411, "step": 55945 }, { - "epoch": 1.97, - "learning_rate": 3.458136190383059e-05, - "loss": 0.266, + "epoch": 2.016434209103687, + "grad_norm": 0.196548730134964, + "learning_rate": 3.38604625574807e-05, + "loss": 0.3945, "step": 55950 }, { - "epoch": 1.97, - "learning_rate": 3.4578730640726496e-05, - "loss": 0.2605, + "epoch": 2.0166144087649114, + "grad_norm": 0.24264010787010193, + "learning_rate": 3.3857733788553335e-05, + "loss": 0.4077, "step": 55955 }, { - "epoch": 1.97, - "learning_rate": 3.4576099253248905e-05, - "loss": 0.2745, + "epoch": 2.016794608426136, + "grad_norm": 0.24170154333114624, + "learning_rate": 3.385500489894107e-05, + "loss": 0.3743, "step": 55960 }, { - "epoch": 1.97, - "learning_rate": 3.457346774143197e-05, - "loss": 0.2644, + 
"epoch": 2.016974808087361, + "grad_norm": 0.1775246411561966, + "learning_rate": 3.3852275888681076e-05, + "loss": 0.3975, "step": 55965 }, { - "epoch": 1.97, - "learning_rate": 3.457083610530987e-05, - "loss": 0.2862, + "epoch": 2.0171550077485856, + "grad_norm": 0.18977300822734833, + "learning_rate": 3.3849546757810536e-05, + "loss": 0.4093, "step": 55970 }, { - "epoch": 1.97, - "learning_rate": 3.4568204344916766e-05, - "loss": 0.2671, + "epoch": 2.01733520740981, + "grad_norm": 0.1697954684495926, + "learning_rate": 3.3846817506366625e-05, + "loss": 0.4306, "step": 55975 }, { - "epoch": 1.97, - "learning_rate": 3.4565572460286835e-05, - "loss": 0.2652, + "epoch": 2.0175154070710346, + "grad_norm": 0.20978444814682007, + "learning_rate": 3.384408813438655e-05, + "loss": 0.4178, "step": 55980 }, { - "epoch": 1.97, - "learning_rate": 3.456294045145425e-05, - "loss": 0.2871, + "epoch": 2.0176956067322593, + "grad_norm": 0.23438295722007751, + "learning_rate": 3.384135864190748e-05, + "loss": 0.4378, "step": 55985 }, { - "epoch": 1.97, - "learning_rate": 3.456030831845318e-05, - "loss": 0.2856, + "epoch": 2.017875806393484, + "grad_norm": 0.19239699840545654, + "learning_rate": 3.383862902896662e-05, + "loss": 0.3842, "step": 55990 }, { - "epoch": 1.97, - "learning_rate": 3.455767606131781e-05, - "loss": 0.3036, + "epoch": 2.018056006054709, + "grad_norm": 0.1987629383802414, + "learning_rate": 3.3835899295601146e-05, + "loss": 0.3805, "step": 55995 }, { - "epoch": 1.97, - "learning_rate": 3.455504368008232e-05, - "loss": 0.3041, + "epoch": 2.018236205715933, + "grad_norm": 0.19622477889060974, + "learning_rate": 3.383316944184825e-05, + "loss": 0.3615, "step": 56000 }, { - "epoch": 1.97, - "eval_loss": 0.27226722240448, - "eval_runtime": 10.5443, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 2.018236205715933, + "eval_loss": 0.4372096359729767, + "eval_runtime": 3.5364, + "eval_samples_per_second": 28.278, + "eval_steps_per_second": 7.069, "step": 56000 }, { - "epoch": 1.97, - "learning_rate": 3.4552411174780886e-05, - "loss": 0.285, + "epoch": 2.018416405377158, + "grad_norm": 0.1874462217092514, + "learning_rate": 3.383043946774514e-05, + "loss": 0.4029, "step": 56005 }, { - "epoch": 1.97, - "learning_rate": 3.4549778545447685e-05, - "loss": 0.3094, + "epoch": 2.0185966050383826, + "grad_norm": 0.1793491393327713, + "learning_rate": 3.382770937332899e-05, + "loss": 0.4104, "step": 56010 }, { - "epoch": 1.97, - "learning_rate": 3.45471457921169e-05, - "loss": 0.2706, + "epoch": 2.0187768046996073, + "grad_norm": 0.15393778681755066, + "learning_rate": 3.382497915863702e-05, + "loss": 0.4142, "step": 56015 }, { - "epoch": 1.97, - "learning_rate": 3.454451291482272e-05, - "loss": 0.268, + "epoch": 2.0189570043608316, + "grad_norm": 0.17542561888694763, + "learning_rate": 3.3822248823706406e-05, + "loss": 0.3852, "step": 56020 }, { - "epoch": 1.97, - "learning_rate": 3.454187991359933e-05, - "loss": 0.2826, + "epoch": 2.0191372040220563, + "grad_norm": 0.20072583854198456, + "learning_rate": 3.3819518368574365e-05, + "loss": 0.4079, "step": 56025 }, { - "epoch": 1.97, - "learning_rate": 3.453924678848093e-05, - "loss": 0.2784, + "epoch": 2.019317403683281, + "grad_norm": 0.24251943826675415, + "learning_rate": 3.38167877932781e-05, + "loss": 0.4273, "step": 56030 }, { - "epoch": 1.97, - "learning_rate": 3.4536613539501684e-05, - "loss": 0.2817, + "epoch": 2.019497603344506, + "grad_norm": 0.19765913486480713, + "learning_rate": 3.3814057097854804e-05, + 
"loss": 0.3785, "step": 56035 }, { - "epoch": 1.97, - "learning_rate": 3.453398016669579e-05, - "loss": 0.2681, + "epoch": 2.0196778030057305, + "grad_norm": 0.1533811390399933, + "learning_rate": 3.381132628234168e-05, + "loss": 0.3913, "step": 56040 }, { - "epoch": 1.97, - "learning_rate": 3.453134667009745e-05, - "loss": 0.3129, + "epoch": 2.019858002666955, + "grad_norm": 0.15910130739212036, + "learning_rate": 3.380859534677595e-05, + "loss": 0.3877, "step": 56045 }, { - "epoch": 1.97, - "learning_rate": 3.452871304974086e-05, - "loss": 0.292, + "epoch": 2.0200382023281795, + "grad_norm": 0.298079252243042, + "learning_rate": 3.380586429119481e-05, + "loss": 0.418, "step": 56050 }, { - "epoch": 1.97, - "learning_rate": 3.452607930566022e-05, - "loss": 0.2589, + "epoch": 2.0202184019894043, + "grad_norm": 0.22058376669883728, + "learning_rate": 3.380313311563548e-05, + "loss": 0.4053, "step": 56055 }, { - "epoch": 1.97, - "learning_rate": 3.45234454378897e-05, - "loss": 0.2759, + "epoch": 2.020398601650629, + "grad_norm": 0.2122090458869934, + "learning_rate": 3.380040182013515e-05, + "loss": 0.4353, "step": 56060 }, { - "epoch": 1.97, - "learning_rate": 3.452081144646353e-05, - "loss": 0.3165, + "epoch": 2.0205788013118537, + "grad_norm": 0.15873844921588898, + "learning_rate": 3.3797670404731066e-05, + "loss": 0.4032, "step": 56065 }, { - "epoch": 1.97, - "learning_rate": 3.451817733141589e-05, - "loss": 0.2685, + "epoch": 2.020759000973078, + "grad_norm": 0.24122844636440277, + "learning_rate": 3.3794938869460416e-05, + "loss": 0.3695, "step": 56070 }, { - "epoch": 1.97, - "learning_rate": 3.4515543092781e-05, - "loss": 0.29, + "epoch": 2.0209392006343028, + "grad_norm": 0.15981855988502502, + "learning_rate": 3.3792207214360434e-05, + "loss": 0.3844, "step": 56075 }, { - "epoch": 1.97, - "learning_rate": 3.4512908730593044e-05, - "loss": 0.2866, + "epoch": 2.0211194002955275, + "grad_norm": 0.19484391808509827, + "learning_rate": 3.3789475439468324e-05, + "loss": 0.4208, "step": 56080 }, { - "epoch": 1.97, - "learning_rate": 3.451027424488624e-05, - "loss": 0.2695, + "epoch": 2.0212995999567522, + "grad_norm": 0.21466252207756042, + "learning_rate": 3.378674354482132e-05, + "loss": 0.4127, "step": 56085 }, { - "epoch": 1.97, - "learning_rate": 3.450763963569479e-05, - "loss": 0.2717, + "epoch": 2.0214797996179765, + "grad_norm": 0.20310963690280914, + "learning_rate": 3.378401153045663e-05, + "loss": 0.4158, "step": 56090 }, { - "epoch": 1.97, - "learning_rate": 3.4505004903052915e-05, - "loss": 0.2972, + "epoch": 2.0216599992792013, + "grad_norm": 0.21700692176818848, + "learning_rate": 3.3781279396411484e-05, + "loss": 0.4001, "step": 56095 }, { - "epoch": 1.97, - "learning_rate": 3.4502370046994806e-05, - "loss": 0.2862, + "epoch": 2.021840198940426, + "grad_norm": 0.17587092518806458, + "learning_rate": 3.3778547142723104e-05, + "loss": 0.3782, "step": 56100 }, { - "epoch": 1.97, - "learning_rate": 3.4499735067554686e-05, - "loss": 0.2698, + "epoch": 2.0220203986016507, + "grad_norm": 0.2222931832075119, + "learning_rate": 3.3775814769428725e-05, + "loss": 0.388, "step": 56105 }, { - "epoch": 1.97, - "learning_rate": 3.449709996476677e-05, - "loss": 0.2861, + "epoch": 2.0222005982628755, + "grad_norm": 0.2034565955400467, + "learning_rate": 3.377308227656557e-05, + "loss": 0.4348, "step": 56110 }, { - "epoch": 1.97, - "learning_rate": 3.449446473866527e-05, - "loss": 0.2772, + "epoch": 2.0223807979240997, + "grad_norm": 0.17816239595413208, + "learning_rate": 
3.3770349664170865e-05, + "loss": 0.3529, "step": 56115 }, { - "epoch": 1.97, - "learning_rate": 3.4491829389284396e-05, - "loss": 0.2967, + "epoch": 2.0225609975853245, + "grad_norm": 0.18056336045265198, + "learning_rate": 3.376761693228184e-05, + "loss": 0.429, "step": 56120 }, { - "epoch": 1.97, - "learning_rate": 3.4489193916658374e-05, - "loss": 0.2691, + "epoch": 2.022741197246549, + "grad_norm": 0.21382421255111694, + "learning_rate": 3.3764884080935735e-05, + "loss": 0.3775, "step": 56125 }, { - "epoch": 1.97, - "learning_rate": 3.448655832082143e-05, - "loss": 0.2875, + "epoch": 2.022921396907774, + "grad_norm": 0.19596946239471436, + "learning_rate": 3.376215111016978e-05, + "loss": 0.4107, "step": 56130 }, { - "epoch": 1.97, - "learning_rate": 3.448392260180777e-05, - "loss": 0.3062, + "epoch": 2.0231015965689982, + "grad_norm": 0.1608893871307373, + "learning_rate": 3.375941802002122e-05, + "loss": 0.3723, "step": 56135 }, { - "epoch": 1.98, - "learning_rate": 3.448128675965163e-05, - "loss": 0.2866, + "epoch": 2.023281796230223, + "grad_norm": 0.22571824491024017, + "learning_rate": 3.3756684810527275e-05, + "loss": 0.4514, "step": 56140 }, { - "epoch": 1.98, - "learning_rate": 3.447865079438723e-05, - "loss": 0.295, + "epoch": 2.0234619958914477, + "grad_norm": 0.17446362972259521, + "learning_rate": 3.3753951481725196e-05, + "loss": 0.3783, "step": 56145 }, { - "epoch": 1.98, - "learning_rate": 3.447601470604879e-05, - "loss": 0.2987, + "epoch": 2.0236421955526724, + "grad_norm": 0.2468831092119217, + "learning_rate": 3.3751218033652225e-05, + "loss": 0.4116, "step": 56150 }, { - "epoch": 1.98, - "learning_rate": 3.447337849467055e-05, - "loss": 0.2784, + "epoch": 2.023822395213897, + "grad_norm": 0.22208808362483978, + "learning_rate": 3.374848446634561e-05, + "loss": 0.4228, "step": 56155 }, { - "epoch": 1.98, - "learning_rate": 3.447074216028673e-05, - "loss": 0.2711, + "epoch": 2.0240025948751215, + "grad_norm": 0.20974206924438477, + "learning_rate": 3.3745750779842575e-05, + "loss": 0.406, "step": 56160 }, { - "epoch": 1.98, - "learning_rate": 3.4468105702931565e-05, - "loss": 0.2909, + "epoch": 2.024182794536346, + "grad_norm": 0.14893017709255219, + "learning_rate": 3.374301697418039e-05, + "loss": 0.3712, "step": 56165 }, { - "epoch": 1.98, - "learning_rate": 3.446546912263929e-05, - "loss": 0.276, + "epoch": 2.024362994197571, + "grad_norm": 0.16727864742279053, + "learning_rate": 3.3740283049396285e-05, + "loss": 0.415, "step": 56170 }, { - "epoch": 1.98, - "learning_rate": 3.4462832419444135e-05, - "loss": 0.3056, + "epoch": 2.0245431938587957, + "grad_norm": 0.2555207908153534, + "learning_rate": 3.3737549005527514e-05, + "loss": 0.4133, "step": 56175 }, { - "epoch": 1.98, - "learning_rate": 3.446019559338034e-05, - "loss": 0.2793, + "epoch": 2.0247233935200204, + "grad_norm": 0.22973132133483887, + "learning_rate": 3.3734814842611326e-05, + "loss": 0.4124, "step": 56180 }, { - "epoch": 1.98, - "learning_rate": 3.445755864448213e-05, - "loss": 0.2806, + "epoch": 2.0249035931812447, + "grad_norm": 0.21145066618919373, + "learning_rate": 3.373208056068499e-05, + "loss": 0.3879, "step": 56185 }, { - "epoch": 1.98, - "learning_rate": 3.445492157278377e-05, - "loss": 0.288, + "epoch": 2.0250837928424694, + "grad_norm": 0.19409604370594025, + "learning_rate": 3.372934615978574e-05, + "loss": 0.3742, "step": 56190 }, { - "epoch": 1.98, - "learning_rate": 3.4452284378319465e-05, - "loss": 0.2845, + "epoch": 2.025263992503694, + "grad_norm": 0.21370406448841095, + 
"learning_rate": 3.372661163995084e-05, + "loss": 0.3909, "step": 56195 }, { - "epoch": 1.98, - "learning_rate": 3.444964706112349e-05, - "loss": 0.2675, + "epoch": 2.025444192164919, + "grad_norm": 0.22447529435157776, + "learning_rate": 3.372387700121754e-05, + "loss": 0.4121, "step": 56200 }, { - "epoch": 1.98, - "learning_rate": 3.444700962123007e-05, - "loss": 0.2838, + "epoch": 2.025624391826143, + "grad_norm": 0.22073820233345032, + "learning_rate": 3.372114224362311e-05, + "loss": 0.3848, "step": 56205 }, { - "epoch": 1.98, - "learning_rate": 3.444437205867346e-05, - "loss": 0.2862, + "epoch": 2.025804591487368, + "grad_norm": 0.18990257382392883, + "learning_rate": 3.3718407367204807e-05, + "loss": 0.3688, "step": 56210 }, { - "epoch": 1.98, - "learning_rate": 3.44417343734879e-05, - "loss": 0.2953, + "epoch": 2.0259847911485926, + "grad_norm": 0.172977477312088, + "learning_rate": 3.371567237199989e-05, + "loss": 0.3931, "step": 56215 }, { - "epoch": 1.98, - "learning_rate": 3.4439096565707647e-05, - "loss": 0.2968, + "epoch": 2.0261649908098174, + "grad_norm": 0.2128041833639145, + "learning_rate": 3.371293725804562e-05, + "loss": 0.4185, "step": 56220 }, { - "epoch": 1.98, - "learning_rate": 3.4436458635366935e-05, - "loss": 0.295, + "epoch": 2.026345190471042, + "grad_norm": 0.1763525903224945, + "learning_rate": 3.3710202025379276e-05, + "loss": 0.4008, "step": 56225 }, { - "epoch": 1.98, - "learning_rate": 3.4433820582500035e-05, - "loss": 0.2711, + "epoch": 2.0265253901322664, + "grad_norm": 0.19905276596546173, + "learning_rate": 3.370746667403811e-05, + "loss": 0.3789, "step": 56230 }, { - "epoch": 1.98, - "learning_rate": 3.4431182407141186e-05, - "loss": 0.2735, + "epoch": 2.026705589793491, + "grad_norm": 0.19667592644691467, + "learning_rate": 3.37047312040594e-05, + "loss": 0.3738, "step": 56235 }, { - "epoch": 1.98, - "learning_rate": 3.4428544109324656e-05, - "loss": 0.2823, + "epoch": 2.026885789454716, + "grad_norm": 0.19164909422397614, + "learning_rate": 3.37019956154804e-05, + "loss": 0.433, "step": 56240 }, { - "epoch": 1.98, - "learning_rate": 3.442590568908469e-05, - "loss": 0.2839, + "epoch": 2.0270659891159406, + "grad_norm": 0.1493864804506302, + "learning_rate": 3.369925990833841e-05, + "loss": 0.4096, "step": 56245 }, { - "epoch": 1.98, - "learning_rate": 3.442326714645555e-05, - "loss": 0.2897, + "epoch": 2.027246188777165, + "grad_norm": 0.18463338911533356, + "learning_rate": 3.3696524082670684e-05, + "loss": 0.3974, "step": 56250 }, { - "epoch": 1.98, - "learning_rate": 3.44206284814715e-05, - "loss": 0.2949, + "epoch": 2.0274263884383896, + "grad_norm": 0.22145789861679077, + "learning_rate": 3.369378813851449e-05, + "loss": 0.3888, "step": 56255 }, { - "epoch": 1.98, - "learning_rate": 3.441798969416679e-05, - "loss": 0.2782, + "epoch": 2.0276065880996144, + "grad_norm": 0.1729925125837326, + "learning_rate": 3.369105207590713e-05, + "loss": 0.4284, "step": 56260 }, { - "epoch": 1.98, - "learning_rate": 3.44153507845757e-05, - "loss": 0.2896, + "epoch": 2.027786787760839, + "grad_norm": 0.1994604617357254, + "learning_rate": 3.368831589488586e-05, + "loss": 0.4122, "step": 56265 }, { - "epoch": 1.98, - "learning_rate": 3.441271175273248e-05, - "loss": 0.2937, + "epoch": 2.027966987422064, + "grad_norm": 0.16139256954193115, + "learning_rate": 3.3685579595487985e-05, + "loss": 0.3947, "step": 56270 }, { - "epoch": 1.98, - "learning_rate": 3.44100725986714e-05, - "loss": 0.2904, + "epoch": 2.028147187083288, + "grad_norm": 0.1860417276620865, + 
"learning_rate": 3.3682843177750745e-05, + "loss": 0.4223, "step": 56275 }, { - "epoch": 1.98, - "learning_rate": 3.440743332242673e-05, - "loss": 0.3106, + "epoch": 2.028327386744513, + "grad_norm": 0.1829526275396347, + "learning_rate": 3.368010664171146e-05, + "loss": 0.3896, "step": 56280 }, { - "epoch": 1.98, - "learning_rate": 3.440479392403274e-05, - "loss": 0.2914, + "epoch": 2.0285075864057376, + "grad_norm": 0.23350392282009125, + "learning_rate": 3.36773699874074e-05, + "loss": 0.4205, "step": 56285 }, { - "epoch": 1.98, - "learning_rate": 3.440215440352369e-05, - "loss": 0.2849, + "epoch": 2.0286877860669623, + "grad_norm": 0.18550008535385132, + "learning_rate": 3.3674633214875855e-05, + "loss": 0.4167, "step": 56290 }, { - "epoch": 1.98, - "learning_rate": 3.439951476093387e-05, - "loss": 0.3019, + "epoch": 2.028867985728187, + "grad_norm": 0.2336433231830597, + "learning_rate": 3.367189632415411e-05, + "loss": 0.3576, "step": 56295 }, { - "epoch": 1.98, - "learning_rate": 3.4396874996297545e-05, - "loss": 0.2609, + "epoch": 2.0290481853894113, + "grad_norm": 0.18545769155025482, + "learning_rate": 3.366915931527946e-05, + "loss": 0.4111, "step": 56300 }, { - "epoch": 1.98, - "learning_rate": 3.4394235109649e-05, - "loss": 0.3112, + "epoch": 2.029228385050636, + "grad_norm": 0.1865958422422409, + "learning_rate": 3.3666969623134705e-05, + "loss": 0.4044, "step": 56305 }, { - "epoch": 1.98, - "learning_rate": 3.4391595101022486e-05, - "loss": 0.2749, + "epoch": 2.029408584711861, + "grad_norm": 0.202582448720932, + "learning_rate": 3.366423240167879e-05, + "loss": 0.3743, "step": 56310 }, { - "epoch": 1.98, - "learning_rate": 3.43889549704523e-05, - "loss": 0.272, + "epoch": 2.0295887843730855, + "grad_norm": 0.19475452601909637, + "learning_rate": 3.366149506217439e-05, + "loss": 0.419, "step": 56315 }, { - "epoch": 1.98, - "learning_rate": 3.438631471797273e-05, - "loss": 0.2844, + "epoch": 2.02976898403431, + "grad_norm": 0.19832675158977509, + "learning_rate": 3.36587576046588e-05, + "loss": 0.387, "step": 56320 }, { - "epoch": 1.98, - "learning_rate": 3.438367434361805e-05, - "loss": 0.2786, + "epoch": 2.0299491836955346, + "grad_norm": 0.19266216456890106, + "learning_rate": 3.3656020029169306e-05, + "loss": 0.4015, "step": 56325 }, { - "epoch": 1.98, - "learning_rate": 3.438103384742254e-05, - "loss": 0.2764, + "epoch": 2.0301293833567593, + "grad_norm": 0.19489936530590057, + "learning_rate": 3.365328233574322e-05, + "loss": 0.4126, "step": 56330 }, { - "epoch": 1.98, - "learning_rate": 3.437839322942049e-05, - "loss": 0.3176, + "epoch": 2.030309583017984, + "grad_norm": 0.1996261477470398, + "learning_rate": 3.365054452441783e-05, + "loss": 0.4134, "step": 56335 }, { - "epoch": 1.98, - "learning_rate": 3.437575248964618e-05, - "loss": 0.2639, + "epoch": 2.0304897826792088, + "grad_norm": 0.1988709717988968, + "learning_rate": 3.364780659523046e-05, + "loss": 0.4044, "step": 56340 }, { - "epoch": 1.98, - "learning_rate": 3.4373111628133906e-05, - "loss": 0.287, + "epoch": 2.030669982340433, + "grad_norm": 0.21509073674678802, + "learning_rate": 3.3645068548218385e-05, + "loss": 0.3803, "step": 56345 }, { - "epoch": 1.98, - "learning_rate": 3.437047064491795e-05, - "loss": 0.289, + "epoch": 2.030850182001658, + "grad_norm": 0.20546337962150574, + "learning_rate": 3.364233038341893e-05, + "loss": 0.3971, "step": 56350 }, { - "epoch": 1.98, - "learning_rate": 3.436782954003261e-05, - "loss": 0.2841, + "epoch": 2.0310303816628825, + "grad_norm": 0.20153018832206726, + 
"learning_rate": 3.36395921008694e-05, + "loss": 0.3793, "step": 56355 }, { - "epoch": 1.98, - "learning_rate": 3.436518831351218e-05, - "loss": 0.2736, + "epoch": 2.0312105813241073, + "grad_norm": 0.23244844377040863, + "learning_rate": 3.3636853700607095e-05, + "loss": 0.3696, "step": 56360 }, { - "epoch": 1.98, - "learning_rate": 3.436254696539095e-05, - "loss": 0.2837, + "epoch": 2.0313907809853315, + "grad_norm": 0.20720942318439484, + "learning_rate": 3.3634115182669335e-05, + "loss": 0.3894, "step": 56365 }, { - "epoch": 1.98, - "learning_rate": 3.435990549570321e-05, - "loss": 0.2807, + "epoch": 2.0315709806465563, + "grad_norm": 0.19618476927280426, + "learning_rate": 3.363137654709342e-05, + "loss": 0.4301, "step": 56370 }, { - "epoch": 1.98, - "learning_rate": 3.435726390448328e-05, - "loss": 0.2494, + "epoch": 2.031751180307781, + "grad_norm": 0.1612580120563507, + "learning_rate": 3.362863779391668e-05, + "loss": 0.3662, "step": 56375 }, { - "epoch": 1.98, - "learning_rate": 3.435462219176543e-05, - "loss": 0.2607, + "epoch": 2.0319313799690057, + "grad_norm": 0.14148502051830292, + "learning_rate": 3.362589892317641e-05, + "loss": 0.3881, "step": 56380 }, { - "epoch": 1.98, - "learning_rate": 3.435198035758399e-05, - "loss": 0.2763, + "epoch": 2.0321115796302305, + "grad_norm": 0.19896137714385986, + "learning_rate": 3.3623159934909954e-05, + "loss": 0.3847, "step": 56385 }, { - "epoch": 1.98, - "learning_rate": 3.434933840197324e-05, - "loss": 0.3139, + "epoch": 2.0322917792914548, + "grad_norm": 0.16365410387516022, + "learning_rate": 3.3620420829154605e-05, + "loss": 0.3822, "step": 56390 }, { - "epoch": 1.98, - "learning_rate": 3.4346696324967494e-05, - "loss": 0.2886, + "epoch": 2.0324719789526795, + "grad_norm": 0.1631069928407669, + "learning_rate": 3.361768160594768e-05, + "loss": 0.4095, "step": 56395 }, { - "epoch": 1.98, - "learning_rate": 3.434405412660106e-05, - "loss": 0.2772, + "epoch": 2.0326521786139042, + "grad_norm": 0.22321099042892456, + "learning_rate": 3.361494226532653e-05, + "loss": 0.3992, "step": 56400 }, { - "epoch": 1.98, - "learning_rate": 3.434141180690823e-05, - "loss": 0.2993, + "epoch": 2.032832378275129, + "grad_norm": 0.2479621022939682, + "learning_rate": 3.361220280732845e-05, + "loss": 0.3764, "step": 56405 }, { - "epoch": 1.98, - "learning_rate": 3.4338769365923335e-05, - "loss": 0.3046, + "epoch": 2.0330125779363533, + "grad_norm": 0.2009827196598053, + "learning_rate": 3.3609463231990774e-05, + "loss": 0.4009, "step": 56410 }, { - "epoch": 1.98, - "learning_rate": 3.4336126803680665e-05, - "loss": 0.2795, + "epoch": 2.033192777597578, + "grad_norm": 0.20739832520484924, + "learning_rate": 3.3606723539350825e-05, + "loss": 0.3841, "step": 56415 }, { - "epoch": 1.99, - "learning_rate": 3.433348412021456e-05, - "loss": 0.2812, + "epoch": 2.0333729772588027, + "grad_norm": 0.2028648406267166, + "learning_rate": 3.360398372944594e-05, + "loss": 0.4247, "step": 56420 }, { - "epoch": 1.99, - "learning_rate": 3.433084131555929e-05, - "loss": 0.2734, + "epoch": 2.0335531769200275, + "grad_norm": 0.19367647171020508, + "learning_rate": 3.360124380231344e-05, + "loss": 0.381, "step": 56425 }, { - "epoch": 1.99, - "learning_rate": 3.43281983897492e-05, - "loss": 0.2761, + "epoch": 2.033733376581252, + "grad_norm": 0.1919804960489273, + "learning_rate": 3.3598503757990664e-05, + "loss": 0.4649, "step": 56430 }, { - "epoch": 1.99, - "learning_rate": 3.4325555342818614e-05, - "loss": 0.2884, + "epoch": 2.0339135762424765, + "grad_norm": 
0.17203310132026672, + "learning_rate": 3.359576359651493e-05, + "loss": 0.3987, "step": 56435 }, { - "epoch": 1.99, - "learning_rate": 3.432291217480183e-05, - "loss": 0.2656, + "epoch": 2.034093775903701, + "grad_norm": 0.23200438916683197, + "learning_rate": 3.359302331792358e-05, + "loss": 0.385, "step": 56440 }, { - "epoch": 1.99, - "learning_rate": 3.4320268885733174e-05, - "loss": 0.2916, + "epoch": 2.034273975564926, + "grad_norm": 0.20147477090358734, + "learning_rate": 3.3590282922253966e-05, + "loss": 0.3981, "step": 56445 }, { - "epoch": 1.99, - "learning_rate": 3.431762547564698e-05, - "loss": 0.2979, + "epoch": 2.0344541752261507, + "grad_norm": 0.20963551104068756, + "learning_rate": 3.3587542409543396e-05, + "loss": 0.4105, "step": 56450 }, { - "epoch": 1.99, - "learning_rate": 3.431498194457755e-05, - "loss": 0.2739, + "epoch": 2.0346343748873754, + "grad_norm": 0.20766286551952362, + "learning_rate": 3.358480177982923e-05, + "loss": 0.4136, "step": 56455 }, { - "epoch": 1.99, - "learning_rate": 3.4312338292559214e-05, - "loss": 0.2616, + "epoch": 2.0348145745485997, + "grad_norm": 0.22245804965496063, + "learning_rate": 3.3582061033148795e-05, + "loss": 0.4043, "step": 56460 }, { - "epoch": 1.99, - "learning_rate": 3.430969451962631e-05, - "loss": 0.2783, + "epoch": 2.0349947742098244, + "grad_norm": 0.18696880340576172, + "learning_rate": 3.3579320169539445e-05, + "loss": 0.3749, "step": 56465 }, { - "epoch": 1.99, - "learning_rate": 3.430705062581316e-05, - "loss": 0.2749, + "epoch": 2.035174973871049, + "grad_norm": 0.19157637655735016, + "learning_rate": 3.3576579189038525e-05, + "loss": 0.4225, "step": 56470 }, { - "epoch": 1.99, - "learning_rate": 3.4304406611154085e-05, - "loss": 0.2792, + "epoch": 2.035355173532274, + "grad_norm": 0.18588903546333313, + "learning_rate": 3.3573838091683366e-05, + "loss": 0.3961, "step": 56475 }, { - "epoch": 1.99, - "learning_rate": 3.430176247568342e-05, - "loss": 0.2631, + "epoch": 2.035535373193498, + "grad_norm": 0.196426123380661, + "learning_rate": 3.3571096877511324e-05, + "loss": 0.4047, "step": 56480 }, { - "epoch": 1.99, - "learning_rate": 3.42991182194355e-05, - "loss": 0.2848, + "epoch": 2.035715572854723, + "grad_norm": 0.1676120012998581, + "learning_rate": 3.356835554655975e-05, + "loss": 0.4061, "step": 56485 }, { - "epoch": 1.99, - "learning_rate": 3.4296473842444663e-05, - "loss": 0.2854, + "epoch": 2.0358957725159477, + "grad_norm": 0.22126492857933044, + "learning_rate": 3.3565614098865985e-05, + "loss": 0.4079, "step": 56490 }, { - "epoch": 1.99, - "learning_rate": 3.429382934474524e-05, - "loss": 0.2827, + "epoch": 2.0360759721771724, + "grad_norm": 0.1804613471031189, + "learning_rate": 3.356287253446738e-05, + "loss": 0.3552, "step": 56495 }, { - "epoch": 1.99, - "learning_rate": 3.429118472637157e-05, - "loss": 0.2898, + "epoch": 2.036256171838397, + "grad_norm": 0.16997934877872467, + "learning_rate": 3.3560130853401306e-05, + "loss": 0.3867, "step": 56500 }, { - "epoch": 1.99, - "eval_loss": 0.2718130946159363, - "eval_runtime": 10.5492, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 2.036256171838397, + "eval_loss": 0.43729928135871887, + "eval_runtime": 3.5477, + "eval_samples_per_second": 28.187, + "eval_steps_per_second": 7.047, "step": 56500 }, { - "epoch": 1.99, - "learning_rate": 3.4288539987357984e-05, - "loss": 0.2907, + "epoch": 2.0364363714996214, + "grad_norm": 0.1884368509054184, + "learning_rate": 3.355738905570511e-05, + "loss": 0.3983, "step": 56505 }, { - 
"epoch": 1.99, - "learning_rate": 3.428589512773882e-05, - "loss": 0.2519, + "epoch": 2.036616571160846, + "grad_norm": 0.21548138558864594, + "learning_rate": 3.3554647141416136e-05, + "loss": 0.4085, "step": 56510 }, { - "epoch": 1.99, - "learning_rate": 3.428325014754844e-05, - "loss": 0.2855, + "epoch": 2.036796770822071, + "grad_norm": 0.23149925470352173, + "learning_rate": 3.355190511057175e-05, + "loss": 0.3907, "step": 56515 }, { - "epoch": 1.99, - "learning_rate": 3.4280605046821175e-05, - "loss": 0.2906, + "epoch": 2.0369769704832956, + "grad_norm": 0.18845608830451965, + "learning_rate": 3.354916296320931e-05, + "loss": 0.4065, "step": 56520 }, { - "epoch": 1.99, - "learning_rate": 3.4277959825591366e-05, - "loss": 0.2636, + "epoch": 2.03715717014452, + "grad_norm": 0.19569791853427887, + "learning_rate": 3.354642069936619e-05, + "loss": 0.3941, "step": 56525 }, { - "epoch": 1.99, - "learning_rate": 3.427531448389336e-05, - "loss": 0.2877, + "epoch": 2.0373373698057446, + "grad_norm": 0.24140161275863647, + "learning_rate": 3.354367831907974e-05, + "loss": 0.4019, "step": 56530 }, { - "epoch": 1.99, - "learning_rate": 3.427266902176151e-05, - "loss": 0.2604, + "epoch": 2.0375175694669694, + "grad_norm": 0.15203697979450226, + "learning_rate": 3.354093582238732e-05, + "loss": 0.3785, "step": 56535 }, { - "epoch": 1.99, - "learning_rate": 3.4270023439230166e-05, - "loss": 0.2789, + "epoch": 2.037697769128194, + "grad_norm": 0.17884066700935364, + "learning_rate": 3.353819320932632e-05, + "loss": 0.3748, "step": 56540 }, { - "epoch": 1.99, - "learning_rate": 3.4267377736333676e-05, - "loss": 0.3074, + "epoch": 2.037877968789419, + "grad_norm": 0.20854458212852478, + "learning_rate": 3.353545047993408e-05, + "loss": 0.3853, "step": 56545 }, { - "epoch": 1.99, - "learning_rate": 3.42647319131064e-05, - "loss": 0.2577, + "epoch": 2.038058168450643, + "grad_norm": 0.20811249315738678, + "learning_rate": 3.3532707634247975e-05, + "loss": 0.3865, "step": 56550 }, { - "epoch": 1.99, - "learning_rate": 3.4262085969582674e-05, - "loss": 0.2948, + "epoch": 2.038238368111868, + "grad_norm": 0.1722935438156128, + "learning_rate": 3.352996467230538e-05, + "loss": 0.3549, "step": 56555 }, { - "epoch": 1.99, - "learning_rate": 3.425943990579687e-05, - "loss": 0.2721, + "epoch": 2.0384185677730926, + "grad_norm": 0.2165447473526001, + "learning_rate": 3.3527221594143675e-05, + "loss": 0.3848, "step": 56560 }, { - "epoch": 1.99, - "learning_rate": 3.4256793721783345e-05, - "loss": 0.3173, + "epoch": 2.0385987674343173, + "grad_norm": 0.221002995967865, + "learning_rate": 3.352447839980022e-05, + "loss": 0.3889, "step": 56565 }, { - "epoch": 1.99, - "learning_rate": 3.4254147417576445e-05, - "loss": 0.2765, + "epoch": 2.038778967095542, + "grad_norm": 0.2625134587287903, + "learning_rate": 3.35217350893124e-05, + "loss": 0.3828, "step": 56570 }, { - "epoch": 1.99, - "learning_rate": 3.425150099321055e-05, - "loss": 0.2899, + "epoch": 2.0389591667567664, + "grad_norm": 0.21043117344379425, + "learning_rate": 3.35189916627176e-05, + "loss": 0.3867, "step": 56575 }, { - "epoch": 1.99, - "learning_rate": 3.4248854448720004e-05, - "loss": 0.2773, + "epoch": 2.039139366417991, + "grad_norm": 0.19859947264194489, + "learning_rate": 3.351624812005317e-05, + "loss": 0.3922, "step": 56580 }, { - "epoch": 1.99, - "learning_rate": 3.424620778413918e-05, - "loss": 0.2734, + "epoch": 2.039319566079216, + "grad_norm": 0.15382780134677887, + "learning_rate": 3.351350446135651e-05, + "loss": 0.3614, "step": 56585 
}, { - "epoch": 1.99, - "learning_rate": 3.4243560999502445e-05, - "loss": 0.2944, + "epoch": 2.0394997657404406, + "grad_norm": 0.21880534291267395, + "learning_rate": 3.3510760686665014e-05, + "loss": 0.396, "step": 56590 }, { - "epoch": 1.99, - "learning_rate": 3.4240914094844154e-05, - "loss": 0.2746, + "epoch": 2.039679965401665, + "grad_norm": 0.21891264617443085, + "learning_rate": 3.3508016796016047e-05, + "loss": 0.392, "step": 56595 }, { - "epoch": 1.99, - "learning_rate": 3.4238267070198686e-05, - "loss": 0.2693, + "epoch": 2.0398601650628896, + "grad_norm": 0.20019477605819702, + "learning_rate": 3.350527278944699e-05, + "loss": 0.3729, "step": 56600 }, { - "epoch": 1.99, - "learning_rate": 3.4235619925600417e-05, - "loss": 0.2708, + "epoch": 2.0400403647241143, + "grad_norm": 0.18630538880825043, + "learning_rate": 3.350252866699525e-05, + "loss": 0.3704, "step": 56605 }, { - "epoch": 1.99, - "learning_rate": 3.42329726610837e-05, - "loss": 0.28, + "epoch": 2.040220564385339, + "grad_norm": 0.20929007232189178, + "learning_rate": 3.349978442869819e-05, + "loss": 0.4028, "step": 56610 }, { - "epoch": 1.99, - "learning_rate": 3.4230325276682924e-05, - "loss": 0.2826, + "epoch": 2.040400764046564, + "grad_norm": 0.18913328647613525, + "learning_rate": 3.3497040074593215e-05, + "loss": 0.3956, "step": 56615 }, { - "epoch": 1.99, - "learning_rate": 3.422767777243245e-05, - "loss": 0.294, + "epoch": 2.040580963707788, + "grad_norm": 0.1790267825126648, + "learning_rate": 3.349429560471772e-05, + "loss": 0.4009, "step": 56620 }, { - "epoch": 1.99, - "learning_rate": 3.4225030148366664e-05, - "loss": 0.2748, + "epoch": 2.040761163369013, + "grad_norm": 0.20545370876789093, + "learning_rate": 3.349155101910909e-05, + "loss": 0.3951, "step": 56625 }, { - "epoch": 1.99, - "learning_rate": 3.422238240451995e-05, - "loss": 0.2786, + "epoch": 2.0409413630302375, + "grad_norm": 0.21455180644989014, + "learning_rate": 3.348880631780472e-05, + "loss": 0.4135, "step": 56630 }, { - "epoch": 1.99, - "learning_rate": 3.4219734540926665e-05, - "loss": 0.2295, + "epoch": 2.0411215626914623, + "grad_norm": 0.19538956880569458, + "learning_rate": 3.3486061500842014e-05, + "loss": 0.4057, "step": 56635 }, { - "epoch": 1.99, - "learning_rate": 3.421708655762122e-05, - "loss": 0.2852, + "epoch": 2.0413017623526866, + "grad_norm": 0.19726307690143585, + "learning_rate": 3.348331656825835e-05, + "loss": 0.4318, "step": 56640 }, { - "epoch": 1.99, - "learning_rate": 3.421443845463797e-05, - "loss": 0.2934, + "epoch": 2.0414819620139113, + "grad_norm": 0.21314232051372528, + "learning_rate": 3.348057152009115e-05, + "loss": 0.4373, "step": 56645 }, { - "epoch": 1.99, - "learning_rate": 3.4211790232011305e-05, - "loss": 0.2506, + "epoch": 2.041662161675136, + "grad_norm": 0.19441983103752136, + "learning_rate": 3.347782635637781e-05, + "loss": 0.3647, "step": 56650 }, { - "epoch": 1.99, - "learning_rate": 3.4209141889775615e-05, - "loss": 0.265, + "epoch": 2.0418423613363608, + "grad_norm": 0.19044964015483856, + "learning_rate": 3.347508107715572e-05, + "loss": 0.3825, "step": 56655 }, { - "epoch": 1.99, - "learning_rate": 3.42064934279653e-05, - "loss": 0.3019, + "epoch": 2.0420225609975855, + "grad_norm": 0.19193421304225922, + "learning_rate": 3.347233568246228e-05, + "loss": 0.4275, "step": 56660 }, { - "epoch": 1.99, - "learning_rate": 3.4203844846614716e-05, - "loss": 0.2981, + "epoch": 2.04220276065881, + "grad_norm": 0.225186288356781, + "learning_rate": 3.3469590172334926e-05, + "loss": 0.3661, 
"step": 56665 }, { - "epoch": 1.99, - "learning_rate": 3.4201196145758286e-05, - "loss": 0.2878, + "epoch": 2.0423829603200345, + "grad_norm": 0.16008292138576508, + "learning_rate": 3.346684454681104e-05, + "loss": 0.355, "step": 56670 }, { - "epoch": 1.99, - "learning_rate": 3.419854732543038e-05, - "loss": 0.2791, + "epoch": 2.0425631599812593, + "grad_norm": 0.20448531210422516, + "learning_rate": 3.346409880592802e-05, + "loss": 0.3863, "step": 56675 }, { - "epoch": 1.99, - "learning_rate": 3.41958983856654e-05, - "loss": 0.2633, + "epoch": 2.042743359642484, + "grad_norm": 0.15298992395401, + "learning_rate": 3.346135294972331e-05, + "loss": 0.3827, "step": 56680 }, { - "epoch": 1.99, - "learning_rate": 3.4193249326497744e-05, - "loss": 0.2844, + "epoch": 2.0429235593037087, + "grad_norm": 0.1888943463563919, + "learning_rate": 3.34586069782343e-05, + "loss": 0.367, "step": 56685 }, { - "epoch": 1.99, - "learning_rate": 3.419060014796181e-05, - "loss": 0.3059, + "epoch": 2.043103758964933, + "grad_norm": 0.18571306765079498, + "learning_rate": 3.345586089149841e-05, + "loss": 0.3533, "step": 56690 }, { - "epoch": 1.99, - "learning_rate": 3.418795085009198e-05, - "loss": 0.2673, + "epoch": 2.0432839586261577, + "grad_norm": 0.2252894788980484, + "learning_rate": 3.345311468955305e-05, + "loss": 0.3834, "step": 56695 }, { - "epoch": 1.99, - "learning_rate": 3.418530143292267e-05, - "loss": 0.288, + "epoch": 2.0434641582873825, + "grad_norm": 0.20659992098808289, + "learning_rate": 3.3450368372435643e-05, + "loss": 0.4081, "step": 56700 }, { - "epoch": 2.0, - "learning_rate": 3.418265189648826e-05, - "loss": 0.2726, + "epoch": 2.043644357948607, + "grad_norm": 0.20119290053844452, + "learning_rate": 3.344762194018359e-05, + "loss": 0.3907, "step": 56705 }, { - "epoch": 2.0, - "learning_rate": 3.418000224082318e-05, - "loss": 0.2807, + "epoch": 2.0438245576098315, + "grad_norm": 0.17512202262878418, + "learning_rate": 3.344487539283434e-05, + "loss": 0.3676, "step": 56710 }, { - "epoch": 2.0, - "learning_rate": 3.417735246596182e-05, - "loss": 0.2939, + "epoch": 2.0440047572710562, + "grad_norm": 0.18519236147403717, + "learning_rate": 3.3442128730425295e-05, + "loss": 0.4245, "step": 56715 }, { - "epoch": 2.0, - "learning_rate": 3.417470257193859e-05, - "loss": 0.297, + "epoch": 2.044184956932281, + "grad_norm": 0.20480823516845703, + "learning_rate": 3.343938195299387e-05, + "loss": 0.4162, "step": 56720 }, { - "epoch": 2.0, - "learning_rate": 3.417205255878788e-05, - "loss": 0.2763, + "epoch": 2.0443651565935057, + "grad_norm": 0.2772522568702698, + "learning_rate": 3.3436635060577506e-05, + "loss": 0.3941, "step": 56725 }, { - "epoch": 2.0, - "learning_rate": 3.416940242654412e-05, - "loss": 0.2568, + "epoch": 2.0445453562547304, + "grad_norm": 0.19306902587413788, + "learning_rate": 3.3433888053213624e-05, + "loss": 0.4025, "step": 56730 }, { - "epoch": 2.0, - "learning_rate": 3.416675217524171e-05, - "loss": 0.2837, + "epoch": 2.0447255559159547, + "grad_norm": 0.22703427076339722, + "learning_rate": 3.3431140930939653e-05, + "loss": 0.3602, "step": 56735 }, { - "epoch": 2.0, - "learning_rate": 3.4164101804915064e-05, - "loss": 0.2868, + "epoch": 2.0449057555771795, + "grad_norm": 0.17691659927368164, + "learning_rate": 3.3428393693793006e-05, + "loss": 0.3831, "step": 56740 }, { - "epoch": 2.0, - "learning_rate": 3.41614513155986e-05, - "loss": 0.2832, + "epoch": 2.045085955238404, + "grad_norm": 0.2599165439605713, + "learning_rate": 3.342564634181114e-05, + "loss": 0.4334, 
"step": 56745 }, { - "epoch": 2.0, - "learning_rate": 3.4158800707326723e-05, - "loss": 0.2483, + "epoch": 2.045266154899629, + "grad_norm": 0.16780352592468262, + "learning_rate": 3.3422898875031475e-05, + "loss": 0.4114, "step": 56750 }, { - "epoch": 2.0, - "learning_rate": 3.4156680135084465e-05, - "loss": 0.2854, + "epoch": 2.045446354560853, + "grad_norm": 0.21082206070423126, + "learning_rate": 3.342015129349143e-05, + "loss": 0.3919, "step": 56755 }, { - "epoch": 2.0, - "learning_rate": 3.415402931277958e-05, - "loss": 0.2904, + "epoch": 2.045626554222078, + "grad_norm": 0.24321195483207703, + "learning_rate": 3.341740359722846e-05, + "loss": 0.4228, "step": 56760 }, { - "epoch": 2.0, - "learning_rate": 3.415137837161566e-05, - "loss": 0.2945, + "epoch": 2.0458067538833027, + "grad_norm": 0.189229354262352, + "learning_rate": 3.341465578628e-05, + "loss": 0.378, "step": 56765 }, { - "epoch": 2.0, - "learning_rate": 3.414872731162713e-05, - "loss": 0.2769, + "epoch": 2.0459869535445274, + "grad_norm": 0.2039814442396164, + "learning_rate": 3.341190786068349e-05, + "loss": 0.3869, "step": 56770 }, { - "epoch": 2.0, - "learning_rate": 3.4146076132848406e-05, - "loss": 0.2743, + "epoch": 2.046167153205752, + "grad_norm": 0.15500949323177338, + "learning_rate": 3.340915982047635e-05, + "loss": 0.3749, "step": 56775 }, { - "epoch": 2.0, - "learning_rate": 3.414342483531391e-05, - "loss": 0.2749, + "epoch": 2.0463473528669764, + "grad_norm": 0.21698758006095886, + "learning_rate": 3.340641166569604e-05, + "loss": 0.3944, "step": 56780 }, { - "epoch": 2.0, - "learning_rate": 3.414077341905806e-05, - "loss": 0.296, + "epoch": 2.046527552528201, + "grad_norm": 0.1889837086200714, + "learning_rate": 3.3403663396379997e-05, + "loss": 0.3515, "step": 56785 }, { - "epoch": 2.0, - "learning_rate": 3.41381218841153e-05, - "loss": 0.2932, + "epoch": 2.046707752189426, + "grad_norm": 0.1569356471300125, + "learning_rate": 3.340091501256567e-05, + "loss": 0.4297, "step": 56790 }, { - "epoch": 2.0, - "learning_rate": 3.413547023052004e-05, - "loss": 0.2935, + "epoch": 2.0468879518506506, + "grad_norm": 0.17683616280555725, + "learning_rate": 3.339816651429051e-05, + "loss": 0.4058, "step": 56795 }, { - "epoch": 2.0, - "learning_rate": 3.4132818458306735e-05, - "loss": 0.2933, + "epoch": 2.0470681515118754, + "grad_norm": 0.22128205001354218, + "learning_rate": 3.339541790159194e-05, + "loss": 0.429, "step": 56800 }, { - "epoch": 2.0, - "learning_rate": 3.4130166567509785e-05, - "loss": 0.284, + "epoch": 2.0472483511730997, + "grad_norm": 0.1818923056125641, + "learning_rate": 3.339266917450744e-05, + "loss": 0.4136, "step": 56805 }, { - "epoch": 2.0, - "learning_rate": 3.4127514558163656e-05, - "loss": 0.2524, + "epoch": 2.0474285508343244, + "grad_norm": 0.19361944496631622, + "learning_rate": 3.3389920333074434e-05, + "loss": 0.4062, "step": 56810 }, { - "epoch": 2.0, - "learning_rate": 3.412486243030275e-05, - "loss": 0.3027, + "epoch": 2.047608750495549, + "grad_norm": 0.1958514004945755, + "learning_rate": 3.33871713773304e-05, + "loss": 0.4092, "step": 56815 }, { - "epoch": 2.0, - "learning_rate": 3.412221018396152e-05, - "loss": 0.2836, + "epoch": 2.047788950156774, + "grad_norm": 0.2066863626241684, + "learning_rate": 3.338442230731278e-05, + "loss": 0.4023, "step": 56820 }, { - "epoch": 2.0, - "learning_rate": 3.4119557819174406e-05, - "loss": 0.2806, + "epoch": 2.047969149817998, + "grad_norm": 0.18415698409080505, + "learning_rate": 3.338167312305902e-05, + "loss": 0.3767, "step": 56825 }, { 
- "epoch": 2.0, - "learning_rate": 3.411690533597585e-05, - "loss": 0.2758, + "epoch": 2.048149349479223, + "grad_norm": 0.19590577483177185, + "learning_rate": 3.33789238246066e-05, + "loss": 0.3947, "step": 56830 }, { - "epoch": 2.0, - "learning_rate": 3.4114252734400274e-05, - "loss": 0.28, + "epoch": 2.0483295491404476, + "grad_norm": 0.17477436363697052, + "learning_rate": 3.337617441199295e-05, + "loss": 0.3764, "step": 56835 }, { - "epoch": 2.0, - "learning_rate": 3.411160001448214e-05, - "loss": 0.3095, + "epoch": 2.0485097488016724, + "grad_norm": 0.20771732926368713, + "learning_rate": 3.337342488525556e-05, + "loss": 0.437, "step": 56840 }, { - "epoch": 2.0, - "learning_rate": 3.4108947176255884e-05, - "loss": 0.3184, + "epoch": 2.048689948462897, + "grad_norm": 0.18982523679733276, + "learning_rate": 3.337067524443187e-05, + "loss": 0.4064, "step": 56845 }, { - "epoch": 2.0, - "learning_rate": 3.4106294219755955e-05, - "loss": 0.2491, + "epoch": 2.0488701481241214, + "grad_norm": 0.19457025825977325, + "learning_rate": 3.3367925489559346e-05, + "loss": 0.4211, "step": 56850 }, { - "epoch": 2.0, - "learning_rate": 3.410364114501679e-05, - "loss": 0.2714, + "epoch": 2.049050347785346, + "grad_norm": 0.17263978719711304, + "learning_rate": 3.336517562067546e-05, + "loss": 0.366, "step": 56855 }, { - "epoch": 2.0, - "learning_rate": 3.410098795207285e-05, - "loss": 0.2563, + "epoch": 2.049230547446571, + "grad_norm": 0.22003693878650665, + "learning_rate": 3.336242563781768e-05, + "loss": 0.4007, "step": 56860 }, { - "epoch": 2.0, - "learning_rate": 3.4098334640958584e-05, - "loss": 0.288, + "epoch": 2.0494107471077956, + "grad_norm": 0.24091579020023346, + "learning_rate": 3.3359675541023474e-05, + "loss": 0.4483, "step": 56865 }, { - "epoch": 2.0, - "learning_rate": 3.409568121170843e-05, - "loss": 0.2888, + "epoch": 2.04959094676902, + "grad_norm": 0.1734263151884079, + "learning_rate": 3.3356925330330294e-05, + "loss": 0.3737, "step": 56870 }, { - "epoch": 2.0, - "learning_rate": 3.409302766435685e-05, - "loss": 0.2673, + "epoch": 2.0497711464302446, + "grad_norm": 0.20855608582496643, + "learning_rate": 3.335417500577563e-05, + "loss": 0.4281, "step": 56875 }, { - "epoch": 2.0, - "learning_rate": 3.40903739989383e-05, - "loss": 0.2644, + "epoch": 2.0499513460914693, + "grad_norm": 0.2285163253545761, + "learning_rate": 3.335142456739695e-05, + "loss": 0.4231, "step": 56880 }, { - "epoch": 2.0, - "learning_rate": 3.408772021548724e-05, - "loss": 0.2681, + "epoch": 2.050131545752694, + "grad_norm": 0.17819838225841522, + "learning_rate": 3.334867401523173e-05, + "loss": 0.3831, "step": 56885 }, { - "epoch": 2.0, - "learning_rate": 3.408506631403811e-05, - "loss": 0.2682, + "epoch": 2.050311745413919, + "grad_norm": 0.2181788682937622, + "learning_rate": 3.3345923349317436e-05, + "loss": 0.4211, "step": 56890 }, { - "epoch": 2.0, - "learning_rate": 3.408241229462539e-05, - "loss": 0.2903, + "epoch": 2.050491945075143, + "grad_norm": 0.18571996688842773, + "learning_rate": 3.334317256969156e-05, + "loss": 0.4292, "step": 56895 }, { - "epoch": 2.0, - "learning_rate": 3.4079758157283526e-05, - "loss": 0.2452, + "epoch": 2.050672144736368, + "grad_norm": 0.20971627533435822, + "learning_rate": 3.334042167639157e-05, + "loss": 0.4386, "step": 56900 }, { - "epoch": 2.0, - "learning_rate": 3.407710390204699e-05, - "loss": 0.272, + "epoch": 2.0508523443975926, + "grad_norm": 0.17739515006542206, + "learning_rate": 3.3337670669454945e-05, + "loss": 0.3931, "step": 56905 }, { - "epoch": 
2.0, - "learning_rate": 3.407444952895025e-05, - "loss": 0.2846, + "epoch": 2.0510325440588173, + "grad_norm": 0.24408304691314697, + "learning_rate": 3.3334919548919177e-05, + "loss": 0.3871, "step": 56910 }, { - "epoch": 2.0, - "learning_rate": 3.4071795038027755e-05, - "loss": 0.2791, + "epoch": 2.0512127437200416, + "grad_norm": 0.21893776953220367, + "learning_rate": 3.333216831482174e-05, + "loss": 0.381, "step": 56915 }, { - "epoch": 2.0, - "learning_rate": 3.406914042931398e-05, - "loss": 0.267, + "epoch": 2.0513929433812663, + "grad_norm": 0.19919031858444214, + "learning_rate": 3.332941696720012e-05, + "loss": 0.432, "step": 56920 }, { - "epoch": 2.0, - "learning_rate": 3.4066485702843385e-05, - "loss": 0.2816, + "epoch": 2.051573143042491, + "grad_norm": 0.18088489770889282, + "learning_rate": 3.3326665506091816e-05, + "loss": 0.3544, "step": 56925 }, { - "epoch": 2.0, - "learning_rate": 3.406383085865046e-05, - "loss": 0.2655, + "epoch": 2.051753342703716, + "grad_norm": 0.19796021282672882, + "learning_rate": 3.332391393153429e-05, + "loss": 0.3834, "step": 56930 }, { - "epoch": 2.0, - "learning_rate": 3.406117589676966e-05, - "loss": 0.2744, + "epoch": 2.0519335423649405, + "grad_norm": 0.15804626047611237, + "learning_rate": 3.3321162243565074e-05, + "loss": 0.351, "step": 56935 }, { - "epoch": 2.0, - "learning_rate": 3.405852081723547e-05, - "loss": 0.2504, + "epoch": 2.052113742026165, + "grad_norm": 0.18689633905887604, + "learning_rate": 3.3318410442221614e-05, + "loss": 0.4289, "step": 56940 }, { - "epoch": 2.0, - "learning_rate": 3.4055865620082346e-05, - "loss": 0.2855, + "epoch": 2.0522939416873895, + "grad_norm": 0.18491008877754211, + "learning_rate": 3.3315658527541425e-05, + "loss": 0.3782, "step": 56945 }, { - "epoch": 2.0, - "learning_rate": 3.405321030534479e-05, - "loss": 0.2821, + "epoch": 2.0524741413486143, + "grad_norm": 0.29741668701171875, + "learning_rate": 3.3312906499562e-05, + "loss": 0.4184, "step": 56950 }, { - "epoch": 2.0, - "learning_rate": 3.4050554873057245e-05, - "loss": 0.2763, + "epoch": 2.052654341009839, + "grad_norm": 0.20080535113811493, + "learning_rate": 3.331015435832084e-05, + "loss": 0.3833, "step": 56955 }, { - "epoch": 2.0, - "learning_rate": 3.404789932325422e-05, - "loss": 0.2822, + "epoch": 2.0528345406710637, + "grad_norm": 0.16579574346542358, + "learning_rate": 3.3307402103855426e-05, + "loss": 0.414, "step": 56960 }, { - "epoch": 2.0, - "learning_rate": 3.404524365597018e-05, - "loss": 0.2653, + "epoch": 2.053014740332288, + "grad_norm": 0.2213381975889206, + "learning_rate": 3.330464973620327e-05, + "loss": 0.3911, "step": 56965 }, { - "epoch": 2.0, - "learning_rate": 3.404258787123962e-05, - "loss": 0.2609, + "epoch": 2.0531949399935128, + "grad_norm": 0.1835223287343979, + "learning_rate": 3.330189725540187e-05, + "loss": 0.4325, "step": 56970 }, { - "epoch": 2.0, - "learning_rate": 3.4039931969097006e-05, - "loss": 0.2755, + "epoch": 2.0533751396547375, + "grad_norm": 0.18675781786441803, + "learning_rate": 3.329914466148872e-05, + "loss": 0.4228, "step": 56975 }, { - "epoch": 2.0, - "learning_rate": 3.403727594957684e-05, - "loss": 0.2875, + "epoch": 2.0535553393159622, + "grad_norm": 0.26438936591148376, + "learning_rate": 3.329639195450135e-05, + "loss": 0.4218, "step": 56980 }, { - "epoch": 2.0, - "learning_rate": 3.40346198127136e-05, - "loss": 0.2825, + "epoch": 2.0537355389771865, + "grad_norm": 0.1843392252922058, + "learning_rate": 3.329363913447723e-05, + "loss": 0.3871, "step": 56985 }, { - "epoch": 2.01, - 
"learning_rate": 3.403196355854178e-05, - "loss": 0.2678, + "epoch": 2.0539157386384113, + "grad_norm": 0.1797383427619934, + "learning_rate": 3.329088620145389e-05, + "loss": 0.3869, "step": 56990 }, { - "epoch": 2.01, - "learning_rate": 3.402930718709586e-05, - "loss": 0.2585, + "epoch": 2.054095938299636, + "grad_norm": 0.19929052889347076, + "learning_rate": 3.3288133155468826e-05, + "loss": 0.4024, "step": 56995 }, { - "epoch": 2.01, - "learning_rate": 3.402665069841034e-05, - "loss": 0.2683, + "epoch": 2.0542761379608607, + "grad_norm": 0.2309020608663559, + "learning_rate": 3.328537999655955e-05, + "loss": 0.394, "step": 57000 }, { - "epoch": 2.01, - "eval_loss": 0.271958589553833, - "eval_runtime": 10.5499, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 2.0542761379608607, + "eval_loss": 0.43758705258369446, + "eval_runtime": 3.5257, + "eval_samples_per_second": 28.363, + "eval_steps_per_second": 7.091, "step": 57000 }, { - "epoch": 2.01, - "learning_rate": 3.402399409251971e-05, - "loss": 0.2793, + "epoch": 2.0544563376220855, + "grad_norm": 0.1692163497209549, + "learning_rate": 3.328262672476358e-05, + "loss": 0.3992, "step": 57005 }, { - "epoch": 2.01, - "learning_rate": 3.402133736945846e-05, - "loss": 0.2684, + "epoch": 2.0546365372833097, + "grad_norm": 0.2034154087305069, + "learning_rate": 3.327987334011842e-05, + "loss": 0.4069, "step": 57010 }, { - "epoch": 2.01, - "learning_rate": 3.401868052926108e-05, - "loss": 0.2791, + "epoch": 2.0548167369445345, + "grad_norm": 0.20764844119548798, + "learning_rate": 3.32771198426616e-05, + "loss": 0.3666, "step": 57015 }, { - "epoch": 2.01, - "learning_rate": 3.40160235719621e-05, - "loss": 0.2628, + "epoch": 2.054996936605759, + "grad_norm": 0.1821412444114685, + "learning_rate": 3.327436623243061e-05, + "loss": 0.4083, "step": 57020 }, { - "epoch": 2.01, - "learning_rate": 3.401336649759598e-05, - "loss": 0.2844, + "epoch": 2.055177136266984, + "grad_norm": 0.17971596121788025, + "learning_rate": 3.3271612509463e-05, + "loss": 0.4364, "step": 57025 }, { - "epoch": 2.01, - "learning_rate": 3.401070930619724e-05, - "loss": 0.2726, + "epoch": 2.0553573359282082, + "grad_norm": 0.17184744775295258, + "learning_rate": 3.326885867379625e-05, + "loss": 0.3749, "step": 57030 }, { - "epoch": 2.01, - "learning_rate": 3.400805199780037e-05, - "loss": 0.2613, + "epoch": 2.055537535589433, + "grad_norm": 0.20721890032291412, + "learning_rate": 3.326610472546792e-05, + "loss": 0.3955, "step": 57035 }, { - "epoch": 2.01, - "learning_rate": 3.400539457243989e-05, - "loss": 0.2656, + "epoch": 2.0557177352506577, + "grad_norm": 0.1466008871793747, + "learning_rate": 3.32633506645155e-05, + "loss": 0.4099, "step": 57040 }, { - "epoch": 2.01, - "learning_rate": 3.4002737030150294e-05, - "loss": 0.2687, + "epoch": 2.0558979349118824, + "grad_norm": 0.1782764494419098, + "learning_rate": 3.326059649097652e-05, + "loss": 0.4146, "step": 57045 }, { - "epoch": 2.01, - "learning_rate": 3.400007937096609e-05, - "loss": 0.2662, + "epoch": 2.056078134573107, + "grad_norm": 0.21149560809135437, + "learning_rate": 3.3257842204888536e-05, + "loss": 0.4045, "step": 57050 }, { - "epoch": 2.01, - "learning_rate": 3.3997421594921795e-05, - "loss": 0.2857, + "epoch": 2.0562583342343315, + "grad_norm": 0.1797577291727066, + "learning_rate": 3.325508780628903e-05, + "loss": 0.356, "step": 57055 }, { - "epoch": 2.01, - "learning_rate": 3.39947637020519e-05, - "loss": 0.2797, + "epoch": 2.056438533895556, + "grad_norm": 
0.17433103919029236, + "learning_rate": 3.325233329521557e-05, + "loss": 0.4083, "step": 57060 }, { - "epoch": 2.01, - "learning_rate": 3.399210569239093e-05, - "loss": 0.2614, + "epoch": 2.056618733556781, + "grad_norm": 0.24138209223747253, + "learning_rate": 3.324957867170565e-05, + "loss": 0.3696, "step": 57065 }, { - "epoch": 2.01, - "learning_rate": 3.398944756597339e-05, - "loss": 0.282, + "epoch": 2.0567989332180057, + "grad_norm": 0.17933684587478638, + "learning_rate": 3.324682393579682e-05, + "loss": 0.4377, "step": 57070 }, { - "epoch": 2.01, - "learning_rate": 3.398678932283381e-05, - "loss": 0.2605, + "epoch": 2.0569791328792304, + "grad_norm": 0.21966612339019775, + "learning_rate": 3.324406908752662e-05, + "loss": 0.3704, "step": 57075 }, { - "epoch": 2.01, - "learning_rate": 3.398413096300668e-05, - "loss": 0.2592, + "epoch": 2.0571593325404547, + "grad_norm": 0.21100081503391266, + "learning_rate": 3.324131412693257e-05, + "loss": 0.382, "step": 57080 }, { - "epoch": 2.01, - "learning_rate": 3.398147248652653e-05, - "loss": 0.2464, + "epoch": 2.0573395322016794, + "grad_norm": 0.17946301400661469, + "learning_rate": 3.3238559054052206e-05, + "loss": 0.3886, "step": 57085 }, { - "epoch": 2.01, - "learning_rate": 3.3978813893427886e-05, - "loss": 0.2636, + "epoch": 2.057519731862904, + "grad_norm": 0.19525077939033508, + "learning_rate": 3.3235803868923077e-05, + "loss": 0.3696, "step": 57090 }, { - "epoch": 2.01, - "learning_rate": 3.397615518374526e-05, - "loss": 0.2526, + "epoch": 2.057699931524129, + "grad_norm": 0.22821834683418274, + "learning_rate": 3.3233048571582715e-05, + "loss": 0.4039, "step": 57095 }, { - "epoch": 2.01, - "learning_rate": 3.397349635751317e-05, - "loss": 0.2737, + "epoch": 2.057880131185353, + "grad_norm": 0.23264740407466888, + "learning_rate": 3.3230293162068655e-05, + "loss": 0.406, "step": 57100 }, { - "epoch": 2.01, - "learning_rate": 3.397083741476614e-05, - "loss": 0.2877, + "epoch": 2.058060330846578, + "grad_norm": 0.22399888932704926, + "learning_rate": 3.3227537640418445e-05, + "loss": 0.3937, "step": 57105 }, { - "epoch": 2.01, - "learning_rate": 3.396817835553871e-05, - "loss": 0.2706, + "epoch": 2.0582405305078026, + "grad_norm": 0.18442541360855103, + "learning_rate": 3.3224782006669624e-05, + "loss": 0.412, "step": 57110 }, { - "epoch": 2.01, - "learning_rate": 3.396551917986539e-05, - "loss": 0.2725, + "epoch": 2.0584207301690274, + "grad_norm": 0.23033219575881958, + "learning_rate": 3.3222026260859747e-05, + "loss": 0.3833, "step": 57115 }, { - "epoch": 2.01, - "learning_rate": 3.39628598877807e-05, - "loss": 0.2833, + "epoch": 2.058600929830252, + "grad_norm": 0.18063490092754364, + "learning_rate": 3.3219270403026354e-05, + "loss": 0.3814, "step": 57120 }, { - "epoch": 2.01, - "learning_rate": 3.39602004793192e-05, - "loss": 0.2687, + "epoch": 2.0587811294914764, + "grad_norm": 0.17391562461853027, + "learning_rate": 3.321651443320699e-05, + "loss": 0.3861, "step": 57125 }, { - "epoch": 2.01, - "learning_rate": 3.395754095451539e-05, - "loss": 0.2735, + "epoch": 2.058961329152701, + "grad_norm": 0.21318939328193665, + "learning_rate": 3.32137583514392e-05, + "loss": 0.4278, "step": 57130 }, { - "epoch": 2.01, - "learning_rate": 3.395488131340382e-05, - "loss": 0.2931, + "epoch": 2.059141528813926, + "grad_norm": 0.17975524067878723, + "learning_rate": 3.3211002157760554e-05, + "loss": 0.3949, "step": 57135 }, { - "epoch": 2.01, - "learning_rate": 3.395222155601902e-05, - "loss": 0.2854, + "epoch": 2.0593217284751506, + 
"grad_norm": 0.18724536895751953, + "learning_rate": 3.32082458522086e-05, + "loss": 0.3923, "step": 57140 }, { - "epoch": 2.01, - "learning_rate": 3.394956168239551e-05, - "loss": 0.2831, + "epoch": 2.059501928136375, + "grad_norm": 0.19911803305149078, + "learning_rate": 3.3205489434820876e-05, + "loss": 0.3708, "step": 57145 }, { - "epoch": 2.01, - "learning_rate": 3.3946901692567844e-05, - "loss": 0.2905, + "epoch": 2.0596821277975996, + "grad_norm": 0.21658672392368317, + "learning_rate": 3.3202732905634936e-05, + "loss": 0.4215, "step": 57150 }, { - "epoch": 2.01, - "learning_rate": 3.3944241586570565e-05, - "loss": 0.2555, + "epoch": 2.0598623274588244, + "grad_norm": 0.2002650946378708, + "learning_rate": 3.3199976264688365e-05, + "loss": 0.4014, "step": 57155 }, { - "epoch": 2.01, - "learning_rate": 3.394158136443819e-05, - "loss": 0.2624, + "epoch": 2.060042527120049, + "grad_norm": 0.20283368229866028, + "learning_rate": 3.3197219512018704e-05, + "loss": 0.4014, "step": 57160 }, { - "epoch": 2.01, - "learning_rate": 3.393892102620528e-05, - "loss": 0.2626, + "epoch": 2.060222726781274, + "grad_norm": 0.1828315407037735, + "learning_rate": 3.319446264766351e-05, + "loss": 0.4021, "step": 57165 }, { - "epoch": 2.01, - "learning_rate": 3.393626057190638e-05, - "loss": 0.28, + "epoch": 2.060402926442498, + "grad_norm": 0.1759759783744812, + "learning_rate": 3.319170567166034e-05, + "loss": 0.4176, "step": 57170 }, { - "epoch": 2.01, - "learning_rate": 3.393360000157601e-05, - "loss": 0.2743, + "epoch": 2.060583126103723, + "grad_norm": 0.19315093755722046, + "learning_rate": 3.318894858404678e-05, + "loss": 0.3938, "step": 57175 }, { - "epoch": 2.01, - "learning_rate": 3.393093931524875e-05, - "loss": 0.2711, + "epoch": 2.0607633257649476, + "grad_norm": 0.19143354892730713, + "learning_rate": 3.318619138486037e-05, + "loss": 0.3987, "step": 57180 }, { - "epoch": 2.01, - "learning_rate": 3.392827851295911e-05, - "loss": 0.2572, + "epoch": 2.0609435254261723, + "grad_norm": 0.21193043887615204, + "learning_rate": 3.31834340741387e-05, + "loss": 0.4053, "step": 57185 }, { - "epoch": 2.01, - "learning_rate": 3.392561759474167e-05, - "loss": 0.2988, + "epoch": 2.061123725087397, + "grad_norm": 0.17882151901721954, + "learning_rate": 3.318067665191932e-05, + "loss": 0.3777, "step": 57190 }, { - "epoch": 2.01, - "learning_rate": 3.392295656063096e-05, - "loss": 0.2747, + "epoch": 2.0613039247486213, + "grad_norm": 0.19089268147945404, + "learning_rate": 3.3177919118239794e-05, + "loss": 0.3783, "step": 57195 }, { - "epoch": 2.01, - "learning_rate": 3.392029541066154e-05, - "loss": 0.2537, + "epoch": 2.061484124409846, + "grad_norm": 0.1889115571975708, + "learning_rate": 3.317516147313771e-05, + "loss": 0.4081, "step": 57200 }, { - "epoch": 2.01, - "learning_rate": 3.3917634144867965e-05, - "loss": 0.2639, + "epoch": 2.061664324071071, + "grad_norm": 0.1909630447626114, + "learning_rate": 3.317240371665064e-05, + "loss": 0.3764, "step": 57205 }, { - "epoch": 2.01, - "learning_rate": 3.3914972763284794e-05, - "loss": 0.2387, + "epoch": 2.0618445237322955, + "grad_norm": 0.1816696971654892, + "learning_rate": 3.3169645848816146e-05, + "loss": 0.4064, "step": 57210 }, { - "epoch": 2.01, - "learning_rate": 3.391231126594657e-05, - "loss": 0.3002, + "epoch": 2.06202472339352, + "grad_norm": 0.15952745079994202, + "learning_rate": 3.316688786967181e-05, + "loss": 0.3982, "step": 57215 }, { - "epoch": 2.01, - "learning_rate": 3.390964965288786e-05, - "loss": 0.2646, + "epoch": 
2.0622049230547446, + "grad_norm": 0.17730703949928284, + "learning_rate": 3.31641297792552e-05, + "loss": 0.421, "step": 57220 }, { - "epoch": 2.01, - "learning_rate": 3.390698792414322e-05, - "loss": 0.277, + "epoch": 2.0623851227159693, + "grad_norm": 0.19057762622833252, + "learning_rate": 3.316137157760392e-05, + "loss": 0.3998, "step": 57225 }, { - "epoch": 2.01, - "learning_rate": 3.390432607974721e-05, - "loss": 0.2545, + "epoch": 2.062565322377194, + "grad_norm": 0.20309016108512878, + "learning_rate": 3.315861326475552e-05, + "loss": 0.3808, "step": 57230 }, { - "epoch": 2.01, - "learning_rate": 3.390166411973439e-05, - "loss": 0.2576, + "epoch": 2.0627455220384188, + "grad_norm": 0.1632905900478363, + "learning_rate": 3.3155854840747596e-05, + "loss": 0.3994, "step": 57235 }, { - "epoch": 2.01, - "learning_rate": 3.389900204413934e-05, - "loss": 0.2717, + "epoch": 2.062925721699643, + "grad_norm": 0.20555098354816437, + "learning_rate": 3.315364802153166e-05, + "loss": 0.4086, "step": 57240 }, { - "epoch": 2.01, - "learning_rate": 3.3896339852996604e-05, - "loss": 0.2625, + "epoch": 2.063105921360868, + "grad_norm": 0.22128038108348846, + "learning_rate": 3.315088939753129e-05, + "loss": 0.4045, "step": 57245 }, { - "epoch": 2.01, - "learning_rate": 3.3893677546340765e-05, - "loss": 0.2338, + "epoch": 2.0632861210220925, + "grad_norm": 0.23114217817783356, + "learning_rate": 3.314813066247664e-05, + "loss": 0.3849, "step": 57250 }, { - "epoch": 2.01, - "learning_rate": 3.3891015124206376e-05, - "loss": 0.2643, + "epoch": 2.0634663206833173, + "grad_norm": 0.17400681972503662, + "learning_rate": 3.314537181640529e-05, + "loss": 0.3957, "step": 57255 }, { - "epoch": 2.01, - "learning_rate": 3.388835258662802e-05, - "loss": 0.2568, + "epoch": 2.0636465203445415, + "grad_norm": 0.18826141953468323, + "learning_rate": 3.314261285935483e-05, + "loss": 0.3964, "step": 57260 }, { - "epoch": 2.01, - "learning_rate": 3.388568993364026e-05, - "loss": 0.2942, + "epoch": 2.0638267200057663, + "grad_norm": 0.21883268654346466, + "learning_rate": 3.313985379136283e-05, + "loss": 0.4175, "step": 57265 }, { - "epoch": 2.01, - "learning_rate": 3.388302716527768e-05, - "loss": 0.2486, + "epoch": 2.064006919666991, + "grad_norm": 0.1698734164237976, + "learning_rate": 3.3137094612466914e-05, + "loss": 0.4036, "step": 57270 }, { - "epoch": 2.02, - "learning_rate": 3.3880364281574836e-05, - "loss": 0.2601, + "epoch": 2.0641871193282157, + "grad_norm": 0.17218467593193054, + "learning_rate": 3.3134335322704646e-05, + "loss": 0.3493, "step": 57275 }, { - "epoch": 2.02, - "learning_rate": 3.3877701282566314e-05, - "loss": 0.2685, + "epoch": 2.0643673189894405, + "grad_norm": 0.19277000427246094, + "learning_rate": 3.313157592211364e-05, + "loss": 0.4359, "step": 57280 }, { - "epoch": 2.02, - "learning_rate": 3.38750381682867e-05, - "loss": 0.255, + "epoch": 2.0645475186506648, + "grad_norm": 0.1942652016878128, + "learning_rate": 3.312881641073149e-05, + "loss": 0.4229, "step": 57285 }, { - "epoch": 2.02, - "learning_rate": 3.387237493877056e-05, - "loss": 0.2693, + "epoch": 2.0647277183118895, + "grad_norm": 0.23124489188194275, + "learning_rate": 3.3126056788595786e-05, + "loss": 0.38, "step": 57290 }, { - "epoch": 2.02, - "learning_rate": 3.386971159405248e-05, - "loss": 0.2861, + "epoch": 2.0649079179731142, + "grad_norm": 0.1828756481409073, + "learning_rate": 3.312329705574413e-05, + "loss": 0.4185, "step": 57295 }, { - "epoch": 2.02, - "learning_rate": 3.3867048134167035e-05, - "loss": 0.2617, + 
"epoch": 2.065088117634339, + "grad_norm": 0.18582990765571594, + "learning_rate": 3.3120537212214134e-05, + "loss": 0.376, "step": 57300 }, { - "epoch": 2.02, - "learning_rate": 3.386438455914882e-05, - "loss": 0.2772, + "epoch": 2.0652683172955637, + "grad_norm": 0.18808506429195404, + "learning_rate": 3.3117777258043384e-05, + "loss": 0.3729, "step": 57305 }, { - "epoch": 2.02, - "learning_rate": 3.386172086903241e-05, - "loss": 0.2497, + "epoch": 2.065448516956788, + "grad_norm": 0.2194932997226715, + "learning_rate": 3.311501719326948e-05, + "loss": 0.4381, "step": 57310 }, { - "epoch": 2.02, - "learning_rate": 3.38590570638524e-05, - "loss": 0.2822, + "epoch": 2.0656287166180127, + "grad_norm": 0.1534089893102646, + "learning_rate": 3.311225701793005e-05, + "loss": 0.3966, "step": 57315 }, { - "epoch": 2.02, - "learning_rate": 3.385639314364336e-05, - "loss": 0.2675, + "epoch": 2.0658089162792375, + "grad_norm": 0.19418910145759583, + "learning_rate": 3.3109496732062686e-05, + "loss": 0.4222, "step": 57320 }, { - "epoch": 2.02, - "learning_rate": 3.38537291084399e-05, - "loss": 0.2926, + "epoch": 2.065989115940462, + "grad_norm": 0.20485571026802063, + "learning_rate": 3.3106736335705e-05, + "loss": 0.3795, "step": 57325 }, { - "epoch": 2.02, - "learning_rate": 3.38510649582766e-05, - "loss": 0.2611, + "epoch": 2.0661693156016865, + "grad_norm": 0.198880136013031, + "learning_rate": 3.31039758288946e-05, + "loss": 0.4168, "step": 57330 }, { - "epoch": 2.02, - "learning_rate": 3.384840069318807e-05, - "loss": 0.2586, + "epoch": 2.066349515262911, + "grad_norm": 0.18784551322460175, + "learning_rate": 3.3101215211669104e-05, + "loss": 0.3876, "step": 57335 }, { - "epoch": 2.02, - "learning_rate": 3.384573631320886e-05, - "loss": 0.2682, + "epoch": 2.066529714924136, + "grad_norm": 0.1909671276807785, + "learning_rate": 3.309845448406612e-05, + "loss": 0.3914, "step": 57340 }, { - "epoch": 2.02, - "learning_rate": 3.3843071818373617e-05, - "loss": 0.2768, + "epoch": 2.0667099145853607, + "grad_norm": 0.18267397582530975, + "learning_rate": 3.309569364612325e-05, + "loss": 0.3791, "step": 57345 }, { - "epoch": 2.02, - "learning_rate": 3.38404072087169e-05, - "loss": 0.26, + "epoch": 2.0668901142465854, + "grad_norm": 0.16980095207691193, + "learning_rate": 3.3092932697878127e-05, + "loss": 0.3921, "step": 57350 }, { - "epoch": 2.02, - "learning_rate": 3.383774248427333e-05, - "loss": 0.2657, + "epoch": 2.0670703139078097, + "grad_norm": 0.2372709959745407, + "learning_rate": 3.3090171639368364e-05, + "loss": 0.4233, "step": 57355 }, { - "epoch": 2.02, - "learning_rate": 3.38350776450775e-05, - "loss": 0.2644, + "epoch": 2.0672505135690344, + "grad_norm": 0.2137833833694458, + "learning_rate": 3.3087410470631575e-05, + "loss": 0.4325, "step": 57360 }, { - "epoch": 2.02, - "learning_rate": 3.3832412691164e-05, - "loss": 0.2567, + "epoch": 2.067430713230259, + "grad_norm": 0.20736674964427948, + "learning_rate": 3.308464919170539e-05, + "loss": 0.4262, "step": 57365 }, { - "epoch": 2.02, - "learning_rate": 3.382974762256744e-05, - "loss": 0.2573, + "epoch": 2.067610912891484, + "grad_norm": 0.23613141477108002, + "learning_rate": 3.308188780262742e-05, + "loss": 0.3844, "step": 57370 }, { - "epoch": 2.02, - "learning_rate": 3.3827082439322445e-05, - "loss": 0.2531, + "epoch": 2.067791112552708, + "grad_norm": 0.23751363158226013, + "learning_rate": 3.30791263034353e-05, + "loss": 0.3884, "step": 57375 }, { - "epoch": 2.02, - "learning_rate": 3.382441714146358e-05, - "loss": 0.2777, + 
"epoch": 2.067971312213933, + "grad_norm": 0.22667773067951202, + "learning_rate": 3.307636469416664e-05, + "loss": 0.3871, "step": 57380 }, { - "epoch": 2.02, - "learning_rate": 3.382175172902549e-05, - "loss": 0.2603, + "epoch": 2.0681515118751577, + "grad_norm": 0.1814824640750885, + "learning_rate": 3.307360297485907e-05, + "loss": 0.3844, "step": 57385 }, { - "epoch": 2.02, - "learning_rate": 3.381908620204276e-05, - "loss": 0.2681, + "epoch": 2.0683317115363824, + "grad_norm": 0.19579993188381195, + "learning_rate": 3.307084114555023e-05, + "loss": 0.4153, "step": 57390 }, { - "epoch": 2.02, - "learning_rate": 3.3816420560550004e-05, - "loss": 0.2705, + "epoch": 2.068511911197607, + "grad_norm": 0.2431478351354599, + "learning_rate": 3.306807920627775e-05, + "loss": 0.3946, "step": 57395 }, { - "epoch": 2.02, - "learning_rate": 3.3813754804581844e-05, - "loss": 0.2606, + "epoch": 2.0686921108588314, + "grad_norm": 0.16477234661579132, + "learning_rate": 3.306531715707925e-05, + "loss": 0.3787, "step": 57400 }, { - "epoch": 2.02, - "learning_rate": 3.381108893417288e-05, - "loss": 0.2762, + "epoch": 2.068872310520056, + "grad_norm": 0.14775709807872772, + "learning_rate": 3.306255499799235e-05, + "loss": 0.3659, "step": 57405 }, { - "epoch": 2.02, - "learning_rate": 3.3808422949357746e-05, - "loss": 0.2848, + "epoch": 2.069052510181281, + "grad_norm": 0.2197529375553131, + "learning_rate": 3.305979272905472e-05, + "loss": 0.4249, "step": 57410 }, { - "epoch": 2.02, - "learning_rate": 3.3805756850171035e-05, - "loss": 0.2976, + "epoch": 2.0692327098425056, + "grad_norm": 0.1822102814912796, + "learning_rate": 3.3057030350303954e-05, + "loss": 0.4078, "step": 57415 }, { - "epoch": 2.02, - "learning_rate": 3.380309063664737e-05, - "loss": 0.2616, + "epoch": 2.06941290950373, + "grad_norm": 0.2318163812160492, + "learning_rate": 3.305426786177772e-05, + "loss": 0.4369, "step": 57420 }, { - "epoch": 2.02, - "learning_rate": 3.380042430882138e-05, - "loss": 0.2677, + "epoch": 2.0695931091649546, + "grad_norm": 0.1871294230222702, + "learning_rate": 3.305150526351365e-05, + "loss": 0.3798, "step": 57425 }, { - "epoch": 2.02, - "learning_rate": 3.379775786672768e-05, - "loss": 0.2842, + "epoch": 2.0697733088261794, + "grad_norm": 0.20110271871089935, + "learning_rate": 3.304874255554937e-05, + "loss": 0.3754, "step": 57430 }, { - "epoch": 2.02, - "learning_rate": 3.3795091310400895e-05, - "loss": 0.2763, + "epoch": 2.069953508487404, + "grad_norm": 0.17223583161830902, + "learning_rate": 3.304597973792254e-05, + "loss": 0.3773, "step": 57435 }, { - "epoch": 2.02, - "learning_rate": 3.379242463987564e-05, - "loss": 0.2363, + "epoch": 2.070133708148629, + "grad_norm": 0.19843506813049316, + "learning_rate": 3.304321681067079e-05, + "loss": 0.3627, "step": 57440 }, { - "epoch": 2.02, - "learning_rate": 3.378975785518655e-05, - "loss": 0.2622, + "epoch": 2.070313907809853, + "grad_norm": 0.21979078650474548, + "learning_rate": 3.304045377383177e-05, + "loss": 0.401, "step": 57445 }, { - "epoch": 2.02, - "learning_rate": 3.3787090956368244e-05, - "loss": 0.2869, + "epoch": 2.070494107471078, + "grad_norm": 0.18887178599834442, + "learning_rate": 3.303769062744312e-05, + "loss": 0.3996, "step": 57450 }, { - "epoch": 2.02, - "learning_rate": 3.3784423943455354e-05, - "loss": 0.2896, + "epoch": 2.0706743071323026, + "grad_norm": 0.19170895218849182, + "learning_rate": 3.303492737154249e-05, + "loss": 0.3618, "step": 57455 }, { - "epoch": 2.02, - "learning_rate": 3.378175681648251e-05, - "loss": 
0.2588, + "epoch": 2.0708545067935273, + "grad_norm": 0.19946110248565674, + "learning_rate": 3.303216400616754e-05, + "loss": 0.3732, "step": 57460 }, { - "epoch": 2.02, - "learning_rate": 3.3779089575484335e-05, - "loss": 0.249, + "epoch": 2.071034706454752, + "grad_norm": 0.20270924270153046, + "learning_rate": 3.3029400531355897e-05, + "loss": 0.3963, "step": 57465 }, { - "epoch": 2.02, - "learning_rate": 3.377642222049548e-05, - "loss": 0.266, + "epoch": 2.0712149061159764, + "grad_norm": 0.24232977628707886, + "learning_rate": 3.302663694714524e-05, + "loss": 0.3853, "step": 57470 }, { - "epoch": 2.02, - "learning_rate": 3.3773754751550554e-05, - "loss": 0.279, + "epoch": 2.071395105777201, + "grad_norm": 0.17532899975776672, + "learning_rate": 3.30238732535732e-05, + "loss": 0.3874, "step": 57475 }, { - "epoch": 2.02, - "learning_rate": 3.377108716868421e-05, - "loss": 0.261, + "epoch": 2.071575305438426, + "grad_norm": 0.18288403749465942, + "learning_rate": 3.3021109450677445e-05, + "loss": 0.3988, "step": 57480 }, { - "epoch": 2.02, - "learning_rate": 3.376841947193108e-05, - "loss": 0.2634, + "epoch": 2.0717555050996506, + "grad_norm": 0.22599361836910248, + "learning_rate": 3.301834553849562e-05, + "loss": 0.4176, "step": 57485 }, { - "epoch": 2.02, - "learning_rate": 3.37657516613258e-05, - "loss": 0.279, + "epoch": 2.071935704760875, + "grad_norm": 0.20328468084335327, + "learning_rate": 3.30155815170654e-05, + "loss": 0.3994, "step": 57490 }, { - "epoch": 2.02, - "learning_rate": 3.376308373690301e-05, - "loss": 0.2691, + "epoch": 2.0721159044220996, + "grad_norm": 0.2231900542974472, + "learning_rate": 3.3012817386424416e-05, + "loss": 0.3832, "step": 57495 }, { - "epoch": 2.02, - "learning_rate": 3.376041569869735e-05, - "loss": 0.2587, + "epoch": 2.0722961040833243, + "grad_norm": 0.19576068222522736, + "learning_rate": 3.301005314661037e-05, + "loss": 0.3742, "step": 57500 }, { - "epoch": 2.02, - "eval_loss": 0.27126699686050415, - "eval_runtime": 10.5272, - "eval_samples_per_second": 9.499, - "eval_steps_per_second": 9.499, + "epoch": 2.0722961040833243, + "eval_loss": 0.43739715218544006, + "eval_runtime": 3.5417, + "eval_samples_per_second": 28.235, + "eval_steps_per_second": 7.059, "step": 57500 }, { - "epoch": 2.02, - "learning_rate": 3.375774754674347e-05, - "loss": 0.2689, + "epoch": 2.072476303744549, + "grad_norm": 0.18939457833766937, + "learning_rate": 3.3007288797660886e-05, + "loss": 0.3979, "step": 57505 }, { - "epoch": 2.02, - "learning_rate": 3.375507928107601e-05, - "loss": 0.2951, + "epoch": 2.072656503405774, + "grad_norm": 0.15962184965610504, + "learning_rate": 3.3004524339613636e-05, + "loss": 0.3678, "step": 57510 }, { - "epoch": 2.02, - "learning_rate": 3.375241090172961e-05, - "loss": 0.2765, + "epoch": 2.072836703066998, + "grad_norm": 0.20595142245292664, + "learning_rate": 3.30017597725063e-05, + "loss": 0.4048, "step": 57515 }, { - "epoch": 2.02, - "learning_rate": 3.3749742408738935e-05, - "loss": 0.2652, + "epoch": 2.073016902728223, + "grad_norm": 0.23062437772750854, + "learning_rate": 3.299899509637654e-05, + "loss": 0.3793, "step": 57520 }, { - "epoch": 2.02, - "learning_rate": 3.3747073802138616e-05, - "loss": 0.2837, + "epoch": 2.0731971023894475, + "grad_norm": 0.23280833661556244, + "learning_rate": 3.299623031126201e-05, + "loss": 0.4115, "step": 57525 }, { - "epoch": 2.02, - "learning_rate": 3.37444050819633e-05, - "loss": 0.2809, + "epoch": 2.0733773020506723, + "grad_norm": 0.18664391338825226, + "learning_rate": 
3.2993465417200406e-05, + "loss": 0.4042, "step": 57530 }, { - "epoch": 2.02, - "learning_rate": 3.374173624824765e-05, - "loss": 0.2927, + "epoch": 2.0735575017118966, + "grad_norm": 0.19512483477592468, + "learning_rate": 3.299070041422937e-05, + "loss": 0.4043, "step": 57535 }, { - "epoch": 2.02, - "learning_rate": 3.373906730102632e-05, - "loss": 0.2504, + "epoch": 2.0737377013731213, + "grad_norm": 0.2118794023990631, + "learning_rate": 3.2987935302386594e-05, + "loss": 0.4164, "step": 57540 }, { - "epoch": 2.02, - "learning_rate": 3.373639824033396e-05, - "loss": 0.2715, + "epoch": 2.073917901034346, + "grad_norm": 0.20261244475841522, + "learning_rate": 3.298517008170974e-05, + "loss": 0.4024, "step": 57545 }, { - "epoch": 2.02, - "learning_rate": 3.3733729066205225e-05, - "loss": 0.2624, + "epoch": 2.0740981006955708, + "grad_norm": 0.18971537053585052, + "learning_rate": 3.29824047522365e-05, + "loss": 0.4133, "step": 57550 }, { - "epoch": 2.02, - "learning_rate": 3.373105977867478e-05, - "loss": 0.2941, + "epoch": 2.0742783003567955, + "grad_norm": 0.26411035656929016, + "learning_rate": 3.297963931400453e-05, + "loss": 0.397, "step": 57555 }, { - "epoch": 2.03, - "learning_rate": 3.3728390377777263e-05, - "loss": 0.2496, + "epoch": 2.07445850001802, + "grad_norm": 0.19595226645469666, + "learning_rate": 3.297687376705153e-05, + "loss": 0.4261, "step": 57560 }, { - "epoch": 2.03, - "learning_rate": 3.372572086354736e-05, - "loss": 0.2969, + "epoch": 2.0746386996792445, + "grad_norm": 0.18524418771266937, + "learning_rate": 3.297410811141516e-05, + "loss": 0.3958, "step": 57565 }, { - "epoch": 2.03, - "learning_rate": 3.372305123601972e-05, - "loss": 0.3003, + "epoch": 2.0748188993404693, + "grad_norm": 0.1883489340543747, + "learning_rate": 3.297134234713311e-05, + "loss": 0.3905, "step": 57570 }, { - "epoch": 2.03, - "learning_rate": 3.372038149522901e-05, - "loss": 0.2764, + "epoch": 2.074999099001694, + "grad_norm": 0.1916968673467636, + "learning_rate": 3.296857647424307e-05, + "loss": 0.3914, "step": 57575 }, { - "epoch": 2.03, - "learning_rate": 3.371771164120989e-05, - "loss": 0.2714, + "epoch": 2.0751792986629187, + "grad_norm": 0.19922398030757904, + "learning_rate": 3.296581049278272e-05, + "loss": 0.3591, "step": 57580 }, { - "epoch": 2.03, - "learning_rate": 3.3715041673997035e-05, - "loss": 0.2711, + "epoch": 2.075359498324143, + "grad_norm": 0.18488508462905884, + "learning_rate": 3.2963044402789736e-05, + "loss": 0.3581, "step": 57585 }, { - "epoch": 2.03, - "learning_rate": 3.37123715936251e-05, - "loss": 0.2781, + "epoch": 2.0755396979853677, + "grad_norm": 0.21221880614757538, + "learning_rate": 3.296027820430182e-05, + "loss": 0.4096, "step": 57590 }, { - "epoch": 2.03, - "learning_rate": 3.3709701400128765e-05, - "loss": 0.2579, + "epoch": 2.0757198976465925, + "grad_norm": 0.1988646239042282, + "learning_rate": 3.295751189735665e-05, + "loss": 0.3703, "step": 57595 }, { - "epoch": 2.03, - "learning_rate": 3.370703109354269e-05, - "loss": 0.2673, + "epoch": 2.075900097307817, + "grad_norm": 0.17219915986061096, + "learning_rate": 3.295474548199193e-05, + "loss": 0.3574, "step": 57600 }, { - "epoch": 2.03, - "learning_rate": 3.370436067390157e-05, - "loss": 0.2843, + "epoch": 2.0760802969690415, + "grad_norm": 0.21070365607738495, + "learning_rate": 3.2951978958245336e-05, + "loss": 0.4147, "step": 57605 }, { - "epoch": 2.03, - "learning_rate": 3.3701690141240056e-05, - "loss": 0.2998, + "epoch": 2.0762604966302662, + "grad_norm": 0.18376260995864868, + 
"learning_rate": 3.294921232615457e-05, + "loss": 0.4007, "step": 57610 }, { - "epoch": 2.03, - "learning_rate": 3.369901949559283e-05, - "loss": 0.2908, + "epoch": 2.076440696291491, + "grad_norm": 0.21397769451141357, + "learning_rate": 3.294644558575732e-05, + "loss": 0.3699, "step": 57615 }, { - "epoch": 2.03, - "learning_rate": 3.369634873699457e-05, - "loss": 0.276, + "epoch": 2.0766208959527157, + "grad_norm": 0.2026335448026657, + "learning_rate": 3.294367873709129e-05, + "loss": 0.4035, "step": 57620 }, { - "epoch": 2.03, - "learning_rate": 3.3693677865479945e-05, - "loss": 0.2981, + "epoch": 2.0768010956139404, + "grad_norm": 0.21820026636123657, + "learning_rate": 3.294091178019418e-05, + "loss": 0.3559, "step": 57625 }, { - "epoch": 2.03, - "learning_rate": 3.369100688108365e-05, - "loss": 0.2664, + "epoch": 2.0769812952751647, + "grad_norm": 0.1874457746744156, + "learning_rate": 3.293814471510368e-05, + "loss": 0.398, "step": 57630 }, { - "epoch": 2.03, - "learning_rate": 3.3688335783840355e-05, - "loss": 0.2672, + "epoch": 2.0771614949363895, + "grad_norm": 0.20042584836483002, + "learning_rate": 3.293537754185749e-05, + "loss": 0.366, "step": 57635 }, { - "epoch": 2.03, - "learning_rate": 3.368566457378474e-05, - "loss": 0.2837, + "epoch": 2.077341694597614, + "grad_norm": 0.24218867719173431, + "learning_rate": 3.293261026049333e-05, + "loss": 0.4413, "step": 57640 }, { - "epoch": 2.03, - "learning_rate": 3.36829932509515e-05, - "loss": 0.26, + "epoch": 2.077521894258839, + "grad_norm": 0.17626816034317017, + "learning_rate": 3.2929842871048885e-05, + "loss": 0.3598, "step": 57645 }, { - "epoch": 2.03, - "learning_rate": 3.368032181537532e-05, - "loss": 0.2865, + "epoch": 2.077702093920063, + "grad_norm": 0.2106868326663971, + "learning_rate": 3.292707537356186e-05, + "loss": 0.3975, "step": 57650 }, { - "epoch": 2.03, - "learning_rate": 3.3677650267090875e-05, - "loss": 0.3003, + "epoch": 2.077882293581288, + "grad_norm": 0.19412852823734283, + "learning_rate": 3.292430776806997e-05, + "loss": 0.3979, "step": 57655 }, { - "epoch": 2.03, - "learning_rate": 3.3674978606132866e-05, - "loss": 0.2764, + "epoch": 2.0780624932425127, + "grad_norm": 0.2266625612974167, + "learning_rate": 3.292154005461093e-05, + "loss": 0.4145, "step": 57660 }, { - "epoch": 2.03, - "learning_rate": 3.3672306832535974e-05, - "loss": 0.2719, + "epoch": 2.0782426929037374, + "grad_norm": 0.2035531848669052, + "learning_rate": 3.291877223322244e-05, + "loss": 0.3795, "step": 57665 }, { - "epoch": 2.03, - "learning_rate": 3.366963494633489e-05, - "loss": 0.269, + "epoch": 2.078422892564962, + "grad_norm": 0.22533543407917023, + "learning_rate": 3.29160043039422e-05, + "loss": 0.4332, "step": 57670 }, { - "epoch": 2.03, - "learning_rate": 3.366696294756431e-05, - "loss": 0.2546, + "epoch": 2.0786030922261864, + "grad_norm": 0.13781999051570892, + "learning_rate": 3.291323626680793e-05, + "loss": 0.3766, "step": 57675 }, { - "epoch": 2.03, - "learning_rate": 3.3664290836258934e-05, - "loss": 0.2789, + "epoch": 2.078783291887411, + "grad_norm": 0.20056338608264923, + "learning_rate": 3.291046812185736e-05, + "loss": 0.3835, "step": 57680 }, { - "epoch": 2.03, - "learning_rate": 3.3661618612453446e-05, - "loss": 0.2519, + "epoch": 2.078963491548636, + "grad_norm": 0.17711946368217468, + "learning_rate": 3.29076998691282e-05, + "loss": 0.4235, "step": 57685 }, { - "epoch": 2.03, - "learning_rate": 3.3658946276182556e-05, - "loss": 0.2899, + "epoch": 2.0791436912098606, + "grad_norm": 0.19499550759792328, + 
"learning_rate": 3.2904931508658154e-05, + "loss": 0.3613, "step": 57690 }, { - "epoch": 2.03, - "learning_rate": 3.3656273827480944e-05, - "loss": 0.2752, + "epoch": 2.079323890871085, + "grad_norm": 0.20534376800060272, + "learning_rate": 3.290216304048494e-05, + "loss": 0.3908, "step": 57695 }, { - "epoch": 2.03, - "learning_rate": 3.3653601266383326e-05, - "loss": 0.2817, + "epoch": 2.0795040905323097, + "grad_norm": 0.18097397685050964, + "learning_rate": 3.289939446464629e-05, + "loss": 0.409, "step": 57700 }, { - "epoch": 2.03, - "learning_rate": 3.3650928592924395e-05, - "loss": 0.2655, + "epoch": 2.0796842901935344, + "grad_norm": 0.18338914215564728, + "learning_rate": 3.289662578117992e-05, + "loss": 0.3821, "step": 57705 }, { - "epoch": 2.03, - "learning_rate": 3.3648255807138866e-05, - "loss": 0.2676, + "epoch": 2.079864489854759, + "grad_norm": 0.1914772242307663, + "learning_rate": 3.2893856990123544e-05, + "loss": 0.3561, "step": 57710 }, { - "epoch": 2.03, - "learning_rate": 3.364558290906143e-05, - "loss": 0.2963, + "epoch": 2.080044689515984, + "grad_norm": 0.25630638003349304, + "learning_rate": 3.2891088091514905e-05, + "loss": 0.3759, "step": 57715 }, { - "epoch": 2.03, - "learning_rate": 3.364290989872679e-05, - "loss": 0.2762, + "epoch": 2.080224889177208, + "grad_norm": 0.18817178905010223, + "learning_rate": 3.288831908539171e-05, + "loss": 0.3603, "step": 57720 }, { - "epoch": 2.03, - "learning_rate": 3.364023677616967e-05, - "loss": 0.2767, + "epoch": 2.080405088838433, + "grad_norm": 0.15790888667106628, + "learning_rate": 3.288554997179171e-05, + "loss": 0.3602, "step": 57725 }, { - "epoch": 2.03, - "learning_rate": 3.363756354142478e-05, - "loss": 0.2809, + "epoch": 2.0805852884996576, + "grad_norm": 0.17841728031635284, + "learning_rate": 3.2882780750752604e-05, + "loss": 0.3972, "step": 57730 }, { - "epoch": 2.03, - "learning_rate": 3.36348901945268e-05, - "loss": 0.2566, + "epoch": 2.0807654881608824, + "grad_norm": 0.20370912551879883, + "learning_rate": 3.288001142231214e-05, + "loss": 0.3692, "step": 57735 }, { - "epoch": 2.03, - "learning_rate": 3.363221673551047e-05, - "loss": 0.285, + "epoch": 2.080945687822107, + "grad_norm": 0.19954347610473633, + "learning_rate": 3.2877241986508045e-05, + "loss": 0.3974, "step": 57740 }, { - "epoch": 2.03, - "learning_rate": 3.3629543164410496e-05, - "loss": 0.247, + "epoch": 2.0811258874833314, + "grad_norm": 0.1677468866109848, + "learning_rate": 3.2874472443378056e-05, + "loss": 0.3678, "step": 57745 }, { - "epoch": 2.03, - "learning_rate": 3.3626869481261586e-05, - "loss": 0.2778, + "epoch": 2.081306087144556, + "grad_norm": 0.2071567326784134, + "learning_rate": 3.287170279295991e-05, + "loss": 0.3873, "step": 57750 }, { - "epoch": 2.03, - "learning_rate": 3.3624195686098465e-05, - "loss": 0.2722, + "epoch": 2.081486286805781, + "grad_norm": 0.1877603381872177, + "learning_rate": 3.286893303529132e-05, + "loss": 0.3882, "step": 57755 }, { - "epoch": 2.03, - "learning_rate": 3.362152177895584e-05, - "loss": 0.2861, + "epoch": 2.0816664864670056, + "grad_norm": 0.20741944015026093, + "learning_rate": 3.286616317041006e-05, + "loss": 0.4103, "step": 57760 }, { - "epoch": 2.03, - "learning_rate": 3.3618847759868445e-05, - "loss": 0.2812, + "epoch": 2.08184668612823, + "grad_norm": 0.18143180012702942, + "learning_rate": 3.286339319835384e-05, + "loss": 0.4236, "step": 57765 }, { - "epoch": 2.03, - "learning_rate": 3.361617362887099e-05, - "loss": 0.252, + "epoch": 2.0820268857894546, + "grad_norm": 
0.19399969279766083, + "learning_rate": 3.286062311916041e-05, + "loss": 0.3963, "step": 57770 }, { - "epoch": 2.03, - "learning_rate": 3.3613499385998196e-05, - "loss": 0.2673, + "epoch": 2.0822070854506793, + "grad_norm": 0.19400350749492645, + "learning_rate": 3.285785293286751e-05, + "loss": 0.4274, "step": 57775 }, { - "epoch": 2.03, - "learning_rate": 3.3610825031284784e-05, - "loss": 0.2857, + "epoch": 2.082387285111904, + "grad_norm": 0.21012353897094727, + "learning_rate": 3.285508263951289e-05, + "loss": 0.4118, "step": 57780 }, { - "epoch": 2.03, - "learning_rate": 3.36081505647655e-05, - "loss": 0.2488, + "epoch": 2.082567484773129, + "grad_norm": 0.21954594552516937, + "learning_rate": 3.285231223913429e-05, + "loss": 0.4276, "step": 57785 }, { - "epoch": 2.03, - "learning_rate": 3.360547598647504e-05, - "loss": 0.2621, + "epoch": 2.082747684434353, + "grad_norm": 0.1681157350540161, + "learning_rate": 3.284954173176945e-05, + "loss": 0.3828, "step": 57790 }, { - "epoch": 2.03, - "learning_rate": 3.3602801296448154e-05, - "loss": 0.2739, + "epoch": 2.082927884095578, + "grad_norm": 0.19216038286685944, + "learning_rate": 3.2846771117456134e-05, + "loss": 0.3608, "step": 57795 }, { - "epoch": 2.03, - "learning_rate": 3.3600126494719555e-05, - "loss": 0.269, + "epoch": 2.0831080837568026, + "grad_norm": 0.20407375693321228, + "learning_rate": 3.2844000396232064e-05, + "loss": 0.3638, "step": 57800 }, { - "epoch": 2.03, - "learning_rate": 3.359745158132399e-05, - "loss": 0.2596, + "epoch": 2.0832882834180273, + "grad_norm": 0.186093270778656, + "learning_rate": 3.284122956813503e-05, + "loss": 0.4093, "step": 57805 }, { - "epoch": 2.03, - "learning_rate": 3.359477655629617e-05, - "loss": 0.3004, + "epoch": 2.083468483079252, + "grad_norm": 0.19043242931365967, + "learning_rate": 3.2838458633202745e-05, + "loss": 0.4267, "step": 57810 }, { - "epoch": 2.03, - "learning_rate": 3.359210141967085e-05, - "loss": 0.2488, + "epoch": 2.0836486827404763, + "grad_norm": 0.22312848269939423, + "learning_rate": 3.283568759147298e-05, + "loss": 0.3784, "step": 57815 }, { - "epoch": 2.03, - "learning_rate": 3.3589426171482754e-05, - "loss": 0.2864, + "epoch": 2.083828882401701, + "grad_norm": 0.207689568400383, + "learning_rate": 3.283291644298349e-05, + "loss": 0.3612, "step": 57820 }, { - "epoch": 2.03, - "learning_rate": 3.358675081176662e-05, - "loss": 0.2883, + "epoch": 2.084009082062926, + "grad_norm": 0.1909976601600647, + "learning_rate": 3.283014518777203e-05, + "loss": 0.412, "step": 57825 }, { - "epoch": 2.03, - "learning_rate": 3.358407534055718e-05, - "loss": 0.2591, + "epoch": 2.0841892817241505, + "grad_norm": 0.17157649993896484, + "learning_rate": 3.2827373825876364e-05, + "loss": 0.4281, "step": 57830 }, { - "epoch": 2.03, - "learning_rate": 3.358139975788918e-05, - "loss": 0.2733, + "epoch": 2.084369481385375, + "grad_norm": 0.20599566400051117, + "learning_rate": 3.282460235733424e-05, + "loss": 0.4165, "step": 57835 }, { - "epoch": 2.03, - "learning_rate": 3.357872406379736e-05, - "loss": 0.2702, + "epoch": 2.0845496810465995, + "grad_norm": 0.23705652356147766, + "learning_rate": 3.282183078218342e-05, + "loss": 0.4269, "step": 57840 }, { - "epoch": 2.04, - "learning_rate": 3.357604825831647e-05, - "loss": 0.2588, + "epoch": 2.0847298807078243, + "grad_norm": 0.17649759352207184, + "learning_rate": 3.281905910046167e-05, + "loss": 0.3876, "step": 57845 }, { - "epoch": 2.04, - "learning_rate": 3.357337234148124e-05, - "loss": 0.2755, + "epoch": 2.084910080369049, + 
"grad_norm": 0.16404461860656738, + "learning_rate": 3.281628731220676e-05, + "loss": 0.3754, "step": 57850 }, { - "epoch": 2.04, - "learning_rate": 3.357069631332642e-05, - "loss": 0.2558, + "epoch": 2.0850902800302737, + "grad_norm": 0.1914927363395691, + "learning_rate": 3.2813515417456456e-05, + "loss": 0.4207, "step": 57855 }, { - "epoch": 2.04, - "learning_rate": 3.356802017388675e-05, - "loss": 0.2694, + "epoch": 2.085270479691498, + "grad_norm": 0.19702866673469543, + "learning_rate": 3.2810743416248496e-05, + "loss": 0.4151, "step": 57860 }, { - "epoch": 2.04, - "learning_rate": 3.356534392319699e-05, - "loss": 0.2742, + "epoch": 2.0854506793527228, + "grad_norm": 0.19254851341247559, + "learning_rate": 3.2807971308620685e-05, + "loss": 0.389, "step": 57865 }, { - "epoch": 2.04, - "learning_rate": 3.356266756129189e-05, - "loss": 0.2663, + "epoch": 2.0856308790139475, + "grad_norm": 0.2003370225429535, + "learning_rate": 3.2805199094610774e-05, + "loss": 0.3934, "step": 57870 }, { - "epoch": 2.04, - "learning_rate": 3.355999108820619e-05, - "loss": 0.2779, + "epoch": 2.0858110786751722, + "grad_norm": 0.18016155064105988, + "learning_rate": 3.2802426774256534e-05, + "loss": 0.3857, "step": 57875 }, { - "epoch": 2.04, - "learning_rate": 3.3557314503974646e-05, - "loss": 0.3263, + "epoch": 2.0859912783363965, + "grad_norm": 0.16226695477962494, + "learning_rate": 3.2799654347595736e-05, + "loss": 0.4044, "step": 57880 }, { - "epoch": 2.04, - "learning_rate": 3.355463780863201e-05, - "loss": 0.2674, + "epoch": 2.0861714779976213, + "grad_norm": 0.1907840520143509, + "learning_rate": 3.2796881814666164e-05, + "loss": 0.4128, "step": 57885 }, { - "epoch": 2.04, - "learning_rate": 3.355196100221304e-05, - "loss": 0.2731, + "epoch": 2.086351677658846, + "grad_norm": 0.19597387313842773, + "learning_rate": 3.279410917550559e-05, + "loss": 0.4094, "step": 57890 }, { - "epoch": 2.04, - "learning_rate": 3.3549284084752494e-05, - "loss": 0.2736, + "epoch": 2.0865318773200707, + "grad_norm": 0.2085808515548706, + "learning_rate": 3.2791336430151775e-05, + "loss": 0.416, "step": 57895 }, { - "epoch": 2.04, - "learning_rate": 3.354660705628514e-05, - "loss": 0.2806, + "epoch": 2.0867120769812955, + "grad_norm": 0.22985954582691193, + "learning_rate": 3.278856357864252e-05, + "loss": 0.4266, "step": 57900 }, { - "epoch": 2.04, - "learning_rate": 3.354392991684571e-05, - "loss": 0.2514, + "epoch": 2.0868922766425198, + "grad_norm": 0.2437005490064621, + "learning_rate": 3.278579062101559e-05, + "loss": 0.4272, "step": 57905 }, { - "epoch": 2.04, - "learning_rate": 3.354125266646899e-05, - "loss": 0.2987, + "epoch": 2.0870724763037445, + "grad_norm": 0.1723114401102066, + "learning_rate": 3.278301755730878e-05, + "loss": 0.3741, "step": 57910 }, { - "epoch": 2.04, - "learning_rate": 3.353857530518972e-05, - "loss": 0.2642, + "epoch": 2.087252675964969, + "grad_norm": 0.1853064000606537, + "learning_rate": 3.2780244387559846e-05, + "loss": 0.4152, "step": 57915 }, { - "epoch": 2.04, - "learning_rate": 3.353589783304269e-05, - "loss": 0.258, + "epoch": 2.087432875626194, + "grad_norm": 0.2420477718114853, + "learning_rate": 3.27774711118066e-05, + "loss": 0.3977, "step": 57920 }, { - "epoch": 2.04, - "learning_rate": 3.3533220250062646e-05, - "loss": 0.2822, + "epoch": 2.0876130752874182, + "grad_norm": 0.23371827602386475, + "learning_rate": 3.277469773008681e-05, + "loss": 0.4223, "step": 57925 }, { - "epoch": 2.04, - "learning_rate": 3.3530542556284363e-05, - "loss": 0.2591, + "epoch": 
2.087793274948643, + "grad_norm": 0.1876314878463745, + "learning_rate": 3.277192424243827e-05, + "loss": 0.4122, "step": 57930 }, { - "epoch": 2.04, - "learning_rate": 3.352786475174261e-05, - "loss": 0.2674, + "epoch": 2.0879734746098677, + "grad_norm": 0.1702897995710373, + "learning_rate": 3.276915064889877e-05, + "loss": 0.3766, "step": 57935 }, { - "epoch": 2.04, - "learning_rate": 3.3525186836472144e-05, - "loss": 0.2903, + "epoch": 2.0881536742710924, + "grad_norm": 0.22066879272460938, + "learning_rate": 3.2766376949506085e-05, + "loss": 0.4061, "step": 57940 }, { - "epoch": 2.04, - "learning_rate": 3.3522508810507746e-05, - "loss": 0.2626, + "epoch": 2.088333873932317, + "grad_norm": 0.19222991168498993, + "learning_rate": 3.2763603144298026e-05, + "loss": 0.372, "step": 57945 }, { - "epoch": 2.04, - "learning_rate": 3.35198306738842e-05, - "loss": 0.2595, + "epoch": 2.0885140735935415, + "grad_norm": 0.17189811170101166, + "learning_rate": 3.2760829233312385e-05, + "loss": 0.3949, "step": 57950 }, { - "epoch": 2.04, - "learning_rate": 3.351715242663626e-05, - "loss": 0.2622, + "epoch": 2.088694273254766, + "grad_norm": 0.19438187777996063, + "learning_rate": 3.275805521658694e-05, + "loss": 0.4451, "step": 57955 }, { - "epoch": 2.04, - "learning_rate": 3.351447406879871e-05, - "loss": 0.2506, + "epoch": 2.088874472915991, + "grad_norm": 0.17549294233322144, + "learning_rate": 3.27552810941595e-05, + "loss": 0.39, "step": 57960 }, { - "epoch": 2.04, - "learning_rate": 3.351179560040632e-05, - "loss": 0.2667, + "epoch": 2.0890546725772157, + "grad_norm": 0.18093203008174896, + "learning_rate": 3.275250686606784e-05, + "loss": 0.3649, "step": 57965 }, { - "epoch": 2.04, - "learning_rate": 3.350911702149388e-05, - "loss": 0.2623, + "epoch": 2.0892348722384404, + "grad_norm": 0.17545856535434723, + "learning_rate": 3.2749732532349796e-05, + "loss": 0.4106, "step": 57970 }, { - "epoch": 2.04, - "learning_rate": 3.350643833209616e-05, - "loss": 0.285, + "epoch": 2.0894150718996647, + "grad_norm": 0.21285952627658844, + "learning_rate": 3.2746958093043136e-05, + "loss": 0.3845, "step": 57975 }, { - "epoch": 2.04, - "learning_rate": 3.350375953224794e-05, - "loss": 0.2561, + "epoch": 2.0895952715608894, + "grad_norm": 0.2009621560573578, + "learning_rate": 3.2744183548185674e-05, + "loss": 0.3827, "step": 57980 }, { - "epoch": 2.04, - "learning_rate": 3.3501080621984013e-05, - "loss": 0.2873, + "epoch": 2.089775471222114, + "grad_norm": 0.21972990036010742, + "learning_rate": 3.274140889781521e-05, + "loss": 0.367, "step": 57985 }, { - "epoch": 2.04, - "learning_rate": 3.3498401601339156e-05, - "loss": 0.2427, + "epoch": 2.089955670883339, + "grad_norm": 0.1965899020433426, + "learning_rate": 3.273863414196955e-05, + "loss": 0.3854, "step": 57990 }, { - "epoch": 2.04, - "learning_rate": 3.349572247034816e-05, - "loss": 0.2907, + "epoch": 2.090135870544563, + "grad_norm": 0.17252902686595917, + "learning_rate": 3.27358592806865e-05, + "loss": 0.4063, "step": 57995 }, { - "epoch": 2.04, - "learning_rate": 3.34930432290458e-05, - "loss": 0.3012, + "epoch": 2.090316070205788, + "grad_norm": 0.1891898810863495, + "learning_rate": 3.273308431400386e-05, + "loss": 0.4243, "step": 58000 }, { - "epoch": 2.04, - "eval_loss": 0.2711896598339081, - "eval_runtime": 10.5364, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 2.090316070205788, + "eval_loss": 0.4369398057460785, + "eval_runtime": 3.5377, + "eval_samples_per_second": 28.267, + "eval_steps_per_second": 
7.067, "step": 58000 }, { - "epoch": 2.04, - "learning_rate": 3.349036387746687e-05, - "loss": 0.2664, + "epoch": 2.0904962698670126, + "grad_norm": 0.17718040943145752, + "learning_rate": 3.2730309241959446e-05, + "loss": 0.4142, "step": 58005 }, { - "epoch": 2.04, - "learning_rate": 3.348768441564616e-05, - "loss": 0.2721, + "epoch": 2.0906764695282374, + "grad_norm": 0.16075636446475983, + "learning_rate": 3.272753406459106e-05, + "loss": 0.4091, "step": 58010 }, { - "epoch": 2.04, - "learning_rate": 3.348500484361846e-05, - "loss": 0.2644, + "epoch": 2.090856669189462, + "grad_norm": 0.21720968186855316, + "learning_rate": 3.272475878193653e-05, + "loss": 0.404, "step": 58015 }, { - "epoch": 2.04, - "learning_rate": 3.348232516141857e-05, - "loss": 0.2656, + "epoch": 2.0910368688506864, + "grad_norm": 0.19920498132705688, + "learning_rate": 3.2721983394033645e-05, + "loss": 0.39, "step": 58020 }, { - "epoch": 2.04, - "learning_rate": 3.347964536908128e-05, - "loss": 0.2686, + "epoch": 2.091217068511911, + "grad_norm": 0.17115192115306854, + "learning_rate": 3.271920790092024e-05, + "loss": 0.398, "step": 58025 }, { - "epoch": 2.04, - "learning_rate": 3.347696546664137e-05, - "loss": 0.2754, + "epoch": 2.091397268173136, + "grad_norm": 0.15494772791862488, + "learning_rate": 3.2716432302634116e-05, + "loss": 0.4062, "step": 58030 }, { - "epoch": 2.04, - "learning_rate": 3.347428545413366e-05, - "loss": 0.2692, + "epoch": 2.0915774678343606, + "grad_norm": 0.1952543705701828, + "learning_rate": 3.2713656599213094e-05, + "loss": 0.4006, "step": 58035 }, { - "epoch": 2.04, - "learning_rate": 3.3471605331592935e-05, - "loss": 0.2756, + "epoch": 2.091757667495585, + "grad_norm": 0.2297617346048355, + "learning_rate": 3.2710880790695e-05, + "loss": 0.393, "step": 58040 }, { - "epoch": 2.04, - "learning_rate": 3.346892509905401e-05, - "loss": 0.2624, + "epoch": 2.0919378671568096, + "grad_norm": 0.18533745408058167, + "learning_rate": 3.2708104877117644e-05, + "loss": 0.375, "step": 58045 }, { - "epoch": 2.04, - "learning_rate": 3.3466244756551656e-05, - "loss": 0.2684, + "epoch": 2.0921180668180344, + "grad_norm": 0.19725355505943298, + "learning_rate": 3.270532885851886e-05, + "loss": 0.3976, "step": 58050 }, { - "epoch": 2.04, - "learning_rate": 3.34635643041207e-05, - "loss": 0.2734, + "epoch": 2.092298266479259, + "grad_norm": 0.2088615596294403, + "learning_rate": 3.2702552734936454e-05, + "loss": 0.423, "step": 58055 }, { - "epoch": 2.04, - "learning_rate": 3.346088374179593e-05, - "loss": 0.2706, + "epoch": 2.092478466140484, + "grad_norm": 0.1870979368686676, + "learning_rate": 3.2699776506408266e-05, + "loss": 0.3923, "step": 58060 }, { - "epoch": 2.04, - "learning_rate": 3.345820306961218e-05, - "loss": 0.2608, + "epoch": 2.092658665801708, + "grad_norm": 0.18204568326473236, + "learning_rate": 3.2697000172972106e-05, + "loss": 0.3888, "step": 58065 }, { - "epoch": 2.04, - "learning_rate": 3.345552228760422e-05, - "loss": 0.2507, + "epoch": 2.092838865462933, + "grad_norm": 0.16376551985740662, + "learning_rate": 3.2694223734665806e-05, + "loss": 0.3746, "step": 58070 }, { - "epoch": 2.04, - "learning_rate": 3.3452841395806887e-05, - "loss": 0.2956, + "epoch": 2.0930190651241576, + "grad_norm": 0.21093137562274933, + "learning_rate": 3.26914471915272e-05, + "loss": 0.4078, "step": 58075 }, { - "epoch": 2.04, - "learning_rate": 3.345016039425497e-05, - "loss": 0.2717, + "epoch": 2.0931992647853823, + "grad_norm": 0.199448361992836, + "learning_rate": 3.2688670543594116e-05, + "loss": 
0.3742, "step": 58080 }, { - "epoch": 2.04, - "learning_rate": 3.34474792829833e-05, - "loss": 0.2607, + "epoch": 2.093379464446607, + "grad_norm": 0.21736840903759003, + "learning_rate": 3.268589379090439e-05, + "loss": 0.3834, "step": 58085 }, { - "epoch": 2.04, - "learning_rate": 3.3444798062026674e-05, - "loss": 0.2719, + "epoch": 2.0935596641078313, + "grad_norm": 0.22230195999145508, + "learning_rate": 3.2683116933495844e-05, + "loss": 0.4269, "step": 58090 }, { - "epoch": 2.04, - "learning_rate": 3.344211673141991e-05, - "loss": 0.2629, + "epoch": 2.093739863769056, + "grad_norm": 0.20408831536769867, + "learning_rate": 3.2680339971406325e-05, + "loss": 0.4019, "step": 58095 }, { - "epoch": 2.04, - "learning_rate": 3.3439435291197824e-05, - "loss": 0.2764, + "epoch": 2.093920063430281, + "grad_norm": 0.25087887048721313, + "learning_rate": 3.267756290467365e-05, + "loss": 0.4346, "step": 58100 }, { - "epoch": 2.04, - "learning_rate": 3.343675374139523e-05, - "loss": 0.2804, + "epoch": 2.0941002630915055, + "grad_norm": 0.2030712068080902, + "learning_rate": 3.267478573333567e-05, + "loss": 0.3983, "step": 58105 }, { - "epoch": 2.04, - "learning_rate": 3.343407208204695e-05, - "loss": 0.2788, + "epoch": 2.09428046275273, + "grad_norm": 0.24952729046344757, + "learning_rate": 3.267200845743022e-05, + "loss": 0.3967, "step": 58110 }, { - "epoch": 2.04, - "learning_rate": 3.34313903131878e-05, - "loss": 0.2597, + "epoch": 2.0944606624139546, + "grad_norm": 0.16245847940444946, + "learning_rate": 3.2669231076995146e-05, + "loss": 0.3667, "step": 58115 }, { - "epoch": 2.04, - "learning_rate": 3.342870843485261e-05, - "loss": 0.2737, + "epoch": 2.0946408620751793, + "grad_norm": 0.22543512284755707, + "learning_rate": 3.266645359206827e-05, + "loss": 0.3892, "step": 58120 }, { - "epoch": 2.04, - "learning_rate": 3.3426026447076186e-05, - "loss": 0.2646, + "epoch": 2.094821061736404, + "grad_norm": 0.1827520877122879, + "learning_rate": 3.266367600268746e-05, + "loss": 0.3929, "step": 58125 }, { - "epoch": 2.05, - "learning_rate": 3.342334434989337e-05, - "loss": 0.2729, + "epoch": 2.0950012613976288, + "grad_norm": 0.20801417529582977, + "learning_rate": 3.2660898308890546e-05, + "loss": 0.3763, "step": 58130 }, { - "epoch": 2.05, - "learning_rate": 3.3420662143338974e-05, - "loss": 0.2923, + "epoch": 2.095181461058853, + "grad_norm": 0.19347546994686127, + "learning_rate": 3.265812051071537e-05, + "loss": 0.3982, "step": 58135 }, { - "epoch": 2.05, - "learning_rate": 3.341797982744783e-05, - "loss": 0.2681, + "epoch": 2.095361660720078, + "grad_norm": 0.19839556515216827, + "learning_rate": 3.265534260819979e-05, + "loss": 0.4412, "step": 58140 }, { - "epoch": 2.05, - "learning_rate": 3.3415297402254764e-05, - "loss": 0.2529, + "epoch": 2.0955418603813025, + "grad_norm": 0.20302700996398926, + "learning_rate": 3.265256460138165e-05, + "loss": 0.4398, "step": 58145 }, { - "epoch": 2.05, - "learning_rate": 3.341261486779462e-05, - "loss": 0.2796, + "epoch": 2.0957220600425273, + "grad_norm": 0.19693662226200104, + "learning_rate": 3.2649786490298796e-05, + "loss": 0.4202, "step": 58150 }, { - "epoch": 2.05, - "learning_rate": 3.3409932224102205e-05, - "loss": 0.2909, + "epoch": 2.0959022597037515, + "grad_norm": 0.17689436674118042, + "learning_rate": 3.2647008274989087e-05, + "loss": 0.4202, "step": 58155 }, { - "epoch": 2.05, - "learning_rate": 3.340724947121236e-05, - "loss": 0.2871, + "epoch": 2.0960824593649763, + "grad_norm": 0.20521299540996552, + "learning_rate": 
3.264422995549037e-05, + "loss": 0.3863, "step": 58160 }, { - "epoch": 2.05, - "learning_rate": 3.340456660915992e-05, - "loss": 0.2733, + "epoch": 2.096262659026201, + "grad_norm": 0.16490758955478668, + "learning_rate": 3.26414515318405e-05, + "loss": 0.372, "step": 58165 }, { - "epoch": 2.05, - "learning_rate": 3.340188363797973e-05, - "loss": 0.2731, + "epoch": 2.0964428586874257, + "grad_norm": 0.19816817343235016, + "learning_rate": 3.263867300407732e-05, + "loss": 0.4008, "step": 58170 }, { - "epoch": 2.05, - "learning_rate": 3.339920055770661e-05, - "loss": 0.2587, + "epoch": 2.0966230583486505, + "grad_norm": 0.19456301629543304, + "learning_rate": 3.263589437223871e-05, + "loss": 0.3828, "step": 58175 }, { - "epoch": 2.05, - "learning_rate": 3.339651736837542e-05, - "loss": 0.2766, + "epoch": 2.0968032580098748, + "grad_norm": 0.20074453949928284, + "learning_rate": 3.263311563636252e-05, + "loss": 0.3943, "step": 58180 }, { - "epoch": 2.05, - "learning_rate": 3.3393834070020966e-05, - "loss": 0.2629, + "epoch": 2.0969834576710995, + "grad_norm": 0.21630464494228363, + "learning_rate": 3.26303367964866e-05, + "loss": 0.3669, "step": 58185 }, { - "epoch": 2.05, - "learning_rate": 3.339115066267812e-05, - "loss": 0.2745, + "epoch": 2.0971636573323242, + "grad_norm": 0.17707021534442902, + "learning_rate": 3.262755785264882e-05, + "loss": 0.3598, "step": 58190 }, { - "epoch": 2.05, - "learning_rate": 3.338846714638171e-05, - "loss": 0.2617, + "epoch": 2.097343856993549, + "grad_norm": 0.1683293730020523, + "learning_rate": 3.262477880488705e-05, + "loss": 0.3759, "step": 58195 }, { - "epoch": 2.05, - "learning_rate": 3.338578352116659e-05, - "loss": 0.2845, + "epoch": 2.0975240566547733, + "grad_norm": 0.20540519058704376, + "learning_rate": 3.262199965323913e-05, + "loss": 0.4094, "step": 58200 }, { - "epoch": 2.05, - "learning_rate": 3.338309978706758e-05, - "loss": 0.2641, + "epoch": 2.097704256315998, + "grad_norm": 0.17812873423099518, + "learning_rate": 3.261922039774295e-05, + "loss": 0.401, "step": 58205 }, { - "epoch": 2.05, - "learning_rate": 3.338041594411956e-05, - "loss": 0.3062, + "epoch": 2.0978844559772227, + "grad_norm": 0.21470516920089722, + "learning_rate": 3.261644103843637e-05, + "loss": 0.3805, "step": 58210 }, { - "epoch": 2.05, - "learning_rate": 3.3377731992357355e-05, - "loss": 0.2742, + "epoch": 2.0980646556384475, + "grad_norm": 0.23254641890525818, + "learning_rate": 3.261366157535725e-05, + "loss": 0.4054, "step": 58215 }, { - "epoch": 2.05, - "learning_rate": 3.3375047931815816e-05, - "loss": 0.2501, + "epoch": 2.098244855299672, + "grad_norm": 0.2531212568283081, + "learning_rate": 3.2610882008543466e-05, + "loss": 0.3981, "step": 58220 }, { - "epoch": 2.05, - "learning_rate": 3.3372363762529815e-05, - "loss": 0.2379, + "epoch": 2.0984250549608965, + "grad_norm": 0.2381640523672104, + "learning_rate": 3.260810233803289e-05, + "loss": 0.4355, "step": 58225 }, { - "epoch": 2.05, - "learning_rate": 3.3369679484534175e-05, - "loss": 0.2645, + "epoch": 2.098605254622121, + "grad_norm": 0.1578020453453064, + "learning_rate": 3.260532256386338e-05, + "loss": 0.3769, "step": 58230 }, { - "epoch": 2.05, - "learning_rate": 3.336699509786377e-05, - "loss": 0.2771, + "epoch": 2.098785454283346, + "grad_norm": 0.23701877892017365, + "learning_rate": 3.2602542686072835e-05, + "loss": 0.4272, "step": 58235 }, { - "epoch": 2.05, - "learning_rate": 3.336431060255344e-05, - "loss": 0.2872, + "epoch": 2.0989656539445707, + "grad_norm": 0.2273239940404892, + 
"learning_rate": 3.259976270469912e-05, + "loss": 0.4038, "step": 58240 }, { - "epoch": 2.05, - "learning_rate": 3.336162599863805e-05, - "loss": 0.2668, + "epoch": 2.0991458536057954, + "grad_norm": 0.1812596321105957, + "learning_rate": 3.25969826197801e-05, + "loss": 0.3835, "step": 58245 }, { - "epoch": 2.05, - "learning_rate": 3.3358941286152465e-05, - "loss": 0.2605, + "epoch": 2.0993260532670197, + "grad_norm": 0.21768318116664886, + "learning_rate": 3.259420243135367e-05, + "loss": 0.4035, "step": 58250 }, { - "epoch": 2.05, - "learning_rate": 3.335625646513153e-05, - "loss": 0.2445, + "epoch": 2.0995062529282444, + "grad_norm": 0.16613122820854187, + "learning_rate": 3.25914221394577e-05, + "loss": 0.3626, "step": 58255 }, { - "epoch": 2.05, - "learning_rate": 3.335357153561011e-05, - "loss": 0.2861, + "epoch": 2.099686452589469, + "grad_norm": 0.21264012157917023, + "learning_rate": 3.258864174413008e-05, + "loss": 0.4092, "step": 58260 }, { - "epoch": 2.05, - "learning_rate": 3.335088649762308e-05, - "loss": 0.2569, + "epoch": 2.099866652250694, + "grad_norm": 0.22502678632736206, + "learning_rate": 3.2585861245408676e-05, + "loss": 0.4298, "step": 58265 }, { - "epoch": 2.05, - "learning_rate": 3.3348201351205274e-05, - "loss": 0.2772, + "epoch": 2.100046851911918, + "grad_norm": 0.17495203018188477, + "learning_rate": 3.2583080643331385e-05, + "loss": 0.3757, "step": 58270 }, { - "epoch": 2.05, - "learning_rate": 3.334551609639159e-05, - "loss": 0.2614, + "epoch": 2.100227051573143, + "grad_norm": 0.16976486146450043, + "learning_rate": 3.2580299937936084e-05, + "loss": 0.4213, "step": 58275 }, { - "epoch": 2.05, - "learning_rate": 3.334283073321687e-05, - "loss": 0.2743, + "epoch": 2.1004072512343677, + "grad_norm": 0.25477150082588196, + "learning_rate": 3.2577519129260666e-05, + "loss": 0.4203, "step": 58280 }, { - "epoch": 2.05, - "learning_rate": 3.3340145261716e-05, - "loss": 0.3164, + "epoch": 2.1005874508955924, + "grad_norm": 0.27590665221214294, + "learning_rate": 3.257473821734302e-05, + "loss": 0.4257, "step": 58285 }, { - "epoch": 2.05, - "learning_rate": 3.3337459681923835e-05, - "loss": 0.3013, + "epoch": 2.100767650556817, + "grad_norm": 0.18008361756801605, + "learning_rate": 3.257195720222103e-05, + "loss": 0.3971, "step": 58290 }, { - "epoch": 2.05, - "learning_rate": 3.3334773993875254e-05, - "loss": 0.2843, + "epoch": 2.1009478502180414, + "grad_norm": 0.22707924246788025, + "learning_rate": 3.256917608393259e-05, + "loss": 0.3946, "step": 58295 }, { - "epoch": 2.05, - "learning_rate": 3.3332088197605115e-05, - "loss": 0.2746, + "epoch": 2.101128049879266, + "grad_norm": 0.18620999157428741, + "learning_rate": 3.2566394862515596e-05, + "loss": 0.3527, "step": 58300 }, { - "epoch": 2.05, - "learning_rate": 3.332940229314832e-05, - "loss": 0.2719, + "epoch": 2.101308249540491, + "grad_norm": 0.20725572109222412, + "learning_rate": 3.256361353800793e-05, + "loss": 0.4064, "step": 58305 }, { - "epoch": 2.05, - "learning_rate": 3.33267162805397e-05, - "loss": 0.2672, + "epoch": 2.1014884492017156, + "grad_norm": 0.2142266035079956, + "learning_rate": 3.2560832110447495e-05, + "loss": 0.4219, "step": 58310 }, { - "epoch": 2.05, - "learning_rate": 3.332403015981418e-05, - "loss": 0.2518, + "epoch": 2.1016686488629404, + "grad_norm": 0.2480008453130722, + "learning_rate": 3.255805057987218e-05, + "loss": 0.4156, "step": 58315 }, { - "epoch": 2.05, - "learning_rate": 3.332134393100661e-05, - "loss": 0.2824, + "epoch": 2.1018488485241646, + "grad_norm": 
0.22101950645446777, + "learning_rate": 3.255526894631991e-05, + "loss": 0.3769, "step": 58320 }, { - "epoch": 2.05, - "learning_rate": 3.331865759415186e-05, - "loss": 0.2766, + "epoch": 2.1020290481853894, + "grad_norm": 0.1711684614419937, + "learning_rate": 3.255248720982854e-05, + "loss": 0.4069, "step": 58325 }, { - "epoch": 2.05, - "learning_rate": 3.331597114928484e-05, - "loss": 0.263, + "epoch": 2.102209247846614, + "grad_norm": 0.2133340984582901, + "learning_rate": 3.254970537043601e-05, + "loss": 0.4075, "step": 58330 }, { - "epoch": 2.05, - "learning_rate": 3.33132845964404e-05, - "loss": 0.2864, + "epoch": 2.102389447507839, + "grad_norm": 0.27530649304389954, + "learning_rate": 3.2546923428180184e-05, + "loss": 0.4109, "step": 58335 }, { - "epoch": 2.05, - "learning_rate": 3.331059793565345e-05, - "loss": 0.3015, + "epoch": 2.102569647169063, + "grad_norm": 0.19913306832313538, + "learning_rate": 3.2544141383099014e-05, + "loss": 0.4136, "step": 58340 }, { - "epoch": 2.05, - "learning_rate": 3.330791116695886e-05, - "loss": 0.2648, + "epoch": 2.102749846830288, + "grad_norm": 0.2158804088830948, + "learning_rate": 3.254135923523037e-05, + "loss": 0.3929, "step": 58345 }, { - "epoch": 2.05, - "learning_rate": 3.330522429039152e-05, - "loss": 0.2498, + "epoch": 2.1029300464915126, + "grad_norm": 0.23708494007587433, + "learning_rate": 3.253857698461216e-05, + "loss": 0.4019, "step": 58350 }, { - "epoch": 2.05, - "learning_rate": 3.330253730598631e-05, - "loss": 0.2872, + "epoch": 2.1031102461527373, + "grad_norm": 0.20912696421146393, + "learning_rate": 3.25357946312823e-05, + "loss": 0.4115, "step": 58355 }, { - "epoch": 2.05, - "learning_rate": 3.3299850213778125e-05, - "loss": 0.281, + "epoch": 2.103290445813962, + "grad_norm": 0.1815493106842041, + "learning_rate": 3.25330121752787e-05, + "loss": 0.4154, "step": 58360 }, { - "epoch": 2.05, - "learning_rate": 3.329716301380185e-05, - "loss": 0.2943, + "epoch": 2.1034706454751864, + "grad_norm": 0.2595524489879608, + "learning_rate": 3.253022961663927e-05, + "loss": 0.4105, "step": 58365 }, { - "epoch": 2.05, - "learning_rate": 3.32944757060924e-05, - "loss": 0.2771, + "epoch": 2.103650845136411, + "grad_norm": 0.1517713963985443, + "learning_rate": 3.252744695540191e-05, + "loss": 0.4013, "step": 58370 }, { - "epoch": 2.05, - "learning_rate": 3.329178829068463e-05, - "loss": 0.277, + "epoch": 2.103831044797636, + "grad_norm": 0.1964835673570633, + "learning_rate": 3.252466419160455e-05, + "loss": 0.4256, "step": 58375 }, { - "epoch": 2.05, - "learning_rate": 3.3289100767613464e-05, - "loss": 0.2565, + "epoch": 2.1040112444588606, + "grad_norm": 0.18157154321670532, + "learning_rate": 3.25218813252851e-05, + "loss": 0.3714, "step": 58380 }, { - "epoch": 2.05, - "learning_rate": 3.328641313691378e-05, - "loss": 0.2783, + "epoch": 2.104191444120085, + "grad_norm": 0.18907707929611206, + "learning_rate": 3.251909835648147e-05, + "loss": 0.4031, "step": 58385 }, { - "epoch": 2.05, - "learning_rate": 3.328372539862049e-05, - "loss": 0.2651, + "epoch": 2.1043716437813096, + "grad_norm": 0.17940135300159454, + "learning_rate": 3.2516315285231576e-05, + "loss": 0.414, "step": 58390 }, { - "epoch": 2.05, - "learning_rate": 3.328103755276848e-05, - "loss": 0.2727, + "epoch": 2.1045518434425343, + "grad_norm": 0.18015572428703308, + "learning_rate": 3.251353211157334e-05, + "loss": 0.3899, "step": 58395 }, { - "epoch": 2.05, - "learning_rate": 3.3278349599392664e-05, - "loss": 0.2875, + "epoch": 2.104732043103759, + "grad_norm": 
0.20908917486667633, + "learning_rate": 3.25107488355447e-05, + "loss": 0.4163, "step": 58400 }, { - "epoch": 2.05, - "learning_rate": 3.3275661538527926e-05, - "loss": 0.2729, + "epoch": 2.104912242764984, + "grad_norm": 0.1619907170534134, + "learning_rate": 3.2507965457183545e-05, + "loss": 0.4148, "step": 58405 }, { - "epoch": 2.06, - "learning_rate": 3.327297337020917e-05, - "loss": 0.2701, + "epoch": 2.105092442426208, + "grad_norm": 0.1768513172864914, + "learning_rate": 3.2505181976527815e-05, + "loss": 0.4, "step": 58410 }, { - "epoch": 2.06, - "learning_rate": 3.327028509447132e-05, - "loss": 0.2621, + "epoch": 2.105272642087433, + "grad_norm": 0.17166374623775482, + "learning_rate": 3.2502398393615435e-05, + "loss": 0.394, "step": 58415 }, { - "epoch": 2.06, - "learning_rate": 3.3267596711349256e-05, - "loss": 0.2715, + "epoch": 2.1054528417486575, + "grad_norm": 0.21615466475486755, + "learning_rate": 3.249961470848433e-05, + "loss": 0.3977, "step": 58420 }, { - "epoch": 2.06, - "learning_rate": 3.3264908220877904e-05, - "loss": 0.2651, + "epoch": 2.1056330414098823, + "grad_norm": 0.1769254207611084, + "learning_rate": 3.249683092117243e-05, + "loss": 0.3984, "step": 58425 }, { - "epoch": 2.06, - "learning_rate": 3.3262219623092174e-05, - "loss": 0.2773, + "epoch": 2.1058132410711066, + "grad_norm": 0.19884678721427917, + "learning_rate": 3.249404703171766e-05, + "loss": 0.3915, "step": 58430 }, { - "epoch": 2.06, - "learning_rate": 3.325953091802696e-05, - "loss": 0.2623, + "epoch": 2.1059934407323313, + "grad_norm": 0.186051145195961, + "learning_rate": 3.249126304015795e-05, + "loss": 0.388, "step": 58435 }, { - "epoch": 2.06, - "learning_rate": 3.325684210571717e-05, - "loss": 0.267, + "epoch": 2.106173640393556, + "grad_norm": 0.16118311882019043, + "learning_rate": 3.248847894653122e-05, + "loss": 0.3806, "step": 58440 }, { - "epoch": 2.06, - "learning_rate": 3.325415318619773e-05, - "loss": 0.2918, + "epoch": 2.1063538400547808, + "grad_norm": 0.24056154489517212, + "learning_rate": 3.248569475087544e-05, + "loss": 0.4179, "step": 58445 }, { - "epoch": 2.06, - "learning_rate": 3.325146415950356e-05, - "loss": 0.2827, + "epoch": 2.1065340397160055, + "grad_norm": 0.204376682639122, + "learning_rate": 3.24829104532285e-05, + "loss": 0.3641, "step": 58450 }, { - "epoch": 2.06, - "learning_rate": 3.324877502566957e-05, - "loss": 0.2751, + "epoch": 2.10671423937723, + "grad_norm": 0.2513475716114044, + "learning_rate": 3.248012605362836e-05, + "loss": 0.3944, "step": 58455 }, { - "epoch": 2.06, - "learning_rate": 3.324608578473066e-05, - "loss": 0.251, + "epoch": 2.1068944390384545, + "grad_norm": 0.24770568311214447, + "learning_rate": 3.247734155211294e-05, + "loss": 0.3903, "step": 58460 }, { - "epoch": 2.06, - "learning_rate": 3.324339643672176e-05, - "loss": 0.3062, + "epoch": 2.1070746386996793, + "grad_norm": 0.28164035081863403, + "learning_rate": 3.2474556948720197e-05, + "loss": 0.3925, "step": 58465 }, { - "epoch": 2.06, - "learning_rate": 3.324070698167779e-05, - "loss": 0.259, + "epoch": 2.107254838360904, + "grad_norm": 0.2520608603954315, + "learning_rate": 3.2471772243488064e-05, + "loss": 0.3713, "step": 58470 }, { - "epoch": 2.06, - "learning_rate": 3.3238017419633684e-05, - "loss": 0.2959, + "epoch": 2.1074350380221287, + "grad_norm": 0.18132475018501282, + "learning_rate": 3.2468987436454476e-05, + "loss": 0.4178, "step": 58475 }, { - "epoch": 2.06, - "learning_rate": 3.323532775062434e-05, - "loss": 0.2834, + "epoch": 2.107615237683353, + "grad_norm": 
0.20063307881355286, + "learning_rate": 3.246620252765739e-05, + "loss": 0.417, "step": 58480 }, { - "epoch": 2.06, - "learning_rate": 3.32326379746847e-05, - "loss": 0.2654, + "epoch": 2.1077954373445777, + "grad_norm": 0.227056086063385, + "learning_rate": 3.2463417517134734e-05, + "loss": 0.4046, "step": 58485 }, { - "epoch": 2.06, - "learning_rate": 3.3229948091849674e-05, - "loss": 0.2705, + "epoch": 2.1079756370058025, + "grad_norm": 0.16620944440364838, + "learning_rate": 3.246063240492445e-05, + "loss": 0.3735, "step": 58490 }, { - "epoch": 2.06, - "learning_rate": 3.32272581021542e-05, - "loss": 0.2969, + "epoch": 2.108155836667027, + "grad_norm": 0.18251557648181915, + "learning_rate": 3.2457847191064505e-05, + "loss": 0.3888, "step": 58495 }, { - "epoch": 2.06, - "learning_rate": 3.3224568005633195e-05, - "loss": 0.2682, + "epoch": 2.1083360363282515, + "grad_norm": 0.20727889239788055, + "learning_rate": 3.245506187559283e-05, + "loss": 0.4018, "step": 58500 }, { - "epoch": 2.06, - "eval_loss": 0.2701320946216583, - "eval_runtime": 10.5303, - "eval_samples_per_second": 9.496, - "eval_steps_per_second": 9.496, + "epoch": 2.1083360363282515, + "eval_loss": 0.4373342990875244, + "eval_runtime": 3.5409, + "eval_samples_per_second": 28.242, + "eval_steps_per_second": 7.06, "step": 58500 }, { - "epoch": 2.06, - "learning_rate": 3.32218778023216e-05, - "loss": 0.2425, + "epoch": 2.1085162359894762, + "grad_norm": 0.2316305935382843, + "learning_rate": 3.245227645854739e-05, + "loss": 0.3647, "step": 58505 }, { - "epoch": 2.06, - "learning_rate": 3.321918749225434e-05, - "loss": 0.2702, + "epoch": 2.108696435650701, + "grad_norm": 0.22615645825862885, + "learning_rate": 3.244949093996612e-05, + "loss": 0.3822, "step": 58510 }, { - "epoch": 2.06, - "learning_rate": 3.321649707546635e-05, - "loss": 0.2658, + "epoch": 2.1088766353119257, + "grad_norm": 0.2182559221982956, + "learning_rate": 3.244670531988697e-05, + "loss": 0.3833, "step": 58515 }, { - "epoch": 2.06, - "learning_rate": 3.321380655199255e-05, - "loss": 0.2684, + "epoch": 2.1090568349731504, + "grad_norm": 0.19747740030288696, + "learning_rate": 3.244391959834791e-05, + "loss": 0.3731, "step": 58520 }, { - "epoch": 2.06, - "learning_rate": 3.3211115921867894e-05, - "loss": 0.2719, + "epoch": 2.1092370346343747, + "grad_norm": 0.165822371840477, + "learning_rate": 3.244113377538689e-05, + "loss": 0.3825, "step": 58525 }, { - "epoch": 2.06, - "learning_rate": 3.3208425185127305e-05, - "loss": 0.2789, + "epoch": 2.1094172342955995, + "grad_norm": 0.19013039767742157, + "learning_rate": 3.243834785104186e-05, + "loss": 0.3978, "step": 58530 }, { - "epoch": 2.06, - "learning_rate": 3.3205734341805725e-05, - "loss": 0.2787, + "epoch": 2.109597433956824, + "grad_norm": 0.16244149208068848, + "learning_rate": 3.243556182535077e-05, + "loss": 0.3966, "step": 58535 }, { - "epoch": 2.06, - "learning_rate": 3.3203043391938094e-05, - "loss": 0.2594, + "epoch": 2.109777633618049, + "grad_norm": 0.19332407414913177, + "learning_rate": 3.2432775698351605e-05, + "loss": 0.3793, "step": 58540 }, { - "epoch": 2.06, - "learning_rate": 3.320035233555935e-05, - "loss": 0.2918, + "epoch": 2.109957833279273, + "grad_norm": 0.18343211710453033, + "learning_rate": 3.242998947008231e-05, + "loss": 0.4015, "step": 58545 }, { - "epoch": 2.06, - "learning_rate": 3.319766117270444e-05, - "loss": 0.2422, + "epoch": 2.110138032940498, + "grad_norm": 0.17457517981529236, + "learning_rate": 3.242720314058084e-05, + "loss": 0.3965, "step": 58550 }, { - 
"epoch": 2.06, - "learning_rate": 3.3194969903408294e-05, - "loss": 0.281, + "epoch": 2.1103182326017227, + "grad_norm": 0.1578681319952011, + "learning_rate": 3.2424416709885165e-05, + "loss": 0.4025, "step": 58555 }, { - "epoch": 2.06, - "learning_rate": 3.319227852770587e-05, - "loss": 0.2878, + "epoch": 2.1104984322629474, + "grad_norm": 0.20855949819087982, + "learning_rate": 3.242163017803325e-05, + "loss": 0.4154, "step": 58560 }, { - "epoch": 2.06, - "learning_rate": 3.31895870456321e-05, - "loss": 0.2678, + "epoch": 2.110678631924172, + "grad_norm": 0.15797370672225952, + "learning_rate": 3.2418843545063065e-05, + "loss": 0.3684, "step": 58565 }, { - "epoch": 2.06, - "learning_rate": 3.318689545722195e-05, - "loss": 0.2968, + "epoch": 2.1108588315853964, + "grad_norm": 0.22277599573135376, + "learning_rate": 3.241605681101256e-05, + "loss": 0.401, "step": 58570 }, { - "epoch": 2.06, - "learning_rate": 3.318420376251035e-05, - "loss": 0.2631, + "epoch": 2.111039031246621, + "grad_norm": 0.19849325716495514, + "learning_rate": 3.2413269975919736e-05, + "loss": 0.4138, "step": 58575 }, { - "epoch": 2.06, - "learning_rate": 3.318151196153226e-05, - "loss": 0.2765, + "epoch": 2.111219230907846, + "grad_norm": 0.22037936747074127, + "learning_rate": 3.2410483039822527e-05, + "loss": 0.3959, "step": 58580 }, { - "epoch": 2.06, - "learning_rate": 3.317882005432263e-05, - "loss": 0.2809, + "epoch": 2.1113994305690706, + "grad_norm": 0.1882595717906952, + "learning_rate": 3.2407696002758936e-05, + "loss": 0.3915, "step": 58585 }, { - "epoch": 2.06, - "learning_rate": 3.317612804091641e-05, - "loss": 0.2923, + "epoch": 2.1115796302302954, + "grad_norm": 0.20610591769218445, + "learning_rate": 3.240490886476691e-05, + "loss": 0.4286, "step": 58590 }, { - "epoch": 2.06, - "learning_rate": 3.317343592134856e-05, - "loss": 0.2739, + "epoch": 2.1117598298915197, + "grad_norm": 0.22938185930252075, + "learning_rate": 3.240212162588444e-05, + "loss": 0.3866, "step": 58595 }, { - "epoch": 2.06, - "learning_rate": 3.3170743695654026e-05, - "loss": 0.2769, + "epoch": 2.1119400295527444, + "grad_norm": 0.20700225234031677, + "learning_rate": 3.2399334286149495e-05, + "loss": 0.3753, "step": 58600 }, { - "epoch": 2.06, - "learning_rate": 3.3168051363867774e-05, - "loss": 0.2654, + "epoch": 2.112120229213969, + "grad_norm": 0.19341452419757843, + "learning_rate": 3.239654684560005e-05, + "loss": 0.4153, "step": 58605 }, { - "epoch": 2.06, - "learning_rate": 3.316535892602476e-05, - "loss": 0.2636, + "epoch": 2.112300428875194, + "grad_norm": 0.20919573307037354, + "learning_rate": 3.23937593042741e-05, + "loss": 0.4088, "step": 58610 }, { - "epoch": 2.06, - "learning_rate": 3.3162666382159935e-05, - "loss": 0.264, + "epoch": 2.112480628536418, + "grad_norm": 0.20595782995224, + "learning_rate": 3.239097166220959e-05, + "loss": 0.3805, "step": 58615 }, { - "epoch": 2.06, - "learning_rate": 3.315997373230828e-05, - "loss": 0.3044, + "epoch": 2.112660828197643, + "grad_norm": 0.18179921805858612, + "learning_rate": 3.238818391944453e-05, + "loss": 0.4064, "step": 58620 }, { - "epoch": 2.06, - "learning_rate": 3.3157280976504726e-05, - "loss": 0.2707, + "epoch": 2.1128410278588676, + "grad_norm": 0.1617589145898819, + "learning_rate": 3.2385396076016896e-05, + "loss": 0.395, "step": 58625 }, { - "epoch": 2.06, - "learning_rate": 3.315458811478426e-05, - "loss": 0.315, + "epoch": 2.1130212275200924, + "grad_norm": 0.195877343416214, + "learning_rate": 3.2382608131964676e-05, + "loss": 0.4229, "step": 58630 }, 
{ - "epoch": 2.06, - "learning_rate": 3.315189514718184e-05, - "loss": 0.2899, + "epoch": 2.113201427181317, + "grad_norm": 0.20379838347434998, + "learning_rate": 3.2379820087325844e-05, + "loss": 0.4083, "step": 58635 }, { - "epoch": 2.06, - "learning_rate": 3.3149202073732446e-05, - "loss": 0.2767, + "epoch": 2.1133816268425414, + "grad_norm": 0.2131894826889038, + "learning_rate": 3.237703194213839e-05, + "loss": 0.3798, "step": 58640 }, { - "epoch": 2.06, - "learning_rate": 3.314650889447102e-05, - "loss": 0.2539, + "epoch": 2.113561826503766, + "grad_norm": 0.23146046698093414, + "learning_rate": 3.2374243696440305e-05, + "loss": 0.4218, "step": 58645 }, { - "epoch": 2.06, - "learning_rate": 3.314381560943255e-05, - "loss": 0.2922, + "epoch": 2.113742026164991, + "grad_norm": 0.1999361366033554, + "learning_rate": 3.2371455350269574e-05, + "loss": 0.4121, "step": 58650 }, { - "epoch": 2.06, - "learning_rate": 3.3141122218652e-05, - "loss": 0.2856, + "epoch": 2.1139222258262156, + "grad_norm": 0.16494305431842804, + "learning_rate": 3.23686669036642e-05, + "loss": 0.392, "step": 58655 }, { - "epoch": 2.06, - "learning_rate": 3.313842872216434e-05, - "loss": 0.2456, + "epoch": 2.11410242548744, + "grad_norm": 0.24381648004055023, + "learning_rate": 3.236587835666216e-05, + "loss": 0.3991, "step": 58660 }, { - "epoch": 2.06, - "learning_rate": 3.313573512000455e-05, - "loss": 0.2928, + "epoch": 2.1142826251486646, + "grad_norm": 0.15115749835968018, + "learning_rate": 3.236308970930145e-05, + "loss": 0.3447, "step": 58665 }, { - "epoch": 2.06, - "learning_rate": 3.3133041412207596e-05, - "loss": 0.2632, + "epoch": 2.1144628248098893, + "grad_norm": 0.19442135095596313, + "learning_rate": 3.236030096162008e-05, + "loss": 0.3937, "step": 58670 }, { - "epoch": 2.06, - "learning_rate": 3.313034759880847e-05, - "loss": 0.2838, + "epoch": 2.114643024471114, + "grad_norm": 0.23755992949008942, + "learning_rate": 3.235751211365602e-05, + "loss": 0.381, "step": 58675 }, { - "epoch": 2.06, - "learning_rate": 3.312765367984213e-05, - "loss": 0.2953, + "epoch": 2.114823224132339, + "grad_norm": 0.19326242804527283, + "learning_rate": 3.23547231654473e-05, + "loss": 0.3761, "step": 58680 }, { - "epoch": 2.06, - "learning_rate": 3.3124959655343564e-05, - "loss": 0.264, + "epoch": 2.115003423793563, + "grad_norm": 0.2071000635623932, + "learning_rate": 3.2351934117031877e-05, + "loss": 0.4316, "step": 58685 }, { - "epoch": 2.06, - "learning_rate": 3.3122265525347754e-05, - "loss": 0.2863, + "epoch": 2.115183623454788, + "grad_norm": 0.17056724429130554, + "learning_rate": 3.23491449684478e-05, + "loss": 0.3951, "step": 58690 }, { - "epoch": 2.07, - "learning_rate": 3.3119571289889675e-05, - "loss": 0.2702, + "epoch": 2.1153638231160126, + "grad_norm": 0.172649085521698, + "learning_rate": 3.234635571973303e-05, + "loss": 0.3895, "step": 58695 }, { - "epoch": 2.07, - "learning_rate": 3.311687694900432e-05, - "loss": 0.303, + "epoch": 2.1155440227772373, + "grad_norm": 0.21372824907302856, + "learning_rate": 3.2343566370925594e-05, + "loss": 0.398, "step": 58700 }, { - "epoch": 2.07, - "learning_rate": 3.3114182502726664e-05, - "loss": 0.2653, + "epoch": 2.1157242224384616, + "grad_norm": 0.23039843142032623, + "learning_rate": 3.234077692206347e-05, + "loss": 0.4055, "step": 58705 }, { - "epoch": 2.07, - "learning_rate": 3.31114879510917e-05, - "loss": 0.2366, + "epoch": 2.1159044220996863, + "grad_norm": 0.22688081860542297, + "learning_rate": 3.2337987373184704e-05, + "loss": 0.3998, "step": 58710 }, 
{ - "epoch": 2.07, - "learning_rate": 3.3108793294134405e-05, - "loss": 0.2524, + "epoch": 2.116084621760911, + "grad_norm": 0.15555809438228607, + "learning_rate": 3.233519772432727e-05, + "loss": 0.3709, "step": 58715 }, { - "epoch": 2.07, - "learning_rate": 3.310609853188977e-05, - "loss": 0.2888, + "epoch": 2.116264821422136, + "grad_norm": 0.22005966305732727, + "learning_rate": 3.233240797552919e-05, + "loss": 0.3701, "step": 58720 }, { - "epoch": 2.07, - "learning_rate": 3.3103403664392797e-05, - "loss": 0.2645, + "epoch": 2.1164450210833605, + "grad_norm": 0.21118348836898804, + "learning_rate": 3.232961812682847e-05, + "loss": 0.3888, "step": 58725 }, { - "epoch": 2.07, - "learning_rate": 3.310070869167846e-05, - "loss": 0.2613, + "epoch": 2.116625220744585, + "grad_norm": 0.18728987872600555, + "learning_rate": 3.2326828178263125e-05, + "loss": 0.4143, "step": 58730 }, { - "epoch": 2.07, - "learning_rate": 3.309801361378177e-05, - "loss": 0.2458, + "epoch": 2.1168054204058095, + "grad_norm": 0.19817517697811127, + "learning_rate": 3.2324038129871166e-05, + "loss": 0.4269, "step": 58735 }, { - "epoch": 2.07, - "learning_rate": 3.309531843073769e-05, - "loss": 0.2817, + "epoch": 2.1169856200670343, + "grad_norm": 0.19187265634536743, + "learning_rate": 3.232124798169059e-05, + "loss": 0.4259, "step": 58740 }, { - "epoch": 2.07, - "learning_rate": 3.309262314258125e-05, - "loss": 0.269, + "epoch": 2.117165819728259, + "grad_norm": 0.2186998724937439, + "learning_rate": 3.231845773375944e-05, + "loss": 0.4291, "step": 58745 }, { - "epoch": 2.07, - "learning_rate": 3.3089927749347426e-05, - "loss": 0.2837, + "epoch": 2.1173460193894837, + "grad_norm": 0.22712989151477814, + "learning_rate": 3.231566738611572e-05, + "loss": 0.3747, "step": 58750 }, { - "epoch": 2.07, - "learning_rate": 3.308723225107123e-05, - "loss": 0.2785, + "epoch": 2.117526219050708, + "grad_norm": 0.15697424113750458, + "learning_rate": 3.231287693879745e-05, + "loss": 0.3601, "step": 58755 }, { - "epoch": 2.07, - "learning_rate": 3.308453664778765e-05, - "loss": 0.2643, + "epoch": 2.1177064187119328, + "grad_norm": 0.23286950588226318, + "learning_rate": 3.231008639184265e-05, + "loss": 0.388, "step": 58760 }, { - "epoch": 2.07, - "learning_rate": 3.3081840939531684e-05, - "loss": 0.2622, + "epoch": 2.1178866183731575, + "grad_norm": 0.19460979104042053, + "learning_rate": 3.230729574528932e-05, + "loss": 0.4061, "step": 58765 }, { - "epoch": 2.07, - "learning_rate": 3.307914512633834e-05, - "loss": 0.2705, + "epoch": 2.1180668180343822, + "grad_norm": 0.1705007404088974, + "learning_rate": 3.230450499917552e-05, + "loss": 0.4024, "step": 58770 }, { - "epoch": 2.07, - "learning_rate": 3.3076449208242625e-05, - "loss": 0.2742, + "epoch": 2.1182470176956065, + "grad_norm": 0.20845447480678558, + "learning_rate": 3.2301714153539244e-05, + "loss": 0.4182, "step": 58775 }, { - "epoch": 2.07, - "learning_rate": 3.307375318527954e-05, - "loss": 0.2669, + "epoch": 2.1184272173568313, + "grad_norm": 0.18309171497821808, + "learning_rate": 3.2298923208418535e-05, + "loss": 0.3745, "step": 58780 }, { - "epoch": 2.07, - "learning_rate": 3.307105705748409e-05, - "loss": 0.2481, + "epoch": 2.118607417018056, + "grad_norm": 0.2173132747411728, + "learning_rate": 3.22961321638514e-05, + "loss": 0.397, "step": 58785 }, { - "epoch": 2.07, - "learning_rate": 3.306836082489128e-05, - "loss": 0.2822, + "epoch": 2.1187876166792807, + "grad_norm": 0.16794531047344208, + "learning_rate": 3.229334101987588e-05, + "loss": 0.3844, "step": 
58790 }, { - "epoch": 2.07, - "learning_rate": 3.3065664487536124e-05, - "loss": 0.2952, + "epoch": 2.1189678163405055, + "grad_norm": 0.21813854575157166, + "learning_rate": 3.229054977653001e-05, + "loss": 0.42, "step": 58795 }, { - "epoch": 2.07, - "learning_rate": 3.306296804545362e-05, - "loss": 0.248, + "epoch": 2.1191480160017298, + "grad_norm": 0.21310067176818848, + "learning_rate": 3.22877584338518e-05, + "loss": 0.413, "step": 58800 }, { - "epoch": 2.07, - "learning_rate": 3.306027149867881e-05, - "loss": 0.2751, + "epoch": 2.1193282156629545, + "grad_norm": 0.20644977688789368, + "learning_rate": 3.2284966991879295e-05, + "loss": 0.3717, "step": 58805 }, { - "epoch": 2.07, - "learning_rate": 3.305757484724667e-05, - "loss": 0.2852, + "epoch": 2.119508415324179, + "grad_norm": 0.2088310867547989, + "learning_rate": 3.228217545065052e-05, + "loss": 0.4002, "step": 58810 }, { - "epoch": 2.07, - "learning_rate": 3.3054878091192245e-05, - "loss": 0.2712, + "epoch": 2.119688614985404, + "grad_norm": 0.23381415009498596, + "learning_rate": 3.227938381020353e-05, + "loss": 0.392, "step": 58815 }, { - "epoch": 2.07, - "learning_rate": 3.305218123055052e-05, - "loss": 0.2791, + "epoch": 2.1198688146466287, + "grad_norm": 0.19192180037498474, + "learning_rate": 3.227659207057633e-05, + "loss": 0.3862, "step": 58820 }, { - "epoch": 2.07, - "learning_rate": 3.3049484265356536e-05, - "loss": 0.2696, + "epoch": 2.120049014307853, + "grad_norm": 0.17391976714134216, + "learning_rate": 3.2273800231806974e-05, + "loss": 0.3916, "step": 58825 }, { - "epoch": 2.07, - "learning_rate": 3.3046787195645306e-05, - "loss": 0.2566, + "epoch": 2.1202292139690777, + "grad_norm": 0.236240416765213, + "learning_rate": 3.2271008293933496e-05, + "loss": 0.3979, "step": 58830 }, { - "epoch": 2.07, - "learning_rate": 3.3044090021451845e-05, - "loss": 0.2605, + "epoch": 2.1204094136303024, + "grad_norm": 0.15316404402256012, + "learning_rate": 3.2268216256993944e-05, + "loss": 0.3986, "step": 58835 }, { - "epoch": 2.07, - "learning_rate": 3.3041392742811184e-05, - "loss": 0.2699, + "epoch": 2.120589613291527, + "grad_norm": 0.20581558346748352, + "learning_rate": 3.2265424121026355e-05, + "loss": 0.3874, "step": 58840 }, { - "epoch": 2.07, - "learning_rate": 3.3038695359758324e-05, - "loss": 0.2714, + "epoch": 2.1207698129527515, + "grad_norm": 0.1844107061624527, + "learning_rate": 3.226263188606876e-05, + "loss": 0.4366, "step": 58845 }, { - "epoch": 2.07, - "learning_rate": 3.3035997872328304e-05, - "loss": 0.2648, + "epoch": 2.120950012613976, + "grad_norm": 0.17652803659439087, + "learning_rate": 3.225983955215922e-05, + "loss": 0.3898, "step": 58850 }, { - "epoch": 2.07, - "learning_rate": 3.303330028055616e-05, - "loss": 0.274, + "epoch": 2.121130212275201, + "grad_norm": 0.20647066831588745, + "learning_rate": 3.225704711933577e-05, + "loss": 0.3942, "step": 58855 }, { - "epoch": 2.07, - "learning_rate": 3.30306025844769e-05, - "loss": 0.2708, + "epoch": 2.1213104119364257, + "grad_norm": 0.24524955451488495, + "learning_rate": 3.2254254587636454e-05, + "loss": 0.4065, "step": 58860 }, { - "epoch": 2.07, - "learning_rate": 3.302790478412555e-05, - "loss": 0.2962, + "epoch": 2.1214906115976504, + "grad_norm": 0.23697435855865479, + "learning_rate": 3.225146195709933e-05, + "loss": 0.4129, "step": 58865 }, { - "epoch": 2.07, - "learning_rate": 3.3025206879537166e-05, - "loss": 0.2773, + "epoch": 2.1216708112588747, + "grad_norm": 0.2048749029636383, + "learning_rate": 3.2248669227762427e-05, + "loss": 0.39, 
"step": 58870 }, { - "epoch": 2.07, - "learning_rate": 3.302250887074674e-05, - "loss": 0.2597, + "epoch": 2.1218510109200994, + "grad_norm": 0.18117062747478485, + "learning_rate": 3.2245876399663826e-05, + "loss": 0.3636, "step": 58875 }, { - "epoch": 2.07, - "learning_rate": 3.3019810757789325e-05, - "loss": 0.2545, + "epoch": 2.122031210581324, + "grad_norm": 0.20858252048492432, + "learning_rate": 3.224308347284155e-05, + "loss": 0.3803, "step": 58880 }, { - "epoch": 2.07, - "learning_rate": 3.301711254069996e-05, - "loss": 0.2814, + "epoch": 2.122211410242549, + "grad_norm": 0.23120179772377014, + "learning_rate": 3.224029044733367e-05, + "loss": 0.4179, "step": 58885 }, { - "epoch": 2.07, - "learning_rate": 3.3014414219513676e-05, - "loss": 0.2782, + "epoch": 2.122391609903773, + "grad_norm": 0.20885401964187622, + "learning_rate": 3.2237497323178234e-05, + "loss": 0.3951, "step": 58890 }, { - "epoch": 2.07, - "learning_rate": 3.3011715794265494e-05, - "loss": 0.2859, + "epoch": 2.122571809564998, + "grad_norm": 0.22970037162303925, + "learning_rate": 3.223470410041329e-05, + "loss": 0.4036, "step": 58895 }, { - "epoch": 2.07, - "learning_rate": 3.300901726499046e-05, - "loss": 0.2637, + "epoch": 2.1227520092262226, + "grad_norm": 0.2186799943447113, + "learning_rate": 3.2231910779076916e-05, + "loss": 0.4109, "step": 58900 }, { - "epoch": 2.07, - "learning_rate": 3.300631863172363e-05, - "loss": 0.2925, + "epoch": 2.1229322088874474, + "grad_norm": 0.19462229311466217, + "learning_rate": 3.222911735920715e-05, + "loss": 0.4233, "step": 58905 }, { - "epoch": 2.07, - "learning_rate": 3.300361989450002e-05, - "loss": 0.2558, + "epoch": 2.123112408548672, + "grad_norm": 0.18175840377807617, + "learning_rate": 3.222632384084207e-05, + "loss": 0.3827, "step": 58910 }, { - "epoch": 2.07, - "learning_rate": 3.3000921053354675e-05, - "loss": 0.2598, + "epoch": 2.1232926082098964, + "grad_norm": 0.2110246866941452, + "learning_rate": 3.222353022401971e-05, + "loss": 0.3749, "step": 58915 }, { - "epoch": 2.07, - "learning_rate": 3.299822210832265e-05, - "loss": 0.2496, + "epoch": 2.123472807871121, + "grad_norm": 0.18468889594078064, + "learning_rate": 3.2220736508778166e-05, + "loss": 0.396, "step": 58920 }, { - "epoch": 2.07, - "learning_rate": 3.299552305943899e-05, - "loss": 0.2827, + "epoch": 2.123653007532346, + "grad_norm": 0.18783201277256012, + "learning_rate": 3.221794269515547e-05, + "loss": 0.3639, "step": 58925 }, { - "epoch": 2.07, - "learning_rate": 3.299282390673872e-05, - "loss": 0.2558, + "epoch": 2.1238332071935706, + "grad_norm": 0.21580557525157928, + "learning_rate": 3.2215148783189716e-05, + "loss": 0.4192, "step": 58930 }, { - "epoch": 2.07, - "learning_rate": 3.29901246502569e-05, - "loss": 0.2689, + "epoch": 2.124013406854795, + "grad_norm": 0.18120792508125305, + "learning_rate": 3.221235477291895e-05, + "loss": 0.3901, "step": 58935 }, { - "epoch": 2.07, - "learning_rate": 3.298742529002857e-05, - "loss": 0.282, + "epoch": 2.1241936065160196, + "grad_norm": 0.18788616359233856, + "learning_rate": 3.2209560664381244e-05, + "loss": 0.3894, "step": 58940 }, { - "epoch": 2.07, - "learning_rate": 3.298472582608881e-05, - "loss": 0.2818, + "epoch": 2.1243738061772444, + "grad_norm": 0.18515391647815704, + "learning_rate": 3.220676645761467e-05, + "loss": 0.4085, "step": 58945 }, { - "epoch": 2.07, - "learning_rate": 3.298202625847263e-05, - "loss": 0.2714, + "epoch": 2.124554005838469, + "grad_norm": 0.17624923586845398, + "learning_rate": 3.2203972152657294e-05, + 
"loss": 0.4079, "step": 58950 }, { - "epoch": 2.07, - "learning_rate": 3.297932658721511e-05, - "loss": 0.2718, + "epoch": 2.124734205499694, + "grad_norm": 0.2669857144355774, + "learning_rate": 3.22011777495472e-05, + "loss": 0.3783, "step": 58955 }, { - "epoch": 2.07, - "learning_rate": 3.2976626812351275e-05, - "loss": 0.2591, + "epoch": 2.124914405160918, + "grad_norm": 0.21977299451828003, + "learning_rate": 3.219838324832246e-05, + "loss": 0.3794, "step": 58960 }, { - "epoch": 2.07, - "learning_rate": 3.297392693391621e-05, - "loss": 0.277, + "epoch": 2.125094604822143, + "grad_norm": 0.18140974640846252, + "learning_rate": 3.219558864902113e-05, + "loss": 0.3855, "step": 58965 }, { - "epoch": 2.07, - "learning_rate": 3.2971226951944955e-05, - "loss": 0.26, + "epoch": 2.1252748044833676, + "grad_norm": 0.2620275020599365, + "learning_rate": 3.21927939516813e-05, + "loss": 0.3959, "step": 58970 }, { - "epoch": 2.07, - "learning_rate": 3.2968526866472576e-05, - "loss": 0.269, + "epoch": 2.1254550041445923, + "grad_norm": 0.2260800153017044, + "learning_rate": 3.218999915634105e-05, + "loss": 0.3754, "step": 58975 }, { - "epoch": 2.08, - "learning_rate": 3.2965826677534126e-05, - "loss": 0.2814, + "epoch": 2.125635203805817, + "grad_norm": 0.22423382103443146, + "learning_rate": 3.218720426303845e-05, + "loss": 0.3836, "step": 58980 }, { - "epoch": 2.08, - "learning_rate": 3.296312638516466e-05, - "loss": 0.3001, + "epoch": 2.1258154034670413, + "grad_norm": 0.21626518666744232, + "learning_rate": 3.2184409271811586e-05, + "loss": 0.4172, "step": 58985 }, { - "epoch": 2.08, - "learning_rate": 3.296042598939925e-05, - "loss": 0.2824, + "epoch": 2.125995603128266, + "grad_norm": 0.2051631510257721, + "learning_rate": 3.218161418269853e-05, + "loss": 0.384, "step": 58990 }, { - "epoch": 2.08, - "learning_rate": 3.295772549027296e-05, - "loss": 0.256, + "epoch": 2.126175802789491, + "grad_norm": 0.16707560420036316, + "learning_rate": 3.217881899573738e-05, + "loss": 0.4404, "step": 58995 }, { - "epoch": 2.08, - "learning_rate": 3.295502488782084e-05, - "loss": 0.2646, + "epoch": 2.1263560024507155, + "grad_norm": 0.20359165966510773, + "learning_rate": 3.21760237109662e-05, + "loss": 0.383, "step": 59000 }, { - "epoch": 2.08, - "eval_loss": 0.27035436034202576, - "eval_runtime": 10.5421, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 2.1263560024507155, + "eval_loss": 0.43649742007255554, + "eval_runtime": 3.5299, + "eval_samples_per_second": 28.33, + "eval_steps_per_second": 7.082, "step": 59000 }, { - "epoch": 2.08, - "learning_rate": 3.295232418207798e-05, - "loss": 0.275, + "epoch": 2.12653620211194, + "grad_norm": 0.24366778135299683, + "learning_rate": 3.2173228328423095e-05, + "loss": 0.3883, "step": 59005 }, { - "epoch": 2.08, - "learning_rate": 3.2949623373079406e-05, - "loss": 0.2762, + "epoch": 2.1267164017731646, + "grad_norm": 0.18067045509815216, + "learning_rate": 3.217043284814614e-05, + "loss": 0.4133, "step": 59010 }, { - "epoch": 2.08, - "learning_rate": 3.294692246086023e-05, - "loss": 0.2561, + "epoch": 2.1268966014343893, + "grad_norm": 0.18387949466705322, + "learning_rate": 3.2167637270173425e-05, + "loss": 0.4048, "step": 59015 }, { - "epoch": 2.08, - "learning_rate": 3.2944221445455494e-05, - "loss": 0.2895, + "epoch": 2.127076801095614, + "grad_norm": 0.19910840690135956, + "learning_rate": 3.2164841594543044e-05, + "loss": 0.4296, "step": 59020 }, { - "epoch": 2.08, - "learning_rate": 3.294152032690029e-05, - "loss": 0.2872, + 
"epoch": 2.1272570007568388, + "grad_norm": 0.24461115896701813, + "learning_rate": 3.216204582129308e-05, + "loss": 0.3926, "step": 59025 }, { - "epoch": 2.08, - "learning_rate": 3.293881910522967e-05, - "loss": 0.2779, + "epoch": 2.127437200418063, + "grad_norm": 0.2087002843618393, + "learning_rate": 3.215924995046163e-05, + "loss": 0.4611, "step": 59030 }, { - "epoch": 2.08, - "learning_rate": 3.293611778047871e-05, - "loss": 0.2808, + "epoch": 2.127617400079288, + "grad_norm": 0.20317687094211578, + "learning_rate": 3.215645398208678e-05, + "loss": 0.4292, "step": 59035 }, { - "epoch": 2.08, - "learning_rate": 3.293341635268249e-05, - "loss": 0.2849, + "epoch": 2.1277975997405125, + "grad_norm": 0.17067931592464447, + "learning_rate": 3.215365791620664e-05, + "loss": 0.4072, "step": 59040 }, { - "epoch": 2.08, - "learning_rate": 3.2930714821876094e-05, - "loss": 0.242, + "epoch": 2.1279777994017373, + "grad_norm": 0.23524482548236847, + "learning_rate": 3.2150861752859286e-05, + "loss": 0.405, "step": 59045 }, { - "epoch": 2.08, - "learning_rate": 3.29280131880946e-05, - "loss": 0.2622, + "epoch": 2.128157999062962, + "grad_norm": 0.24678029119968414, + "learning_rate": 3.214806549208283e-05, + "loss": 0.4318, "step": 59050 }, { - "epoch": 2.08, - "learning_rate": 3.292531145137307e-05, - "loss": 0.2645, + "epoch": 2.1283381987241863, + "grad_norm": 0.22866828739643097, + "learning_rate": 3.214526913391536e-05, + "loss": 0.3886, "step": 59055 }, { - "epoch": 2.08, - "learning_rate": 3.2922609611746595e-05, - "loss": 0.2838, + "epoch": 2.128518398385411, + "grad_norm": 0.2560563385486603, + "learning_rate": 3.2142472678395e-05, + "loss": 0.422, "step": 59060 }, { - "epoch": 2.08, - "learning_rate": 3.291990766925026e-05, - "loss": 0.2518, + "epoch": 2.1286985980466357, + "grad_norm": 0.23088377714157104, + "learning_rate": 3.213967612555981e-05, + "loss": 0.4044, "step": 59065 }, { - "epoch": 2.08, - "learning_rate": 3.2917205623919134e-05, - "loss": 0.2829, + "epoch": 2.1288787977078605, + "grad_norm": 0.21493729948997498, + "learning_rate": 3.2136879475447924e-05, + "loss": 0.3938, "step": 59070 }, { - "epoch": 2.08, - "learning_rate": 3.291450347578832e-05, - "loss": 0.2837, + "epoch": 2.1290589973690848, + "grad_norm": 0.1736428588628769, + "learning_rate": 3.213408272809744e-05, + "loss": 0.4205, "step": 59075 }, { - "epoch": 2.08, - "learning_rate": 3.2911801224892894e-05, - "loss": 0.2655, + "epoch": 2.1292391970303095, + "grad_norm": 0.18080323934555054, + "learning_rate": 3.213128588354645e-05, + "loss": 0.363, "step": 59080 }, { - "epoch": 2.08, - "learning_rate": 3.2909098871267936e-05, - "loss": 0.2522, + "epoch": 2.1294193966915342, + "grad_norm": 0.19538411498069763, + "learning_rate": 3.212848894183308e-05, + "loss": 0.3855, "step": 59085 }, { - "epoch": 2.08, - "learning_rate": 3.290639641494855e-05, - "loss": 0.2826, + "epoch": 2.129599596352759, + "grad_norm": 0.2233283966779709, + "learning_rate": 3.212569190299542e-05, + "loss": 0.4047, "step": 59090 }, { - "epoch": 2.08, - "learning_rate": 3.290369385596981e-05, - "loss": 0.2664, + "epoch": 2.1297797960139837, + "grad_norm": 0.19378229975700378, + "learning_rate": 3.21228947670716e-05, + "loss": 0.3827, "step": 59095 }, { - "epoch": 2.08, - "learning_rate": 3.290099119436681e-05, - "loss": 0.2599, + "epoch": 2.129959995675208, + "grad_norm": 0.18152061104774475, + "learning_rate": 3.212009753409971e-05, + "loss": 0.4019, "step": 59100 }, { - "epoch": 2.08, - "learning_rate": 3.289828843017465e-05, - "loss": 0.2667, 
+ "epoch": 2.1301401953364327, + "grad_norm": 0.2508603632450104, + "learning_rate": 3.2117300204117876e-05, + "loss": 0.399, "step": 59105 }, { - "epoch": 2.08, - "learning_rate": 3.289558556342843e-05, - "loss": 0.2931, + "epoch": 2.1303203949976575, + "grad_norm": 0.16633902490139008, + "learning_rate": 3.211450277716419e-05, + "loss": 0.3941, "step": 59110 }, { - "epoch": 2.08, - "learning_rate": 3.289288259416322e-05, - "loss": 0.2535, + "epoch": 2.130500594658882, + "grad_norm": 0.1936190277338028, + "learning_rate": 3.211170525327679e-05, + "loss": 0.4041, "step": 59115 }, { - "epoch": 2.08, - "learning_rate": 3.289017952241414e-05, - "loss": 0.265, + "epoch": 2.1306807943201065, + "grad_norm": 0.2269958257675171, + "learning_rate": 3.210890763249379e-05, + "loss": 0.4195, "step": 59120 }, { - "epoch": 2.08, - "learning_rate": 3.2887476348216275e-05, - "loss": 0.2608, + "epoch": 2.130860993981331, + "grad_norm": 0.18237541615962982, + "learning_rate": 3.210610991485329e-05, + "loss": 0.3866, "step": 59125 }, { - "epoch": 2.08, - "learning_rate": 3.288477307160474e-05, - "loss": 0.269, + "epoch": 2.131041193642556, + "grad_norm": 0.19430314004421234, + "learning_rate": 3.210331210039342e-05, + "loss": 0.4121, "step": 59130 }, { - "epoch": 2.08, - "learning_rate": 3.28820696926146e-05, - "loss": 0.2513, + "epoch": 2.1312213933037807, + "grad_norm": 0.18985673785209656, + "learning_rate": 3.2100514189152297e-05, + "loss": 0.4231, "step": 59135 }, { - "epoch": 2.08, - "learning_rate": 3.2879366211281e-05, - "loss": 0.2632, + "epoch": 2.1314015929650054, + "grad_norm": 0.19704566895961761, + "learning_rate": 3.209771618116805e-05, + "loss": 0.3891, "step": 59140 }, { - "epoch": 2.08, - "learning_rate": 3.287666262763901e-05, - "loss": 0.2539, + "epoch": 2.1315817926262297, + "grad_norm": 0.20834749937057495, + "learning_rate": 3.209491807647879e-05, + "loss": 0.3716, "step": 59145 }, { - "epoch": 2.08, - "learning_rate": 3.2873958941723745e-05, - "loss": 0.26, + "epoch": 2.1317619922874544, + "grad_norm": 0.15605294704437256, + "learning_rate": 3.2092119875122636e-05, + "loss": 0.3832, "step": 59150 }, { - "epoch": 2.08, - "learning_rate": 3.287125515357032e-05, - "loss": 0.2569, + "epoch": 2.131942191948679, + "grad_norm": 0.19391027092933655, + "learning_rate": 3.208932157713773e-05, + "loss": 0.4034, "step": 59155 }, { - "epoch": 2.08, - "learning_rate": 3.286855126321383e-05, - "loss": 0.2446, + "epoch": 2.132122391609904, + "grad_norm": 0.20829804241657257, + "learning_rate": 3.208652318256219e-05, + "loss": 0.3805, "step": 59160 }, { - "epoch": 2.08, - "learning_rate": 3.28658472706894e-05, - "loss": 0.2793, + "epoch": 2.132302591271128, + "grad_norm": 0.21649335324764252, + "learning_rate": 3.2083724691434145e-05, + "loss": 0.4418, "step": 59165 }, { - "epoch": 2.08, - "learning_rate": 3.286314317603211e-05, - "loss": 0.276, + "epoch": 2.132482790932353, + "grad_norm": 0.18842458724975586, + "learning_rate": 3.208092610379172e-05, + "loss": 0.396, "step": 59170 }, { - "epoch": 2.08, - "learning_rate": 3.2860438979277096e-05, - "loss": 0.294, + "epoch": 2.1326629905935777, + "grad_norm": 0.21697010099887848, + "learning_rate": 3.2078127419673046e-05, + "loss": 0.3576, "step": 59175 }, { - "epoch": 2.08, - "learning_rate": 3.2857734680459455e-05, - "loss": 0.2766, + "epoch": 2.1328431902548024, + "grad_norm": 0.20509681105613708, + "learning_rate": 3.207532863911627e-05, + "loss": 0.4059, "step": 59180 }, { - "epoch": 2.08, - "learning_rate": 3.2855030279614316e-05, - "loss": 
0.2468, + "epoch": 2.133023389916027, + "grad_norm": 0.19861997663974762, + "learning_rate": 3.2072529762159495e-05, + "loss": 0.3848, "step": 59185 }, { - "epoch": 2.08, - "learning_rate": 3.285232577677679e-05, - "loss": 0.2683, + "epoch": 2.1332035895772514, + "grad_norm": 0.17639335989952087, + "learning_rate": 3.206973078884087e-05, + "loss": 0.4051, "step": 59190 }, { - "epoch": 2.08, - "learning_rate": 3.284962117198198e-05, - "loss": 0.266, + "epoch": 2.133383789238476, + "grad_norm": 0.2027795910835266, + "learning_rate": 3.206693171919854e-05, + "loss": 0.4244, "step": 59195 }, { - "epoch": 2.08, - "learning_rate": 3.2846916465265016e-05, - "loss": 0.2587, + "epoch": 2.133563988899701, + "grad_norm": 0.18750731647014618, + "learning_rate": 3.206413255327063e-05, + "loss": 0.4224, "step": 59200 }, { - "epoch": 2.08, - "learning_rate": 3.284421165666101e-05, - "loss": 0.2774, + "epoch": 2.1337441885609256, + "grad_norm": 0.21276669204235077, + "learning_rate": 3.206133329109529e-05, + "loss": 0.4186, "step": 59205 }, { - "epoch": 2.08, - "learning_rate": 3.284150674620508e-05, - "loss": 0.2789, + "epoch": 2.13392438822215, + "grad_norm": 0.17720721662044525, + "learning_rate": 3.205853393271064e-05, + "loss": 0.3877, "step": 59210 }, { - "epoch": 2.08, - "learning_rate": 3.283880173393236e-05, - "loss": 0.2867, + "epoch": 2.1341045878833746, + "grad_norm": 0.2149345576763153, + "learning_rate": 3.2055734478154834e-05, + "loss": 0.3937, "step": 59215 }, { - "epoch": 2.08, - "learning_rate": 3.283609661987797e-05, - "loss": 0.2893, + "epoch": 2.1342847875445994, + "grad_norm": 0.18279078602790833, + "learning_rate": 3.2052934927466016e-05, + "loss": 0.3948, "step": 59220 }, { - "epoch": 2.08, - "learning_rate": 3.2833391404077025e-05, - "loss": 0.3004, + "epoch": 2.134464987205824, + "grad_norm": 0.24067635834217072, + "learning_rate": 3.2050135280682327e-05, + "loss": 0.4092, "step": 59225 }, { - "epoch": 2.08, - "learning_rate": 3.283068608656465e-05, - "loss": 0.2544, + "epoch": 2.134645186867049, + "grad_norm": 0.22247518599033356, + "learning_rate": 3.20473355378419e-05, + "loss": 0.4003, "step": 59230 }, { - "epoch": 2.08, - "learning_rate": 3.282798066737598e-05, - "loss": 0.2583, + "epoch": 2.134825386528273, + "grad_norm": 0.16726024448871613, + "learning_rate": 3.2044535698982895e-05, + "loss": 0.3806, "step": 59235 }, { - "epoch": 2.08, - "learning_rate": 3.2825275146546136e-05, - "loss": 0.2954, + "epoch": 2.135005586189498, + "grad_norm": 0.18711987137794495, + "learning_rate": 3.204173576414345e-05, + "loss": 0.3994, "step": 59240 }, { - "epoch": 2.08, - "learning_rate": 3.282256952411026e-05, - "loss": 0.2731, + "epoch": 2.1351857858507226, + "grad_norm": 0.1729484349489212, + "learning_rate": 3.2038935733361734e-05, + "loss": 0.3954, "step": 59245 }, { - "epoch": 2.08, - "learning_rate": 3.2819863800103466e-05, - "loss": 0.2743, + "epoch": 2.1353659855119473, + "grad_norm": 0.16871348023414612, + "learning_rate": 3.203613560667587e-05, + "loss": 0.3775, "step": 59250 }, { - "epoch": 2.08, - "learning_rate": 3.2817157974560904e-05, - "loss": 0.2589, + "epoch": 2.135546185173172, + "grad_norm": 0.21796941757202148, + "learning_rate": 3.203333538412402e-05, + "loss": 0.4258, "step": 59255 }, { - "epoch": 2.08, - "learning_rate": 3.2814452047517685e-05, - "loss": 0.2905, + "epoch": 2.1357263848343964, + "grad_norm": 0.15358436107635498, + "learning_rate": 3.203053506574434e-05, + "loss": 0.3841, "step": 59260 }, { - "epoch": 2.09, - "learning_rate": 
3.2811746019008966e-05, - "loss": 0.2651, + "epoch": 2.135906584495621, + "grad_norm": 0.2047918438911438, + "learning_rate": 3.202773465157498e-05, + "loss": 0.4093, "step": 59265 }, { - "epoch": 2.09, - "learning_rate": 3.280903988906986e-05, - "loss": 0.2655, + "epoch": 2.136086784156846, + "grad_norm": 0.22522784769535065, + "learning_rate": 3.20249341416541e-05, + "loss": 0.3782, "step": 59270 }, { - "epoch": 2.09, - "learning_rate": 3.280633365773553e-05, - "loss": 0.2618, + "epoch": 2.1362669838180706, + "grad_norm": 0.20467866957187653, + "learning_rate": 3.202213353601985e-05, + "loss": 0.3872, "step": 59275 }, { - "epoch": 2.09, - "learning_rate": 3.28036273250411e-05, - "loss": 0.2809, + "epoch": 2.136447183479295, + "grad_norm": 0.24087628722190857, + "learning_rate": 3.201933283471039e-05, + "loss": 0.4176, "step": 59280 }, { - "epoch": 2.09, - "learning_rate": 3.2800920891021714e-05, - "loss": 0.2727, + "epoch": 2.1366273831405196, + "grad_norm": 0.20225749909877777, + "learning_rate": 3.201653203776388e-05, + "loss": 0.3515, "step": 59285 }, { - "epoch": 2.09, - "learning_rate": 3.27982143557125e-05, - "loss": 0.2796, + "epoch": 2.1368075828017443, + "grad_norm": 0.24749909341335297, + "learning_rate": 3.201373114521847e-05, + "loss": 0.3979, "step": 59290 }, { - "epoch": 2.09, - "learning_rate": 3.2795507719148625e-05, - "loss": 0.2735, + "epoch": 2.136987782462969, + "grad_norm": 0.14851811528205872, + "learning_rate": 3.201093015711234e-05, + "loss": 0.3873, "step": 59295 }, { - "epoch": 2.09, - "learning_rate": 3.279280098136521e-05, - "loss": 0.246, + "epoch": 2.137167982124194, + "grad_norm": 0.2001311480998993, + "learning_rate": 3.200812907348364e-05, + "loss": 0.3825, "step": 59300 }, { - "epoch": 2.09, - "learning_rate": 3.279009414239742e-05, - "loss": 0.2973, + "epoch": 2.137348181785418, + "grad_norm": 0.1819891780614853, + "learning_rate": 3.200532789437055e-05, + "loss": 0.4069, "step": 59305 }, { - "epoch": 2.09, - "learning_rate": 3.278738720228039e-05, - "loss": 0.2841, + "epoch": 2.137528381446643, + "grad_norm": 0.20626020431518555, + "learning_rate": 3.200252661981121e-05, + "loss": 0.4005, "step": 59310 }, { - "epoch": 2.09, - "learning_rate": 3.278468016104926e-05, - "loss": 0.2841, + "epoch": 2.1377085811078675, + "grad_norm": 0.15189993381500244, + "learning_rate": 3.1999725249843806e-05, + "loss": 0.3802, "step": 59315 }, { - "epoch": 2.09, - "learning_rate": 3.278197301873919e-05, - "loss": 0.2635, + "epoch": 2.1378887807690923, + "grad_norm": 0.2139143943786621, + "learning_rate": 3.1996923784506494e-05, + "loss": 0.3658, "step": 59320 }, { - "epoch": 2.09, - "learning_rate": 3.2779265775385335e-05, - "loss": 0.2695, + "epoch": 2.138068980430317, + "grad_norm": 0.17770539224147797, + "learning_rate": 3.199412222383746e-05, + "loss": 0.3744, "step": 59325 }, { - "epoch": 2.09, - "learning_rate": 3.277655843102284e-05, - "loss": 0.2893, + "epoch": 2.1382491800915413, + "grad_norm": 0.18923962116241455, + "learning_rate": 3.1991320567874863e-05, + "loss": 0.399, "step": 59330 }, { - "epoch": 2.09, - "learning_rate": 3.277385098568685e-05, - "loss": 0.2398, + "epoch": 2.138429379752766, + "grad_norm": 0.2178812175989151, + "learning_rate": 3.198851881665687e-05, + "loss": 0.3778, "step": 59335 }, { - "epoch": 2.09, - "learning_rate": 3.277114343941254e-05, - "loss": 0.2809, + "epoch": 2.1386095794139908, + "grad_norm": 0.16876627504825592, + "learning_rate": 3.198571697022167e-05, + "loss": 0.4179, "step": 59340 }, { - "epoch": 2.09, - 
"learning_rate": 3.2768435792235044e-05, - "loss": 0.2598, + "epoch": 2.1387897790752155, + "grad_norm": 0.17547792196273804, + "learning_rate": 3.198291502860742e-05, + "loss": 0.3674, "step": 59345 }, { - "epoch": 2.09, - "learning_rate": 3.276572804418954e-05, - "loss": 0.2785, + "epoch": 2.13896997873644, + "grad_norm": 0.20084445178508759, + "learning_rate": 3.198011299185232e-05, + "loss": 0.3989, "step": 59350 }, { - "epoch": 2.09, - "learning_rate": 3.276302019531116e-05, - "loss": 0.2738, + "epoch": 2.1391501783976645, + "grad_norm": 0.1786831021308899, + "learning_rate": 3.197731085999451e-05, + "loss": 0.3793, "step": 59355 }, { - "epoch": 2.09, - "learning_rate": 3.2760312245635094e-05, - "loss": 0.293, + "epoch": 2.1393303780588893, + "grad_norm": 0.1777740865945816, + "learning_rate": 3.197450863307221e-05, + "loss": 0.3599, "step": 59360 }, { - "epoch": 2.09, - "learning_rate": 3.2757604195196477e-05, - "loss": 0.2736, + "epoch": 2.139510577720114, + "grad_norm": 0.19308814406394958, + "learning_rate": 3.1971706311123564e-05, + "loss": 0.422, "step": 59365 }, { - "epoch": 2.09, - "learning_rate": 3.275489604403048e-05, - "loss": 0.2656, + "epoch": 2.1396907773813387, + "grad_norm": 0.23540256917476654, + "learning_rate": 3.196890389418678e-05, + "loss": 0.4141, "step": 59370 }, { - "epoch": 2.09, - "learning_rate": 3.2752187792172276e-05, - "loss": 0.2904, + "epoch": 2.139870977042563, + "grad_norm": 0.2817511260509491, + "learning_rate": 3.196610138230003e-05, + "loss": 0.3921, "step": 59375 }, { - "epoch": 2.09, - "learning_rate": 3.274947943965703e-05, - "loss": 0.2758, + "epoch": 2.1400511767037877, + "grad_norm": 0.2218995839357376, + "learning_rate": 3.196329877550149e-05, + "loss": 0.3879, "step": 59380 }, { - "epoch": 2.09, - "learning_rate": 3.274677098651989e-05, - "loss": 0.2808, + "epoch": 2.1402313763650125, + "grad_norm": 0.1908760666847229, + "learning_rate": 3.196049607382936e-05, + "loss": 0.3864, "step": 59385 }, { - "epoch": 2.09, - "learning_rate": 3.274406243279603e-05, - "loss": 0.3196, + "epoch": 2.140411576026237, + "grad_norm": 0.22384685277938843, + "learning_rate": 3.195769327732181e-05, + "loss": 0.3835, "step": 59390 }, { - "epoch": 2.09, - "learning_rate": 3.2741353778520636e-05, - "loss": 0.2627, + "epoch": 2.1405917756874615, + "grad_norm": 0.20056529343128204, + "learning_rate": 3.195489038601704e-05, + "loss": 0.3914, "step": 59395 }, { - "epoch": 2.09, - "learning_rate": 3.273864502372885e-05, - "loss": 0.2666, + "epoch": 2.1407719753486862, + "grad_norm": 0.1963140219449997, + "learning_rate": 3.1952087399953236e-05, + "loss": 0.4141, "step": 59400 }, { - "epoch": 2.09, - "learning_rate": 3.2735936168455864e-05, - "loss": 0.2909, + "epoch": 2.140952175009911, + "grad_norm": 0.20005357265472412, + "learning_rate": 3.194928431916858e-05, + "loss": 0.4379, "step": 59405 }, { - "epoch": 2.09, - "learning_rate": 3.273322721273685e-05, - "loss": 0.2701, + "epoch": 2.1411323746711357, + "grad_norm": 0.1524960994720459, + "learning_rate": 3.194648114370129e-05, + "loss": 0.3916, "step": 59410 }, { - "epoch": 2.09, - "learning_rate": 3.273051815660698e-05, - "loss": 0.266, + "epoch": 2.1413125743323604, + "grad_norm": 0.20259173214435577, + "learning_rate": 3.1943677873589525e-05, + "loss": 0.4089, "step": 59415 }, { - "epoch": 2.09, - "learning_rate": 3.2727809000101413e-05, - "loss": 0.2632, + "epoch": 2.1414927739935847, + "grad_norm": 0.23878759145736694, + "learning_rate": 3.1940874508871496e-05, + "loss": 0.4275, "step": 59420 }, { - "epoch": 
2.09, - "learning_rate": 3.2725099743255345e-05, - "loss": 0.2612, + "epoch": 2.1416729736548095, + "grad_norm": 0.15623390674591064, + "learning_rate": 3.193807104958539e-05, + "loss": 0.4258, "step": 59425 }, { - "epoch": 2.09, - "learning_rate": 3.2722390386103946e-05, - "loss": 0.2551, + "epoch": 2.141853173316034, + "grad_norm": 0.1934848129749298, + "learning_rate": 3.1935267495769416e-05, + "loss": 0.3741, "step": 59430 }, { - "epoch": 2.09, - "learning_rate": 3.271968092868241e-05, - "loss": 0.2569, + "epoch": 2.142033372977259, + "grad_norm": 0.2278786599636078, + "learning_rate": 3.193246384746176e-05, + "loss": 0.417, "step": 59435 }, { - "epoch": 2.09, - "learning_rate": 3.271697137102588e-05, - "loss": 0.2754, + "epoch": 2.142213572638483, + "grad_norm": 0.22830450534820557, + "learning_rate": 3.192966010470063e-05, + "loss": 0.3854, "step": 59440 }, { - "epoch": 2.09, - "learning_rate": 3.271426171316959e-05, - "loss": 0.2608, + "epoch": 2.142393772299708, + "grad_norm": 0.236038476228714, + "learning_rate": 3.192685626752422e-05, + "loss": 0.4147, "step": 59445 }, { - "epoch": 2.09, - "learning_rate": 3.271155195514868e-05, - "loss": 0.2745, + "epoch": 2.1425739719609327, + "grad_norm": 0.2309621423482895, + "learning_rate": 3.1924052335970736e-05, + "loss": 0.4035, "step": 59450 }, { - "epoch": 2.09, - "learning_rate": 3.2708842096998346e-05, - "loss": 0.2517, + "epoch": 2.1427541716221574, + "grad_norm": 0.2083134800195694, + "learning_rate": 3.1921248310078386e-05, + "loss": 0.4002, "step": 59455 }, { - "epoch": 2.09, - "learning_rate": 3.2706132138753784e-05, - "loss": 0.2879, + "epoch": 2.142934371283382, + "grad_norm": 0.22884568572044373, + "learning_rate": 3.1918444189885355e-05, + "loss": 0.4022, "step": 59460 }, { - "epoch": 2.09, - "learning_rate": 3.270342208045018e-05, - "loss": 0.2594, + "epoch": 2.1431145709446064, + "grad_norm": 0.18977230787277222, + "learning_rate": 3.191563997542987e-05, + "loss": 0.375, "step": 59465 }, { - "epoch": 2.09, - "learning_rate": 3.270071192212271e-05, - "loss": 0.2515, + "epoch": 2.143294770605831, + "grad_norm": 0.21738559007644653, + "learning_rate": 3.191283566675013e-05, + "loss": 0.3757, "step": 59470 }, { - "epoch": 2.09, - "learning_rate": 3.2698001663806575e-05, - "loss": 0.2823, + "epoch": 2.143474970267056, + "grad_norm": 0.19232676923274994, + "learning_rate": 3.1910031263884335e-05, + "loss": 0.4058, "step": 59475 }, { - "epoch": 2.09, - "learning_rate": 3.269529130553695e-05, - "loss": 0.2801, + "epoch": 2.1436551699282806, + "grad_norm": 0.21828657388687134, + "learning_rate": 3.1907226766870714e-05, + "loss": 0.4174, "step": 59480 }, { - "epoch": 2.09, - "learning_rate": 3.269258084734905e-05, - "loss": 0.2701, + "epoch": 2.1438353695895054, + "grad_norm": 0.19182120263576508, + "learning_rate": 3.190442217574745e-05, + "loss": 0.3884, "step": 59485 }, { - "epoch": 2.09, - "learning_rate": 3.268987028927806e-05, - "loss": 0.2805, + "epoch": 2.1440155692507297, + "grad_norm": 0.16395203769207, + "learning_rate": 3.190161749055279e-05, + "loss": 0.3748, "step": 59490 }, { - "epoch": 2.09, - "learning_rate": 3.2687159631359164e-05, - "loss": 0.2796, + "epoch": 2.1441957689119544, + "grad_norm": 0.18212929368019104, + "learning_rate": 3.189881271132491e-05, + "loss": 0.3492, "step": 59495 }, { - "epoch": 2.09, - "learning_rate": 3.268444887362757e-05, - "loss": 0.2773, + "epoch": 2.144375968573179, + "grad_norm": 0.19863685965538025, + "learning_rate": 3.189600783810205e-05, + "loss": 0.3785, "step": 59500 }, { - 
"epoch": 2.09, - "eval_loss": 0.26948508620262146, - "eval_runtime": 10.5443, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 2.144375968573179, + "eval_loss": 0.43618133664131165, + "eval_runtime": 3.5383, + "eval_samples_per_second": 28.262, + "eval_steps_per_second": 7.065, "step": 59500 }, { - "epoch": 2.09, - "learning_rate": 3.268173801611847e-05, - "loss": 0.2598, + "epoch": 2.144556168234404, + "grad_norm": 0.17493978142738342, + "learning_rate": 3.1893202870922414e-05, + "loss": 0.3907, "step": 59505 }, { - "epoch": 2.09, - "learning_rate": 3.267902705886706e-05, - "loss": 0.2798, + "epoch": 2.144736367895628, + "grad_norm": 0.18247364461421967, + "learning_rate": 3.189039780982423e-05, + "loss": 0.4129, "step": 59510 }, { - "epoch": 2.09, - "learning_rate": 3.267631600190855e-05, - "loss": 0.2612, + "epoch": 2.144916567556853, + "grad_norm": 0.2383251041173935, + "learning_rate": 3.188759265484571e-05, + "loss": 0.3971, "step": 59515 }, { - "epoch": 2.09, - "learning_rate": 3.267360484527814e-05, - "loss": 0.2736, + "epoch": 2.1450967672180776, + "grad_norm": 0.1882295161485672, + "learning_rate": 3.188478740602506e-05, + "loss": 0.3815, "step": 59520 }, { - "epoch": 2.09, - "learning_rate": 3.2670893589011025e-05, - "loss": 0.2493, + "epoch": 2.1452769668793024, + "grad_norm": 0.19644425809383392, + "learning_rate": 3.1881982063400526e-05, + "loss": 0.3999, "step": 59525 }, { - "epoch": 2.09, - "learning_rate": 3.2668182233142414e-05, - "loss": 0.257, + "epoch": 2.145457166540527, + "grad_norm": 0.1579708456993103, + "learning_rate": 3.1879176627010324e-05, + "loss": 0.3654, "step": 59530 }, { - "epoch": 2.09, - "learning_rate": 3.266547077770751e-05, - "loss": 0.2727, + "epoch": 2.1456373662017514, + "grad_norm": 0.23646564781665802, + "learning_rate": 3.187637109689267e-05, + "loss": 0.3908, "step": 59535 }, { - "epoch": 2.09, - "learning_rate": 3.266275922274151e-05, - "loss": 0.2915, + "epoch": 2.145817565862976, + "grad_norm": 0.2409384548664093, + "learning_rate": 3.187356547308578e-05, + "loss": 0.4183, "step": 59540 }, { - "epoch": 2.09, - "learning_rate": 3.266004756827963e-05, - "loss": 0.2943, + "epoch": 2.145997765524201, + "grad_norm": 0.19728174805641174, + "learning_rate": 3.18707597556279e-05, + "loss": 0.4135, "step": 59545 }, { - "epoch": 2.1, - "learning_rate": 3.26573358143571e-05, - "loss": 0.2857, + "epoch": 2.1461779651854256, + "grad_norm": 0.22944805026054382, + "learning_rate": 3.186795394455725e-05, + "loss": 0.4424, "step": 59550 }, { - "epoch": 2.1, - "learning_rate": 3.26546239610091e-05, - "loss": 0.2579, + "epoch": 2.1463581648466503, + "grad_norm": 0.21976561844348907, + "learning_rate": 3.186514803991205e-05, + "loss": 0.4277, "step": 59555 }, { - "epoch": 2.1, - "learning_rate": 3.265191200827085e-05, - "loss": 0.2391, + "epoch": 2.1465383645078746, + "grad_norm": 0.2405269593000412, + "learning_rate": 3.1862342041730545e-05, + "loss": 0.3874, "step": 59560 }, { - "epoch": 2.1, - "learning_rate": 3.2649199956177564e-05, - "loss": 0.2757, + "epoch": 2.1467185641690993, + "grad_norm": 0.18481574952602386, + "learning_rate": 3.185953595005095e-05, + "loss": 0.3858, "step": 59565 }, { - "epoch": 2.1, - "learning_rate": 3.2646487804764456e-05, - "loss": 0.2885, + "epoch": 2.146898763830324, + "grad_norm": 0.1834956705570221, + "learning_rate": 3.1856729764911506e-05, + "loss": 0.4078, "step": 59570 }, { - "epoch": 2.1, - "learning_rate": 3.2643775554066754e-05, - "loss": 0.2694, + "epoch": 2.147078963491549, + 
"grad_norm": 0.2032243311405182, + "learning_rate": 3.1853923486350455e-05, + "loss": 0.4089, "step": 59575 }, { - "epoch": 2.1, - "learning_rate": 3.264106320411966e-05, - "loss": 0.2738, + "epoch": 2.147259163152773, + "grad_norm": 0.19479459524154663, + "learning_rate": 3.185111711440601e-05, + "loss": 0.4115, "step": 59580 }, { - "epoch": 2.1, - "learning_rate": 3.263835075495839e-05, - "loss": 0.2896, + "epoch": 2.147439362813998, + "grad_norm": 0.20436975359916687, + "learning_rate": 3.184831064911644e-05, + "loss": 0.3796, "step": 59585 }, { - "epoch": 2.1, - "learning_rate": 3.263563820661817e-05, - "loss": 0.2797, + "epoch": 2.1476195624752226, + "grad_norm": 0.2070939838886261, + "learning_rate": 3.1845504090519954e-05, + "loss": 0.3942, "step": 59590 }, { - "epoch": 2.1, - "learning_rate": 3.263292555913422e-05, - "loss": 0.2776, + "epoch": 2.1477997621364473, + "grad_norm": 0.18167893588542938, + "learning_rate": 3.18426974386548e-05, + "loss": 0.4023, "step": 59595 }, { - "epoch": 2.1, - "learning_rate": 3.263021281254177e-05, - "loss": 0.272, + "epoch": 2.147979961797672, + "grad_norm": 0.19285474717617035, + "learning_rate": 3.1839890693559216e-05, + "loss": 0.4162, "step": 59600 }, { - "epoch": 2.1, - "learning_rate": 3.262749996687603e-05, - "loss": 0.2816, + "epoch": 2.1481601614588963, + "grad_norm": 0.23616044223308563, + "learning_rate": 3.183708385527144e-05, + "loss": 0.3866, "step": 59605 }, { - "epoch": 2.1, - "learning_rate": 3.262478702217223e-05, - "loss": 0.2527, + "epoch": 2.148340361120121, + "grad_norm": 0.2100156992673874, + "learning_rate": 3.1834276923829734e-05, + "loss": 0.4258, "step": 59610 }, { - "epoch": 2.1, - "learning_rate": 3.262207397846559e-05, - "loss": 0.2852, + "epoch": 2.148520560781346, + "grad_norm": 0.23628145456314087, + "learning_rate": 3.183146989927232e-05, + "loss": 0.4279, "step": 59615 }, { - "epoch": 2.1, - "learning_rate": 3.261936083579135e-05, - "loss": 0.2905, + "epoch": 2.1487007604425705, + "grad_norm": 0.17379525303840637, + "learning_rate": 3.1828662781637455e-05, + "loss": 0.4271, "step": 59620 }, { - "epoch": 2.1, - "learning_rate": 3.2616647594184726e-05, - "loss": 0.2726, + "epoch": 2.148880960103795, + "grad_norm": 0.20716167986392975, + "learning_rate": 3.182585557096337e-05, + "loss": 0.4243, "step": 59625 }, { - "epoch": 2.1, - "learning_rate": 3.2613934253680956e-05, - "loss": 0.2932, + "epoch": 2.1490611597650195, + "grad_norm": 0.19413475692272186, + "learning_rate": 3.1823048267288336e-05, + "loss": 0.3949, "step": 59630 }, { - "epoch": 2.1, - "learning_rate": 3.261122081431527e-05, - "loss": 0.2569, + "epoch": 2.1492413594262443, + "grad_norm": 0.2092110812664032, + "learning_rate": 3.182024087065059e-05, + "loss": 0.3402, "step": 59635 }, { - "epoch": 2.1, - "learning_rate": 3.260850727612289e-05, - "loss": 0.2814, + "epoch": 2.149421559087469, + "grad_norm": 0.18373681604862213, + "learning_rate": 3.1817433381088385e-05, + "loss": 0.3934, "step": 59640 }, { - "epoch": 2.1, - "learning_rate": 3.260633637443745e-05, - "loss": 0.2669, + "epoch": 2.1496017587486937, + "grad_norm": 0.2630181908607483, + "learning_rate": 3.181462579863996e-05, + "loss": 0.415, "step": 59645 }, { - "epoch": 2.1, - "learning_rate": 3.260362265844583e-05, - "loss": 0.2827, + "epoch": 2.149781958409918, + "grad_norm": 0.20736628770828247, + "learning_rate": 3.1811818123343584e-05, + "loss": 0.4196, "step": 59650 }, { - "epoch": 2.1, - "learning_rate": 3.2600908843726175e-05, - "loss": 0.2866, + "epoch": 2.1499621580711428, + 
"grad_norm": 0.2118118554353714, + "learning_rate": 3.180901035523751e-05, + "loss": 0.408, "step": 59655 }, { - "epoch": 2.1, - "learning_rate": 3.2598194930313734e-05, - "loss": 0.2778, + "epoch": 2.1501423577323675, + "grad_norm": 0.218112975358963, + "learning_rate": 3.180620249435998e-05, + "loss": 0.3967, "step": 59660 }, { - "epoch": 2.1, - "learning_rate": 3.2595480918243734e-05, - "loss": 0.2808, + "epoch": 2.1503225573935922, + "grad_norm": 0.19679492712020874, + "learning_rate": 3.180339454074926e-05, + "loss": 0.3891, "step": 59665 }, { - "epoch": 2.1, - "learning_rate": 3.259276680755143e-05, - "loss": 0.2708, + "epoch": 2.1505027570548165, + "grad_norm": 0.19406099617481232, + "learning_rate": 3.18005864944436e-05, + "loss": 0.406, "step": 59670 }, { - "epoch": 2.1, - "learning_rate": 3.259005259827205e-05, - "loss": 0.2914, + "epoch": 2.1506829567160413, + "grad_norm": 0.21432217955589294, + "learning_rate": 3.1797778355481285e-05, + "loss": 0.4209, "step": 59675 }, { - "epoch": 2.1, - "learning_rate": 3.2587338290440824e-05, - "loss": 0.2822, + "epoch": 2.150863156377266, + "grad_norm": 0.20044691860675812, + "learning_rate": 3.179497012390054e-05, + "loss": 0.3638, "step": 59680 }, { - "epoch": 2.1, - "learning_rate": 3.2584623884093027e-05, - "loss": 0.2654, + "epoch": 2.1510433560384907, + "grad_norm": 0.2646994888782501, + "learning_rate": 3.179216179973964e-05, + "loss": 0.4082, "step": 59685 }, { - "epoch": 2.1, - "learning_rate": 3.258190937926387e-05, - "loss": 0.2899, + "epoch": 2.1512235556997155, + "grad_norm": 0.1870126724243164, + "learning_rate": 3.178935338303686e-05, + "loss": 0.3992, "step": 59690 }, { - "epoch": 2.1, - "learning_rate": 3.257919477598863e-05, - "loss": 0.2628, + "epoch": 2.1514037553609398, + "grad_norm": 0.19518372416496277, + "learning_rate": 3.178654487383045e-05, + "loss": 0.359, "step": 59695 }, { - "epoch": 2.1, - "learning_rate": 3.2576480074302544e-05, - "loss": 0.2657, + "epoch": 2.1515839550221645, + "grad_norm": 0.18647782504558563, + "learning_rate": 3.178373627215869e-05, + "loss": 0.4009, "step": 59700 }, { - "epoch": 2.1, - "learning_rate": 3.2573765274240856e-05, - "loss": 0.2717, + "epoch": 2.151764154683389, + "grad_norm": 0.16734398901462555, + "learning_rate": 3.178092757805982e-05, + "loss": 0.4091, "step": 59705 }, { - "epoch": 2.1, - "learning_rate": 3.2571050375838806e-05, - "loss": 0.2832, + "epoch": 2.151944354344614, + "grad_norm": 0.1782519370317459, + "learning_rate": 3.177811879157214e-05, + "loss": 0.384, "step": 59710 }, { - "epoch": 2.1, - "learning_rate": 3.256833537913166e-05, - "loss": 0.2799, + "epoch": 2.1521245540058382, + "grad_norm": 0.16030040383338928, + "learning_rate": 3.1775309912733897e-05, + "loss": 0.3843, "step": 59715 }, { - "epoch": 2.1, - "learning_rate": 3.256562028415467e-05, - "loss": 0.2696, + "epoch": 2.152304753667063, + "grad_norm": 0.19871199131011963, + "learning_rate": 3.177250094158336e-05, + "loss": 0.4164, "step": 59720 }, { - "epoch": 2.1, - "learning_rate": 3.2562905090943095e-05, - "loss": 0.2867, + "epoch": 2.1524849533282877, + "grad_norm": 0.2312350869178772, + "learning_rate": 3.1769691878158823e-05, + "loss": 0.3661, "step": 59725 }, { - "epoch": 2.1, - "learning_rate": 3.256018979953217e-05, - "loss": 0.2534, + "epoch": 2.1526651529895124, + "grad_norm": 0.20747533440589905, + "learning_rate": 3.176688272249854e-05, + "loss": 0.3765, "step": 59730 }, { - "epoch": 2.1, - "learning_rate": 3.255747440995717e-05, - "loss": 0.2905, + "epoch": 2.152845352650737, + 
"grad_norm": 0.19539090991020203, + "learning_rate": 3.176407347464079e-05, + "loss": 0.4251, "step": 59735 }, { - "epoch": 2.1, - "learning_rate": 3.2554758922253336e-05, - "loss": 0.2662, + "epoch": 2.1530255523119615, + "grad_norm": 0.21648086607456207, + "learning_rate": 3.1761264134623844e-05, + "loss": 0.3954, "step": 59740 }, { - "epoch": 2.1, - "learning_rate": 3.255204333645595e-05, - "loss": 0.2624, + "epoch": 2.153205751973186, + "grad_norm": 0.21644406020641327, + "learning_rate": 3.175845470248599e-05, + "loss": 0.3859, "step": 59745 }, { - "epoch": 2.1, - "learning_rate": 3.2549327652600245e-05, - "loss": 0.284, + "epoch": 2.153385951634411, + "grad_norm": 0.19673486053943634, + "learning_rate": 3.175564517826549e-05, + "loss": 0.3904, "step": 59750 }, { - "epoch": 2.1, - "learning_rate": 3.2546611870721506e-05, - "loss": 0.2606, + "epoch": 2.1535661512956357, + "grad_norm": 0.22667108476161957, + "learning_rate": 3.1752835562000636e-05, + "loss": 0.4018, "step": 59755 }, { - "epoch": 2.1, - "learning_rate": 3.254389599085498e-05, - "loss": 0.2445, + "epoch": 2.1537463509568604, + "grad_norm": 0.22213202714920044, + "learning_rate": 3.175002585372971e-05, + "loss": 0.3882, "step": 59760 }, { - "epoch": 2.1, - "learning_rate": 3.254118001303593e-05, - "loss": 0.2853, + "epoch": 2.1539265506180847, + "grad_norm": 0.1795310080051422, + "learning_rate": 3.174721605349099e-05, + "loss": 0.3719, "step": 59765 }, { - "epoch": 2.1, - "learning_rate": 3.2538463937299635e-05, - "loss": 0.2689, + "epoch": 2.1541067502793094, + "grad_norm": 0.20347318053245544, + "learning_rate": 3.174440616132275e-05, + "loss": 0.3895, "step": 59770 }, { - "epoch": 2.1, - "learning_rate": 3.253574776368135e-05, - "loss": 0.2578, + "epoch": 2.154286949940534, + "grad_norm": 0.2064104527235031, + "learning_rate": 3.174159617726329e-05, + "loss": 0.3691, "step": 59775 }, { - "epoch": 2.1, - "learning_rate": 3.253303149221635e-05, - "loss": 0.2666, + "epoch": 2.154467149601759, + "grad_norm": 0.22160401940345764, + "learning_rate": 3.1738786101350884e-05, + "loss": 0.394, "step": 59780 }, { - "epoch": 2.1, - "learning_rate": 3.25303151229399e-05, - "loss": 0.2814, + "epoch": 2.154647349262983, + "grad_norm": 0.18739666044712067, + "learning_rate": 3.173597593362381e-05, + "loss": 0.393, "step": 59785 }, { - "epoch": 2.1, - "learning_rate": 3.252759865588726e-05, - "loss": 0.2801, + "epoch": 2.154827548924208, + "grad_norm": 0.21694833040237427, + "learning_rate": 3.173316567412038e-05, + "loss": 0.3736, "step": 59790 }, { - "epoch": 2.1, - "learning_rate": 3.252488209109373e-05, - "loss": 0.2776, + "epoch": 2.1550077485854326, + "grad_norm": 0.2537897229194641, + "learning_rate": 3.173035532287887e-05, + "loss": 0.3989, "step": 59795 }, { - "epoch": 2.1, - "learning_rate": 3.252216542859456e-05, - "loss": 0.2657, + "epoch": 2.1551879482466574, + "grad_norm": 0.16658885776996613, + "learning_rate": 3.172754487993757e-05, + "loss": 0.4302, "step": 59800 }, { - "epoch": 2.1, - "learning_rate": 3.251944866842502e-05, - "loss": 0.2865, + "epoch": 2.155368147907882, + "grad_norm": 0.18496111035346985, + "learning_rate": 3.1724734345334775e-05, + "loss": 0.3784, "step": 59805 }, { - "epoch": 2.1, - "learning_rate": 3.25167318106204e-05, - "loss": 0.2521, + "epoch": 2.1555483475691064, + "grad_norm": 0.19085146486759186, + "learning_rate": 3.1721923719108775e-05, + "loss": 0.3818, "step": 59810 }, { - "epoch": 2.1, - "learning_rate": 3.251401485521598e-05, - "loss": 0.2784, + "epoch": 2.155728547230331, + 
"grad_norm": 0.21254359185695648, + "learning_rate": 3.1719113001297866e-05, + "loss": 0.4093, "step": 59815 }, { - "epoch": 2.1, - "learning_rate": 3.251129780224702e-05, - "loss": 0.2967, + "epoch": 2.155908746891556, + "grad_norm": 0.16708926856517792, + "learning_rate": 3.171630219194035e-05, + "loss": 0.4064, "step": 59820 }, { - "epoch": 2.1, - "learning_rate": 3.2508580651748804e-05, - "loss": 0.2709, + "epoch": 2.1560889465527806, + "grad_norm": 0.20576465129852295, + "learning_rate": 3.171349129107451e-05, + "loss": 0.4028, "step": 59825 }, { - "epoch": 2.1, - "learning_rate": 3.250586340375663e-05, - "loss": 0.2599, + "epoch": 2.1562691462140053, + "grad_norm": 0.16969583928585052, + "learning_rate": 3.171068029873865e-05, + "loss": 0.3922, "step": 59830 }, { - "epoch": 2.11, - "learning_rate": 3.250314605830575e-05, - "loss": 0.2538, + "epoch": 2.1564493458752296, + "grad_norm": 0.1903369426727295, + "learning_rate": 3.170786921497107e-05, + "loss": 0.3691, "step": 59835 }, { - "epoch": 2.11, - "learning_rate": 3.2500428615431474e-05, - "loss": 0.2799, + "epoch": 2.1566295455364544, + "grad_norm": 0.1657458245754242, + "learning_rate": 3.1705058039810075e-05, + "loss": 0.385, "step": 59840 }, { - "epoch": 2.11, - "learning_rate": 3.249771107516907e-05, - "loss": 0.2776, + "epoch": 2.156809745197679, + "grad_norm": 0.17674793303012848, + "learning_rate": 3.170224677329396e-05, + "loss": 0.3697, "step": 59845 }, { - "epoch": 2.11, - "learning_rate": 3.2494993437553836e-05, - "loss": 0.2709, + "epoch": 2.156989944858904, + "grad_norm": 0.1935698390007019, + "learning_rate": 3.1699435415461034e-05, + "loss": 0.3688, "step": 59850 }, { - "epoch": 2.11, - "learning_rate": 3.2492275702621045e-05, - "loss": 0.247, + "epoch": 2.157170144520128, + "grad_norm": 0.22389110922813416, + "learning_rate": 3.1696623966349586e-05, + "loss": 0.4113, "step": 59855 }, { - "epoch": 2.11, - "learning_rate": 3.2489557870406e-05, - "loss": 0.2678, + "epoch": 2.157350344181353, + "grad_norm": 0.2064765840768814, + "learning_rate": 3.1693812425997946e-05, + "loss": 0.3933, "step": 59860 }, { - "epoch": 2.11, - "learning_rate": 3.2486839940943967e-05, - "loss": 0.2833, + "epoch": 2.1575305438425776, + "grad_norm": 0.17828290164470673, + "learning_rate": 3.1691000794444404e-05, + "loss": 0.4174, "step": 59865 }, { - "epoch": 2.11, - "learning_rate": 3.248412191427027e-05, - "loss": 0.3118, + "epoch": 2.1577107435038023, + "grad_norm": 0.18194176256656647, + "learning_rate": 3.168818907172727e-05, + "loss": 0.421, "step": 59870 }, { - "epoch": 2.11, - "learning_rate": 3.2481403790420164e-05, - "loss": 0.2482, + "epoch": 2.157890943165027, + "grad_norm": 0.1584172546863556, + "learning_rate": 3.168537725788485e-05, + "loss": 0.4051, "step": 59875 }, { - "epoch": 2.11, - "learning_rate": 3.2478685569428975e-05, - "loss": 0.2719, + "epoch": 2.1580711428262513, + "grad_norm": 0.21431034803390503, + "learning_rate": 3.168256535295547e-05, + "loss": 0.4027, "step": 59880 }, { - "epoch": 2.11, - "learning_rate": 3.247596725133197e-05, - "loss": 0.2679, + "epoch": 2.158251342487476, + "grad_norm": 0.18599197268486023, + "learning_rate": 3.167975335697743e-05, + "loss": 0.4038, "step": 59885 }, { - "epoch": 2.11, - "learning_rate": 3.247324883616447e-05, - "loss": 0.3003, + "epoch": 2.158431542148701, + "grad_norm": 0.18698103725910187, + "learning_rate": 3.167694126998903e-05, + "loss": 0.4269, "step": 59890 }, { - "epoch": 2.11, - "learning_rate": 3.247053032396175e-05, - "loss": 0.268, + "epoch": 
2.1586117418099255, + "grad_norm": 0.2108622044324875, + "learning_rate": 3.1674129092028607e-05, + "loss": 0.4332, "step": 59895 }, { - "epoch": 2.11, - "learning_rate": 3.2467811714759114e-05, - "loss": 0.2698, + "epoch": 2.15879194147115, + "grad_norm": 0.22758710384368896, + "learning_rate": 3.167131682313447e-05, + "loss": 0.4095, "step": 59900 }, { - "epoch": 2.11, - "learning_rate": 3.246509300859187e-05, - "loss": 0.2986, + "epoch": 2.1589721411323746, + "grad_norm": 0.17931969463825226, + "learning_rate": 3.1668504463344926e-05, + "loss": 0.4192, "step": 59905 }, { - "epoch": 2.11, - "learning_rate": 3.2462374205495314e-05, - "loss": 0.255, + "epoch": 2.1591523407935993, + "grad_norm": 0.1740187406539917, + "learning_rate": 3.166569201269831e-05, + "loss": 0.4038, "step": 59910 }, { - "epoch": 2.11, - "learning_rate": 3.245965530550475e-05, - "loss": 0.2731, + "epoch": 2.159332540454824, + "grad_norm": 0.199792742729187, + "learning_rate": 3.166287947123292e-05, + "loss": 0.4077, "step": 59915 }, { - "epoch": 2.11, - "learning_rate": 3.2456936308655475e-05, - "loss": 0.2569, + "epoch": 2.1595127401160488, + "grad_norm": 0.22314900159835815, + "learning_rate": 3.1660066838987095e-05, + "loss": 0.4176, "step": 59920 }, { - "epoch": 2.11, - "learning_rate": 3.245421721498279e-05, - "loss": 0.2728, + "epoch": 2.159692939777273, + "grad_norm": 0.2095964401960373, + "learning_rate": 3.165725411599914e-05, + "loss": 0.391, "step": 59925 }, { - "epoch": 2.11, - "learning_rate": 3.245149802452201e-05, - "loss": 0.2508, + "epoch": 2.159873139438498, + "grad_norm": 0.2291499823331833, + "learning_rate": 3.16544413023074e-05, + "loss": 0.4169, "step": 59930 }, { - "epoch": 2.11, - "learning_rate": 3.244877873730844e-05, - "loss": 0.2521, + "epoch": 2.1600533390997225, + "grad_norm": 0.17338679730892181, + "learning_rate": 3.165162839795017e-05, + "loss": 0.3882, "step": 59935 }, { - "epoch": 2.11, - "learning_rate": 3.244605935337739e-05, - "loss": 0.297, + "epoch": 2.1602335387609473, + "grad_norm": 0.16860181093215942, + "learning_rate": 3.16488154029658e-05, + "loss": 0.3689, "step": 59940 }, { - "epoch": 2.11, - "learning_rate": 3.244333987276417e-05, - "loss": 0.2943, + "epoch": 2.1604137384221715, + "grad_norm": 0.2292938083410263, + "learning_rate": 3.164600231739261e-05, + "loss": 0.3986, "step": 59945 }, { - "epoch": 2.11, - "learning_rate": 3.2440620295504076e-05, - "loss": 0.2698, + "epoch": 2.1605939380833963, + "grad_norm": 0.20005054771900177, + "learning_rate": 3.164318914126891e-05, + "loss": 0.4175, "step": 59950 }, { - "epoch": 2.11, - "learning_rate": 3.2437900621632435e-05, - "loss": 0.2422, + "epoch": 2.160774137744621, + "grad_norm": 0.18958815932273865, + "learning_rate": 3.164037587463306e-05, + "loss": 0.3758, "step": 59955 }, { - "epoch": 2.11, - "learning_rate": 3.243518085118456e-05, - "loss": 0.2802, + "epoch": 2.1609543374058457, + "grad_norm": 0.218599334359169, + "learning_rate": 3.1637562517523374e-05, + "loss": 0.3645, "step": 59960 }, { - "epoch": 2.11, - "learning_rate": 3.243246098419576e-05, - "loss": 0.2635, + "epoch": 2.1611345370670705, + "grad_norm": 0.19358861446380615, + "learning_rate": 3.163474906997818e-05, + "loss": 0.3981, "step": 59965 }, { - "epoch": 2.11, - "learning_rate": 3.242974102070134e-05, - "loss": 0.2706, + "epoch": 2.1613147367282948, + "grad_norm": 0.21078740060329437, + "learning_rate": 3.1631935532035814e-05, + "loss": 0.3875, "step": 59970 }, { - "epoch": 2.11, - "learning_rate": 3.2427020960736645e-05, - "loss": 0.2721, + 
"epoch": 2.1614949363895195, + "grad_norm": 0.14373530447483063, + "learning_rate": 3.16291219037346e-05, + "loss": 0.4011, "step": 59975 }, { - "epoch": 2.11, - "learning_rate": 3.242430080433698e-05, - "loss": 0.2893, + "epoch": 2.1616751360507442, + "grad_norm": 0.19193744659423828, + "learning_rate": 3.16263081851129e-05, + "loss": 0.3839, "step": 59980 }, { - "epoch": 2.11, - "learning_rate": 3.242158055153764e-05, - "loss": 0.2645, + "epoch": 2.161855335711969, + "grad_norm": 0.24244289100170135, + "learning_rate": 3.162349437620903e-05, + "loss": 0.3978, "step": 59985 }, { - "epoch": 2.11, - "learning_rate": 3.241886020237399e-05, - "loss": 0.2629, + "epoch": 2.1620355353731937, + "grad_norm": 0.24157002568244934, + "learning_rate": 3.162068047706133e-05, + "loss": 0.3703, "step": 59990 }, { - "epoch": 2.11, - "learning_rate": 3.241613975688132e-05, - "loss": 0.2837, + "epoch": 2.162215735034418, + "grad_norm": 0.20742827653884888, + "learning_rate": 3.161786648770813e-05, + "loss": 0.375, "step": 59995 }, { - "epoch": 2.11, - "learning_rate": 3.2413419215094966e-05, - "loss": 0.2808, + "epoch": 2.1623959346956427, + "grad_norm": 0.1863773912191391, + "learning_rate": 3.1615052408187797e-05, + "loss": 0.3623, "step": 60000 }, { - "epoch": 2.11, - "eval_loss": 0.2701481282711029, - "eval_runtime": 10.5458, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 2.1623959346956427, + "eval_loss": 0.4364284873008728, + "eval_runtime": 3.5244, + "eval_samples_per_second": 28.373, + "eval_steps_per_second": 7.093, "step": 60000 }, { - "epoch": 2.11, - "learning_rate": 3.241069857705025e-05, - "loss": 0.2705, + "epoch": 2.1625761343568675, + "grad_norm": 0.21003571152687073, + "learning_rate": 3.1612238238538646e-05, + "loss": 0.3717, "step": 60005 }, { - "epoch": 2.11, - "learning_rate": 3.240797784278249e-05, - "loss": 0.2748, + "epoch": 2.162756334018092, + "grad_norm": 0.1988876461982727, + "learning_rate": 3.160942397879903e-05, + "loss": 0.3693, "step": 60010 }, { - "epoch": 2.11, - "learning_rate": 3.240525701232703e-05, - "loss": 0.2734, + "epoch": 2.1629365336793165, + "grad_norm": 0.22915050387382507, + "learning_rate": 3.160660962900729e-05, + "loss": 0.3922, "step": 60015 }, { - "epoch": 2.11, - "learning_rate": 3.240253608571918e-05, - "loss": 0.2739, + "epoch": 2.163116733340541, + "grad_norm": 0.18933100998401642, + "learning_rate": 3.160379518920177e-05, + "loss": 0.4227, "step": 60020 }, { - "epoch": 2.11, - "learning_rate": 3.239981506299428e-05, - "loss": 0.2664, + "epoch": 2.163296933001766, + "grad_norm": 0.19733920693397522, + "learning_rate": 3.160098065942084e-05, + "loss": 0.3909, "step": 60025 }, { - "epoch": 2.11, - "learning_rate": 3.2397093944187656e-05, - "loss": 0.2603, + "epoch": 2.1634771326629907, + "grad_norm": 0.21198596060276031, + "learning_rate": 3.1598166039702805e-05, + "loss": 0.3822, "step": 60030 }, { - "epoch": 2.11, - "learning_rate": 3.239437272933464e-05, - "loss": 0.2468, + "epoch": 2.1636573323242154, + "grad_norm": 0.20547007024288177, + "learning_rate": 3.159535133008604e-05, + "loss": 0.3746, "step": 60035 }, { - "epoch": 2.11, - "learning_rate": 3.239165141847057e-05, - "loss": 0.286, + "epoch": 2.1638375319854397, + "grad_norm": 0.21266385912895203, + "learning_rate": 3.1592536530608894e-05, + "loss": 0.372, "step": 60040 }, { - "epoch": 2.11, - "learning_rate": 3.238893001163078e-05, - "loss": 0.2758, + "epoch": 2.1640177316466644, + "grad_norm": 0.21298128366470337, + "learning_rate": 3.158972164130971e-05, 
+ "loss": 0.3811, "step": 60045 }, { - "epoch": 2.11, - "learning_rate": 3.2386208508850605e-05, - "loss": 0.288, + "epoch": 2.164197931307889, + "grad_norm": 0.17953209578990936, + "learning_rate": 3.158690666222685e-05, + "loss": 0.3941, "step": 60050 }, { - "epoch": 2.11, - "learning_rate": 3.238348691016537e-05, - "loss": 0.2894, + "epoch": 2.164378130969114, + "grad_norm": 0.15556363761425018, + "learning_rate": 3.158409159339866e-05, + "loss": 0.3773, "step": 60055 }, { - "epoch": 2.11, - "learning_rate": 3.238076521561044e-05, - "loss": 0.2633, + "epoch": 2.1645583306303386, + "grad_norm": 0.1946418732404709, + "learning_rate": 3.158127643486349e-05, + "loss": 0.3913, "step": 60060 }, { - "epoch": 2.11, - "learning_rate": 3.2378043425221116e-05, - "loss": 0.2684, + "epoch": 2.164738530291563, + "grad_norm": 0.23327438533306122, + "learning_rate": 3.157846118665971e-05, + "loss": 0.4158, "step": 60065 }, { - "epoch": 2.11, - "learning_rate": 3.237532153903278e-05, - "loss": 0.3006, + "epoch": 2.1649187299527877, + "grad_norm": 0.1919216811656952, + "learning_rate": 3.1575645848825675e-05, + "loss": 0.3949, "step": 60070 }, { - "epoch": 2.11, - "learning_rate": 3.237259955708074e-05, - "loss": 0.2576, + "epoch": 2.1650989296140124, + "grad_norm": 0.18845520913600922, + "learning_rate": 3.1572830421399724e-05, + "loss": 0.4004, "step": 60075 }, { - "epoch": 2.11, - "learning_rate": 3.236987747940037e-05, - "loss": 0.2727, + "epoch": 2.165279129275237, + "grad_norm": 0.19371269643306732, + "learning_rate": 3.1570014904420245e-05, + "loss": 0.4108, "step": 60080 }, { - "epoch": 2.11, - "learning_rate": 3.236715530602698e-05, - "loss": 0.2846, + "epoch": 2.1654593289364614, + "grad_norm": 0.1754084974527359, + "learning_rate": 3.156719929792557e-05, + "loss": 0.4292, "step": 60085 }, { - "epoch": 2.11, - "learning_rate": 3.236443303699595e-05, - "loss": 0.2586, + "epoch": 2.165639528597686, + "grad_norm": 0.2297254204750061, + "learning_rate": 3.156438360195409e-05, + "loss": 0.3867, "step": 60090 }, { - "epoch": 2.11, - "learning_rate": 3.236171067234259e-05, - "loss": 0.2887, + "epoch": 2.165819728258911, + "grad_norm": 0.18021254241466522, + "learning_rate": 3.156156781654415e-05, + "loss": 0.4032, "step": 60095 }, { - "epoch": 2.11, - "learning_rate": 3.235898821210229e-05, - "loss": 0.2896, + "epoch": 2.1659999279201356, + "grad_norm": 0.20905311405658722, + "learning_rate": 3.155875194173411e-05, + "loss": 0.4037, "step": 60100 }, { - "epoch": 2.11, - "learning_rate": 3.2356265656310365e-05, - "loss": 0.2694, + "epoch": 2.1661801275813604, + "grad_norm": 0.1627390831708908, + "learning_rate": 3.1555935977562354e-05, + "loss": 0.3906, "step": 60105 }, { - "epoch": 2.11, - "learning_rate": 3.235354300500218e-05, - "loss": 0.25, + "epoch": 2.1663603272425846, + "grad_norm": 0.2780887484550476, + "learning_rate": 3.155311992406724e-05, + "loss": 0.4073, "step": 60110 }, { - "epoch": 2.12, - "learning_rate": 3.235082025821309e-05, - "loss": 0.2701, + "epoch": 2.1665405269038094, + "grad_norm": 0.17551511526107788, + "learning_rate": 3.155030378128713e-05, + "loss": 0.4206, "step": 60115 }, { - "epoch": 2.12, - "learning_rate": 3.234809741597844e-05, - "loss": 0.293, + "epoch": 2.166720726565034, + "grad_norm": 0.16955679655075073, + "learning_rate": 3.15474875492604e-05, + "loss": 0.402, "step": 60120 }, { - "epoch": 2.12, - "learning_rate": 3.234537447833358e-05, - "loss": 0.2771, + "epoch": 2.166900926226259, + "grad_norm": 0.16937977075576782, + "learning_rate": 3.154467122802543e-05, 
+ "loss": 0.3718, "step": 60125 }, { - "epoch": 2.12, - "learning_rate": 3.234265144531389e-05, - "loss": 0.282, + "epoch": 2.167081125887483, + "grad_norm": 0.18345296382904053, + "learning_rate": 3.154185481762057e-05, + "loss": 0.4043, "step": 60130 }, { - "epoch": 2.12, - "learning_rate": 3.2339928316954696e-05, - "loss": 0.2647, + "epoch": 2.167261325548708, + "grad_norm": 0.19128628075122833, + "learning_rate": 3.153903831808421e-05, + "loss": 0.4128, "step": 60135 }, { - "epoch": 2.12, - "learning_rate": 3.2337205093291375e-05, - "loss": 0.282, + "epoch": 2.1674415252099326, + "grad_norm": 0.18635614216327667, + "learning_rate": 3.153622172945472e-05, + "loss": 0.3931, "step": 60140 }, { - "epoch": 2.12, - "learning_rate": 3.233448177435928e-05, - "loss": 0.2892, + "epoch": 2.1676217248711573, + "grad_norm": 0.22392097115516663, + "learning_rate": 3.153340505177047e-05, + "loss": 0.3779, "step": 60145 }, { - "epoch": 2.12, - "learning_rate": 3.2331758360193774e-05, - "loss": 0.2863, + "epoch": 2.167801924532382, + "grad_norm": 0.18912804126739502, + "learning_rate": 3.153058828506984e-05, + "loss": 0.387, "step": 60150 }, { - "epoch": 2.12, - "learning_rate": 3.232903485083022e-05, - "loss": 0.2757, + "epoch": 2.1679821241936064, + "grad_norm": 0.17587023973464966, + "learning_rate": 3.152777142939122e-05, + "loss": 0.4231, "step": 60155 }, { - "epoch": 2.12, - "learning_rate": 3.2326311246303967e-05, - "loss": 0.2936, + "epoch": 2.168162323854831, + "grad_norm": 0.19323548674583435, + "learning_rate": 3.152495448477296e-05, + "loss": 0.3857, "step": 60160 }, { - "epoch": 2.12, - "learning_rate": 3.2323587546650406e-05, - "loss": 0.254, + "epoch": 2.168342523516056, + "grad_norm": 0.2253098487854004, + "learning_rate": 3.152213745125348e-05, + "loss": 0.3647, "step": 60165 }, { - "epoch": 2.12, - "learning_rate": 3.2320863751904875e-05, - "loss": 0.2743, + "epoch": 2.1685227231772806, + "grad_norm": 0.26896703243255615, + "learning_rate": 3.151932032887112e-05, + "loss": 0.4163, "step": 60170 }, { - "epoch": 2.12, - "learning_rate": 3.231813986210276e-05, - "loss": 0.2725, + "epoch": 2.168702922838505, + "grad_norm": 0.16771872341632843, + "learning_rate": 3.151650311766429e-05, + "loss": 0.3935, "step": 60175 }, { - "epoch": 2.12, - "learning_rate": 3.231541587727942e-05, - "loss": 0.2518, + "epoch": 2.1688831224997296, + "grad_norm": 0.1727340817451477, + "learning_rate": 3.1513685817671365e-05, + "loss": 0.3809, "step": 60180 }, { - "epoch": 2.12, - "learning_rate": 3.231269179747023e-05, - "loss": 0.2639, + "epoch": 2.1690633221609543, + "grad_norm": 0.17910057306289673, + "learning_rate": 3.151086842893074e-05, + "loss": 0.4137, "step": 60185 }, { - "epoch": 2.12, - "learning_rate": 3.230996762271055e-05, - "loss": 0.2875, + "epoch": 2.169243521822179, + "grad_norm": 0.21039491891860962, + "learning_rate": 3.150805095148079e-05, + "loss": 0.3931, "step": 60190 }, { - "epoch": 2.12, - "learning_rate": 3.230724335303576e-05, - "loss": 0.2559, + "epoch": 2.169423721483404, + "grad_norm": 0.15780936181545258, + "learning_rate": 3.15052333853599e-05, + "loss": 0.3762, "step": 60195 }, { - "epoch": 2.12, - "learning_rate": 3.2304518988481234e-05, - "loss": 0.2703, + "epoch": 2.169603921144628, + "grad_norm": 0.2158997654914856, + "learning_rate": 3.150241573060647e-05, + "loss": 0.413, "step": 60200 }, { - "epoch": 2.12, - "learning_rate": 3.230179452908234e-05, - "loss": 0.272, + "epoch": 2.169784120805853, + "grad_norm": 0.22214800119400024, + "learning_rate": 
3.1499597987258876e-05, + "loss": 0.406, "step": 60205 }, { - "epoch": 2.12, - "learning_rate": 3.229906997487445e-05, - "loss": 0.2638, + "epoch": 2.1699643204670775, + "grad_norm": 0.21257710456848145, + "learning_rate": 3.149678015535553e-05, + "loss": 0.4273, "step": 60210 }, { - "epoch": 2.12, - "learning_rate": 3.229634532589296e-05, - "loss": 0.2637, + "epoch": 2.1701445201283023, + "grad_norm": 0.18004845082759857, + "learning_rate": 3.14939622349348e-05, + "loss": 0.3658, "step": 60215 }, { - "epoch": 2.12, - "learning_rate": 3.2293620582173226e-05, - "loss": 0.2767, + "epoch": 2.1703247197895266, + "grad_norm": 0.20979435741901398, + "learning_rate": 3.149114422603511e-05, + "loss": 0.397, "step": 60220 }, { - "epoch": 2.12, - "learning_rate": 3.2290895743750634e-05, - "loss": 0.2669, + "epoch": 2.1705049194507513, + "grad_norm": 0.20553478598594666, + "learning_rate": 3.148832612869482e-05, + "loss": 0.3749, "step": 60225 }, { - "epoch": 2.12, - "learning_rate": 3.2288170810660565e-05, - "loss": 0.2701, + "epoch": 2.170685119111976, + "grad_norm": 0.15849637985229492, + "learning_rate": 3.148550794295235e-05, + "loss": 0.3821, "step": 60230 }, { - "epoch": 2.12, - "learning_rate": 3.228544578293841e-05, - "loss": 0.2598, + "epoch": 2.1708653187732008, + "grad_norm": 0.19608749449253082, + "learning_rate": 3.148268966884609e-05, + "loss": 0.3915, "step": 60235 }, { - "epoch": 2.12, - "learning_rate": 3.228272066061953e-05, - "loss": 0.2803, + "epoch": 2.1710455184344255, + "grad_norm": 0.16606873273849487, + "learning_rate": 3.147987130641443e-05, + "loss": 0.3923, "step": 60240 }, { - "epoch": 2.12, - "learning_rate": 3.227999544373933e-05, - "loss": 0.2956, + "epoch": 2.17122571809565, + "grad_norm": 0.17005008459091187, + "learning_rate": 3.147705285569579e-05, + "loss": 0.417, "step": 60245 }, { - "epoch": 2.12, - "learning_rate": 3.2277270132333185e-05, - "loss": 0.249, + "epoch": 2.1714059177568745, + "grad_norm": 0.1739397495985031, + "learning_rate": 3.1474234316728554e-05, + "loss": 0.4078, "step": 60250 }, { - "epoch": 2.12, - "learning_rate": 3.227454472643649e-05, - "loss": 0.259, + "epoch": 2.1715861174180993, + "grad_norm": 0.21615131199359894, + "learning_rate": 3.1471415689551124e-05, + "loss": 0.3828, "step": 60255 }, { - "epoch": 2.12, - "learning_rate": 3.227181922608462e-05, - "loss": 0.2769, + "epoch": 2.171766317079324, + "grad_norm": 0.18913507461547852, + "learning_rate": 3.1468596974201915e-05, + "loss": 0.4276, "step": 60260 }, { - "epoch": 2.12, - "learning_rate": 3.226909363131296e-05, - "loss": 0.2997, + "epoch": 2.1719465167405487, + "grad_norm": 0.17284362018108368, + "learning_rate": 3.1465778170719314e-05, + "loss": 0.4342, "step": 60265 }, { - "epoch": 2.12, - "learning_rate": 3.226636794215692e-05, - "loss": 0.2831, + "epoch": 2.172126716401773, + "grad_norm": 0.21640555560588837, + "learning_rate": 3.146295927914175e-05, + "loss": 0.4252, "step": 60270 }, { - "epoch": 2.12, - "learning_rate": 3.226364215865188e-05, - "loss": 0.2585, + "epoch": 2.1723069160629978, + "grad_norm": 0.1958705186843872, + "learning_rate": 3.1460140299507614e-05, + "loss": 0.4443, "step": 60275 }, { - "epoch": 2.12, - "learning_rate": 3.2260916280833244e-05, - "loss": 0.2614, + "epoch": 2.1724871157242225, + "grad_norm": 0.18334457278251648, + "learning_rate": 3.145732123185531e-05, + "loss": 0.3744, "step": 60280 }, { - "epoch": 2.12, - "learning_rate": 3.2258190308736375e-05, - "loss": 0.2785, + "epoch": 2.172667315385447, + "grad_norm": 0.2335662692785263, + 
"learning_rate": 3.1454502076223255e-05, + "loss": 0.4132, "step": 60285 }, { - "epoch": 2.12, - "learning_rate": 3.22554642423967e-05, - "loss": 0.2941, + "epoch": 2.1728475150466715, + "grad_norm": 0.1668962836265564, + "learning_rate": 3.145168283264987e-05, + "loss": 0.3935, "step": 60290 }, { - "epoch": 2.12, - "learning_rate": 3.22527380818496e-05, - "loss": 0.2816, + "epoch": 2.1730277147078962, + "grad_norm": 0.2275708168745041, + "learning_rate": 3.144886350117355e-05, + "loss": 0.3886, "step": 60295 }, { - "epoch": 2.12, - "learning_rate": 3.225001182713048e-05, - "loss": 0.2667, + "epoch": 2.173207914369121, + "grad_norm": 0.21035301685333252, + "learning_rate": 3.144604408183271e-05, + "loss": 0.4101, "step": 60300 }, { - "epoch": 2.12, - "learning_rate": 3.224728547827473e-05, - "loss": 0.2592, + "epoch": 2.1733881140303457, + "grad_norm": 0.20323359966278076, + "learning_rate": 3.144322457466577e-05, + "loss": 0.3783, "step": 60305 }, { - "epoch": 2.12, - "learning_rate": 3.224455903531775e-05, - "loss": 0.2858, + "epoch": 2.1735683136915704, + "grad_norm": 0.21606206893920898, + "learning_rate": 3.144040497971115e-05, + "loss": 0.4244, "step": 60310 }, { - "epoch": 2.12, - "learning_rate": 3.224183249829495e-05, - "loss": 0.2857, + "epoch": 2.1737485133527947, + "grad_norm": 0.1799047589302063, + "learning_rate": 3.143758529700724e-05, + "loss": 0.3813, "step": 60315 }, { - "epoch": 2.12, - "learning_rate": 3.223910586724173e-05, - "loss": 0.268, + "epoch": 2.1739287130140195, + "grad_norm": 0.17974518239498138, + "learning_rate": 3.143476552659249e-05, + "loss": 0.4338, "step": 60320 }, { - "epoch": 2.12, - "learning_rate": 3.223637914219349e-05, - "loss": 0.275, + "epoch": 2.174108912675244, + "grad_norm": 0.17822134494781494, + "learning_rate": 3.14319456685053e-05, + "loss": 0.3918, "step": 60325 }, { - "epoch": 2.12, - "learning_rate": 3.2233652323185636e-05, - "loss": 0.2762, + "epoch": 2.174289112336469, + "grad_norm": 0.21287919580936432, + "learning_rate": 3.1429125722784105e-05, + "loss": 0.4043, "step": 60330 }, { - "epoch": 2.12, - "learning_rate": 3.2230925410253576e-05, - "loss": 0.2881, + "epoch": 2.1744693119976937, + "grad_norm": 0.2515278160572052, + "learning_rate": 3.1426305689467304e-05, + "loss": 0.3708, "step": 60335 }, { - "epoch": 2.12, - "learning_rate": 3.2228198403432705e-05, - "loss": 0.2852, + "epoch": 2.174649511658918, + "grad_norm": 0.20817367732524872, + "learning_rate": 3.142348556859335e-05, + "loss": 0.4246, "step": 60340 }, { - "epoch": 2.12, - "learning_rate": 3.222547130275845e-05, - "loss": 0.2704, + "epoch": 2.1748297113201427, + "grad_norm": 0.20883257687091827, + "learning_rate": 3.142066536020063e-05, + "loss": 0.4121, "step": 60345 }, { - "epoch": 2.12, - "learning_rate": 3.222274410826621e-05, - "loss": 0.2745, + "epoch": 2.1750099109813674, + "grad_norm": 0.2272225171327591, + "learning_rate": 3.14178450643276e-05, + "loss": 0.4404, "step": 60350 }, { - "epoch": 2.12, - "learning_rate": 3.22200168199914e-05, - "loss": 0.2804, + "epoch": 2.175190110642592, + "grad_norm": 0.20350000262260437, + "learning_rate": 3.141502468101267e-05, + "loss": 0.4316, "step": 60355 }, { - "epoch": 2.12, - "learning_rate": 3.2217289437969435e-05, - "loss": 0.2782, + "epoch": 2.1753703103038164, + "grad_norm": 0.16587962210178375, + "learning_rate": 3.141220421029427e-05, + "loss": 0.367, "step": 60360 }, { - "epoch": 2.12, - "learning_rate": 3.22145619622357e-05, - "loss": 0.2681, + "epoch": 2.175550509965041, + "grad_norm": 0.16492369771003723, + 
"learning_rate": 3.140938365221082e-05, + "loss": 0.417, "step": 60365 }, { - "epoch": 2.12, - "learning_rate": 3.221183439282566e-05, - "loss": 0.2567, + "epoch": 2.175730709626266, + "grad_norm": 0.1600433737039566, + "learning_rate": 3.140656300680077e-05, + "loss": 0.3949, "step": 60370 }, { - "epoch": 2.12, - "learning_rate": 3.2209106729774686e-05, - "loss": 0.2655, + "epoch": 2.1759109092874906, + "grad_norm": 0.2284023016691208, + "learning_rate": 3.140374227410254e-05, + "loss": 0.433, "step": 60375 }, { - "epoch": 2.12, - "learning_rate": 3.220637897311822e-05, - "loss": 0.2645, + "epoch": 2.1760911089487154, + "grad_norm": 0.267976313829422, + "learning_rate": 3.1400921454154553e-05, + "loss": 0.3941, "step": 60380 }, { - "epoch": 2.12, - "learning_rate": 3.2203651122891664e-05, - "loss": 0.2739, + "epoch": 2.1762713086099397, + "grad_norm": 0.19785836338996887, + "learning_rate": 3.1398100546995256e-05, + "loss": 0.4047, "step": 60385 }, { - "epoch": 2.12, - "learning_rate": 3.2200923179130455e-05, - "loss": 0.282, + "epoch": 2.1764515082711644, + "grad_norm": 0.19465455412864685, + "learning_rate": 3.1395279552663075e-05, + "loss": 0.4211, "step": 60390 }, { - "epoch": 2.12, - "learning_rate": 3.2198195141869995e-05, - "loss": 0.2717, + "epoch": 2.176631707932389, + "grad_norm": 0.22500787675380707, + "learning_rate": 3.139245847119646e-05, + "loss": 0.4166, "step": 60395 }, { - "epoch": 2.13, - "learning_rate": 3.219546701114572e-05, - "loss": 0.251, + "epoch": 2.176811907593614, + "grad_norm": 0.19240856170654297, + "learning_rate": 3.1389637302633816e-05, + "loss": 0.3926, "step": 60400 }, { - "epoch": 2.13, - "learning_rate": 3.219273878699305e-05, - "loss": 0.2748, + "epoch": 2.176992107254838, + "grad_norm": 0.22199541330337524, + "learning_rate": 3.1386816047013615e-05, + "loss": 0.3848, "step": 60405 }, { - "epoch": 2.13, - "learning_rate": 3.219001046944741e-05, - "loss": 0.2541, + "epoch": 2.177172306916063, + "grad_norm": 0.21987438201904297, + "learning_rate": 3.1383994704374276e-05, + "loss": 0.3841, "step": 60410 }, { - "epoch": 2.13, - "learning_rate": 3.2187282058544214e-05, - "loss": 0.2729, + "epoch": 2.1773525065772876, + "grad_norm": 0.21120600402355194, + "learning_rate": 3.138117327475425e-05, + "loss": 0.3895, "step": 60415 }, { - "epoch": 2.13, - "learning_rate": 3.2184553554318894e-05, - "loss": 0.2877, + "epoch": 2.1775327062385124, + "grad_norm": 0.14966708421707153, + "learning_rate": 3.137835175819197e-05, + "loss": 0.3998, "step": 60420 }, { - "epoch": 2.13, - "learning_rate": 3.2181824956806896e-05, - "loss": 0.2689, + "epoch": 2.177712905899737, + "grad_norm": 0.18459533154964447, + "learning_rate": 3.137553015472588e-05, + "loss": 0.3855, "step": 60425 }, { - "epoch": 2.13, - "learning_rate": 3.217909626604362e-05, - "loss": 0.2677, + "epoch": 2.1778931055609614, + "grad_norm": 0.18099567294120789, + "learning_rate": 3.1372708464394427e-05, + "loss": 0.3645, "step": 60430 }, { - "epoch": 2.13, - "learning_rate": 3.217636748206452e-05, - "loss": 0.2678, + "epoch": 2.178073305222186, + "grad_norm": 0.20282356441020966, + "learning_rate": 3.136988668723606e-05, + "loss": 0.4062, "step": 60435 }, { - "epoch": 2.13, - "learning_rate": 3.217363860490501e-05, - "loss": 0.2515, + "epoch": 2.178253504883411, + "grad_norm": 0.22229717671871185, + "learning_rate": 3.136706482328922e-05, + "loss": 0.3824, "step": 60440 }, { - "epoch": 2.13, - "learning_rate": 3.217090963460053e-05, - "loss": 0.2667, + "epoch": 2.1784337045446356, + "grad_norm": 
0.22084426879882812, + "learning_rate": 3.1364242872592345e-05, + "loss": 0.3796, "step": 60445 }, { - "epoch": 2.13, - "learning_rate": 3.216818057118652e-05, - "loss": 0.2902, + "epoch": 2.17861390420586, + "grad_norm": 0.24100591242313385, + "learning_rate": 3.1361420835183894e-05, + "loss": 0.4144, "step": 60450 }, { - "epoch": 2.13, - "learning_rate": 3.216545141469841e-05, - "loss": 0.2829, + "epoch": 2.1787941038670846, + "grad_norm": 0.18646858632564545, + "learning_rate": 3.1358598711102336e-05, + "loss": 0.3665, "step": 60455 }, { - "epoch": 2.13, - "learning_rate": 3.2162722165171624e-05, - "loss": 0.2775, + "epoch": 2.1789743035283093, + "grad_norm": 0.22386908531188965, + "learning_rate": 3.135577650038608e-05, + "loss": 0.3888, "step": 60460 }, { - "epoch": 2.13, - "learning_rate": 3.2159992822641626e-05, - "loss": 0.2978, + "epoch": 2.179154503189534, + "grad_norm": 0.1991269886493683, + "learning_rate": 3.135295420307361e-05, + "loss": 0.4148, "step": 60465 }, { - "epoch": 2.13, - "learning_rate": 3.215726338714383e-05, - "loss": 0.2594, + "epoch": 2.179334702850759, + "grad_norm": 0.18925313651561737, + "learning_rate": 3.135013181920336e-05, + "loss": 0.3654, "step": 60470 }, { - "epoch": 2.13, - "learning_rate": 3.215453385871369e-05, - "loss": 0.282, + "epoch": 2.179514902511983, + "grad_norm": 0.2189246565103531, + "learning_rate": 3.13473093488138e-05, + "loss": 0.3729, "step": 60475 }, { - "epoch": 2.13, - "learning_rate": 3.215180423738664e-05, - "loss": 0.2855, + "epoch": 2.179695102173208, + "grad_norm": 0.17129361629486084, + "learning_rate": 3.134448679194338e-05, + "loss": 0.4165, "step": 60480 }, { - "epoch": 2.13, - "learning_rate": 3.214907452319813e-05, - "loss": 0.268, + "epoch": 2.1798753018344326, + "grad_norm": 0.20182783901691437, + "learning_rate": 3.134166414863055e-05, + "loss": 0.3914, "step": 60485 }, { - "epoch": 2.13, - "learning_rate": 3.21463447161836e-05, - "loss": 0.2403, + "epoch": 2.1800555014956573, + "grad_norm": 0.18997080624103546, + "learning_rate": 3.1338841418913776e-05, + "loss": 0.3937, "step": 60490 }, { - "epoch": 2.13, - "learning_rate": 3.2143614816378486e-05, - "loss": 0.2631, + "epoch": 2.1802357011568816, + "grad_norm": 0.1698470115661621, + "learning_rate": 3.133601860283152e-05, + "loss": 0.3883, "step": 60495 }, { - "epoch": 2.13, - "learning_rate": 3.2140884823818246e-05, - "loss": 0.2759, + "epoch": 2.1804159008181063, + "grad_norm": 0.2348821759223938, + "learning_rate": 3.133319570042224e-05, + "loss": 0.4298, "step": 60500 }, { - "epoch": 2.13, - "eval_loss": 0.26962408423423767, - "eval_runtime": 10.5459, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 2.1804159008181063, + "eval_loss": 0.4370137155056, + "eval_runtime": 3.5451, + "eval_samples_per_second": 28.208, + "eval_steps_per_second": 7.052, "step": 60500 }, { - "epoch": 2.13, - "learning_rate": 3.213815473853832e-05, - "loss": 0.2721, + "epoch": 2.180596100479331, + "grad_norm": 0.16450156271457672, + "learning_rate": 3.1330372711724385e-05, + "loss": 0.3918, "step": 60505 }, { - "epoch": 2.13, - "learning_rate": 3.213542456057416e-05, - "loss": 0.2745, + "epoch": 2.180776300140556, + "grad_norm": 0.23515671491622925, + "learning_rate": 3.132754963677643e-05, + "loss": 0.3826, "step": 60510 }, { - "epoch": 2.13, - "learning_rate": 3.213269428996121e-05, - "loss": 0.2793, + "epoch": 2.1809564998017805, + "grad_norm": 0.1602397859096527, + "learning_rate": 3.132472647561684e-05, + "loss": 0.3654, "step": 60515 }, { - "epoch": 
2.13, - "learning_rate": 3.212996392673494e-05, - "loss": 0.2722, + "epoch": 2.181136699463005, + "grad_norm": 0.15597917139530182, + "learning_rate": 3.1321903228284064e-05, + "loss": 0.3848, "step": 60520 }, { - "epoch": 2.13, - "learning_rate": 3.212723347093077e-05, - "loss": 0.2784, + "epoch": 2.1813168991242295, + "grad_norm": 0.1703244298696518, + "learning_rate": 3.1319079894816595e-05, + "loss": 0.4033, "step": 60525 }, { - "epoch": 2.13, - "learning_rate": 3.212450292258418e-05, - "loss": 0.2722, + "epoch": 2.1814970987854543, + "grad_norm": 0.19135898351669312, + "learning_rate": 3.131625647525288e-05, + "loss": 0.389, "step": 60530 }, { - "epoch": 2.13, - "learning_rate": 3.2121772281730603e-05, - "loss": 0.2757, + "epoch": 2.181677298446679, + "grad_norm": 0.22922751307487488, + "learning_rate": 3.1313432969631397e-05, + "loss": 0.422, "step": 60535 }, { - "epoch": 2.13, - "learning_rate": 3.2119041548405526e-05, - "loss": 0.2651, + "epoch": 2.1818574981079037, + "grad_norm": 0.24647971987724304, + "learning_rate": 3.1310609377990603e-05, + "loss": 0.4223, "step": 60540 }, { - "epoch": 2.13, - "learning_rate": 3.2116310722644374e-05, - "loss": 0.2571, + "epoch": 2.182037697769128, + "grad_norm": 0.2550751268863678, + "learning_rate": 3.130778570036898e-05, + "loss": 0.4325, "step": 60545 }, { - "epoch": 2.13, - "learning_rate": 3.211357980448262e-05, - "loss": 0.2568, + "epoch": 2.1822178974303528, + "grad_norm": 0.20791274309158325, + "learning_rate": 3.130496193680501e-05, + "loss": 0.4028, "step": 60550 }, { - "epoch": 2.13, - "learning_rate": 3.211084879395572e-05, - "loss": 0.2969, + "epoch": 2.1823980970915775, + "grad_norm": 0.1871381402015686, + "learning_rate": 3.130213808733714e-05, + "loss": 0.3844, "step": 60555 }, { - "epoch": 2.13, - "learning_rate": 3.210811769109913e-05, - "loss": 0.2799, + "epoch": 2.1825782967528022, + "grad_norm": 0.2246485948562622, + "learning_rate": 3.129931415200387e-05, + "loss": 0.4274, "step": 60560 }, { - "epoch": 2.13, - "learning_rate": 3.210538649594832e-05, - "loss": 0.2843, + "epoch": 2.182758496414027, + "grad_norm": 0.22834356129169464, + "learning_rate": 3.129649013084365e-05, + "loss": 0.4065, "step": 60565 }, { - "epoch": 2.13, - "learning_rate": 3.210265520853875e-05, - "loss": 0.2762, + "epoch": 2.1829386960752513, + "grad_norm": 0.2202819585800171, + "learning_rate": 3.1293666023894984e-05, + "loss": 0.3958, "step": 60570 }, { - "epoch": 2.13, - "learning_rate": 3.2099923828905883e-05, - "loss": 0.2606, + "epoch": 2.183118895736476, + "grad_norm": 0.23861894011497498, + "learning_rate": 3.129084183119634e-05, + "loss": 0.3895, "step": 60575 }, { - "epoch": 2.13, - "learning_rate": 3.209719235708519e-05, - "loss": 0.2838, + "epoch": 2.1832990953977007, + "grad_norm": 0.20570367574691772, + "learning_rate": 3.128801755278618e-05, + "loss": 0.3865, "step": 60580 }, { - "epoch": 2.13, - "learning_rate": 3.209446079311211e-05, - "loss": 0.2753, + "epoch": 2.1834792950589255, + "grad_norm": 0.2041841596364975, + "learning_rate": 3.128519318870301e-05, + "loss": 0.417, "step": 60585 }, { - "epoch": 2.13, - "learning_rate": 3.209172913702215e-05, - "loss": 0.2683, + "epoch": 2.1836594947201498, + "grad_norm": 0.19558176398277283, + "learning_rate": 3.128236873898529e-05, + "loss": 0.3732, "step": 60590 }, { - "epoch": 2.13, - "learning_rate": 3.208899738885076e-05, - "loss": 0.2761, + "epoch": 2.1838396943813745, + "grad_norm": 0.2021927386522293, + "learning_rate": 3.1279544203671516e-05, + "loss": 0.4129, "step": 60595 }, { - 
"epoch": 2.13, - "learning_rate": 3.208626554863341e-05, - "loss": 0.2663, + "epoch": 2.184019894042599, + "grad_norm": 0.19117502868175507, + "learning_rate": 3.1276719582800176e-05, + "loss": 0.4208, "step": 60600 }, { - "epoch": 2.13, - "learning_rate": 3.208353361640557e-05, - "loss": 0.2608, + "epoch": 2.184200093703824, + "grad_norm": 0.18669839203357697, + "learning_rate": 3.127389487640974e-05, + "loss": 0.4171, "step": 60605 }, { - "epoch": 2.13, - "learning_rate": 3.2080801592202715e-05, - "loss": 0.2925, + "epoch": 2.1843802933650487, + "grad_norm": 0.18443822860717773, + "learning_rate": 3.1271070084538703e-05, + "loss": 0.4149, "step": 60610 }, { - "epoch": 2.13, - "learning_rate": 3.2078069476060315e-05, - "loss": 0.2767, + "epoch": 2.184560493026273, + "grad_norm": 0.20405270159244537, + "learning_rate": 3.126824520722554e-05, + "loss": 0.3927, "step": 60615 }, { - "epoch": 2.13, - "learning_rate": 3.207533726801385e-05, - "loss": 0.2809, + "epoch": 2.1847406926874977, + "grad_norm": 0.19594557583332062, + "learning_rate": 3.126542024450876e-05, + "loss": 0.3956, "step": 60620 }, { - "epoch": 2.13, - "learning_rate": 3.207260496809879e-05, - "loss": 0.2835, + "epoch": 2.1849208923487224, + "grad_norm": 0.21037110686302185, + "learning_rate": 3.126259519642684e-05, + "loss": 0.3782, "step": 60625 }, { - "epoch": 2.13, - "learning_rate": 3.206987257635062e-05, - "loss": 0.2855, + "epoch": 2.185101092009947, + "grad_norm": 0.21924084424972534, + "learning_rate": 3.125977006301828e-05, + "loss": 0.3774, "step": 60630 }, { - "epoch": 2.13, - "learning_rate": 3.206714009280481e-05, - "loss": 0.2797, + "epoch": 2.1852812916711715, + "grad_norm": 0.22989316284656525, + "learning_rate": 3.125694484432155e-05, + "loss": 0.4115, "step": 60635 }, { - "epoch": 2.13, - "learning_rate": 3.206440751749685e-05, - "loss": 0.2827, + "epoch": 2.185461491332396, + "grad_norm": 0.20585152506828308, + "learning_rate": 3.1254119540375173e-05, + "loss": 0.3983, "step": 60640 }, { - "epoch": 2.13, - "learning_rate": 3.206167485046221e-05, - "loss": 0.2775, + "epoch": 2.185641690993621, + "grad_norm": 0.20063400268554688, + "learning_rate": 3.1251294151217614e-05, + "loss": 0.3712, "step": 60645 }, { - "epoch": 2.13, - "learning_rate": 3.2058942091736366e-05, - "loss": 0.2738, + "epoch": 2.1858218906548457, + "grad_norm": 0.18336834013462067, + "learning_rate": 3.1248468676887396e-05, + "loss": 0.4123, "step": 60650 }, { - "epoch": 2.13, - "learning_rate": 3.2056209241354826e-05, - "loss": 0.2301, + "epoch": 2.1860020903160704, + "grad_norm": 0.2062855362892151, + "learning_rate": 3.124564311742299e-05, + "loss": 0.3857, "step": 60655 }, { - "epoch": 2.13, - "learning_rate": 3.205347629935305e-05, - "loss": 0.3128, + "epoch": 2.1861822899772947, + "grad_norm": 0.2320207953453064, + "learning_rate": 3.124281747286291e-05, + "loss": 0.4176, "step": 60660 }, { - "epoch": 2.13, - "learning_rate": 3.2050743265766534e-05, - "loss": 0.2745, + "epoch": 2.1863624896385194, + "grad_norm": 0.17851118743419647, + "learning_rate": 3.1239991743245656e-05, + "loss": 0.3797, "step": 60665 }, { - "epoch": 2.13, - "learning_rate": 3.204801014063077e-05, - "loss": 0.2572, + "epoch": 2.186542689299744, + "grad_norm": 0.17879897356033325, + "learning_rate": 3.123716592860971e-05, + "loss": 0.3958, "step": 60670 }, { - "epoch": 2.13, - "learning_rate": 3.204527692398124e-05, - "loss": 0.2781, + "epoch": 2.186722888960969, + "grad_norm": 0.24118387699127197, + "learning_rate": 3.12343400289936e-05, + "loss": 0.4083, "step": 
60675 }, { - "epoch": 2.13, - "learning_rate": 3.204254361585342e-05, - "loss": 0.2621, + "epoch": 2.186903088622193, + "grad_norm": 0.17323794960975647, + "learning_rate": 3.123151404443581e-05, + "loss": 0.3856, "step": 60680 }, { - "epoch": 2.14, - "learning_rate": 3.203981021628282e-05, - "loss": 0.2497, + "epoch": 2.187083288283418, + "grad_norm": 0.19309526681900024, + "learning_rate": 3.122868797497485e-05, + "loss": 0.3939, "step": 60685 }, { - "epoch": 2.14, - "learning_rate": 3.203707672530493e-05, - "loss": 0.2819, + "epoch": 2.1872634879446426, + "grad_norm": 0.20537187159061432, + "learning_rate": 3.122586182064921e-05, + "loss": 0.3747, "step": 60690 }, { - "epoch": 2.14, - "learning_rate": 3.2034343142955225e-05, - "loss": 0.249, + "epoch": 2.1874436876058674, + "grad_norm": 0.22961921989917755, + "learning_rate": 3.122303558149742e-05, + "loss": 0.4318, "step": 60695 }, { - "epoch": 2.14, - "learning_rate": 3.203160946926921e-05, - "loss": 0.2888, + "epoch": 2.187623887267092, + "grad_norm": 0.2008984237909317, + "learning_rate": 3.122020925755797e-05, + "loss": 0.4176, "step": 60700 }, { - "epoch": 2.14, - "learning_rate": 3.202887570428239e-05, - "loss": 0.2805, + "epoch": 2.1878040869283164, + "grad_norm": 0.2251536101102829, + "learning_rate": 3.121738284886938e-05, + "loss": 0.3891, "step": 60705 }, { - "epoch": 2.14, - "learning_rate": 3.2026141848030244e-05, - "loss": 0.2742, + "epoch": 2.187984286589541, + "grad_norm": 0.16935603320598602, + "learning_rate": 3.121455635547014e-05, + "loss": 0.3819, "step": 60710 }, { - "epoch": 2.14, - "learning_rate": 3.202340790054828e-05, - "loss": 0.2432, + "epoch": 2.188164486250766, + "grad_norm": 0.20971350371837616, + "learning_rate": 3.121172977739878e-05, + "loss": 0.4172, "step": 60715 }, { - "epoch": 2.14, - "learning_rate": 3.202067386187199e-05, - "loss": 0.2833, + "epoch": 2.1883446859119906, + "grad_norm": 0.25330233573913574, + "learning_rate": 3.1208903114693804e-05, + "loss": 0.403, "step": 60720 }, { - "epoch": 2.14, - "learning_rate": 3.201793973203688e-05, - "loss": 0.267, + "epoch": 2.188524885573215, + "grad_norm": 0.17658570408821106, + "learning_rate": 3.1206076367393724e-05, + "loss": 0.3783, "step": 60725 }, { - "epoch": 2.14, - "learning_rate": 3.201520551107844e-05, - "loss": 0.2622, + "epoch": 2.1887050852344396, + "grad_norm": 0.18725471198558807, + "learning_rate": 3.1203249535537056e-05, + "loss": 0.3887, "step": 60730 }, { - "epoch": 2.14, - "learning_rate": 3.201247119903219e-05, - "loss": 0.2818, + "epoch": 2.1888852848956644, + "grad_norm": 0.17408056557178497, + "learning_rate": 3.1200422619162315e-05, + "loss": 0.3688, "step": 60735 }, { - "epoch": 2.14, - "learning_rate": 3.2009736795933616e-05, - "loss": 0.2603, + "epoch": 2.189065484556889, + "grad_norm": 0.21276924014091492, + "learning_rate": 3.119759561830802e-05, + "loss": 0.4114, "step": 60740 }, { - "epoch": 2.14, - "learning_rate": 3.200700230181823e-05, - "loss": 0.2846, + "epoch": 2.189245684218114, + "grad_norm": 0.2514096200466156, + "learning_rate": 3.119476853301268e-05, + "loss": 0.4205, "step": 60745 }, { - "epoch": 2.14, - "learning_rate": 3.2004267716721534e-05, - "loss": 0.2779, + "epoch": 2.189425883879338, + "grad_norm": 0.1897374540567398, + "learning_rate": 3.1191941363314814e-05, + "loss": 0.383, "step": 60750 }, { - "epoch": 2.14, - "learning_rate": 3.200153304067904e-05, - "loss": 0.2703, + "epoch": 2.189606083540563, + "grad_norm": 0.23337703943252563, + "learning_rate": 3.1189114109252946e-05, + "loss": 0.4357, 
"step": 60755 }, { - "epoch": 2.14, - "learning_rate": 3.199879827372626e-05, - "loss": 0.2681, + "epoch": 2.1897862832017876, + "grad_norm": 0.2705073654651642, + "learning_rate": 3.118628677086561e-05, + "loss": 0.4014, "step": 60760 }, { - "epoch": 2.14, - "learning_rate": 3.199606341589869e-05, - "loss": 0.276, + "epoch": 2.1899664828630123, + "grad_norm": 0.20422200858592987, + "learning_rate": 3.1183459348191296e-05, + "loss": 0.4171, "step": 60765 }, { - "epoch": 2.14, - "learning_rate": 3.199332846723185e-05, - "loss": 0.2633, + "epoch": 2.190146682524237, + "grad_norm": 0.20398341119289398, + "learning_rate": 3.118063184126856e-05, + "loss": 0.3475, "step": 60770 }, { - "epoch": 2.14, - "learning_rate": 3.199059342776125e-05, - "loss": 0.2834, + "epoch": 2.1903268821854613, + "grad_norm": 0.17860305309295654, + "learning_rate": 3.11778042501359e-05, + "loss": 0.3701, "step": 60775 }, { - "epoch": 2.14, - "learning_rate": 3.19878582975224e-05, - "loss": 0.2831, + "epoch": 2.190507081846686, + "grad_norm": 0.19039638340473175, + "learning_rate": 3.117497657483187e-05, + "loss": 0.4299, "step": 60780 }, { - "epoch": 2.14, - "learning_rate": 3.198512307655081e-05, - "loss": 0.2631, + "epoch": 2.190687281507911, + "grad_norm": 0.2850659489631653, + "learning_rate": 3.117214881539496e-05, + "loss": 0.4103, "step": 60785 }, { - "epoch": 2.14, - "learning_rate": 3.1982387764882005e-05, - "loss": 0.279, + "epoch": 2.1908674811691355, + "grad_norm": 0.24027515947818756, + "learning_rate": 3.116932097186373e-05, + "loss": 0.422, "step": 60790 }, { - "epoch": 2.14, - "learning_rate": 3.19796523625515e-05, - "loss": 0.2773, + "epoch": 2.19104768083036, + "grad_norm": 0.18420986831188202, + "learning_rate": 3.116649304427669e-05, + "loss": 0.3831, "step": 60795 }, { - "epoch": 2.14, - "learning_rate": 3.197691686959481e-05, - "loss": 0.2691, + "epoch": 2.1912278804915846, + "grad_norm": 0.23363029956817627, + "learning_rate": 3.116366503267238e-05, + "loss": 0.4279, "step": 60800 }, { - "epoch": 2.14, - "learning_rate": 3.1974181286047435e-05, - "loss": 0.2634, + "epoch": 2.1914080801528093, + "grad_norm": 0.18705283105373383, + "learning_rate": 3.116083693708933e-05, + "loss": 0.377, "step": 60805 }, { - "epoch": 2.14, - "learning_rate": 3.197144561194493e-05, - "loss": 0.2702, + "epoch": 2.191588279814034, + "grad_norm": 0.22469288110733032, + "learning_rate": 3.115800875756606e-05, + "loss": 0.3951, "step": 60810 }, { - "epoch": 2.14, - "learning_rate": 3.196870984732279e-05, - "loss": 0.2642, + "epoch": 2.1917684794752588, + "grad_norm": 0.17964445054531097, + "learning_rate": 3.115518049414112e-05, + "loss": 0.3898, "step": 60815 }, { - "epoch": 2.14, - "learning_rate": 3.1965973992216554e-05, - "loss": 0.2564, + "epoch": 2.191948679136483, + "grad_norm": 0.22461974620819092, + "learning_rate": 3.115235214685303e-05, + "loss": 0.4117, "step": 60820 }, { - "epoch": 2.14, - "learning_rate": 3.196323804666173e-05, - "loss": 0.2799, + "epoch": 2.192128878797708, + "grad_norm": 0.1743023842573166, + "learning_rate": 3.114952371574035e-05, + "loss": 0.4163, "step": 60825 }, { - "epoch": 2.14, - "learning_rate": 3.1960502010693844e-05, - "loss": 0.2689, + "epoch": 2.1923090784589325, + "grad_norm": 0.22755619883537292, + "learning_rate": 3.114669520084158e-05, + "loss": 0.4297, "step": 60830 }, { - "epoch": 2.14, - "learning_rate": 3.1957765884348427e-05, - "loss": 0.271, + "epoch": 2.1924892781201573, + "grad_norm": 0.21366189420223236, + "learning_rate": 3.114386660219528e-05, + "loss": 0.3677, 
"step": 60835 }, { - "epoch": 2.14, - "learning_rate": 3.1955029667661016e-05, - "loss": 0.2715, + "epoch": 2.192669477781382, + "grad_norm": 0.19875569641590118, + "learning_rate": 3.1141037919839996e-05, + "loss": 0.377, "step": 60840 }, { - "epoch": 2.14, - "learning_rate": 3.195229336066712e-05, - "loss": 0.277, + "epoch": 2.1928496774426063, + "grad_norm": 0.18235017359256744, + "learning_rate": 3.113820915381426e-05, + "loss": 0.4023, "step": 60845 }, { - "epoch": 2.14, - "learning_rate": 3.194955696340228e-05, - "loss": 0.2632, + "epoch": 2.193029877103831, + "grad_norm": 0.22413848340511322, + "learning_rate": 3.1135380304156614e-05, + "loss": 0.4025, "step": 60850 }, { - "epoch": 2.14, - "learning_rate": 3.1946820475902024e-05, - "loss": 0.2803, + "epoch": 2.1932100767650557, + "grad_norm": 0.226275235414505, + "learning_rate": 3.11325513709056e-05, + "loss": 0.4236, "step": 60855 }, { - "epoch": 2.14, - "learning_rate": 3.1944083898201874e-05, - "loss": 0.2787, + "epoch": 2.1933902764262805, + "grad_norm": 0.23043504357337952, + "learning_rate": 3.1129722354099746e-05, + "loss": 0.3911, "step": 60860 }, { - "epoch": 2.14, - "learning_rate": 3.194134723033738e-05, - "loss": 0.2935, + "epoch": 2.1935704760875048, + "grad_norm": 0.22926190495491028, + "learning_rate": 3.1126893253777626e-05, + "loss": 0.3957, "step": 60865 }, { - "epoch": 2.14, - "learning_rate": 3.193861047234406e-05, - "loss": 0.279, + "epoch": 2.1937506757487295, + "grad_norm": 0.24694472551345825, + "learning_rate": 3.1124064069977766e-05, + "loss": 0.3715, "step": 60870 }, { - "epoch": 2.14, - "learning_rate": 3.193587362425746e-05, - "loss": 0.2632, + "epoch": 2.1939308754099542, + "grad_norm": 0.16308090090751648, + "learning_rate": 3.112123480273872e-05, + "loss": 0.3668, "step": 60875 }, { - "epoch": 2.14, - "learning_rate": 3.193313668611311e-05, - "loss": 0.2805, + "epoch": 2.194111075071179, + "grad_norm": 0.17201615869998932, + "learning_rate": 3.111840545209903e-05, + "loss": 0.3633, "step": 60880 }, { - "epoch": 2.14, - "learning_rate": 3.193039965794655e-05, - "loss": 0.2754, + "epoch": 2.1942912747324037, + "grad_norm": 0.20584902167320251, + "learning_rate": 3.1115576018097264e-05, + "loss": 0.4049, "step": 60885 }, { - "epoch": 2.14, - "learning_rate": 3.192766253979332e-05, - "loss": 0.259, + "epoch": 2.194471474393628, + "grad_norm": 0.16585664451122284, + "learning_rate": 3.111274650077195e-05, + "loss": 0.3564, "step": 60890 }, { - "epoch": 2.14, - "learning_rate": 3.192492533168896e-05, - "loss": 0.2614, + "epoch": 2.1946516740548527, + "grad_norm": 0.18358787894248962, + "learning_rate": 3.110991690016165e-05, + "loss": 0.4176, "step": 60895 }, { - "epoch": 2.14, - "learning_rate": 3.1922188033668995e-05, - "loss": 0.2569, + "epoch": 2.1948318737160775, + "grad_norm": 0.28625836968421936, + "learning_rate": 3.11070872163049e-05, + "loss": 0.4292, "step": 60900 }, { - "epoch": 2.14, - "learning_rate": 3.1919450645769e-05, - "loss": 0.2703, + "epoch": 2.195012073377302, + "grad_norm": 0.2175116091966629, + "learning_rate": 3.110425744924029e-05, + "loss": 0.4274, "step": 60905 }, { - "epoch": 2.14, - "learning_rate": 3.1916713168024484e-05, - "loss": 0.2464, + "epoch": 2.1951922730385265, + "grad_norm": 0.18882659077644348, + "learning_rate": 3.1101427599006346e-05, + "loss": 0.3686, "step": 60910 }, { - "epoch": 2.14, - "learning_rate": 3.1913975600471006e-05, - "loss": 0.2377, + "epoch": 2.195372472699751, + "grad_norm": 0.2079334855079651, + "learning_rate": 3.109859766564163e-05, + "loss": 
0.4185, "step": 60915 }, { - "epoch": 2.14, - "learning_rate": 3.191123794314411e-05, - "loss": 0.2752, + "epoch": 2.195552672360976, + "grad_norm": 0.18339209258556366, + "learning_rate": 3.1095767649184704e-05, + "loss": 0.399, "step": 60920 }, { - "epoch": 2.14, - "learning_rate": 3.190850019607935e-05, - "loss": 0.2745, + "epoch": 2.1957328720222007, + "grad_norm": 0.17973054945468903, + "learning_rate": 3.1092937549674126e-05, + "loss": 0.3799, "step": 60925 }, { - "epoch": 2.14, - "learning_rate": 3.190576235931227e-05, - "loss": 0.2692, + "epoch": 2.1959130716834254, + "grad_norm": 0.24256204068660736, + "learning_rate": 3.109010736714845e-05, + "loss": 0.3867, "step": 60930 }, { - "epoch": 2.14, - "learning_rate": 3.190302443287841e-05, - "loss": 0.2722, + "epoch": 2.1960932713446497, + "grad_norm": 0.21241244673728943, + "learning_rate": 3.1087277101646244e-05, + "loss": 0.4304, "step": 60935 }, { - "epoch": 2.14, - "learning_rate": 3.190028641681334e-05, - "loss": 0.28, + "epoch": 2.1962734710058744, + "grad_norm": 0.1754169911146164, + "learning_rate": 3.108444675320607e-05, + "loss": 0.4074, "step": 60940 }, { - "epoch": 2.14, - "learning_rate": 3.189754831115258e-05, - "loss": 0.2689, + "epoch": 2.196453670667099, + "grad_norm": 0.20329289138317108, + "learning_rate": 3.108161632186648e-05, + "loss": 0.4086, "step": 60945 }, { - "epoch": 2.14, - "learning_rate": 3.189481011593171e-05, - "loss": 0.2435, + "epoch": 2.196633870328324, + "grad_norm": 0.2189503312110901, + "learning_rate": 3.107878580766604e-05, + "loss": 0.4192, "step": 60950 }, { - "epoch": 2.14, - "learning_rate": 3.1892071831186275e-05, - "loss": 0.2531, + "epoch": 2.196814069989548, + "grad_norm": 0.1868715137243271, + "learning_rate": 3.107595521064333e-05, + "loss": 0.3519, "step": 60955 }, { - "epoch": 2.14, - "learning_rate": 3.188933345695183e-05, - "loss": 0.2654, + "epoch": 2.196994269650773, + "grad_norm": 0.25199243426322937, + "learning_rate": 3.1073124530836894e-05, + "loss": 0.3997, "step": 60960 }, { - "epoch": 2.14, - "learning_rate": 3.188659499326393e-05, - "loss": 0.2704, + "epoch": 2.1971744693119977, + "grad_norm": 0.21293796598911285, + "learning_rate": 3.107029376828533e-05, + "loss": 0.3592, "step": 60965 }, { - "epoch": 2.15, - "learning_rate": 3.1883856440158124e-05, - "loss": 0.3151, + "epoch": 2.1973546689732224, + "grad_norm": 0.21643023192882538, + "learning_rate": 3.1067462923027174e-05, + "loss": 0.418, "step": 60970 }, { - "epoch": 2.15, - "learning_rate": 3.188111779766998e-05, - "loss": 0.2528, + "epoch": 2.197534868634447, + "grad_norm": 0.20323172211647034, + "learning_rate": 3.106463199510102e-05, + "loss": 0.4273, "step": 60975 }, { - "epoch": 2.15, - "learning_rate": 3.187837906583507e-05, - "loss": 0.2974, + "epoch": 2.1977150682956714, + "grad_norm": 0.20973147451877594, + "learning_rate": 3.106180098454542e-05, + "loss": 0.4134, "step": 60980 }, { - "epoch": 2.15, - "learning_rate": 3.187564024468893e-05, - "loss": 0.2805, + "epoch": 2.197895267956896, + "grad_norm": 0.19453154504299164, + "learning_rate": 3.105896989139896e-05, + "loss": 0.3847, "step": 60985 }, { - "epoch": 2.15, - "learning_rate": 3.187290133426714e-05, - "loss": 0.2844, + "epoch": 2.198075467618121, + "grad_norm": 0.2046874463558197, + "learning_rate": 3.105613871570021e-05, + "loss": 0.3834, "step": 60990 }, { - "epoch": 2.15, - "learning_rate": 3.187016233460525e-05, - "loss": 0.2758, + "epoch": 2.1982556672793456, + "grad_norm": 0.2236749827861786, + "learning_rate": 3.105330745748774e-05, + 
"loss": 0.4089, "step": 60995 }, { - "epoch": 2.15, - "learning_rate": 3.186742324573883e-05, - "loss": 0.2762, + "epoch": 2.19843586694057, + "grad_norm": 0.2126908153295517, + "learning_rate": 3.105047611680013e-05, + "loss": 0.4004, "step": 61000 }, { - "epoch": 2.15, - "eval_loss": 0.2689875662326813, - "eval_runtime": 10.5446, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 2.19843586694057, + "eval_loss": 0.43686825037002563, + "eval_runtime": 3.5139, + "eval_samples_per_second": 28.458, + "eval_steps_per_second": 7.115, "step": 61000 }, { - "epoch": 2.15, - "learning_rate": 3.186468406770344e-05, - "loss": 0.2781, + "epoch": 2.1986160666017946, + "grad_norm": 0.19092722237110138, + "learning_rate": 3.1047644693675956e-05, + "loss": 0.3829, "step": 61005 }, { - "epoch": 2.15, - "learning_rate": 3.186194480053467e-05, - "loss": 0.2567, + "epoch": 2.1987962662630194, + "grad_norm": 0.24691295623779297, + "learning_rate": 3.104481318815379e-05, + "loss": 0.4313, "step": 61010 }, { - "epoch": 2.15, - "learning_rate": 3.185920544426805e-05, - "loss": 0.2522, + "epoch": 2.198976465924244, + "grad_norm": 0.2079208791255951, + "learning_rate": 3.104198160027222e-05, + "loss": 0.425, "step": 61015 }, { - "epoch": 2.15, - "learning_rate": 3.185646599893918e-05, - "loss": 0.2552, + "epoch": 2.199156665585469, + "grad_norm": 0.19778048992156982, + "learning_rate": 3.103914993006981e-05, + "loss": 0.3969, "step": 61020 }, { - "epoch": 2.15, - "learning_rate": 3.18537264645836e-05, - "loss": 0.2444, + "epoch": 2.199336865246693, + "grad_norm": 0.16332995891571045, + "learning_rate": 3.1036318177585156e-05, + "loss": 0.4092, "step": 61025 }, { - "epoch": 2.15, - "learning_rate": 3.185098684123691e-05, - "loss": 0.2853, + "epoch": 2.199517064907918, + "grad_norm": 0.1742788553237915, + "learning_rate": 3.103348634285684e-05, + "loss": 0.4048, "step": 61030 }, { - "epoch": 2.15, - "learning_rate": 3.184824712893467e-05, - "loss": 0.3003, + "epoch": 2.1996972645691426, + "grad_norm": 0.14753888547420502, + "learning_rate": 3.103065442592344e-05, + "loss": 0.3834, "step": 61035 }, { - "epoch": 2.15, - "learning_rate": 3.184550732771246e-05, - "loss": 0.2714, + "epoch": 2.1998774642303673, + "grad_norm": 0.22011804580688477, + "learning_rate": 3.1027822426823536e-05, + "loss": 0.4417, "step": 61040 }, { - "epoch": 2.15, - "learning_rate": 3.184276743760584e-05, - "loss": 0.2848, + "epoch": 2.200057663891592, + "grad_norm": 0.20552678406238556, + "learning_rate": 3.1024990345595725e-05, + "loss": 0.4126, "step": 61045 }, { - "epoch": 2.15, - "learning_rate": 3.18400274586504e-05, - "loss": 0.2724, + "epoch": 2.2002378635528164, + "grad_norm": 0.18421411514282227, + "learning_rate": 3.1022158182278584e-05, + "loss": 0.4111, "step": 61050 }, { - "epoch": 2.15, - "learning_rate": 3.183728739088171e-05, - "loss": 0.2825, + "epoch": 2.200418063214041, + "grad_norm": 0.21071533858776093, + "learning_rate": 3.1019325936910696e-05, + "loss": 0.4239, "step": 61055 }, { - "epoch": 2.15, - "learning_rate": 3.183454723433535e-05, - "loss": 0.2541, + "epoch": 2.200598262875266, + "grad_norm": 0.19925324618816376, + "learning_rate": 3.1016493609530666e-05, + "loss": 0.3629, "step": 61060 }, { - "epoch": 2.15, - "learning_rate": 3.183180698904689e-05, - "loss": 0.2586, + "epoch": 2.2007784625364906, + "grad_norm": 0.19704802334308624, + "learning_rate": 3.101366120017707e-05, + "loss": 0.3847, "step": 61065 }, { - "epoch": 2.15, - "learning_rate": 3.182906665505193e-05, - "loss": 
0.2811, + "epoch": 2.2009586621977153, + "grad_norm": 0.17782747745513916, + "learning_rate": 3.1010828708888516e-05, + "loss": 0.3889, "step": 61070 }, { - "epoch": 2.15, - "learning_rate": 3.182632623238604e-05, - "loss": 0.2755, + "epoch": 2.2011388618589396, + "grad_norm": 0.1703190803527832, + "learning_rate": 3.1007996135703576e-05, + "loss": 0.3616, "step": 61075 }, { - "epoch": 2.15, - "learning_rate": 3.182358572108479e-05, - "loss": 0.2842, + "epoch": 2.2013190615201643, + "grad_norm": 0.22071696817874908, + "learning_rate": 3.100516348066085e-05, + "loss": 0.4017, "step": 61080 }, { - "epoch": 2.15, - "learning_rate": 3.1820845121183786e-05, - "loss": 0.2524, + "epoch": 2.201499261181389, + "grad_norm": 0.20108157396316528, + "learning_rate": 3.100233074379894e-05, + "loss": 0.3965, "step": 61085 }, { - "epoch": 2.15, - "learning_rate": 3.1818104432718596e-05, - "loss": 0.2618, + "epoch": 2.201679460842614, + "grad_norm": 0.16138023138046265, + "learning_rate": 3.099949792515643e-05, + "loss": 0.4049, "step": 61090 }, { - "epoch": 2.15, - "learning_rate": 3.181536365572483e-05, - "loss": 0.2663, + "epoch": 2.201859660503838, + "grad_norm": 0.1845240294933319, + "learning_rate": 3.0996665024771924e-05, + "loss": 0.4032, "step": 61095 }, { - "epoch": 2.15, - "learning_rate": 3.181262279023804e-05, - "loss": 0.2534, + "epoch": 2.202039860165063, + "grad_norm": 0.17944936454296112, + "learning_rate": 3.099383204268402e-05, + "loss": 0.3838, "step": 61100 }, { - "epoch": 2.15, - "learning_rate": 3.180988183629384e-05, - "loss": 0.259, + "epoch": 2.2022200598262875, + "grad_norm": 0.17411671578884125, + "learning_rate": 3.0990998978931315e-05, + "loss": 0.4018, "step": 61105 }, { - "epoch": 2.15, - "learning_rate": 3.180714079392781e-05, - "loss": 0.2751, + "epoch": 2.2024002594875123, + "grad_norm": 0.1903943568468094, + "learning_rate": 3.0988165833552404e-05, + "loss": 0.3924, "step": 61110 }, { - "epoch": 2.15, - "learning_rate": 3.180439966317555e-05, - "loss": 0.2816, + "epoch": 2.202580459148737, + "grad_norm": 0.1921924352645874, + "learning_rate": 3.0985332606585905e-05, + "loss": 0.3686, "step": 61115 }, { - "epoch": 2.15, - "learning_rate": 3.180165844407264e-05, - "loss": 0.2554, + "epoch": 2.2027606588099613, + "grad_norm": 0.21751470863819122, + "learning_rate": 3.0982499298070394e-05, + "loss": 0.387, "step": 61120 }, { - "epoch": 2.15, - "learning_rate": 3.1798917136654676e-05, - "loss": 0.2584, + "epoch": 2.202940858471186, + "grad_norm": 0.18325336277484894, + "learning_rate": 3.0979665908044495e-05, + "loss": 0.3853, "step": 61125 }, { - "epoch": 2.15, - "learning_rate": 3.179617574095725e-05, - "loss": 0.2674, + "epoch": 2.2031210581324108, + "grad_norm": 0.21470201015472412, + "learning_rate": 3.097683243654681e-05, + "loss": 0.4171, "step": 61130 }, { - "epoch": 2.15, - "learning_rate": 3.1793434257015965e-05, - "loss": 0.272, + "epoch": 2.2033012577936355, + "grad_norm": 0.2272443175315857, + "learning_rate": 3.097399888361593e-05, + "loss": 0.3425, "step": 61135 }, { - "epoch": 2.15, - "learning_rate": 3.1790692684866414e-05, - "loss": 0.2681, + "epoch": 2.20348145745486, + "grad_norm": 0.22367946803569794, + "learning_rate": 3.097116524929048e-05, + "loss": 0.3819, "step": 61140 }, { - "epoch": 2.15, - "learning_rate": 3.1787951024544196e-05, - "loss": 0.2902, + "epoch": 2.2036616571160845, + "grad_norm": 0.16753867268562317, + "learning_rate": 3.096833153360905e-05, + "loss": 0.3832, "step": 61145 }, { - "epoch": 2.15, - "learning_rate": 
3.178520927608491e-05, - "loss": 0.2537, + "epoch": 2.2038418567773093, + "grad_norm": 0.18004858493804932, + "learning_rate": 3.096549773661027e-05, + "loss": 0.4408, "step": 61150 }, { - "epoch": 2.15, - "learning_rate": 3.178246743952414e-05, - "loss": 0.289, + "epoch": 2.204022056438534, + "grad_norm": 0.20204921066761017, + "learning_rate": 3.096266385833273e-05, + "loss": 0.3311, "step": 61155 }, { - "epoch": 2.15, - "learning_rate": 3.177972551489751e-05, - "loss": 0.2592, + "epoch": 2.2042022560997587, + "grad_norm": 0.19747331738471985, + "learning_rate": 3.0959829898815053e-05, + "loss": 0.4104, "step": 61160 }, { - "epoch": 2.15, - "learning_rate": 3.177698350224061e-05, - "loss": 0.2744, + "epoch": 2.204382455760983, + "grad_norm": 0.20761699974536896, + "learning_rate": 3.095699585809584e-05, + "loss": 0.3564, "step": 61165 }, { - "epoch": 2.15, - "learning_rate": 3.1774241401589044e-05, - "loss": 0.2709, + "epoch": 2.2045626554222078, + "grad_norm": 0.18775121867656708, + "learning_rate": 3.0954161736213725e-05, + "loss": 0.3941, "step": 61170 }, { - "epoch": 2.15, - "learning_rate": 3.177149921297842e-05, - "loss": 0.2811, + "epoch": 2.2047428550834325, + "grad_norm": 0.1855042576789856, + "learning_rate": 3.0951327533207305e-05, + "loss": 0.398, "step": 61175 }, { - "epoch": 2.15, - "learning_rate": 3.176875693644434e-05, - "loss": 0.2822, + "epoch": 2.204923054744657, + "grad_norm": 0.15789079666137695, + "learning_rate": 3.094849324911519e-05, + "loss": 0.3827, "step": 61180 }, { - "epoch": 2.15, - "learning_rate": 3.1766014572022415e-05, - "loss": 0.278, + "epoch": 2.2051032544058815, + "grad_norm": 0.20551757514476776, + "learning_rate": 3.0945658883976014e-05, + "loss": 0.4138, "step": 61185 }, { - "epoch": 2.15, - "learning_rate": 3.1763272119748244e-05, - "loss": 0.2664, + "epoch": 2.2052834540671062, + "grad_norm": 0.20195716619491577, + "learning_rate": 3.0942824437828386e-05, + "loss": 0.4424, "step": 61190 }, { - "epoch": 2.15, - "learning_rate": 3.176052957965745e-05, - "loss": 0.2495, + "epoch": 2.205463653728331, + "grad_norm": 0.18200279772281647, + "learning_rate": 3.093998991071093e-05, + "loss": 0.4024, "step": 61195 }, { - "epoch": 2.15, - "learning_rate": 3.175778695178563e-05, - "loss": 0.2562, + "epoch": 2.2056438533895557, + "grad_norm": 0.21861431002616882, + "learning_rate": 3.0937155302662256e-05, + "loss": 0.4019, "step": 61200 }, { - "epoch": 2.15, - "learning_rate": 3.17550442361684e-05, - "loss": 0.2734, + "epoch": 2.2058240530507804, + "grad_norm": 0.18421269953250885, + "learning_rate": 3.093432061372098e-05, + "loss": 0.3865, "step": 61205 }, { - "epoch": 2.15, - "learning_rate": 3.175230143284137e-05, - "loss": 0.2879, + "epoch": 2.2060042527120047, + "grad_norm": 0.2282218486070633, + "learning_rate": 3.093148584392576e-05, + "loss": 0.3667, "step": 61210 }, { - "epoch": 2.15, - "learning_rate": 3.174955854184016e-05, - "loss": 0.264, + "epoch": 2.2061844523732295, + "grad_norm": 0.17307542264461517, + "learning_rate": 3.0928650993315175e-05, + "loss": 0.4014, "step": 61215 }, { - "epoch": 2.15, - "learning_rate": 3.174681556320038e-05, - "loss": 0.2764, + "epoch": 2.206364652034454, + "grad_norm": 0.23555909097194672, + "learning_rate": 3.092581606192788e-05, + "loss": 0.4212, "step": 61220 }, { - "epoch": 2.15, - "learning_rate": 3.174407249695764e-05, - "loss": 0.2588, + "epoch": 2.206544851695679, + "grad_norm": 0.2506897747516632, + "learning_rate": 3.092298104980247e-05, + "loss": 0.3723, "step": 61225 }, { - "epoch": 2.15, - 
"learning_rate": 3.1741329343147575e-05, - "loss": 0.2544, + "epoch": 2.2067250513569032, + "grad_norm": 0.2201319932937622, + "learning_rate": 3.0920145956977606e-05, + "loss": 0.3987, "step": 61230 }, { - "epoch": 2.15, - "learning_rate": 3.173858610180578e-05, - "loss": 0.2865, + "epoch": 2.206905251018128, + "grad_norm": 0.17496994137763977, + "learning_rate": 3.09173107834919e-05, + "loss": 0.3906, "step": 61235 }, { - "epoch": 2.15, - "learning_rate": 3.1735842772967895e-05, - "loss": 0.2596, + "epoch": 2.2070854506793527, + "grad_norm": 0.20881304144859314, + "learning_rate": 3.0914475529383966e-05, + "loss": 0.3911, "step": 61240 }, { - "epoch": 2.15, - "learning_rate": 3.1733099356669514e-05, - "loss": 0.2852, + "epoch": 2.2072656503405774, + "grad_norm": 0.21811749041080475, + "learning_rate": 3.091164019469246e-05, + "loss": 0.3785, "step": 61245 }, { - "epoch": 2.15, - "learning_rate": 3.1730355852946295e-05, - "loss": 0.2792, + "epoch": 2.207445850001802, + "grad_norm": 0.18549318611621857, + "learning_rate": 3.0908804779456e-05, + "loss": 0.3929, "step": 61250 }, { - "epoch": 2.16, - "learning_rate": 3.1727612261833826e-05, - "loss": 0.2798, + "epoch": 2.2076260496630264, + "grad_norm": 0.21845227479934692, + "learning_rate": 3.090596928371322e-05, + "loss": 0.3835, "step": 61255 }, { - "epoch": 2.16, - "learning_rate": 3.1724868583367756e-05, - "loss": 0.277, + "epoch": 2.207806249324251, + "grad_norm": 0.2296115756034851, + "learning_rate": 3.0903133707502744e-05, + "loss": 0.4349, "step": 61260 }, { - "epoch": 2.16, - "learning_rate": 3.172212481758369e-05, - "loss": 0.2579, + "epoch": 2.207986448985476, + "grad_norm": 0.19739088416099548, + "learning_rate": 3.090029805086322e-05, + "loss": 0.3676, "step": 61265 }, { - "epoch": 2.16, - "learning_rate": 3.171938096451727e-05, - "loss": 0.256, + "epoch": 2.2081666486467006, + "grad_norm": 0.23218046128749847, + "learning_rate": 3.089746231383327e-05, + "loss": 0.3863, "step": 61270 }, { - "epoch": 2.16, - "learning_rate": 3.171663702420412e-05, - "loss": 0.2693, + "epoch": 2.2083468483079254, + "grad_norm": 0.17996297776699066, + "learning_rate": 3.089462649645155e-05, + "loss": 0.3793, "step": 61275 }, { - "epoch": 2.16, - "learning_rate": 3.171389299667986e-05, - "loss": 0.2638, + "epoch": 2.2085270479691497, + "grad_norm": 0.22149422764778137, + "learning_rate": 3.089179059875668e-05, + "loss": 0.4028, "step": 61280 }, { - "epoch": 2.16, - "learning_rate": 3.171114888198012e-05, - "loss": 0.2672, + "epoch": 2.2087072476303744, + "grad_norm": 0.18242286145687103, + "learning_rate": 3.0888954620787306e-05, + "loss": 0.3801, "step": 61285 }, { - "epoch": 2.16, - "learning_rate": 3.170840468014054e-05, - "loss": 0.2712, + "epoch": 2.208887447291599, + "grad_norm": 0.18819750845432281, + "learning_rate": 3.0886118562582056e-05, + "loss": 0.3964, "step": 61290 }, { - "epoch": 2.16, - "learning_rate": 3.170566039119675e-05, - "loss": 0.277, + "epoch": 2.209067646952824, + "grad_norm": 0.19808700680732727, + "learning_rate": 3.0883282424179586e-05, + "loss": 0.4066, "step": 61295 }, { - "epoch": 2.16, - "learning_rate": 3.170291601518436e-05, - "loss": 0.2836, + "epoch": 2.209247846614048, + "grad_norm": 0.1916135549545288, + "learning_rate": 3.088044620561854e-05, + "loss": 0.4026, "step": 61300 }, { - "epoch": 2.16, - "learning_rate": 3.1700171552139036e-05, - "loss": 0.2654, + "epoch": 2.209428046275273, + "grad_norm": 0.2114742547273636, + "learning_rate": 3.0877609906937545e-05, + "loss": 0.3913, "step": 61305 }, { - 
"epoch": 2.16, - "learning_rate": 3.16974270020964e-05, - "loss": 0.2694, + "epoch": 2.2096082459364976, + "grad_norm": 0.2341557890176773, + "learning_rate": 3.0874773528175245e-05, + "loss": 0.3725, "step": 61310 }, { - "epoch": 2.16, - "learning_rate": 3.1694682365092096e-05, - "loss": 0.249, + "epoch": 2.2097884455977224, + "grad_norm": 0.20667120814323425, + "learning_rate": 3.087193706937031e-05, + "loss": 0.4142, "step": 61315 }, { - "epoch": 2.16, - "learning_rate": 3.169193764116175e-05, - "loss": 0.2703, + "epoch": 2.209968645258947, + "grad_norm": 0.23338265717029572, + "learning_rate": 3.086910053056136e-05, + "loss": 0.406, "step": 61320 }, { - "epoch": 2.16, - "learning_rate": 3.1689192830340984e-05, - "loss": 0.2533, + "epoch": 2.2101488449201714, + "grad_norm": 0.22917935252189636, + "learning_rate": 3.086626391178705e-05, + "loss": 0.3956, "step": 61325 }, { - "epoch": 2.16, - "learning_rate": 3.1686447932665475e-05, - "loss": 0.2801, + "epoch": 2.210329044581396, + "grad_norm": 0.2156306803226471, + "learning_rate": 3.0863427213086026e-05, + "loss": 0.4162, "step": 61330 }, { - "epoch": 2.16, - "learning_rate": 3.1683702948170845e-05, - "loss": 0.2747, + "epoch": 2.210509244242621, + "grad_norm": 0.20012244582176208, + "learning_rate": 3.086059043449695e-05, + "loss": 0.4189, "step": 61335 }, { - "epoch": 2.16, - "learning_rate": 3.1680957876892735e-05, - "loss": 0.2789, + "epoch": 2.2106894439038456, + "grad_norm": 0.18535156548023224, + "learning_rate": 3.085775357605847e-05, + "loss": 0.4042, "step": 61340 }, { - "epoch": 2.16, - "learning_rate": 3.16782127188668e-05, - "loss": 0.2675, + "epoch": 2.2108696435650703, + "grad_norm": 0.18791675567626953, + "learning_rate": 3.0854916637809215e-05, + "loss": 0.3618, "step": 61345 }, { - "epoch": 2.16, - "learning_rate": 3.167546747412866e-05, - "loss": 0.2792, + "epoch": 2.2110498432262946, + "grad_norm": 0.2098093181848526, + "learning_rate": 3.085207961978786e-05, + "loss": 0.4419, "step": 61350 }, { - "epoch": 2.16, - "learning_rate": 3.167272214271398e-05, - "loss": 0.2758, + "epoch": 2.2112300428875193, + "grad_norm": 0.22902563214302063, + "learning_rate": 3.0849242522033064e-05, + "loss": 0.4385, "step": 61355 }, { - "epoch": 2.16, - "learning_rate": 3.16699767246584e-05, - "loss": 0.3046, + "epoch": 2.211410242548744, + "grad_norm": 0.20538847148418427, + "learning_rate": 3.084640534458346e-05, + "loss": 0.385, "step": 61360 }, { - "epoch": 2.16, - "learning_rate": 3.166723121999757e-05, - "loss": 0.285, + "epoch": 2.211590442209969, + "grad_norm": 0.24018481373786926, + "learning_rate": 3.084356808747772e-05, + "loss": 0.4011, "step": 61365 }, { - "epoch": 2.16, - "learning_rate": 3.1664485628767126e-05, - "loss": 0.2992, + "epoch": 2.211770641871193, + "grad_norm": 0.17867416143417358, + "learning_rate": 3.084073075075449e-05, + "loss": 0.3826, "step": 61370 }, { - "epoch": 2.16, - "learning_rate": 3.166173995100275e-05, - "loss": 0.3017, + "epoch": 2.211950841532418, + "grad_norm": 0.22849269211292267, + "learning_rate": 3.083789333445244e-05, + "loss": 0.4173, "step": 61375 }, { - "epoch": 2.16, - "learning_rate": 3.165899418674005e-05, - "loss": 0.3009, + "epoch": 2.2121310411936426, + "grad_norm": 0.20987308025360107, + "learning_rate": 3.0835055838610224e-05, + "loss": 0.414, "step": 61380 }, { - "epoch": 2.16, - "learning_rate": 3.1656248336014706e-05, - "loss": 0.2688, + "epoch": 2.2123112408548673, + "grad_norm": 0.19698165357112885, + "learning_rate": 3.08322182632665e-05, + "loss": 0.3898, "step": 61385 
}, { - "epoch": 2.16, - "learning_rate": 3.1653502398862356e-05, - "loss": 0.297, + "epoch": 2.212491440516092, + "grad_norm": 0.2028915286064148, + "learning_rate": 3.082938060845993e-05, + "loss": 0.4183, "step": 61390 }, { - "epoch": 2.16, - "learning_rate": 3.165075637531867e-05, - "loss": 0.2663, + "epoch": 2.2126716401773163, + "grad_norm": 0.19117394089698792, + "learning_rate": 3.082654287422918e-05, + "loss": 0.4264, "step": 61395 }, { - "epoch": 2.16, - "learning_rate": 3.16480102654193e-05, - "loss": 0.2722, + "epoch": 2.212851839838541, + "grad_norm": 0.23229442536830902, + "learning_rate": 3.082370506061291e-05, + "loss": 0.3847, "step": 61400 }, { - "epoch": 2.16, - "learning_rate": 3.1645264069199896e-05, - "loss": 0.2626, + "epoch": 2.213032039499766, + "grad_norm": 0.1863051801919937, + "learning_rate": 3.0820867167649794e-05, + "loss": 0.3803, "step": 61405 }, { - "epoch": 2.16, - "learning_rate": 3.164251778669611e-05, - "loss": 0.2683, + "epoch": 2.2132122391609905, + "grad_norm": 0.17941780388355255, + "learning_rate": 3.081802919537847e-05, + "loss": 0.3799, "step": 61410 }, { - "epoch": 2.16, - "learning_rate": 3.1639771417943616e-05, - "loss": 0.2736, + "epoch": 2.213392438822215, + "grad_norm": 0.20453837513923645, + "learning_rate": 3.081519114383764e-05, + "loss": 0.4076, "step": 61415 }, { - "epoch": 2.16, - "learning_rate": 3.1637024962978064e-05, - "loss": 0.272, + "epoch": 2.2135726384834395, + "grad_norm": 0.1980455219745636, + "learning_rate": 3.081235301306596e-05, + "loss": 0.4094, "step": 61420 }, { - "epoch": 2.16, - "learning_rate": 3.1634278421835116e-05, - "loss": 0.2703, + "epoch": 2.2137528381446643, + "grad_norm": 0.23042725026607513, + "learning_rate": 3.080951480310209e-05, + "loss": 0.4069, "step": 61425 }, { - "epoch": 2.16, - "learning_rate": 3.1631531794550435e-05, - "loss": 0.2385, + "epoch": 2.213933037805889, + "grad_norm": 0.197109192609787, + "learning_rate": 3.08066765139847e-05, + "loss": 0.3933, "step": 61430 }, { - "epoch": 2.16, - "learning_rate": 3.162878508115968e-05, - "loss": 0.2926, + "epoch": 2.2141132374671137, + "grad_norm": 0.19078285992145538, + "learning_rate": 3.0803838145752465e-05, + "loss": 0.4077, "step": 61435 }, { - "epoch": 2.16, - "learning_rate": 3.162603828169852e-05, - "loss": 0.2872, + "epoch": 2.214293437128338, + "grad_norm": 0.20791934430599213, + "learning_rate": 3.0800999698444065e-05, + "loss": 0.3886, "step": 61440 }, { - "epoch": 2.16, - "learning_rate": 3.162329139620262e-05, - "loss": 0.2672, + "epoch": 2.2144736367895628, + "grad_norm": 0.24572139978408813, + "learning_rate": 3.0798161172098175e-05, + "loss": 0.4195, "step": 61445 }, { - "epoch": 2.16, - "learning_rate": 3.162054442470765e-05, - "loss": 0.2541, + "epoch": 2.2146538364507875, + "grad_norm": 0.2402421236038208, + "learning_rate": 3.079532256675345e-05, + "loss": 0.3967, "step": 61450 }, { - "epoch": 2.16, - "learning_rate": 3.1617797367249274e-05, - "loss": 0.2762, + "epoch": 2.2148340361120122, + "grad_norm": 0.20814205706119537, + "learning_rate": 3.079248388244858e-05, + "loss": 0.4013, "step": 61455 }, { - "epoch": 2.16, - "learning_rate": 3.161505022386316e-05, - "loss": 0.2837, + "epoch": 2.2150142357732365, + "grad_norm": 0.17785018682479858, + "learning_rate": 3.078964511922224e-05, + "loss": 0.4192, "step": 61460 }, { - "epoch": 2.16, - "learning_rate": 3.1612302994584976e-05, - "loss": 0.2656, + "epoch": 2.2151944354344613, + "grad_norm": 0.2151138335466385, + "learning_rate": 3.0786806277113115e-05, + "loss": 0.3994, 
"step": 61465 }, { - "epoch": 2.16, - "learning_rate": 3.16095556794504e-05, - "loss": 0.2893, + "epoch": 2.215374635095686, + "grad_norm": 0.18419857323169708, + "learning_rate": 3.0783967356159856e-05, + "loss": 0.4021, "step": 61470 }, { - "epoch": 2.16, - "learning_rate": 3.1606808278495094e-05, - "loss": 0.2888, + "epoch": 2.2155548347569107, + "grad_norm": 0.18258464336395264, + "learning_rate": 3.078112835640117e-05, + "loss": 0.3915, "step": 61475 }, { - "epoch": 2.16, - "learning_rate": 3.160406079175474e-05, - "loss": 0.2957, + "epoch": 2.2157350344181355, + "grad_norm": 0.2090165615081787, + "learning_rate": 3.077828927787574e-05, + "loss": 0.3759, "step": 61480 }, { - "epoch": 2.16, - "learning_rate": 3.1601313219265e-05, - "loss": 0.2729, + "epoch": 2.2159152340793598, + "grad_norm": 0.21204036474227905, + "learning_rate": 3.0775450120622215e-05, + "loss": 0.4124, "step": 61485 }, { - "epoch": 2.16, - "learning_rate": 3.159856556106157e-05, - "loss": 0.2943, + "epoch": 2.2160954337405845, + "grad_norm": 0.1751544028520584, + "learning_rate": 3.077261088467932e-05, + "loss": 0.3981, "step": 61490 }, { - "epoch": 2.16, - "learning_rate": 3.1595817817180104e-05, - "loss": 0.2943, + "epoch": 2.216275633401809, + "grad_norm": 0.23898641765117645, + "learning_rate": 3.076977157008571e-05, + "loss": 0.395, "step": 61495 }, { - "epoch": 2.16, - "learning_rate": 3.15930699876563e-05, - "loss": 0.2764, + "epoch": 2.216455833063034, + "grad_norm": 0.17369025945663452, + "learning_rate": 3.076693217688009e-05, + "loss": 0.4193, "step": 61500 }, { - "epoch": 2.16, - "eval_loss": 0.26903456449508667, - "eval_runtime": 10.5402, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 2.216455833063034, + "eval_loss": 0.4364326000213623, + "eval_runtime": 3.523, + "eval_samples_per_second": 28.385, + "eval_steps_per_second": 7.096, "step": 61500 }, { - "epoch": 2.16, - "learning_rate": 3.159032207252582e-05, - "loss": 0.2633, + "epoch": 2.2166360327242582, + "grad_norm": 0.22703151404857635, + "learning_rate": 3.076409270510112e-05, + "loss": 0.3983, "step": 61505 }, { - "epoch": 2.16, - "learning_rate": 3.1587574071824367e-05, - "loss": 0.2699, + "epoch": 2.216816232385483, + "grad_norm": 0.21053709089756012, + "learning_rate": 3.076125315478752e-05, + "loss": 0.3839, "step": 61510 }, { - "epoch": 2.16, - "learning_rate": 3.158482598558759e-05, - "loss": 0.265, + "epoch": 2.2169964320467077, + "grad_norm": 0.17118942737579346, + "learning_rate": 3.075841352597794e-05, + "loss": 0.3901, "step": 61515 }, { - "epoch": 2.16, - "learning_rate": 3.158207781385119e-05, - "loss": 0.2584, + "epoch": 2.2171766317079324, + "grad_norm": 0.21972143650054932, + "learning_rate": 3.07555738187111e-05, + "loss": 0.4219, "step": 61520 }, { - "epoch": 2.16, - "learning_rate": 3.157932955665085e-05, - "loss": 0.2815, + "epoch": 2.217356831369157, + "grad_norm": 0.1972840577363968, + "learning_rate": 3.0752734033025694e-05, + "loss": 0.3747, "step": 61525 }, { - "epoch": 2.16, - "learning_rate": 3.157658121402225e-05, - "loss": 0.2744, + "epoch": 2.2175370310303815, + "grad_norm": 0.20041783154010773, + "learning_rate": 3.074989416896038e-05, + "loss": 0.3963, "step": 61530 }, { - "epoch": 2.16, - "learning_rate": 3.157383278600108e-05, - "loss": 0.2673, + "epoch": 2.217717230691606, + "grad_norm": 0.23045070469379425, + "learning_rate": 3.0747054226553886e-05, + "loss": 0.3746, "step": 61535 }, { - "epoch": 2.17, - "learning_rate": 3.157108427262302e-05, - "loss": 0.2736, + "epoch": 
2.217897430352831, + "grad_norm": 0.17599715292453766, + "learning_rate": 3.0744214205844884e-05, + "loss": 0.393, "step": 61540 }, { - "epoch": 2.17, - "learning_rate": 3.1568335673923756e-05, - "loss": 0.2542, + "epoch": 2.2180776300140557, + "grad_norm": 0.17511135339736938, + "learning_rate": 3.0741374106872084e-05, + "loss": 0.4001, "step": 61545 }, { - "epoch": 2.17, - "learning_rate": 3.1565586989938996e-05, - "loss": 0.2861, + "epoch": 2.2182578296752804, + "grad_norm": 0.18678990006446838, + "learning_rate": 3.0738533929674165e-05, + "loss": 0.4134, "step": 61550 }, { - "epoch": 2.17, - "learning_rate": 3.15628382207044e-05, - "loss": 0.2817, + "epoch": 2.2184380293365047, + "grad_norm": 0.21317550539970398, + "learning_rate": 3.0735693674289826e-05, + "loss": 0.4136, "step": 61555 }, { - "epoch": 2.17, - "learning_rate": 3.1560089366255684e-05, - "loss": 0.2694, + "epoch": 2.2186182289977294, + "grad_norm": 0.24046799540519714, + "learning_rate": 3.073285334075778e-05, + "loss": 0.3942, "step": 61560 }, { - "epoch": 2.17, - "learning_rate": 3.155734042662853e-05, - "loss": 0.2562, + "epoch": 2.218798428658954, + "grad_norm": 0.19992442429065704, + "learning_rate": 3.073001292911672e-05, + "loss": 0.3546, "step": 61565 }, { - "epoch": 2.17, - "learning_rate": 3.1554591401858634e-05, - "loss": 0.2769, + "epoch": 2.218978628320179, + "grad_norm": 0.2136695832014084, + "learning_rate": 3.072717243940534e-05, + "loss": 0.407, "step": 61570 }, { - "epoch": 2.17, - "learning_rate": 3.155184229198168e-05, - "loss": 0.2685, + "epoch": 2.2191588279814036, + "grad_norm": 0.2068341076374054, + "learning_rate": 3.072433187166234e-05, + "loss": 0.3849, "step": 61575 }, { - "epoch": 2.17, - "learning_rate": 3.154909309703338e-05, - "loss": 0.2839, + "epoch": 2.219339027642628, + "grad_norm": 0.2191094160079956, + "learning_rate": 3.072149122592643e-05, + "loss": 0.4173, "step": 61580 }, { - "epoch": 2.17, - "learning_rate": 3.154634381704942e-05, - "loss": 0.2619, + "epoch": 2.2195192273038526, + "grad_norm": 0.20730826258659363, + "learning_rate": 3.071865050223631e-05, + "loss": 0.3846, "step": 61585 }, { - "epoch": 2.17, - "learning_rate": 3.15435944520655e-05, - "loss": 0.2865, + "epoch": 2.2196994269650774, + "grad_norm": 0.20395426452159882, + "learning_rate": 3.071580970063068e-05, + "loss": 0.3963, "step": 61590 }, { - "epoch": 2.17, - "learning_rate": 3.154084500211732e-05, - "loss": 0.2774, + "epoch": 2.219879626626302, + "grad_norm": 0.19093595445156097, + "learning_rate": 3.071296882114826e-05, + "loss": 0.39, "step": 61595 }, { - "epoch": 2.17, - "learning_rate": 3.1538095467240574e-05, - "loss": 0.2769, + "epoch": 2.2200598262875264, + "grad_norm": 0.2352960854768753, + "learning_rate": 3.071012786382774e-05, + "loss": 0.4021, "step": 61600 }, { - "epoch": 2.17, - "learning_rate": 3.153534584747097e-05, - "loss": 0.2741, + "epoch": 2.220240025948751, + "grad_norm": 0.21295884251594543, + "learning_rate": 3.0707286828707835e-05, + "loss": 0.3812, "step": 61605 }, { - "epoch": 2.17, - "learning_rate": 3.1532596142844205e-05, - "loss": 0.2868, + "epoch": 2.220420225609976, + "grad_norm": 0.19988127052783966, + "learning_rate": 3.0704445715827246e-05, + "loss": 0.4401, "step": 61610 }, { - "epoch": 2.17, - "learning_rate": 3.1529846353395986e-05, - "loss": 0.2889, + "epoch": 2.2206004252712006, + "grad_norm": 0.19108517467975616, + "learning_rate": 3.0701604525224695e-05, + "loss": 0.3947, "step": 61615 }, { - "epoch": 2.17, - "learning_rate": 3.152709647916202e-05, - "loss": 0.2689, 
+ "epoch": 2.2207806249324253, + "grad_norm": 0.21383409202098846, + "learning_rate": 3.0698763256938876e-05, + "loss": 0.3947, "step": 61620 }, { - "epoch": 2.17, - "learning_rate": 3.152434652017801e-05, - "loss": 0.2735, + "epoch": 2.2209608245936496, + "grad_norm": 0.201728954911232, + "learning_rate": 3.069592191100852e-05, + "loss": 0.3971, "step": 61625 }, { - "epoch": 2.17, - "learning_rate": 3.152159647647964e-05, - "loss": 0.2756, + "epoch": 2.2211410242548744, + "grad_norm": 0.20201417803764343, + "learning_rate": 3.069308048747233e-05, + "loss": 0.3725, "step": 61630 }, { - "epoch": 2.17, - "learning_rate": 3.151884634810266e-05, - "loss": 0.2433, + "epoch": 2.221321223916099, + "grad_norm": 0.19340789318084717, + "learning_rate": 3.069023898636902e-05, + "loss": 0.3866, "step": 61635 }, { - "epoch": 2.17, - "learning_rate": 3.151609613508274e-05, - "loss": 0.2631, + "epoch": 2.221501423577324, + "grad_norm": 0.22743280231952667, + "learning_rate": 3.0687397407737306e-05, + "loss": 0.3558, "step": 61640 }, { - "epoch": 2.17, - "learning_rate": 3.151334583745562e-05, - "loss": 0.2759, + "epoch": 2.221681623238548, + "grad_norm": 0.18240687251091003, + "learning_rate": 3.06845557516159e-05, + "loss": 0.4093, "step": 61645 }, { - "epoch": 2.17, - "learning_rate": 3.151059545525698e-05, - "loss": 0.2369, + "epoch": 2.221861822899773, + "grad_norm": 0.22113995254039764, + "learning_rate": 3.068171401804353e-05, + "loss": 0.3851, "step": 61650 }, { - "epoch": 2.17, - "learning_rate": 3.150784498852256e-05, - "loss": 0.2596, + "epoch": 2.2220420225609976, + "grad_norm": 0.22573597729206085, + "learning_rate": 3.0678872207058903e-05, + "loss": 0.4202, "step": 61655 }, { - "epoch": 2.17, - "learning_rate": 3.150509443728806e-05, - "loss": 0.286, + "epoch": 2.2222222222222223, + "grad_norm": 0.1990000456571579, + "learning_rate": 3.067603031870074e-05, + "loss": 0.4169, "step": 61660 }, { - "epoch": 2.17, - "learning_rate": 3.15023438015892e-05, - "loss": 0.28, + "epoch": 2.222402421883447, + "grad_norm": 0.18506182730197906, + "learning_rate": 3.067318835300777e-05, + "loss": 0.3866, "step": 61665 }, { - "epoch": 2.17, - "learning_rate": 3.149959308146168e-05, - "loss": 0.2702, + "epoch": 2.2225826215446713, + "grad_norm": 0.16143712401390076, + "learning_rate": 3.067034631001869e-05, + "loss": 0.3977, "step": 61670 }, { - "epoch": 2.17, - "learning_rate": 3.149684227694123e-05, - "loss": 0.2794, + "epoch": 2.222762821205896, + "grad_norm": 0.2105492502450943, + "learning_rate": 3.066750418977225e-05, + "loss": 0.3876, "step": 61675 }, { - "epoch": 2.17, - "learning_rate": 3.1494091388063573e-05, - "loss": 0.278, + "epoch": 2.222943020867121, + "grad_norm": 0.2138456106185913, + "learning_rate": 3.066466199230716e-05, + "loss": 0.4023, "step": 61680 }, { - "epoch": 2.17, - "learning_rate": 3.149134041486441e-05, - "loss": 0.2791, + "epoch": 2.2231232205283455, + "grad_norm": 0.2341010868549347, + "learning_rate": 3.066181971766215e-05, + "loss": 0.394, "step": 61685 }, { - "epoch": 2.17, - "learning_rate": 3.1488589357379476e-05, - "loss": 0.2698, + "epoch": 2.22330342018957, + "grad_norm": 0.2601710557937622, + "learning_rate": 3.065897736587594e-05, + "loss": 0.36, "step": 61690 }, { - "epoch": 2.17, - "learning_rate": 3.148583821564448e-05, - "loss": 0.2683, + "epoch": 2.2234836198507946, + "grad_norm": 0.20886510610580444, + "learning_rate": 3.065613493698726e-05, + "loss": 0.4128, "step": 61695 }, { - "epoch": 2.17, - "learning_rate": 3.148308698969516e-05, - "loss": 0.2948, + 
"epoch": 2.2236638195120193, + "grad_norm": 0.21699224412441254, + "learning_rate": 3.0653292431034845e-05, + "loss": 0.385, "step": 61700 }, { - "epoch": 2.17, - "learning_rate": 3.148033567956722e-05, - "loss": 0.2722, + "epoch": 2.223844019173244, + "grad_norm": 0.22566765546798706, + "learning_rate": 3.0650449848057404e-05, + "loss": 0.3951, "step": 61705 }, { - "epoch": 2.17, - "learning_rate": 3.1477584285296385e-05, - "loss": 0.2688, + "epoch": 2.2240242188344688, + "grad_norm": 0.16451287269592285, + "learning_rate": 3.064760718809368e-05, + "loss": 0.3828, "step": 61710 }, { - "epoch": 2.17, - "learning_rate": 3.1474832806918395e-05, - "loss": 0.2561, + "epoch": 2.224204418495693, + "grad_norm": 0.2075955718755722, + "learning_rate": 3.06447644511824e-05, + "loss": 0.3986, "step": 61715 }, { - "epoch": 2.17, - "learning_rate": 3.147208124446897e-05, - "loss": 0.257, + "epoch": 2.224384618156918, + "grad_norm": 0.1905994862318039, + "learning_rate": 3.06419216373623e-05, + "loss": 0.4061, "step": 61720 }, { - "epoch": 2.17, - "learning_rate": 3.1469329597983834e-05, - "loss": 0.249, + "epoch": 2.2245648178181425, + "grad_norm": 0.2250581681728363, + "learning_rate": 3.063907874667211e-05, + "loss": 0.4116, "step": 61725 }, { - "epoch": 2.17, - "learning_rate": 3.146657786749871e-05, - "loss": 0.256, + "epoch": 2.2247450174793673, + "grad_norm": 0.20644237101078033, + "learning_rate": 3.063623577915056e-05, + "loss": 0.3753, "step": 61730 }, { - "epoch": 2.17, - "learning_rate": 3.146382605304935e-05, - "loss": 0.2751, + "epoch": 2.2249252171405915, + "grad_norm": 0.2077893614768982, + "learning_rate": 3.0633392734836395e-05, + "loss": 0.3853, "step": 61735 }, { - "epoch": 2.17, - "learning_rate": 3.146107415467145e-05, - "loss": 0.2642, + "epoch": 2.2251054168018163, + "grad_norm": 0.18702587485313416, + "learning_rate": 3.063054961376834e-05, + "loss": 0.412, "step": 61740 }, { - "epoch": 2.17, - "learning_rate": 3.1458322172400766e-05, - "loss": 0.2778, + "epoch": 2.225285616463041, + "grad_norm": 0.20379160344600677, + "learning_rate": 3.0627706415985146e-05, + "loss": 0.4043, "step": 61745 }, { - "epoch": 2.17, - "learning_rate": 3.145557010627303e-05, - "loss": 0.2913, + "epoch": 2.2254658161242657, + "grad_norm": 0.17889107763767242, + "learning_rate": 3.062486314152553e-05, + "loss": 0.4084, "step": 61750 }, { - "epoch": 2.17, - "learning_rate": 3.1452817956323967e-05, - "loss": 0.2724, + "epoch": 2.2256460157854905, + "grad_norm": 0.23296573758125305, + "learning_rate": 3.0622019790428254e-05, + "loss": 0.4243, "step": 61755 }, { - "epoch": 2.17, - "learning_rate": 3.1450065722589316e-05, - "loss": 0.2712, + "epoch": 2.2258262154467148, + "grad_norm": 0.22724519670009613, + "learning_rate": 3.0619176362732035e-05, + "loss": 0.4073, "step": 61760 }, { - "epoch": 2.17, - "learning_rate": 3.1447313405104805e-05, - "loss": 0.2703, + "epoch": 2.2260064151079395, + "grad_norm": 0.27276527881622314, + "learning_rate": 3.0616332858475625e-05, + "loss": 0.4086, "step": 61765 }, { - "epoch": 2.17, - "learning_rate": 3.144456100390619e-05, - "loss": 0.2594, + "epoch": 2.2261866147691642, + "grad_norm": 0.21575415134429932, + "learning_rate": 3.0613489277697784e-05, + "loss": 0.4389, "step": 61770 }, { - "epoch": 2.17, - "learning_rate": 3.144180851902919e-05, - "loss": 0.2719, + "epoch": 2.226366814430389, + "grad_norm": 0.19603478908538818, + "learning_rate": 3.061064562043722e-05, + "loss": 0.4233, "step": 61775 }, { - "epoch": 2.17, - "learning_rate": 3.143905595050956e-05, - 
"loss": 0.2843, + "epoch": 2.2265470140916137, + "grad_norm": 0.2517644464969635, + "learning_rate": 3.06078018867327e-05, + "loss": 0.4004, "step": 61780 }, { - "epoch": 2.17, - "learning_rate": 3.1436303298383027e-05, - "loss": 0.2774, + "epoch": 2.226727213752838, + "grad_norm": 0.2613866627216339, + "learning_rate": 3.060495807662297e-05, + "loss": 0.3955, "step": 61785 }, { - "epoch": 2.17, - "learning_rate": 3.1433550562685335e-05, - "loss": 0.2669, + "epoch": 2.2269074134140627, + "grad_norm": 0.18380023539066315, + "learning_rate": 3.0602114190146775e-05, + "loss": 0.3754, "step": 61790 }, { - "epoch": 2.17, - "learning_rate": 3.143079774345223e-05, - "loss": 0.2658, + "epoch": 2.2270876130752875, + "grad_norm": 0.1809166818857193, + "learning_rate": 3.0599270227342845e-05, + "loss": 0.3961, "step": 61795 }, { - "epoch": 2.17, - "learning_rate": 3.142804484071945e-05, - "loss": 0.2773, + "epoch": 2.227267812736512, + "grad_norm": 0.1867736577987671, + "learning_rate": 3.059642618824996e-05, + "loss": 0.3826, "step": 61800 }, { - "epoch": 2.17, - "learning_rate": 3.142529185452275e-05, - "loss": 0.2765, + "epoch": 2.2274480123977365, + "grad_norm": 0.24860471487045288, + "learning_rate": 3.059358207290683e-05, + "loss": 0.4294, "step": 61805 }, { - "epoch": 2.17, - "learning_rate": 3.142253878489787e-05, - "loss": 0.2758, + "epoch": 2.227628212058961, + "grad_norm": 0.2238049954175949, + "learning_rate": 3.059073788135224e-05, + "loss": 0.44, "step": 61810 }, { - "epoch": 2.17, - "learning_rate": 3.141978563188056e-05, - "loss": 0.2643, + "epoch": 2.227808411720186, + "grad_norm": 0.1938982456922531, + "learning_rate": 3.0587893613624934e-05, + "loss": 0.3959, "step": 61815 }, { - "epoch": 2.17, - "learning_rate": 3.141703239550656e-05, - "loss": 0.2776, + "epoch": 2.2279886113814107, + "grad_norm": 0.1805550903081894, + "learning_rate": 3.058504926976365e-05, + "loss": 0.3997, "step": 61820 }, { - "epoch": 2.18, - "learning_rate": 3.141427907581162e-05, - "loss": 0.2638, + "epoch": 2.2281688110426354, + "grad_norm": 0.2023649662733078, + "learning_rate": 3.0582204849807154e-05, + "loss": 0.4009, "step": 61825 }, { - "epoch": 2.18, - "learning_rate": 3.141152567283149e-05, - "loss": 0.2818, + "epoch": 2.2283490107038597, + "grad_norm": 0.18612384796142578, + "learning_rate": 3.05793603537942e-05, + "loss": 0.4259, "step": 61830 }, { - "epoch": 2.18, - "learning_rate": 3.140877218660194e-05, - "loss": 0.2655, + "epoch": 2.2285292103650844, + "grad_norm": 0.22503447532653809, + "learning_rate": 3.0576515781763545e-05, + "loss": 0.4017, "step": 61835 }, { - "epoch": 2.18, - "learning_rate": 3.1406018617158695e-05, - "loss": 0.2551, + "epoch": 2.228709410026309, + "grad_norm": 0.2309921830892563, + "learning_rate": 3.057367113375393e-05, + "loss": 0.3924, "step": 61840 }, { - "epoch": 2.18, - "learning_rate": 3.1403264964537525e-05, - "loss": 0.2567, + "epoch": 2.228889609687534, + "grad_norm": 0.18473617732524872, + "learning_rate": 3.0570826409804135e-05, + "loss": 0.3755, "step": 61845 }, { - "epoch": 2.18, - "learning_rate": 3.140051122877417e-05, - "loss": 0.2728, + "epoch": 2.2290698093487586, + "grad_norm": 0.19264580309391022, + "learning_rate": 3.056798160995291e-05, + "loss": 0.4029, "step": 61850 }, { - "epoch": 2.18, - "learning_rate": 3.1397757409904405e-05, - "loss": 0.2919, + "epoch": 2.229250009009983, + "grad_norm": 0.21103538572788239, + "learning_rate": 3.0565136734239006e-05, + "loss": 0.4169, "step": 61855 }, { - "epoch": 2.18, - "learning_rate": 
3.139500350796397e-05, - "loss": 0.2607, + "epoch": 2.2294302086712077, + "grad_norm": 0.21922504901885986, + "learning_rate": 3.0562291782701194e-05, + "loss": 0.3899, "step": 61860 }, { - "epoch": 2.18, - "learning_rate": 3.1392249522988636e-05, - "loss": 0.2678, + "epoch": 2.2296104083324324, + "grad_norm": 0.19922494888305664, + "learning_rate": 3.0559446755378226e-05, + "loss": 0.3969, "step": 61865 }, { - "epoch": 2.18, - "learning_rate": 3.1389495455014154e-05, - "loss": 0.2838, + "epoch": 2.229790607993657, + "grad_norm": 0.2152629941701889, + "learning_rate": 3.055660165230888e-05, + "loss": 0.4128, "step": 61870 }, { - "epoch": 2.18, - "learning_rate": 3.138674130407628e-05, - "loss": 0.265, + "epoch": 2.2299708076548814, + "grad_norm": 0.19293451309204102, + "learning_rate": 3.055375647353191e-05, + "loss": 0.3871, "step": 61875 }, { - "epoch": 2.18, - "learning_rate": 3.138398707021077e-05, - "loss": 0.2934, + "epoch": 2.230151007316106, + "grad_norm": 0.2509141266345978, + "learning_rate": 3.0550911219086084e-05, + "loss": 0.4476, "step": 61880 }, { - "epoch": 2.18, - "learning_rate": 3.1381232753453416e-05, - "loss": 0.2583, + "epoch": 2.230331206977331, + "grad_norm": 0.24453610181808472, + "learning_rate": 3.0548065889010176e-05, + "loss": 0.3912, "step": 61885 }, { - "epoch": 2.18, - "learning_rate": 3.137847835383995e-05, - "loss": 0.2492, + "epoch": 2.2305114066385556, + "grad_norm": 0.24755702912807465, + "learning_rate": 3.054522048334294e-05, + "loss": 0.3986, "step": 61890 }, { - "epoch": 2.18, - "learning_rate": 3.1375723871406146e-05, - "loss": 0.2666, + "epoch": 2.2306916062997804, + "grad_norm": 0.1772449016571045, + "learning_rate": 3.0542375002123145e-05, + "loss": 0.3765, "step": 61895 }, { - "epoch": 2.18, - "learning_rate": 3.137296930618777e-05, - "loss": 0.2915, + "epoch": 2.2308718059610047, + "grad_norm": 0.1797766089439392, + "learning_rate": 3.053952944538956e-05, + "loss": 0.3711, "step": 61900 }, { - "epoch": 2.18, - "learning_rate": 3.1370214658220574e-05, - "loss": 0.2795, + "epoch": 2.2310520056222294, + "grad_norm": 0.21006938815116882, + "learning_rate": 3.0536683813180966e-05, + "loss": 0.4086, "step": 61905 }, { - "epoch": 2.18, - "learning_rate": 3.1367459927540354e-05, - "loss": 0.2567, + "epoch": 2.231232205283454, + "grad_norm": 0.18526072800159454, + "learning_rate": 3.053383810553613e-05, + "loss": 0.3676, "step": 61910 }, { - "epoch": 2.18, - "learning_rate": 3.1364705114182855e-05, - "loss": 0.2675, + "epoch": 2.231412404944679, + "grad_norm": 0.20477519929409027, + "learning_rate": 3.053099232249381e-05, + "loss": 0.4048, "step": 61915 }, { - "epoch": 2.18, - "learning_rate": 3.136195021818387e-05, - "loss": 0.2815, + "epoch": 2.231592604605903, + "grad_norm": 0.17895658314228058, + "learning_rate": 3.05281464640928e-05, + "loss": 0.4049, "step": 61920 }, { - "epoch": 2.18, - "learning_rate": 3.1359195239579145e-05, - "loss": 0.2679, + "epoch": 2.231772804267128, + "grad_norm": 0.20034639537334442, + "learning_rate": 3.0525300530371846e-05, + "loss": 0.4089, "step": 61925 }, { - "epoch": 2.18, - "learning_rate": 3.135644017840445e-05, - "loss": 0.2546, + "epoch": 2.2319530039283526, + "grad_norm": 0.24000723659992218, + "learning_rate": 3.052245452136977e-05, + "loss": 0.3823, "step": 61930 }, { - "epoch": 2.18, - "learning_rate": 3.135368503469558e-05, - "loss": 0.2836, + "epoch": 2.2321332035895773, + "grad_norm": 0.20230047404766083, + "learning_rate": 3.0519608437125305e-05, + "loss": 0.4144, "step": 61935 }, { - "epoch": 2.18, - 
"learning_rate": 3.13509298084883e-05, - "loss": 0.2724, + "epoch": 2.232313403250802, + "grad_norm": 0.16788794100284576, + "learning_rate": 3.0516762277677243e-05, + "loss": 0.3911, "step": 61940 }, { - "epoch": 2.18, - "learning_rate": 3.1348174499818375e-05, - "loss": 0.2578, + "epoch": 2.2324936029120264, + "grad_norm": 0.1820405274629593, + "learning_rate": 3.051391604306436e-05, + "loss": 0.3892, "step": 61945 }, { - "epoch": 2.18, - "learning_rate": 3.134541910872159e-05, - "loss": 0.2742, + "epoch": 2.232673802573251, + "grad_norm": 0.24724507331848145, + "learning_rate": 3.0511069733325447e-05, + "loss": 0.376, "step": 61950 }, { - "epoch": 2.18, - "learning_rate": 3.1342663635233713e-05, - "loss": 0.267, + "epoch": 2.232854002234476, + "grad_norm": 0.21663494408130646, + "learning_rate": 3.0508223348499275e-05, + "loss": 0.393, "step": 61955 }, { - "epoch": 2.18, - "learning_rate": 3.133990807939053e-05, - "loss": 0.2694, + "epoch": 2.2330342018957006, + "grad_norm": 0.17999057471752167, + "learning_rate": 3.0505376888624622e-05, + "loss": 0.4099, "step": 61960 }, { - "epoch": 2.18, - "learning_rate": 3.133715244122783e-05, - "loss": 0.2745, + "epoch": 2.233214401556925, + "grad_norm": 0.15646670758724213, + "learning_rate": 3.0502530353740278e-05, + "loss": 0.3565, "step": 61965 }, { - "epoch": 2.18, - "learning_rate": 3.133439672078138e-05, - "loss": 0.2624, + "epoch": 2.2333946012181496, + "grad_norm": 0.1919204592704773, + "learning_rate": 3.049968374388502e-05, + "loss": 0.4127, "step": 61970 }, { - "epoch": 2.18, - "learning_rate": 3.1331640918086956e-05, - "loss": 0.2828, + "epoch": 2.2335748008793743, + "grad_norm": 0.1685272455215454, + "learning_rate": 3.0496837059097645e-05, + "loss": 0.4332, "step": 61975 }, { - "epoch": 2.18, - "learning_rate": 3.132888503318036e-05, - "loss": 0.2696, + "epoch": 2.233755000540599, + "grad_norm": 0.1549319177865982, + "learning_rate": 3.0493990299416913e-05, + "loss": 0.4114, "step": 61980 }, { - "epoch": 2.18, - "learning_rate": 3.132612906609734e-05, - "loss": 0.2563, + "epoch": 2.233935200201824, + "grad_norm": 0.22232964634895325, + "learning_rate": 3.0491143464881634e-05, + "loss": 0.3857, "step": 61985 }, { - "epoch": 2.18, - "learning_rate": 3.1323373016873715e-05, - "loss": 0.2596, + "epoch": 2.234115399863048, + "grad_norm": 0.14861559867858887, + "learning_rate": 3.0488296555530594e-05, + "loss": 0.3871, "step": 61990 }, { - "epoch": 2.18, - "learning_rate": 3.132061688554526e-05, - "loss": 0.2638, + "epoch": 2.234295599524273, + "grad_norm": 0.20374061167240143, + "learning_rate": 3.0485449571402573e-05, + "loss": 0.3995, "step": 61995 }, { - "epoch": 2.18, - "learning_rate": 3.131786067214777e-05, - "loss": 0.29, + "epoch": 2.2344757991854975, + "grad_norm": 0.21584060788154602, + "learning_rate": 3.048260251253636e-05, + "loss": 0.4383, "step": 62000 }, { - "epoch": 2.18, - "eval_loss": 0.2678808569908142, - "eval_runtime": 10.5389, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 2.2344757991854975, + "eval_loss": 0.43634411692619324, + "eval_runtime": 3.5361, + "eval_samples_per_second": 28.28, + "eval_steps_per_second": 7.07, "step": 62000 }, { - "epoch": 2.18, - "learning_rate": 3.1315104376717e-05, - "loss": 0.3023, + "epoch": 2.2346559988467223, + "grad_norm": 0.18200920522212982, + "learning_rate": 3.0479755378970753e-05, + "loss": 0.4092, "step": 62005 }, { - "epoch": 2.18, - "learning_rate": 3.1312347999288777e-05, - "loss": 0.2886, + "epoch": 2.2348361985079466, + "grad_norm": 
0.18603946268558502, + "learning_rate": 3.047690817074454e-05, + "loss": 0.3547, "step": 62010 }, { - "epoch": 2.18, - "learning_rate": 3.1309591539898866e-05, - "loss": 0.2419, + "epoch": 2.2350163981691713, + "grad_norm": 0.17530210316181183, + "learning_rate": 3.0474060887896518e-05, + "loss": 0.3952, "step": 62015 }, { - "epoch": 2.18, - "learning_rate": 3.130683499858308e-05, - "loss": 0.2869, + "epoch": 2.235196597830396, + "grad_norm": 0.1511596292257309, + "learning_rate": 3.0471213530465465e-05, + "loss": 0.3703, "step": 62020 }, { - "epoch": 2.18, - "learning_rate": 3.130407837537719e-05, - "loss": 0.2772, + "epoch": 2.2353767974916208, + "grad_norm": 0.22197118401527405, + "learning_rate": 3.046836609849019e-05, + "loss": 0.4399, "step": 62025 }, { - "epoch": 2.18, - "learning_rate": 3.1301321670316994e-05, - "loss": 0.2674, + "epoch": 2.2355569971528455, + "grad_norm": 0.2044990509748459, + "learning_rate": 3.046551859200949e-05, + "loss": 0.3614, "step": 62030 }, { - "epoch": 2.18, - "learning_rate": 3.1298564883438295e-05, - "loss": 0.2838, + "epoch": 2.23573719681407, + "grad_norm": 0.2116348147392273, + "learning_rate": 3.0462671011062162e-05, + "loss": 0.3879, "step": 62035 }, { - "epoch": 2.18, - "learning_rate": 3.1295808014776884e-05, - "loss": 0.2683, + "epoch": 2.2359173964752945, + "grad_norm": 0.18771792948246002, + "learning_rate": 3.045982335568699e-05, + "loss": 0.4121, "step": 62040 }, { - "epoch": 2.18, - "learning_rate": 3.129305106436855e-05, - "loss": 0.2963, + "epoch": 2.2360975961365193, + "grad_norm": 0.2089640349149704, + "learning_rate": 3.0456975625922784e-05, + "loss": 0.3884, "step": 62045 }, { - "epoch": 2.18, - "learning_rate": 3.129029403224909e-05, - "loss": 0.2604, + "epoch": 2.236277795797744, + "grad_norm": 0.1947159469127655, + "learning_rate": 3.0454127821808337e-05, + "loss": 0.3953, "step": 62050 }, { - "epoch": 2.18, - "learning_rate": 3.128753691845432e-05, - "loss": 0.2684, + "epoch": 2.2364579954589687, + "grad_norm": 0.23277081549167633, + "learning_rate": 3.045127994338246e-05, + "loss": 0.397, "step": 62055 }, { - "epoch": 2.18, - "learning_rate": 3.128477972302001e-05, - "loss": 0.2561, + "epoch": 2.236638195120193, + "grad_norm": 0.17750267684459686, + "learning_rate": 3.0448431990683958e-05, + "loss": 0.3889, "step": 62060 }, { - "epoch": 2.18, - "learning_rate": 3.128202244598199e-05, - "loss": 0.2732, + "epoch": 2.2368183947814178, + "grad_norm": 0.16454583406448364, + "learning_rate": 3.0445583963751606e-05, + "loss": 0.3797, "step": 62065 }, { - "epoch": 2.18, - "learning_rate": 3.1279265087376044e-05, - "loss": 0.2673, + "epoch": 2.2369985944426425, + "grad_norm": 0.20463277399539948, + "learning_rate": 3.044273586262424e-05, + "loss": 0.3917, "step": 62070 }, { - "epoch": 2.18, - "learning_rate": 3.1276507647237984e-05, - "loss": 0.2787, + "epoch": 2.237178794103867, + "grad_norm": 0.20936442911624908, + "learning_rate": 3.043988768734065e-05, + "loss": 0.4067, "step": 62075 }, { - "epoch": 2.18, - "learning_rate": 3.1273750125603604e-05, - "loss": 0.2969, + "epoch": 2.237358993765092, + "grad_norm": 0.20374515652656555, + "learning_rate": 3.043703943793964e-05, + "loss": 0.392, "step": 62080 }, { - "epoch": 2.18, - "learning_rate": 3.127099252250872e-05, - "loss": 0.2773, + "epoch": 2.2375391934263162, + "grad_norm": 0.20407363772392273, + "learning_rate": 3.043419111446002e-05, + "loss": 0.3983, "step": 62085 }, { - "epoch": 2.18, - "learning_rate": 3.126823483798912e-05, - "loss": 0.251, + "epoch": 2.237719393087541, + 
"grad_norm": 0.19166499376296997, + "learning_rate": 3.0431342716940602e-05, + "loss": 0.396, "step": 62090 }, { - "epoch": 2.18, - "learning_rate": 3.126547707208063e-05, - "loss": 0.27, + "epoch": 2.2378995927487657, + "grad_norm": 0.22127671539783478, + "learning_rate": 3.0428494245420192e-05, + "loss": 0.3952, "step": 62095 }, { - "epoch": 2.18, - "learning_rate": 3.1262719224819044e-05, - "loss": 0.2743, + "epoch": 2.2380797924099904, + "grad_norm": 0.19556500017642975, + "learning_rate": 3.0425645699937594e-05, + "loss": 0.4074, "step": 62100 }, { - "epoch": 2.19, - "learning_rate": 3.125996129624018e-05, - "loss": 0.2802, + "epoch": 2.2382599920712147, + "grad_norm": 0.18051710724830627, + "learning_rate": 3.0422797080531628e-05, + "loss": 0.3663, "step": 62105 }, { - "epoch": 2.19, - "learning_rate": 3.1257203286379845e-05, - "loss": 0.2797, + "epoch": 2.2384401917324395, + "grad_norm": 0.1607939898967743, + "learning_rate": 3.0419948387241093e-05, + "loss": 0.3728, "step": 62110 }, { - "epoch": 2.19, - "learning_rate": 3.1254445195273844e-05, - "loss": 0.2787, + "epoch": 2.238620391393664, + "grad_norm": 0.21384070813655853, + "learning_rate": 3.0417099620104826e-05, + "loss": 0.4021, "step": 62115 }, { - "epoch": 2.19, - "learning_rate": 3.1251687022958e-05, - "loss": 0.266, + "epoch": 2.238800591054889, + "grad_norm": 0.16077342629432678, + "learning_rate": 3.041425077916161e-05, + "loss": 0.3476, "step": 62120 }, { - "epoch": 2.19, - "learning_rate": 3.124892876946811e-05, - "loss": 0.2657, + "epoch": 2.2389807907161137, + "grad_norm": 0.20714950561523438, + "learning_rate": 3.0411401864450283e-05, + "loss": 0.402, "step": 62125 }, { - "epoch": 2.19, - "learning_rate": 3.124617043484001e-05, - "loss": 0.2578, + "epoch": 2.239160990377338, + "grad_norm": 0.2111266702413559, + "learning_rate": 3.040855287600965e-05, + "loss": 0.4138, "step": 62130 }, { - "epoch": 2.19, - "learning_rate": 3.1243412019109496e-05, - "loss": 0.2822, + "epoch": 2.2393411900385627, + "grad_norm": 0.21749116480350494, + "learning_rate": 3.0405703813878534e-05, + "loss": 0.3802, "step": 62135 }, { - "epoch": 2.19, - "learning_rate": 3.124065352231239e-05, - "loss": 0.2852, + "epoch": 2.2395213896997874, + "grad_norm": 0.2501288056373596, + "learning_rate": 3.0402854678095754e-05, + "loss": 0.4045, "step": 62140 }, { - "epoch": 2.19, - "learning_rate": 3.123789494448451e-05, - "loss": 0.2712, + "epoch": 2.239701589361012, + "grad_norm": 0.18549959361553192, + "learning_rate": 3.0400005468700115e-05, + "loss": 0.4179, "step": 62145 }, { - "epoch": 2.19, - "learning_rate": 3.123513628566168e-05, - "loss": 0.273, + "epoch": 2.2398817890222364, + "grad_norm": 0.19732961058616638, + "learning_rate": 3.0397156185730453e-05, + "loss": 0.3776, "step": 62150 }, { - "epoch": 2.19, - "learning_rate": 3.123237754587971e-05, - "loss": 0.2482, + "epoch": 2.240061988683461, + "grad_norm": 0.2629784345626831, + "learning_rate": 3.039430682922558e-05, + "loss": 0.3572, "step": 62155 }, { - "epoch": 2.19, - "learning_rate": 3.1229618725174436e-05, - "loss": 0.2561, + "epoch": 2.240242188344686, + "grad_norm": 0.1689624935388565, + "learning_rate": 3.0391457399224325e-05, + "loss": 0.3499, "step": 62160 }, { - "epoch": 2.19, - "learning_rate": 3.122685982358165e-05, - "loss": 0.2584, + "epoch": 2.2404223880059106, + "grad_norm": 0.20784690976142883, + "learning_rate": 3.038860789576551e-05, + "loss": 0.4062, "step": 62165 }, { - "epoch": 2.19, - "learning_rate": 3.1224100841137217e-05, - "loss": 0.2677, + "epoch": 
2.2406025876671354, + "grad_norm": 0.17468421161174774, + "learning_rate": 3.038575831888794e-05, + "loss": 0.4152, "step": 62170 }, { - "epoch": 2.19, - "learning_rate": 3.122134177787692e-05, - "loss": 0.2588, + "epoch": 2.2407827873283597, + "grad_norm": 0.21627172827720642, + "learning_rate": 3.0382908668630477e-05, + "loss": 0.4106, "step": 62175 }, { - "epoch": 2.19, - "learning_rate": 3.12185826338366e-05, - "loss": 0.2661, + "epoch": 2.2409629869895844, + "grad_norm": 0.19633768498897552, + "learning_rate": 3.0380058945031907e-05, + "loss": 0.3889, "step": 62180 }, { - "epoch": 2.19, - "learning_rate": 3.121582340905209e-05, - "loss": 0.2692, + "epoch": 2.241143186650809, + "grad_norm": 0.18723514676094055, + "learning_rate": 3.0377209148131086e-05, + "loss": 0.4013, "step": 62185 }, { - "epoch": 2.19, - "learning_rate": 3.1213064103559206e-05, - "loss": 0.2429, + "epoch": 2.241323386312034, + "grad_norm": 0.17972959578037262, + "learning_rate": 3.037435927796683e-05, + "loss": 0.3765, "step": 62190 }, { - "epoch": 2.19, - "learning_rate": 3.1210304717393775e-05, - "loss": 0.2833, + "epoch": 2.241503585973258, + "grad_norm": 0.21043039858341217, + "learning_rate": 3.037150933457797e-05, + "loss": 0.3866, "step": 62195 }, { - "epoch": 2.19, - "learning_rate": 3.120754525059163e-05, - "loss": 0.2604, + "epoch": 2.241683785634483, + "grad_norm": 0.20892179012298584, + "learning_rate": 3.0368659318003335e-05, + "loss": 0.4267, "step": 62200 }, { - "epoch": 2.19, - "learning_rate": 3.12047857031886e-05, - "loss": 0.2753, + "epoch": 2.2418639852957076, + "grad_norm": 0.22525620460510254, + "learning_rate": 3.036580922828175e-05, + "loss": 0.3884, "step": 62205 }, { - "epoch": 2.19, - "learning_rate": 3.1202026075220525e-05, - "loss": 0.2935, + "epoch": 2.2420441849569324, + "grad_norm": 0.23556070029735565, + "learning_rate": 3.0362959065452057e-05, + "loss": 0.4409, "step": 62210 }, { - "epoch": 2.19, - "learning_rate": 3.1199266366723224e-05, - "loss": 0.257, + "epoch": 2.242224384618157, + "grad_norm": 0.20461376011371613, + "learning_rate": 3.036010882955308e-05, + "loss": 0.4232, "step": 62215 }, { - "epoch": 2.19, - "learning_rate": 3.1196506577732545e-05, - "loss": 0.2837, + "epoch": 2.2424045842793814, + "grad_norm": 0.19230584800243378, + "learning_rate": 3.035725852062367e-05, + "loss": 0.391, "step": 62220 }, { - "epoch": 2.19, - "learning_rate": 3.1193746708284294e-05, - "loss": 0.2723, + "epoch": 2.242584783940606, + "grad_norm": 0.17546550929546356, + "learning_rate": 3.0354408138702645e-05, + "loss": 0.3665, "step": 62225 }, { - "epoch": 2.19, - "learning_rate": 3.119098675841433e-05, - "loss": 0.2625, + "epoch": 2.242764983601831, + "grad_norm": 0.24618138372898102, + "learning_rate": 3.0351557683828846e-05, + "loss": 0.4114, "step": 62230 }, { - "epoch": 2.19, - "learning_rate": 3.1188226728158485e-05, - "loss": 0.2771, + "epoch": 2.2429451832630556, + "grad_norm": 0.2337779998779297, + "learning_rate": 3.03487071560411e-05, + "loss": 0.3832, "step": 62235 }, { - "epoch": 2.19, - "learning_rate": 3.1185466617552594e-05, - "loss": 0.2816, + "epoch": 2.24312538292428, + "grad_norm": 0.19449256360530853, + "learning_rate": 3.0345856555378267e-05, + "loss": 0.4175, "step": 62240 }, { - "epoch": 2.19, - "learning_rate": 3.1182706426632496e-05, - "loss": 0.2625, + "epoch": 2.2433055825855046, + "grad_norm": 0.19468238949775696, + "learning_rate": 3.0343005881879172e-05, + "loss": 0.3843, "step": 62245 }, { - "epoch": 2.19, - "learning_rate": 3.117994615543403e-05, - "loss": 
0.2642, + "epoch": 2.2434857822467293, + "grad_norm": 0.24619287252426147, + "learning_rate": 3.0340155135582648e-05, + "loss": 0.403, "step": 62250 }, { - "epoch": 2.19, - "learning_rate": 3.117718580399304e-05, - "loss": 0.2604, + "epoch": 2.243665981907954, + "grad_norm": 0.15675576031208038, + "learning_rate": 3.0337304316527548e-05, + "loss": 0.3894, "step": 62255 }, { - "epoch": 2.19, - "learning_rate": 3.117442537234535e-05, - "loss": 0.2451, + "epoch": 2.243846181569179, + "grad_norm": 0.18250007927417755, + "learning_rate": 3.0334453424752712e-05, + "loss": 0.4095, "step": 62260 }, { - "epoch": 2.19, - "learning_rate": 3.1171664860526826e-05, - "loss": 0.2944, + "epoch": 2.244026381230403, + "grad_norm": 0.17939797043800354, + "learning_rate": 3.033160246029698e-05, + "loss": 0.3893, "step": 62265 }, { - "epoch": 2.19, - "learning_rate": 3.1168904268573294e-05, - "loss": 0.2968, + "epoch": 2.244206580891628, + "grad_norm": 0.15464815497398376, + "learning_rate": 3.0328751423199198e-05, + "loss": 0.3849, "step": 62270 }, { - "epoch": 2.19, - "learning_rate": 3.116614359652061e-05, - "loss": 0.2615, + "epoch": 2.2443867805528526, + "grad_norm": 0.1741357445716858, + "learning_rate": 3.03259003134982e-05, + "loss": 0.4079, "step": 62275 }, { - "epoch": 2.19, - "learning_rate": 3.116338284440462e-05, - "loss": 0.2496, + "epoch": 2.2445669802140773, + "grad_norm": 0.1852835714817047, + "learning_rate": 3.0323049131232863e-05, + "loss": 0.388, "step": 62280 }, { - "epoch": 2.19, - "learning_rate": 3.116062201226116e-05, - "loss": 0.2591, + "epoch": 2.244747179875302, + "grad_norm": 0.17500340938568115, + "learning_rate": 3.0320197876441992e-05, + "loss": 0.4356, "step": 62285 }, { - "epoch": 2.19, - "learning_rate": 3.115786110012608e-05, - "loss": 0.265, + "epoch": 2.2449273795365263, + "grad_norm": 0.20107300579547882, + "learning_rate": 3.0317346549164465e-05, + "loss": 0.3873, "step": 62290 }, { - "epoch": 2.19, - "learning_rate": 3.115510010803524e-05, - "loss": 0.2671, + "epoch": 2.245107579197751, + "grad_norm": 0.19302281737327576, + "learning_rate": 3.0314495149439115e-05, + "loss": 0.3943, "step": 62295 }, { - "epoch": 2.19, - "learning_rate": 3.115233903602448e-05, - "loss": 0.2622, + "epoch": 2.245287778858976, + "grad_norm": 0.20569072663784027, + "learning_rate": 3.03116436773048e-05, + "loss": 0.4247, "step": 62300 }, { - "epoch": 2.19, - "learning_rate": 3.114957788412965e-05, - "loss": 0.266, + "epoch": 2.2454679785202005, + "grad_norm": 0.1855345070362091, + "learning_rate": 3.0308792132800368e-05, + "loss": 0.3976, "step": 62305 }, { - "epoch": 2.19, - "learning_rate": 3.11468166523866e-05, - "loss": 0.3061, + "epoch": 2.245648178181425, + "grad_norm": 0.1881013661623001, + "learning_rate": 3.0305940515964672e-05, + "loss": 0.4136, "step": 62310 }, { - "epoch": 2.19, - "learning_rate": 3.114405534083119e-05, - "loss": 0.2594, + "epoch": 2.2458283778426495, + "grad_norm": 0.20543043315410614, + "learning_rate": 3.0303088826836563e-05, + "loss": 0.3816, "step": 62315 }, { - "epoch": 2.19, - "learning_rate": 3.114129394949927e-05, - "loss": 0.2714, + "epoch": 2.2460085775038743, + "grad_norm": 0.28218919038772583, + "learning_rate": 3.0300237065454894e-05, + "loss": 0.4104, "step": 62320 }, { - "epoch": 2.19, - "learning_rate": 3.113853247842669e-05, - "loss": 0.285, + "epoch": 2.246188777165099, + "grad_norm": 0.21315787732601166, + "learning_rate": 3.0297385231858526e-05, + "loss": 0.4041, "step": 62325 }, { - "epoch": 2.19, - "learning_rate": 3.113577092764932e-05, - 
"loss": 0.2718, + "epoch": 2.2463689768263237, + "grad_norm": 0.19718264043331146, + "learning_rate": 3.0294533326086305e-05, + "loss": 0.4086, "step": 62330 }, { - "epoch": 2.19, - "learning_rate": 3.113300929720302e-05, - "loss": 0.2715, + "epoch": 2.246549176487548, + "grad_norm": 0.1942020058631897, + "learning_rate": 3.0291681348177092e-05, + "loss": 0.3855, "step": 62335 }, { - "epoch": 2.19, - "learning_rate": 3.113024758712361e-05, - "loss": 0.2728, + "epoch": 2.2467293761487728, + "grad_norm": 0.1929059475660324, + "learning_rate": 3.0288829298169753e-05, + "loss": 0.3879, "step": 62340 }, { - "epoch": 2.19, - "learning_rate": 3.112748579744699e-05, - "loss": 0.264, + "epoch": 2.2469095758099975, + "grad_norm": 0.2083449363708496, + "learning_rate": 3.0285977176103132e-05, + "loss": 0.3884, "step": 62345 }, { - "epoch": 2.19, - "learning_rate": 3.1124723928209e-05, - "loss": 0.2649, + "epoch": 2.2470897754712222, + "grad_norm": 0.22706344723701477, + "learning_rate": 3.028312498201609e-05, + "loss": 0.4189, "step": 62350 }, { - "epoch": 2.19, - "learning_rate": 3.1121961979445506e-05, - "loss": 0.2562, + "epoch": 2.247269975132447, + "grad_norm": 0.2006312906742096, + "learning_rate": 3.02802727159475e-05, + "loss": 0.408, "step": 62355 }, { - "epoch": 2.19, - "learning_rate": 3.111919995119237e-05, - "loss": 0.2866, + "epoch": 2.2474501747936713, + "grad_norm": 0.2271738350391388, + "learning_rate": 3.0277420377936222e-05, + "loss": 0.4563, "step": 62360 }, { - "epoch": 2.19, - "learning_rate": 3.1116437843485465e-05, - "loss": 0.2639, + "epoch": 2.247630374454896, + "grad_norm": 0.19731751084327698, + "learning_rate": 3.0274567968021107e-05, + "loss": 0.4227, "step": 62365 }, { - "epoch": 2.19, - "learning_rate": 3.111367565636063e-05, - "loss": 0.2655, + "epoch": 2.2478105741161207, + "grad_norm": 0.19308514893054962, + "learning_rate": 3.0271715486241026e-05, + "loss": 0.4012, "step": 62370 }, { - "epoch": 2.19, - "learning_rate": 3.111091338985376e-05, - "loss": 0.2735, + "epoch": 2.2479907737773455, + "grad_norm": 0.19526423513889313, + "learning_rate": 3.0268862932634832e-05, + "loss": 0.3879, "step": 62375 }, { - "epoch": 2.19, - "learning_rate": 3.110815104400069e-05, - "loss": 0.2723, + "epoch": 2.2481709734385698, + "grad_norm": 0.20854395627975464, + "learning_rate": 3.026601030724141e-05, + "loss": 0.3913, "step": 62380 }, { - "epoch": 2.19, - "learning_rate": 3.1105388618837314e-05, - "loss": 0.3042, + "epoch": 2.2483511730997945, + "grad_norm": 0.1848597228527069, + "learning_rate": 3.0263157610099625e-05, + "loss": 0.3934, "step": 62385 }, { - "epoch": 2.2, - "learning_rate": 3.11026261143995e-05, - "loss": 0.2662, + "epoch": 2.248531372761019, + "grad_norm": 0.22897033393383026, + "learning_rate": 3.0260304841248322e-05, + "loss": 0.3878, "step": 62390 }, { - "epoch": 2.2, - "learning_rate": 3.109986353072309e-05, - "loss": 0.2803, + "epoch": 2.248711572422244, + "grad_norm": 0.1810678094625473, + "learning_rate": 3.025745200072639e-05, + "loss": 0.4209, "step": 62395 }, { - "epoch": 2.2, - "learning_rate": 3.109710086784398e-05, - "loss": 0.28, + "epoch": 2.2488917720834687, + "grad_norm": 0.19008022546768188, + "learning_rate": 3.0254599088572688e-05, + "loss": 0.3718, "step": 62400 }, { - "epoch": 2.2, - "learning_rate": 3.1094338125798035e-05, - "loss": 0.2713, + "epoch": 2.249071971744693, + "grad_norm": 0.18570220470428467, + "learning_rate": 3.02517461048261e-05, + "loss": 0.3892, "step": 62405 }, { - "epoch": 2.2, - "learning_rate": 3.1091575304621126e-05, 
- "loss": 0.2577, + "epoch": 2.2492521714059177, + "grad_norm": 0.23810695111751556, + "learning_rate": 3.0248893049525483e-05, + "loss": 0.42, "step": 62410 }, { - "epoch": 2.2, - "learning_rate": 3.108881240434912e-05, - "loss": 0.2785, + "epoch": 2.2494323710671424, + "grad_norm": 0.23687207698822021, + "learning_rate": 3.024603992270971e-05, + "loss": 0.4312, "step": 62415 }, { - "epoch": 2.2, - "learning_rate": 3.108604942501791e-05, - "loss": 0.2456, + "epoch": 2.249612570728367, + "grad_norm": 0.1891470104455948, + "learning_rate": 3.0243186724417664e-05, + "loss": 0.3563, "step": 62420 }, { - "epoch": 2.2, - "learning_rate": 3.108328636666335e-05, - "loss": 0.2876, + "epoch": 2.2497927703895915, + "grad_norm": 0.18046557903289795, + "learning_rate": 3.0240333454688214e-05, + "loss": 0.3756, "step": 62425 }, { - "epoch": 2.2, - "learning_rate": 3.108052322932132e-05, - "loss": 0.2638, + "epoch": 2.249972970050816, + "grad_norm": 0.19262628257274628, + "learning_rate": 3.0237480113560235e-05, + "loss": 0.3925, "step": 62430 }, { - "epoch": 2.2, - "learning_rate": 3.107776001302771e-05, - "loss": 0.2657, + "epoch": 2.250153169712041, + "grad_norm": 0.22571761906147003, + "learning_rate": 3.02346267010726e-05, + "loss": 0.3438, "step": 62435 }, { - "epoch": 2.2, - "learning_rate": 3.10749967178184e-05, - "loss": 0.2973, + "epoch": 2.2503333693732657, + "grad_norm": 0.19609612226486206, + "learning_rate": 3.02317732172642e-05, + "loss": 0.3906, "step": 62440 }, { - "epoch": 2.2, - "learning_rate": 3.107223334372925e-05, - "loss": 0.2855, + "epoch": 2.2505135690344904, + "grad_norm": 0.2036208212375641, + "learning_rate": 3.0228919662173895e-05, + "loss": 0.4103, "step": 62445 }, { - "epoch": 2.2, - "learning_rate": 3.106946989079616e-05, - "loss": 0.2855, + "epoch": 2.2506937686957147, + "grad_norm": 0.20205596089363098, + "learning_rate": 3.0226066035840568e-05, + "loss": 0.4045, "step": 62450 }, { - "epoch": 2.2, - "learning_rate": 3.1066706359054984e-05, - "loss": 0.2701, + "epoch": 2.2508739683569394, + "grad_norm": 0.18515092134475708, + "learning_rate": 3.0223212338303108e-05, + "loss": 0.4139, "step": 62455 }, { - "epoch": 2.2, - "learning_rate": 3.1063942748541644e-05, - "loss": 0.2738, + "epoch": 2.251054168018164, + "grad_norm": 0.19203773140907288, + "learning_rate": 3.0220358569600388e-05, + "loss": 0.3927, "step": 62460 }, { - "epoch": 2.2, - "learning_rate": 3.1061179059292e-05, - "loss": 0.2961, + "epoch": 2.251234367679389, + "grad_norm": 0.2266424596309662, + "learning_rate": 3.02175047297713e-05, + "loss": 0.4254, "step": 62465 }, { - "epoch": 2.2, - "learning_rate": 3.1058415291341944e-05, - "loss": 0.2745, + "epoch": 2.251414567340613, + "grad_norm": 0.23780296742916107, + "learning_rate": 3.0214650818854713e-05, + "loss": 0.3954, "step": 62470 }, { - "epoch": 2.2, - "learning_rate": 3.1055651444727345e-05, - "loss": 0.2487, + "epoch": 2.251594767001838, + "grad_norm": 0.2071855366230011, + "learning_rate": 3.021179683688952e-05, + "loss": 0.3667, "step": 62475 }, { - "epoch": 2.2, - "learning_rate": 3.105288751948411e-05, - "loss": 0.2876, + "epoch": 2.2517749666630626, + "grad_norm": 0.23853279650211334, + "learning_rate": 3.0208942783914606e-05, + "loss": 0.4328, "step": 62480 }, { - "epoch": 2.2, - "learning_rate": 3.105012351564812e-05, - "loss": 0.2709, + "epoch": 2.2519551663242874, + "grad_norm": 0.23091068863868713, + "learning_rate": 3.0206088659968857e-05, + "loss": 0.4059, "step": 62485 }, { - "epoch": 2.2, - "learning_rate": 3.1047359433255263e-05, - 
"loss": 0.2986, + "epoch": 2.252135365985512, + "grad_norm": 0.23153997957706451, + "learning_rate": 3.0203234465091157e-05, + "loss": 0.3665, "step": 62490 }, { - "epoch": 2.2, - "learning_rate": 3.1044595272341425e-05, - "loss": 0.2757, + "epoch": 2.2523155656467364, + "grad_norm": 0.18207654356956482, + "learning_rate": 3.020038019932039e-05, + "loss": 0.3626, "step": 62495 }, { - "epoch": 2.2, - "learning_rate": 3.104183103294251e-05, - "loss": 0.2839, + "epoch": 2.252495765307961, + "grad_norm": 0.19852490723133087, + "learning_rate": 3.0197525862695452e-05, + "loss": 0.4133, "step": 62500 }, { - "epoch": 2.2, - "eval_loss": 0.2684471011161804, - "eval_runtime": 10.5326, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 2.252495765307961, + "eval_loss": 0.4360164999961853, + "eval_runtime": 3.5278, + "eval_samples_per_second": 28.347, + "eval_steps_per_second": 7.087, "step": 62500 }, { - "epoch": 2.2, - "learning_rate": 3.103906671509439e-05, - "loss": 0.2537, + "epoch": 2.252675964969186, + "grad_norm": 0.23163332045078278, + "learning_rate": 3.019467145525523e-05, + "loss": 0.3957, "step": 62505 }, { - "epoch": 2.2, - "learning_rate": 3.103630231883297e-05, - "loss": 0.2632, + "epoch": 2.2528561646304106, + "grad_norm": 0.21983997523784637, + "learning_rate": 3.0191816977038622e-05, + "loss": 0.3898, "step": 62510 }, { - "epoch": 2.2, - "learning_rate": 3.1033537844194155e-05, - "loss": 0.2969, + "epoch": 2.253036364291635, + "grad_norm": 0.1701703816652298, + "learning_rate": 3.01889624280845e-05, + "loss": 0.3719, "step": 62515 }, { - "epoch": 2.2, - "learning_rate": 3.103077329121382e-05, - "loss": 0.2819, + "epoch": 2.2532165639528596, + "grad_norm": 0.17912349104881287, + "learning_rate": 3.018610780843178e-05, + "loss": 0.3743, "step": 62520 }, { - "epoch": 2.2, - "learning_rate": 3.1028008659927866e-05, - "loss": 0.237, + "epoch": 2.2533967636140844, + "grad_norm": 0.20103944838047028, + "learning_rate": 3.0183253118119342e-05, + "loss": 0.4066, "step": 62525 }, { - "epoch": 2.2, - "learning_rate": 3.102524395037219e-05, - "loss": 0.2558, + "epoch": 2.253576963275309, + "grad_norm": 0.20848768949508667, + "learning_rate": 3.018039835718608e-05, + "loss": 0.4139, "step": 62530 }, { - "epoch": 2.2, - "learning_rate": 3.1022479162582695e-05, - "loss": 0.2682, + "epoch": 2.253757162936534, + "grad_norm": 0.1789763867855072, + "learning_rate": 3.0177543525670898e-05, + "loss": 0.404, "step": 62535 }, { - "epoch": 2.2, - "learning_rate": 3.101971429659528e-05, - "loss": 0.249, + "epoch": 2.2539373625977586, + "grad_norm": 0.1968272179365158, + "learning_rate": 3.0174688623612684e-05, + "loss": 0.39, "step": 62540 }, { - "epoch": 2.2, - "learning_rate": 3.101694935244584e-05, - "loss": 0.2778, + "epoch": 2.254117562258983, + "grad_norm": 0.2528758645057678, + "learning_rate": 3.0171833651050352e-05, + "loss": 0.4127, "step": 62545 }, { - "epoch": 2.2, - "learning_rate": 3.101418433017027e-05, - "loss": 0.2924, + "epoch": 2.2542977619202076, + "grad_norm": 0.23864948749542236, + "learning_rate": 3.016897860802277e-05, + "loss": 0.3495, "step": 62550 }, { - "epoch": 2.2, - "learning_rate": 3.10114192298045e-05, - "loss": 0.2784, + "epoch": 2.2544779615814323, + "grad_norm": 0.21808521449565887, + "learning_rate": 3.0166123494568865e-05, + "loss": 0.3913, "step": 62555 }, { - "epoch": 2.2, - "learning_rate": 3.10086540513844e-05, - "loss": 0.2703, + "epoch": 2.254658161242657, + "grad_norm": 0.17956504225730896, + "learning_rate": 3.016326831072752e-05, 
+ "loss": 0.3843, "step": 62560 }, { - "epoch": 2.2, - "learning_rate": 3.1005888794945884e-05, - "loss": 0.285, + "epoch": 2.2548383609038813, + "grad_norm": 0.1821889877319336, + "learning_rate": 3.0160413056537656e-05, + "loss": 0.376, "step": 62565 }, { - "epoch": 2.2, - "learning_rate": 3.1003123460524865e-05, - "loss": 0.2675, + "epoch": 2.255018560565106, + "grad_norm": 0.1842850148677826, + "learning_rate": 3.0157557732038155e-05, + "loss": 0.3938, "step": 62570 }, { - "epoch": 2.2, - "learning_rate": 3.100035804815724e-05, - "loss": 0.2688, + "epoch": 2.255198760226331, + "grad_norm": 0.17056186497211456, + "learning_rate": 3.0154702337267926e-05, + "loss": 0.3657, "step": 62575 }, { - "epoch": 2.2, - "learning_rate": 3.0997592557878926e-05, - "loss": 0.2583, + "epoch": 2.2553789598875555, + "grad_norm": 0.23995059728622437, + "learning_rate": 3.0151846872265883e-05, + "loss": 0.4019, "step": 62580 }, { - "epoch": 2.2, - "learning_rate": 3.099482698972582e-05, - "loss": 0.2913, + "epoch": 2.2555591595487803, + "grad_norm": 0.25965413451194763, + "learning_rate": 3.014899133707092e-05, + "loss": 0.4349, "step": 62585 }, { - "epoch": 2.2, - "learning_rate": 3.0992061343733835e-05, - "loss": 0.308, + "epoch": 2.2557393592100046, + "grad_norm": 0.16279491782188416, + "learning_rate": 3.0146135731721946e-05, + "loss": 0.3995, "step": 62590 }, { - "epoch": 2.2, - "learning_rate": 3.098929561993889e-05, - "loss": 0.274, + "epoch": 2.2559195588712293, + "grad_norm": 0.2433793842792511, + "learning_rate": 3.0143280056257874e-05, + "loss": 0.3998, "step": 62595 }, { - "epoch": 2.2, - "learning_rate": 3.098652981837688e-05, - "loss": 0.2655, + "epoch": 2.256099758532454, + "grad_norm": 0.17817862331867218, + "learning_rate": 3.0140424310717597e-05, + "loss": 0.3904, "step": 62600 }, { - "epoch": 2.2, - "learning_rate": 3.098376393908373e-05, - "loss": 0.2737, + "epoch": 2.2562799581936788, + "grad_norm": 0.18165291845798492, + "learning_rate": 3.0137568495140045e-05, + "loss": 0.3814, "step": 62605 }, { - "epoch": 2.2, - "learning_rate": 3.098099798209535e-05, - "loss": 0.2755, + "epoch": 2.256460157854903, + "grad_norm": 0.17060628533363342, + "learning_rate": 3.0134712609564114e-05, + "loss": 0.4033, "step": 62610 }, { - "epoch": 2.2, - "learning_rate": 3.097823194744765e-05, - "loss": 0.2636, + "epoch": 2.256640357516128, + "grad_norm": 0.19508397579193115, + "learning_rate": 3.0131856654028716e-05, + "loss": 0.3854, "step": 62615 }, { - "epoch": 2.2, - "learning_rate": 3.0975465835176545e-05, - "loss": 0.2543, + "epoch": 2.2568205571773525, + "grad_norm": 0.21551352739334106, + "learning_rate": 3.012900062857276e-05, + "loss": 0.4103, "step": 62620 }, { - "epoch": 2.2, - "learning_rate": 3.097269964531796e-05, - "loss": 0.2555, + "epoch": 2.2570007568385773, + "grad_norm": 0.20918194949626923, + "learning_rate": 3.0126144533235172e-05, + "loss": 0.3812, "step": 62625 }, { - "epoch": 2.2, - "learning_rate": 3.096993337790781e-05, - "loss": 0.2465, + "epoch": 2.257180956499802, + "grad_norm": 0.2055574357509613, + "learning_rate": 3.0123288368054857e-05, + "loss": 0.3614, "step": 62630 }, { - "epoch": 2.2, - "learning_rate": 3.0967167032982e-05, - "loss": 0.2686, + "epoch": 2.2573611561610263, + "grad_norm": 0.201625257730484, + "learning_rate": 3.012043213307072e-05, + "loss": 0.3726, "step": 62635 }, { - "epoch": 2.2, - "learning_rate": 3.096440061057647e-05, - "loss": 0.2759, + "epoch": 2.257541355822251, + "grad_norm": 0.2094610631465912, + "learning_rate": 3.0117575828321697e-05, + 
"loss": 0.3753, "step": 62640 }, { - "epoch": 2.2, - "learning_rate": 3.096163411072711e-05, - "loss": 0.2765, + "epoch": 2.2577215554834758, + "grad_norm": 0.19469302892684937, + "learning_rate": 3.0114719453846684e-05, + "loss": 0.3766, "step": 62645 }, { - "epoch": 2.2, - "learning_rate": 3.095886753346988e-05, - "loss": 0.2789, + "epoch": 2.2579017551447005, + "grad_norm": 0.26172351837158203, + "learning_rate": 3.0111863009684627e-05, + "loss": 0.4036, "step": 62650 }, { - "epoch": 2.2, - "learning_rate": 3.095610087884067e-05, - "loss": 0.2703, + "epoch": 2.2580819548059248, + "grad_norm": 0.16828452050685883, + "learning_rate": 3.0109006495874408e-05, + "loss": 0.3963, "step": 62655 }, { - "epoch": 2.2, - "learning_rate": 3.095333414687542e-05, - "loss": 0.2798, + "epoch": 2.2582621544671495, + "grad_norm": 0.17929872870445251, + "learning_rate": 3.0106149912454974e-05, + "loss": 0.3774, "step": 62660 }, { - "epoch": 2.2, - "learning_rate": 3.095056733761005e-05, - "loss": 0.274, + "epoch": 2.2584423541283742, + "grad_norm": 0.19596625864505768, + "learning_rate": 3.0103293259465227e-05, + "loss": 0.3821, "step": 62665 }, { - "epoch": 2.2, - "learning_rate": 3.094780045108048e-05, - "loss": 0.2811, + "epoch": 2.258622553789599, + "grad_norm": 0.17597244679927826, + "learning_rate": 3.0100436536944105e-05, + "loss": 0.4049, "step": 62670 }, { - "epoch": 2.21, - "learning_rate": 3.094503348732264e-05, - "loss": 0.2785, + "epoch": 2.2588027534508237, + "grad_norm": 0.22724290192127228, + "learning_rate": 3.0097579744930522e-05, + "loss": 0.4355, "step": 62675 }, { - "epoch": 2.21, - "learning_rate": 3.094226644637246e-05, - "loss": 0.2748, + "epoch": 2.258982953112048, + "grad_norm": 0.24098409712314606, + "learning_rate": 3.0094722883463396e-05, + "loss": 0.4125, "step": 62680 }, { - "epoch": 2.21, - "learning_rate": 3.093949932826587e-05, - "loss": 0.2676, + "epoch": 2.2591631527732727, + "grad_norm": 0.22858020663261414, + "learning_rate": 3.0091865952581665e-05, + "loss": 0.4162, "step": 62685 }, { - "epoch": 2.21, - "learning_rate": 3.0936732133038795e-05, - "loss": 0.2652, + "epoch": 2.2593433524344975, + "grad_norm": 0.16358384490013123, + "learning_rate": 3.0089008952324243e-05, + "loss": 0.3844, "step": 62690 }, { - "epoch": 2.21, - "learning_rate": 3.0933964860727167e-05, - "loss": 0.2779, + "epoch": 2.259523552095722, + "grad_norm": 0.2142842710018158, + "learning_rate": 3.0086151882730063e-05, + "loss": 0.3829, "step": 62695 }, { - "epoch": 2.21, - "learning_rate": 3.0931197511366914e-05, - "loss": 0.2713, + "epoch": 2.2597037517569465, + "grad_norm": 0.26690196990966797, + "learning_rate": 3.0083294743838036e-05, + "loss": 0.4132, "step": 62700 }, { - "epoch": 2.21, - "learning_rate": 3.0928430084993965e-05, - "loss": 0.2618, + "epoch": 2.259883951418171, + "grad_norm": 0.16452406346797943, + "learning_rate": 3.008043753568711e-05, + "loss": 0.3578, "step": 62705 }, { - "epoch": 2.21, - "learning_rate": 3.092566258164427e-05, - "loss": 0.2765, + "epoch": 2.260064151079396, + "grad_norm": 0.21890616416931152, + "learning_rate": 3.0077580258316213e-05, + "loss": 0.3875, "step": 62710 }, { - "epoch": 2.21, - "learning_rate": 3.0922895001353736e-05, - "loss": 0.2803, + "epoch": 2.2602443507406207, + "grad_norm": 0.22171573340892792, + "learning_rate": 3.0074722911764258e-05, + "loss": 0.3939, "step": 62715 }, { - "epoch": 2.21, - "learning_rate": 3.0920127344158326e-05, - "loss": 0.2767, + "epoch": 2.2604245504018454, + "grad_norm": 0.2403387874364853, + "learning_rate": 
3.007186549607019e-05, + "loss": 0.4446, "step": 62720 }, { - "epoch": 2.21, - "learning_rate": 3.091735961009396e-05, - "loss": 0.2781, + "epoch": 2.2606047500630697, + "grad_norm": 0.1787687987089157, + "learning_rate": 3.0069008011272936e-05, + "loss": 0.3989, "step": 62725 }, { - "epoch": 2.21, - "learning_rate": 3.091459179919657e-05, - "loss": 0.2569, + "epoch": 2.2607849497242944, + "grad_norm": 0.17443282902240753, + "learning_rate": 3.006615045741143e-05, + "loss": 0.4032, "step": 62730 }, { - "epoch": 2.21, - "learning_rate": 3.0911823911502116e-05, - "loss": 0.2774, + "epoch": 2.260965149385519, + "grad_norm": 0.18043413758277893, + "learning_rate": 3.0063292834524604e-05, + "loss": 0.415, "step": 62735 }, { - "epoch": 2.21, - "learning_rate": 3.090905594704652e-05, - "loss": 0.2852, + "epoch": 2.261145349046744, + "grad_norm": 0.22148096561431885, + "learning_rate": 3.0060435142651387e-05, + "loss": 0.423, "step": 62740 }, { - "epoch": 2.21, - "learning_rate": 3.090628790586573e-05, - "loss": 0.2657, + "epoch": 2.261325548707968, + "grad_norm": 0.18598996102809906, + "learning_rate": 3.0057577381830732e-05, + "loss": 0.3616, "step": 62745 }, { - "epoch": 2.21, - "learning_rate": 3.0903519787995684e-05, - "loss": 0.2806, + "epoch": 2.261505748369193, + "grad_norm": 0.17888624966144562, + "learning_rate": 3.005471955210156e-05, + "loss": 0.3796, "step": 62750 }, { - "epoch": 2.21, - "learning_rate": 3.090075159347232e-05, - "loss": 0.2882, + "epoch": 2.2616859480304177, + "grad_norm": 0.18961434066295624, + "learning_rate": 3.0051861653502815e-05, + "loss": 0.3933, "step": 62755 }, { - "epoch": 2.21, - "learning_rate": 3.0897983322331586e-05, - "loss": 0.2767, + "epoch": 2.2618661476916424, + "grad_norm": 0.23519083857536316, + "learning_rate": 3.004900368607343e-05, + "loss": 0.3984, "step": 62760 }, { - "epoch": 2.21, - "learning_rate": 3.0895214974609425e-05, - "loss": 0.2908, + "epoch": 2.262046347352867, + "grad_norm": 0.24493834376335144, + "learning_rate": 3.0046145649852347e-05, + "loss": 0.3918, "step": 62765 }, { - "epoch": 2.21, - "learning_rate": 3.0892446550341783e-05, - "loss": 0.2871, + "epoch": 2.2622265470140914, + "grad_norm": 0.16111089289188385, + "learning_rate": 3.0043287544878513e-05, + "loss": 0.4249, "step": 62770 }, { - "epoch": 2.21, - "learning_rate": 3.088967804956461e-05, - "loss": 0.2766, + "epoch": 2.262406746675316, + "grad_norm": 0.24691814184188843, + "learning_rate": 3.0040429371190855e-05, + "loss": 0.3989, "step": 62775 }, { - "epoch": 2.21, - "learning_rate": 3.088690947231385e-05, - "loss": 0.2849, + "epoch": 2.262586946336541, + "grad_norm": 0.22405491769313812, + "learning_rate": 3.0037571128828323e-05, + "loss": 0.3911, "step": 62780 }, { - "epoch": 2.21, - "learning_rate": 3.088414081862544e-05, - "loss": 0.2766, + "epoch": 2.2627671459977656, + "grad_norm": 0.16816742718219757, + "learning_rate": 3.003471281782986e-05, + "loss": 0.3844, "step": 62785 }, { - "epoch": 2.21, - "learning_rate": 3.088137208853535e-05, - "loss": 0.252, + "epoch": 2.26294734565899, + "grad_norm": 0.21033096313476562, + "learning_rate": 3.0031854438234413e-05, + "loss": 0.4017, "step": 62790 }, { - "epoch": 2.21, - "learning_rate": 3.0878603282079516e-05, - "loss": 0.2417, + "epoch": 2.2631275453202147, + "grad_norm": 0.16821134090423584, + "learning_rate": 3.002899599008092e-05, + "loss": 0.3908, "step": 62795 }, { - "epoch": 2.21, - "learning_rate": 3.087583439929389e-05, - "loss": 0.2696, + "epoch": 2.2633077449814394, + "grad_norm": 0.23085516691207886, + 
"learning_rate": 3.0026137473408332e-05, + "loss": 0.4195, "step": 62800 }, { - "epoch": 2.21, - "learning_rate": 3.087306544021443e-05, - "loss": 0.2914, + "epoch": 2.263487944642664, + "grad_norm": 0.22191166877746582, + "learning_rate": 3.0023278888255595e-05, + "loss": 0.4187, "step": 62805 }, { - "epoch": 2.21, - "learning_rate": 3.087029640487708e-05, - "loss": 0.2581, + "epoch": 2.263668144303889, + "grad_norm": 0.173319473862648, + "learning_rate": 3.0020420234661655e-05, + "loss": 0.3752, "step": 62810 }, { - "epoch": 2.21, - "learning_rate": 3.08675272933178e-05, - "loss": 0.2741, + "epoch": 2.2638483439651136, + "grad_norm": 0.20444796979427338, + "learning_rate": 3.001756151266546e-05, + "loss": 0.4048, "step": 62815 }, { - "epoch": 2.21, - "learning_rate": 3.086475810557255e-05, - "loss": 0.2921, + "epoch": 2.264028543626338, + "grad_norm": 0.18994398415088654, + "learning_rate": 3.0014702722305958e-05, + "loss": 0.4, "step": 62820 }, { - "epoch": 2.21, - "learning_rate": 3.086198884167728e-05, - "loss": 0.2578, + "epoch": 2.2642087432875626, + "grad_norm": 0.1826678216457367, + "learning_rate": 3.0011843863622112e-05, + "loss": 0.4135, "step": 62825 }, { - "epoch": 2.21, - "learning_rate": 3.0859219501667944e-05, - "loss": 0.2567, + "epoch": 2.2643889429487873, + "grad_norm": 0.19503554701805115, + "learning_rate": 3.000898493665285e-05, + "loss": 0.4056, "step": 62830 }, { - "epoch": 2.21, - "learning_rate": 3.085645008558051e-05, - "loss": 0.2587, + "epoch": 2.264569142610012, + "grad_norm": 0.22407963871955872, + "learning_rate": 3.0006125941437157e-05, + "loss": 0.4232, "step": 62835 }, { - "epoch": 2.21, - "learning_rate": 3.085368059345093e-05, - "loss": 0.2745, + "epoch": 2.2647493422712364, + "grad_norm": 0.1850503832101822, + "learning_rate": 3.000326687801395e-05, + "loss": 0.4157, "step": 62840 }, { - "epoch": 2.21, - "learning_rate": 3.085091102531516e-05, - "loss": 0.2569, + "epoch": 2.264929541932461, + "grad_norm": 0.17998819053173065, + "learning_rate": 3.000040774642221e-05, + "loss": 0.3862, "step": 62845 }, { - "epoch": 2.21, - "learning_rate": 3.084814138120917e-05, - "loss": 0.279, + "epoch": 2.265109741593686, + "grad_norm": 0.27859246730804443, + "learning_rate": 2.999754854670087e-05, + "loss": 0.4156, "step": 62850 }, { - "epoch": 2.21, - "learning_rate": 3.084537166116892e-05, - "loss": 0.2621, + "epoch": 2.2652899412549106, + "grad_norm": 0.21499688923358917, + "learning_rate": 2.9994689278888914e-05, + "loss": 0.3995, "step": 62855 }, { - "epoch": 2.21, - "learning_rate": 3.084260186523038e-05, - "loss": 0.2597, + "epoch": 2.2654701409161353, + "grad_norm": 0.1794780045747757, + "learning_rate": 2.999182994302528e-05, + "loss": 0.3926, "step": 62860 }, { - "epoch": 2.21, - "learning_rate": 3.083983199342949e-05, - "loss": 0.2743, + "epoch": 2.2656503405773596, + "grad_norm": 0.20833353698253632, + "learning_rate": 2.998897053914892e-05, + "loss": 0.4182, "step": 62865 }, { - "epoch": 2.21, - "learning_rate": 3.0837062045802235e-05, - "loss": 0.2986, + "epoch": 2.2658305402385843, + "grad_norm": 0.2357635349035263, + "learning_rate": 2.998611106729881e-05, + "loss": 0.4268, "step": 62870 }, { - "epoch": 2.21, - "learning_rate": 3.083429202238457e-05, - "loss": 0.2834, + "epoch": 2.266010739899809, + "grad_norm": 0.18633919954299927, + "learning_rate": 2.9983251527513906e-05, + "loss": 0.3511, "step": 62875 }, { - "epoch": 2.21, - "learning_rate": 3.0831521923212484e-05, - "loss": 0.2812, + "epoch": 2.266190939561034, + "grad_norm": 
0.2481304407119751, + "learning_rate": 2.9980391919833156e-05, + "loss": 0.3642, "step": 62880 }, { - "epoch": 2.21, - "learning_rate": 3.0828751748321914e-05, - "loss": 0.2694, + "epoch": 2.266371139222258, + "grad_norm": 0.24019500613212585, + "learning_rate": 2.9977532244295537e-05, + "loss": 0.391, "step": 62885 }, { - "epoch": 2.21, - "learning_rate": 3.082598149774884e-05, - "loss": 0.2614, + "epoch": 2.266551338883483, + "grad_norm": 0.21231551468372345, + "learning_rate": 2.9974672500939994e-05, + "loss": 0.3907, "step": 62890 }, { - "epoch": 2.21, - "learning_rate": 3.082321117152925e-05, - "loss": 0.2899, + "epoch": 2.2667315385447075, + "grad_norm": 0.19400948286056519, + "learning_rate": 2.997181268980552e-05, + "loss": 0.3995, "step": 62895 }, { - "epoch": 2.21, - "learning_rate": 3.08204407696991e-05, - "loss": 0.2703, + "epoch": 2.2669117382059323, + "grad_norm": 0.18428564071655273, + "learning_rate": 2.9968952810931044e-05, + "loss": 0.4017, "step": 62900 }, { - "epoch": 2.21, - "learning_rate": 3.081767029229435e-05, - "loss": 0.2616, + "epoch": 2.267091937867157, + "grad_norm": 0.20284946262836456, + "learning_rate": 2.9966092864355556e-05, + "loss": 0.3902, "step": 62905 }, { - "epoch": 2.21, - "learning_rate": 3.0814899739350996e-05, - "loss": 0.2762, + "epoch": 2.2672721375283813, + "grad_norm": 0.21810269355773926, + "learning_rate": 2.9963232850118006e-05, + "loss": 0.4044, "step": 62910 }, { - "epoch": 2.21, - "learning_rate": 3.0812129110905e-05, - "loss": 0.2624, + "epoch": 2.267452337189606, + "grad_norm": 0.18810847401618958, + "learning_rate": 2.9960372768257378e-05, + "loss": 0.4189, "step": 62915 }, { - "epoch": 2.21, - "learning_rate": 3.080935840699234e-05, - "loss": 0.2833, + "epoch": 2.2676325368508308, + "grad_norm": 0.2173568606376648, + "learning_rate": 2.9957512618812634e-05, + "loss": 0.3888, "step": 62920 }, { - "epoch": 2.21, - "learning_rate": 3.080658762764898e-05, - "loss": 0.2786, + "epoch": 2.2678127365120555, + "grad_norm": 0.2114708125591278, + "learning_rate": 2.9954652401822732e-05, + "loss": 0.4262, "step": 62925 }, { - "epoch": 2.21, - "learning_rate": 3.080381677291091e-05, - "loss": 0.2778, + "epoch": 2.26799293617328, + "grad_norm": 0.20578879117965698, + "learning_rate": 2.9951792117326648e-05, + "loss": 0.3686, "step": 62930 }, { - "epoch": 2.21, - "learning_rate": 3.080104584281411e-05, - "loss": 0.27, + "epoch": 2.2681731358345045, + "grad_norm": 0.18849804997444153, + "learning_rate": 2.9948931765363364e-05, + "loss": 0.4066, "step": 62935 }, { - "epoch": 2.21, - "learning_rate": 3.0798274837394554e-05, - "loss": 0.2539, + "epoch": 2.2683533354957293, + "grad_norm": 0.2172732949256897, + "learning_rate": 2.9946071345971842e-05, + "loss": 0.4361, "step": 62940 }, { - "epoch": 2.21, - "learning_rate": 3.079550375668821e-05, - "loss": 0.2617, + "epoch": 2.268533535156954, + "grad_norm": 0.19015686213970184, + "learning_rate": 2.994321085919105e-05, + "loss": 0.4048, "step": 62945 }, { - "epoch": 2.21, - "learning_rate": 3.0792732600731075e-05, - "loss": 0.2757, + "epoch": 2.2687137348181787, + "grad_norm": 0.18530982732772827, + "learning_rate": 2.9940350305059972e-05, + "loss": 0.3978, "step": 62950 }, { - "epoch": 2.21, - "learning_rate": 3.078996136955912e-05, - "loss": 0.2698, + "epoch": 2.268893934479403, + "grad_norm": 0.2398093044757843, + "learning_rate": 2.9937489683617577e-05, + "loss": 0.4006, "step": 62955 }, { - "epoch": 2.22, - "learning_rate": 3.078719006320834e-05, - "loss": 0.2779, + "epoch": 2.2690741341406278, + 
"grad_norm": 0.18880394101142883, + "learning_rate": 2.9934628994902836e-05, + "loss": 0.3719, "step": 62960 }, { - "epoch": 2.22, - "learning_rate": 3.078441868171471e-05, - "loss": 0.2606, + "epoch": 2.2692543338018525, + "grad_norm": 0.21370282769203186, + "learning_rate": 2.993176823895474e-05, + "loss": 0.4061, "step": 62965 }, { - "epoch": 2.22, - "learning_rate": 3.078164722511421e-05, - "loss": 0.2839, + "epoch": 2.269434533463077, + "grad_norm": 0.20947441458702087, + "learning_rate": 2.992890741581224e-05, + "loss": 0.4062, "step": 62970 }, { - "epoch": 2.22, - "learning_rate": 3.077887569344283e-05, - "loss": 0.2678, + "epoch": 2.2696147331243015, + "grad_norm": 0.18674728274345398, + "learning_rate": 2.9926046525514345e-05, + "loss": 0.3753, "step": 62975 }, { - "epoch": 2.22, - "learning_rate": 3.0776104086736556e-05, - "loss": 0.2597, + "epoch": 2.2697949327855262, + "grad_norm": 0.19669872522354126, + "learning_rate": 2.9923185568100014e-05, + "loss": 0.3642, "step": 62980 }, { - "epoch": 2.22, - "learning_rate": 3.077333240503139e-05, - "loss": 0.2734, + "epoch": 2.269975132446751, + "grad_norm": 0.18584516644477844, + "learning_rate": 2.992032454360824e-05, + "loss": 0.373, "step": 62985 }, { - "epoch": 2.22, - "learning_rate": 3.07705606483633e-05, - "loss": 0.2606, + "epoch": 2.2701553321079757, + "grad_norm": 0.19654756784439087, + "learning_rate": 2.9917463452077986e-05, + "loss": 0.3898, "step": 62990 }, { - "epoch": 2.22, - "learning_rate": 3.0767788816768285e-05, - "loss": 0.2728, + "epoch": 2.2703355317692004, + "grad_norm": 0.18995268642902374, + "learning_rate": 2.991460229354825e-05, + "loss": 0.4269, "step": 62995 }, { - "epoch": 2.22, - "learning_rate": 3.076501691028233e-05, - "loss": 0.2611, + "epoch": 2.2705157314304247, + "grad_norm": 0.2119891196489334, + "learning_rate": 2.9911741068058012e-05, + "loss": 0.3993, "step": 63000 }, { - "epoch": 2.22, - "eval_loss": 0.2684405446052551, - "eval_runtime": 10.5301, - "eval_samples_per_second": 9.497, - "eval_steps_per_second": 9.497, + "epoch": 2.2705157314304247, + "eval_loss": 0.4354766011238098, + "eval_runtime": 3.5274, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 7.087, "step": 63000 }, { - "epoch": 2.22, - "learning_rate": 3.076224492894143e-05, - "loss": 0.2829, + "epoch": 2.2706959310916495, + "grad_norm": 0.22078485786914825, + "learning_rate": 2.9908879775646247e-05, + "loss": 0.3753, "step": 63005 }, { - "epoch": 2.22, - "learning_rate": 3.0759472872781576e-05, - "loss": 0.2912, + "epoch": 2.270876130752874, + "grad_norm": 0.20009629428386688, + "learning_rate": 2.990601841635195e-05, + "loss": 0.4031, "step": 63010 }, { - "epoch": 2.22, - "learning_rate": 3.075670074183877e-05, - "loss": 0.2739, + "epoch": 2.271056330414099, + "grad_norm": 0.17347082495689392, + "learning_rate": 2.9903156990214097e-05, + "loss": 0.3555, "step": 63015 }, { - "epoch": 2.22, - "learning_rate": 3.0753928536148986e-05, - "loss": 0.2833, + "epoch": 2.2712365300753232, + "grad_norm": 0.261214941740036, + "learning_rate": 2.9900295497271687e-05, + "loss": 0.3902, "step": 63020 }, { - "epoch": 2.22, - "learning_rate": 3.075115625574824e-05, - "loss": 0.2689, + "epoch": 2.271416729736548, + "grad_norm": 0.19732660055160522, + "learning_rate": 2.9897433937563696e-05, + "loss": 0.4313, "step": 63025 }, { - "epoch": 2.22, - "learning_rate": 3.074838390067252e-05, - "loss": 0.2743, + "epoch": 2.2715969293977727, + "grad_norm": 0.22915062308311462, + "learning_rate": 2.989457231112911e-05, + "loss": 0.4075, 
"step": 63030 }, { - "epoch": 2.22, - "learning_rate": 3.074561147095782e-05, - "loss": 0.2613, + "epoch": 2.2717771290589974, + "grad_norm": 0.20028504729270935, + "learning_rate": 2.989171061800693e-05, + "loss": 0.4022, "step": 63035 }, { - "epoch": 2.22, - "learning_rate": 3.074283896664014e-05, - "loss": 0.2802, + "epoch": 2.271957328720222, + "grad_norm": 0.23854713141918182, + "learning_rate": 2.988884885823614e-05, + "loss": 0.3984, "step": 63040 }, { - "epoch": 2.22, - "learning_rate": 3.0740066387755496e-05, - "loss": 0.2777, + "epoch": 2.272137528381447, + "grad_norm": 0.2059304416179657, + "learning_rate": 2.9885987031855733e-05, + "loss": 0.4019, "step": 63045 }, { - "epoch": 2.22, - "learning_rate": 3.073729373433986e-05, - "loss": 0.2518, + "epoch": 2.272317728042671, + "grad_norm": 0.16523705422878265, + "learning_rate": 2.9883125138904693e-05, + "loss": 0.3699, "step": 63050 }, { - "epoch": 2.22, - "learning_rate": 3.073452100642924e-05, - "loss": 0.2616, + "epoch": 2.272497927703896, + "grad_norm": 0.20402154326438904, + "learning_rate": 2.988026317942202e-05, + "loss": 0.3868, "step": 63055 }, { - "epoch": 2.22, - "learning_rate": 3.073174820405965e-05, - "loss": 0.277, + "epoch": 2.2726781273651206, + "grad_norm": 0.22401557862758636, + "learning_rate": 2.987740115344671e-05, + "loss": 0.3785, "step": 63060 }, { - "epoch": 2.22, - "learning_rate": 3.07289753272671e-05, - "loss": 0.3092, + "epoch": 2.2728583270263454, + "grad_norm": 0.21607942879199982, + "learning_rate": 2.9874539061017746e-05, + "loss": 0.3896, "step": 63065 }, { - "epoch": 2.22, - "learning_rate": 3.072620237608756e-05, - "loss": 0.2681, + "epoch": 2.2730385266875697, + "grad_norm": 0.2318323701620102, + "learning_rate": 2.987167690217414e-05, + "loss": 0.4374, "step": 63070 }, { - "epoch": 2.22, - "learning_rate": 3.072342935055707e-05, - "loss": 0.2587, + "epoch": 2.2732187263487944, + "grad_norm": 0.20749138295650482, + "learning_rate": 2.986881467695487e-05, + "loss": 0.4096, "step": 63075 }, { - "epoch": 2.22, - "learning_rate": 3.0720656250711614e-05, - "loss": 0.2689, + "epoch": 2.273398926010019, + "grad_norm": 0.20667487382888794, + "learning_rate": 2.9865952385398955e-05, + "loss": 0.4047, "step": 63080 }, { - "epoch": 2.22, - "learning_rate": 3.07178830765872e-05, - "loss": 0.2684, + "epoch": 2.273579125671244, + "grad_norm": 0.23267313838005066, + "learning_rate": 2.9863090027545364e-05, + "loss": 0.4173, "step": 63085 }, { - "epoch": 2.22, - "learning_rate": 3.0715109828219856e-05, - "loss": 0.2542, + "epoch": 2.2737593253324686, + "grad_norm": 0.19276480376720428, + "learning_rate": 2.986022760343313e-05, + "loss": 0.3679, "step": 63090 }, { - "epoch": 2.22, - "learning_rate": 3.0712336505645573e-05, - "loss": 0.2734, + "epoch": 2.273939524993693, + "grad_norm": 0.20015855133533478, + "learning_rate": 2.985736511310122e-05, + "loss": 0.38, "step": 63095 }, { - "epoch": 2.22, - "learning_rate": 3.070956310890037e-05, - "loss": 0.2789, + "epoch": 2.2741197246549176, + "grad_norm": 0.2319357693195343, + "learning_rate": 2.985450255658866e-05, + "loss": 0.3908, "step": 63100 }, { - "epoch": 2.22, - "learning_rate": 3.070678963802025e-05, - "loss": 0.2828, + "epoch": 2.2742999243161424, + "grad_norm": 0.2397158294916153, + "learning_rate": 2.9851639933934446e-05, + "loss": 0.3892, "step": 63105 }, { - "epoch": 2.22, - "learning_rate": 3.070401609304122e-05, - "loss": 0.3018, + "epoch": 2.274480123977367, + "grad_norm": 0.162200927734375, + "learning_rate": 2.9848777245177568e-05, + "loss": 
0.4061, "step": 63110 }, { - "epoch": 2.22, - "learning_rate": 3.0701242473999304e-05, - "loss": 0.2615, + "epoch": 2.2746603236385914, + "grad_norm": 0.2283955216407776, + "learning_rate": 2.9845914490357046e-05, + "loss": 0.4059, "step": 63115 }, { - "epoch": 2.22, - "learning_rate": 3.069846878093052e-05, - "loss": 0.2628, + "epoch": 2.274840523299816, + "grad_norm": 0.20629531145095825, + "learning_rate": 2.9843051669511872e-05, + "loss": 0.3769, "step": 63120 }, { - "epoch": 2.22, - "learning_rate": 3.069569501387086e-05, - "loss": 0.2609, + "epoch": 2.275020722961041, + "grad_norm": 0.16891133785247803, + "learning_rate": 2.9840188782681062e-05, + "loss": 0.3967, "step": 63125 }, { - "epoch": 2.22, - "learning_rate": 3.069292117285637e-05, - "loss": 0.2709, + "epoch": 2.2752009226222656, + "grad_norm": 0.19010712206363678, + "learning_rate": 2.983732582990361e-05, + "loss": 0.4143, "step": 63130 }, { - "epoch": 2.22, - "learning_rate": 3.069014725792304e-05, - "loss": 0.294, + "epoch": 2.2753811222834903, + "grad_norm": 0.2108030766248703, + "learning_rate": 2.9834462811218534e-05, + "loss": 0.3775, "step": 63135 }, { - "epoch": 2.22, - "learning_rate": 3.0687373269106894e-05, - "loss": 0.2662, + "epoch": 2.2755613219447146, + "grad_norm": 0.21494841575622559, + "learning_rate": 2.9831599726664844e-05, + "loss": 0.3723, "step": 63140 }, { - "epoch": 2.22, - "learning_rate": 3.068459920644396e-05, - "loss": 0.289, + "epoch": 2.2757415216059393, + "grad_norm": 0.19775624573230743, + "learning_rate": 2.9828736576281535e-05, + "loss": 0.4012, "step": 63145 }, { - "epoch": 2.22, - "learning_rate": 3.068182506997026e-05, - "loss": 0.2689, + "epoch": 2.275921721267164, + "grad_norm": 0.20917865633964539, + "learning_rate": 2.9825873360107626e-05, + "loss": 0.3865, "step": 63150 }, { - "epoch": 2.22, - "learning_rate": 3.067905085972179e-05, - "loss": 0.2564, + "epoch": 2.276101920928389, + "grad_norm": 0.1742408573627472, + "learning_rate": 2.982301007818213e-05, + "loss": 0.3552, "step": 63155 }, { - "epoch": 2.22, - "learning_rate": 3.06762765757346e-05, - "loss": 0.2651, + "epoch": 2.276282120589613, + "grad_norm": 0.20653748512268066, + "learning_rate": 2.9820146730544052e-05, + "loss": 0.3645, "step": 63160 }, { - "epoch": 2.22, - "learning_rate": 3.0673502218044694e-05, - "loss": 0.2759, + "epoch": 2.276462320250838, + "grad_norm": 0.20753364264965057, + "learning_rate": 2.9817283317232413e-05, + "loss": 0.3959, "step": 63165 }, { - "epoch": 2.22, - "learning_rate": 3.067072778668809e-05, - "loss": 0.2634, + "epoch": 2.2766425199120626, + "grad_norm": 0.20545120537281036, + "learning_rate": 2.9814419838286213e-05, + "loss": 0.4067, "step": 63170 }, { - "epoch": 2.22, - "learning_rate": 3.0667953281700834e-05, - "loss": 0.2703, + "epoch": 2.2768227195732873, + "grad_norm": 0.1817047894001007, + "learning_rate": 2.981155629374448e-05, + "loss": 0.4138, "step": 63175 }, { - "epoch": 2.22, - "learning_rate": 3.0665178703118945e-05, - "loss": 0.261, + "epoch": 2.277002919234512, + "grad_norm": 0.1619819700717926, + "learning_rate": 2.980869268364622e-05, + "loss": 0.4093, "step": 63180 }, { - "epoch": 2.22, - "learning_rate": 3.066240405097844e-05, - "loss": 0.257, + "epoch": 2.2771831188957363, + "grad_norm": 0.18216536939144135, + "learning_rate": 2.9805829008030466e-05, + "loss": 0.388, "step": 63185 }, { - "epoch": 2.22, - "learning_rate": 3.065962932531534e-05, - "loss": 0.2566, + "epoch": 2.277363318556961, + "grad_norm": 0.23608942329883575, + "learning_rate": 
2.9802965266936213e-05, + "loss": 0.3851, "step": 63190 }, { - "epoch": 2.22, - "learning_rate": 3.0656854526165695e-05, - "loss": 0.2697, + "epoch": 2.277543518218186, + "grad_norm": 0.22420279681682587, + "learning_rate": 2.9800101460402484e-05, + "loss": 0.3876, "step": 63195 }, { - "epoch": 2.22, - "learning_rate": 3.065407965356552e-05, - "loss": 0.2739, + "epoch": 2.2777237178794105, + "grad_norm": 0.20337367057800293, + "learning_rate": 2.9797237588468308e-05, + "loss": 0.398, "step": 63200 }, { - "epoch": 2.22, - "learning_rate": 3.0651304707550846e-05, - "loss": 0.2766, + "epoch": 2.277903917540635, + "grad_norm": 0.20395952463150024, + "learning_rate": 2.9794373651172696e-05, + "loss": 0.4199, "step": 63205 }, { - "epoch": 2.22, - "learning_rate": 3.0648529688157704e-05, - "loss": 0.2808, + "epoch": 2.2780841172018595, + "grad_norm": 0.19904713332653046, + "learning_rate": 2.9791509648554678e-05, + "loss": 0.3991, "step": 63210 }, { - "epoch": 2.22, - "learning_rate": 3.064575459542213e-05, - "loss": 0.2638, + "epoch": 2.2782643168630843, + "grad_norm": 0.17146888375282288, + "learning_rate": 2.9788645580653257e-05, + "loss": 0.3952, "step": 63215 }, { - "epoch": 2.22, - "learning_rate": 3.0642979429380144e-05, - "loss": 0.2747, + "epoch": 2.278444516524309, + "grad_norm": 0.2011823207139969, + "learning_rate": 2.9785781447507476e-05, + "loss": 0.3915, "step": 63220 }, { - "epoch": 2.22, - "learning_rate": 3.064020419006779e-05, - "loss": 0.2562, + "epoch": 2.2786247161855337, + "grad_norm": 0.2076421082019806, + "learning_rate": 2.9782917249156346e-05, + "loss": 0.386, "step": 63225 }, { - "epoch": 2.22, - "learning_rate": 3.0637428877521115e-05, - "loss": 0.2634, + "epoch": 2.278804915846758, + "grad_norm": 0.18694275617599487, + "learning_rate": 2.9780052985638896e-05, + "loss": 0.3882, "step": 63230 }, { - "epoch": 2.22, - "learning_rate": 3.063465349177613e-05, - "loss": 0.2749, + "epoch": 2.2789851155079828, + "grad_norm": 0.2005622684955597, + "learning_rate": 2.9777188656994147e-05, + "loss": 0.3676, "step": 63235 }, { - "epoch": 2.22, - "learning_rate": 3.063187803286889e-05, - "loss": 0.2611, + "epoch": 2.2791653151692075, + "grad_norm": 0.17751555144786835, + "learning_rate": 2.9774324263261126e-05, + "loss": 0.3941, "step": 63240 }, { - "epoch": 2.23, - "learning_rate": 3.062910250083542e-05, - "loss": 0.274, + "epoch": 2.2793455148304322, + "grad_norm": 0.20505878329277039, + "learning_rate": 2.9771459804478868e-05, + "loss": 0.3837, "step": 63245 }, { - "epoch": 2.23, - "learning_rate": 3.062632689571176e-05, - "loss": 0.2638, + "epoch": 2.2795257144916565, + "grad_norm": 0.20655593276023865, + "learning_rate": 2.976859528068639e-05, + "loss": 0.4162, "step": 63250 }, { - "epoch": 2.23, - "learning_rate": 3.062355121753396e-05, - "loss": 0.2689, + "epoch": 2.2797059141528813, + "grad_norm": 0.21149560809135437, + "learning_rate": 2.9765730691922723e-05, + "loss": 0.3893, "step": 63255 }, { - "epoch": 2.23, - "learning_rate": 3.062077546633805e-05, - "loss": 0.2442, + "epoch": 2.279886113814106, + "grad_norm": 0.2167680561542511, + "learning_rate": 2.97628660382269e-05, + "loss": 0.4075, "step": 63260 }, { - "epoch": 2.23, - "learning_rate": 3.061799964216008e-05, - "loss": 0.278, + "epoch": 2.2800663134753307, + "grad_norm": 0.23179109394550323, + "learning_rate": 2.9760001319637955e-05, + "loss": 0.3927, "step": 63265 }, { - "epoch": 2.23, - "learning_rate": 3.061522374503608e-05, - "loss": 0.2693, + "epoch": 2.2802465131365555, + "grad_norm": 0.19186845421791077, 
+ "learning_rate": 2.97571365361949e-05, + "loss": 0.3835, "step": 63270 }, { - "epoch": 2.23, - "learning_rate": 3.06124477750021e-05, - "loss": 0.2743, + "epoch": 2.2804267127977798, + "grad_norm": 0.21390306949615479, + "learning_rate": 2.975427168793679e-05, + "loss": 0.4458, "step": 63275 }, { - "epoch": 2.23, - "learning_rate": 3.060967173209419e-05, - "loss": 0.2704, + "epoch": 2.2806069124590045, + "grad_norm": 0.2087387591600418, + "learning_rate": 2.9751406774902645e-05, + "loss": 0.3978, "step": 63280 }, { - "epoch": 2.23, - "learning_rate": 3.060689561634838e-05, - "loss": 0.25, + "epoch": 2.280787112120229, + "grad_norm": 0.2157135307788849, + "learning_rate": 2.9748541797131506e-05, + "loss": 0.4081, "step": 63285 }, { - "epoch": 2.23, - "learning_rate": 3.060411942780074e-05, - "loss": 0.2897, + "epoch": 2.280967311781454, + "grad_norm": 0.18895265460014343, + "learning_rate": 2.9745676754662405e-05, + "loss": 0.3776, "step": 63290 }, { - "epoch": 2.23, - "learning_rate": 3.06013431664873e-05, - "loss": 0.2894, + "epoch": 2.2811475114426782, + "grad_norm": 0.2101006656885147, + "learning_rate": 2.9742811647534373e-05, + "loss": 0.3925, "step": 63295 }, { - "epoch": 2.23, - "learning_rate": 3.05985668324441e-05, - "loss": 0.2574, + "epoch": 2.281327711103903, + "grad_norm": 0.21534328162670135, + "learning_rate": 2.9739946475786452e-05, + "loss": 0.4033, "step": 63300 }, { - "epoch": 2.23, - "learning_rate": 3.059579042570721e-05, - "loss": 0.2719, + "epoch": 2.2815079107651277, + "grad_norm": 0.17622357606887817, + "learning_rate": 2.9737081239457683e-05, + "loss": 0.41, "step": 63305 }, { - "epoch": 2.23, - "learning_rate": 3.059301394631266e-05, - "loss": 0.2507, + "epoch": 2.2816881104263524, + "grad_norm": 0.2728975713253021, + "learning_rate": 2.973421593858709e-05, + "loss": 0.4151, "step": 63310 }, { - "epoch": 2.23, - "learning_rate": 3.059023739429651e-05, - "loss": 0.2601, + "epoch": 2.281868310087577, + "grad_norm": 0.18991267681121826, + "learning_rate": 2.973135057321373e-05, + "loss": 0.426, "step": 63315 }, { - "epoch": 2.23, - "learning_rate": 3.058746076969482e-05, - "loss": 0.2822, + "epoch": 2.282048509748802, + "grad_norm": 0.16258344054222107, + "learning_rate": 2.972848514337662e-05, + "loss": 0.3736, "step": 63320 }, { - "epoch": 2.23, - "learning_rate": 3.058468407254363e-05, - "loss": 0.2661, + "epoch": 2.282228709410026, + "grad_norm": 0.19868828356266022, + "learning_rate": 2.972561964911484e-05, + "loss": 0.4063, "step": 63325 }, { - "epoch": 2.23, - "learning_rate": 3.058190730287899e-05, - "loss": 0.2667, + "epoch": 2.282408909071251, + "grad_norm": 0.21427805721759796, + "learning_rate": 2.9722754090467385e-05, + "loss": 0.3628, "step": 63330 }, { - "epoch": 2.23, - "learning_rate": 3.0579130460736974e-05, - "loss": 0.2643, + "epoch": 2.2825891087324757, + "grad_norm": 0.16630151867866516, + "learning_rate": 2.9719888467473333e-05, + "loss": 0.3724, "step": 63335 }, { - "epoch": 2.23, - "learning_rate": 3.057635354615362e-05, - "loss": 0.2804, + "epoch": 2.2827693083937004, + "grad_norm": 0.20577584207057953, + "learning_rate": 2.9717022780171704e-05, + "loss": 0.4154, "step": 63340 }, { - "epoch": 2.23, - "learning_rate": 3.057357655916499e-05, - "loss": 0.2649, + "epoch": 2.2829495080549247, + "grad_norm": 0.24059414863586426, + "learning_rate": 2.9714157028601558e-05, + "loss": 0.3972, "step": 63345 }, { - "epoch": 2.23, - "learning_rate": 3.0570799499807145e-05, - "loss": 0.2675, + "epoch": 2.2831297077161494, + "grad_norm": 
0.1928698569536209, + "learning_rate": 2.971129121280194e-05, + "loss": 0.3794, "step": 63350 }, { - "epoch": 2.23, - "learning_rate": 3.056802236811614e-05, - "loss": 0.2487, + "epoch": 2.283309907377374, + "grad_norm": 0.20987224578857422, + "learning_rate": 2.9708425332811883e-05, + "loss": 0.3816, "step": 63355 }, { - "epoch": 2.23, - "learning_rate": 3.056524516412803e-05, - "loss": 0.2895, + "epoch": 2.283490107038599, + "grad_norm": 0.23037847876548767, + "learning_rate": 2.9705559388670446e-05, + "loss": 0.4389, "step": 63360 }, { - "epoch": 2.23, - "learning_rate": 3.0562467887878876e-05, - "loss": 0.2858, + "epoch": 2.2836703066998236, + "grad_norm": 0.19015249609947205, + "learning_rate": 2.970269338041668e-05, + "loss": 0.3849, "step": 63365 }, { - "epoch": 2.23, - "learning_rate": 3.055969053940475e-05, - "loss": 0.2805, + "epoch": 2.283850506361048, + "grad_norm": 0.21437907218933105, + "learning_rate": 2.9699827308089617e-05, + "loss": 0.4241, "step": 63370 }, { - "epoch": 2.23, - "learning_rate": 3.05569131187417e-05, - "loss": 0.2549, + "epoch": 2.2840307060222726, + "grad_norm": 0.2129935771226883, + "learning_rate": 2.969696117172832e-05, + "loss": 0.402, "step": 63375 }, { - "epoch": 2.23, - "learning_rate": 3.0554135625925806e-05, - "loss": 0.2794, + "epoch": 2.2842109056834974, + "grad_norm": 0.2034277319908142, + "learning_rate": 2.969409497137184e-05, + "loss": 0.4077, "step": 63380 }, { - "epoch": 2.23, - "learning_rate": 3.055135806099311e-05, - "loss": 0.2655, + "epoch": 2.284391105344722, + "grad_norm": 0.21894314885139465, + "learning_rate": 2.9691228707059216e-05, + "loss": 0.4069, "step": 63385 }, { - "epoch": 2.23, - "learning_rate": 3.054858042397969e-05, - "loss": 0.303, + "epoch": 2.2845713050059464, + "grad_norm": 0.20878823101520538, + "learning_rate": 2.9688362378829514e-05, + "loss": 0.3912, "step": 63390 }, { - "epoch": 2.23, - "learning_rate": 3.054580271492161e-05, - "loss": 0.2728, + "epoch": 2.284751504667171, + "grad_norm": 0.21701130270957947, + "learning_rate": 2.968549598672179e-05, + "loss": 0.3831, "step": 63395 }, { - "epoch": 2.23, - "learning_rate": 3.0543024933854944e-05, - "loss": 0.2585, + "epoch": 2.284931704328396, + "grad_norm": 0.20921416580677032, + "learning_rate": 2.9682629530775075e-05, + "loss": 0.3879, "step": 63400 }, { - "epoch": 2.23, - "learning_rate": 3.054024708081575e-05, - "loss": 0.2828, + "epoch": 2.2851119039896206, + "grad_norm": 0.22157233953475952, + "learning_rate": 2.967976301102845e-05, + "loss": 0.4179, "step": 63405 }, { - "epoch": 2.23, - "learning_rate": 3.0537469155840095e-05, - "loss": 0.2673, + "epoch": 2.2852921036508453, + "grad_norm": 0.2359781712293625, + "learning_rate": 2.967689642752096e-05, + "loss": 0.3867, "step": 63410 }, { - "epoch": 2.23, - "learning_rate": 3.053469115896405e-05, - "loss": 0.2639, + "epoch": 2.2854723033120696, + "grad_norm": 0.18056300282478333, + "learning_rate": 2.967402978029166e-05, + "loss": 0.3854, "step": 63415 }, { - "epoch": 2.23, - "learning_rate": 3.0531913090223693e-05, - "loss": 0.2824, + "epoch": 2.2856525029732944, + "grad_norm": 0.1796717792749405, + "learning_rate": 2.967116306937961e-05, + "loss": 0.3722, "step": 63420 }, { - "epoch": 2.23, - "learning_rate": 3.052913494965509e-05, - "loss": 0.2922, + "epoch": 2.285832702634519, + "grad_norm": 0.21330176293849945, + "learning_rate": 2.9668296294823862e-05, + "loss": 0.4205, "step": 63425 }, { - "epoch": 2.23, - "learning_rate": 3.0526356737294306e-05, - "loss": 0.2759, + "epoch": 2.286012902295744, + 
"grad_norm": 0.16301804780960083, + "learning_rate": 2.966542945666349e-05, + "loss": 0.4064, "step": 63430 }, { - "epoch": 2.23, - "learning_rate": 3.052357845317743e-05, - "loss": 0.2859, + "epoch": 2.286193101956968, + "grad_norm": 0.2423224300146103, + "learning_rate": 2.9662562554937534e-05, + "loss": 0.4057, "step": 63435 }, { - "epoch": 2.23, - "learning_rate": 3.0520800097340526e-05, - "loss": 0.2588, + "epoch": 2.286373301618193, + "grad_norm": 0.18112295866012573, + "learning_rate": 2.9659695589685076e-05, + "loss": 0.3972, "step": 63440 }, { - "epoch": 2.23, - "learning_rate": 3.051802166981967e-05, - "loss": 0.2786, + "epoch": 2.2865535012794176, + "grad_norm": 0.1735352873802185, + "learning_rate": 2.965682856094516e-05, + "loss": 0.391, "step": 63445 }, { - "epoch": 2.23, - "learning_rate": 3.051524317065093e-05, - "loss": 0.2616, + "epoch": 2.2867337009406423, + "grad_norm": 0.208638533949852, + "learning_rate": 2.9653961468756863e-05, + "loss": 0.3966, "step": 63450 }, { - "epoch": 2.23, - "learning_rate": 3.051246459987041e-05, - "loss": 0.2702, + "epoch": 2.286913900601867, + "grad_norm": 0.19702517986297607, + "learning_rate": 2.965109431315924e-05, + "loss": 0.412, "step": 63455 }, { - "epoch": 2.23, - "learning_rate": 3.050968595751416e-05, - "loss": 0.2691, + "epoch": 2.2870941002630913, + "grad_norm": 0.21106953918933868, + "learning_rate": 2.964822709419135e-05, + "loss": 0.4016, "step": 63460 }, { - "epoch": 2.23, - "learning_rate": 3.050690724361827e-05, - "loss": 0.2864, + "epoch": 2.287274299924316, + "grad_norm": 0.18838860094547272, + "learning_rate": 2.9645359811892275e-05, + "loss": 0.3739, "step": 63465 }, { - "epoch": 2.23, - "learning_rate": 3.0504128458218815e-05, - "loss": 0.2618, + "epoch": 2.287454499585541, + "grad_norm": 0.22702044248580933, + "learning_rate": 2.964249246630107e-05, + "loss": 0.3855, "step": 63470 }, { - "epoch": 2.23, - "learning_rate": 3.0501349601351887e-05, - "loss": 0.2828, + "epoch": 2.2876346992467655, + "grad_norm": 0.18174223601818085, + "learning_rate": 2.9639625057456805e-05, + "loss": 0.3885, "step": 63475 }, { - "epoch": 2.23, - "learning_rate": 3.0498570673053557e-05, - "loss": 0.2864, + "epoch": 2.28781489890799, + "grad_norm": 0.19665606319904327, + "learning_rate": 2.9636757585398544e-05, + "loss": 0.4166, "step": 63480 }, { - "epoch": 2.23, - "learning_rate": 3.049579167335991e-05, - "loss": 0.2753, + "epoch": 2.2879950985692146, + "grad_norm": 0.1689072996377945, + "learning_rate": 2.9633890050165357e-05, + "loss": 0.3923, "step": 63485 }, { - "epoch": 2.23, - "learning_rate": 3.049301260230703e-05, - "loss": 0.2759, + "epoch": 2.2881752982304393, + "grad_norm": 0.20292653143405914, + "learning_rate": 2.963102245179632e-05, + "loss": 0.3948, "step": 63490 }, { - "epoch": 2.23, - "learning_rate": 3.0490233459931e-05, - "loss": 0.2643, + "epoch": 2.288355497891664, + "grad_norm": 0.1763313263654709, + "learning_rate": 2.9628154790330498e-05, + "loss": 0.3818, "step": 63495 }, { - "epoch": 2.23, - "learning_rate": 3.0487454246267904e-05, - "loss": 0.2826, + "epoch": 2.2885356975528888, + "grad_norm": 0.2226899415254593, + "learning_rate": 2.9625287065806962e-05, + "loss": 0.393, "step": 63500 }, { - "epoch": 2.23, - "eval_loss": 0.26817554235458374, - "eval_runtime": 10.5266, - "eval_samples_per_second": 9.5, - "eval_steps_per_second": 9.5, + "epoch": 2.2885356975528888, + "eval_loss": 0.4357220530509949, + "eval_runtime": 3.5317, + "eval_samples_per_second": 28.315, + "eval_steps_per_second": 7.079, "step": 63500 }, 
{ - "epoch": 2.23, - "learning_rate": 3.048467496135384e-05, - "loss": 0.2642, + "epoch": 2.288715897214113, + "grad_norm": 0.17390502989292145, + "learning_rate": 2.962241927826478e-05, + "loss": 0.4268, "step": 63505 }, { - "epoch": 2.23, - "learning_rate": 3.048189560522488e-05, - "loss": 0.2709, + "epoch": 2.288896096875338, + "grad_norm": 0.250418484210968, + "learning_rate": 2.9619551427743042e-05, + "loss": 0.4187, "step": 63510 }, { - "epoch": 2.23, - "learning_rate": 3.047911617791712e-05, - "loss": 0.2787, + "epoch": 2.2890762965365625, + "grad_norm": 0.25360366702079773, + "learning_rate": 2.9616683514280798e-05, + "loss": 0.3917, "step": 63515 }, { - "epoch": 2.23, - "learning_rate": 3.0476336679466645e-05, - "loss": 0.2723, + "epoch": 2.2892564961977873, + "grad_norm": 0.18742060661315918, + "learning_rate": 2.9613815537917145e-05, + "loss": 0.3793, "step": 63520 }, { - "epoch": 2.23, - "learning_rate": 3.047355710990955e-05, - "loss": 0.2784, + "epoch": 2.2894366958590116, + "grad_norm": 0.17973770201206207, + "learning_rate": 2.961094749869114e-05, + "loss": 0.3881, "step": 63525 }, { - "epoch": 2.24, - "learning_rate": 3.0470777469281918e-05, - "loss": 0.2844, + "epoch": 2.2896168955202363, + "grad_norm": 0.229409322142601, + "learning_rate": 2.9608079396641868e-05, + "loss": 0.4357, "step": 63530 }, { - "epoch": 2.24, - "learning_rate": 3.0467997757619853e-05, - "loss": 0.2933, + "epoch": 2.289797095181461, + "grad_norm": 0.2348993420600891, + "learning_rate": 2.9605211231808417e-05, + "loss": 0.4039, "step": 63535 }, { - "epoch": 2.24, - "learning_rate": 3.0465217974959436e-05, - "loss": 0.2765, + "epoch": 2.2899772948426858, + "grad_norm": 0.21660517156124115, + "learning_rate": 2.9602343004229842e-05, + "loss": 0.4293, "step": 63540 }, { - "epoch": 2.24, - "learning_rate": 3.0462438121336767e-05, - "loss": 0.291, + "epoch": 2.2901574945039105, + "grad_norm": 0.1841536909341812, + "learning_rate": 2.9599474713945242e-05, + "loss": 0.394, "step": 63545 }, { - "epoch": 2.24, - "learning_rate": 3.0459658196787937e-05, - "loss": 0.2678, + "epoch": 2.290337694165135, + "grad_norm": 0.2169848531484604, + "learning_rate": 2.959660636099369e-05, + "loss": 0.3922, "step": 63550 }, { - "epoch": 2.24, - "learning_rate": 3.045687820134904e-05, - "loss": 0.2615, + "epoch": 2.2905178938263595, + "grad_norm": 0.22825659811496735, + "learning_rate": 2.9593737945414264e-05, + "loss": 0.3977, "step": 63555 }, { - "epoch": 2.24, - "learning_rate": 3.0454098135056186e-05, - "loss": 0.2461, + "epoch": 2.2906980934875842, + "grad_norm": 0.21935486793518066, + "learning_rate": 2.9590869467246047e-05, + "loss": 0.4143, "step": 63560 }, { - "epoch": 2.24, - "learning_rate": 3.0451317997945454e-05, - "loss": 0.2817, + "epoch": 2.290878293148809, + "grad_norm": 0.18911194801330566, + "learning_rate": 2.9588000926528126e-05, + "loss": 0.3835, "step": 63565 }, { - "epoch": 2.24, - "learning_rate": 3.0448537790052962e-05, - "loss": 0.264, + "epoch": 2.2910584928100337, + "grad_norm": 0.16770300269126892, + "learning_rate": 2.958513232329957e-05, + "loss": 0.4211, "step": 63570 }, { - "epoch": 2.24, - "learning_rate": 3.0445757511414785e-05, - "loss": 0.2779, + "epoch": 2.291238692471258, + "grad_norm": 0.20675276219844818, + "learning_rate": 2.9582263657599485e-05, + "loss": 0.384, "step": 63575 }, { - "epoch": 2.24, - "learning_rate": 3.044297716206704e-05, - "loss": 0.2688, + "epoch": 2.2914188921324827, + "grad_norm": 0.26239025592803955, + "learning_rate": 2.9579394929466943e-05, + "loss": 0.394, 
"step": 63580 }, { - "epoch": 2.24, - "learning_rate": 3.044019674204582e-05, - "loss": 0.2604, + "epoch": 2.2915990917937075, + "grad_norm": 0.2336747944355011, + "learning_rate": 2.9576526138941025e-05, + "loss": 0.4154, "step": 63585 }, { - "epoch": 2.24, - "learning_rate": 3.0437416251387234e-05, - "loss": 0.2821, + "epoch": 2.291779291454932, + "grad_norm": 0.18274347484111786, + "learning_rate": 2.957365728606083e-05, + "loss": 0.3395, "step": 63590 }, { - "epoch": 2.24, - "learning_rate": 3.0434635690127382e-05, - "loss": 0.2489, + "epoch": 2.291959491116157, + "grad_norm": 0.17019642889499664, + "learning_rate": 2.9570788370865443e-05, + "loss": 0.3737, "step": 63595 }, { - "epoch": 2.24, - "learning_rate": 3.043185505830237e-05, - "loss": 0.2696, + "epoch": 2.2921396907773812, + "grad_norm": 0.2089766263961792, + "learning_rate": 2.956791939339394e-05, + "loss": 0.3783, "step": 63600 }, { - "epoch": 2.24, - "learning_rate": 3.0429074355948295e-05, - "loss": 0.2913, + "epoch": 2.292319890438606, + "grad_norm": 0.2029276192188263, + "learning_rate": 2.956505035368543e-05, + "loss": 0.4223, "step": 63605 }, { - "epoch": 2.24, - "learning_rate": 3.0426293583101267e-05, - "loss": 0.2934, + "epoch": 2.2925000900998307, + "grad_norm": 0.1698850840330124, + "learning_rate": 2.9562181251778986e-05, + "loss": 0.4032, "step": 63610 }, { - "epoch": 2.24, - "learning_rate": 3.0423512739797387e-05, - "loss": 0.2659, + "epoch": 2.2926802897610554, + "grad_norm": 0.17825381457805634, + "learning_rate": 2.9559312087713714e-05, + "loss": 0.3808, "step": 63615 }, { - "epoch": 2.24, - "learning_rate": 3.0420731826072785e-05, - "loss": 0.2681, + "epoch": 2.2928604894222797, + "grad_norm": 0.21084287762641907, + "learning_rate": 2.9556442861528688e-05, + "loss": 0.4042, "step": 63620 }, { - "epoch": 2.24, - "learning_rate": 3.041795084196354e-05, - "loss": 0.2589, + "epoch": 2.2930406890835044, + "grad_norm": 0.21861131489276886, + "learning_rate": 2.9553573573263016e-05, + "loss": 0.4222, "step": 63625 }, { - "epoch": 2.24, - "learning_rate": 3.0415169787505777e-05, - "loss": 0.2642, + "epoch": 2.293220888744729, + "grad_norm": 0.1877928525209427, + "learning_rate": 2.9550704222955778e-05, + "loss": 0.378, "step": 63630 }, { - "epoch": 2.24, - "learning_rate": 3.04123886627356e-05, - "loss": 0.2544, + "epoch": 2.293401088405954, + "grad_norm": 0.23301534354686737, + "learning_rate": 2.954783481064608e-05, + "loss": 0.407, "step": 63635 }, { - "epoch": 2.24, - "learning_rate": 3.0409607467689127e-05, - "loss": 0.2651, + "epoch": 2.2935812880671786, + "grad_norm": 0.2327142059803009, + "learning_rate": 2.954496533637302e-05, + "loss": 0.396, "step": 63640 }, { - "epoch": 2.24, - "learning_rate": 3.0406826202402466e-05, - "loss": 0.2479, + "epoch": 2.293761487728403, + "grad_norm": 0.22101731598377228, + "learning_rate": 2.954209580017568e-05, + "loss": 0.378, "step": 63645 }, { - "epoch": 2.24, - "learning_rate": 3.0404044866911736e-05, - "loss": 0.2829, + "epoch": 2.2939416873896277, + "grad_norm": 0.22895368933677673, + "learning_rate": 2.9539226202093162e-05, + "loss": 0.4179, "step": 63650 }, { - "epoch": 2.24, - "learning_rate": 3.0401263461253033e-05, - "loss": 0.2997, + "epoch": 2.2941218870508524, + "grad_norm": 0.19855442643165588, + "learning_rate": 2.9536356542164573e-05, + "loss": 0.3768, "step": 63655 }, { - "epoch": 2.24, - "learning_rate": 3.039848198546249e-05, - "loss": 0.2897, + "epoch": 2.294302086712077, + "grad_norm": 0.20044022798538208, + "learning_rate": 2.9533486820429e-05, + 
"loss": 0.3758, "step": 63660 }, { - "epoch": 2.24, - "learning_rate": 3.039570043957621e-05, - "loss": 0.278, + "epoch": 2.2944822863733014, + "grad_norm": 0.23310525715351105, + "learning_rate": 2.9530617036925545e-05, + "loss": 0.4063, "step": 63665 }, { - "epoch": 2.24, - "learning_rate": 3.039291882363032e-05, - "loss": 0.2656, + "epoch": 2.294662486034526, + "grad_norm": 0.2389223724603653, + "learning_rate": 2.9527747191693318e-05, + "loss": 0.3821, "step": 63670 }, { - "epoch": 2.24, - "learning_rate": 3.0390137137660933e-05, - "loss": 0.2906, + "epoch": 2.294842685695751, + "grad_norm": 0.21088966727256775, + "learning_rate": 2.9524877284771406e-05, + "loss": 0.4099, "step": 63675 }, { - "epoch": 2.24, - "learning_rate": 3.038735538170417e-05, - "loss": 0.28, + "epoch": 2.2950228853569756, + "grad_norm": 0.19195705652236938, + "learning_rate": 2.952200731619892e-05, + "loss": 0.3833, "step": 63680 }, { - "epoch": 2.24, - "learning_rate": 3.038457355579615e-05, - "loss": 0.2634, + "epoch": 2.2952030850182004, + "grad_norm": 0.1707751750946045, + "learning_rate": 2.9519137286014957e-05, + "loss": 0.3551, "step": 63685 }, { - "epoch": 2.24, - "learning_rate": 3.038179165997298e-05, - "loss": 0.2563, + "epoch": 2.2953832846794247, + "grad_norm": 0.23566772043704987, + "learning_rate": 2.9516267194258618e-05, + "loss": 0.3843, "step": 63690 }, { - "epoch": 2.24, - "learning_rate": 3.03790096942708e-05, - "loss": 0.2743, + "epoch": 2.2955634843406494, + "grad_norm": 0.19582681357860565, + "learning_rate": 2.9513397040969025e-05, + "loss": 0.4201, "step": 63695 }, { - "epoch": 2.24, - "learning_rate": 3.037622765872572e-05, - "loss": 0.2518, + "epoch": 2.295743684001874, + "grad_norm": 0.2070237249135971, + "learning_rate": 2.9510526826185263e-05, + "loss": 0.3976, "step": 63700 }, { - "epoch": 2.24, - "learning_rate": 3.0373445553373874e-05, - "loss": 0.2593, + "epoch": 2.295923883663099, + "grad_norm": 0.19391392171382904, + "learning_rate": 2.950765654994645e-05, + "loss": 0.4019, "step": 63705 }, { - "epoch": 2.24, - "learning_rate": 3.0370663378251374e-05, - "loss": 0.2665, + "epoch": 2.296104083324323, + "grad_norm": 0.2209123820066452, + "learning_rate": 2.9504786212291685e-05, + "loss": 0.3918, "step": 63710 }, { - "epoch": 2.24, - "learning_rate": 3.036788113339435e-05, - "loss": 0.2536, + "epoch": 2.296284282985548, + "grad_norm": 0.18248969316482544, + "learning_rate": 2.9501915813260084e-05, + "loss": 0.3817, "step": 63715 }, { - "epoch": 2.24, - "learning_rate": 3.0365098818838923e-05, - "loss": 0.2749, + "epoch": 2.2964644826467726, + "grad_norm": 0.17542307078838348, + "learning_rate": 2.9499045352890754e-05, + "loss": 0.3842, "step": 63720 }, { - "epoch": 2.24, - "learning_rate": 3.036231643462123e-05, - "loss": 0.2811, + "epoch": 2.2966446823079973, + "grad_norm": 0.19635504484176636, + "learning_rate": 2.9496174831222796e-05, + "loss": 0.3686, "step": 63725 }, { - "epoch": 2.24, - "learning_rate": 3.035953398077739e-05, - "loss": 0.2634, + "epoch": 2.296824881969222, + "grad_norm": 0.1903241127729416, + "learning_rate": 2.9493304248295327e-05, + "loss": 0.3946, "step": 63730 }, { - "epoch": 2.24, - "learning_rate": 3.0356751457343534e-05, - "loss": 0.2931, + "epoch": 2.2970050816304464, + "grad_norm": 0.19925326108932495, + "learning_rate": 2.949043360414746e-05, + "loss": 0.3765, "step": 63735 }, { - "epoch": 2.24, - "learning_rate": 3.035396886435579e-05, - "loss": 0.2714, + "epoch": 2.297185281291671, + "grad_norm": 0.19926919043064117, + "learning_rate": 
2.9487562898818304e-05, + "loss": 0.4365, "step": 63740 }, { - "epoch": 2.24, - "learning_rate": 3.0351186201850286e-05, - "loss": 0.2481, + "epoch": 2.297365480952896, + "grad_norm": 0.23359358310699463, + "learning_rate": 2.9484692132346974e-05, + "loss": 0.3822, "step": 63745 }, { - "epoch": 2.24, - "learning_rate": 3.0348403469863156e-05, - "loss": 0.2698, + "epoch": 2.2975456806141206, + "grad_norm": 0.2015082836151123, + "learning_rate": 2.9481821304772572e-05, + "loss": 0.404, "step": 63750 }, { - "epoch": 2.24, - "learning_rate": 3.0345620668430535e-05, - "loss": 0.2541, + "epoch": 2.297725880275345, + "grad_norm": 0.2047175019979477, + "learning_rate": 2.9478950416134243e-05, + "loss": 0.4028, "step": 63755 }, { - "epoch": 2.24, - "learning_rate": 3.0342837797588552e-05, - "loss": 0.2757, + "epoch": 2.2979060799365696, + "grad_norm": 0.16767185926437378, + "learning_rate": 2.9476079466471063e-05, + "loss": 0.412, "step": 63760 }, { - "epoch": 2.24, - "learning_rate": 3.0340054857373344e-05, - "loss": 0.255, + "epoch": 2.2980862795977943, + "grad_norm": 0.21474462747573853, + "learning_rate": 2.9473208455822178e-05, + "loss": 0.386, "step": 63765 }, { - "epoch": 2.24, - "learning_rate": 3.033727184782103e-05, - "loss": 0.2832, + "epoch": 2.298266479259019, + "grad_norm": 0.22542273998260498, + "learning_rate": 2.947033738422668e-05, + "loss": 0.3809, "step": 63770 }, { - "epoch": 2.24, - "learning_rate": 3.033448876896777e-05, - "loss": 0.2753, + "epoch": 2.298446678920244, + "grad_norm": 0.18820738792419434, + "learning_rate": 2.9467466251723713e-05, + "loss": 0.4045, "step": 63775 }, { - "epoch": 2.24, - "learning_rate": 3.033170562084969e-05, - "loss": 0.2628, + "epoch": 2.298626878581468, + "grad_norm": 0.23645508289337158, + "learning_rate": 2.9464595058352383e-05, + "loss": 0.3902, "step": 63780 }, { - "epoch": 2.24, - "learning_rate": 3.032892240350292e-05, - "loss": 0.277, + "epoch": 2.298807078242693, + "grad_norm": 0.2576570510864258, + "learning_rate": 2.9461723804151802e-05, + "loss": 0.4025, "step": 63785 }, { - "epoch": 2.24, - "learning_rate": 3.032613911696361e-05, - "loss": 0.3037, + "epoch": 2.2989872779039175, + "grad_norm": 0.2036072313785553, + "learning_rate": 2.9458852489161102e-05, + "loss": 0.4015, "step": 63790 }, { - "epoch": 2.24, - "learning_rate": 3.0323355761267892e-05, - "loss": 0.2693, + "epoch": 2.2991674775651423, + "grad_norm": 0.1797807663679123, + "learning_rate": 2.94559811134194e-05, + "loss": 0.3861, "step": 63795 }, { - "epoch": 2.24, - "learning_rate": 3.03205723364519e-05, - "loss": 0.2608, + "epoch": 2.2993476772263666, + "grad_norm": 0.2656286954879761, + "learning_rate": 2.945310967696583e-05, + "loss": 0.3931, "step": 63800 }, { - "epoch": 2.24, - "learning_rate": 3.031778884255179e-05, - "loss": 0.2782, + "epoch": 2.2995278768875913, + "grad_norm": 0.2161501944065094, + "learning_rate": 2.9450238179839483e-05, + "loss": 0.4066, "step": 63805 }, { - "epoch": 2.25, - "learning_rate": 3.0315005279603696e-05, - "loss": 0.2889, + "epoch": 2.299708076548816, + "grad_norm": 0.24081888794898987, + "learning_rate": 2.9447366622079515e-05, + "loss": 0.4195, "step": 63810 }, { - "epoch": 2.25, - "learning_rate": 3.031222164764376e-05, - "loss": 0.2652, + "epoch": 2.2998882762100408, + "grad_norm": 0.18698875606060028, + "learning_rate": 2.9444495003725033e-05, + "loss": 0.3576, "step": 63815 }, { - "epoch": 2.25, - "learning_rate": 3.030943794670813e-05, - "loss": 0.2428, + "epoch": 2.3000684758712655, + "grad_norm": 0.22651271522045135, + 
"learning_rate": 2.9441623324815166e-05, + "loss": 0.4172, "step": 63820 }, { - "epoch": 2.25, - "learning_rate": 3.030665417683295e-05, - "loss": 0.2678, + "epoch": 2.3002486755324902, + "grad_norm": 0.23613341152668, + "learning_rate": 2.9438751585389047e-05, + "loss": 0.4038, "step": 63825 }, { - "epoch": 2.25, - "learning_rate": 3.0303870338054354e-05, - "loss": 0.2648, + "epoch": 2.3004288751937145, + "grad_norm": 0.19608378410339355, + "learning_rate": 2.9435879785485788e-05, + "loss": 0.3828, "step": 63830 }, { - "epoch": 2.25, - "learning_rate": 3.0301086430408498e-05, - "loss": 0.2691, + "epoch": 2.3006090748549393, + "grad_norm": 0.19310517609119415, + "learning_rate": 2.943300792514453e-05, + "loss": 0.3665, "step": 63835 }, { - "epoch": 2.25, - "learning_rate": 3.0298302453931533e-05, - "loss": 0.2628, + "epoch": 2.300789274516164, + "grad_norm": 0.23690401017665863, + "learning_rate": 2.9430136004404402e-05, + "loss": 0.4279, "step": 63840 }, { - "epoch": 2.25, - "learning_rate": 3.02955184086596e-05, - "loss": 0.2702, + "epoch": 2.3009694741773887, + "grad_norm": 0.201737180352211, + "learning_rate": 2.9427264023304523e-05, + "loss": 0.3869, "step": 63845 }, { - "epoch": 2.25, - "learning_rate": 3.029273429462885e-05, - "loss": 0.2707, + "epoch": 2.301149673838613, + "grad_norm": 0.192640483379364, + "learning_rate": 2.942439198188403e-05, + "loss": 0.4047, "step": 63850 }, { - "epoch": 2.25, - "learning_rate": 3.028995011187543e-05, - "loss": 0.29, + "epoch": 2.3013298734998378, + "grad_norm": 0.17696553468704224, + "learning_rate": 2.9421519880182047e-05, + "loss": 0.3942, "step": 63855 }, { - "epoch": 2.25, - "learning_rate": 3.02871658604355e-05, - "loss": 0.2597, + "epoch": 2.3015100731610625, + "grad_norm": 0.19767604768276215, + "learning_rate": 2.9418647718237724e-05, + "loss": 0.4076, "step": 63860 }, { - "epoch": 2.25, - "learning_rate": 3.0284381540345203e-05, - "loss": 0.2668, + "epoch": 2.301690272822287, + "grad_norm": 0.25544825196266174, + "learning_rate": 2.9415775496090174e-05, + "loss": 0.3982, "step": 63865 }, { - "epoch": 2.25, - "learning_rate": 3.02815971516407e-05, - "loss": 0.2517, + "epoch": 2.301870472483512, + "grad_norm": 0.20909026265144348, + "learning_rate": 2.941290321377854e-05, + "loss": 0.4101, "step": 63870 }, { - "epoch": 2.25, - "learning_rate": 3.0278812694358127e-05, - "loss": 0.2788, + "epoch": 2.3020506721447362, + "grad_norm": 0.16474294662475586, + "learning_rate": 2.941003087134195e-05, + "loss": 0.3868, "step": 63875 }, { - "epoch": 2.25, - "learning_rate": 3.027602816853366e-05, - "loss": 0.2658, + "epoch": 2.302230871805961, + "grad_norm": 0.18018871545791626, + "learning_rate": 2.940715846881955e-05, + "loss": 0.3999, "step": 63880 }, { - "epoch": 2.25, - "learning_rate": 3.0273243574203436e-05, - "loss": 0.2614, + "epoch": 2.3024110714671857, + "grad_norm": 0.20449526607990265, + "learning_rate": 2.9404286006250464e-05, + "loss": 0.4119, "step": 63885 }, { - "epoch": 2.25, - "learning_rate": 3.0270458911403625e-05, - "loss": 0.2683, + "epoch": 2.3025912711284104, + "grad_norm": 0.2095811516046524, + "learning_rate": 2.9401413483673833e-05, + "loss": 0.3833, "step": 63890 }, { - "epoch": 2.25, - "learning_rate": 3.026767418017038e-05, - "loss": 0.2708, + "epoch": 2.3027714707896347, + "grad_norm": 0.19565238058567047, + "learning_rate": 2.93985409011288e-05, + "loss": 0.3665, "step": 63895 }, { - "epoch": 2.25, - "learning_rate": 3.026488938053985e-05, - "loss": 0.272, + "epoch": 2.3029516704508595, + "grad_norm": 
0.18948253989219666, + "learning_rate": 2.9395668258654497e-05, + "loss": 0.3988, "step": 63900 }, { - "epoch": 2.25, - "learning_rate": 3.0262104512548212e-05, - "loss": 0.2595, + "epoch": 2.303131870112084, + "grad_norm": 0.19964943826198578, + "learning_rate": 2.9392795556290064e-05, + "loss": 0.4107, "step": 63905 }, { - "epoch": 2.25, - "learning_rate": 3.0259319576231603e-05, - "loss": 0.2552, + "epoch": 2.303312069773309, + "grad_norm": 0.2215508222579956, + "learning_rate": 2.9389922794074643e-05, + "loss": 0.379, "step": 63910 }, { - "epoch": 2.25, - "learning_rate": 3.0256534571626206e-05, - "loss": 0.2814, + "epoch": 2.3034922694345337, + "grad_norm": 0.20707103610038757, + "learning_rate": 2.9387049972047376e-05, + "loss": 0.3797, "step": 63915 }, { - "epoch": 2.25, - "learning_rate": 3.0253749498768163e-05, - "loss": 0.2735, + "epoch": 2.303672469095758, + "grad_norm": 0.24852395057678223, + "learning_rate": 2.93841770902474e-05, + "loss": 0.4041, "step": 63920 }, { - "epoch": 2.25, - "learning_rate": 3.0250964357693652e-05, - "loss": 0.2509, + "epoch": 2.3038526687569827, + "grad_norm": 0.2447209507226944, + "learning_rate": 2.9381304148713856e-05, + "loss": 0.4051, "step": 63925 }, { - "epoch": 2.25, - "learning_rate": 3.0248179148438826e-05, - "loss": 0.2908, + "epoch": 2.3040328684182074, + "grad_norm": 0.22265037894248962, + "learning_rate": 2.9378431147485896e-05, + "loss": 0.396, "step": 63930 }, { - "epoch": 2.25, - "learning_rate": 3.024539387103985e-05, - "loss": 0.2475, + "epoch": 2.304213068079432, + "grad_norm": 0.18129128217697144, + "learning_rate": 2.9375558086602657e-05, + "loss": 0.3808, "step": 63935 }, { - "epoch": 2.25, - "learning_rate": 3.024260852553289e-05, - "loss": 0.2848, + "epoch": 2.3043932677406564, + "grad_norm": 0.25198519229888916, + "learning_rate": 2.9372684966103293e-05, + "loss": 0.4207, "step": 63940 }, { - "epoch": 2.25, - "learning_rate": 3.0239823111954124e-05, - "loss": 0.258, + "epoch": 2.304573467401881, + "grad_norm": 0.2193840593099594, + "learning_rate": 2.9369811786026935e-05, + "loss": 0.4081, "step": 63945 }, { - "epoch": 2.25, - "learning_rate": 3.02370376303397e-05, - "loss": 0.2805, + "epoch": 2.304753667063106, + "grad_norm": 0.25426429510116577, + "learning_rate": 2.9366938546412744e-05, + "loss": 0.3851, "step": 63950 }, { - "epoch": 2.25, - "learning_rate": 3.0234252080725796e-05, - "loss": 0.2801, + "epoch": 2.3049338667243306, + "grad_norm": 0.21279068291187286, + "learning_rate": 2.9364065247299848e-05, + "loss": 0.4081, "step": 63955 }, { - "epoch": 2.25, - "learning_rate": 3.0231466463148577e-05, - "loss": 0.2646, + "epoch": 2.3051140663855554, + "grad_norm": 0.1979341357946396, + "learning_rate": 2.9361191888727417e-05, + "loss": 0.3702, "step": 63960 }, { - "epoch": 2.25, - "learning_rate": 3.0228680777644214e-05, - "loss": 0.2717, + "epoch": 2.3052942660467797, + "grad_norm": 0.1835552603006363, + "learning_rate": 2.93583184707346e-05, + "loss": 0.3754, "step": 63965 }, { - "epoch": 2.25, - "learning_rate": 3.0225895024248874e-05, - "loss": 0.2827, + "epoch": 2.3054744657080044, + "grad_norm": 0.16004249453544617, + "learning_rate": 2.935544499336052e-05, + "loss": 0.4186, "step": 63970 }, { - "epoch": 2.25, - "learning_rate": 3.0223109202998734e-05, - "loss": 0.2582, + "epoch": 2.305654665369229, + "grad_norm": 0.22096280753612518, + "learning_rate": 2.9352571456644357e-05, + "loss": 0.3709, "step": 63975 }, { - "epoch": 2.25, - "learning_rate": 3.0220323313929965e-05, - "loss": 0.2712, + "epoch": 
2.305834865030454, + "grad_norm": 0.19364888966083527, + "learning_rate": 2.934969786062524e-05, + "loss": 0.3871, "step": 63980 }, { - "epoch": 2.25, - "learning_rate": 3.021753735707874e-05, - "loss": 0.2704, + "epoch": 2.306015064691678, + "grad_norm": 0.17691142857074738, + "learning_rate": 2.934682420534235e-05, + "loss": 0.4034, "step": 63985 }, { - "epoch": 2.25, - "learning_rate": 3.0214751332481218e-05, - "loss": 0.2845, + "epoch": 2.306195264352903, + "grad_norm": 0.239870086312294, + "learning_rate": 2.9343950490834806e-05, + "loss": 0.4087, "step": 63990 }, { - "epoch": 2.25, - "learning_rate": 3.02119652401736e-05, - "loss": 0.2768, + "epoch": 2.3063754640141276, + "grad_norm": 0.2206905335187912, + "learning_rate": 2.934107671714178e-05, + "loss": 0.3875, "step": 63995 }, { - "epoch": 2.25, - "learning_rate": 3.020917908019204e-05, - "loss": 0.2669, + "epoch": 2.3065556636753524, + "grad_norm": 0.18567106127738953, + "learning_rate": 2.9338202884302423e-05, + "loss": 0.4075, "step": 64000 }, { - "epoch": 2.25, - "eval_loss": 0.26856276392936707, - "eval_runtime": 10.5405, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 2.3065556636753524, + "eval_loss": 0.43620339035987854, + "eval_runtime": 3.5308, + "eval_samples_per_second": 28.322, + "eval_steps_per_second": 7.081, "step": 64000 }, { - "epoch": 2.25, - "learning_rate": 3.0206392852572733e-05, - "loss": 0.2684, + "epoch": 2.306735863336577, + "grad_norm": 0.1770346313714981, + "learning_rate": 2.93353289923559e-05, + "loss": 0.395, "step": 64005 }, { - "epoch": 2.25, - "learning_rate": 3.0203606557351836e-05, - "loss": 0.2997, + "epoch": 2.3069160629978014, + "grad_norm": 0.20676358044147491, + "learning_rate": 2.9332455041341355e-05, + "loss": 0.3684, "step": 64010 }, { - "epoch": 2.25, - "learning_rate": 3.0200820194565537e-05, - "loss": 0.297, + "epoch": 2.307096262659026, + "grad_norm": 0.21041785180568695, + "learning_rate": 2.932958103129794e-05, + "loss": 0.3907, "step": 64015 }, { - "epoch": 2.25, - "learning_rate": 3.0198033764250023e-05, - "loss": 0.2825, + "epoch": 2.307276462320251, + "grad_norm": 0.19155463576316833, + "learning_rate": 2.9326706962264832e-05, + "loss": 0.3741, "step": 64020 }, { - "epoch": 2.25, - "learning_rate": 3.0195247266441458e-05, - "loss": 0.2744, + "epoch": 2.3074566619814756, + "grad_norm": 0.1932283639907837, + "learning_rate": 2.9323832834281177e-05, + "loss": 0.3953, "step": 64025 }, { - "epoch": 2.25, - "learning_rate": 3.019246070117604e-05, - "loss": 0.2528, + "epoch": 2.3076368616427, + "grad_norm": 0.19772790372371674, + "learning_rate": 2.932095864738613e-05, + "loss": 0.4258, "step": 64030 }, { - "epoch": 2.25, - "learning_rate": 3.018967406848993e-05, - "loss": 0.2631, + "epoch": 2.3078170613039246, + "grad_norm": 0.18976783752441406, + "learning_rate": 2.9318084401618872e-05, + "loss": 0.403, "step": 64035 }, { - "epoch": 2.25, - "learning_rate": 3.0186887368419337e-05, - "loss": 0.2773, + "epoch": 2.3079972609651493, + "grad_norm": 0.19546537101268768, + "learning_rate": 2.9315210097018535e-05, + "loss": 0.3811, "step": 64040 }, { - "epoch": 2.25, - "learning_rate": 3.018410060100042e-05, - "loss": 0.2583, + "epoch": 2.308177460626374, + "grad_norm": 0.17727042734622955, + "learning_rate": 2.9312335733624312e-05, + "loss": 0.3558, "step": 64045 }, { - "epoch": 2.25, - "learning_rate": 3.0181313766269376e-05, - "loss": 0.2694, + "epoch": 2.308357660287599, + "grad_norm": 0.17102521657943726, + "learning_rate": 2.930946131147534e-05, + 
"loss": 0.3756, "step": 64050 }, { - "epoch": 2.25, - "learning_rate": 3.017852686426238e-05, - "loss": 0.2807, + "epoch": 2.3085378599488235, + "grad_norm": 0.21491068601608276, + "learning_rate": 2.9306586830610794e-05, + "loss": 0.4153, "step": 64055 }, { - "epoch": 2.25, - "learning_rate": 3.0175739895015636e-05, - "loss": 0.2918, + "epoch": 2.308718059610048, + "grad_norm": 0.18855595588684082, + "learning_rate": 2.930371229106983e-05, + "loss": 0.3981, "step": 64060 }, { - "epoch": 2.25, - "learning_rate": 3.0172952858565317e-05, - "loss": 0.3113, + "epoch": 2.3088982592712726, + "grad_norm": 0.19503138959407806, + "learning_rate": 2.9300837692891627e-05, + "loss": 0.4133, "step": 64065 }, { - "epoch": 2.25, - "learning_rate": 3.017016575494761e-05, - "loss": 0.2758, + "epoch": 2.3090784589324973, + "grad_norm": 0.18976575136184692, + "learning_rate": 2.929796303611534e-05, + "loss": 0.3941, "step": 64070 }, { - "epoch": 2.25, - "learning_rate": 3.0167378584198714e-05, - "loss": 0.2863, + "epoch": 2.309258658593722, + "grad_norm": 0.1784740537405014, + "learning_rate": 2.9295088320780133e-05, + "loss": 0.408, "step": 64075 }, { - "epoch": 2.25, - "learning_rate": 3.016459134635481e-05, - "loss": 0.2638, + "epoch": 2.3094388582549463, + "grad_norm": 0.21722306311130524, + "learning_rate": 2.929221354692519e-05, + "loss": 0.4086, "step": 64080 }, { - "epoch": 2.25, - "learning_rate": 3.0161804041452092e-05, - "loss": 0.2724, + "epoch": 2.309619057916171, + "grad_norm": 0.2144099622964859, + "learning_rate": 2.9289338714589664e-05, + "loss": 0.3952, "step": 64085 }, { - "epoch": 2.25, - "learning_rate": 3.0159016669526752e-05, - "loss": 0.2684, + "epoch": 2.309799257577396, + "grad_norm": 0.1988193541765213, + "learning_rate": 2.9286463823812732e-05, + "loss": 0.3712, "step": 64090 }, { - "epoch": 2.26, - "learning_rate": 3.0156229230614973e-05, - "loss": 0.2513, + "epoch": 2.3099794572386205, + "grad_norm": 0.18340793251991272, + "learning_rate": 2.928358887463355e-05, + "loss": 0.4021, "step": 64095 }, { - "epoch": 2.26, - "learning_rate": 3.015344172475296e-05, - "loss": 0.2973, + "epoch": 2.3101596568998453, + "grad_norm": 0.24040386080741882, + "learning_rate": 2.928071386709131e-05, + "loss": 0.4141, "step": 64100 }, { - "epoch": 2.26, - "learning_rate": 3.0150654151976905e-05, - "loss": 0.2869, + "epoch": 2.3103398565610695, + "grad_norm": 0.19517391920089722, + "learning_rate": 2.9277838801225165e-05, + "loss": 0.368, "step": 64105 }, { - "epoch": 2.26, - "learning_rate": 3.0147866512322996e-05, - "loss": 0.2659, + "epoch": 2.3105200562222943, + "grad_norm": 0.1790241003036499, + "learning_rate": 2.9274963677074298e-05, + "loss": 0.3828, "step": 64110 }, { - "epoch": 2.26, - "learning_rate": 3.0145078805827437e-05, - "loss": 0.3028, + "epoch": 2.310700255883519, + "grad_norm": 0.15677717328071594, + "learning_rate": 2.927208849467788e-05, + "loss": 0.4316, "step": 64115 }, { - "epoch": 2.26, - "learning_rate": 3.014229103252642e-05, - "loss": 0.2946, + "epoch": 2.3108804555447438, + "grad_norm": 0.21783356368541718, + "learning_rate": 2.9269213254075074e-05, + "loss": 0.4188, "step": 64120 }, { - "epoch": 2.26, - "learning_rate": 3.013950319245614e-05, - "loss": 0.276, + "epoch": 2.311060655205968, + "grad_norm": 0.23642495274543762, + "learning_rate": 2.926633795530508e-05, + "loss": 0.3957, "step": 64125 }, { - "epoch": 2.26, - "learning_rate": 3.0136715285652795e-05, - "loss": 0.2682, + "epoch": 2.3112408548671928, + "grad_norm": 0.18061873316764832, + "learning_rate": 
2.9263462598407044e-05, + "loss": 0.3771, "step": 64130 }, { - "epoch": 2.26, - "learning_rate": 3.0133927312152593e-05, - "loss": 0.2826, + "epoch": 2.3114210545284175, + "grad_norm": 0.1819910854101181, + "learning_rate": 2.9260587183420164e-05, + "loss": 0.3468, "step": 64135 }, { - "epoch": 2.26, - "learning_rate": 3.0131139271991726e-05, - "loss": 0.2862, + "epoch": 2.3116012541896422, + "grad_norm": 0.2120652049779892, + "learning_rate": 2.9257711710383595e-05, + "loss": 0.4319, "step": 64140 }, { - "epoch": 2.26, - "learning_rate": 3.0128351165206404e-05, - "loss": 0.2731, + "epoch": 2.311781453850867, + "grad_norm": 0.20193518698215485, + "learning_rate": 2.925483617933654e-05, + "loss": 0.3839, "step": 64145 }, { - "epoch": 2.26, - "learning_rate": 3.012556299183281e-05, - "loss": 0.2835, + "epoch": 2.3119616535120913, + "grad_norm": 0.19727912545204163, + "learning_rate": 2.9251960590318167e-05, + "loss": 0.4062, "step": 64150 }, { - "epoch": 2.26, - "learning_rate": 3.012277475190716e-05, - "loss": 0.2742, + "epoch": 2.312141853173316, + "grad_norm": 0.17611241340637207, + "learning_rate": 2.9249084943367648e-05, + "loss": 0.3645, "step": 64155 }, { - "epoch": 2.26, - "learning_rate": 3.0119986445465663e-05, - "loss": 0.268, + "epoch": 2.3123220528345407, + "grad_norm": 0.17656007409095764, + "learning_rate": 2.9246209238524176e-05, + "loss": 0.3498, "step": 64160 }, { - "epoch": 2.26, - "learning_rate": 3.0117198072544518e-05, - "loss": 0.2489, + "epoch": 2.3125022524957655, + "grad_norm": 0.1993638426065445, + "learning_rate": 2.9243333475826916e-05, + "loss": 0.381, "step": 64165 }, { - "epoch": 2.26, - "learning_rate": 3.011440963317992e-05, - "loss": 0.2575, + "epoch": 2.3126824521569898, + "grad_norm": 0.23477478325366974, + "learning_rate": 2.924045765531506e-05, + "loss": 0.361, "step": 64170 }, { - "epoch": 2.26, - "learning_rate": 3.011162112740809e-05, - "loss": 0.268, + "epoch": 2.3128626518182145, + "grad_norm": 0.23515748977661133, + "learning_rate": 2.92375817770278e-05, + "loss": 0.4028, "step": 64175 }, { - "epoch": 2.26, - "learning_rate": 3.0108832555265225e-05, - "loss": 0.272, + "epoch": 2.313042851479439, + "grad_norm": 0.17302124202251434, + "learning_rate": 2.9234705841004295e-05, + "loss": 0.4029, "step": 64180 }, { - "epoch": 2.26, - "learning_rate": 3.0106043916787534e-05, - "loss": 0.2692, + "epoch": 2.313223051140664, + "grad_norm": 0.2219003438949585, + "learning_rate": 2.9231829847283753e-05, + "loss": 0.3592, "step": 64185 }, { - "epoch": 2.26, - "learning_rate": 3.010325521201123e-05, - "loss": 0.2692, + "epoch": 2.3134032508018887, + "grad_norm": 0.23449034988880157, + "learning_rate": 2.922895379590535e-05, + "loss": 0.408, "step": 64190 }, { - "epoch": 2.26, - "learning_rate": 3.0100466440972523e-05, - "loss": 0.2626, + "epoch": 2.313583450463113, + "grad_norm": 0.21402695775032043, + "learning_rate": 2.9226077686908265e-05, + "loss": 0.4229, "step": 64195 }, { - "epoch": 2.26, - "learning_rate": 3.0097677603707614e-05, - "loss": 0.2806, + "epoch": 2.3137636501243377, + "grad_norm": 0.2237950712442398, + "learning_rate": 2.922320152033169e-05, + "loss": 0.4359, "step": 64200 }, { - "epoch": 2.26, - "learning_rate": 3.009488870025273e-05, - "loss": 0.2461, + "epoch": 2.3139438497855624, + "grad_norm": 0.20469123125076294, + "learning_rate": 2.922032529621481e-05, + "loss": 0.3704, "step": 64205 }, { - "epoch": 2.26, - "learning_rate": 3.0092099730644064e-05, - "loss": 0.2769, + "epoch": 2.314124049446787, + "grad_norm": 0.17067334055900574, + 
"learning_rate": 2.921744901459682e-05, + "loss": 0.3781, "step": 64210 }, { - "epoch": 2.26, - "learning_rate": 3.0089310694917845e-05, - "loss": 0.2555, + "epoch": 2.3143042491080115, + "grad_norm": 0.2015264630317688, + "learning_rate": 2.92145726755169e-05, + "loss": 0.4129, "step": 64215 }, { - "epoch": 2.26, - "learning_rate": 3.008652159311028e-05, - "loss": 0.279, + "epoch": 2.314484448769236, + "grad_norm": 0.19434256851673126, + "learning_rate": 2.9211696279014244e-05, + "loss": 0.3847, "step": 64220 }, { - "epoch": 2.26, - "learning_rate": 3.0083732425257583e-05, - "loss": 0.2507, + "epoch": 2.314664648430461, + "grad_norm": 0.18357601761817932, + "learning_rate": 2.920881982512804e-05, + "loss": 0.4103, "step": 64225 }, { - "epoch": 2.26, - "learning_rate": 3.008094319139597e-05, - "loss": 0.2701, + "epoch": 2.3148448480916857, + "grad_norm": 0.1933751106262207, + "learning_rate": 2.9205943313897487e-05, + "loss": 0.3704, "step": 64230 }, { - "epoch": 2.26, - "learning_rate": 3.007815389156166e-05, - "loss": 0.2702, + "epoch": 2.3150250477529104, + "grad_norm": 0.20393973588943481, + "learning_rate": 2.920306674536177e-05, + "loss": 0.3816, "step": 64235 }, { - "epoch": 2.26, - "learning_rate": 3.0075364525790865e-05, - "loss": 0.2656, + "epoch": 2.3152052474141347, + "grad_norm": 0.23503683507442474, + "learning_rate": 2.9200190119560077e-05, + "loss": 0.3898, "step": 64240 }, { - "epoch": 2.26, - "learning_rate": 3.0072575094119813e-05, - "loss": 0.281, + "epoch": 2.3153854470753594, + "grad_norm": 0.18053038418293, + "learning_rate": 2.9197313436531604e-05, + "loss": 0.393, "step": 64245 }, { - "epoch": 2.26, - "learning_rate": 3.006978559658471e-05, - "loss": 0.2743, + "epoch": 2.315565646736584, + "grad_norm": 0.21024245023727417, + "learning_rate": 2.919443669631555e-05, + "loss": 0.4292, "step": 64250 }, { - "epoch": 2.26, - "learning_rate": 3.0066996033221785e-05, - "loss": 0.2509, + "epoch": 2.315745846397809, + "grad_norm": 0.1868125945329666, + "learning_rate": 2.9191559898951115e-05, + "loss": 0.4292, "step": 64255 }, { - "epoch": 2.26, - "learning_rate": 3.0064206404067258e-05, - "loss": 0.2651, + "epoch": 2.315926046059033, + "grad_norm": 0.19413365423679352, + "learning_rate": 2.9188683044477484e-05, + "loss": 0.4133, "step": 64260 }, { - "epoch": 2.26, - "learning_rate": 3.0061416709157343e-05, - "loss": 0.2743, + "epoch": 2.316106245720258, + "grad_norm": 0.25181156396865845, + "learning_rate": 2.9185806132933856e-05, + "loss": 0.4184, "step": 64265 }, { - "epoch": 2.26, - "learning_rate": 3.005862694852827e-05, - "loss": 0.262, + "epoch": 2.3162864453814827, + "grad_norm": 0.20910336077213287, + "learning_rate": 2.9182929164359436e-05, + "loss": 0.375, "step": 64270 }, { - "epoch": 2.26, - "learning_rate": 3.005583712221626e-05, - "loss": 0.2638, + "epoch": 2.3164666450427074, + "grad_norm": 0.1750982105731964, + "learning_rate": 2.9180052138793413e-05, + "loss": 0.3782, "step": 64275 }, { - "epoch": 2.26, - "learning_rate": 3.0053047230257535e-05, - "loss": 0.2714, + "epoch": 2.316646844703932, + "grad_norm": 0.18748337030410767, + "learning_rate": 2.917717505627498e-05, + "loss": 0.3698, "step": 64280 }, { - "epoch": 2.26, - "learning_rate": 3.0050257272688327e-05, - "loss": 0.2561, + "epoch": 2.3168270443651564, + "grad_norm": 0.2219618856906891, + "learning_rate": 2.917429791684335e-05, + "loss": 0.4027, "step": 64285 }, { - "epoch": 2.26, - "learning_rate": 3.0047467249544854e-05, - "loss": 0.2601, + "epoch": 2.317007244026381, + "grad_norm": 
0.19255468249320984, + "learning_rate": 2.917142072053773e-05, + "loss": 0.373, "step": 64290 }, { - "epoch": 2.26, - "learning_rate": 3.0044677160863338e-05, - "loss": 0.2726, + "epoch": 2.317187443687606, + "grad_norm": 0.19871120154857635, + "learning_rate": 2.9168543467397298e-05, + "loss": 0.3987, "step": 64295 }, { - "epoch": 2.26, - "learning_rate": 3.0041887006680026e-05, - "loss": 0.2747, + "epoch": 2.3173676433488306, + "grad_norm": 0.2183811515569687, + "learning_rate": 2.9165666157461268e-05, + "loss": 0.3834, "step": 64300 }, { - "epoch": 2.26, - "learning_rate": 3.0039096787031123e-05, - "loss": 0.2926, + "epoch": 2.317547843010055, + "grad_norm": 0.19588343799114227, + "learning_rate": 2.9162788790768847e-05, + "loss": 0.4093, "step": 64305 }, { - "epoch": 2.26, - "learning_rate": 3.0036306501952882e-05, - "loss": 0.2767, + "epoch": 2.3177280426712796, + "grad_norm": 0.1578301340341568, + "learning_rate": 2.9159911367359238e-05, + "loss": 0.3794, "step": 64310 }, { - "epoch": 2.26, - "learning_rate": 3.0033516151481512e-05, - "loss": 0.291, + "epoch": 2.3179082423325044, + "grad_norm": 0.21837523579597473, + "learning_rate": 2.915703388727164e-05, + "loss": 0.4073, "step": 64315 }, { - "epoch": 2.26, - "learning_rate": 3.0030725735653246e-05, - "loss": 0.2831, + "epoch": 2.318088441993729, + "grad_norm": 0.2273416817188263, + "learning_rate": 2.9154156350545252e-05, + "loss": 0.3673, "step": 64320 }, { - "epoch": 2.26, - "learning_rate": 3.002793525450433e-05, - "loss": 0.248, + "epoch": 2.318268641654954, + "grad_norm": 0.2129017859697342, + "learning_rate": 2.9151278757219296e-05, + "loss": 0.3983, "step": 64325 }, { - "epoch": 2.26, - "learning_rate": 3.002514470807099e-05, - "loss": 0.2657, + "epoch": 2.3184488413161786, + "grad_norm": 0.17560027539730072, + "learning_rate": 2.9148401107332972e-05, + "loss": 0.3978, "step": 64330 }, { - "epoch": 2.26, - "learning_rate": 3.002235409638945e-05, - "loss": 0.2686, + "epoch": 2.318629040977403, + "grad_norm": 0.20387177169322968, + "learning_rate": 2.914552340092549e-05, + "loss": 0.4097, "step": 64335 }, { - "epoch": 2.26, - "learning_rate": 3.001956341949596e-05, - "loss": 0.2589, + "epoch": 2.3188092406386276, + "grad_norm": 0.23885492980480194, + "learning_rate": 2.9142645638036042e-05, + "loss": 0.3813, "step": 64340 }, { - "epoch": 2.26, - "learning_rate": 3.001677267742674e-05, - "loss": 0.2523, + "epoch": 2.3189894402998523, + "grad_norm": 0.20437073707580566, + "learning_rate": 2.9139767818703857e-05, + "loss": 0.3662, "step": 64345 }, { - "epoch": 2.26, - "learning_rate": 3.0013981870218034e-05, - "loss": 0.2848, + "epoch": 2.319169639961077, + "grad_norm": 0.20243240892887115, + "learning_rate": 2.9136889942968138e-05, + "loss": 0.3974, "step": 64350 }, { - "epoch": 2.26, - "learning_rate": 3.001119099790608e-05, - "loss": 0.2721, + "epoch": 2.3193498396223013, + "grad_norm": 0.19991594552993774, + "learning_rate": 2.9134012010868082e-05, + "loss": 0.4184, "step": 64355 }, { - "epoch": 2.26, - "learning_rate": 3.0008400060527113e-05, - "loss": 0.2901, + "epoch": 2.319530039283526, + "grad_norm": 0.1909300833940506, + "learning_rate": 2.913113402244293e-05, + "loss": 0.3714, "step": 64360 }, { - "epoch": 2.26, - "learning_rate": 3.0005609058117374e-05, - "loss": 0.2814, + "epoch": 2.319710238944751, + "grad_norm": 0.21020810306072235, + "learning_rate": 2.912825597773186e-05, + "loss": 0.4092, "step": 64365 }, { - "epoch": 2.26, - "learning_rate": 3.0002817990713096e-05, - "loss": 0.256, + "epoch": 
2.3198904386059755, + "grad_norm": 0.29738765954971313, + "learning_rate": 2.9125377876774118e-05, + "loss": 0.4022, "step": 64370 }, { - "epoch": 2.26, - "learning_rate": 3.0000026858350522e-05, - "loss": 0.2928, + "epoch": 2.3200706382672003, + "grad_norm": 0.21587322652339935, + "learning_rate": 2.9122499719608894e-05, + "loss": 0.4157, "step": 64375 }, { - "epoch": 2.27, - "learning_rate": 2.99972356610659e-05, - "loss": 0.2758, + "epoch": 2.3202508379284246, + "grad_norm": 0.19749046862125397, + "learning_rate": 2.9119621506275407e-05, + "loss": 0.3893, "step": 64380 }, { - "epoch": 2.27, - "learning_rate": 2.9994444398895464e-05, - "loss": 0.2717, + "epoch": 2.3204310375896493, + "grad_norm": 0.24363020062446594, + "learning_rate": 2.9116743236812878e-05, + "loss": 0.4346, "step": 64385 }, { - "epoch": 2.27, - "learning_rate": 2.999165307187546e-05, - "loss": 0.2795, + "epoch": 2.320611237250874, + "grad_norm": 0.18595632910728455, + "learning_rate": 2.911386491126052e-05, + "loss": 0.3947, "step": 64390 }, { - "epoch": 2.27, - "learning_rate": 2.998886168004213e-05, - "loss": 0.263, + "epoch": 2.3207914369120988, + "grad_norm": 0.2038854956626892, + "learning_rate": 2.9110986529657547e-05, + "loss": 0.4065, "step": 64395 }, { - "epoch": 2.27, - "learning_rate": 2.998607022343172e-05, - "loss": 0.2792, + "epoch": 2.320971636573323, + "grad_norm": 0.26512575149536133, + "learning_rate": 2.9108108092043173e-05, + "loss": 0.3958, "step": 64400 }, { - "epoch": 2.27, - "learning_rate": 2.9983278702080468e-05, - "loss": 0.2497, + "epoch": 2.321151836234548, + "grad_norm": 0.2215341031551361, + "learning_rate": 2.910522959845663e-05, + "loss": 0.3719, "step": 64405 }, { - "epoch": 2.27, - "learning_rate": 2.998048711602463e-05, - "loss": 0.2719, + "epoch": 2.3213320358957725, + "grad_norm": 0.20210741460323334, + "learning_rate": 2.9102351048937122e-05, + "loss": 0.4205, "step": 64410 }, { - "epoch": 2.27, - "learning_rate": 2.9977695465300455e-05, - "loss": 0.2633, + "epoch": 2.3215122355569973, + "grad_norm": 0.21246850490570068, + "learning_rate": 2.9099472443523885e-05, + "loss": 0.4175, "step": 64415 }, { - "epoch": 2.27, - "learning_rate": 2.997490374994418e-05, - "loss": 0.2752, + "epoch": 2.321692435218222, + "grad_norm": 0.23316890001296997, + "learning_rate": 2.9096593782256122e-05, + "loss": 0.4295, "step": 64420 }, { - "epoch": 2.27, - "learning_rate": 2.9972111969992055e-05, - "loss": 0.3034, + "epoch": 2.3218726348794463, + "grad_norm": 0.17650622129440308, + "learning_rate": 2.9093715065173066e-05, + "loss": 0.4047, "step": 64425 }, { - "epoch": 2.27, - "learning_rate": 2.996932012548033e-05, - "loss": 0.2559, + "epoch": 2.322052834540671, + "grad_norm": 0.20012375712394714, + "learning_rate": 2.9090836292313928e-05, + "loss": 0.4085, "step": 64430 }, { - "epoch": 2.27, - "learning_rate": 2.9966528216445265e-05, - "loss": 0.2829, + "epoch": 2.3222330342018958, + "grad_norm": 0.2151358723640442, + "learning_rate": 2.908795746371794e-05, + "loss": 0.3799, "step": 64435 }, { - "epoch": 2.27, - "learning_rate": 2.9963736242923107e-05, - "loss": 0.2869, + "epoch": 2.3224132338631205, + "grad_norm": 0.20195366442203522, + "learning_rate": 2.908507857942433e-05, + "loss": 0.3912, "step": 64440 }, { - "epoch": 2.27, - "learning_rate": 2.9960944204950105e-05, - "loss": 0.2867, + "epoch": 2.3225934335243448, + "grad_norm": 0.24572651088237762, + "learning_rate": 2.9082199639472306e-05, + "loss": 0.3691, "step": 64445 }, { - "epoch": 2.27, - "learning_rate": 2.9958152102562503e-05, - 
"loss": 0.2968, + "epoch": 2.3227736331855695, + "grad_norm": 0.20580525696277618, + "learning_rate": 2.907932064390111e-05, + "loss": 0.3815, "step": 64450 }, { - "epoch": 2.27, - "learning_rate": 2.9955359935796568e-05, - "loss": 0.2643, + "epoch": 2.3229538328467942, + "grad_norm": 0.26529133319854736, + "learning_rate": 2.9076441592749954e-05, + "loss": 0.3954, "step": 64455 }, { - "epoch": 2.27, - "learning_rate": 2.995256770468855e-05, - "loss": 0.2917, + "epoch": 2.323134032508019, + "grad_norm": 0.19922053813934326, + "learning_rate": 2.907356248605807e-05, + "loss": 0.3686, "step": 64460 }, { - "epoch": 2.27, - "learning_rate": 2.9949775409274706e-05, - "loss": 0.2657, + "epoch": 2.3233142321692437, + "grad_norm": 0.22310146689414978, + "learning_rate": 2.9071259160741603e-05, + "loss": 0.4364, "step": 64465 }, { - "epoch": 2.27, - "learning_rate": 2.994698304959129e-05, - "loss": 0.2644, + "epoch": 2.323494431830468, + "grad_norm": 0.19857893884181976, + "learning_rate": 2.9068379954175267e-05, + "loss": 0.4276, "step": 64470 }, { - "epoch": 2.27, - "learning_rate": 2.994419062567456e-05, - "loss": 0.2726, + "epoch": 2.3236746314916927, + "grad_norm": 0.20761960744857788, + "learning_rate": 2.906550069217804e-05, + "loss": 0.3837, "step": 64475 }, { - "epoch": 2.27, - "learning_rate": 2.9941398137560772e-05, - "loss": 0.272, + "epoch": 2.3238548311529175, + "grad_norm": 0.22300156950950623, + "learning_rate": 2.906262137478915e-05, + "loss": 0.421, "step": 64480 }, { - "epoch": 2.27, - "learning_rate": 2.9938605585286182e-05, - "loss": 0.2778, + "epoch": 2.324035030814142, + "grad_norm": 0.19981767237186432, + "learning_rate": 2.9059742002047835e-05, + "loss": 0.3987, "step": 64485 }, { - "epoch": 2.27, - "learning_rate": 2.9935812968887055e-05, - "loss": 0.2645, + "epoch": 2.3242152304753665, + "grad_norm": 0.21243509650230408, + "learning_rate": 2.9056862573993322e-05, + "loss": 0.4114, "step": 64490 }, { - "epoch": 2.27, - "learning_rate": 2.993302028839965e-05, - "loss": 0.2834, + "epoch": 2.3243954301365912, + "grad_norm": 0.22393706440925598, + "learning_rate": 2.9053983090664838e-05, + "loss": 0.3966, "step": 64495 }, { - "epoch": 2.27, - "learning_rate": 2.993022754386023e-05, - "loss": 0.2824, + "epoch": 2.324575629797816, + "grad_norm": 0.19257164001464844, + "learning_rate": 2.9051103552101623e-05, + "loss": 0.389, "step": 64500 }, { - "epoch": 2.27, - "eval_loss": 0.26770463585853577, - "eval_runtime": 10.5455, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 2.324575629797816, + "eval_loss": 0.4352463483810425, + "eval_runtime": 3.5326, + "eval_samples_per_second": 28.308, + "eval_steps_per_second": 7.077, "step": 64500 }, { - "epoch": 2.27, - "learning_rate": 2.992743473530505e-05, - "loss": 0.2888, + "epoch": 2.3247558294590407, + "grad_norm": 0.18349726498126984, + "learning_rate": 2.9048223958342906e-05, + "loss": 0.407, "step": 64505 }, { - "epoch": 2.27, - "learning_rate": 2.9924641862770376e-05, - "loss": 0.2819, + "epoch": 2.3249360291202654, + "grad_norm": 0.2214827537536621, + "learning_rate": 2.9045344309427924e-05, + "loss": 0.3542, "step": 64510 }, { - "epoch": 2.27, - "learning_rate": 2.992184892629248e-05, - "loss": 0.2553, + "epoch": 2.3251162287814897, + "grad_norm": 0.19785992801189423, + "learning_rate": 2.9042464605395898e-05, + "loss": 0.4088, "step": 64515 }, { - "epoch": 2.27, - "learning_rate": 2.991905592590762e-05, - "loss": 0.2743, + "epoch": 2.3252964284427144, + "grad_norm": 0.24504734575748444, + 
"learning_rate": 2.9039584846286083e-05, + "loss": 0.3953, "step": 64520 }, { - "epoch": 2.27, - "learning_rate": 2.991626286165205e-05, - "loss": 0.2775, + "epoch": 2.325476628103939, + "grad_norm": 0.22237750887870789, + "learning_rate": 2.9036705032137707e-05, + "loss": 0.3808, "step": 64525 }, { - "epoch": 2.27, - "learning_rate": 2.991346973356206e-05, - "loss": 0.2794, + "epoch": 2.325656827765164, + "grad_norm": 0.23211441934108734, + "learning_rate": 2.903382516299e-05, + "loss": 0.4144, "step": 64530 }, { - "epoch": 2.27, - "learning_rate": 2.9910676541673893e-05, - "loss": 0.2578, + "epoch": 2.325837027426388, + "grad_norm": 0.21520470082759857, + "learning_rate": 2.9030945238882214e-05, + "loss": 0.4161, "step": 64535 }, { - "epoch": 2.27, - "learning_rate": 2.990788328602383e-05, - "loss": 0.2791, + "epoch": 2.326017227087613, + "grad_norm": 0.21746765077114105, + "learning_rate": 2.9028065259853572e-05, + "loss": 0.4375, "step": 64540 }, { - "epoch": 2.27, - "learning_rate": 2.9905089966648137e-05, - "loss": 0.2512, + "epoch": 2.3261974267488377, + "grad_norm": 0.22863152623176575, + "learning_rate": 2.902518522594333e-05, + "loss": 0.409, "step": 64545 }, { - "epoch": 2.27, - "learning_rate": 2.990229658358309e-05, - "loss": 0.2719, + "epoch": 2.3263776264100624, + "grad_norm": 0.1936892420053482, + "learning_rate": 2.902230513719071e-05, + "loss": 0.4021, "step": 64550 }, { - "epoch": 2.27, - "learning_rate": 2.989950313686494e-05, - "loss": 0.2566, + "epoch": 2.326557826071287, + "grad_norm": 0.22578535974025726, + "learning_rate": 2.9019424993634968e-05, + "loss": 0.402, "step": 64555 }, { - "epoch": 2.27, - "learning_rate": 2.9896709626529986e-05, - "loss": 0.2486, + "epoch": 2.326738025732512, + "grad_norm": 0.23165376484394073, + "learning_rate": 2.901654479531533e-05, + "loss": 0.4211, "step": 64560 }, { - "epoch": 2.27, - "learning_rate": 2.9893916052614474e-05, - "loss": 0.2557, + "epoch": 2.326918225393736, + "grad_norm": 0.1904148906469345, + "learning_rate": 2.9013664542271057e-05, + "loss": 0.4131, "step": 64565 }, { - "epoch": 2.27, - "learning_rate": 2.989112241515469e-05, - "loss": 0.2585, + "epoch": 2.327098425054961, + "grad_norm": 0.16796857118606567, + "learning_rate": 2.9010784234541376e-05, + "loss": 0.4064, "step": 64570 }, { - "epoch": 2.27, - "learning_rate": 2.9888328714186898e-05, - "loss": 0.294, + "epoch": 2.3272786247161856, + "grad_norm": 0.1999785602092743, + "learning_rate": 2.900790387216553e-05, + "loss": 0.3945, "step": 64575 }, { - "epoch": 2.27, - "learning_rate": 2.988553494974739e-05, - "loss": 0.2827, + "epoch": 2.3274588243774104, + "grad_norm": 0.19684360921382904, + "learning_rate": 2.9005023455182784e-05, + "loss": 0.3999, "step": 64580 }, { - "epoch": 2.27, - "learning_rate": 2.9882741121872425e-05, - "loss": 0.2649, + "epoch": 2.3276390240386347, + "grad_norm": 0.20736943185329437, + "learning_rate": 2.9002142983632365e-05, + "loss": 0.3924, "step": 64585 }, { - "epoch": 2.27, - "learning_rate": 2.9879947230598283e-05, - "loss": 0.2765, + "epoch": 2.3278192236998594, + "grad_norm": 0.21448162198066711, + "learning_rate": 2.8999262457553518e-05, + "loss": 0.3711, "step": 64590 }, { - "epoch": 2.27, - "learning_rate": 2.9877153275961244e-05, - "loss": 0.2723, + "epoch": 2.327999423361084, + "grad_norm": 0.17774830758571625, + "learning_rate": 2.899638187698549e-05, + "loss": 0.3944, "step": 64595 }, { - "epoch": 2.27, - "learning_rate": 2.987435925799758e-05, - "loss": 0.2668, + "epoch": 2.328179623022309, + "grad_norm": 
0.19229644536972046, + "learning_rate": 2.899350124196754e-05, + "loss": 0.3717, "step": 64600 }, { - "epoch": 2.27, - "learning_rate": 2.9871565176743583e-05, - "loss": 0.2791, + "epoch": 2.3283598226835336, + "grad_norm": 0.22813409566879272, + "learning_rate": 2.8990620552538905e-05, + "loss": 0.4118, "step": 64605 }, { - "epoch": 2.27, - "learning_rate": 2.9868771032235514e-05, - "loss": 0.2842, + "epoch": 2.328540022344758, + "grad_norm": 0.16227515041828156, + "learning_rate": 2.898773980873884e-05, + "loss": 0.4017, "step": 64610 }, { - "epoch": 2.27, - "learning_rate": 2.9865976824509673e-05, - "loss": 0.2497, + "epoch": 2.3287202220059826, + "grad_norm": 0.20297592878341675, + "learning_rate": 2.898485901060659e-05, + "loss": 0.4339, "step": 64615 }, { - "epoch": 2.27, - "learning_rate": 2.986318255360232e-05, - "loss": 0.2682, + "epoch": 2.3289004216672073, + "grad_norm": 0.2151886522769928, + "learning_rate": 2.8981978158181412e-05, + "loss": 0.4276, "step": 64620 }, { - "epoch": 2.27, - "learning_rate": 2.9860388219549744e-05, - "loss": 0.2814, + "epoch": 2.329080621328432, + "grad_norm": 0.21326357126235962, + "learning_rate": 2.8979097251502548e-05, + "loss": 0.36, "step": 64625 }, { - "epoch": 2.27, - "learning_rate": 2.9857593822388237e-05, - "loss": 0.286, + "epoch": 2.3292608209896564, + "grad_norm": 0.20007814466953278, + "learning_rate": 2.8976216290609258e-05, + "loss": 0.4148, "step": 64630 }, { - "epoch": 2.27, - "learning_rate": 2.9854799362154074e-05, - "loss": 0.2544, + "epoch": 2.329441020650881, + "grad_norm": 0.20268851518630981, + "learning_rate": 2.8973335275540785e-05, + "loss": 0.407, "step": 64635 }, { - "epoch": 2.27, - "learning_rate": 2.985200483888354e-05, - "loss": 0.2455, + "epoch": 2.329621220312106, + "grad_norm": 0.2361319661140442, + "learning_rate": 2.8970454206336393e-05, + "loss": 0.4504, "step": 64640 }, { - "epoch": 2.27, - "learning_rate": 2.9849210252612926e-05, - "loss": 0.2825, + "epoch": 2.3298014199733306, + "grad_norm": 0.17205223441123962, + "learning_rate": 2.8967573083035327e-05, + "loss": 0.3959, "step": 64645 }, { - "epoch": 2.27, - "learning_rate": 2.9846415603378496e-05, - "loss": 0.2637, + "epoch": 2.3299816196345553, + "grad_norm": 0.19118523597717285, + "learning_rate": 2.8964691905676856e-05, + "loss": 0.4025, "step": 64650 }, { - "epoch": 2.27, - "learning_rate": 2.9843620891216566e-05, - "loss": 0.2724, + "epoch": 2.3301618192957796, + "grad_norm": 0.20011954009532928, + "learning_rate": 2.8961810674300217e-05, + "loss": 0.4142, "step": 64655 }, { - "epoch": 2.27, - "learning_rate": 2.984082611616341e-05, - "loss": 0.2574, + "epoch": 2.3303420189570043, + "grad_norm": 0.21023406088352203, + "learning_rate": 2.895892938894468e-05, + "loss": 0.4246, "step": 64660 }, { - "epoch": 2.28, - "learning_rate": 2.983803127825532e-05, - "loss": 0.2862, + "epoch": 2.330522218618229, + "grad_norm": 0.19074782729148865, + "learning_rate": 2.8956048049649487e-05, + "loss": 0.4024, "step": 64665 }, { - "epoch": 2.28, - "learning_rate": 2.9835236377528575e-05, - "loss": 0.2622, + "epoch": 2.330702418279454, + "grad_norm": 0.24010269343852997, + "learning_rate": 2.895316665645391e-05, + "loss": 0.3951, "step": 64670 }, { - "epoch": 2.28, - "learning_rate": 2.9832441414019474e-05, - "loss": 0.2929, + "epoch": 2.330882617940678, + "grad_norm": 0.23700617253780365, + "learning_rate": 2.8950285209397206e-05, + "loss": 0.3924, "step": 64675 }, { - "epoch": 2.28, - "learning_rate": 2.982964638776431e-05, - "loss": 0.2792, + "epoch": 
2.331062817601903, + "grad_norm": 0.23753370344638824, + "learning_rate": 2.8947403708518623e-05, + "loss": 0.4263, "step": 64680 }, { - "epoch": 2.28, - "learning_rate": 2.982685129879937e-05, - "loss": 0.2724, + "epoch": 2.3312430172631275, + "grad_norm": 0.17111685872077942, + "learning_rate": 2.8944522153857433e-05, + "loss": 0.4019, "step": 64685 }, { - "epoch": 2.28, - "learning_rate": 2.9824056147160944e-05, - "loss": 0.2722, + "epoch": 2.3314232169243523, + "grad_norm": 0.18538156151771545, + "learning_rate": 2.8941640545452898e-05, + "loss": 0.4268, "step": 64690 }, { - "epoch": 2.28, - "learning_rate": 2.982126093288533e-05, - "loss": 0.2795, + "epoch": 2.331603416585577, + "grad_norm": 0.2004711627960205, + "learning_rate": 2.893875888334427e-05, + "loss": 0.3993, "step": 64695 }, { - "epoch": 2.28, - "learning_rate": 2.981846565600882e-05, - "loss": 0.2508, + "epoch": 2.3317836162468013, + "grad_norm": 0.18873223662376404, + "learning_rate": 2.8935877167570814e-05, + "loss": 0.3939, "step": 64700 }, { - "epoch": 2.28, - "learning_rate": 2.9815670316567706e-05, - "loss": 0.2602, + "epoch": 2.331963815908026, + "grad_norm": 0.1974310278892517, + "learning_rate": 2.8932995398171793e-05, + "loss": 0.3892, "step": 64705 }, { - "epoch": 2.28, - "learning_rate": 2.9812874914598288e-05, - "loss": 0.2481, + "epoch": 2.3321440155692508, + "grad_norm": 0.2253679633140564, + "learning_rate": 2.8930113575186473e-05, + "loss": 0.4034, "step": 64710 }, { - "epoch": 2.28, - "learning_rate": 2.9810079450136863e-05, - "loss": 0.2904, + "epoch": 2.3323242152304755, + "grad_norm": 0.2432539016008377, + "learning_rate": 2.892723169865411e-05, + "loss": 0.4074, "step": 64715 }, { - "epoch": 2.28, - "learning_rate": 2.9807283923219724e-05, - "loss": 0.2735, + "epoch": 2.3325044148917, + "grad_norm": 0.22504882514476776, + "learning_rate": 2.8924349768613984e-05, + "loss": 0.4394, "step": 64720 }, { - "epoch": 2.28, - "learning_rate": 2.9804488333883172e-05, - "loss": 0.2911, + "epoch": 2.3326846145529245, + "grad_norm": 0.24001994729042053, + "learning_rate": 2.8921467785105342e-05, + "loss": 0.3778, "step": 64725 }, { - "epoch": 2.28, - "learning_rate": 2.9801692682163508e-05, - "loss": 0.2698, + "epoch": 2.3328648142141493, + "grad_norm": 0.19272451102733612, + "learning_rate": 2.8918585748167475e-05, + "loss": 0.4748, "step": 64730 }, { - "epoch": 2.28, - "learning_rate": 2.9798896968097025e-05, - "loss": 0.2831, + "epoch": 2.333045013875374, + "grad_norm": 0.20509444177150726, + "learning_rate": 2.8915703657839627e-05, + "loss": 0.4025, "step": 64735 }, { - "epoch": 2.28, - "learning_rate": 2.9796101191720033e-05, - "loss": 0.2828, + "epoch": 2.3332252135365987, + "grad_norm": 0.1780424416065216, + "learning_rate": 2.8912821514161077e-05, + "loss": 0.3849, "step": 64740 }, { - "epoch": 2.28, - "learning_rate": 2.9793305353068816e-05, - "loss": 0.2739, + "epoch": 2.333405413197823, + "grad_norm": 0.20018140971660614, + "learning_rate": 2.8909939317171086e-05, + "loss": 0.4019, "step": 64745 }, { - "epoch": 2.28, - "learning_rate": 2.9790509452179706e-05, - "loss": 0.2611, + "epoch": 2.3335856128590478, + "grad_norm": 0.22508153319358826, + "learning_rate": 2.8907057066908934e-05, + "loss": 0.3987, "step": 64750 }, { - "epoch": 2.28, - "learning_rate": 2.9787713489088975e-05, - "loss": 0.2712, + "epoch": 2.3337658125202725, + "grad_norm": 0.22255435585975647, + "learning_rate": 2.8904174763413888e-05, + "loss": 0.3795, "step": 64755 }, { - "epoch": 2.28, - "learning_rate": 2.978491746383294e-05, - 
"loss": 0.2625, + "epoch": 2.333946012181497, + "grad_norm": 0.17312286794185638, + "learning_rate": 2.890129240672521e-05, + "loss": 0.4022, "step": 64760 }, { - "epoch": 2.28, - "learning_rate": 2.97821213764479e-05, - "loss": 0.3069, + "epoch": 2.3341262118427215, + "grad_norm": 0.18471607565879822, + "learning_rate": 2.8898409996882182e-05, + "loss": 0.3776, "step": 64765 }, { - "epoch": 2.28, - "learning_rate": 2.9779325226970183e-05, - "loss": 0.2856, + "epoch": 2.3343064115039462, + "grad_norm": 0.19598588347434998, + "learning_rate": 2.8895527533924076e-05, + "loss": 0.3784, "step": 64770 }, { - "epoch": 2.28, - "learning_rate": 2.9776529015436065e-05, - "loss": 0.2727, + "epoch": 2.334486611165171, + "grad_norm": 0.20340387523174286, + "learning_rate": 2.889264501789015e-05, + "loss": 0.3934, "step": 64775 }, { - "epoch": 2.28, - "learning_rate": 2.977373274188187e-05, - "loss": 0.2896, + "epoch": 2.3346668108263957, + "grad_norm": 0.2492077350616455, + "learning_rate": 2.88897624488197e-05, + "loss": 0.4267, "step": 64780 }, { - "epoch": 2.28, - "learning_rate": 2.9770936406343895e-05, - "loss": 0.2778, + "epoch": 2.3348470104876204, + "grad_norm": 0.21682408452033997, + "learning_rate": 2.8886879826751982e-05, + "loss": 0.4055, "step": 64785 }, { - "epoch": 2.28, - "learning_rate": 2.9768140008858464e-05, - "loss": 0.2569, + "epoch": 2.3350272101488447, + "grad_norm": 0.2086905837059021, + "learning_rate": 2.8883997151726293e-05, + "loss": 0.353, "step": 64790 }, { - "epoch": 2.28, - "learning_rate": 2.9765343549461866e-05, - "loss": 0.2996, + "epoch": 2.3352074098100695, + "grad_norm": 0.18733155727386475, + "learning_rate": 2.8881114423781885e-05, + "loss": 0.372, "step": 64795 }, { - "epoch": 2.28, - "learning_rate": 2.9762547028190436e-05, - "loss": 0.2896, + "epoch": 2.335387609471294, + "grad_norm": 0.18295986950397491, + "learning_rate": 2.8878231642958044e-05, + "loss": 0.3722, "step": 64800 }, { - "epoch": 2.28, - "learning_rate": 2.9759750445080465e-05, - "loss": 0.2651, + "epoch": 2.335567809132519, + "grad_norm": 0.21052880585193634, + "learning_rate": 2.8875348809294044e-05, + "loss": 0.3814, "step": 64805 }, { - "epoch": 2.28, - "learning_rate": 2.9756953800168262e-05, - "loss": 0.2642, + "epoch": 2.3357480087937432, + "grad_norm": 0.20123307406902313, + "learning_rate": 2.887246592282917e-05, + "loss": 0.3888, "step": 64810 }, { - "epoch": 2.28, - "learning_rate": 2.975415709349016e-05, - "loss": 0.2653, + "epoch": 2.335928208454968, + "grad_norm": 0.23528067767620087, + "learning_rate": 2.8869582983602705e-05, + "loss": 0.3984, "step": 64815 }, { - "epoch": 2.28, - "learning_rate": 2.975136032508246e-05, - "loss": 0.2717, + "epoch": 2.3361084081161927, + "grad_norm": 0.22291633486747742, + "learning_rate": 2.8866699991653913e-05, + "loss": 0.4111, "step": 64820 }, { - "epoch": 2.28, - "learning_rate": 2.974856349498148e-05, - "loss": 0.2564, + "epoch": 2.3362886077774174, + "grad_norm": 0.16556419432163239, + "learning_rate": 2.8863816947022087e-05, + "loss": 0.3962, "step": 64825 }, { - "epoch": 2.28, - "learning_rate": 2.974576660322353e-05, - "loss": 0.2669, + "epoch": 2.336468807438642, + "grad_norm": 0.2400919497013092, + "learning_rate": 2.8860933849746496e-05, + "loss": 0.3776, "step": 64830 }, { - "epoch": 2.28, - "learning_rate": 2.9742969649844926e-05, - "loss": 0.2428, + "epoch": 2.336649007099867, + "grad_norm": 0.2065679430961609, + "learning_rate": 2.8858050699866442e-05, + "loss": 0.4298, "step": 64835 }, { - "epoch": 2.28, - "learning_rate": 
2.9740172634881986e-05, - "loss": 0.291, + "epoch": 2.336829206761091, + "grad_norm": 0.19219578802585602, + "learning_rate": 2.885516749742118e-05, + "loss": 0.3546, "step": 64840 }, { - "epoch": 2.28, - "learning_rate": 2.9737375558371026e-05, - "loss": 0.2643, + "epoch": 2.337009406422316, + "grad_norm": 0.2160220742225647, + "learning_rate": 2.8852284242450013e-05, + "loss": 0.4224, "step": 64845 }, { - "epoch": 2.28, - "learning_rate": 2.973457842034837e-05, - "loss": 0.2804, + "epoch": 2.3371896060835406, + "grad_norm": 0.17602595686912537, + "learning_rate": 2.8849400934992215e-05, + "loss": 0.4115, "step": 64850 }, { - "epoch": 2.28, - "learning_rate": 2.973178122085034e-05, - "loss": 0.2886, + "epoch": 2.3373698057447654, + "grad_norm": 0.21248827874660492, + "learning_rate": 2.8846517575087086e-05, + "loss": 0.4115, "step": 64855 }, { - "epoch": 2.28, - "learning_rate": 2.9728983959913243e-05, - "loss": 0.2789, + "epoch": 2.3375500054059897, + "grad_norm": 0.1953934133052826, + "learning_rate": 2.884363416277389e-05, + "loss": 0.388, "step": 64860 }, { - "epoch": 2.28, - "learning_rate": 2.9726186637573404e-05, - "loss": 0.2683, + "epoch": 2.3377302050672144, + "grad_norm": 0.19074764847755432, + "learning_rate": 2.8840750698091924e-05, + "loss": 0.3821, "step": 64865 }, { - "epoch": 2.28, - "learning_rate": 2.9723389253867146e-05, - "loss": 0.2639, + "epoch": 2.337910404728439, + "grad_norm": 0.20078332722187042, + "learning_rate": 2.883786718108047e-05, + "loss": 0.4138, "step": 64870 }, { - "epoch": 2.28, - "learning_rate": 2.9720591808830796e-05, - "loss": 0.2614, + "epoch": 2.338090604389664, + "grad_norm": 0.20209373533725739, + "learning_rate": 2.8834983611778827e-05, + "loss": 0.4012, "step": 64875 }, { - "epoch": 2.28, - "learning_rate": 2.971779430250067e-05, - "loss": 0.2755, + "epoch": 2.3382708040508886, + "grad_norm": 0.21952852606773376, + "learning_rate": 2.8832099990226268e-05, + "loss": 0.3731, "step": 64880 }, { - "epoch": 2.28, - "learning_rate": 2.97149967349131e-05, - "loss": 0.3162, + "epoch": 2.338451003712113, + "grad_norm": 0.18298974633216858, + "learning_rate": 2.8829216316462092e-05, + "loss": 0.3778, "step": 64885 }, { - "epoch": 2.28, - "learning_rate": 2.9712199106104392e-05, - "loss": 0.2874, + "epoch": 2.3386312033733376, + "grad_norm": 0.2111566960811615, + "learning_rate": 2.8826332590525583e-05, + "loss": 0.4025, "step": 64890 }, { - "epoch": 2.28, - "learning_rate": 2.9709401416110895e-05, - "loss": 0.2674, + "epoch": 2.3388114030345624, + "grad_norm": 0.21694448590278625, + "learning_rate": 2.882344881245604e-05, + "loss": 0.414, "step": 64895 }, { - "epoch": 2.28, - "learning_rate": 2.9706603664968913e-05, - "loss": 0.26, + "epoch": 2.338991602695787, + "grad_norm": 0.188668355345726, + "learning_rate": 2.8820564982292736e-05, + "loss": 0.4043, "step": 64900 }, { - "epoch": 2.28, - "learning_rate": 2.9703805852714796e-05, - "loss": 0.2534, + "epoch": 2.3391718023570114, + "grad_norm": 0.18398909270763397, + "learning_rate": 2.881768110007498e-05, + "loss": 0.3847, "step": 64905 }, { - "epoch": 2.28, - "learning_rate": 2.9701007979384855e-05, - "loss": 0.294, + "epoch": 2.339352002018236, + "grad_norm": 0.19492432475090027, + "learning_rate": 2.8814797165842057e-05, + "loss": 0.38, "step": 64910 }, { - "epoch": 2.28, - "learning_rate": 2.9698210045015424e-05, - "loss": 0.2643, + "epoch": 2.339532201679461, + "grad_norm": 0.16589581966400146, + "learning_rate": 2.881191317963326e-05, + "loss": 0.3701, "step": 64915 }, { - "epoch": 2.28, - 
"learning_rate": 2.9695971653595856e-05, - "loss": 0.2674, + "epoch": 2.3397124013406856, + "grad_norm": 0.23120105266571045, + "learning_rate": 2.8809029141487886e-05, + "loss": 0.4066, "step": 64920 }, { - "epoch": 2.28, - "learning_rate": 2.9693173609446895e-05, - "loss": 0.2858, + "epoch": 2.3398926010019103, + "grad_norm": 0.19977350533008575, + "learning_rate": 2.8806145051445225e-05, + "loss": 0.3919, "step": 64925 }, { - "epoch": 2.28, - "learning_rate": 2.9690375504360162e-05, - "loss": 0.2574, + "epoch": 2.3400728006631346, + "grad_norm": 0.2010623812675476, + "learning_rate": 2.8803260909544578e-05, + "loss": 0.4305, "step": 64930 }, { - "epoch": 2.28, - "learning_rate": 2.9687577338371997e-05, - "loss": 0.2608, + "epoch": 2.3402530003243593, + "grad_norm": 0.17667075991630554, + "learning_rate": 2.880037671582524e-05, + "loss": 0.3747, "step": 64935 }, { - "epoch": 2.28, - "learning_rate": 2.9684779111518728e-05, - "loss": 0.2728, + "epoch": 2.340433199985584, + "grad_norm": 0.15319837629795074, + "learning_rate": 2.8797492470326497e-05, + "loss": 0.3922, "step": 64940 }, { - "epoch": 2.28, - "learning_rate": 2.9681980823836696e-05, - "loss": 0.2791, + "epoch": 2.340613399646809, + "grad_norm": 0.20747162401676178, + "learning_rate": 2.8794608173087655e-05, + "loss": 0.3958, "step": 64945 }, { - "epoch": 2.29, - "learning_rate": 2.967918247536222e-05, - "loss": 0.2715, + "epoch": 2.340793599308033, + "grad_norm": 0.2174191176891327, + "learning_rate": 2.879172382414802e-05, + "loss": 0.3928, "step": 64950 }, { - "epoch": 2.29, - "learning_rate": 2.9676384066131646e-05, - "loss": 0.2619, + "epoch": 2.340973798969258, + "grad_norm": 0.21791304647922516, + "learning_rate": 2.8788839423546877e-05, + "loss": 0.4441, "step": 64955 }, { - "epoch": 2.29, - "learning_rate": 2.9673585596181303e-05, - "loss": 0.2621, + "epoch": 2.3411539986304826, + "grad_norm": 0.22638213634490967, + "learning_rate": 2.8785954971323526e-05, + "loss": 0.402, "step": 64960 }, { - "epoch": 2.29, - "learning_rate": 2.9670787065547538e-05, - "loss": 0.2665, + "epoch": 2.3413341982917073, + "grad_norm": 0.20641210675239563, + "learning_rate": 2.8783070467517277e-05, + "loss": 0.4119, "step": 64965 }, { - "epoch": 2.29, - "learning_rate": 2.9667988474266678e-05, - "loss": 0.2658, + "epoch": 2.341514397952932, + "grad_norm": 0.2296299785375595, + "learning_rate": 2.8780185912167424e-05, + "loss": 0.4335, "step": 64970 }, { - "epoch": 2.29, - "learning_rate": 2.966518982237507e-05, - "loss": 0.2694, + "epoch": 2.3416945976141563, + "grad_norm": 0.18867331743240356, + "learning_rate": 2.8777301305313276e-05, + "loss": 0.3613, "step": 64975 }, { - "epoch": 2.29, - "learning_rate": 2.9662391109909038e-05, - "loss": 0.2698, + "epoch": 2.341874797275381, + "grad_norm": 0.27165400981903076, + "learning_rate": 2.8774416646994117e-05, + "loss": 0.4134, "step": 64980 }, { - "epoch": 2.29, - "learning_rate": 2.9659592336904933e-05, - "loss": 0.279, + "epoch": 2.342054996936606, + "grad_norm": 0.23474054038524628, + "learning_rate": 2.877153193724927e-05, + "loss": 0.3988, "step": 64985 }, { - "epoch": 2.29, - "learning_rate": 2.9656793503399095e-05, - "loss": 0.2622, + "epoch": 2.3422351965978305, + "grad_norm": 0.2028888314962387, + "learning_rate": 2.8768647176118024e-05, + "loss": 0.3996, "step": 64990 }, { - "epoch": 2.29, - "learning_rate": 2.965399460942786e-05, - "loss": 0.2711, + "epoch": 2.342415396259055, + "grad_norm": 0.29794272780418396, + "learning_rate": 2.8765762363639692e-05, + "loss": 0.4289, "step": 
64995 }, { - "epoch": 2.29, - "learning_rate": 2.965119565502757e-05, - "loss": 0.2372, + "epoch": 2.3425955959202795, + "grad_norm": 0.2336840033531189, + "learning_rate": 2.8762877499853586e-05, + "loss": 0.3917, "step": 65000 }, { - "epoch": 2.29, - "eval_loss": 0.2671353220939636, - "eval_runtime": 10.5464, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 2.3425955959202795, + "eval_loss": 0.4352496862411499, + "eval_runtime": 3.5177, + "eval_samples_per_second": 28.428, + "eval_steps_per_second": 7.107, "step": 65000 }, { - "epoch": 2.29, - "learning_rate": 2.9648396640234578e-05, - "loss": 0.2522, + "epoch": 2.3427757955815043, + "grad_norm": 0.21938785910606384, + "learning_rate": 2.8759992584798988e-05, + "loss": 0.4125, "step": 65005 }, { - "epoch": 2.29, - "learning_rate": 2.9645597565085208e-05, - "loss": 0.2762, + "epoch": 2.342955995242729, + "grad_norm": 0.1729237288236618, + "learning_rate": 2.8757107618515227e-05, + "loss": 0.405, "step": 65010 }, { - "epoch": 2.29, - "learning_rate": 2.964279842961582e-05, - "loss": 0.2681, + "epoch": 2.3431361949039538, + "grad_norm": 0.19324518740177155, + "learning_rate": 2.8754222601041603e-05, + "loss": 0.3719, "step": 65015 }, { - "epoch": 2.29, - "learning_rate": 2.9639999233862752e-05, - "loss": 0.2478, + "epoch": 2.343316394565178, + "grad_norm": 0.2103443443775177, + "learning_rate": 2.8751337532417428e-05, + "loss": 0.4041, "step": 65020 }, { - "epoch": 2.29, - "learning_rate": 2.963719997786235e-05, - "loss": 0.2876, + "epoch": 2.3434965942264028, + "grad_norm": 0.2286413609981537, + "learning_rate": 2.8748452412681994e-05, + "loss": 0.4066, "step": 65025 }, { - "epoch": 2.29, - "learning_rate": 2.9634400661650973e-05, - "loss": 0.2809, + "epoch": 2.3436767938876275, + "grad_norm": 0.2286667674779892, + "learning_rate": 2.8745567241874627e-05, + "loss": 0.3864, "step": 65030 }, { - "epoch": 2.29, - "learning_rate": 2.9631601285264947e-05, - "loss": 0.2557, + "epoch": 2.3438569935488522, + "grad_norm": 0.25851571559906006, + "learning_rate": 2.874268202003463e-05, + "loss": 0.4042, "step": 65035 }, { - "epoch": 2.29, - "learning_rate": 2.9628801848740633e-05, - "loss": 0.2618, + "epoch": 2.3440371932100765, + "grad_norm": 0.17988860607147217, + "learning_rate": 2.8739796747201313e-05, + "loss": 0.412, "step": 65040 }, { - "epoch": 2.29, - "learning_rate": 2.962600235211438e-05, - "loss": 0.2638, + "epoch": 2.3442173928713013, + "grad_norm": 0.21041817963123322, + "learning_rate": 2.8736911423414e-05, + "loss": 0.4372, "step": 65045 }, { - "epoch": 2.29, - "learning_rate": 2.9623202795422532e-05, - "loss": 0.2836, + "epoch": 2.344397592532526, + "grad_norm": 0.25124192237854004, + "learning_rate": 2.8734026048711975e-05, + "loss": 0.3904, "step": 65050 }, { - "epoch": 2.29, - "learning_rate": 2.9620403178701445e-05, - "loss": 0.261, + "epoch": 2.3445777921937507, + "grad_norm": 0.18272101879119873, + "learning_rate": 2.8731140623134583e-05, + "loss": 0.3723, "step": 65055 }, { - "epoch": 2.29, - "learning_rate": 2.9617603501987473e-05, - "loss": 0.2787, + "epoch": 2.3447579918549755, + "grad_norm": 0.2411685287952423, + "learning_rate": 2.8728255146721117e-05, + "loss": 0.3785, "step": 65060 }, { - "epoch": 2.29, - "learning_rate": 2.9614803765316956e-05, - "loss": 0.2619, + "epoch": 2.3449381915162, + "grad_norm": 0.2054976224899292, + "learning_rate": 2.8725369619510895e-05, + "loss": 0.4264, "step": 65065 }, { - "epoch": 2.29, - "learning_rate": 2.9612003968726254e-05, - "loss": 0.2937, + 
"epoch": 2.3451183911774245, + "grad_norm": 0.3253336250782013, + "learning_rate": 2.8722484041543233e-05, + "loss": 0.3911, "step": 65070 }, { - "epoch": 2.29, - "learning_rate": 2.9609204112251715e-05, - "loss": 0.2591, + "epoch": 2.345298590838649, + "grad_norm": 0.23166240751743317, + "learning_rate": 2.8719598412857445e-05, + "loss": 0.3862, "step": 65075 }, { - "epoch": 2.29, - "learning_rate": 2.960640419592971e-05, - "loss": 0.2509, + "epoch": 2.345478790499874, + "grad_norm": 0.17862895131111145, + "learning_rate": 2.871671273349286e-05, + "loss": 0.3967, "step": 65080 }, { - "epoch": 2.29, - "learning_rate": 2.9603604219796573e-05, - "loss": 0.2657, + "epoch": 2.3456589901610987, + "grad_norm": 0.2060977816581726, + "learning_rate": 2.8713827003488776e-05, + "loss": 0.3993, "step": 65085 }, { - "epoch": 2.29, - "learning_rate": 2.9600804183888675e-05, - "loss": 0.2633, + "epoch": 2.345839189822323, + "grad_norm": 0.1897481232881546, + "learning_rate": 2.8710941222884515e-05, + "loss": 0.3673, "step": 65090 }, { - "epoch": 2.29, - "learning_rate": 2.9598004088242355e-05, - "loss": 0.2567, + "epoch": 2.3460193894835477, + "grad_norm": 0.2062787413597107, + "learning_rate": 2.8708055391719396e-05, + "loss": 0.429, "step": 65095 }, { - "epoch": 2.29, - "learning_rate": 2.9595203932893996e-05, - "loss": 0.2692, + "epoch": 2.3461995891447724, + "grad_norm": 0.21658870577812195, + "learning_rate": 2.8705169510032747e-05, + "loss": 0.4001, "step": 65100 }, { - "epoch": 2.29, - "learning_rate": 2.9592403717879935e-05, - "loss": 0.2629, + "epoch": 2.346379788805997, + "grad_norm": 0.2051638811826706, + "learning_rate": 2.8702283577863883e-05, + "loss": 0.3861, "step": 65105 }, { - "epoch": 2.29, - "learning_rate": 2.958960344323654e-05, - "loss": 0.2529, + "epoch": 2.346559988467222, + "grad_norm": 0.22257257997989655, + "learning_rate": 2.8699397595252116e-05, + "loss": 0.3834, "step": 65110 }, { - "epoch": 2.29, - "learning_rate": 2.958680310900017e-05, - "loss": 0.278, + "epoch": 2.346740188128446, + "grad_norm": 0.1782115399837494, + "learning_rate": 2.8696511562236776e-05, + "loss": 0.4001, "step": 65115 }, { - "epoch": 2.29, - "learning_rate": 2.9584002715207182e-05, - "loss": 0.2807, + "epoch": 2.346920387789671, + "grad_norm": 0.2436537742614746, + "learning_rate": 2.8693625478857182e-05, + "loss": 0.4371, "step": 65120 }, { - "epoch": 2.29, - "learning_rate": 2.9581202261893937e-05, - "loss": 0.2419, + "epoch": 2.3471005874508957, + "grad_norm": 0.22284133732318878, + "learning_rate": 2.8690739345152658e-05, + "loss": 0.4084, "step": 65125 }, { - "epoch": 2.29, - "learning_rate": 2.9578401749096806e-05, - "loss": 0.2967, + "epoch": 2.3472807871121204, + "grad_norm": 0.19572477042675018, + "learning_rate": 2.8687853161162514e-05, + "loss": 0.3609, "step": 65130 }, { - "epoch": 2.29, - "learning_rate": 2.9575601176852146e-05, - "loss": 0.2738, + "epoch": 2.3474609867733447, + "grad_norm": 0.21674886345863342, + "learning_rate": 2.8684966926926092e-05, + "loss": 0.3779, "step": 65135 }, { - "epoch": 2.29, - "learning_rate": 2.9572800545196317e-05, - "loss": 0.2837, + "epoch": 2.3476411864345694, + "grad_norm": 0.2136157900094986, + "learning_rate": 2.8682080642482717e-05, + "loss": 0.4004, "step": 65140 }, { - "epoch": 2.29, - "learning_rate": 2.9569999854165687e-05, - "loss": 0.2811, + "epoch": 2.347821386095794, + "grad_norm": 0.22586745023727417, + "learning_rate": 2.8679194307871694e-05, + "loss": 0.4183, "step": 65145 }, { - "epoch": 2.29, - "learning_rate": 
2.956719910379662e-05, - "loss": 0.2877, + "epoch": 2.348001585757019, + "grad_norm": 0.20550450682640076, + "learning_rate": 2.8676307923132367e-05, + "loss": 0.3483, "step": 65150 }, { - "epoch": 2.29, - "learning_rate": 2.9564398294125485e-05, - "loss": 0.2804, + "epoch": 2.3481817854182436, + "grad_norm": 0.18559207022190094, + "learning_rate": 2.8673421488304048e-05, + "loss": 0.3986, "step": 65155 }, { - "epoch": 2.29, - "learning_rate": 2.9561597425188642e-05, - "loss": 0.2756, + "epoch": 2.348361985079468, + "grad_norm": 0.23518766462802887, + "learning_rate": 2.867053500342609e-05, + "loss": 0.3831, "step": 65160 }, { - "epoch": 2.29, - "learning_rate": 2.955879649702247e-05, - "loss": 0.2892, + "epoch": 2.3485421847406927, + "grad_norm": 0.193081796169281, + "learning_rate": 2.866764846853779e-05, + "loss": 0.4086, "step": 65165 }, { - "epoch": 2.29, - "learning_rate": 2.955599550966332e-05, - "loss": 0.2629, + "epoch": 2.3487223844019174, + "grad_norm": 0.24136221408843994, + "learning_rate": 2.8664761883678493e-05, + "loss": 0.391, "step": 65170 }, { - "epoch": 2.29, - "learning_rate": 2.955319446314757e-05, - "loss": 0.2763, + "epoch": 2.348902584063142, + "grad_norm": 0.1842050552368164, + "learning_rate": 2.8661875248887515e-05, + "loss": 0.3772, "step": 65175 }, { - "epoch": 2.29, - "learning_rate": 2.9550393357511596e-05, - "loss": 0.2644, + "epoch": 2.3490827837243664, + "grad_norm": 0.16665658354759216, + "learning_rate": 2.865898856420421e-05, + "loss": 0.3921, "step": 65180 }, { - "epoch": 2.29, - "learning_rate": 2.9547592192791766e-05, - "loss": 0.2685, + "epoch": 2.349262983385591, + "grad_norm": 0.24213731288909912, + "learning_rate": 2.8656101829667892e-05, + "loss": 0.4142, "step": 65185 }, { - "epoch": 2.29, - "learning_rate": 2.954479096902445e-05, - "loss": 0.2634, + "epoch": 2.349443183046816, + "grad_norm": 0.16028551757335663, + "learning_rate": 2.8653215045317882e-05, + "loss": 0.4045, "step": 65190 }, { - "epoch": 2.29, - "learning_rate": 2.954198968624602e-05, - "loss": 0.304, + "epoch": 2.3496233827080406, + "grad_norm": 0.21428442001342773, + "learning_rate": 2.865032821119354e-05, + "loss": 0.3671, "step": 65195 }, { - "epoch": 2.29, - "learning_rate": 2.9539188344492837e-05, - "loss": 0.2807, + "epoch": 2.3498035823692653, + "grad_norm": 0.2171047478914261, + "learning_rate": 2.8647441327334166e-05, + "loss": 0.3788, "step": 65200 }, { - "epoch": 2.29, - "learning_rate": 2.9536386943801287e-05, - "loss": 0.3002, + "epoch": 2.3499837820304896, + "grad_norm": 0.17178626358509064, + "learning_rate": 2.8644554393779127e-05, + "loss": 0.404, "step": 65205 }, { - "epoch": 2.29, - "learning_rate": 2.9533585484207743e-05, - "loss": 0.253, + "epoch": 2.3501639816917144, + "grad_norm": 0.19908472895622253, + "learning_rate": 2.8641667410567736e-05, + "loss": 0.415, "step": 65210 }, { - "epoch": 2.29, - "learning_rate": 2.953078396574858e-05, - "loss": 0.295, + "epoch": 2.350344181352939, + "grad_norm": 0.1920705884695053, + "learning_rate": 2.8638780377739323e-05, + "loss": 0.4006, "step": 65215 }, { - "epoch": 2.29, - "learning_rate": 2.9527982388460175e-05, - "loss": 0.2554, + "epoch": 2.350524381014164, + "grad_norm": 0.22549034655094147, + "learning_rate": 2.863589329533324e-05, + "loss": 0.4142, "step": 65220 }, { - "epoch": 2.29, - "learning_rate": 2.9525180752378904e-05, - "loss": 0.274, + "epoch": 2.350704580675388, + "grad_norm": 0.19695831835269928, + "learning_rate": 2.8633006163388815e-05, + "loss": 0.3613, "step": 65225 }, { - "epoch": 2.29, - 
"learning_rate": 2.952237905754114e-05, - "loss": 0.275, + "epoch": 2.350884780336613, + "grad_norm": 0.2582644522190094, + "learning_rate": 2.8630118981945385e-05, + "loss": 0.3895, "step": 65230 }, { - "epoch": 2.3, - "learning_rate": 2.9519577303983264e-05, - "loss": 0.2911, + "epoch": 2.3510649799978376, + "grad_norm": 0.2204061895608902, + "learning_rate": 2.862723175104228e-05, + "loss": 0.4464, "step": 65235 }, { - "epoch": 2.3, - "learning_rate": 2.9516775491741654e-05, - "loss": 0.2803, + "epoch": 2.3512451796590623, + "grad_norm": 0.2712995707988739, + "learning_rate": 2.8624344470718852e-05, + "loss": 0.4043, "step": 65240 }, { - "epoch": 2.3, - "learning_rate": 2.95139736208527e-05, - "loss": 0.2559, + "epoch": 2.351425379320287, + "grad_norm": 0.17687354981899261, + "learning_rate": 2.862145714101443e-05, + "loss": 0.3772, "step": 65245 }, { - "epoch": 2.3, - "learning_rate": 2.9511171691352764e-05, - "loss": 0.2758, + "epoch": 2.3516055789815113, + "grad_norm": 0.22399690747261047, + "learning_rate": 2.861856976196835e-05, + "loss": 0.4168, "step": 65250 }, { - "epoch": 2.3, - "learning_rate": 2.950836970327823e-05, - "loss": 0.276, + "epoch": 2.351785778642736, + "grad_norm": 0.22232884168624878, + "learning_rate": 2.8615682333619957e-05, + "loss": 0.3784, "step": 65255 }, { - "epoch": 2.3, - "learning_rate": 2.9505567656665495e-05, - "loss": 0.2704, + "epoch": 2.351965978303961, + "grad_norm": 0.23076751828193665, + "learning_rate": 2.8612794856008595e-05, + "loss": 0.3769, "step": 65260 }, { - "epoch": 2.3, - "learning_rate": 2.950276555155093e-05, - "loss": 0.2763, + "epoch": 2.3521461779651855, + "grad_norm": 0.21458113193511963, + "learning_rate": 2.8609907329173613e-05, + "loss": 0.4084, "step": 65265 }, { - "epoch": 2.3, - "learning_rate": 2.949996338797093e-05, - "loss": 0.2608, + "epoch": 2.35232637762641, + "grad_norm": 0.22074739634990692, + "learning_rate": 2.860701975315433e-05, + "loss": 0.399, "step": 65270 }, { - "epoch": 2.3, - "learning_rate": 2.949716116596187e-05, - "loss": 0.2825, + "epoch": 2.3525065772876346, + "grad_norm": 0.20703339576721191, + "learning_rate": 2.8604132127990107e-05, + "loss": 0.3878, "step": 65275 }, { - "epoch": 2.3, - "learning_rate": 2.9494358885560138e-05, - "loss": 0.2638, + "epoch": 2.3526867769488593, + "grad_norm": 0.25511276721954346, + "learning_rate": 2.8601244453720276e-05, + "loss": 0.3976, "step": 65280 }, { - "epoch": 2.3, - "learning_rate": 2.9491556546802106e-05, - "loss": 0.2725, + "epoch": 2.352866976610084, + "grad_norm": 0.21042507886886597, + "learning_rate": 2.859835673038419e-05, + "loss": 0.4136, "step": 65285 }, { - "epoch": 2.3, - "learning_rate": 2.9488754149724175e-05, - "loss": 0.2593, + "epoch": 2.3530471762713088, + "grad_norm": 0.24099446833133698, + "learning_rate": 2.85954689580212e-05, + "loss": 0.4005, "step": 65290 }, { - "epoch": 2.3, - "learning_rate": 2.9485951694362734e-05, - "loss": 0.2668, + "epoch": 2.353227375932533, + "grad_norm": 0.18244995176792145, + "learning_rate": 2.8592581136670625e-05, + "loss": 0.3733, "step": 65295 }, { - "epoch": 2.3, - "learning_rate": 2.9483149180754168e-05, - "loss": 0.2634, + "epoch": 2.353407575593758, + "grad_norm": 0.25763794779777527, + "learning_rate": 2.8589693266371837e-05, + "loss": 0.394, "step": 65300 }, { - "epoch": 2.3, - "learning_rate": 2.948034660893486e-05, - "loss": 0.2597, + "epoch": 2.3535877752549825, + "grad_norm": 0.19655391573905945, + "learning_rate": 2.858680534716417e-05, + "loss": 0.4198, "step": 65305 }, { - "epoch": 2.3, - 
"learning_rate": 2.9477543978941213e-05, - "loss": 0.2492, + "epoch": 2.3537679749162073, + "grad_norm": 0.16229073703289032, + "learning_rate": 2.858391737908698e-05, + "loss": 0.3837, "step": 65310 }, { - "epoch": 2.3, - "learning_rate": 2.9474741290809592e-05, - "loss": 0.2786, + "epoch": 2.3539481745774316, + "grad_norm": 0.2285899817943573, + "learning_rate": 2.85810293621796e-05, + "loss": 0.4059, "step": 65315 }, { - "epoch": 2.3, - "learning_rate": 2.9471938544576417e-05, - "loss": 0.2849, + "epoch": 2.3541283742386563, + "grad_norm": 0.20628534257411957, + "learning_rate": 2.85781412964814e-05, + "loss": 0.4214, "step": 65320 }, { - "epoch": 2.3, - "learning_rate": 2.9469135740278066e-05, - "loss": 0.279, + "epoch": 2.354308573899881, + "grad_norm": 0.1957215815782547, + "learning_rate": 2.857525318203171e-05, + "loss": 0.391, "step": 65325 }, { - "epoch": 2.3, - "learning_rate": 2.946633287795093e-05, - "loss": 0.268, + "epoch": 2.3544887735611058, + "grad_norm": 0.1700507253408432, + "learning_rate": 2.8572365018869884e-05, + "loss": 0.3891, "step": 65330 }, { - "epoch": 2.3, - "learning_rate": 2.9463529957631404e-05, - "loss": 0.2755, + "epoch": 2.3546689732223305, + "grad_norm": 0.21826016902923584, + "learning_rate": 2.8569476807035278e-05, + "loss": 0.3958, "step": 65335 }, { - "epoch": 2.3, - "learning_rate": 2.946072697935588e-05, - "loss": 0.2858, + "epoch": 2.354849172883555, + "grad_norm": 0.1989716738462448, + "learning_rate": 2.8566588546567243e-05, + "loss": 0.4297, "step": 65340 }, { - "epoch": 2.3, - "learning_rate": 2.945792394316076e-05, - "loss": 0.2783, + "epoch": 2.3550293725447795, + "grad_norm": 0.20529745519161224, + "learning_rate": 2.856370023750513e-05, + "loss": 0.3922, "step": 65345 }, { - "epoch": 2.3, - "learning_rate": 2.9455120849082442e-05, - "loss": 0.2666, + "epoch": 2.3552095722060042, + "grad_norm": 0.1638742834329605, + "learning_rate": 2.8560811879888294e-05, + "loss": 0.3881, "step": 65350 }, { - "epoch": 2.3, - "learning_rate": 2.9452317697157307e-05, - "loss": 0.2716, + "epoch": 2.355389771867229, + "grad_norm": 0.16526326537132263, + "learning_rate": 2.855792347375608e-05, + "loss": 0.3958, "step": 65355 }, { - "epoch": 2.3, - "learning_rate": 2.9449514487421764e-05, - "loss": 0.2684, + "epoch": 2.3555699715284537, + "grad_norm": 0.17560982704162598, + "learning_rate": 2.8555035019147857e-05, + "loss": 0.3716, "step": 65360 }, { - "epoch": 2.3, - "learning_rate": 2.9446711219912203e-05, - "loss": 0.2683, + "epoch": 2.355750171189678, + "grad_norm": 0.21271197497844696, + "learning_rate": 2.855214651610296e-05, + "loss": 0.3795, "step": 65365 }, { - "epoch": 2.3, - "learning_rate": 2.9443907894665036e-05, - "loss": 0.2721, + "epoch": 2.3559303708509027, + "grad_norm": 0.18829388916492462, + "learning_rate": 2.8549257964660765e-05, + "loss": 0.4005, "step": 65370 }, { - "epoch": 2.3, - "learning_rate": 2.9441104511716645e-05, - "loss": 0.2548, + "epoch": 2.3561105705121275, + "grad_norm": 0.19685223698616028, + "learning_rate": 2.8546369364860608e-05, + "loss": 0.3688, "step": 65375 }, { - "epoch": 2.3, - "learning_rate": 2.943830107110344e-05, - "loss": 0.2763, + "epoch": 2.356290770173352, + "grad_norm": 0.2065257430076599, + "learning_rate": 2.854348071674186e-05, + "loss": 0.3742, "step": 65380 }, { - "epoch": 2.3, - "learning_rate": 2.9435497572861824e-05, - "loss": 0.2818, + "epoch": 2.356470969834577, + "grad_norm": 0.22780166566371918, + "learning_rate": 2.8540592020343872e-05, + "loss": 0.394, "step": 65385 }, { - "epoch": 2.3, - 
"learning_rate": 2.9432694017028195e-05, - "loss": 0.2857, + "epoch": 2.3566511694958012, + "grad_norm": 0.1743927299976349, + "learning_rate": 2.8537703275705997e-05, + "loss": 0.4064, "step": 65390 }, { - "epoch": 2.3, - "learning_rate": 2.9429890403638955e-05, - "loss": 0.2841, + "epoch": 2.356831369157026, + "grad_norm": 0.15711626410484314, + "learning_rate": 2.853481448286761e-05, + "loss": 0.3607, "step": 65395 }, { - "epoch": 2.3, - "learning_rate": 2.9427086732730506e-05, - "loss": 0.279, + "epoch": 2.3570115688182507, + "grad_norm": 0.2312869429588318, + "learning_rate": 2.853192564186805e-05, + "loss": 0.3852, "step": 65400 }, { - "epoch": 2.3, - "learning_rate": 2.942428300433926e-05, - "loss": 0.2693, + "epoch": 2.3571917684794754, + "grad_norm": 0.16384528577327728, + "learning_rate": 2.8529036752746697e-05, + "loss": 0.4077, "step": 65405 }, { - "epoch": 2.3, - "learning_rate": 2.9421479218501607e-05, - "loss": 0.2909, + "epoch": 2.3573719681406997, + "grad_norm": 0.24304990470409393, + "learning_rate": 2.8526147815542898e-05, + "loss": 0.4212, "step": 65410 }, { - "epoch": 2.3, - "learning_rate": 2.9418675375253968e-05, - "loss": 0.2845, + "epoch": 2.3575521678019244, + "grad_norm": 0.21454188227653503, + "learning_rate": 2.8523258830296017e-05, + "loss": 0.4003, "step": 65415 }, { - "epoch": 2.3, - "learning_rate": 2.9415871474632735e-05, - "loss": 0.2763, + "epoch": 2.357732367463149, + "grad_norm": 0.23705005645751953, + "learning_rate": 2.852036979704541e-05, + "loss": 0.3912, "step": 65420 }, { - "epoch": 2.3, - "learning_rate": 2.941306751667432e-05, - "loss": 0.2548, + "epoch": 2.357912567124374, + "grad_norm": 0.2235097885131836, + "learning_rate": 2.851748071583046e-05, + "loss": 0.4037, "step": 65425 }, { - "epoch": 2.3, - "learning_rate": 2.941026350141513e-05, - "loss": 0.2905, + "epoch": 2.3580927667855986, + "grad_norm": 0.2439454346895218, + "learning_rate": 2.851459158669051e-05, + "loss": 0.4144, "step": 65430 }, { - "epoch": 2.3, - "learning_rate": 2.9407459428891583e-05, - "loss": 0.2622, + "epoch": 2.358272966446823, + "grad_norm": 0.23461012542247772, + "learning_rate": 2.851170240966492e-05, + "loss": 0.4088, "step": 65435 }, { - "epoch": 2.3, - "learning_rate": 2.940465529914007e-05, - "loss": 0.2889, + "epoch": 2.3584531661080477, + "grad_norm": 0.21233990788459778, + "learning_rate": 2.8508813184793074e-05, + "loss": 0.3875, "step": 65440 }, { - "epoch": 2.3, - "learning_rate": 2.9401851112197022e-05, - "loss": 0.2797, + "epoch": 2.3586333657692724, + "grad_norm": 0.21089306473731995, + "learning_rate": 2.8505923912114324e-05, + "loss": 0.3812, "step": 65445 }, { - "epoch": 2.3, - "learning_rate": 2.9399046868098833e-05, - "loss": 0.2415, + "epoch": 2.358813565430497, + "grad_norm": 0.23103797435760498, + "learning_rate": 2.850303459166805e-05, + "loss": 0.4117, "step": 65450 }, { - "epoch": 2.3, - "learning_rate": 2.9396242566881916e-05, - "loss": 0.2487, + "epoch": 2.3589937650917214, + "grad_norm": 0.21833232045173645, + "learning_rate": 2.8500145223493595e-05, + "loss": 0.4149, "step": 65455 }, { - "epoch": 2.3, - "learning_rate": 2.939343820858269e-05, - "loss": 0.2523, + "epoch": 2.359173964752946, + "grad_norm": 0.20648404955863953, + "learning_rate": 2.8497255807630346e-05, + "loss": 0.4214, "step": 65460 }, { - "epoch": 2.3, - "learning_rate": 2.9390633793237566e-05, - "loss": 0.2733, + "epoch": 2.359354164414171, + "grad_norm": 0.1955990344285965, + "learning_rate": 2.849436634411766e-05, + "loss": 0.3793, "step": 65465 }, { - "epoch": 
2.3, - "learning_rate": 2.9387829320882954e-05, - "loss": 0.2728, + "epoch": 2.3595343640753956, + "grad_norm": 0.18235990405082703, + "learning_rate": 2.8491476832994912e-05, + "loss": 0.3988, "step": 65470 }, { - "epoch": 2.3, - "learning_rate": 2.9385024791555267e-05, - "loss": 0.2709, + "epoch": 2.3597145637366204, + "grad_norm": 0.19686979055404663, + "learning_rate": 2.848858727430147e-05, + "loss": 0.4026, "step": 65475 }, { - "epoch": 2.3, - "learning_rate": 2.9382220205290923e-05, - "loss": 0.2774, + "epoch": 2.3598947633978447, + "grad_norm": 0.16368722915649414, + "learning_rate": 2.8485697668076694e-05, + "loss": 0.4115, "step": 65480 }, { - "epoch": 2.3, - "learning_rate": 2.9379415562126344e-05, - "loss": 0.2885, + "epoch": 2.3600749630590694, + "grad_norm": 0.21965758502483368, + "learning_rate": 2.848280801435997e-05, + "loss": 0.3961, "step": 65485 }, { - "epoch": 2.3, - "learning_rate": 2.9376610862097936e-05, - "loss": 0.2733, + "epoch": 2.360255162720294, + "grad_norm": 0.1920892298221588, + "learning_rate": 2.8479918313190657e-05, + "loss": 0.4003, "step": 65490 }, { - "epoch": 2.3, - "learning_rate": 2.937380610524212e-05, - "loss": 0.2483, + "epoch": 2.360435362381519, + "grad_norm": 0.18833613395690918, + "learning_rate": 2.8477028564608126e-05, + "loss": 0.3545, "step": 65495 }, { - "epoch": 2.3, - "learning_rate": 2.9371001291595314e-05, - "loss": 0.2939, + "epoch": 2.360615562042743, + "grad_norm": 0.17606264352798462, + "learning_rate": 2.8474138768651764e-05, + "loss": 0.3993, "step": 65500 }, { - "epoch": 2.3, - "eval_loss": 0.26703858375549316, - "eval_runtime": 10.541, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 2.360615562042743, + "eval_loss": 0.43418970704078674, + "eval_runtime": 3.5195, + "eval_samples_per_second": 28.413, + "eval_steps_per_second": 7.103, "step": 65500 }, { - "epoch": 2.3, - "learning_rate": 2.936819642119394e-05, - "loss": 0.287, + "epoch": 2.360795761703968, + "grad_norm": 0.2237166315317154, + "learning_rate": 2.8471248925360927e-05, + "loss": 0.4224, "step": 65505 }, { - "epoch": 2.3, - "learning_rate": 2.9365391494074407e-05, - "loss": 0.2491, + "epoch": 2.3609759613651926, + "grad_norm": 0.2391594648361206, + "learning_rate": 2.8468359034775007e-05, + "loss": 0.3891, "step": 65510 }, { - "epoch": 2.3, - "learning_rate": 2.9362586510273143e-05, - "loss": 0.2859, + "epoch": 2.3611561610264173, + "grad_norm": 0.22656749188899994, + "learning_rate": 2.8465469096933352e-05, + "loss": 0.3941, "step": 65515 }, { - "epoch": 2.31, - "learning_rate": 2.935978146982658e-05, - "loss": 0.263, + "epoch": 2.361336360687642, + "grad_norm": 0.24217578768730164, + "learning_rate": 2.846257911187536e-05, + "loss": 0.3866, "step": 65520 }, { - "epoch": 2.31, - "learning_rate": 2.9356976372771117e-05, - "loss": 0.2789, + "epoch": 2.3615165603488664, + "grad_norm": 0.21993565559387207, + "learning_rate": 2.845968907964039e-05, + "loss": 0.4155, "step": 65525 }, { - "epoch": 2.31, - "learning_rate": 2.9354171219143193e-05, - "loss": 0.2859, + "epoch": 2.361696760010091, + "grad_norm": 0.2261502742767334, + "learning_rate": 2.845679900026783e-05, + "loss": 0.3222, "step": 65530 }, { - "epoch": 2.31, - "learning_rate": 2.9351366008979215e-05, - "loss": 0.2607, + "epoch": 2.361876959671316, + "grad_norm": 0.22237887978553772, + "learning_rate": 2.8453908873797058e-05, + "loss": 0.4065, "step": 65535 }, { - "epoch": 2.31, - "learning_rate": 2.934856074231563e-05, - "loss": 0.2644, + "epoch": 2.3620571593325406, + 
"grad_norm": 0.22978180646896362, + "learning_rate": 2.845101870026744e-05, + "loss": 0.3804, "step": 65540 }, { - "epoch": 2.31, - "learning_rate": 2.9345755419188843e-05, - "loss": 0.289, + "epoch": 2.362237358993765, + "grad_norm": 0.2067323923110962, + "learning_rate": 2.8448128479718363e-05, + "loss": 0.4058, "step": 65545 }, { - "epoch": 2.31, - "learning_rate": 2.9342950039635287e-05, - "loss": 0.287, + "epoch": 2.3624175586549896, + "grad_norm": 0.18871329724788666, + "learning_rate": 2.8445238212189208e-05, + "loss": 0.3929, "step": 65550 }, { - "epoch": 2.31, - "learning_rate": 2.9340144603691387e-05, - "loss": 0.2628, + "epoch": 2.3625977583162143, + "grad_norm": 0.20891360938549042, + "learning_rate": 2.8442347897719347e-05, + "loss": 0.3813, "step": 65555 }, { - "epoch": 2.31, - "learning_rate": 2.933733911139357e-05, - "loss": 0.2871, + "epoch": 2.362777957977439, + "grad_norm": 0.20510460436344147, + "learning_rate": 2.843945753634815e-05, + "loss": 0.3578, "step": 65560 }, { - "epoch": 2.31, - "learning_rate": 2.933453356277826e-05, - "loss": 0.2694, + "epoch": 2.362958157638664, + "grad_norm": 0.20665214955806732, + "learning_rate": 2.8436567128115022e-05, + "loss": 0.406, "step": 65565 }, { - "epoch": 2.31, - "learning_rate": 2.9331727957881892e-05, - "loss": 0.2994, + "epoch": 2.3631383572998885, + "grad_norm": 0.22137491405010223, + "learning_rate": 2.8433676673059335e-05, + "loss": 0.4062, "step": 65570 }, { - "epoch": 2.31, - "learning_rate": 2.932892229674089e-05, - "loss": 0.2825, + "epoch": 2.363318556961113, + "grad_norm": 0.17183031141757965, + "learning_rate": 2.8430786171220457e-05, + "loss": 0.3879, "step": 65575 }, { - "epoch": 2.31, - "learning_rate": 2.932611657939169e-05, - "loss": 0.2706, + "epoch": 2.3634987566223375, + "grad_norm": 0.2274576723575592, + "learning_rate": 2.842789562263779e-05, + "loss": 0.3876, "step": 65580 }, { - "epoch": 2.31, - "learning_rate": 2.932331080587071e-05, - "loss": 0.2541, + "epoch": 2.3636789562835623, + "grad_norm": 0.23537667095661163, + "learning_rate": 2.8425005027350704e-05, + "loss": 0.4282, "step": 65585 }, { - "epoch": 2.31, - "learning_rate": 2.9320504976214385e-05, - "loss": 0.2803, + "epoch": 2.363859155944787, + "grad_norm": 0.2088640332221985, + "learning_rate": 2.8422114385398597e-05, + "loss": 0.3923, "step": 65590 }, { - "epoch": 2.31, - "learning_rate": 2.9317699090459154e-05, - "loss": 0.2711, + "epoch": 2.3640393556060113, + "grad_norm": 0.19276586174964905, + "learning_rate": 2.841922369682083e-05, + "loss": 0.3885, "step": 65595 }, { - "epoch": 2.31, - "learning_rate": 2.9314893148641455e-05, - "loss": 0.2808, + "epoch": 2.364219555267236, + "grad_norm": 0.17863479256629944, + "learning_rate": 2.8416332961656812e-05, + "loss": 0.3814, "step": 65600 }, { - "epoch": 2.31, - "learning_rate": 2.9312087150797697e-05, - "loss": 0.2655, + "epoch": 2.3643997549284608, + "grad_norm": 0.21695052087306976, + "learning_rate": 2.8413442179945915e-05, + "loss": 0.395, "step": 65605 }, { - "epoch": 2.31, - "learning_rate": 2.9309281096964336e-05, - "loss": 0.2863, + "epoch": 2.3645799545896855, + "grad_norm": 0.20921245217323303, + "learning_rate": 2.8410551351727532e-05, + "loss": 0.3627, "step": 65610 }, { - "epoch": 2.31, - "learning_rate": 2.93064749871778e-05, - "loss": 0.2395, + "epoch": 2.3647601542509102, + "grad_norm": 0.18168587982654572, + "learning_rate": 2.8407660477041044e-05, + "loss": 0.3791, "step": 65615 }, { - "epoch": 2.31, - "learning_rate": 2.9303668821474522e-05, - "loss": 0.2728, + "epoch": 
2.3649403539121345, + "grad_norm": 0.21238493919372559, + "learning_rate": 2.840476955592584e-05, + "loss": 0.4178, "step": 65620 }, { - "epoch": 2.31, - "learning_rate": 2.9300862599890944e-05, - "loss": 0.2665, + "epoch": 2.3651205535733593, + "grad_norm": 0.19079379737377167, + "learning_rate": 2.8401878588421316e-05, + "loss": 0.3875, "step": 65625 }, { - "epoch": 2.31, - "learning_rate": 2.92980563224635e-05, - "loss": 0.2773, + "epoch": 2.365300753234584, + "grad_norm": 0.17326843738555908, + "learning_rate": 2.8398987574566848e-05, + "loss": 0.3903, "step": 65630 }, { - "epoch": 2.31, - "learning_rate": 2.9295249989228628e-05, - "loss": 0.2719, + "epoch": 2.3654809528958087, + "grad_norm": 0.2072458267211914, + "learning_rate": 2.8396096514401838e-05, + "loss": 0.4034, "step": 65635 }, { - "epoch": 2.31, - "learning_rate": 2.9292443600222764e-05, - "loss": 0.2502, + "epoch": 2.365661152557033, + "grad_norm": 0.22681084275245667, + "learning_rate": 2.839320540796567e-05, + "loss": 0.3799, "step": 65640 }, { - "epoch": 2.31, - "learning_rate": 2.9289637155482346e-05, - "loss": 0.2657, + "epoch": 2.3658413522182578, + "grad_norm": 0.20382151007652283, + "learning_rate": 2.839031425529773e-05, + "loss": 0.4067, "step": 65645 }, { - "epoch": 2.31, - "learning_rate": 2.9286830655043817e-05, - "loss": 0.2493, + "epoch": 2.3660215518794825, + "grad_norm": 0.19017788767814636, + "learning_rate": 2.8387423056437417e-05, + "loss": 0.4157, "step": 65650 }, { - "epoch": 2.31, - "learning_rate": 2.9284024098943618e-05, - "loss": 0.2523, + "epoch": 2.366201751540707, + "grad_norm": 0.24520789086818695, + "learning_rate": 2.8384531811424127e-05, + "loss": 0.415, "step": 65655 }, { - "epoch": 2.31, - "learning_rate": 2.9281217487218193e-05, - "loss": 0.2934, + "epoch": 2.366381951201932, + "grad_norm": 0.23502486944198608, + "learning_rate": 2.8381640520297244e-05, + "loss": 0.4009, "step": 65660 }, { - "epoch": 2.31, - "learning_rate": 2.9278410819903978e-05, - "loss": 0.2807, + "epoch": 2.3665621508631562, + "grad_norm": 0.2212107628583908, + "learning_rate": 2.8378749183096154e-05, + "loss": 0.3814, "step": 65665 }, { - "epoch": 2.31, - "learning_rate": 2.9275604097037418e-05, - "loss": 0.3029, + "epoch": 2.366742350524381, + "grad_norm": 0.16753430664539337, + "learning_rate": 2.8375857799860272e-05, + "loss": 0.3745, "step": 65670 }, { - "epoch": 2.31, - "learning_rate": 2.9272797318654954e-05, - "loss": 0.2729, + "epoch": 2.3669225501856057, + "grad_norm": 0.1829628348350525, + "learning_rate": 2.8372966370628978e-05, + "loss": 0.4017, "step": 65675 }, { - "epoch": 2.31, - "learning_rate": 2.926999048479303e-05, - "loss": 0.284, + "epoch": 2.3671027498468304, + "grad_norm": 0.210256427526474, + "learning_rate": 2.8370074895441666e-05, + "loss": 0.3829, "step": 65680 }, { - "epoch": 2.31, - "learning_rate": 2.9267183595488106e-05, - "loss": 0.2547, + "epoch": 2.3672829495080547, + "grad_norm": 0.21395932137966156, + "learning_rate": 2.8367183374337743e-05, + "loss": 0.3984, "step": 65685 }, { - "epoch": 2.31, - "learning_rate": 2.9264376650776604e-05, - "loss": 0.2649, + "epoch": 2.3674631491692795, + "grad_norm": 0.22579269111156464, + "learning_rate": 2.8364291807356587e-05, + "loss": 0.4096, "step": 65690 }, { - "epoch": 2.31, - "learning_rate": 2.9261569650694986e-05, - "loss": 0.2765, + "epoch": 2.367643348830504, + "grad_norm": 0.22070454061031342, + "learning_rate": 2.8361400194537623e-05, + "loss": 0.4038, "step": 65695 }, { - "epoch": 2.31, - "learning_rate": 2.9258762595279693e-05, - 
"loss": 0.2556, + "epoch": 2.367823548491729, + "grad_norm": 0.18013955652713776, + "learning_rate": 2.835850853592022e-05, + "loss": 0.3868, "step": 65700 }, { - "epoch": 2.31, - "learning_rate": 2.9255955484567176e-05, - "loss": 0.3017, + "epoch": 2.3680037481529537, + "grad_norm": 0.1805589497089386, + "learning_rate": 2.835561683154379e-05, + "loss": 0.3842, "step": 65705 }, { - "epoch": 2.31, - "learning_rate": 2.925314831859388e-05, - "loss": 0.2805, + "epoch": 2.368183947814178, + "grad_norm": 0.17691951990127563, + "learning_rate": 2.835272508144773e-05, + "loss": 0.3759, "step": 65710 }, { - "epoch": 2.31, - "learning_rate": 2.9250341097396262e-05, - "loss": 0.2693, + "epoch": 2.3683641474754027, + "grad_norm": 0.18184374272823334, + "learning_rate": 2.8349833285671444e-05, + "loss": 0.3885, "step": 65715 }, { - "epoch": 2.31, - "learning_rate": 2.9247533821010758e-05, - "loss": 0.2638, + "epoch": 2.3685443471366274, + "grad_norm": 0.18779203295707703, + "learning_rate": 2.8346941444254327e-05, + "loss": 0.3972, "step": 65720 }, { - "epoch": 2.31, - "learning_rate": 2.9244726489473827e-05, - "loss": 0.2623, + "epoch": 2.368724546797852, + "grad_norm": 0.20252971351146698, + "learning_rate": 2.8344049557235775e-05, + "loss": 0.3733, "step": 65725 }, { - "epoch": 2.31, - "learning_rate": 2.924191910282192e-05, - "loss": 0.2724, + "epoch": 2.3689047464590764, + "grad_norm": 0.17999647557735443, + "learning_rate": 2.8341157624655202e-05, + "loss": 0.3842, "step": 65730 }, { - "epoch": 2.31, - "learning_rate": 2.9239111661091488e-05, - "loss": 0.2673, + "epoch": 2.369084946120301, + "grad_norm": 0.2151997685432434, + "learning_rate": 2.8338265646552002e-05, + "loss": 0.4194, "step": 65735 }, { - "epoch": 2.31, - "learning_rate": 2.9236304164318995e-05, - "loss": 0.2617, + "epoch": 2.369265145781526, + "grad_norm": 0.22379030287265778, + "learning_rate": 2.8335373622965576e-05, + "loss": 0.4103, "step": 65740 }, { - "epoch": 2.31, - "learning_rate": 2.9233496612540874e-05, - "loss": 0.2669, + "epoch": 2.3694453454427507, + "grad_norm": 0.20626239478588104, + "learning_rate": 2.833248155393533e-05, + "loss": 0.4013, "step": 65745 }, { - "epoch": 2.31, - "learning_rate": 2.9230689005793598e-05, - "loss": 0.2578, + "epoch": 2.3696255451039754, + "grad_norm": 0.2125103622674942, + "learning_rate": 2.8329589439500677e-05, + "loss": 0.4139, "step": 65750 }, { - "epoch": 2.31, - "learning_rate": 2.92278813441136e-05, - "loss": 0.2796, + "epoch": 2.3698057447651997, + "grad_norm": 0.20649980008602142, + "learning_rate": 2.8326697279701002e-05, + "loss": 0.4195, "step": 65755 }, { - "epoch": 2.31, - "learning_rate": 2.9225073627537358e-05, - "loss": 0.2737, + "epoch": 2.3699859444264244, + "grad_norm": 0.20015624165534973, + "learning_rate": 2.832380507457572e-05, + "loss": 0.3944, "step": 65760 }, { - "epoch": 2.31, - "learning_rate": 2.9222265856101317e-05, - "loss": 0.2745, + "epoch": 2.370166144087649, + "grad_norm": 0.20391733944416046, + "learning_rate": 2.8320912824164248e-05, + "loss": 0.4466, "step": 65765 }, { - "epoch": 2.31, - "learning_rate": 2.9219458029841945e-05, - "loss": 0.2825, + "epoch": 2.370346343748874, + "grad_norm": 0.21429723501205444, + "learning_rate": 2.8318020528505967e-05, + "loss": 0.4048, "step": 65770 }, { - "epoch": 2.31, - "learning_rate": 2.9216650148795683e-05, - "loss": 0.2576, + "epoch": 2.370526543410098, + "grad_norm": 0.2534154951572418, + "learning_rate": 2.831512818764031e-05, + "loss": 0.3978, "step": 65775 }, { - "epoch": 2.31, - "learning_rate": 
2.9213842212998994e-05, - "loss": 0.2439, + "epoch": 2.370706743071323, + "grad_norm": 0.20116816461086273, + "learning_rate": 2.8312235801606674e-05, + "loss": 0.3887, "step": 65780 }, { - "epoch": 2.31, - "learning_rate": 2.9211034222488343e-05, - "loss": 0.2586, + "epoch": 2.3708869427325476, + "grad_norm": 0.199702188372612, + "learning_rate": 2.8309343370444457e-05, + "loss": 0.3907, "step": 65785 }, { - "epoch": 2.31, - "learning_rate": 2.9208226177300195e-05, - "loss": 0.2714, + "epoch": 2.3710671423937724, + "grad_norm": 0.21394966542720795, + "learning_rate": 2.8306450894193086e-05, + "loss": 0.3903, "step": 65790 }, { - "epoch": 2.31, - "learning_rate": 2.9205418077470998e-05, - "loss": 0.2634, + "epoch": 2.371247342054997, + "grad_norm": 0.21755504608154297, + "learning_rate": 2.8303558372891953e-05, + "loss": 0.3855, "step": 65795 }, { - "epoch": 2.32, - "learning_rate": 2.9202609923037222e-05, - "loss": 0.2927, + "epoch": 2.3714275417162214, + "grad_norm": 0.23619835078716278, + "learning_rate": 2.830066580658049e-05, + "loss": 0.4036, "step": 65800 }, { - "epoch": 2.32, - "learning_rate": 2.9199801714035324e-05, - "loss": 0.27, + "epoch": 2.371607741377446, + "grad_norm": 0.16918082535266876, + "learning_rate": 2.8297773195298084e-05, + "loss": 0.39, "step": 65805 }, { - "epoch": 2.32, - "learning_rate": 2.9196993450501764e-05, - "loss": 0.2663, + "epoch": 2.371787941038671, + "grad_norm": 0.20656408369541168, + "learning_rate": 2.8294880539084163e-05, + "loss": 0.3941, "step": 65810 }, { - "epoch": 2.32, - "learning_rate": 2.9194185132473013e-05, - "loss": 0.2596, + "epoch": 2.3719681406998956, + "grad_norm": 0.18240268528461456, + "learning_rate": 2.8291987837978125e-05, + "loss": 0.4155, "step": 65815 }, { - "epoch": 2.32, - "learning_rate": 2.9191376759985534e-05, - "loss": 0.268, + "epoch": 2.37214834036112, + "grad_norm": 0.19765466451644897, + "learning_rate": 2.8289095092019396e-05, + "loss": 0.3989, "step": 65820 }, { - "epoch": 2.32, - "learning_rate": 2.9188568333075787e-05, - "loss": 0.2997, + "epoch": 2.3723285400223446, + "grad_norm": 0.2510075271129608, + "learning_rate": 2.8286202301247382e-05, + "loss": 0.401, "step": 65825 }, { - "epoch": 2.32, - "learning_rate": 2.9185759851780247e-05, - "loss": 0.2841, + "epoch": 2.3725087396835693, + "grad_norm": 0.23590603470802307, + "learning_rate": 2.828330946570149e-05, + "loss": 0.4273, "step": 65830 }, { - "epoch": 2.32, - "learning_rate": 2.9182951316135367e-05, - "loss": 0.2822, + "epoch": 2.372688939344794, + "grad_norm": 0.20524172484874725, + "learning_rate": 2.8280416585421155e-05, + "loss": 0.3926, "step": 65835 }, { - "epoch": 2.32, - "learning_rate": 2.9180142726177616e-05, - "loss": 0.2858, + "epoch": 2.372869139006019, + "grad_norm": 0.21674352884292603, + "learning_rate": 2.8277523660445776e-05, + "loss": 0.3717, "step": 65840 }, { - "epoch": 2.32, - "learning_rate": 2.917733408194348e-05, - "loss": 0.2727, + "epoch": 2.3730493386672435, + "grad_norm": 0.20333261787891388, + "learning_rate": 2.827463069081477e-05, + "loss": 0.4245, "step": 65845 }, { - "epoch": 2.32, - "learning_rate": 2.9174525383469408e-05, - "loss": 0.2686, + "epoch": 2.373229538328468, + "grad_norm": 0.16487130522727966, + "learning_rate": 2.827173767656755e-05, + "loss": 0.3902, "step": 65850 }, { - "epoch": 2.32, - "learning_rate": 2.9171716630791878e-05, - "loss": 0.2368, + "epoch": 2.3734097379896926, + "grad_norm": 0.1875753402709961, + "learning_rate": 2.8268844617743544e-05, + "loss": 0.4189, "step": 65855 }, { - "epoch": 
2.32, - "learning_rate": 2.916890782394735e-05, - "loss": 0.2818, + "epoch": 2.3735899376509173, + "grad_norm": 0.20067238807678223, + "learning_rate": 2.826595151438216e-05, + "loss": 0.3785, "step": 65860 }, { - "epoch": 2.32, - "learning_rate": 2.9166098962972306e-05, - "loss": 0.256, + "epoch": 2.373770137312142, + "grad_norm": 0.23048771917819977, + "learning_rate": 2.826305836652282e-05, + "loss": 0.4162, "step": 65865 }, { - "epoch": 2.32, - "learning_rate": 2.9163290047903208e-05, - "loss": 0.2594, + "epoch": 2.3739503369733663, + "grad_norm": 0.19079884886741638, + "learning_rate": 2.8260165174204938e-05, + "loss": 0.3493, "step": 65870 }, { - "epoch": 2.32, - "learning_rate": 2.9160481078776535e-05, - "loss": 0.259, + "epoch": 2.374130536634591, + "grad_norm": 0.1478530913591385, + "learning_rate": 2.825727193746794e-05, + "loss": 0.3911, "step": 65875 }, { - "epoch": 2.32, - "learning_rate": 2.915767205562876e-05, - "loss": 0.278, + "epoch": 2.374310736295816, + "grad_norm": 0.2297230362892151, + "learning_rate": 2.825437865635125e-05, + "loss": 0.4319, "step": 65880 }, { - "epoch": 2.32, - "learning_rate": 2.9154862978496354e-05, - "loss": 0.2681, + "epoch": 2.3744909359570405, + "grad_norm": 0.22544041275978088, + "learning_rate": 2.8251485330894266e-05, + "loss": 0.3452, "step": 65885 }, { - "epoch": 2.32, - "learning_rate": 2.9152053847415782e-05, - "loss": 0.2755, + "epoch": 2.3746711356182653, + "grad_norm": 0.19545534253120422, + "learning_rate": 2.8248591961136435e-05, + "loss": 0.4169, "step": 65890 }, { - "epoch": 2.32, - "learning_rate": 2.9149244662423535e-05, - "loss": 0.2759, + "epoch": 2.3748513352794896, + "grad_norm": 0.18258170783519745, + "learning_rate": 2.8245698547117162e-05, + "loss": 0.3857, "step": 65895 }, { - "epoch": 2.32, - "learning_rate": 2.914643542355608e-05, - "loss": 0.2689, + "epoch": 2.3750315349407143, + "grad_norm": 0.19753779470920563, + "learning_rate": 2.8242805088875874e-05, + "loss": 0.3941, "step": 65900 }, { - "epoch": 2.32, - "learning_rate": 2.9143626130849895e-05, - "loss": 0.2655, + "epoch": 2.375211734601939, + "grad_norm": 0.21284906566143036, + "learning_rate": 2.8239911586452e-05, + "loss": 0.3825, "step": 65905 }, { - "epoch": 2.32, - "learning_rate": 2.9140816784341445e-05, - "loss": 0.2749, + "epoch": 2.3753919342631638, + "grad_norm": 0.18012243509292603, + "learning_rate": 2.823701803988495e-05, + "loss": 0.3937, "step": 65910 }, { - "epoch": 2.32, - "learning_rate": 2.913800738406723e-05, - "loss": 0.277, + "epoch": 2.375572133924388, + "grad_norm": 0.2071056365966797, + "learning_rate": 2.8234124449214163e-05, + "loss": 0.3982, "step": 65915 }, { - "epoch": 2.32, - "learning_rate": 2.9135197930063708e-05, - "loss": 0.2265, + "epoch": 2.3757523335856128, + "grad_norm": 0.21273143589496613, + "learning_rate": 2.8231230814479052e-05, + "loss": 0.3928, "step": 65920 }, { - "epoch": 2.32, - "learning_rate": 2.9132388422367375e-05, - "loss": 0.2685, + "epoch": 2.3759325332468375, + "grad_norm": 0.21067939698696136, + "learning_rate": 2.8228337135719046e-05, + "loss": 0.3612, "step": 65925 }, { - "epoch": 2.32, - "learning_rate": 2.912957886101469e-05, - "loss": 0.2863, + "epoch": 2.3761127329080622, + "grad_norm": 0.22029119729995728, + "learning_rate": 2.822544341297358e-05, + "loss": 0.3829, "step": 65930 }, { - "epoch": 2.32, - "learning_rate": 2.912676924604216e-05, - "loss": 0.2608, + "epoch": 2.376292932569287, + "grad_norm": 0.2073868066072464, + "learning_rate": 2.822254964628206e-05, + "loss": 0.4605, "step": 65935 
}, { - "epoch": 2.32, - "learning_rate": 2.912395957748624e-05, - "loss": 0.2767, + "epoch": 2.3764731322305113, + "grad_norm": 0.1779036968946457, + "learning_rate": 2.821965583568394e-05, + "loss": 0.3912, "step": 65940 }, { - "epoch": 2.32, - "learning_rate": 2.9121149855383417e-05, - "loss": 0.2627, + "epoch": 2.376653331891736, + "grad_norm": 0.1958044022321701, + "learning_rate": 2.821676198121862e-05, + "loss": 0.4263, "step": 65945 }, { - "epoch": 2.32, - "learning_rate": 2.911834007977019e-05, - "loss": 0.2609, + "epoch": 2.3768335315529607, + "grad_norm": 0.1888829916715622, + "learning_rate": 2.8213868082925542e-05, + "loss": 0.3649, "step": 65950 }, { - "epoch": 2.32, - "learning_rate": 2.911553025068302e-05, - "loss": 0.3032, + "epoch": 2.3770137312141855, + "grad_norm": 0.2218841016292572, + "learning_rate": 2.8210974140844137e-05, + "loss": 0.3925, "step": 65955 }, { - "epoch": 2.32, - "learning_rate": 2.911272036815842e-05, - "loss": 0.303, + "epoch": 2.3771939308754098, + "grad_norm": 0.19127243757247925, + "learning_rate": 2.8208080155013826e-05, + "loss": 0.4105, "step": 65960 }, { - "epoch": 2.32, - "learning_rate": 2.910991043223284e-05, - "loss": 0.2768, + "epoch": 2.3773741305366345, + "grad_norm": 0.19066543877124786, + "learning_rate": 2.8205186125474054e-05, + "loss": 0.3651, "step": 65965 }, { - "epoch": 2.32, - "learning_rate": 2.9107100442942793e-05, - "loss": 0.247, + "epoch": 2.3775543301978592, + "grad_norm": 0.1706237643957138, + "learning_rate": 2.8202292052264234e-05, + "loss": 0.4082, "step": 65970 }, { - "epoch": 2.32, - "learning_rate": 2.910429040032474e-05, - "loss": 0.2768, + "epoch": 2.377734529859084, + "grad_norm": 0.21013911068439484, + "learning_rate": 2.8199397935423805e-05, + "loss": 0.424, "step": 65975 }, { - "epoch": 2.32, - "learning_rate": 2.9101480304415196e-05, - "loss": 0.2787, + "epoch": 2.3779147295203087, + "grad_norm": 0.22082720696926117, + "learning_rate": 2.81965037749922e-05, + "loss": 0.3968, "step": 65980 }, { - "epoch": 2.32, - "learning_rate": 2.9098670155250624e-05, - "loss": 0.2531, + "epoch": 2.378094929181533, + "grad_norm": 0.21986991167068481, + "learning_rate": 2.8193609571008856e-05, + "loss": 0.4173, "step": 65985 }, { - "epoch": 2.32, - "learning_rate": 2.9095859952867527e-05, - "loss": 0.253, + "epoch": 2.3782751288427577, + "grad_norm": 0.22359991073608398, + "learning_rate": 2.819071532351319e-05, + "loss": 0.4279, "step": 65990 }, { - "epoch": 2.32, - "learning_rate": 2.9093049697302382e-05, - "loss": 0.2818, + "epoch": 2.3784553285039824, + "grad_norm": 0.18317356705665588, + "learning_rate": 2.8187821032544648e-05, + "loss": 0.382, "step": 65995 }, { - "epoch": 2.32, - "learning_rate": 2.9090239388591684e-05, - "loss": 0.2416, + "epoch": 2.378635528165207, + "grad_norm": 0.21548272669315338, + "learning_rate": 2.818492669814266e-05, + "loss": 0.3584, "step": 66000 }, { - "epoch": 2.32, - "eval_loss": 0.2667825520038605, - "eval_runtime": 10.5519, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 2.378635528165207, + "eval_loss": 0.4347288906574249, + "eval_runtime": 3.5302, + "eval_samples_per_second": 28.327, + "eval_steps_per_second": 7.082, "step": 66000 }, { - "epoch": 2.32, - "learning_rate": 2.9087429026771924e-05, - "loss": 0.2977, + "epoch": 2.3788157278264315, + "grad_norm": 0.2195282280445099, + "learning_rate": 2.8182032320346668e-05, + "loss": 0.3977, "step": 66005 }, { - "epoch": 2.32, - "learning_rate": 2.9084618611879598e-05, - "loss": 0.2883, + "epoch": 
2.378995927487656, + "grad_norm": 0.19582293927669525, + "learning_rate": 2.81791378991961e-05, + "loss": 0.3921, "step": 66010 }, { - "epoch": 2.32, - "learning_rate": 2.9081808143951185e-05, - "loss": 0.2677, + "epoch": 2.379176127148881, + "grad_norm": 0.1854901909828186, + "learning_rate": 2.8176243434730388e-05, + "loss": 0.3933, "step": 66015 }, { - "epoch": 2.32, - "learning_rate": 2.907899762302319e-05, - "loss": 0.2862, + "epoch": 2.3793563268101057, + "grad_norm": 0.2092180699110031, + "learning_rate": 2.817334892698898e-05, + "loss": 0.4335, "step": 66020 }, { - "epoch": 2.32, - "learning_rate": 2.907618704913209e-05, - "loss": 0.2749, + "epoch": 2.3795365264713304, + "grad_norm": 0.1754889339208603, + "learning_rate": 2.8170454376011307e-05, + "loss": 0.4288, "step": 66025 }, { - "epoch": 2.32, - "learning_rate": 2.907337642231439e-05, - "loss": 0.2824, + "epoch": 2.3797167261325547, + "grad_norm": 0.21382570266723633, + "learning_rate": 2.816755978183681e-05, + "loss": 0.4057, "step": 66030 }, { - "epoch": 2.32, - "learning_rate": 2.9070565742606582e-05, - "loss": 0.2703, + "epoch": 2.3798969257937794, + "grad_norm": 0.183826744556427, + "learning_rate": 2.8164665144504914e-05, + "loss": 0.3857, "step": 66035 }, { - "epoch": 2.32, - "learning_rate": 2.9067755010045167e-05, - "loss": 0.2743, + "epoch": 2.380077125455004, + "grad_norm": 0.17409420013427734, + "learning_rate": 2.8161770464055077e-05, + "loss": 0.394, "step": 66040 }, { - "epoch": 2.32, - "learning_rate": 2.9064944224666623e-05, - "loss": 0.2623, + "epoch": 2.380257325116229, + "grad_norm": 0.19534289836883545, + "learning_rate": 2.815887574052673e-05, + "loss": 0.4162, "step": 66045 }, { - "epoch": 2.32, - "learning_rate": 2.9062133386507467e-05, - "loss": 0.246, + "epoch": 2.380437524777453, + "grad_norm": 0.2002120018005371, + "learning_rate": 2.8155980973959308e-05, + "loss": 0.3953, "step": 66050 }, { - "epoch": 2.32, - "learning_rate": 2.9059322495604173e-05, - "loss": 0.2824, + "epoch": 2.380617724438678, + "grad_norm": 0.23090575635433197, + "learning_rate": 2.8153086164392258e-05, + "loss": 0.4104, "step": 66055 }, { - "epoch": 2.32, - "learning_rate": 2.9056511551993264e-05, - "loss": 0.2938, + "epoch": 2.3807979240999027, + "grad_norm": 0.22412335872650146, + "learning_rate": 2.8150191311865014e-05, + "loss": 0.4029, "step": 66060 }, { - "epoch": 2.32, - "learning_rate": 2.905370055571122e-05, - "loss": 0.2722, + "epoch": 2.3809781237611274, + "grad_norm": 0.1874934881925583, + "learning_rate": 2.814729641641703e-05, + "loss": 0.4051, "step": 66065 }, { - "epoch": 2.32, - "learning_rate": 2.9050889506794547e-05, - "loss": 0.26, + "epoch": 2.381158323422352, + "grad_norm": 0.23350152373313904, + "learning_rate": 2.8144401478087744e-05, + "loss": 0.4113, "step": 66070 }, { - "epoch": 2.32, - "learning_rate": 2.904807840527975e-05, - "loss": 0.2683, + "epoch": 2.381338523083577, + "grad_norm": 0.21121875941753387, + "learning_rate": 2.8141506496916586e-05, + "loss": 0.3871, "step": 66075 }, { - "epoch": 2.32, - "learning_rate": 2.904526725120332e-05, - "loss": 0.2762, + "epoch": 2.381518722744801, + "grad_norm": 0.1994001567363739, + "learning_rate": 2.8138611472943023e-05, + "loss": 0.4012, "step": 66080 }, { - "epoch": 2.33, - "learning_rate": 2.904245604460175e-05, - "loss": 0.2937, + "epoch": 2.381698922406026, + "grad_norm": 0.18203619122505188, + "learning_rate": 2.813571640620648e-05, + "loss": 0.3756, "step": 66085 }, { - "epoch": 2.33, - "learning_rate": 2.903964478551156e-05, - "loss": 0.2587, + 
"epoch": 2.3818791220672506, + "grad_norm": 0.18496555089950562, + "learning_rate": 2.813282129674641e-05, + "loss": 0.4213, "step": 66090 }, { - "epoch": 2.33, - "learning_rate": 2.903683347396925e-05, - "loss": 0.2781, + "epoch": 2.3820593217284753, + "grad_norm": 0.22467875480651855, + "learning_rate": 2.8129926144602247e-05, + "loss": 0.4402, "step": 66095 }, { - "epoch": 2.33, - "learning_rate": 2.9034022110011305e-05, - "loss": 0.2881, + "epoch": 2.3822395213896996, + "grad_norm": 0.1796996146440506, + "learning_rate": 2.812703094981346e-05, + "loss": 0.4387, "step": 66100 }, { - "epoch": 2.33, - "learning_rate": 2.9031210693674256e-05, - "loss": 0.2518, + "epoch": 2.3824197210509244, + "grad_norm": 0.20980200171470642, + "learning_rate": 2.8124135712419476e-05, + "loss": 0.4291, "step": 66105 }, { - "epoch": 2.33, - "learning_rate": 2.9028399224994586e-05, - "loss": 0.2767, + "epoch": 2.382599920712149, + "grad_norm": 0.1975623220205307, + "learning_rate": 2.8121240432459746e-05, + "loss": 0.3491, "step": 66110 }, { - "epoch": 2.33, - "learning_rate": 2.90255877040088e-05, - "loss": 0.2892, + "epoch": 2.382780120373374, + "grad_norm": 0.2462427318096161, + "learning_rate": 2.811834510997372e-05, + "loss": 0.3775, "step": 66115 }, { - "epoch": 2.33, - "learning_rate": 2.9022776130753414e-05, - "loss": 0.2792, + "epoch": 2.3829603200345986, + "grad_norm": 0.201690673828125, + "learning_rate": 2.8115449745000845e-05, + "loss": 0.4109, "step": 66120 }, { - "epoch": 2.33, - "learning_rate": 2.901996450526494e-05, - "loss": 0.2875, + "epoch": 2.383140519695823, + "grad_norm": 0.18545566499233246, + "learning_rate": 2.8112554337580577e-05, + "loss": 0.3889, "step": 66125 }, { - "epoch": 2.33, - "learning_rate": 2.901715282757987e-05, - "loss": 0.2502, + "epoch": 2.3833207193570476, + "grad_norm": 0.19686880707740784, + "learning_rate": 2.810965888775235e-05, + "loss": 0.3594, "step": 66130 }, { - "epoch": 2.33, - "learning_rate": 2.9014341097734717e-05, - "loss": 0.2566, + "epoch": 2.3835009190182723, + "grad_norm": 0.24028317630290985, + "learning_rate": 2.810676339555563e-05, + "loss": 0.4291, "step": 66135 }, { - "epoch": 2.33, - "learning_rate": 2.901152931576599e-05, - "loss": 0.2717, + "epoch": 2.383681118679497, + "grad_norm": 0.2691000998020172, + "learning_rate": 2.8103867861029852e-05, + "loss": 0.3862, "step": 66140 }, { - "epoch": 2.33, - "learning_rate": 2.90087174817102e-05, - "loss": 0.2481, + "epoch": 2.3838613183407213, + "grad_norm": 0.18645727634429932, + "learning_rate": 2.8100972284214476e-05, + "loss": 0.3964, "step": 66145 }, { - "epoch": 2.33, - "learning_rate": 2.9005905595603854e-05, - "loss": 0.2758, + "epoch": 2.384041518001946, + "grad_norm": 0.21478243172168732, + "learning_rate": 2.8098076665148965e-05, + "loss": 0.4038, "step": 66150 }, { - "epoch": 2.33, - "learning_rate": 2.900309365748347e-05, - "loss": 0.2538, + "epoch": 2.384221717663171, + "grad_norm": 0.22936570644378662, + "learning_rate": 2.8095181003872746e-05, + "loss": 0.3795, "step": 66155 }, { - "epoch": 2.33, - "learning_rate": 2.900028166738555e-05, - "loss": 0.2659, + "epoch": 2.3844019173243955, + "grad_norm": 0.22326894104480743, + "learning_rate": 2.809228530042529e-05, + "loss": 0.4041, "step": 66160 }, { - "epoch": 2.33, - "learning_rate": 2.89974696253466e-05, - "loss": 0.2753, + "epoch": 2.3845821169856203, + "grad_norm": 0.22681626677513123, + "learning_rate": 2.8089389554846048e-05, + "loss": 0.4157, "step": 66165 }, { - "epoch": 2.33, - "learning_rate": 2.8994657531403153e-05, - 
"loss": 0.2585, + "epoch": 2.3847623166468446, + "grad_norm": 0.19719672203063965, + "learning_rate": 2.808649376717447e-05, + "loss": 0.37, "step": 66170 }, { - "epoch": 2.33, - "learning_rate": 2.899184538559171e-05, - "loss": 0.2701, + "epoch": 2.3849425163080693, + "grad_norm": 0.22357481718063354, + "learning_rate": 2.8083597937450006e-05, + "loss": 0.385, "step": 66175 }, { - "epoch": 2.33, - "learning_rate": 2.8989033187948778e-05, - "loss": 0.2448, + "epoch": 2.385122715969294, + "grad_norm": 0.22523802518844604, + "learning_rate": 2.8080702065712127e-05, + "loss": 0.3971, "step": 66180 }, { - "epoch": 2.33, - "learning_rate": 2.8986220938510884e-05, - "loss": 0.2692, + "epoch": 2.3853029156305188, + "grad_norm": 0.2830953001976013, + "learning_rate": 2.8077806152000273e-05, + "loss": 0.3931, "step": 66185 }, { - "epoch": 2.33, - "learning_rate": 2.8983408637314535e-05, - "loss": 0.2859, + "epoch": 2.385483115291743, + "grad_norm": 0.21233409643173218, + "learning_rate": 2.8074910196353904e-05, + "loss": 0.3689, "step": 66190 }, { - "epoch": 2.33, - "learning_rate": 2.8980596284396246e-05, - "loss": 0.2792, + "epoch": 2.385663314952968, + "grad_norm": 0.20997081696987152, + "learning_rate": 2.8072014198812487e-05, + "loss": 0.3721, "step": 66195 }, { - "epoch": 2.33, - "learning_rate": 2.897778387979255e-05, - "loss": 0.2653, + "epoch": 2.3858435146141925, + "grad_norm": 0.19072303175926208, + "learning_rate": 2.8069118159415458e-05, + "loss": 0.3859, "step": 66200 }, { - "epoch": 2.33, - "learning_rate": 2.8974971423539937e-05, - "loss": 0.2734, + "epoch": 2.3860237142754173, + "grad_norm": 0.2057289481163025, + "learning_rate": 2.8066222078202303e-05, + "loss": 0.3966, "step": 66205 }, { - "epoch": 2.33, - "learning_rate": 2.897215891567495e-05, - "loss": 0.2732, + "epoch": 2.386203913936642, + "grad_norm": 0.1966446340084076, + "learning_rate": 2.806332595521246e-05, + "loss": 0.3794, "step": 66210 }, { - "epoch": 2.33, - "learning_rate": 2.8969346356234095e-05, - "loss": 0.2985, + "epoch": 2.3863841135978663, + "grad_norm": 0.2165762335062027, + "learning_rate": 2.8060429790485386e-05, + "loss": 0.4308, "step": 66215 }, { - "epoch": 2.33, - "learning_rate": 2.8966533745253893e-05, - "loss": 0.2597, + "epoch": 2.386564313259091, + "grad_norm": 0.1911364644765854, + "learning_rate": 2.8057533584060558e-05, + "loss": 0.3758, "step": 66220 }, { - "epoch": 2.33, - "learning_rate": 2.8963721082770863e-05, - "loss": 0.2916, + "epoch": 2.3867445129203158, + "grad_norm": 0.2317955642938614, + "learning_rate": 2.8054637335977423e-05, + "loss": 0.4062, "step": 66225 }, { - "epoch": 2.33, - "learning_rate": 2.896090836882153e-05, - "loss": 0.2687, + "epoch": 2.3869247125815405, + "grad_norm": 0.21293920278549194, + "learning_rate": 2.8051741046275453e-05, + "loss": 0.4013, "step": 66230 }, { - "epoch": 2.33, - "learning_rate": 2.895809560344241e-05, - "loss": 0.2851, + "epoch": 2.3871049122427648, + "grad_norm": 0.17797954380512238, + "learning_rate": 2.8048844714994092e-05, + "loss": 0.3899, "step": 66235 }, { - "epoch": 2.33, - "learning_rate": 2.895528278667003e-05, - "loss": 0.249, + "epoch": 2.3872851119039895, + "grad_norm": 0.21565930545330048, + "learning_rate": 2.8045948342172823e-05, + "loss": 0.3822, "step": 66240 }, { - "epoch": 2.33, - "learning_rate": 2.8952469918540908e-05, - "loss": 0.2783, + "epoch": 2.3874653115652142, + "grad_norm": 0.19301265478134155, + "learning_rate": 2.8043051927851083e-05, + "loss": 0.4029, "step": 66245 }, { - "epoch": 2.33, - "learning_rate": 
2.8949656999091568e-05, - "loss": 0.2905, + "epoch": 2.387645511226439, + "grad_norm": 0.20318666100502014, + "learning_rate": 2.8040155472068365e-05, + "loss": 0.3968, "step": 66250 }, { - "epoch": 2.33, - "learning_rate": 2.8946844028358533e-05, - "loss": 0.255, + "epoch": 2.3878257108876637, + "grad_norm": 0.18554876744747162, + "learning_rate": 2.8037258974864105e-05, + "loss": 0.3907, "step": 66255 }, { - "epoch": 2.33, - "learning_rate": 2.8944031006378332e-05, - "loss": 0.253, + "epoch": 2.388005910548888, + "grad_norm": 0.2470291256904602, + "learning_rate": 2.8034362436277784e-05, + "loss": 0.4145, "step": 66260 }, { - "epoch": 2.33, - "learning_rate": 2.8941217933187487e-05, - "loss": 0.2725, + "epoch": 2.3881861102101127, + "grad_norm": 0.20082451403141022, + "learning_rate": 2.8031465856348865e-05, + "loss": 0.4123, "step": 66265 }, { - "epoch": 2.33, - "learning_rate": 2.8938404808822527e-05, - "loss": 0.2718, + "epoch": 2.3883663098713375, + "grad_norm": 0.19475050270557404, + "learning_rate": 2.802856923511681e-05, + "loss": 0.4013, "step": 66270 }, { - "epoch": 2.33, - "learning_rate": 2.893559163331997e-05, - "loss": 0.2692, + "epoch": 2.388546509532562, + "grad_norm": 0.22373121976852417, + "learning_rate": 2.8025672572621088e-05, + "loss": 0.3921, "step": 66275 }, { - "epoch": 2.33, - "learning_rate": 2.8932778406716353e-05, - "loss": 0.2846, + "epoch": 2.3887267091937865, + "grad_norm": 0.25062522292137146, + "learning_rate": 2.8022775868901153e-05, + "loss": 0.4029, "step": 66280 }, { - "epoch": 2.33, - "learning_rate": 2.8929965129048197e-05, - "loss": 0.2979, + "epoch": 2.3889069088550112, + "grad_norm": 0.19427599012851715, + "learning_rate": 2.801987912399649e-05, + "loss": 0.3677, "step": 66285 }, { - "epoch": 2.33, - "learning_rate": 2.8927151800352042e-05, - "loss": 0.2936, + "epoch": 2.389087108516236, + "grad_norm": 0.1876882165670395, + "learning_rate": 2.8016982337946556e-05, + "loss": 0.3781, "step": 66290 }, { - "epoch": 2.33, - "learning_rate": 2.8924338420664403e-05, - "loss": 0.2795, + "epoch": 2.3892673081774607, + "grad_norm": 0.18821240961551666, + "learning_rate": 2.801408551079082e-05, + "loss": 0.3557, "step": 66295 }, { - "epoch": 2.33, - "learning_rate": 2.8921524990021816e-05, - "loss": 0.2871, + "epoch": 2.3894475078386854, + "grad_norm": 0.21924707293510437, + "learning_rate": 2.801118864256876e-05, + "loss": 0.4088, "step": 66300 }, { - "epoch": 2.33, - "learning_rate": 2.8918711508460805e-05, - "loss": 0.2471, + "epoch": 2.3896277074999097, + "grad_norm": 0.19032910466194153, + "learning_rate": 2.8008291733319824e-05, + "loss": 0.3856, "step": 66305 }, { - "epoch": 2.33, - "learning_rate": 2.8915897976017915e-05, - "loss": 0.2777, + "epoch": 2.3898079071611344, + "grad_norm": 0.1727897822856903, + "learning_rate": 2.800539478308351e-05, + "loss": 0.4037, "step": 66310 }, { - "epoch": 2.33, - "learning_rate": 2.891308439272967e-05, - "loss": 0.2781, + "epoch": 2.389988106822359, + "grad_norm": 0.21995660662651062, + "learning_rate": 2.8002497791899268e-05, + "loss": 0.4113, "step": 66315 }, { - "epoch": 2.33, - "learning_rate": 2.8910270758632606e-05, - "loss": 0.2709, + "epoch": 2.390168306483584, + "grad_norm": 0.20189730823040009, + "learning_rate": 2.7999600759806577e-05, + "loss": 0.3665, "step": 66320 }, { - "epoch": 2.33, - "learning_rate": 2.8907457073763254e-05, - "loss": 0.2599, + "epoch": 2.390348506144808, + "grad_norm": 0.21424639225006104, + "learning_rate": 2.79967036868449e-05, + "loss": 0.4236, "step": 66325 }, { - "epoch": 
2.33, - "learning_rate": 2.890464333815814e-05, - "loss": 0.291, + "epoch": 2.390528705806033, + "grad_norm": 0.2475886195898056, + "learning_rate": 2.7993806573053723e-05, + "loss": 0.4123, "step": 66330 }, { - "epoch": 2.33, - "learning_rate": 2.890182955185381e-05, - "loss": 0.2766, + "epoch": 2.3907089054672577, + "grad_norm": 0.17715711891651154, + "learning_rate": 2.7990909418472505e-05, + "loss": 0.4205, "step": 66335 }, { - "epoch": 2.33, - "learning_rate": 2.889901571488679e-05, - "loss": 0.2551, + "epoch": 2.3908891051284824, + "grad_norm": 0.20529335737228394, + "learning_rate": 2.7988012223140726e-05, + "loss": 0.4083, "step": 66340 }, { - "epoch": 2.33, - "learning_rate": 2.889620182729363e-05, - "loss": 0.2641, + "epoch": 2.391069304789707, + "grad_norm": 0.18749858438968658, + "learning_rate": 2.798511498709786e-05, + "loss": 0.4141, "step": 66345 }, { - "epoch": 2.33, - "learning_rate": 2.8893387889110845e-05, - "loss": 0.2837, + "epoch": 2.391249504450932, + "grad_norm": 0.18986235558986664, + "learning_rate": 2.7982217710383386e-05, + "loss": 0.4154, "step": 66350 }, { - "epoch": 2.33, - "learning_rate": 2.889057390037499e-05, - "loss": 0.2869, + "epoch": 2.391429704112156, + "grad_norm": 0.19709980487823486, + "learning_rate": 2.7979320393036762e-05, + "loss": 0.3889, "step": 66355 }, { - "epoch": 2.33, - "learning_rate": 2.8887759861122594e-05, - "loss": 0.2369, + "epoch": 2.391609903773381, + "grad_norm": 0.17932918667793274, + "learning_rate": 2.7976423035097486e-05, + "loss": 0.3893, "step": 66360 }, { - "epoch": 2.33, - "learning_rate": 2.8884945771390203e-05, - "loss": 0.2778, + "epoch": 2.3917901034346056, + "grad_norm": 0.20083005726337433, + "learning_rate": 2.7973525636605014e-05, + "loss": 0.4004, "step": 66365 }, { - "epoch": 2.34, - "learning_rate": 2.888213163121435e-05, - "loss": 0.277, + "epoch": 2.3919703030958304, + "grad_norm": 0.21819722652435303, + "learning_rate": 2.797062819759884e-05, + "loss": 0.414, "step": 66370 }, { - "epoch": 2.34, - "learning_rate": 2.887931744063158e-05, - "loss": 0.2471, + "epoch": 2.3921505027570547, + "grad_norm": 0.22273634374141693, + "learning_rate": 2.7967730718118424e-05, + "loss": 0.4096, "step": 66375 }, { - "epoch": 2.34, - "learning_rate": 2.8876503199678424e-05, - "loss": 0.2734, + "epoch": 2.3923307024182794, + "grad_norm": 0.18526360392570496, + "learning_rate": 2.7964833198203254e-05, + "loss": 0.3985, "step": 66380 }, { - "epoch": 2.34, - "learning_rate": 2.8873688908391426e-05, - "loss": 0.2756, + "epoch": 2.392510902079504, + "grad_norm": 0.2259715348482132, + "learning_rate": 2.79619356378928e-05, + "loss": 0.3778, "step": 66385 }, { - "epoch": 2.34, - "learning_rate": 2.8870874566807134e-05, - "loss": 0.2497, + "epoch": 2.392691101740729, + "grad_norm": 0.17341482639312744, + "learning_rate": 2.7959038037226554e-05, + "loss": 0.3816, "step": 66390 }, { - "epoch": 2.34, - "learning_rate": 2.8868060174962092e-05, - "loss": 0.2631, + "epoch": 2.3928713014019536, + "grad_norm": 0.22242747247219086, + "learning_rate": 2.7956140396243986e-05, + "loss": 0.378, "step": 66395 }, { - "epoch": 2.34, - "learning_rate": 2.886524573289283e-05, - "loss": 0.2787, + "epoch": 2.393051501063178, + "grad_norm": 0.19790546596050262, + "learning_rate": 2.7953242714984573e-05, + "loss": 0.3698, "step": 66400 }, { - "epoch": 2.34, - "learning_rate": 2.8862431240635906e-05, - "loss": 0.2505, + "epoch": 2.3932317007244026, + "grad_norm": 0.2102694809436798, + "learning_rate": 2.7950344993487803e-05, + "loss": 0.4262, "step": 
66405 }, { - "epoch": 2.34, - "learning_rate": 2.8859616698227854e-05, - "loss": 0.2918, + "epoch": 2.3934119003856273, + "grad_norm": 0.19927994906902313, + "learning_rate": 2.7947447231793156e-05, + "loss": 0.4135, "step": 66410 }, { - "epoch": 2.34, - "learning_rate": 2.8856802105705216e-05, - "loss": 0.2516, + "epoch": 2.393592100046852, + "grad_norm": 0.20652970671653748, + "learning_rate": 2.7944549429940115e-05, + "loss": 0.3911, "step": 66415 }, { - "epoch": 2.34, - "learning_rate": 2.8853987463104554e-05, - "loss": 0.2788, + "epoch": 2.3937722997080764, + "grad_norm": 0.17121002078056335, + "learning_rate": 2.7941651587968147e-05, + "loss": 0.394, "step": 66420 }, { - "epoch": 2.34, - "learning_rate": 2.8851172770462398e-05, - "loss": 0.2767, + "epoch": 2.393952499369301, + "grad_norm": 0.20938289165496826, + "learning_rate": 2.7938753705916752e-05, + "loss": 0.3841, "step": 66425 }, { - "epoch": 2.34, - "learning_rate": 2.884835802781531e-05, - "loss": 0.2687, + "epoch": 2.394132699030526, + "grad_norm": 0.2229117602109909, + "learning_rate": 2.7935855783825406e-05, + "loss": 0.4056, "step": 66430 }, { - "epoch": 2.34, - "learning_rate": 2.8845543235199824e-05, - "loss": 0.2584, + "epoch": 2.3943128986917506, + "grad_norm": 0.2586416006088257, + "learning_rate": 2.7932957821733592e-05, + "loss": 0.4423, "step": 66435 }, { - "epoch": 2.34, - "learning_rate": 2.884272839265249e-05, - "loss": 0.2691, + "epoch": 2.3944930983529753, + "grad_norm": 0.22627079486846924, + "learning_rate": 2.79300598196808e-05, + "loss": 0.3877, "step": 66440 }, { - "epoch": 2.34, - "learning_rate": 2.8839913500209863e-05, - "loss": 0.269, + "epoch": 2.3946732980141996, + "grad_norm": 0.20481657981872559, + "learning_rate": 2.79271617777065e-05, + "loss": 0.3804, "step": 66445 }, { - "epoch": 2.34, - "learning_rate": 2.8837098557908493e-05, - "loss": 0.2869, + "epoch": 2.3948534976754243, + "grad_norm": 0.18298093974590302, + "learning_rate": 2.7924263695850196e-05, + "loss": 0.4435, "step": 66450 }, { - "epoch": 2.34, - "learning_rate": 2.8834283565784927e-05, - "loss": 0.266, + "epoch": 2.395033697336649, + "grad_norm": 0.2255946695804596, + "learning_rate": 2.7921365574151364e-05, + "loss": 0.3889, "step": 66455 }, { - "epoch": 2.34, - "learning_rate": 2.8831468523875715e-05, - "loss": 0.2755, + "epoch": 2.395213896997874, + "grad_norm": 0.15484662353992462, + "learning_rate": 2.7918467412649495e-05, + "loss": 0.3927, "step": 66460 }, { - "epoch": 2.34, - "learning_rate": 2.8828653432217405e-05, - "loss": 0.2646, + "epoch": 2.395394096659098, + "grad_norm": 0.23011015355587006, + "learning_rate": 2.7915569211384064e-05, + "loss": 0.3822, "step": 66465 }, { - "epoch": 2.34, - "learning_rate": 2.882583829084656e-05, - "loss": 0.2931, + "epoch": 2.395574296320323, + "grad_norm": 0.21641449630260468, + "learning_rate": 2.791267097039457e-05, + "loss": 0.3903, "step": 66470 }, { - "epoch": 2.34, - "learning_rate": 2.8823023099799717e-05, - "loss": 0.2787, + "epoch": 2.3957544959815475, + "grad_norm": 0.19246269762516022, + "learning_rate": 2.7909772689720503e-05, + "loss": 0.3717, "step": 66475 }, { - "epoch": 2.34, - "learning_rate": 2.882020785911345e-05, - "loss": 0.26, + "epoch": 2.3959346956427723, + "grad_norm": 0.1879129558801651, + "learning_rate": 2.7906874369401342e-05, + "loss": 0.4046, "step": 66480 }, { - "epoch": 2.34, - "learning_rate": 2.8817392568824292e-05, - "loss": 0.2788, + "epoch": 2.396114895303997, + "grad_norm": 0.20422397553920746, + "learning_rate": 2.7903976009476584e-05, + 
"loss": 0.353, "step": 66485 }, { - "epoch": 2.34, - "learning_rate": 2.8814577228968815e-05, - "loss": 0.2496, + "epoch": 2.3962950949652213, + "grad_norm": 0.19214318692684174, + "learning_rate": 2.7901077609985708e-05, + "loss": 0.3937, "step": 66490 }, { - "epoch": 2.34, - "learning_rate": 2.8811761839583558e-05, - "loss": 0.2569, + "epoch": 2.396475294626446, + "grad_norm": 0.20487315952777863, + "learning_rate": 2.789817917096822e-05, + "loss": 0.3955, "step": 66495 }, { - "epoch": 2.34, - "learning_rate": 2.8808946400705094e-05, - "loss": 0.3004, + "epoch": 2.3966554942876708, + "grad_norm": 0.19864313304424286, + "learning_rate": 2.78952806924636e-05, + "loss": 0.3657, "step": 66500 }, { - "epoch": 2.34, - "eval_loss": 0.2670895457267761, - "eval_runtime": 10.5428, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 2.3966554942876708, + "eval_loss": 0.43440091609954834, + "eval_runtime": 3.5323, + "eval_samples_per_second": 28.31, + "eval_steps_per_second": 7.077, "step": 66500 }, { - "epoch": 2.34, - "learning_rate": 2.8806130912369966e-05, - "loss": 0.2656, + "epoch": 2.3968356939488955, + "grad_norm": 0.19604401290416718, + "learning_rate": 2.7892382174511333e-05, + "loss": 0.3751, "step": 66505 }, { - "epoch": 2.34, - "learning_rate": 2.8803315374614738e-05, - "loss": 0.2754, + "epoch": 2.39701589361012, + "grad_norm": 0.27748361229896545, + "learning_rate": 2.7889483617150924e-05, + "loss": 0.3847, "step": 66510 }, { - "epoch": 2.34, - "learning_rate": 2.8800499787475964e-05, - "loss": 0.273, + "epoch": 2.3971960932713445, + "grad_norm": 0.20608752965927124, + "learning_rate": 2.7886585020421863e-05, + "loss": 0.364, "step": 66515 }, { - "epoch": 2.34, - "learning_rate": 2.8797684150990206e-05, - "loss": 0.2715, + "epoch": 2.3973762929325693, + "grad_norm": 0.20215480029582977, + "learning_rate": 2.788368638436364e-05, + "loss": 0.3916, "step": 66520 }, { - "epoch": 2.34, - "learning_rate": 2.8794868465194025e-05, - "loss": 0.2471, + "epoch": 2.397556492593794, + "grad_norm": 0.25804924964904785, + "learning_rate": 2.7880787709015742e-05, + "loss": 0.4008, "step": 66525 }, { - "epoch": 2.34, - "learning_rate": 2.8792052730123976e-05, - "loss": 0.2625, + "epoch": 2.3977366922550187, + "grad_norm": 0.2014016956090927, + "learning_rate": 2.787788899441768e-05, + "loss": 0.3776, "step": 66530 }, { - "epoch": 2.34, - "learning_rate": 2.8789236945816624e-05, - "loss": 0.283, + "epoch": 2.397916891916243, + "grad_norm": 0.15427666902542114, + "learning_rate": 2.7874990240608927e-05, + "loss": 0.3746, "step": 66535 }, { - "epoch": 2.34, - "learning_rate": 2.878642111230852e-05, - "loss": 0.273, + "epoch": 2.3980970915774678, + "grad_norm": 0.15082351863384247, + "learning_rate": 2.787209144762899e-05, + "loss": 0.3814, "step": 66540 }, { - "epoch": 2.34, - "learning_rate": 2.878360522963624e-05, - "loss": 0.2841, + "epoch": 2.3982772912386925, + "grad_norm": 0.1636931151151657, + "learning_rate": 2.786919261551737e-05, + "loss": 0.3839, "step": 66545 }, { - "epoch": 2.34, - "learning_rate": 2.8780789297836334e-05, - "loss": 0.2855, + "epoch": 2.398457490899917, + "grad_norm": 0.17769426107406616, + "learning_rate": 2.786629374431355e-05, + "loss": 0.3759, "step": 66550 }, { - "epoch": 2.34, - "learning_rate": 2.8777973316945374e-05, - "loss": 0.2768, + "epoch": 2.3986376905611415, + "grad_norm": 0.2125890552997589, + "learning_rate": 2.7863394834057038e-05, + "loss": 0.4056, "step": 66555 }, { - "epoch": 2.34, - "learning_rate": 2.877515728699992e-05, - 
"loss": 0.2538, + "epoch": 2.3988178902223662, + "grad_norm": 0.18982288241386414, + "learning_rate": 2.7860495884787318e-05, + "loss": 0.4115, "step": 66560 }, { - "epoch": 2.34, - "learning_rate": 2.877234120803654e-05, - "loss": 0.2692, + "epoch": 2.398998089883591, + "grad_norm": 0.18557365238666534, + "learning_rate": 2.78575968965439e-05, + "loss": 0.3919, "step": 66565 }, { - "epoch": 2.34, - "learning_rate": 2.876952508009179e-05, - "loss": 0.2605, + "epoch": 2.3991782895448157, + "grad_norm": 0.17741166055202484, + "learning_rate": 2.7854697869366277e-05, + "loss": 0.4147, "step": 66570 }, { - "epoch": 2.34, - "learning_rate": 2.8766708903202245e-05, - "loss": 0.2634, + "epoch": 2.3993584892060404, + "grad_norm": 0.20264872908592224, + "learning_rate": 2.785179880329395e-05, + "loss": 0.3773, "step": 66575 }, { - "epoch": 2.34, - "learning_rate": 2.8763892677404465e-05, - "loss": 0.2392, + "epoch": 2.399538688867265, + "grad_norm": 0.20139038562774658, + "learning_rate": 2.7848899698366414e-05, + "loss": 0.3799, "step": 66580 }, { - "epoch": 2.34, - "learning_rate": 2.876107640273502e-05, - "loss": 0.2734, + "epoch": 2.3997188885284895, + "grad_norm": 0.21390214562416077, + "learning_rate": 2.7846000554623168e-05, + "loss": 0.4303, "step": 66585 }, { - "epoch": 2.34, - "learning_rate": 2.8758260079230475e-05, - "loss": 0.2385, + "epoch": 2.399899088189714, + "grad_norm": 0.20651130378246307, + "learning_rate": 2.7843101372103726e-05, + "loss": 0.4191, "step": 66590 }, { - "epoch": 2.34, - "learning_rate": 2.8755443706927403e-05, - "loss": 0.2735, + "epoch": 2.400079287850939, + "grad_norm": 0.20792073011398315, + "learning_rate": 2.7840202150847566e-05, + "loss": 0.3953, "step": 66595 }, { - "epoch": 2.34, - "learning_rate": 2.8752627285862365e-05, - "loss": 0.2632, + "epoch": 2.4002594875121637, + "grad_norm": 0.17147089540958405, + "learning_rate": 2.783730289089421e-05, + "loss": 0.3817, "step": 66600 }, { - "epoch": 2.34, - "learning_rate": 2.8749810816071936e-05, - "loss": 0.2805, + "epoch": 2.400439687173388, + "grad_norm": 0.1750619262456894, + "learning_rate": 2.7834403592283147e-05, + "loss": 0.4007, "step": 66605 }, { - "epoch": 2.34, - "learning_rate": 2.8746994297592688e-05, - "loss": 0.289, + "epoch": 2.4006198868346127, + "grad_norm": 0.20801058411598206, + "learning_rate": 2.7831504255053886e-05, + "loss": 0.3996, "step": 66610 }, { - "epoch": 2.34, - "learning_rate": 2.8744177730461185e-05, - "loss": 0.2455, + "epoch": 2.4008000864958374, + "grad_norm": 0.2012956142425537, + "learning_rate": 2.7828604879245923e-05, + "loss": 0.383, "step": 66615 }, { - "epoch": 2.34, - "learning_rate": 2.8741361114714006e-05, - "loss": 0.263, + "epoch": 2.400980286157062, + "grad_norm": 0.18420088291168213, + "learning_rate": 2.7825705464898777e-05, + "loss": 0.3725, "step": 66620 }, { - "epoch": 2.34, - "learning_rate": 2.8738544450387716e-05, - "loss": 0.2629, + "epoch": 2.401160485818287, + "grad_norm": 0.2484653741121292, + "learning_rate": 2.7822806012051934e-05, + "loss": 0.4174, "step": 66625 }, { - "epoch": 2.34, - "learning_rate": 2.8735727737518893e-05, - "loss": 0.2732, + "epoch": 2.401340685479511, + "grad_norm": 0.19384010136127472, + "learning_rate": 2.7819906520744903e-05, + "loss": 0.3899, "step": 66630 }, { - "epoch": 2.34, - "learning_rate": 2.87329109761441e-05, - "loss": 0.2799, + "epoch": 2.401520885140736, + "grad_norm": 0.19840480387210846, + "learning_rate": 2.7817006991017196e-05, + "loss": 0.3819, "step": 66635 }, { - "epoch": 2.34, - "learning_rate": 
2.873009416629992e-05, - "loss": 0.2525, + "epoch": 2.4017010848019607, + "grad_norm": 0.20642586052417755, + "learning_rate": 2.7814107422908315e-05, + "loss": 0.3706, "step": 66640 }, { - "epoch": 2.34, - "learning_rate": 2.872727730802293e-05, - "loss": 0.2636, + "epoch": 2.4018812844631854, + "grad_norm": 0.20059265196323395, + "learning_rate": 2.7811207816457756e-05, + "loss": 0.3905, "step": 66645 }, { - "epoch": 2.34, - "learning_rate": 2.8724460401349696e-05, - "loss": 0.268, + "epoch": 2.4020614841244097, + "grad_norm": 0.19518163800239563, + "learning_rate": 2.7808308171705045e-05, + "loss": 0.3732, "step": 66650 }, { - "epoch": 2.35, - "learning_rate": 2.87216434463168e-05, - "loss": 0.2448, + "epoch": 2.4022416837856344, + "grad_norm": 0.2086179107427597, + "learning_rate": 2.780540848868967e-05, + "loss": 0.3932, "step": 66655 }, { - "epoch": 2.35, - "learning_rate": 2.871882644296081e-05, - "loss": 0.2645, + "epoch": 2.402421883446859, + "grad_norm": 0.18908485770225525, + "learning_rate": 2.780250876745116e-05, + "loss": 0.3602, "step": 66660 }, { - "epoch": 2.35, - "learning_rate": 2.8716009391318317e-05, - "loss": 0.2918, + "epoch": 2.402602083108084, + "grad_norm": 0.19856788218021393, + "learning_rate": 2.7799609008029004e-05, + "loss": 0.3977, "step": 66665 }, { - "epoch": 2.35, - "learning_rate": 2.8713192291425893e-05, - "loss": 0.2543, + "epoch": 2.4027822827693086, + "grad_norm": 0.1787065863609314, + "learning_rate": 2.779670921046272e-05, + "loss": 0.3638, "step": 66670 }, { - "epoch": 2.35, - "learning_rate": 2.8710375143320113e-05, - "loss": 0.2864, + "epoch": 2.402962482430533, + "grad_norm": 0.225900337100029, + "learning_rate": 2.7793809374791807e-05, + "loss": 0.3711, "step": 66675 }, { - "epoch": 2.35, - "learning_rate": 2.8707557947037555e-05, - "loss": 0.2812, + "epoch": 2.4031426820917576, + "grad_norm": 0.20942628383636475, + "learning_rate": 2.779090950105579e-05, + "loss": 0.3831, "step": 66680 }, { - "epoch": 2.35, - "learning_rate": 2.87047407026148e-05, - "loss": 0.2605, + "epoch": 2.4033228817529824, + "grad_norm": 0.1625223010778427, + "learning_rate": 2.7788009589294167e-05, + "loss": 0.368, "step": 66685 }, { - "epoch": 2.35, - "learning_rate": 2.8701923410088426e-05, - "loss": 0.2609, + "epoch": 2.403503081414207, + "grad_norm": 0.17064473032951355, + "learning_rate": 2.7785109639546454e-05, + "loss": 0.4337, "step": 66690 }, { - "epoch": 2.35, - "learning_rate": 2.869910606949502e-05, - "loss": 0.2737, + "epoch": 2.4036832810754314, + "grad_norm": 0.1989968866109848, + "learning_rate": 2.778220965185216e-05, + "loss": 0.3975, "step": 66695 }, { - "epoch": 2.35, - "learning_rate": 2.8696288680871164e-05, - "loss": 0.2603, + "epoch": 2.403863480736656, + "grad_norm": 0.1930345594882965, + "learning_rate": 2.77793096262508e-05, + "loss": 0.3987, "step": 66700 }, { - "epoch": 2.35, - "learning_rate": 2.869347124425343e-05, - "loss": 0.2734, + "epoch": 2.404043680397881, + "grad_norm": 0.22240734100341797, + "learning_rate": 2.7776409562781885e-05, + "loss": 0.4242, "step": 66705 }, { - "epoch": 2.35, - "learning_rate": 2.869065375967841e-05, - "loss": 0.2654, + "epoch": 2.4042238800591056, + "grad_norm": 0.22761568427085876, + "learning_rate": 2.7773509461484924e-05, + "loss": 0.3846, "step": 66710 }, { - "epoch": 2.35, - "learning_rate": 2.8687836227182668e-05, - "loss": 0.2671, + "epoch": 2.4044040797203303, + "grad_norm": 0.18993456661701202, + "learning_rate": 2.7770609322399438e-05, + "loss": 0.3828, "step": 66715 }, { - "epoch": 2.35, - 
"learning_rate": 2.868501864680282e-05, - "loss": 0.2596, + "epoch": 2.4045842793815546, + "grad_norm": 0.253111869096756, + "learning_rate": 2.7767709145564936e-05, + "loss": 0.3801, "step": 66720 }, { - "epoch": 2.35, - "learning_rate": 2.8682201018575426e-05, - "loss": 0.2768, + "epoch": 2.4047644790427793, + "grad_norm": 0.2091599851846695, + "learning_rate": 2.7764808931020923e-05, + "loss": 0.3715, "step": 66725 }, { - "epoch": 2.35, - "learning_rate": 2.8679383342537087e-05, - "loss": 0.2812, + "epoch": 2.404944678704004, + "grad_norm": 0.19851459562778473, + "learning_rate": 2.7761908678806937e-05, + "loss": 0.4124, "step": 66730 }, { - "epoch": 2.35, - "learning_rate": 2.867656561872437e-05, - "loss": 0.2778, + "epoch": 2.405124878365229, + "grad_norm": 0.19328032433986664, + "learning_rate": 2.775900838896247e-05, + "loss": 0.3716, "step": 66735 }, { - "epoch": 2.35, - "learning_rate": 2.867374784717387e-05, - "loss": 0.2537, + "epoch": 2.405305078026453, + "grad_norm": 0.18993335962295532, + "learning_rate": 2.7756108061527062e-05, + "loss": 0.3552, "step": 66740 }, { - "epoch": 2.35, - "learning_rate": 2.8670930027922183e-05, - "loss": 0.3001, + "epoch": 2.405485277687678, + "grad_norm": 0.19817394018173218, + "learning_rate": 2.77532076965402e-05, + "loss": 0.4001, "step": 66745 }, { - "epoch": 2.35, - "learning_rate": 2.866811216100589e-05, - "loss": 0.2672, + "epoch": 2.4056654773489026, + "grad_norm": 0.19719624519348145, + "learning_rate": 2.7750307294041423e-05, + "loss": 0.3777, "step": 66750 }, { - "epoch": 2.35, - "learning_rate": 2.866529424646157e-05, - "loss": 0.2582, + "epoch": 2.4058456770101273, + "grad_norm": 0.20810894668102264, + "learning_rate": 2.774740685407024e-05, + "loss": 0.3946, "step": 66755 }, { - "epoch": 2.35, - "learning_rate": 2.8662476284325823e-05, - "loss": 0.2792, + "epoch": 2.406025876671352, + "grad_norm": 0.1935310959815979, + "learning_rate": 2.7744506376666175e-05, + "loss": 0.3903, "step": 66760 }, { - "epoch": 2.35, - "learning_rate": 2.865965827463524e-05, - "loss": 0.2657, + "epoch": 2.4062060763325763, + "grad_norm": 0.24938994646072388, + "learning_rate": 2.7741605861868735e-05, + "loss": 0.3914, "step": 66765 }, { - "epoch": 2.35, - "learning_rate": 2.8656840217426394e-05, - "loss": 0.2705, + "epoch": 2.406386275993801, + "grad_norm": 0.22419728338718414, + "learning_rate": 2.7738705309717444e-05, + "loss": 0.4173, "step": 66770 }, { - "epoch": 2.35, - "learning_rate": 2.8654022112735895e-05, - "loss": 0.2326, + "epoch": 2.406566475655026, + "grad_norm": 0.17850545048713684, + "learning_rate": 2.7735804720251835e-05, + "loss": 0.4045, "step": 66775 }, { - "epoch": 2.35, - "learning_rate": 2.8651203960600325e-05, - "loss": 0.2847, + "epoch": 2.4067466753162505, + "grad_norm": 0.23113694787025452, + "learning_rate": 2.773290409351141e-05, + "loss": 0.3859, "step": 66780 }, { - "epoch": 2.35, - "learning_rate": 2.8648385761056285e-05, - "loss": 0.2878, + "epoch": 2.406926874977475, + "grad_norm": 0.24465039372444153, + "learning_rate": 2.7730003429535688e-05, + "loss": 0.3736, "step": 66785 }, { - "epoch": 2.35, - "learning_rate": 2.8645567514140355e-05, - "loss": 0.2512, + "epoch": 2.4071070746386996, + "grad_norm": 0.22059732675552368, + "learning_rate": 2.7727102728364207e-05, + "loss": 0.3946, "step": 66790 }, { - "epoch": 2.35, - "learning_rate": 2.864274921988913e-05, - "loss": 0.2916, + "epoch": 2.4072872742999243, + "grad_norm": 0.20629587769508362, + "learning_rate": 2.7724201990036474e-05, + "loss": 0.3921, "step": 66795 }, 
{ - "epoch": 2.35, - "learning_rate": 2.863993087833921e-05, - "loss": 0.2613, + "epoch": 2.407467473961149, + "grad_norm": 0.2906322479248047, + "learning_rate": 2.772130121459202e-05, + "loss": 0.3928, "step": 66800 }, { - "epoch": 2.35, - "learning_rate": 2.8637112489527195e-05, - "loss": 0.2886, + "epoch": 2.4076476736223738, + "grad_norm": 0.1948506087064743, + "learning_rate": 2.771840040207037e-05, + "loss": 0.3751, "step": 66805 }, { - "epoch": 2.35, - "learning_rate": 2.8634294053489658e-05, - "loss": 0.2658, + "epoch": 2.407827873283598, + "grad_norm": 0.1990462988615036, + "learning_rate": 2.7715499552511036e-05, + "loss": 0.4181, "step": 66810 }, { - "epoch": 2.35, - "learning_rate": 2.8631475570263218e-05, - "loss": 0.2933, + "epoch": 2.4080080729448228, + "grad_norm": 0.23790279030799866, + "learning_rate": 2.7712598665953543e-05, + "loss": 0.3669, "step": 66815 }, { - "epoch": 2.35, - "learning_rate": 2.8628657039884455e-05, - "loss": 0.2801, + "epoch": 2.4081882726060475, + "grad_norm": 0.18465937674045563, + "learning_rate": 2.7709697742437418e-05, + "loss": 0.4092, "step": 66820 }, { - "epoch": 2.35, - "learning_rate": 2.862583846238997e-05, - "loss": 0.2542, + "epoch": 2.4083684722672722, + "grad_norm": 0.16770188510417938, + "learning_rate": 2.7706796782002192e-05, + "loss": 0.37, "step": 66825 }, { - "epoch": 2.35, - "learning_rate": 2.8623019837816367e-05, - "loss": 0.2774, + "epoch": 2.4085486719284965, + "grad_norm": 0.2589314877986908, + "learning_rate": 2.770389578468738e-05, + "loss": 0.4184, "step": 66830 }, { - "epoch": 2.35, - "learning_rate": 2.8620201166200238e-05, - "loss": 0.2683, + "epoch": 2.4087288715897213, + "grad_norm": 0.18053090572357178, + "learning_rate": 2.770099475053251e-05, + "loss": 0.3487, "step": 66835 }, { - "epoch": 2.35, - "learning_rate": 2.8617382447578178e-05, - "loss": 0.2716, + "epoch": 2.408909071250946, + "grad_norm": 0.2627122700214386, + "learning_rate": 2.7698093679577108e-05, + "loss": 0.4316, "step": 66840 }, { - "epoch": 2.35, - "learning_rate": 2.86145636819868e-05, - "loss": 0.2712, + "epoch": 2.4090892709121707, + "grad_norm": 0.18675784766674042, + "learning_rate": 2.769519257186071e-05, + "loss": 0.3685, "step": 66845 }, { - "epoch": 2.35, - "learning_rate": 2.8611744869462688e-05, - "loss": 0.2498, + "epoch": 2.4092694705733955, + "grad_norm": 0.27119845151901245, + "learning_rate": 2.7692291427422828e-05, + "loss": 0.4013, "step": 66850 }, { - "epoch": 2.35, - "learning_rate": 2.8608926010042447e-05, - "loss": 0.2782, + "epoch": 2.40944967023462, + "grad_norm": 0.19368845224380493, + "learning_rate": 2.7689390246302997e-05, + "loss": 0.3814, "step": 66855 }, { - "epoch": 2.35, - "learning_rate": 2.8606107103762688e-05, - "loss": 0.2876, + "epoch": 2.4096298698958445, + "grad_norm": 0.19200463593006134, + "learning_rate": 2.7686489028540748e-05, + "loss": 0.3739, "step": 66860 }, { - "epoch": 2.35, - "learning_rate": 2.860328815066e-05, - "loss": 0.2708, + "epoch": 2.4098100695570692, + "grad_norm": 0.22479012608528137, + "learning_rate": 2.76835877741756e-05, + "loss": 0.4098, "step": 66865 }, { - "epoch": 2.35, - "learning_rate": 2.860046915077099e-05, - "loss": 0.2864, + "epoch": 2.409990269218294, + "grad_norm": 0.23108981549739838, + "learning_rate": 2.7680686483247098e-05, + "loss": 0.3996, "step": 66870 }, { - "epoch": 2.35, - "learning_rate": 2.8597650104132257e-05, - "loss": 0.2729, + "epoch": 2.4101704688795187, + "grad_norm": 0.20057092607021332, + "learning_rate": 2.767778515579475e-05, + "loss": 0.4096, 
"step": 66875 }, { - "epoch": 2.35, - "learning_rate": 2.859483101078041e-05, - "loss": 0.2683, + "epoch": 2.410350668540743, + "grad_norm": 0.17036274075508118, + "learning_rate": 2.7674883791858107e-05, + "loss": 0.4209, "step": 66880 }, { - "epoch": 2.35, - "learning_rate": 2.8592011870752062e-05, - "loss": 0.2692, + "epoch": 2.4105308682019677, + "grad_norm": 0.19558821618556976, + "learning_rate": 2.7671982391476686e-05, + "loss": 0.4133, "step": 66885 }, { - "epoch": 2.35, - "learning_rate": 2.8589192684083794e-05, - "loss": 0.2591, + "epoch": 2.4107110678631924, + "grad_norm": 0.2363506406545639, + "learning_rate": 2.7669080954690023e-05, + "loss": 0.4149, "step": 66890 }, { - "epoch": 2.35, - "learning_rate": 2.8586373450812233e-05, - "loss": 0.2617, + "epoch": 2.410891267524417, + "grad_norm": 0.23225028812885284, + "learning_rate": 2.7666179481537646e-05, + "loss": 0.4357, "step": 66895 }, { - "epoch": 2.35, - "learning_rate": 2.858355417097397e-05, - "loss": 0.2512, + "epoch": 2.411071467185642, + "grad_norm": 0.20364603400230408, + "learning_rate": 2.76632779720591e-05, + "loss": 0.3863, "step": 66900 }, { - "epoch": 2.35, - "learning_rate": 2.8580734844605624e-05, - "loss": 0.2595, + "epoch": 2.411251666846866, + "grad_norm": 0.14624075591564178, + "learning_rate": 2.76603764262939e-05, + "loss": 0.3693, "step": 66905 }, { - "epoch": 2.35, - "learning_rate": 2.857791547174379e-05, - "loss": 0.2547, + "epoch": 2.411431866508091, + "grad_norm": 0.20455005764961243, + "learning_rate": 2.7657474844281577e-05, + "loss": 0.3922, "step": 66910 }, { - "epoch": 2.35, - "learning_rate": 2.857509605242508e-05, - "loss": 0.2712, + "epoch": 2.4116120661693157, + "grad_norm": 0.19567883014678955, + "learning_rate": 2.7654573226061686e-05, + "loss": 0.4302, "step": 66915 }, { - "epoch": 2.35, - "learning_rate": 2.8572276586686107e-05, - "loss": 0.2826, + "epoch": 2.4117922658305404, + "grad_norm": 0.2601669430732727, + "learning_rate": 2.7651671571673743e-05, + "loss": 0.3964, "step": 66920 }, { - "epoch": 2.35, - "learning_rate": 2.8569457074563477e-05, - "loss": 0.278, + "epoch": 2.4119724654917647, + "grad_norm": 0.22823162376880646, + "learning_rate": 2.76487698811573e-05, + "loss": 0.3906, "step": 66925 }, { - "epoch": 2.35, - "learning_rate": 2.8566637516093807e-05, - "loss": 0.266, + "epoch": 2.4121526651529894, + "grad_norm": 0.2383589893579483, + "learning_rate": 2.764586815455187e-05, + "loss": 0.4254, "step": 66930 }, { - "epoch": 2.35, - "learning_rate": 2.8563817911313684e-05, - "loss": 0.2973, + "epoch": 2.412332864814214, + "grad_norm": 0.18745693564414978, + "learning_rate": 2.764296639189699e-05, + "loss": 0.3825, "step": 66935 }, { - "epoch": 2.36, - "learning_rate": 2.8560998260259748e-05, - "loss": 0.2799, + "epoch": 2.412513064475439, + "grad_norm": 0.2072102427482605, + "learning_rate": 2.7640064593232218e-05, + "loss": 0.3881, "step": 66940 }, { - "epoch": 2.36, - "learning_rate": 2.8558178562968583e-05, - "loss": 0.281, + "epoch": 2.4126932641366636, + "grad_norm": 0.1848241686820984, + "learning_rate": 2.7637162758597073e-05, + "loss": 0.3747, "step": 66945 }, { - "epoch": 2.36, - "learning_rate": 2.855535881947683e-05, - "loss": 0.2711, + "epoch": 2.412873463797888, + "grad_norm": 0.1639891117811203, + "learning_rate": 2.7634260888031098e-05, + "loss": 0.3715, "step": 66950 }, { - "epoch": 2.36, - "learning_rate": 2.8552539029821074e-05, - "loss": 0.274, + "epoch": 2.4130536634591127, + "grad_norm": 0.20181863009929657, + "learning_rate": 2.7631358981573824e-05, + 
"loss": 0.4048, "step": 66955 }, { - "epoch": 2.36, - "learning_rate": 2.8549719194037942e-05, - "loss": 0.2637, + "epoch": 2.4132338631203374, + "grad_norm": 0.1724659949541092, + "learning_rate": 2.76284570392648e-05, + "loss": 0.4168, "step": 66960 }, { - "epoch": 2.36, - "learning_rate": 2.8546899312164045e-05, - "loss": 0.2716, + "epoch": 2.413414062781562, + "grad_norm": 0.23063142597675323, + "learning_rate": 2.7625555061143553e-05, + "loss": 0.4061, "step": 66965 }, { - "epoch": 2.36, - "learning_rate": 2.8544079384236004e-05, - "loss": 0.2458, + "epoch": 2.4135942624427864, + "grad_norm": 0.23275382816791534, + "learning_rate": 2.7622653047249626e-05, + "loss": 0.4226, "step": 66970 }, { - "epoch": 2.36, - "learning_rate": 2.8541259410290422e-05, - "loss": 0.2872, + "epoch": 2.413774462104011, + "grad_norm": 0.17771199345588684, + "learning_rate": 2.7619750997622564e-05, + "loss": 0.4079, "step": 66975 }, { - "epoch": 2.36, - "learning_rate": 2.8538439390363924e-05, - "loss": 0.2763, + "epoch": 2.413954661765236, + "grad_norm": 0.20910033583641052, + "learning_rate": 2.7616848912301892e-05, + "loss": 0.4074, "step": 66980 }, { - "epoch": 2.36, - "learning_rate": 2.8535619324493125e-05, - "loss": 0.2574, + "epoch": 2.4141348614264606, + "grad_norm": 0.2285621017217636, + "learning_rate": 2.7613946791327167e-05, + "loss": 0.4133, "step": 66985 }, { - "epoch": 2.36, - "learning_rate": 2.8532799212714628e-05, - "loss": 0.2851, + "epoch": 2.4143150610876853, + "grad_norm": 0.22376610338687897, + "learning_rate": 2.7611044634737926e-05, + "loss": 0.3761, "step": 66990 }, { - "epoch": 2.36, - "learning_rate": 2.8529979055065072e-05, - "loss": 0.2583, + "epoch": 2.4144952607489096, + "grad_norm": 0.20609983801841736, + "learning_rate": 2.7608142442573704e-05, + "loss": 0.376, "step": 66995 }, { - "epoch": 2.36, - "learning_rate": 2.852715885158106e-05, - "loss": 0.2546, + "epoch": 2.4146754604101344, + "grad_norm": 0.20516866445541382, + "learning_rate": 2.7605240214874038e-05, + "loss": 0.3812, "step": 67000 }, { - "epoch": 2.36, - "eval_loss": 0.26735541224479675, - "eval_runtime": 10.5619, - "eval_samples_per_second": 9.468, - "eval_steps_per_second": 9.468, + "epoch": 2.4146754604101344, + "eval_loss": 0.4341346025466919, + "eval_runtime": 3.524, + "eval_samples_per_second": 28.377, + "eval_steps_per_second": 7.094, "step": 67000 }, { - "epoch": 2.36, - "learning_rate": 2.852433860229921e-05, - "loss": 0.2645, + "epoch": 2.414855660071359, + "grad_norm": 0.19318906962871552, + "learning_rate": 2.760233795167849e-05, + "loss": 0.376, "step": 67005 }, { - "epoch": 2.36, - "learning_rate": 2.852151830725615e-05, - "loss": 0.2531, + "epoch": 2.415035859732584, + "grad_norm": 0.19978414475917816, + "learning_rate": 2.759943565302659e-05, + "loss": 0.3926, "step": 67010 }, { - "epoch": 2.36, - "learning_rate": 2.8518697966488493e-05, - "loss": 0.2846, + "epoch": 2.415216059393808, + "grad_norm": 0.19029457867145538, + "learning_rate": 2.759653331895788e-05, + "loss": 0.3724, "step": 67015 }, { - "epoch": 2.36, - "learning_rate": 2.8515877580032868e-05, - "loss": 0.2697, + "epoch": 2.415396259055033, + "grad_norm": 0.1965416818857193, + "learning_rate": 2.7593630949511908e-05, + "loss": 0.4342, "step": 67020 }, { - "epoch": 2.36, - "learning_rate": 2.8513057147925894e-05, - "loss": 0.2515, + "epoch": 2.4155764587162576, + "grad_norm": 0.19365379214286804, + "learning_rate": 2.7590728544728213e-05, + "loss": 0.3787, "step": 67025 }, { - "epoch": 2.36, - "learning_rate": 
2.8510236670204176e-05, - "loss": 0.2567, + "epoch": 2.4157566583774823, + "grad_norm": 0.24059845507144928, + "learning_rate": 2.7587826104646348e-05, + "loss": 0.3632, "step": 67030 }, { - "epoch": 2.36, - "learning_rate": 2.8507416146904358e-05, - "loss": 0.2844, + "epoch": 2.415936858038707, + "grad_norm": 0.2164888083934784, + "learning_rate": 2.758492362930585e-05, + "loss": 0.4078, "step": 67035 }, { - "epoch": 2.36, - "learning_rate": 2.8504595578063047e-05, - "loss": 0.2846, + "epoch": 2.4161170576999313, + "grad_norm": 0.21107304096221924, + "learning_rate": 2.7582021118746266e-05, + "loss": 0.3813, "step": 67040 }, { - "epoch": 2.36, - "learning_rate": 2.850177496371687e-05, - "loss": 0.2816, + "epoch": 2.416297257361156, + "grad_norm": 0.23840226233005524, + "learning_rate": 2.7579118573007144e-05, + "loss": 0.3851, "step": 67045 }, { - "epoch": 2.36, - "learning_rate": 2.8498954303902452e-05, - "loss": 0.2844, + "epoch": 2.416477457022381, + "grad_norm": 0.23817472159862518, + "learning_rate": 2.7576215992128034e-05, + "loss": 0.3961, "step": 67050 }, { - "epoch": 2.36, - "learning_rate": 2.8496133598656426e-05, - "loss": 0.2526, + "epoch": 2.4166576566836055, + "grad_norm": 0.23038597404956818, + "learning_rate": 2.7573313376148478e-05, + "loss": 0.4176, "step": 67055 }, { - "epoch": 2.36, - "learning_rate": 2.8493312848015407e-05, - "loss": 0.2688, + "epoch": 2.41683785634483, + "grad_norm": 0.18033789098262787, + "learning_rate": 2.7570410725108027e-05, + "loss": 0.4053, "step": 67060 }, { - "epoch": 2.36, - "learning_rate": 2.8490492052016028e-05, - "loss": 0.2706, + "epoch": 2.4170180560060546, + "grad_norm": 0.24066656827926636, + "learning_rate": 2.7567508039046232e-05, + "loss": 0.4378, "step": 67065 }, { - "epoch": 2.36, - "learning_rate": 2.84876712106949e-05, - "loss": 0.2574, + "epoch": 2.4171982556672793, + "grad_norm": 0.2486283779144287, + "learning_rate": 2.7564605318002627e-05, + "loss": 0.4038, "step": 67070 }, { - "epoch": 2.36, - "learning_rate": 2.8484850324088662e-05, - "loss": 0.2623, + "epoch": 2.417378455328504, + "grad_norm": 0.2280510812997818, + "learning_rate": 2.756170256201678e-05, + "loss": 0.3974, "step": 67075 }, { - "epoch": 2.36, - "learning_rate": 2.848202939223394e-05, - "loss": 0.2618, + "epoch": 2.4175586549897288, + "grad_norm": 0.19952905178070068, + "learning_rate": 2.7558799771128223e-05, + "loss": 0.408, "step": 67080 }, { - "epoch": 2.36, - "learning_rate": 2.8479208415167363e-05, - "loss": 0.2743, + "epoch": 2.417738854650953, + "grad_norm": 0.19479811191558838, + "learning_rate": 2.755589694537652e-05, + "loss": 0.3922, "step": 67085 }, { - "epoch": 2.36, - "learning_rate": 2.8476387392925552e-05, - "loss": 0.2593, + "epoch": 2.417919054312178, + "grad_norm": 0.2107398509979248, + "learning_rate": 2.755299408480122e-05, + "loss": 0.4211, "step": 67090 }, { - "epoch": 2.36, - "learning_rate": 2.8473566325545148e-05, - "loss": 0.2704, + "epoch": 2.4180992539734025, + "grad_norm": 0.21813732385635376, + "learning_rate": 2.755009118944186e-05, + "loss": 0.3841, "step": 67095 }, { - "epoch": 2.36, - "learning_rate": 2.8470745213062767e-05, - "loss": 0.2534, + "epoch": 2.4182794536346273, + "grad_norm": 0.18200333416461945, + "learning_rate": 2.754718825933801e-05, + "loss": 0.4034, "step": 67100 }, { - "epoch": 2.36, - "learning_rate": 2.846792405551506e-05, - "loss": 0.2744, + "epoch": 2.418459653295852, + "grad_norm": 0.18498298525810242, + "learning_rate": 2.7544285294529204e-05, + "loss": 0.3802, "step": 67105 }, { - "epoch": 
2.36, - "learning_rate": 2.846510285293863e-05, - "loss": 0.2651, + "epoch": 2.4186398529570763, + "grad_norm": 0.21120643615722656, + "learning_rate": 2.7541382295055014e-05, + "loss": 0.4255, "step": 67110 }, { - "epoch": 2.36, - "learning_rate": 2.8462281605370133e-05, - "loss": 0.2744, + "epoch": 2.418820052618301, + "grad_norm": 0.2860146164894104, + "learning_rate": 2.753847926095498e-05, + "loss": 0.402, "step": 67115 }, { - "epoch": 2.36, - "learning_rate": 2.8459460312846186e-05, - "loss": 0.2822, + "epoch": 2.4190002522795258, + "grad_norm": 0.21103060245513916, + "learning_rate": 2.7535576192268648e-05, + "loss": 0.3978, "step": 67120 }, { - "epoch": 2.36, - "learning_rate": 2.8456638975403428e-05, - "loss": 0.2749, + "epoch": 2.4191804519407505, + "grad_norm": 0.1569247990846634, + "learning_rate": 2.7532673089035588e-05, + "loss": 0.3973, "step": 67125 }, { - "epoch": 2.36, - "learning_rate": 2.8453817593078492e-05, - "loss": 0.2597, + "epoch": 2.419360651601975, + "grad_norm": 0.23274268209934235, + "learning_rate": 2.7529769951295355e-05, + "loss": 0.3947, "step": 67130 }, { - "epoch": 2.36, - "learning_rate": 2.8450996165908006e-05, - "loss": 0.2588, + "epoch": 2.4195408512631995, + "grad_norm": 0.1998492181301117, + "learning_rate": 2.752686677908749e-05, + "loss": 0.4082, "step": 67135 }, { - "epoch": 2.36, - "learning_rate": 2.844817469392862e-05, - "loss": 0.276, + "epoch": 2.4197210509244242, + "grad_norm": 0.19233012199401855, + "learning_rate": 2.7523963572451545e-05, + "loss": 0.4093, "step": 67140 }, { - "epoch": 2.36, - "learning_rate": 2.8445353177176946e-05, - "loss": 0.2702, + "epoch": 2.419901250585649, + "grad_norm": 0.22969095408916473, + "learning_rate": 2.7521060331427096e-05, + "loss": 0.4131, "step": 67145 }, { - "epoch": 2.36, - "learning_rate": 2.8442531615689644e-05, - "loss": 0.2784, + "epoch": 2.4200814502468737, + "grad_norm": 0.2119283676147461, + "learning_rate": 2.751815705605369e-05, + "loss": 0.4162, "step": 67150 }, { - "epoch": 2.36, - "learning_rate": 2.8439710009503322e-05, - "loss": 0.2722, + "epoch": 2.420261649908098, + "grad_norm": 0.18394353985786438, + "learning_rate": 2.7515253746370873e-05, + "loss": 0.3837, "step": 67155 }, { - "epoch": 2.36, - "learning_rate": 2.8436888358654644e-05, - "loss": 0.2814, + "epoch": 2.4204418495693227, + "grad_norm": 0.19230255484580994, + "learning_rate": 2.7512350402418212e-05, + "loss": 0.3603, "step": 67160 }, { - "epoch": 2.36, - "learning_rate": 2.843406666318023e-05, - "loss": 0.2579, + "epoch": 2.4206220492305475, + "grad_norm": 0.2101142406463623, + "learning_rate": 2.7509447024235262e-05, + "loss": 0.3784, "step": 67165 }, { - "epoch": 2.36, - "learning_rate": 2.843124492311673e-05, - "loss": 0.2651, + "epoch": 2.420802248891772, + "grad_norm": 0.1899431347846985, + "learning_rate": 2.7506543611861598e-05, + "loss": 0.3301, "step": 67170 }, { - "epoch": 2.36, - "learning_rate": 2.842842313850077e-05, - "loss": 0.2666, + "epoch": 2.420982448552997, + "grad_norm": 0.2313234508037567, + "learning_rate": 2.750364016533674e-05, + "loss": 0.3929, "step": 67175 }, { - "epoch": 2.36, - "learning_rate": 2.8425601309369e-05, - "loss": 0.2696, + "epoch": 2.4211626482142212, + "grad_norm": 0.20797547698020935, + "learning_rate": 2.7500736684700283e-05, + "loss": 0.3923, "step": 67180 }, { - "epoch": 2.36, - "learning_rate": 2.8422779435758045e-05, - "loss": 0.2789, + "epoch": 2.421342847875446, + "grad_norm": 0.2217523455619812, + "learning_rate": 2.7497833169991767e-05, + "loss": 0.4297, "step": 67185 
}, { - "epoch": 2.36, - "learning_rate": 2.8419957517704567e-05, - "loss": 0.2796, + "epoch": 2.4215230475366707, + "grad_norm": 0.19359783828258514, + "learning_rate": 2.7494929621250765e-05, + "loss": 0.3811, "step": 67190 }, { - "epoch": 2.36, - "learning_rate": 2.8417135555245183e-05, - "loss": 0.2679, + "epoch": 2.4217032471978954, + "grad_norm": 0.21590293943881989, + "learning_rate": 2.7492026038516826e-05, + "loss": 0.3841, "step": 67195 }, { - "epoch": 2.36, - "learning_rate": 2.841431354841656e-05, - "loss": 0.2773, + "epoch": 2.4218834468591197, + "grad_norm": 0.17126692831516266, + "learning_rate": 2.748912242182951e-05, + "loss": 0.3761, "step": 67200 }, { - "epoch": 2.36, - "learning_rate": 2.8411491497255316e-05, - "loss": 0.2451, + "epoch": 2.4220636465203444, + "grad_norm": 0.24167068302631378, + "learning_rate": 2.7486218771228388e-05, + "loss": 0.3784, "step": 67205 }, { - "epoch": 2.36, - "learning_rate": 2.8408669401798098e-05, - "loss": 0.2846, + "epoch": 2.422243846181569, + "grad_norm": 0.21758897602558136, + "learning_rate": 2.7483315086753015e-05, + "loss": 0.4132, "step": 67210 }, { - "epoch": 2.36, - "learning_rate": 2.840584726208156e-05, - "loss": 0.2712, + "epoch": 2.422424045842794, + "grad_norm": 0.20795123279094696, + "learning_rate": 2.7480411368442956e-05, + "loss": 0.3909, "step": 67215 }, { - "epoch": 2.36, - "learning_rate": 2.840302507814234e-05, - "loss": 0.2558, + "epoch": 2.4226042455040186, + "grad_norm": 0.1906597912311554, + "learning_rate": 2.7477507616337773e-05, + "loss": 0.3742, "step": 67220 }, { - "epoch": 2.37, - "learning_rate": 2.8400202850017087e-05, - "loss": 0.2631, + "epoch": 2.422784445165243, + "grad_norm": 0.22008202970027924, + "learning_rate": 2.747460383047702e-05, + "loss": 0.4114, "step": 67225 }, { - "epoch": 2.37, - "learning_rate": 2.839738057774244e-05, - "loss": 0.2752, + "epoch": 2.4229646448264677, + "grad_norm": 0.2209016978740692, + "learning_rate": 2.747170001090028e-05, + "loss": 0.3567, "step": 67230 }, { - "epoch": 2.37, - "learning_rate": 2.839455826135503e-05, - "loss": 0.2842, + "epoch": 2.4231448444876924, + "grad_norm": 0.2042957842350006, + "learning_rate": 2.74687961576471e-05, + "loss": 0.3943, "step": 67235 }, { - "epoch": 2.37, - "learning_rate": 2.839173590089153e-05, - "loss": 0.3026, + "epoch": 2.423325044148917, + "grad_norm": 0.2651365399360657, + "learning_rate": 2.7465892270757055e-05, + "loss": 0.4139, "step": 67240 }, { - "epoch": 2.37, - "learning_rate": 2.838891349638858e-05, - "loss": 0.2557, + "epoch": 2.4235052438101414, + "grad_norm": 0.2116497904062271, + "learning_rate": 2.7462988350269698e-05, + "loss": 0.4203, "step": 67245 }, { - "epoch": 2.37, - "learning_rate": 2.838609104788281e-05, - "loss": 0.2689, + "epoch": 2.423685443471366, + "grad_norm": 0.24418489634990692, + "learning_rate": 2.7460084396224606e-05, + "loss": 0.3874, "step": 67250 }, { - "epoch": 2.37, - "learning_rate": 2.838326855541089e-05, - "loss": 0.2744, + "epoch": 2.423865643132591, + "grad_norm": 0.18240700662136078, + "learning_rate": 2.7457180408661343e-05, + "loss": 0.3952, "step": 67255 }, { - "epoch": 2.37, - "learning_rate": 2.8380446019009448e-05, - "loss": 0.2816, + "epoch": 2.4240458427938156, + "grad_norm": 0.20729464292526245, + "learning_rate": 2.7454276387619465e-05, + "loss": 0.378, "step": 67260 }, { - "epoch": 2.37, - "learning_rate": 2.8377623438715152e-05, - "loss": 0.2801, + "epoch": 2.4242260424550404, + "grad_norm": 0.22618576884269714, + "learning_rate": 2.745137233313856e-05, + "loss": 
0.408, "step": 67265 }, { - "epoch": 2.37, - "learning_rate": 2.8374800814564627e-05, - "loss": 0.289, + "epoch": 2.4244062421162647, + "grad_norm": 0.1970742791891098, + "learning_rate": 2.7448468245258162e-05, + "loss": 0.4034, "step": 67270 }, { - "epoch": 2.37, - "learning_rate": 2.837197814659455e-05, - "loss": 0.276, + "epoch": 2.4245864417774894, + "grad_norm": 0.18698358535766602, + "learning_rate": 2.744556412401788e-05, + "loss": 0.3862, "step": 67275 }, { - "epoch": 2.37, - "learning_rate": 2.8369155434841554e-05, - "loss": 0.2614, + "epoch": 2.424766641438714, + "grad_norm": 0.18384528160095215, + "learning_rate": 2.7442659969457247e-05, + "loss": 0.3994, "step": 67280 }, { - "epoch": 2.37, - "learning_rate": 2.8366332679342305e-05, - "loss": 0.2795, + "epoch": 2.424946841099939, + "grad_norm": 0.199675053358078, + "learning_rate": 2.7439755781615845e-05, + "loss": 0.3781, "step": 67285 }, { - "epoch": 2.37, - "learning_rate": 2.836350988013343e-05, - "loss": 0.2556, + "epoch": 2.425127040761163, + "grad_norm": 0.2059258222579956, + "learning_rate": 2.7436851560533246e-05, + "loss": 0.4037, "step": 67290 }, { - "epoch": 2.37, - "learning_rate": 2.8360687037251603e-05, - "loss": 0.2745, + "epoch": 2.425307240422388, + "grad_norm": 0.21822243928909302, + "learning_rate": 2.7433947306249014e-05, + "loss": 0.3944, "step": 67295 }, { - "epoch": 2.37, - "learning_rate": 2.835786415073347e-05, - "loss": 0.2803, + "epoch": 2.4254874400836126, + "grad_norm": 0.19655758142471313, + "learning_rate": 2.7431043018802725e-05, + "loss": 0.3946, "step": 67300 }, { - "epoch": 2.37, - "learning_rate": 2.8355041220615685e-05, - "loss": 0.2531, + "epoch": 2.4256676397448373, + "grad_norm": 0.2280445247888565, + "learning_rate": 2.742813869823394e-05, + "loss": 0.3906, "step": 67305 }, { - "epoch": 2.37, - "learning_rate": 2.8352218246934897e-05, - "loss": 0.2607, + "epoch": 2.425847839406062, + "grad_norm": 0.16742421686649323, + "learning_rate": 2.742523434458224e-05, + "loss": 0.3632, "step": 67310 }, { - "epoch": 2.37, - "learning_rate": 2.8349395229727766e-05, - "loss": 0.2564, + "epoch": 2.4260280390672864, + "grad_norm": 0.19926291704177856, + "learning_rate": 2.7422329957887188e-05, + "loss": 0.4446, "step": 67315 }, { - "epoch": 2.37, - "learning_rate": 2.8346572169030938e-05, - "loss": 0.2854, + "epoch": 2.426208238728511, + "grad_norm": 0.17569011449813843, + "learning_rate": 2.7419425538188365e-05, + "loss": 0.3709, "step": 67320 }, { - "epoch": 2.37, - "learning_rate": 2.8344313689185543e-05, - "loss": 0.2996, + "epoch": 2.426388438389736, + "grad_norm": 0.26853156089782715, + "learning_rate": 2.741652108552533e-05, + "loss": 0.4286, "step": 67325 }, { - "epoch": 2.37, - "learning_rate": 2.834149055029965e-05, - "loss": 0.2884, + "epoch": 2.4265686380509606, + "grad_norm": 0.20490770041942596, + "learning_rate": 2.741361659993767e-05, + "loss": 0.3812, "step": 67330 }, { - "epoch": 2.37, - "learning_rate": 2.833866736802671e-05, - "loss": 0.2499, + "epoch": 2.426748837712185, + "grad_norm": 0.24885760247707367, + "learning_rate": 2.7410712081464947e-05, + "loss": 0.4166, "step": 67335 }, { - "epoch": 2.37, - "learning_rate": 2.8335844142403366e-05, - "loss": 0.245, + "epoch": 2.4269290373734096, + "grad_norm": 0.24645163118839264, + "learning_rate": 2.7407807530146735e-05, + "loss": 0.3814, "step": 67340 }, { - "epoch": 2.37, - "learning_rate": 2.8333020873466297e-05, - "loss": 0.2702, + "epoch": 2.4271092370346343, + "grad_norm": 0.24521587789058685, + "learning_rate": 
2.740490294602262e-05, + "loss": 0.3759, "step": 67345 }, { - "epoch": 2.37, - "learning_rate": 2.8330197561252142e-05, - "loss": 0.2931, + "epoch": 2.427289436695859, + "grad_norm": 0.20544666051864624, + "learning_rate": 2.7401998329132162e-05, + "loss": 0.404, "step": 67350 }, { - "epoch": 2.37, - "learning_rate": 2.8327374205797562e-05, - "loss": 0.2842, + "epoch": 2.427469636357084, + "grad_norm": 0.17196045815944672, + "learning_rate": 2.7399093679514947e-05, + "loss": 0.3892, "step": 67355 }, { - "epoch": 2.37, - "learning_rate": 2.8324550807139223e-05, - "loss": 0.2538, + "epoch": 2.4276498360183085, + "grad_norm": 0.18687868118286133, + "learning_rate": 2.7396188997210538e-05, + "loss": 0.3991, "step": 67360 }, { - "epoch": 2.37, - "learning_rate": 2.8321727365313788e-05, - "loss": 0.2573, + "epoch": 2.427830035679533, + "grad_norm": 0.23785381019115448, + "learning_rate": 2.7393284282258517e-05, + "loss": 0.4111, "step": 67365 }, { - "epoch": 2.37, - "learning_rate": 2.8318903880357906e-05, - "loss": 0.266, + "epoch": 2.4280102353407575, + "grad_norm": 0.23783884942531586, + "learning_rate": 2.739037953469847e-05, + "loss": 0.3777, "step": 67370 }, { - "epoch": 2.37, - "learning_rate": 2.831608035230825e-05, - "loss": 0.2564, + "epoch": 2.4281904350019823, + "grad_norm": 0.16781599819660187, + "learning_rate": 2.7387474754569964e-05, + "loss": 0.3757, "step": 67375 }, { - "epoch": 2.37, - "learning_rate": 2.831325678120147e-05, - "loss": 0.2514, + "epoch": 2.428370634663207, + "grad_norm": 0.15680092573165894, + "learning_rate": 2.7384569941912574e-05, + "loss": 0.4104, "step": 67380 }, { - "epoch": 2.37, - "learning_rate": 2.831043316707424e-05, - "loss": 0.2565, + "epoch": 2.4285508343244313, + "grad_norm": 0.23837818205356598, + "learning_rate": 2.7381665096765874e-05, + "loss": 0.3977, "step": 67385 }, { - "epoch": 2.37, - "learning_rate": 2.830760950996321e-05, - "loss": 0.2854, + "epoch": 2.428731033985656, + "grad_norm": 0.2140997052192688, + "learning_rate": 2.7378760219169453e-05, + "loss": 0.4186, "step": 67390 }, { - "epoch": 2.37, - "learning_rate": 2.830478580990506e-05, - "loss": 0.2671, + "epoch": 2.4289112336468808, + "grad_norm": 0.25618883967399597, + "learning_rate": 2.7375855309162886e-05, + "loss": 0.3869, "step": 67395 }, { - "epoch": 2.37, - "learning_rate": 2.8301962066936437e-05, - "loss": 0.2981, + "epoch": 2.4290914333081055, + "grad_norm": 0.26297783851623535, + "learning_rate": 2.7372950366785748e-05, + "loss": 0.411, "step": 67400 }, { - "epoch": 2.37, - "learning_rate": 2.829913828109401e-05, - "loss": 0.2826, + "epoch": 2.4292716329693302, + "grad_norm": 0.22148928046226501, + "learning_rate": 2.7370045392077626e-05, + "loss": 0.3997, "step": 67405 }, { - "epoch": 2.37, - "learning_rate": 2.8296314452414453e-05, - "loss": 0.2819, + "epoch": 2.4294518326305545, + "grad_norm": 0.21536856889724731, + "learning_rate": 2.736714038507808e-05, + "loss": 0.4008, "step": 67410 }, { - "epoch": 2.37, - "learning_rate": 2.8293490580934422e-05, - "loss": 0.2803, + "epoch": 2.4296320322917793, + "grad_norm": 0.18646790087223053, + "learning_rate": 2.736423534582672e-05, + "loss": 0.3861, "step": 67415 }, { - "epoch": 2.37, - "learning_rate": 2.829066666669059e-05, - "loss": 0.2906, + "epoch": 2.429812231953004, + "grad_norm": 0.17240692675113678, + "learning_rate": 2.7361330274363113e-05, + "loss": 0.3718, "step": 67420 }, { - "epoch": 2.37, - "learning_rate": 2.8287842709719618e-05, - "loss": 0.27, + "epoch": 2.4299924316142287, + "grad_norm": 
0.17794083058834076, + "learning_rate": 2.735842517072683e-05, + "loss": 0.4088, "step": 67425 }, { - "epoch": 2.37, - "learning_rate": 2.8285018710058176e-05, - "loss": 0.2333, + "epoch": 2.430172631275453, + "grad_norm": 0.17455384135246277, + "learning_rate": 2.735552003495746e-05, + "loss": 0.4008, "step": 67430 }, { - "epoch": 2.37, - "learning_rate": 2.828219466774293e-05, - "loss": 0.2658, + "epoch": 2.4303528309366778, + "grad_norm": 0.22316579520702362, + "learning_rate": 2.735261486709459e-05, + "loss": 0.4001, "step": 67435 }, { - "epoch": 2.37, - "learning_rate": 2.8279370582810544e-05, - "loss": 0.2773, + "epoch": 2.4305330305979025, + "grad_norm": 0.18151316046714783, + "learning_rate": 2.734970966717779e-05, + "loss": 0.4002, "step": 67440 }, { - "epoch": 2.37, - "learning_rate": 2.8276546455297694e-05, - "loss": 0.2568, + "epoch": 2.4307132302591272, + "grad_norm": 0.27039554715156555, + "learning_rate": 2.7346804435246658e-05, + "loss": 0.4181, "step": 67445 }, { - "epoch": 2.37, - "learning_rate": 2.8273722285241056e-05, - "loss": 0.2769, + "epoch": 2.430893429920352, + "grad_norm": 0.24719054996967316, + "learning_rate": 2.734389917134077e-05, + "loss": 0.417, "step": 67450 }, { - "epoch": 2.37, - "learning_rate": 2.8270898072677282e-05, - "loss": 0.2584, + "epoch": 2.4310736295815762, + "grad_norm": 0.2097766101360321, + "learning_rate": 2.7340993875499704e-05, + "loss": 0.387, "step": 67455 }, { - "epoch": 2.37, - "learning_rate": 2.8268073817643053e-05, - "loss": 0.2837, + "epoch": 2.431253829242801, + "grad_norm": 0.23086480796337128, + "learning_rate": 2.7338088547763057e-05, + "loss": 0.4455, "step": 67460 }, { - "epoch": 2.37, - "learning_rate": 2.826524952017504e-05, - "loss": 0.2787, + "epoch": 2.4314340289040257, + "grad_norm": 0.21231964230537415, + "learning_rate": 2.7335183188170398e-05, + "loss": 0.3746, "step": 67465 }, { - "epoch": 2.37, - "learning_rate": 2.8262425180309927e-05, - "loss": 0.2792, + "epoch": 2.4316142285652504, + "grad_norm": 0.1851814240217209, + "learning_rate": 2.733227779676133e-05, + "loss": 0.4, "step": 67470 }, { - "epoch": 2.37, - "learning_rate": 2.8259600798084356e-05, - "loss": 0.2781, + "epoch": 2.4317944282264747, + "grad_norm": 0.1971953958272934, + "learning_rate": 2.7329372373575418e-05, + "loss": 0.398, "step": 67475 }, { - "epoch": 2.37, - "learning_rate": 2.8256776373535026e-05, - "loss": 0.2648, + "epoch": 2.4319746278876995, + "grad_norm": 0.24778111279010773, + "learning_rate": 2.7326466918652267e-05, + "loss": 0.4247, "step": 67480 }, { - "epoch": 2.37, - "learning_rate": 2.82539519066986e-05, - "loss": 0.2737, + "epoch": 2.432154827548924, + "grad_norm": 0.20682400465011597, + "learning_rate": 2.7323561432031452e-05, + "loss": 0.383, "step": 67485 }, { - "epoch": 2.37, - "learning_rate": 2.8251127397611748e-05, - "loss": 0.256, + "epoch": 2.432335027210149, + "grad_norm": 0.20953048765659332, + "learning_rate": 2.7320655913752558e-05, + "loss": 0.4237, "step": 67490 }, { - "epoch": 2.37, - "learning_rate": 2.8248302846311154e-05, - "loss": 0.2713, + "epoch": 2.4325152268713737, + "grad_norm": 0.20623236894607544, + "learning_rate": 2.7317750363855184e-05, + "loss": 0.3984, "step": 67495 }, { - "epoch": 2.37, - "learning_rate": 2.824547825283349e-05, - "loss": 0.2877, + "epoch": 2.432695426532598, + "grad_norm": 0.19964554905891418, + "learning_rate": 2.7314844782378907e-05, + "loss": 0.3993, "step": 67500 }, { - "epoch": 2.37, - "eval_loss": 0.2662014961242676, - "eval_runtime": 10.538, - "eval_samples_per_second": 
9.489, - "eval_steps_per_second": 9.489, + "epoch": 2.432695426532598, + "eval_loss": 0.43458136916160583, + "eval_runtime": 3.5307, + "eval_samples_per_second": 28.323, + "eval_steps_per_second": 7.081, "step": 67500 }, { - "epoch": 2.38, - "learning_rate": 2.8242653617215424e-05, - "loss": 0.2656, + "epoch": 2.4328756261938227, + "grad_norm": 0.2280506193637848, + "learning_rate": 2.731193916936331e-05, + "loss": 0.3918, "step": 67505 }, { - "epoch": 2.38, - "learning_rate": 2.8239828939493644e-05, - "loss": 0.2672, + "epoch": 2.4330558258550474, + "grad_norm": 0.2009294182062149, + "learning_rate": 2.7309033524848e-05, + "loss": 0.3965, "step": 67510 }, { - "epoch": 2.38, - "learning_rate": 2.8237004219704817e-05, - "loss": 0.2692, + "epoch": 2.433236025516272, + "grad_norm": 0.18741028010845184, + "learning_rate": 2.7306127848872547e-05, + "loss": 0.3826, "step": 67515 }, { - "epoch": 2.38, - "learning_rate": 2.823417945788562e-05, - "loss": 0.277, + "epoch": 2.4334162251774965, + "grad_norm": 0.2168714553117752, + "learning_rate": 2.7303222141476566e-05, + "loss": 0.3582, "step": 67520 }, { - "epoch": 2.38, - "learning_rate": 2.8231354654072733e-05, - "loss": 0.2665, + "epoch": 2.433596424838721, + "grad_norm": 0.2223115712404251, + "learning_rate": 2.730031640269961e-05, + "loss": 0.3941, "step": 67525 }, { - "epoch": 2.38, - "learning_rate": 2.8228529808302845e-05, - "loss": 0.2728, + "epoch": 2.433776624499946, + "grad_norm": 0.24095144867897034, + "learning_rate": 2.7297410632581293e-05, + "loss": 0.4198, "step": 67530 }, { - "epoch": 2.38, - "learning_rate": 2.8225704920612616e-05, - "loss": 0.2625, + "epoch": 2.4339568241611707, + "grad_norm": 0.162398099899292, + "learning_rate": 2.72945048311612e-05, + "loss": 0.3884, "step": 67535 }, { - "epoch": 2.38, - "learning_rate": 2.8222879991038736e-05, - "loss": 0.2947, + "epoch": 2.4341370238223954, + "grad_norm": 0.2290612757205963, + "learning_rate": 2.7291598998478927e-05, + "loss": 0.3902, "step": 67540 }, { - "epoch": 2.38, - "learning_rate": 2.8220055019617887e-05, - "loss": 0.2678, + "epoch": 2.4343172234836197, + "grad_norm": 0.19016504287719727, + "learning_rate": 2.7288693134574063e-05, + "loss": 0.3899, "step": 67545 }, { - "epoch": 2.38, - "learning_rate": 2.821723000638674e-05, - "loss": 0.2809, + "epoch": 2.4344974231448444, + "grad_norm": 0.19795559346675873, + "learning_rate": 2.7285787239486192e-05, + "loss": 0.3749, "step": 67550 }, { - "epoch": 2.38, - "learning_rate": 2.821440495138199e-05, - "loss": 0.2604, + "epoch": 2.434677622806069, + "grad_norm": 0.18482926487922668, + "learning_rate": 2.7282881313254916e-05, + "loss": 0.3787, "step": 67555 }, { - "epoch": 2.38, - "learning_rate": 2.82115798546403e-05, - "loss": 0.2859, + "epoch": 2.434857822467294, + "grad_norm": 0.23885639011859894, + "learning_rate": 2.727997535591983e-05, + "loss": 0.4029, "step": 67560 }, { - "epoch": 2.38, - "learning_rate": 2.8208754716198376e-05, - "loss": 0.2527, + "epoch": 2.435038022128518, + "grad_norm": 0.17567263543605804, + "learning_rate": 2.7277069367520513e-05, + "loss": 0.3902, "step": 67565 }, { - "epoch": 2.38, - "learning_rate": 2.8205929536092878e-05, - "loss": 0.2474, + "epoch": 2.435218221789743, + "grad_norm": 0.195444256067276, + "learning_rate": 2.7274163348096564e-05, + "loss": 0.4088, "step": 67570 }, { - "epoch": 2.38, - "learning_rate": 2.8203104314360495e-05, - "loss": 0.2517, + "epoch": 2.4353984214509676, + "grad_norm": 0.20873726904392242, + "learning_rate": 2.727125729768759e-05, + "loss": 0.4304, 
"step": 67575 }, { - "epoch": 2.38, - "learning_rate": 2.8200279051037914e-05, - "loss": 0.305, + "epoch": 2.4355786211121924, + "grad_norm": 0.22101178765296936, + "learning_rate": 2.7268351216333164e-05, + "loss": 0.4032, "step": 67580 }, { - "epoch": 2.38, - "learning_rate": 2.8197453746161827e-05, - "loss": 0.2641, + "epoch": 2.435758820773417, + "grad_norm": 0.23092830181121826, + "learning_rate": 2.72654451040729e-05, + "loss": 0.4214, "step": 67585 }, { - "epoch": 2.38, - "learning_rate": 2.81946283997689e-05, - "loss": 0.2854, + "epoch": 2.4359390204346414, + "grad_norm": 0.21683692932128906, + "learning_rate": 2.7262538960946382e-05, + "loss": 0.4102, "step": 67590 }, { - "epoch": 2.38, - "learning_rate": 2.819180301189584e-05, - "loss": 0.2652, + "epoch": 2.436119220095866, + "grad_norm": 0.21636758744716644, + "learning_rate": 2.7259632786993205e-05, + "loss": 0.3927, "step": 67595 }, { - "epoch": 2.38, - "learning_rate": 2.8188977582579308e-05, - "loss": 0.2688, + "epoch": 2.436299419757091, + "grad_norm": 0.22299599647521973, + "learning_rate": 2.725672658225298e-05, + "loss": 0.3997, "step": 67600 }, { - "epoch": 2.38, - "learning_rate": 2.8186152111856013e-05, - "loss": 0.273, + "epoch": 2.4364796194183156, + "grad_norm": 0.1862391084432602, + "learning_rate": 2.7253820346765285e-05, + "loss": 0.4002, "step": 67605 }, { - "epoch": 2.38, - "learning_rate": 2.8183326599762627e-05, - "loss": 0.2844, + "epoch": 2.4366598190795403, + "grad_norm": 0.18761450052261353, + "learning_rate": 2.725091408056972e-05, + "loss": 0.3688, "step": 67610 }, { - "epoch": 2.38, - "learning_rate": 2.818050104633585e-05, - "loss": 0.2694, + "epoch": 2.4368400187407646, + "grad_norm": 0.19558748602867126, + "learning_rate": 2.724800778370589e-05, + "loss": 0.4267, "step": 67615 }, { - "epoch": 2.38, - "learning_rate": 2.8177675451612358e-05, - "loss": 0.2896, + "epoch": 2.4370202184019893, + "grad_norm": 0.154653400182724, + "learning_rate": 2.724510145621339e-05, + "loss": 0.3866, "step": 67620 }, { - "epoch": 2.38, - "learning_rate": 2.8174849815628846e-05, - "loss": 0.2841, + "epoch": 2.437200418063214, + "grad_norm": 0.18840929865837097, + "learning_rate": 2.7242195098131824e-05, + "loss": 0.3955, "step": 67625 }, { - "epoch": 2.38, - "learning_rate": 2.8172024138422e-05, - "loss": 0.2586, + "epoch": 2.437380617724439, + "grad_norm": 0.2518814206123352, + "learning_rate": 2.7239288709500772e-05, + "loss": 0.4223, "step": 67630 }, { - "epoch": 2.38, - "learning_rate": 2.816919842002851e-05, - "loss": 0.2634, + "epoch": 2.4375608173856635, + "grad_norm": 0.2483137845993042, + "learning_rate": 2.723638229035985e-05, + "loss": 0.3919, "step": 67635 }, { - "epoch": 2.38, - "learning_rate": 2.816637266048507e-05, - "loss": 0.2691, + "epoch": 2.437741017046888, + "grad_norm": 0.18424572050571442, + "learning_rate": 2.7233475840748645e-05, + "loss": 0.3912, "step": 67640 }, { - "epoch": 2.38, - "learning_rate": 2.816354685982837e-05, - "loss": 0.2768, + "epoch": 2.4379212167081126, + "grad_norm": 0.18739376962184906, + "learning_rate": 2.7230569360706777e-05, + "loss": 0.4361, "step": 67645 }, { - "epoch": 2.38, - "learning_rate": 2.8160721018095094e-05, - "loss": 0.2891, + "epoch": 2.4381014163693373, + "grad_norm": 0.16617445647716522, + "learning_rate": 2.7227662850273827e-05, + "loss": 0.3941, "step": 67650 }, { - "epoch": 2.38, - "learning_rate": 2.815789513532194e-05, - "loss": 0.2806, + "epoch": 2.438281616030562, + "grad_norm": 0.23537923395633698, + "learning_rate": 2.7224756309489403e-05, + 
"loss": 0.3962, "step": 67655 }, { - "epoch": 2.38, - "learning_rate": 2.8155069211545594e-05, - "loss": 0.272, + "epoch": 2.4384618156917863, + "grad_norm": 0.21504931151866913, + "learning_rate": 2.7221849738393103e-05, + "loss": 0.4205, "step": 67660 }, { - "epoch": 2.38, - "learning_rate": 2.8152243246802757e-05, - "loss": 0.2733, + "epoch": 2.438642015353011, + "grad_norm": 0.18787220120429993, + "learning_rate": 2.7218943137024532e-05, + "loss": 0.3643, "step": 67665 }, { - "epoch": 2.38, - "learning_rate": 2.8149417241130122e-05, - "loss": 0.266, + "epoch": 2.438822215014236, + "grad_norm": 0.24440403282642365, + "learning_rate": 2.7216036505423293e-05, + "loss": 0.3797, "step": 67670 }, { - "epoch": 2.38, - "learning_rate": 2.8146591194564377e-05, - "loss": 0.2856, + "epoch": 2.4390024146754605, + "grad_norm": 0.2223052680492401, + "learning_rate": 2.721312984362898e-05, + "loss": 0.4005, "step": 67675 }, { - "epoch": 2.38, - "learning_rate": 2.8143765107142218e-05, - "loss": 0.2567, + "epoch": 2.4391826143366853, + "grad_norm": 0.18885168433189392, + "learning_rate": 2.7210223151681207e-05, + "loss": 0.3727, "step": 67680 }, { - "epoch": 2.38, - "learning_rate": 2.8140938978900345e-05, - "loss": 0.284, + "epoch": 2.4393628139979096, + "grad_norm": 0.1814367175102234, + "learning_rate": 2.7207316429619566e-05, + "loss": 0.3837, "step": 67685 }, { - "epoch": 2.38, - "learning_rate": 2.813811280987545e-05, - "loss": 0.2534, + "epoch": 2.4395430136591343, + "grad_norm": 0.18942394852638245, + "learning_rate": 2.7204409677483668e-05, + "loss": 0.4423, "step": 67690 }, { - "epoch": 2.38, - "learning_rate": 2.8135286600104222e-05, - "loss": 0.2557, + "epoch": 2.439723213320359, + "grad_norm": 0.16898776590824127, + "learning_rate": 2.720150289531312e-05, + "loss": 0.3894, "step": 67695 }, { - "epoch": 2.38, - "learning_rate": 2.813246034962337e-05, - "loss": 0.252, + "epoch": 2.4399034129815838, + "grad_norm": 0.1935228556394577, + "learning_rate": 2.719859608314751e-05, + "loss": 0.4158, "step": 67700 }, { - "epoch": 2.38, - "learning_rate": 2.812963405846958e-05, - "loss": 0.2668, + "epoch": 2.440083612642808, + "grad_norm": 0.22555245459079742, + "learning_rate": 2.719568924102647e-05, + "loss": 0.3861, "step": 67705 }, { - "epoch": 2.38, - "learning_rate": 2.8126807726679556e-05, - "loss": 0.2756, + "epoch": 2.4402638123040328, + "grad_norm": 0.22015783190727234, + "learning_rate": 2.7192782368989577e-05, + "loss": 0.3656, "step": 67710 }, { - "epoch": 2.38, - "learning_rate": 2.812398135428999e-05, - "loss": 0.2436, + "epoch": 2.4404440119652575, + "grad_norm": 0.20583826303482056, + "learning_rate": 2.7189875467076454e-05, + "loss": 0.4075, "step": 67715 }, { - "epoch": 2.38, - "learning_rate": 2.812115494133759e-05, - "loss": 0.2669, + "epoch": 2.4406242116264822, + "grad_norm": 0.2194293737411499, + "learning_rate": 2.7186968535326694e-05, + "loss": 0.4094, "step": 67720 }, { - "epoch": 2.38, - "learning_rate": 2.8118328487859053e-05, - "loss": 0.2457, + "epoch": 2.440804411287707, + "grad_norm": 0.15374071896076202, + "learning_rate": 2.7184061573779918e-05, + "loss": 0.3868, "step": 67725 }, { - "epoch": 2.38, - "learning_rate": 2.8115501993891074e-05, - "loss": 0.3102, + "epoch": 2.4409846109489313, + "grad_norm": 0.2262052297592163, + "learning_rate": 2.7181154582475726e-05, + "loss": 0.4121, "step": 67730 }, { - "epoch": 2.38, - "learning_rate": 2.811267545947035e-05, - "loss": 0.2651, + "epoch": 2.441164810610156, + "grad_norm": 0.21393641829490662, + "learning_rate": 
2.7178247561453724e-05, + "loss": 0.3887, "step": 67735 }, { - "epoch": 2.38, - "learning_rate": 2.8109848884633588e-05, - "loss": 0.2928, + "epoch": 2.4413450102713807, + "grad_norm": 0.23438535630702972, + "learning_rate": 2.717534051075352e-05, + "loss": 0.4055, "step": 67740 }, { - "epoch": 2.38, - "learning_rate": 2.8107022269417488e-05, - "loss": 0.2644, + "epoch": 2.4415252099326055, + "grad_norm": 0.1914755254983902, + "learning_rate": 2.7172433430414725e-05, + "loss": 0.4044, "step": 67745 }, { - "epoch": 2.38, - "learning_rate": 2.8104195613858754e-05, - "loss": 0.2744, + "epoch": 2.4417054095938298, + "grad_norm": 0.22475279867649078, + "learning_rate": 2.7169526320476944e-05, + "loss": 0.3659, "step": 67750 }, { - "epoch": 2.38, - "learning_rate": 2.8101368917994082e-05, - "loss": 0.2925, + "epoch": 2.4418856092550545, + "grad_norm": 0.2241634875535965, + "learning_rate": 2.7166619180979784e-05, + "loss": 0.4037, "step": 67755 }, { - "epoch": 2.38, - "learning_rate": 2.8098542181860183e-05, - "loss": 0.2604, + "epoch": 2.4420658089162792, + "grad_norm": 0.23309601843357086, + "learning_rate": 2.716371201196286e-05, + "loss": 0.3942, "step": 67760 }, { - "epoch": 2.38, - "learning_rate": 2.809571540549375e-05, - "loss": 0.2893, + "epoch": 2.442246008577504, + "grad_norm": 0.20057342946529388, + "learning_rate": 2.716080481346578e-05, + "loss": 0.3939, "step": 67765 }, { - "epoch": 2.38, - "learning_rate": 2.80928885889315e-05, - "loss": 0.2932, + "epoch": 2.4424262082387287, + "grad_norm": 0.19580478966236115, + "learning_rate": 2.715789758552815e-05, + "loss": 0.3698, "step": 67770 }, { - "epoch": 2.38, - "learning_rate": 2.809006173221012e-05, - "loss": 0.2549, + "epoch": 2.442606407899953, + "grad_norm": 0.21272897720336914, + "learning_rate": 2.7154990328189583e-05, + "loss": 0.4033, "step": 67775 }, { - "epoch": 2.38, - "learning_rate": 2.8087234835366333e-05, - "loss": 0.2662, + "epoch": 2.4427866075611777, + "grad_norm": 0.22313345968723297, + "learning_rate": 2.715208304148969e-05, + "loss": 0.3641, "step": 67780 }, { - "epoch": 2.38, - "learning_rate": 2.8084407898436837e-05, - "loss": 0.2651, + "epoch": 2.4429668072224024, + "grad_norm": 0.20916157960891724, + "learning_rate": 2.7149175725468086e-05, + "loss": 0.3875, "step": 67785 }, { - "epoch": 2.39, - "learning_rate": 2.8081580921458335e-05, - "loss": 0.2785, + "epoch": 2.443147006883627, + "grad_norm": 0.20672792196273804, + "learning_rate": 2.7146268380164376e-05, + "loss": 0.4256, "step": 67790 }, { - "epoch": 2.39, - "learning_rate": 2.8078753904467532e-05, - "loss": 0.2767, + "epoch": 2.4433272065448515, + "grad_norm": 0.19972656667232513, + "learning_rate": 2.7143361005618168e-05, + "loss": 0.4102, "step": 67795 }, { - "epoch": 2.39, - "learning_rate": 2.8075926847501137e-05, - "loss": 0.2838, + "epoch": 2.443507406206076, + "grad_norm": 0.18391171097755432, + "learning_rate": 2.7140453601869093e-05, + "loss": 0.4145, "step": 67800 }, { - "epoch": 2.39, - "learning_rate": 2.8073099750595866e-05, - "loss": 0.2798, + "epoch": 2.443687605867301, + "grad_norm": 0.17627693712711334, + "learning_rate": 2.713754616895674e-05, + "loss": 0.3597, "step": 67805 }, { - "epoch": 2.39, - "learning_rate": 2.8070272613788417e-05, - "loss": 0.276, + "epoch": 2.4438678055285257, + "grad_norm": 0.25177159905433655, + "learning_rate": 2.7134638706920752e-05, + "loss": 0.4215, "step": 67810 }, { - "epoch": 2.39, - "learning_rate": 2.8067445437115504e-05, - "loss": 0.2661, + "epoch": 2.4440480051897504, + "grad_norm": 
0.2532104253768921, + "learning_rate": 2.713173121580071e-05, + "loss": 0.3984, "step": 67815 }, { - "epoch": 2.39, - "learning_rate": 2.8064618220613826e-05, - "loss": 0.2742, + "epoch": 2.4442282048509747, + "grad_norm": 0.19765953719615936, + "learning_rate": 2.7128823695636253e-05, + "loss": 0.4179, "step": 67820 }, { - "epoch": 2.39, - "learning_rate": 2.8061790964320108e-05, - "loss": 0.2534, + "epoch": 2.4444084045121994, + "grad_norm": 0.2110573947429657, + "learning_rate": 2.7125916146466973e-05, + "loss": 0.3967, "step": 67825 }, { - "epoch": 2.39, - "learning_rate": 2.8058963668271047e-05, - "loss": 0.2752, + "epoch": 2.444588604173424, + "grad_norm": 0.22106719017028809, + "learning_rate": 2.7123008568332504e-05, + "loss": 0.4195, "step": 67830 }, { - "epoch": 2.39, - "learning_rate": 2.8056136332503362e-05, - "loss": 0.2537, + "epoch": 2.444768803834649, + "grad_norm": 0.21176251769065857, + "learning_rate": 2.7120100961272455e-05, + "loss": 0.3917, "step": 67835 }, { - "epoch": 2.39, - "learning_rate": 2.805330895705376e-05, - "loss": 0.2581, + "epoch": 2.444949003495873, + "grad_norm": 0.2130795568227768, + "learning_rate": 2.711719332532644e-05, + "loss": 0.3974, "step": 67840 }, { - "epoch": 2.39, - "learning_rate": 2.805048154195895e-05, - "loss": 0.2517, + "epoch": 2.445129203157098, + "grad_norm": 0.20041079819202423, + "learning_rate": 2.7114285660534078e-05, + "loss": 0.415, "step": 67845 }, { - "epoch": 2.39, - "learning_rate": 2.804765408725565e-05, - "loss": 0.2553, + "epoch": 2.4453094028183227, + "grad_norm": 0.2205459475517273, + "learning_rate": 2.711137796693498e-05, + "loss": 0.4196, "step": 67850 }, { - "epoch": 2.39, - "learning_rate": 2.8044826592980577e-05, - "loss": 0.2766, + "epoch": 2.4454896024795474, + "grad_norm": 0.2311592847108841, + "learning_rate": 2.7108470244568767e-05, + "loss": 0.4111, "step": 67855 }, { - "epoch": 2.39, - "learning_rate": 2.8041999059170426e-05, - "loss": 0.2772, + "epoch": 2.445669802140772, + "grad_norm": 0.2623636722564697, + "learning_rate": 2.7105562493475052e-05, + "loss": 0.4283, "step": 67860 }, { - "epoch": 2.39, - "learning_rate": 2.8039171485861932e-05, - "loss": 0.2644, + "epoch": 2.445850001801997, + "grad_norm": 0.21312451362609863, + "learning_rate": 2.710265471369346e-05, + "loss": 0.3945, "step": 67865 }, { - "epoch": 2.39, - "learning_rate": 2.8036343873091793e-05, - "loss": 0.2892, + "epoch": 2.446030201463221, + "grad_norm": 0.1985563039779663, + "learning_rate": 2.7099746905263606e-05, + "loss": 0.356, "step": 67870 }, { - "epoch": 2.39, - "learning_rate": 2.803351622089673e-05, - "loss": 0.2932, + "epoch": 2.446210401124446, + "grad_norm": 0.20180106163024902, + "learning_rate": 2.70968390682251e-05, + "loss": 0.3865, "step": 67875 }, { - "epoch": 2.39, - "learning_rate": 2.803068852931346e-05, - "loss": 0.2749, + "epoch": 2.4463906007856706, + "grad_norm": 0.1858830749988556, + "learning_rate": 2.7093931202617572e-05, + "loss": 0.417, "step": 67880 }, { - "epoch": 2.39, - "learning_rate": 2.8027860798378697e-05, - "loss": 0.2781, + "epoch": 2.4465708004468953, + "grad_norm": 0.2230648547410965, + "learning_rate": 2.7091023308480632e-05, + "loss": 0.4184, "step": 67885 }, { - "epoch": 2.39, - "learning_rate": 2.8025033028129155e-05, - "loss": 0.2682, + "epoch": 2.4467510001081196, + "grad_norm": 0.24601806700229645, + "learning_rate": 2.708811538585392e-05, + "loss": 0.4014, "step": 67890 }, { - "epoch": 2.39, - "learning_rate": 2.802220521860155e-05, - "loss": 0.2645, + "epoch": 2.4469311997693444, + 
"grad_norm": 0.1883206069469452, + "learning_rate": 2.7085207434777026e-05, + "loss": 0.3919, "step": 67895 }, { - "epoch": 2.39, - "learning_rate": 2.8019377369832605e-05, - "loss": 0.2776, + "epoch": 2.447111399430569, + "grad_norm": 0.22426816821098328, + "learning_rate": 2.7082299455289588e-05, + "loss": 0.4058, "step": 67900 }, { - "epoch": 2.39, - "learning_rate": 2.8016549481859035e-05, - "loss": 0.2684, + "epoch": 2.447291599091794, + "grad_norm": 0.2288169413805008, + "learning_rate": 2.7079391447431218e-05, + "loss": 0.3892, "step": 67905 }, { - "epoch": 2.39, - "learning_rate": 2.801372155471756e-05, - "loss": 0.2765, + "epoch": 2.4474717987530186, + "grad_norm": 0.2036815732717514, + "learning_rate": 2.7076483411241547e-05, + "loss": 0.3782, "step": 67910 }, { - "epoch": 2.39, - "learning_rate": 2.8010893588444892e-05, - "loss": 0.2764, + "epoch": 2.447651998414243, + "grad_norm": 0.1831190586090088, + "learning_rate": 2.7073575346760194e-05, + "loss": 0.3705, "step": 67915 }, { - "epoch": 2.39, - "learning_rate": 2.8008065583077763e-05, - "loss": 0.2738, + "epoch": 2.4478321980754676, + "grad_norm": 0.1885637491941452, + "learning_rate": 2.707066725402677e-05, + "loss": 0.4212, "step": 67920 }, { - "epoch": 2.39, - "learning_rate": 2.8005237538652878e-05, - "loss": 0.2562, + "epoch": 2.4480123977366923, + "grad_norm": 0.202529639005661, + "learning_rate": 2.7067759133080917e-05, + "loss": 0.4009, "step": 67925 }, { - "epoch": 2.39, - "learning_rate": 2.8002409455206964e-05, - "loss": 0.2766, + "epoch": 2.448192597397917, + "grad_norm": 0.22661788761615753, + "learning_rate": 2.7064850983962243e-05, + "loss": 0.4497, "step": 67930 }, { - "epoch": 2.39, - "learning_rate": 2.799958133277674e-05, - "loss": 0.272, + "epoch": 2.4483727970591413, + "grad_norm": 0.2267916351556778, + "learning_rate": 2.7061942806710368e-05, + "loss": 0.3874, "step": 67935 }, { - "epoch": 2.39, - "learning_rate": 2.7996753171398937e-05, - "loss": 0.2884, + "epoch": 2.448552996720366, + "grad_norm": 0.22525878250598907, + "learning_rate": 2.7059034601364925e-05, + "loss": 0.3741, "step": 67940 }, { - "epoch": 2.39, - "learning_rate": 2.7993924971110262e-05, - "loss": 0.2617, + "epoch": 2.448733196381591, + "grad_norm": 0.18289059400558472, + "learning_rate": 2.705612636796553e-05, + "loss": 0.3917, "step": 67945 }, { - "epoch": 2.39, - "learning_rate": 2.7991096731947454e-05, - "loss": 0.2923, + "epoch": 2.4489133960428155, + "grad_norm": 0.24663716554641724, + "learning_rate": 2.7053218106551825e-05, + "loss": 0.3758, "step": 67950 }, { - "epoch": 2.39, - "learning_rate": 2.798826845394722e-05, - "loss": 0.2731, + "epoch": 2.4490935957040403, + "grad_norm": 0.25891220569610596, + "learning_rate": 2.7050309817163407e-05, + "loss": 0.4179, "step": 67955 }, { - "epoch": 2.39, - "learning_rate": 2.7985440137146283e-05, - "loss": 0.2692, + "epoch": 2.4492737953652646, + "grad_norm": 0.20873096585273743, + "learning_rate": 2.704740149983992e-05, + "loss": 0.4246, "step": 67960 }, { - "epoch": 2.39, - "learning_rate": 2.7982611781581382e-05, - "loss": 0.2848, + "epoch": 2.4494539950264893, + "grad_norm": 0.2050144225358963, + "learning_rate": 2.7044493154620975e-05, + "loss": 0.4267, "step": 67965 }, { - "epoch": 2.39, - "learning_rate": 2.7979783387289233e-05, - "loss": 0.2871, + "epoch": 2.449634194687714, + "grad_norm": 0.189696803689003, + "learning_rate": 2.704158478154622e-05, + "loss": 0.3697, "step": 67970 }, { - "epoch": 2.39, - "learning_rate": 2.7976954954306557e-05, - "loss": 0.2772, + "epoch": 
2.4498143943489388, + "grad_norm": 0.2150600403547287, + "learning_rate": 2.7038676380655265e-05, + "loss": 0.3711, "step": 67975 }, { - "epoch": 2.39, - "learning_rate": 2.7974126482670078e-05, - "loss": 0.2792, + "epoch": 2.449994594010163, + "grad_norm": 0.2863604724407196, + "learning_rate": 2.7035767951987735e-05, + "loss": 0.4377, "step": 67980 }, { - "epoch": 2.39, - "learning_rate": 2.7971297972416527e-05, - "loss": 0.2637, + "epoch": 2.450174793671388, + "grad_norm": 0.21974705159664154, + "learning_rate": 2.7032859495583258e-05, + "loss": 0.4083, "step": 67985 }, { - "epoch": 2.39, - "learning_rate": 2.7968469423582637e-05, - "loss": 0.264, + "epoch": 2.4503549933326125, + "grad_norm": 0.22870804369449615, + "learning_rate": 2.7029951011481464e-05, + "loss": 0.3969, "step": 67990 }, { - "epoch": 2.39, - "learning_rate": 2.7965640836205127e-05, - "loss": 0.2702, + "epoch": 2.4505351929938373, + "grad_norm": 0.19309473037719727, + "learning_rate": 2.7027042499721994e-05, + "loss": 0.3831, "step": 67995 }, { - "epoch": 2.39, - "learning_rate": 2.7962812210320726e-05, - "loss": 0.2813, + "epoch": 2.450715392655062, + "grad_norm": 0.22232699394226074, + "learning_rate": 2.702413396034445e-05, + "loss": 0.3634, "step": 68000 }, { - "epoch": 2.39, - "eval_loss": 0.2646848261356354, - "eval_runtime": 10.557, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 2.450715392655062, + "eval_loss": 0.43447232246398926, + "eval_runtime": 3.5294, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 7.083, "step": 68000 }, { - "epoch": 2.39, - "learning_rate": 2.7959983545966155e-05, - "loss": 0.2767, + "epoch": 2.4508955923162863, + "grad_norm": 0.16693948209285736, + "learning_rate": 2.7021225393388477e-05, + "loss": 0.3795, "step": 68005 }, { - "epoch": 2.39, - "learning_rate": 2.7957154843178153e-05, - "loss": 0.2649, + "epoch": 2.451075791977511, + "grad_norm": 0.19517579674720764, + "learning_rate": 2.7018316798893695e-05, + "loss": 0.396, "step": 68010 }, { - "epoch": 2.39, - "learning_rate": 2.7954326101993445e-05, - "loss": 0.2803, + "epoch": 2.4512559916387358, + "grad_norm": 0.18335868418216705, + "learning_rate": 2.701540817689974e-05, + "loss": 0.3768, "step": 68015 }, { - "epoch": 2.39, - "learning_rate": 2.7951497322448757e-05, - "loss": 0.2935, + "epoch": 2.4514361912999605, + "grad_norm": 0.24918295443058014, + "learning_rate": 2.701249952744624e-05, + "loss": 0.3984, "step": 68020 }, { - "epoch": 2.39, - "learning_rate": 2.794866850458082e-05, - "loss": 0.2573, + "epoch": 2.4516163909611848, + "grad_norm": 0.1807776242494583, + "learning_rate": 2.700959085057282e-05, + "loss": 0.41, "step": 68025 }, { - "epoch": 2.39, - "learning_rate": 2.794583964842637e-05, - "loss": 0.2796, + "epoch": 2.4517965906224095, + "grad_norm": 0.18439331650733948, + "learning_rate": 2.700668214631912e-05, + "loss": 0.3853, "step": 68030 }, { - "epoch": 2.39, - "learning_rate": 2.7943010754022137e-05, - "loss": 0.2686, + "epoch": 2.4519767902836342, + "grad_norm": 0.19809627532958984, + "learning_rate": 2.700377341472476e-05, + "loss": 0.3885, "step": 68035 }, { - "epoch": 2.39, - "learning_rate": 2.7940181821404832e-05, - "loss": 0.263, + "epoch": 2.452156989944859, + "grad_norm": 0.22589033842086792, + "learning_rate": 2.7000864655829377e-05, + "loss": 0.4058, "step": 68040 }, { - "epoch": 2.39, - "learning_rate": 2.793735285061122e-05, - "loss": 0.2584, + "epoch": 2.4523371896060837, + "grad_norm": 0.2336047887802124, + "learning_rate": 2.699795586967259e-05, 
+ "loss": 0.435, "step": 68045 }, { - "epoch": 2.39, - "learning_rate": 2.7934523841678013e-05, - "loss": 0.2866, + "epoch": 2.452517389267308, + "grad_norm": 0.20116937160491943, + "learning_rate": 2.6995047056294054e-05, + "loss": 0.3978, "step": 68050 }, { - "epoch": 2.39, - "learning_rate": 2.793169479464195e-05, - "loss": 0.2616, + "epoch": 2.4526975889285327, + "grad_norm": 0.2507151961326599, + "learning_rate": 2.6992138215733385e-05, + "loss": 0.4326, "step": 68055 }, { - "epoch": 2.39, - "learning_rate": 2.7928865709539754e-05, - "loss": 0.253, + "epoch": 2.4528777885897575, + "grad_norm": 0.21996821463108063, + "learning_rate": 2.6989229348030214e-05, + "loss": 0.3858, "step": 68060 }, { - "epoch": 2.39, - "learning_rate": 2.7926036586408172e-05, - "loss": 0.2722, + "epoch": 2.453057988250982, + "grad_norm": 0.2189149707555771, + "learning_rate": 2.698632045322419e-05, + "loss": 0.434, "step": 68065 }, { - "epoch": 2.39, - "learning_rate": 2.7923207425283933e-05, - "loss": 0.2763, + "epoch": 2.4532381879122065, + "grad_norm": 0.17697672545909882, + "learning_rate": 2.6983411531354918e-05, + "loss": 0.3774, "step": 68070 }, { - "epoch": 2.4, - "learning_rate": 2.7920378226203775e-05, - "loss": 0.2651, + "epoch": 2.4534183875734312, + "grad_norm": 0.17975100874900818, + "learning_rate": 2.6980502582462063e-05, + "loss": 0.4279, "step": 68075 }, { - "epoch": 2.4, - "learning_rate": 2.7917548989204422e-05, - "loss": 0.2692, + "epoch": 2.453598587234656, + "grad_norm": 0.2552946209907532, + "learning_rate": 2.6977593606585232e-05, + "loss": 0.3939, "step": 68080 }, { - "epoch": 2.4, - "learning_rate": 2.7914719714322628e-05, - "loss": 0.2624, + "epoch": 2.4537787868958807, + "grad_norm": 0.23870886862277985, + "learning_rate": 2.697468460376408e-05, + "loss": 0.3806, "step": 68085 }, { - "epoch": 2.4, - "learning_rate": 2.7911890401595114e-05, - "loss": 0.2823, + "epoch": 2.4539589865571054, + "grad_norm": 0.27226266264915466, + "learning_rate": 2.6971775574038226e-05, + "loss": 0.4057, "step": 68090 }, { - "epoch": 2.4, - "learning_rate": 2.790906105105862e-05, - "loss": 0.2674, + "epoch": 2.4541391862183297, + "grad_norm": 0.1971409022808075, + "learning_rate": 2.6968866517447317e-05, + "loss": 0.3958, "step": 68095 }, { - "epoch": 2.4, - "learning_rate": 2.7906231662749886e-05, - "loss": 0.2812, + "epoch": 2.4543193858795544, + "grad_norm": 0.21888650953769684, + "learning_rate": 2.696595743403098e-05, + "loss": 0.3885, "step": 68100 }, { - "epoch": 2.4, - "learning_rate": 2.790340223670566e-05, - "loss": 0.262, + "epoch": 2.454499585540779, + "grad_norm": 0.2371811419725418, + "learning_rate": 2.6963048323828848e-05, + "loss": 0.3722, "step": 68105 }, { - "epoch": 2.4, - "learning_rate": 2.790057277296266e-05, - "loss": 0.2699, + "epoch": 2.454679785202004, + "grad_norm": 0.23788423836231232, + "learning_rate": 2.6960139186880574e-05, + "loss": 0.3851, "step": 68110 }, { - "epoch": 2.4, - "learning_rate": 2.7897743271557634e-05, - "loss": 0.2597, + "epoch": 2.4548599848632287, + "grad_norm": 0.23677048087120056, + "learning_rate": 2.6957230023225778e-05, + "loss": 0.4163, "step": 68115 }, { - "epoch": 2.4, - "learning_rate": 2.789491373252732e-05, - "loss": 0.2699, + "epoch": 2.455040184524453, + "grad_norm": 0.17837996780872345, + "learning_rate": 2.6954320832904094e-05, + "loss": 0.388, "step": 68120 }, { - "epoch": 2.4, - "learning_rate": 2.7892084155908465e-05, - "loss": 0.2745, + "epoch": 2.4552203841856777, + "grad_norm": 0.19991345703601837, + "learning_rate": 
2.695141161595518e-05, + "loss": 0.381, "step": 68125 }, { - "epoch": 2.4, - "learning_rate": 2.7889254541737795e-05, - "loss": 0.2705, + "epoch": 2.4554005838469024, + "grad_norm": 0.22142352163791656, + "learning_rate": 2.694850237241865e-05, + "loss": 0.357, "step": 68130 }, { - "epoch": 2.4, - "learning_rate": 2.7886424890052067e-05, - "loss": 0.2734, + "epoch": 2.455580783508127, + "grad_norm": 0.2008516937494278, + "learning_rate": 2.6945593102334166e-05, + "loss": 0.3499, "step": 68135 }, { - "epoch": 2.4, - "learning_rate": 2.7883595200888014e-05, - "loss": 0.2759, + "epoch": 2.455760983169352, + "grad_norm": 0.2108553797006607, + "learning_rate": 2.6942683805741337e-05, + "loss": 0.4014, "step": 68140 }, { - "epoch": 2.4, - "learning_rate": 2.7880765474282374e-05, - "loss": 0.2572, + "epoch": 2.455941182830576, + "grad_norm": 0.21089521050453186, + "learning_rate": 2.693977448267983e-05, + "loss": 0.4221, "step": 68145 }, { - "epoch": 2.4, - "learning_rate": 2.7877935710271884e-05, - "loss": 0.266, + "epoch": 2.456121382491801, + "grad_norm": 0.22154204547405243, + "learning_rate": 2.693686513318926e-05, + "loss": 0.3958, "step": 68150 }, { - "epoch": 2.4, - "learning_rate": 2.7875105908893305e-05, - "loss": 0.2584, + "epoch": 2.4563015821530256, + "grad_norm": 0.17124603688716888, + "learning_rate": 2.693395575730928e-05, + "loss": 0.4072, "step": 68155 }, { - "epoch": 2.4, - "learning_rate": 2.7872276070183373e-05, - "loss": 0.2862, + "epoch": 2.4564817818142504, + "grad_norm": 0.2264595329761505, + "learning_rate": 2.6931046355079538e-05, + "loss": 0.3937, "step": 68160 }, { - "epoch": 2.4, - "learning_rate": 2.7869446194178827e-05, - "loss": 0.2482, + "epoch": 2.4566619814754747, + "grad_norm": 0.22285620868206024, + "learning_rate": 2.692813692653965e-05, + "loss": 0.3988, "step": 68165 }, { - "epoch": 2.4, - "learning_rate": 2.7866616280916412e-05, - "loss": 0.2581, + "epoch": 2.4568421811366994, + "grad_norm": 0.20908579230308533, + "learning_rate": 2.6925227471729275e-05, + "loss": 0.3614, "step": 68170 }, { - "epoch": 2.4, - "learning_rate": 2.7863786330432873e-05, - "loss": 0.2524, + "epoch": 2.457022380797924, + "grad_norm": 0.18623116612434387, + "learning_rate": 2.692231799068805e-05, + "loss": 0.4202, "step": 68175 }, { - "epoch": 2.4, - "learning_rate": 2.786095634276496e-05, - "loss": 0.25, + "epoch": 2.457202580459149, + "grad_norm": 0.216128870844841, + "learning_rate": 2.6919408483455615e-05, + "loss": 0.3951, "step": 68180 }, { - "epoch": 2.4, - "learning_rate": 2.7858126317949407e-05, - "loss": 0.2789, + "epoch": 2.4573827801203736, + "grad_norm": 0.18577782809734344, + "learning_rate": 2.6916498950071605e-05, + "loss": 0.4192, "step": 68185 }, { - "epoch": 2.4, - "learning_rate": 2.785529625602298e-05, - "loss": 0.2712, + "epoch": 2.457562979781598, + "grad_norm": 0.18905296921730042, + "learning_rate": 2.6913589390575677e-05, + "loss": 0.3804, "step": 68190 }, { - "epoch": 2.4, - "learning_rate": 2.78524661570224e-05, - "loss": 0.2741, + "epoch": 2.4577431794428226, + "grad_norm": 0.23588091135025024, + "learning_rate": 2.6910679805007454e-05, + "loss": 0.4285, "step": 68195 }, { - "epoch": 2.4, - "learning_rate": 2.7849636020984425e-05, - "loss": 0.2724, + "epoch": 2.4579233791040473, + "grad_norm": 0.23398438096046448, + "learning_rate": 2.6907770193406595e-05, + "loss": 0.3918, "step": 68200 }, { - "epoch": 2.4, - "learning_rate": 2.784680584794581e-05, - "loss": 0.2737, + "epoch": 2.458103578765272, + "grad_norm": 0.22657303512096405, + "learning_rate": 
2.6904860555812738e-05, + "loss": 0.3779, "step": 68205 }, { - "epoch": 2.4, - "learning_rate": 2.7843975637943303e-05, - "loss": 0.2678, + "epoch": 2.4582837784264964, + "grad_norm": 0.17407497763633728, + "learning_rate": 2.6901950892265514e-05, + "loss": 0.4045, "step": 68210 }, { - "epoch": 2.4, - "learning_rate": 2.784114539101364e-05, - "loss": 0.269, + "epoch": 2.458463978087721, + "grad_norm": 0.19006119668483734, + "learning_rate": 2.6899041202804587e-05, + "loss": 0.3903, "step": 68215 }, { - "epoch": 2.4, - "learning_rate": 2.783831510719358e-05, - "loss": 0.2992, + "epoch": 2.458644177748946, + "grad_norm": 0.22295033931732178, + "learning_rate": 2.6896131487469595e-05, + "loss": 0.3926, "step": 68220 }, { - "epoch": 2.4, - "learning_rate": 2.7835484786519868e-05, - "loss": 0.302, + "epoch": 2.4588243774101706, + "grad_norm": 0.21144932508468628, + "learning_rate": 2.6893221746300167e-05, + "loss": 0.4296, "step": 68225 }, { - "epoch": 2.4, - "learning_rate": 2.7832654429029248e-05, - "loss": 0.2639, + "epoch": 2.4590045770713953, + "grad_norm": 0.2753607928752899, + "learning_rate": 2.689031197933597e-05, + "loss": 0.4244, "step": 68230 }, { - "epoch": 2.4, - "learning_rate": 2.7829824034758483e-05, - "loss": 0.2903, + "epoch": 2.4591847767326196, + "grad_norm": 0.18635913729667664, + "learning_rate": 2.6887402186616627e-05, + "loss": 0.4052, "step": 68235 }, { - "epoch": 2.4, - "learning_rate": 2.7826993603744323e-05, - "loss": 0.2771, + "epoch": 2.4593649763938443, + "grad_norm": 0.1916455328464508, + "learning_rate": 2.6884492368181808e-05, + "loss": 0.4064, "step": 68240 }, { - "epoch": 2.4, - "learning_rate": 2.7824163136023506e-05, - "loss": 0.2762, + "epoch": 2.459545176055069, + "grad_norm": 0.21135953068733215, + "learning_rate": 2.6881582524071137e-05, + "loss": 0.4178, "step": 68245 }, { - "epoch": 2.4, - "learning_rate": 2.782133263163279e-05, - "loss": 0.2731, + "epoch": 2.459725375716294, + "grad_norm": 0.20750294625759125, + "learning_rate": 2.6878672654324272e-05, + "loss": 0.4014, "step": 68250 }, { - "epoch": 2.4, - "learning_rate": 2.7818502090608937e-05, - "loss": 0.294, + "epoch": 2.459905575377518, + "grad_norm": 0.18974362313747406, + "learning_rate": 2.687576275898085e-05, + "loss": 0.3691, "step": 68255 }, { - "epoch": 2.4, - "learning_rate": 2.7815671512988685e-05, - "loss": 0.2546, + "epoch": 2.460085775038743, + "grad_norm": 0.19954413175582886, + "learning_rate": 2.6872852838080524e-05, + "loss": 0.3698, "step": 68260 }, { - "epoch": 2.4, - "learning_rate": 2.7812840898808797e-05, - "loss": 0.2527, + "epoch": 2.4602659746999676, + "grad_norm": 0.20801982283592224, + "learning_rate": 2.6869942891662947e-05, + "loss": 0.3933, "step": 68265 }, { - "epoch": 2.4, - "learning_rate": 2.7810010248106024e-05, - "loss": 0.2724, + "epoch": 2.4604461743611923, + "grad_norm": 0.2402622401714325, + "learning_rate": 2.6867032919767754e-05, + "loss": 0.4223, "step": 68270 }, { - "epoch": 2.4, - "learning_rate": 2.7807179560917123e-05, - "loss": 0.2793, + "epoch": 2.460626374022417, + "grad_norm": 0.2088380753993988, + "learning_rate": 2.6864122922434603e-05, + "loss": 0.4071, "step": 68275 }, { - "epoch": 2.4, - "learning_rate": 2.7804348837278836e-05, - "loss": 0.263, + "epoch": 2.4608065736836413, + "grad_norm": 0.2807213366031647, + "learning_rate": 2.6861212899703142e-05, + "loss": 0.4163, "step": 68280 }, { - "epoch": 2.4, - "learning_rate": 2.7801518077227935e-05, - "loss": 0.2702, + "epoch": 2.460986773344866, + "grad_norm": 0.1840617060661316, + 
"learning_rate": 2.685830285161301e-05, + "loss": 0.39, "step": 68285 }, { - "epoch": 2.4, - "learning_rate": 2.7798687280801166e-05, - "loss": 0.2803, + "epoch": 2.4611669730060908, + "grad_norm": 0.21558882296085358, + "learning_rate": 2.6855392778203858e-05, + "loss": 0.3968, "step": 68290 }, { - "epoch": 2.4, - "learning_rate": 2.779585644803529e-05, - "loss": 0.287, + "epoch": 2.4613471726673155, + "grad_norm": 0.2157306671142578, + "learning_rate": 2.6852482679515345e-05, + "loss": 0.4016, "step": 68295 }, { - "epoch": 2.4, - "learning_rate": 2.779302557896706e-05, - "loss": 0.262, + "epoch": 2.46152737232854, + "grad_norm": 0.18490919470787048, + "learning_rate": 2.6849572555587116e-05, + "loss": 0.4155, "step": 68300 }, { - "epoch": 2.4, - "learning_rate": 2.7790194673633228e-05, - "loss": 0.2756, + "epoch": 2.4617075719897645, + "grad_norm": 0.2350340336561203, + "learning_rate": 2.684666240645881e-05, + "loss": 0.4039, "step": 68305 }, { - "epoch": 2.4, - "learning_rate": 2.778736373207056e-05, - "loss": 0.2656, + "epoch": 2.4618877716509893, + "grad_norm": 0.1901131123304367, + "learning_rate": 2.6843752232170095e-05, + "loss": 0.3888, "step": 68310 }, { - "epoch": 2.4, - "learning_rate": 2.7784532754315812e-05, - "loss": 0.2581, + "epoch": 2.462067971312214, + "grad_norm": 0.1914856880903244, + "learning_rate": 2.68408420327606e-05, + "loss": 0.3938, "step": 68315 }, { - "epoch": 2.4, - "learning_rate": 2.778170174040574e-05, - "loss": 0.2646, + "epoch": 2.4622481709734387, + "grad_norm": 0.23871468007564545, + "learning_rate": 2.6837931808270006e-05, + "loss": 0.3918, "step": 68320 }, { - "epoch": 2.4, - "learning_rate": 2.7778870690377108e-05, - "loss": 0.2627, + "epoch": 2.462428370634663, + "grad_norm": 0.1966165006160736, + "learning_rate": 2.6835021558737932e-05, + "loss": 0.4099, "step": 68325 }, { - "epoch": 2.4, - "learning_rate": 2.7776039604266667e-05, - "loss": 0.265, + "epoch": 2.4626085702958878, + "grad_norm": 0.224965900182724, + "learning_rate": 2.6832111284204054e-05, + "loss": 0.4068, "step": 68330 }, { - "epoch": 2.4, - "learning_rate": 2.7773208482111186e-05, - "loss": 0.2562, + "epoch": 2.4627887699571125, + "grad_norm": 0.20366400480270386, + "learning_rate": 2.682920098470801e-05, + "loss": 0.3795, "step": 68335 }, { - "epoch": 2.4, - "learning_rate": 2.777037732394741e-05, - "loss": 0.2778, + "epoch": 2.4629689696183372, + "grad_norm": 0.20222342014312744, + "learning_rate": 2.682629066028946e-05, + "loss": 0.3814, "step": 68340 }, { - "epoch": 2.4, - "learning_rate": 2.7767546129812123e-05, - "loss": 0.2595, + "epoch": 2.4631491692795615, + "grad_norm": 0.21054887771606445, + "learning_rate": 2.682338031098805e-05, + "loss": 0.4116, "step": 68345 }, { - "epoch": 2.4, - "learning_rate": 2.7764714899742063e-05, - "loss": 0.2597, + "epoch": 2.4633293689407862, + "grad_norm": 0.23802423477172852, + "learning_rate": 2.6820469936843434e-05, + "loss": 0.4045, "step": 68350 }, { - "epoch": 2.4, - "learning_rate": 2.776188363377401e-05, - "loss": 0.2698, + "epoch": 2.463509568602011, + "grad_norm": 0.2170737087726593, + "learning_rate": 2.6817559537895276e-05, + "loss": 0.3764, "step": 68355 }, { - "epoch": 2.41, - "learning_rate": 2.7759052331944713e-05, - "loss": 0.3009, + "epoch": 2.4636897682632357, + "grad_norm": 0.2287246584892273, + "learning_rate": 2.6814649114183216e-05, + "loss": 0.4225, "step": 68360 }, { - "epoch": 2.41, - "learning_rate": 2.7756220994290938e-05, - "loss": 0.2374, + "epoch": 2.4638699679244604, + "grad_norm": 0.17304402589797974, + 
"learning_rate": 2.681173866574691e-05, + "loss": 0.3751, "step": 68365 }, { - "epoch": 2.41, - "learning_rate": 2.7753389620849452e-05, - "loss": 0.2609, + "epoch": 2.464050167585685, + "grad_norm": 0.22758109867572784, + "learning_rate": 2.6808828192626022e-05, + "loss": 0.4037, "step": 68370 }, { - "epoch": 2.41, - "learning_rate": 2.775055821165702e-05, - "loss": 0.2568, + "epoch": 2.4642303672469095, + "grad_norm": 0.22057542204856873, + "learning_rate": 2.6805917694860195e-05, + "loss": 0.4412, "step": 68375 }, { - "epoch": 2.41, - "learning_rate": 2.7747726766750405e-05, - "loss": 0.2988, + "epoch": 2.464410566908134, + "grad_norm": 0.22053518891334534, + "learning_rate": 2.680300717248909e-05, + "loss": 0.3812, "step": 68380 }, { - "epoch": 2.41, - "learning_rate": 2.7744895286166357e-05, - "loss": 0.2684, + "epoch": 2.464590766569359, + "grad_norm": 0.21665138006210327, + "learning_rate": 2.6800096625552368e-05, + "loss": 0.3685, "step": 68385 }, { - "epoch": 2.41, - "learning_rate": 2.7742063769941662e-05, - "loss": 0.2592, + "epoch": 2.4647709662305837, + "grad_norm": 0.1874067336320877, + "learning_rate": 2.6797186054089674e-05, + "loss": 0.4096, "step": 68390 }, { - "epoch": 2.41, - "learning_rate": 2.7739232218113076e-05, - "loss": 0.2616, + "epoch": 2.464951165891808, + "grad_norm": 0.2060876190662384, + "learning_rate": 2.679427545814066e-05, + "loss": 0.3738, "step": 68395 }, { - "epoch": 2.41, - "learning_rate": 2.773640063071736e-05, - "loss": 0.2722, + "epoch": 2.4651313655530327, + "grad_norm": 0.2187156081199646, + "learning_rate": 2.6791364837744998e-05, + "loss": 0.4185, "step": 68400 }, { - "epoch": 2.41, - "learning_rate": 2.7733569007791284e-05, - "loss": 0.262, + "epoch": 2.4653115652142574, + "grad_norm": 0.214223712682724, + "learning_rate": 2.6788454192942342e-05, + "loss": 0.4253, "step": 68405 }, { - "epoch": 2.41, - "learning_rate": 2.773073734937162e-05, - "loss": 0.276, + "epoch": 2.465491764875482, + "grad_norm": 0.1790245622396469, + "learning_rate": 2.6785543523772334e-05, + "loss": 0.4309, "step": 68410 }, { - "epoch": 2.41, - "learning_rate": 2.7727905655495135e-05, - "loss": 0.2585, + "epoch": 2.465671964536707, + "grad_norm": 0.20714068412780762, + "learning_rate": 2.6782632830274645e-05, + "loss": 0.4007, "step": 68415 }, { - "epoch": 2.41, - "learning_rate": 2.7725073926198587e-05, - "loss": 0.2751, + "epoch": 2.465852164197931, + "grad_norm": 0.21486568450927734, + "learning_rate": 2.6779722112488924e-05, + "loss": 0.3859, "step": 68420 }, { - "epoch": 2.41, - "learning_rate": 2.7722242161518746e-05, - "loss": 0.2607, + "epoch": 2.466032363859156, + "grad_norm": 0.24496997892856598, + "learning_rate": 2.6776811370454848e-05, + "loss": 0.3808, "step": 68425 }, { - "epoch": 2.41, - "learning_rate": 2.7719410361492393e-05, - "loss": 0.2654, + "epoch": 2.4662125635203807, + "grad_norm": 0.24687924981117249, + "learning_rate": 2.677390060421204e-05, + "loss": 0.3642, "step": 68430 }, { - "epoch": 2.41, - "learning_rate": 2.7716578526156285e-05, - "loss": 0.2693, + "epoch": 2.4663927631816054, + "grad_norm": 0.17569184303283691, + "learning_rate": 2.6770989813800197e-05, + "loss": 0.4155, "step": 68435 }, { - "epoch": 2.41, - "learning_rate": 2.7713746655547202e-05, - "loss": 0.2475, + "epoch": 2.4665729628428297, + "grad_norm": 0.2444978952407837, + "learning_rate": 2.676807899925895e-05, + "loss": 0.4116, "step": 68440 }, { - "epoch": 2.41, - "learning_rate": 2.7710914749701893e-05, - "loss": 0.2863, + "epoch": 2.4667531625040544, + "grad_norm": 
0.16109678149223328, + "learning_rate": 2.676516816062798e-05, + "loss": 0.3992, "step": 68445 }, { - "epoch": 2.41, - "learning_rate": 2.7708082808657148e-05, - "loss": 0.2811, + "epoch": 2.466933362165279, + "grad_norm": 0.20728158950805664, + "learning_rate": 2.676225729794693e-05, + "loss": 0.3597, "step": 68450 }, { - "epoch": 2.41, - "learning_rate": 2.7705250832449737e-05, - "loss": 0.2761, + "epoch": 2.467113561826504, + "grad_norm": 0.2165517657995224, + "learning_rate": 2.675934641125546e-05, + "loss": 0.3792, "step": 68455 }, { - "epoch": 2.41, - "learning_rate": 2.7702418821116426e-05, - "loss": 0.27, + "epoch": 2.4672937614877286, + "grad_norm": 0.274471253156662, + "learning_rate": 2.6756435500593242e-05, + "loss": 0.4132, "step": 68460 }, { - "epoch": 2.41, - "learning_rate": 2.7699586774693986e-05, - "loss": 0.2699, + "epoch": 2.467473961148953, + "grad_norm": 0.23226816952228546, + "learning_rate": 2.675352456599993e-05, + "loss": 0.4197, "step": 68465 }, { - "epoch": 2.41, - "learning_rate": 2.7696754693219197e-05, - "loss": 0.2681, + "epoch": 2.4676541608101776, + "grad_norm": 0.24921727180480957, + "learning_rate": 2.6750613607515184e-05, + "loss": 0.4038, "step": 68470 }, { - "epoch": 2.41, - "learning_rate": 2.7693922576728816e-05, - "loss": 0.2782, + "epoch": 2.4678343604714024, + "grad_norm": 0.17365936934947968, + "learning_rate": 2.6747702625178667e-05, + "loss": 0.4263, "step": 68475 }, { - "epoch": 2.41, - "learning_rate": 2.769109042525962e-05, - "loss": 0.2844, + "epoch": 2.468014560132627, + "grad_norm": 0.24769355356693268, + "learning_rate": 2.6744791619030042e-05, + "loss": 0.3998, "step": 68480 }, { - "epoch": 2.41, - "learning_rate": 2.7688258238848403e-05, - "loss": 0.2863, + "epoch": 2.4681947597938514, + "grad_norm": 0.24163182079792023, + "learning_rate": 2.674188058910897e-05, + "loss": 0.3979, "step": 68485 }, { - "epoch": 2.41, - "learning_rate": 2.7685426017531917e-05, - "loss": 0.2722, + "epoch": 2.468374959455076, + "grad_norm": 0.21235689520835876, + "learning_rate": 2.6738969535455104e-05, + "loss": 0.3893, "step": 68490 }, { - "epoch": 2.41, - "learning_rate": 2.7682593761346948e-05, - "loss": 0.2669, + "epoch": 2.468555159116301, + "grad_norm": 0.23081441223621368, + "learning_rate": 2.673605845810812e-05, + "loss": 0.3736, "step": 68495 }, { - "epoch": 2.41, - "learning_rate": 2.7679761470330262e-05, - "loss": 0.2725, + "epoch": 2.4687353587775256, + "grad_norm": 0.2072082906961441, + "learning_rate": 2.6733147357107673e-05, + "loss": 0.4183, "step": 68500 }, { - "epoch": 2.41, - "eval_loss": 0.26537927985191345, - "eval_runtime": 10.5494, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 2.4687353587775256, + "eval_loss": 0.43393146991729736, + "eval_runtime": 3.5293, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 7.084, "step": 68500 }, { - "epoch": 2.41, - "learning_rate": 2.7676929144518642e-05, - "loss": 0.2678, + "epoch": 2.4689155584387503, + "grad_norm": 0.245635524392128, + "learning_rate": 2.673023623249344e-05, + "loss": 0.3877, "step": 68505 }, { - "epoch": 2.41, - "learning_rate": 2.767409678394886e-05, - "loss": 0.2775, + "epoch": 2.4690957580999746, + "grad_norm": 0.21268950402736664, + "learning_rate": 2.672732508430506e-05, + "loss": 0.3474, "step": 68510 }, { - "epoch": 2.41, - "learning_rate": 2.76712643886577e-05, - "loss": 0.2876, + "epoch": 2.4692759577611993, + "grad_norm": 0.22742435336112976, + "learning_rate": 2.6724413912582208e-05, + "loss": 0.3821, "step": 68515 
}, { - "epoch": 2.41, - "learning_rate": 2.7668431958681924e-05, - "loss": 0.2665, + "epoch": 2.469456157422424, + "grad_norm": 0.26614660024642944, + "learning_rate": 2.672150271736456e-05, + "loss": 0.3755, "step": 68520 }, { - "epoch": 2.41, - "learning_rate": 2.7665599494058325e-05, - "loss": 0.2805, + "epoch": 2.469636357083649, + "grad_norm": 0.20712517201900482, + "learning_rate": 2.671859149869177e-05, + "loss": 0.4151, "step": 68525 }, { - "epoch": 2.41, - "learning_rate": 2.7662766994823665e-05, - "loss": 0.2676, + "epoch": 2.469816556744873, + "grad_norm": 0.16104906797409058, + "learning_rate": 2.6715680256603503e-05, + "loss": 0.3874, "step": 68530 }, { - "epoch": 2.41, - "learning_rate": 2.765993446101473e-05, - "loss": 0.2759, + "epoch": 2.469996756406098, + "grad_norm": 0.20770768821239471, + "learning_rate": 2.671276899113942e-05, + "loss": 0.4159, "step": 68535 }, { - "epoch": 2.41, - "learning_rate": 2.7657101892668303e-05, - "loss": 0.2572, + "epoch": 2.4701769560673226, + "grad_norm": 0.16630055010318756, + "learning_rate": 2.670985770233919e-05, + "loss": 0.4349, "step": 68540 }, { - "epoch": 2.41, - "learning_rate": 2.7654269289821165e-05, - "loss": 0.2697, + "epoch": 2.4703571557285473, + "grad_norm": 0.18788853287696838, + "learning_rate": 2.6706946390242488e-05, + "loss": 0.3773, "step": 68545 }, { - "epoch": 2.41, - "learning_rate": 2.7651436652510082e-05, - "loss": 0.2701, + "epoch": 2.470537355389772, + "grad_norm": 0.24081262946128845, + "learning_rate": 2.6704035054888964e-05, + "loss": 0.3926, "step": 68550 }, { - "epoch": 2.41, - "learning_rate": 2.7648603980771846e-05, - "loss": 0.2857, + "epoch": 2.4707175550509963, + "grad_norm": 0.21271935105323792, + "learning_rate": 2.67011236963183e-05, + "loss": 0.3867, "step": 68555 }, { - "epoch": 2.41, - "learning_rate": 2.7645771274643223e-05, - "loss": 0.2857, + "epoch": 2.470897754712221, + "grad_norm": 0.19515545666217804, + "learning_rate": 2.6698212314570148e-05, + "loss": 0.3917, "step": 68560 }, { - "epoch": 2.41, - "learning_rate": 2.7642938534161016e-05, - "loss": 0.2729, + "epoch": 2.471077954373446, + "grad_norm": 0.18669338524341583, + "learning_rate": 2.669530090968419e-05, + "loss": 0.3656, "step": 68565 }, { - "epoch": 2.41, - "learning_rate": 2.7640105759361985e-05, - "loss": 0.279, + "epoch": 2.4712581540346705, + "grad_norm": 0.19376404583454132, + "learning_rate": 2.669238948170008e-05, + "loss": 0.3925, "step": 68570 }, { - "epoch": 2.41, - "learning_rate": 2.7637272950282927e-05, - "loss": 0.2692, + "epoch": 2.471438353695895, + "grad_norm": 0.22779560089111328, + "learning_rate": 2.6689478030657496e-05, + "loss": 0.4522, "step": 68575 }, { - "epoch": 2.41, - "learning_rate": 2.7634440106960614e-05, - "loss": 0.2535, + "epoch": 2.4716185533571196, + "grad_norm": 0.19855950772762299, + "learning_rate": 2.6686566556596087e-05, + "loss": 0.3975, "step": 68580 }, { - "epoch": 2.41, - "learning_rate": 2.7631607229431834e-05, - "loss": 0.2822, + "epoch": 2.4717987530183443, + "grad_norm": 0.2209222912788391, + "learning_rate": 2.6683655059555544e-05, + "loss": 0.3871, "step": 68585 }, { - "epoch": 2.41, - "learning_rate": 2.7628774317733362e-05, - "loss": 0.2818, + "epoch": 2.471978952679569, + "grad_norm": 0.1904340535402298, + "learning_rate": 2.668074353957553e-05, + "loss": 0.3682, "step": 68590 }, { - "epoch": 2.41, - "learning_rate": 2.7625941371901997e-05, - "loss": 0.2645, + "epoch": 2.4721591523407938, + "grad_norm": 0.2192251831293106, + "learning_rate": 2.6677831996695706e-05, + "loss": 
0.3725, "step": 68595 }, { - "epoch": 2.41, - "learning_rate": 2.762310839197451e-05, - "loss": 0.2604, + "epoch": 2.472339352002018, + "grad_norm": 0.20693185925483704, + "learning_rate": 2.6674920430955753e-05, + "loss": 0.3858, "step": 68600 }, { - "epoch": 2.41, - "learning_rate": 2.7620275377987686e-05, - "loss": 0.2736, + "epoch": 2.4725195516632428, + "grad_norm": 0.25332531332969666, + "learning_rate": 2.6672008842395323e-05, + "loss": 0.3742, "step": 68605 }, { - "epoch": 2.41, - "learning_rate": 2.7617442329978323e-05, - "loss": 0.2814, + "epoch": 2.4726997513244675, + "grad_norm": 0.2141808420419693, + "learning_rate": 2.666909723105411e-05, + "loss": 0.4053, "step": 68610 }, { - "epoch": 2.41, - "learning_rate": 2.7614609247983193e-05, - "loss": 0.2776, + "epoch": 2.4728799509856922, + "grad_norm": 0.20601968467235565, + "learning_rate": 2.6666185596971753e-05, + "loss": 0.3986, "step": 68615 }, { - "epoch": 2.41, - "learning_rate": 2.7611776132039074e-05, - "loss": 0.2643, + "epoch": 2.473060150646917, + "grad_norm": 0.21490851044654846, + "learning_rate": 2.6663273940187956e-05, + "loss": 0.3877, "step": 68620 }, { - "epoch": 2.41, - "learning_rate": 2.7608942982182774e-05, - "loss": 0.2721, + "epoch": 2.4732403503081413, + "grad_norm": 0.1867503970861435, + "learning_rate": 2.666036226074236e-05, + "loss": 0.3738, "step": 68625 }, { - "epoch": 2.41, - "learning_rate": 2.760610979845107e-05, - "loss": 0.2682, + "epoch": 2.473420549969366, + "grad_norm": 0.21378225088119507, + "learning_rate": 2.6657450558674658e-05, + "loss": 0.3711, "step": 68630 }, { - "epoch": 2.41, - "learning_rate": 2.760327658088074e-05, - "loss": 0.2631, + "epoch": 2.4736007496305907, + "grad_norm": 0.20687870681285858, + "learning_rate": 2.665453883402451e-05, + "loss": 0.4117, "step": 68635 }, { - "epoch": 2.41, - "learning_rate": 2.7600443329508578e-05, - "loss": 0.2766, + "epoch": 2.4737809492918155, + "grad_norm": 0.17772191762924194, + "learning_rate": 2.665162708683159e-05, + "loss": 0.3953, "step": 68640 }, { - "epoch": 2.42, - "learning_rate": 2.759761004437138e-05, - "loss": 0.2757, + "epoch": 2.47396114895304, + "grad_norm": 0.19218339025974274, + "learning_rate": 2.6648715317135575e-05, + "loss": 0.3936, "step": 68645 }, { - "epoch": 2.42, - "learning_rate": 2.759477672550593e-05, - "loss": 0.2499, + "epoch": 2.4741413486142645, + "grad_norm": 0.2298266589641571, + "learning_rate": 2.6645803524976133e-05, + "loss": 0.3804, "step": 68650 }, { - "epoch": 2.42, - "learning_rate": 2.759194337294901e-05, - "loss": 0.2766, + "epoch": 2.4743215482754892, + "grad_norm": 0.1857258528470993, + "learning_rate": 2.664289171039293e-05, + "loss": 0.3942, "step": 68655 }, { - "epoch": 2.42, - "learning_rate": 2.7589109986737416e-05, - "loss": 0.2666, + "epoch": 2.474501747936714, + "grad_norm": 0.1851481795310974, + "learning_rate": 2.6639979873425652e-05, + "loss": 0.4023, "step": 68660 }, { - "epoch": 2.42, - "learning_rate": 2.7586276566907925e-05, - "loss": 0.2593, + "epoch": 2.4746819475979387, + "grad_norm": 0.19881199300289154, + "learning_rate": 2.663706801411396e-05, + "loss": 0.4297, "step": 68665 }, { - "epoch": 2.42, - "learning_rate": 2.7583443113497348e-05, - "loss": 0.264, + "epoch": 2.474862147259163, + "grad_norm": 0.22843848168849945, + "learning_rate": 2.6634156132497538e-05, + "loss": 0.3793, "step": 68670 }, { - "epoch": 2.42, - "learning_rate": 2.7580609626542457e-05, - "loss": 0.284, + "epoch": 2.4750423469203877, + "grad_norm": 0.20685456693172455, + "learning_rate": 
2.6631244228616053e-05, + "loss": 0.3826, "step": 68675 }, { - "epoch": 2.42, - "learning_rate": 2.757777610608006e-05, - "loss": 0.2797, + "epoch": 2.4752225465816124, + "grad_norm": 0.24268147349357605, + "learning_rate": 2.6628332302509186e-05, + "loss": 0.3882, "step": 68680 }, { - "epoch": 2.42, - "learning_rate": 2.757494255214693e-05, - "loss": 0.2877, + "epoch": 2.475402746242837, + "grad_norm": 0.21436165273189545, + "learning_rate": 2.6625420354216597e-05, + "loss": 0.4387, "step": 68685 }, { - "epoch": 2.42, - "learning_rate": 2.7572108964779873e-05, - "loss": 0.2632, + "epoch": 2.475582945904062, + "grad_norm": 0.20121796429157257, + "learning_rate": 2.6622508383777977e-05, + "loss": 0.4059, "step": 68690 }, { - "epoch": 2.42, - "learning_rate": 2.7569275344015678e-05, - "loss": 0.2612, + "epoch": 2.475763145565286, + "grad_norm": 0.1770712435245514, + "learning_rate": 2.6619596391233e-05, + "loss": 0.3906, "step": 68695 }, { - "epoch": 2.42, - "learning_rate": 2.756644168989113e-05, - "loss": 0.2647, + "epoch": 2.475943345226511, + "grad_norm": 0.2208649069070816, + "learning_rate": 2.661668437662132e-05, + "loss": 0.4162, "step": 68700 }, { - "epoch": 2.42, - "learning_rate": 2.756360800244303e-05, - "loss": 0.2814, + "epoch": 2.4761235448877357, + "grad_norm": 0.23539263010025024, + "learning_rate": 2.661377233998264e-05, + "loss": 0.4129, "step": 68705 }, { - "epoch": 2.42, - "learning_rate": 2.7560774281708174e-05, - "loss": 0.2855, + "epoch": 2.4763037445489604, + "grad_norm": 0.18581153452396393, + "learning_rate": 2.6610860281356627e-05, + "loss": 0.4237, "step": 68710 }, { - "epoch": 2.42, - "learning_rate": 2.7557940527723348e-05, - "loss": 0.2755, + "epoch": 2.4764839442101847, + "grad_norm": 0.20653104782104492, + "learning_rate": 2.6607948200782944e-05, + "loss": 0.3858, "step": 68715 }, { - "epoch": 2.42, - "learning_rate": 2.755510674052535e-05, - "loss": 0.2472, + "epoch": 2.4766641438714094, + "grad_norm": 0.18141742050647736, + "learning_rate": 2.6605036098301283e-05, + "loss": 0.3642, "step": 68720 }, { - "epoch": 2.42, - "learning_rate": 2.7552272920150966e-05, - "loss": 0.2665, + "epoch": 2.476844343532634, + "grad_norm": 0.22775548696517944, + "learning_rate": 2.6602123973951314e-05, + "loss": 0.4237, "step": 68725 }, { - "epoch": 2.42, - "learning_rate": 2.754943906663701e-05, - "loss": 0.2592, + "epoch": 2.477024543193859, + "grad_norm": 0.1954440176486969, + "learning_rate": 2.6599211827772724e-05, + "loss": 0.403, "step": 68730 }, { - "epoch": 2.42, - "learning_rate": 2.7546605180020273e-05, - "loss": 0.2856, + "epoch": 2.4772047428550836, + "grad_norm": 0.18037401139736176, + "learning_rate": 2.6596299659805173e-05, + "loss": 0.3839, "step": 68735 }, { - "epoch": 2.42, - "learning_rate": 2.7543771260337538e-05, - "loss": 0.263, + "epoch": 2.477384942516308, + "grad_norm": 0.2156188189983368, + "learning_rate": 2.6593387470088354e-05, + "loss": 0.4249, "step": 68740 }, { - "epoch": 2.42, - "learning_rate": 2.7540937307625613e-05, - "loss": 0.2834, + "epoch": 2.4775651421775327, + "grad_norm": 0.2691105008125305, + "learning_rate": 2.6590475258661935e-05, + "loss": 0.4494, "step": 68745 }, { - "epoch": 2.42, - "learning_rate": 2.7538103321921287e-05, - "loss": 0.2778, + "epoch": 2.4777453418387574, + "grad_norm": 0.1580956131219864, + "learning_rate": 2.6587563025565604e-05, + "loss": 0.3839, "step": 68750 }, { - "epoch": 2.42, - "learning_rate": 2.753526930326137e-05, - "loss": 0.2541, + "epoch": 2.477925541499982, + "grad_norm": 0.17688532173633575, + 
"learning_rate": 2.6584650770839026e-05, + "loss": 0.4133, "step": 68755 }, { - "epoch": 2.42, - "learning_rate": 2.7532435251682643e-05, - "loss": 0.2764, + "epoch": 2.4781057411612064, + "grad_norm": 0.19186891615390778, + "learning_rate": 2.6581738494521898e-05, + "loss": 0.3996, "step": 68760 }, { - "epoch": 2.42, - "learning_rate": 2.752960116722192e-05, - "loss": 0.2536, + "epoch": 2.478285940822431, + "grad_norm": 0.23146666586399078, + "learning_rate": 2.657882619665388e-05, + "loss": 0.3699, "step": 68765 }, { - "epoch": 2.42, - "learning_rate": 2.7526767049915992e-05, - "loss": 0.2796, + "epoch": 2.478466140483656, + "grad_norm": 0.2393793761730194, + "learning_rate": 2.6575913877274666e-05, + "loss": 0.3812, "step": 68770 }, { - "epoch": 2.42, - "learning_rate": 2.7523932899801657e-05, - "loss": 0.2645, + "epoch": 2.4786463401448806, + "grad_norm": 0.18306025862693787, + "learning_rate": 2.657300153642393e-05, + "loss": 0.4051, "step": 68775 }, { - "epoch": 2.42, - "learning_rate": 2.752109871691571e-05, - "loss": 0.2534, + "epoch": 2.4788265398061053, + "grad_norm": 0.22833152115345, + "learning_rate": 2.657008917414135e-05, + "loss": 0.3908, "step": 68780 }, { - "epoch": 2.42, - "learning_rate": 2.7518264501294964e-05, - "loss": 0.2548, + "epoch": 2.4790067394673296, + "grad_norm": 0.19252647459506989, + "learning_rate": 2.6567176790466613e-05, + "loss": 0.3735, "step": 68785 }, { - "epoch": 2.42, - "learning_rate": 2.7515430252976215e-05, - "loss": 0.2627, + "epoch": 2.4791869391285544, + "grad_norm": 0.202143132686615, + "learning_rate": 2.6564264385439385e-05, + "loss": 0.411, "step": 68790 }, { - "epoch": 2.42, - "learning_rate": 2.7512595971996265e-05, - "loss": 0.2931, + "epoch": 2.479367138789779, + "grad_norm": 0.1779770851135254, + "learning_rate": 2.656135195909938e-05, + "loss": 0.3841, "step": 68795 }, { - "epoch": 2.42, - "learning_rate": 2.750976165839191e-05, - "loss": 0.2687, + "epoch": 2.479547338451004, + "grad_norm": 0.17505523562431335, + "learning_rate": 2.6558439511486232e-05, + "loss": 0.3515, "step": 68800 }, { - "epoch": 2.42, - "learning_rate": 2.750692731219995e-05, - "loss": 0.2772, + "epoch": 2.479727538112228, + "grad_norm": 0.2236313372850418, + "learning_rate": 2.655552704263966e-05, + "loss": 0.4296, "step": 68805 }, { - "epoch": 2.42, - "learning_rate": 2.7504092933457187e-05, - "loss": 0.2637, + "epoch": 2.479907737773453, + "grad_norm": 0.21467982232570648, + "learning_rate": 2.655261455259933e-05, + "loss": 0.3938, "step": 68810 }, { - "epoch": 2.42, - "learning_rate": 2.7501258522200436e-05, - "loss": 0.2547, + "epoch": 2.4800879374346776, + "grad_norm": 0.1988895684480667, + "learning_rate": 2.6549702041404932e-05, + "loss": 0.4324, "step": 68815 }, { - "epoch": 2.42, - "learning_rate": 2.749842407846649e-05, - "loss": 0.2584, + "epoch": 2.4802681370959023, + "grad_norm": 0.17672614753246307, + "learning_rate": 2.6546789509096144e-05, + "loss": 0.4032, "step": 68820 }, { - "epoch": 2.42, - "learning_rate": 2.7495589602292154e-05, - "loss": 0.2928, + "epoch": 2.480448336757127, + "grad_norm": 0.22681747376918793, + "learning_rate": 2.6543876955712637e-05, + "loss": 0.4158, "step": 68825 }, { - "epoch": 2.42, - "learning_rate": 2.749275509371423e-05, - "loss": 0.2746, + "epoch": 2.4806285364183513, + "grad_norm": 0.16268977522850037, + "learning_rate": 2.654096438129412e-05, + "loss": 0.3774, "step": 68830 }, { - "epoch": 2.42, - "learning_rate": 2.748992055276952e-05, - "loss": 0.2673, + "epoch": 2.480808736079576, + "grad_norm": 
0.18678313493728638, + "learning_rate": 2.6538051785880254e-05, + "loss": 0.3649, "step": 68835 }, { - "epoch": 2.42, - "learning_rate": 2.7487085979494838e-05, - "loss": 0.2751, + "epoch": 2.480988935740801, + "grad_norm": 0.2044302076101303, + "learning_rate": 2.6535139169510727e-05, + "loss": 0.3747, "step": 68840 }, { - "epoch": 2.42, - "learning_rate": 2.7484251373926977e-05, - "loss": 0.2683, + "epoch": 2.4811691354020255, + "grad_norm": 0.2173137664794922, + "learning_rate": 2.6532226532225235e-05, + "loss": 0.3782, "step": 68845 }, { - "epoch": 2.42, - "learning_rate": 2.748141673610276e-05, - "loss": 0.2614, + "epoch": 2.48134933506325, + "grad_norm": 0.22002023458480835, + "learning_rate": 2.6529313874063445e-05, + "loss": 0.3879, "step": 68850 }, { - "epoch": 2.42, - "learning_rate": 2.7478582066058974e-05, - "loss": 0.2732, + "epoch": 2.4815295347244746, + "grad_norm": 0.19581471383571625, + "learning_rate": 2.652640119506506e-05, + "loss": 0.3761, "step": 68855 }, { - "epoch": 2.42, - "learning_rate": 2.7475747363832432e-05, - "loss": 0.2437, + "epoch": 2.4817097343856993, + "grad_norm": 0.18339914083480835, + "learning_rate": 2.6523488495269744e-05, + "loss": 0.4203, "step": 68860 }, { - "epoch": 2.42, - "learning_rate": 2.7472912629459946e-05, - "loss": 0.265, + "epoch": 2.481889934046924, + "grad_norm": 0.19573746621608734, + "learning_rate": 2.6520575774717194e-05, + "loss": 0.4061, "step": 68865 }, { - "epoch": 2.42, - "learning_rate": 2.7470077862978322e-05, - "loss": 0.2429, + "epoch": 2.4820701337081488, + "grad_norm": 0.24082912504673004, + "learning_rate": 2.6517663033447092e-05, + "loss": 0.4076, "step": 68870 }, { - "epoch": 2.42, - "learning_rate": 2.7467243064424358e-05, - "loss": 0.2671, + "epoch": 2.4822503333693735, + "grad_norm": 0.1869727075099945, + "learning_rate": 2.6514750271499127e-05, + "loss": 0.4011, "step": 68875 }, { - "epoch": 2.42, - "learning_rate": 2.7464408233834876e-05, - "loss": 0.2554, + "epoch": 2.482430533030598, + "grad_norm": 0.19371429085731506, + "learning_rate": 2.6511837488912988e-05, + "loss": 0.3798, "step": 68880 }, { - "epoch": 2.42, - "learning_rate": 2.7461573371246674e-05, - "loss": 0.271, + "epoch": 2.4826107326918225, + "grad_norm": 0.20784878730773926, + "learning_rate": 2.650892468572835e-05, + "loss": 0.4097, "step": 68885 }, { - "epoch": 2.42, - "learning_rate": 2.745873847669656e-05, - "loss": 0.2693, + "epoch": 2.4827909323530473, + "grad_norm": 0.17571403086185455, + "learning_rate": 2.65060118619849e-05, + "loss": 0.3819, "step": 68890 }, { - "epoch": 2.42, - "learning_rate": 2.745590355022135e-05, - "loss": 0.2692, + "epoch": 2.482971132014272, + "grad_norm": 0.17903995513916016, + "learning_rate": 2.6503099017722343e-05, + "loss": 0.3743, "step": 68895 }, { - "epoch": 2.42, - "learning_rate": 2.7453068591857857e-05, - "loss": 0.2953, + "epoch": 2.4831513316754963, + "grad_norm": 0.23476238548755646, + "learning_rate": 2.650018615298035e-05, + "loss": 0.3742, "step": 68900 }, { - "epoch": 2.42, - "learning_rate": 2.7450233601642878e-05, - "loss": 0.2494, + "epoch": 2.483331531336721, + "grad_norm": 0.21859110891819, + "learning_rate": 2.6497273267798605e-05, + "loss": 0.3606, "step": 68905 }, { - "epoch": 2.42, - "learning_rate": 2.7447398579613233e-05, - "loss": 0.2435, + "epoch": 2.4835117309979458, + "grad_norm": 0.2518290579319, + "learning_rate": 2.6494360362216803e-05, + "loss": 0.4162, "step": 68910 }, { - "epoch": 2.42, - "learning_rate": 2.7444563525805728e-05, - "loss": 0.284, + "epoch": 
2.4836919306591705, + "grad_norm": 0.1847134232521057, + "learning_rate": 2.6491447436274637e-05, + "loss": 0.3972, "step": 68915 }, { - "epoch": 2.42, - "learning_rate": 2.7441728440257174e-05, - "loss": 0.2486, + "epoch": 2.483872130320395, + "grad_norm": 0.18543338775634766, + "learning_rate": 2.648853449001178e-05, + "loss": 0.4026, "step": 68920 }, { - "epoch": 2.42, - "learning_rate": 2.743889332300439e-05, - "loss": 0.255, + "epoch": 2.4840523299816195, + "grad_norm": 0.20667828619480133, + "learning_rate": 2.648562152346793e-05, + "loss": 0.4239, "step": 68925 }, { - "epoch": 2.43, - "learning_rate": 2.7436058174084188e-05, - "loss": 0.2625, + "epoch": 2.4842325296428442, + "grad_norm": 0.21871261298656464, + "learning_rate": 2.6482708536682777e-05, + "loss": 0.4119, "step": 68930 }, { - "epoch": 2.43, - "learning_rate": 2.743322299353337e-05, - "loss": 0.276, + "epoch": 2.484412729304069, + "grad_norm": 0.18584778904914856, + "learning_rate": 2.647979552969601e-05, + "loss": 0.3987, "step": 68935 }, { - "epoch": 2.43, - "learning_rate": 2.7430387781388756e-05, - "loss": 0.2862, + "epoch": 2.4845929289652937, + "grad_norm": 0.1998424530029297, + "learning_rate": 2.6476882502547305e-05, + "loss": 0.3779, "step": 68940 }, { - "epoch": 2.43, - "learning_rate": 2.742755253768715e-05, - "loss": 0.2511, + "epoch": 2.484773128626518, + "grad_norm": 0.18726879358291626, + "learning_rate": 2.647396945527637e-05, + "loss": 0.4084, "step": 68945 }, { - "epoch": 2.43, - "learning_rate": 2.742471726246538e-05, - "loss": 0.2575, + "epoch": 2.4849533282877427, + "grad_norm": 0.19069020450115204, + "learning_rate": 2.6471056387922886e-05, + "loss": 0.3962, "step": 68950 }, { - "epoch": 2.43, - "learning_rate": 2.7421881955760265e-05, - "loss": 0.2964, + "epoch": 2.4851335279489675, + "grad_norm": 0.231498122215271, + "learning_rate": 2.6468143300526543e-05, + "loss": 0.3819, "step": 68955 }, { - "epoch": 2.43, - "learning_rate": 2.7419046617608597e-05, - "loss": 0.2514, + "epoch": 2.485313727610192, + "grad_norm": 0.19024336338043213, + "learning_rate": 2.6465230193127033e-05, + "loss": 0.3738, "step": 68960 }, { - "epoch": 2.43, - "learning_rate": 2.7416211248047212e-05, - "loss": 0.2529, + "epoch": 2.485493927271417, + "grad_norm": 0.23656608164310455, + "learning_rate": 2.6462317065764043e-05, + "loss": 0.4132, "step": 68965 }, { - "epoch": 2.43, - "learning_rate": 2.741337584711291e-05, - "loss": 0.2616, + "epoch": 2.4856741269326412, + "grad_norm": 0.2034633606672287, + "learning_rate": 2.645940391847727e-05, + "loss": 0.4149, "step": 68970 }, { - "epoch": 2.43, - "learning_rate": 2.741054041484251e-05, - "loss": 0.2552, + "epoch": 2.485854326593866, + "grad_norm": 0.20113669335842133, + "learning_rate": 2.6456490751306395e-05, + "loss": 0.3942, "step": 68975 }, { - "epoch": 2.43, - "learning_rate": 2.7407704951272833e-05, - "loss": 0.2736, + "epoch": 2.4860345262550907, + "grad_norm": 0.18040354549884796, + "learning_rate": 2.645357756429112e-05, + "loss": 0.403, "step": 68980 }, { - "epoch": 2.43, - "learning_rate": 2.7404869456440703e-05, - "loss": 0.2452, + "epoch": 2.4862147259163154, + "grad_norm": 0.2093953937292099, + "learning_rate": 2.645066435747113e-05, + "loss": 0.3853, "step": 68985 }, { - "epoch": 2.43, - "learning_rate": 2.740203393038292e-05, - "loss": 0.2571, + "epoch": 2.4863949255775397, + "grad_norm": 0.18795111775398254, + "learning_rate": 2.6447751130886117e-05, + "loss": 0.4241, "step": 68990 }, { - "epoch": 2.43, - "learning_rate": 2.739919837313631e-05, - "loss": 
0.2656, + "epoch": 2.4865751252387644, + "grad_norm": 0.24407252669334412, + "learning_rate": 2.644483788457578e-05, + "loss": 0.3794, "step": 68995 }, { - "epoch": 2.43, - "learning_rate": 2.7396362784737682e-05, - "loss": 0.2927, + "epoch": 2.486755324899989, + "grad_norm": 0.18721212446689606, + "learning_rate": 2.6441924618579807e-05, + "loss": 0.3853, "step": 69000 }, { - "epoch": 2.43, - "eval_loss": 0.2651735842227936, - "eval_runtime": 10.5525, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 2.486755324899989, + "eval_loss": 0.43400391936302185, + "eval_runtime": 3.5358, + "eval_samples_per_second": 28.282, + "eval_steps_per_second": 7.071, "step": 69000 }, { - "epoch": 2.43, - "learning_rate": 2.7393527165223875e-05, - "loss": 0.2494, + "epoch": 2.486935524561214, + "grad_norm": 0.21904613077640533, + "learning_rate": 2.643901133293789e-05, + "loss": 0.4267, "step": 69005 }, { - "epoch": 2.43, - "learning_rate": 2.7390691514631693e-05, - "loss": 0.2765, + "epoch": 2.4871157242224387, + "grad_norm": 0.18053406476974487, + "learning_rate": 2.6436098027689714e-05, + "loss": 0.3885, "step": 69010 }, { - "epoch": 2.43, - "learning_rate": 2.738785583299796e-05, - "loss": 0.2723, + "epoch": 2.487295923883663, + "grad_norm": 0.18730615079402924, + "learning_rate": 2.6433184702874993e-05, + "loss": 0.371, "step": 69015 }, { - "epoch": 2.43, - "learning_rate": 2.7385020120359483e-05, - "loss": 0.2716, + "epoch": 2.4874761235448877, + "grad_norm": 0.21903234720230103, + "learning_rate": 2.6430271358533397e-05, + "loss": 0.4299, "step": 69020 }, { - "epoch": 2.43, - "learning_rate": 2.7382184376753096e-05, - "loss": 0.2638, + "epoch": 2.4876563232061124, + "grad_norm": 0.21092742681503296, + "learning_rate": 2.6427357994704633e-05, + "loss": 0.4174, "step": 69025 }, { - "epoch": 2.43, - "learning_rate": 2.7379348602215616e-05, - "loss": 0.2854, + "epoch": 2.487836522867337, + "grad_norm": 0.20386511087417603, + "learning_rate": 2.6424444611428396e-05, + "loss": 0.4047, "step": 69030 }, { - "epoch": 2.43, - "learning_rate": 2.7376512796783867e-05, - "loss": 0.263, + "epoch": 2.4880167225285614, + "grad_norm": 0.2634836733341217, + "learning_rate": 2.642153120874437e-05, + "loss": 0.424, "step": 69035 }, { - "epoch": 2.43, - "learning_rate": 2.737367696049466e-05, - "loss": 0.2827, + "epoch": 2.488196922189786, + "grad_norm": 0.22535258531570435, + "learning_rate": 2.6418617786692273e-05, + "loss": 0.4279, "step": 69040 }, { - "epoch": 2.43, - "learning_rate": 2.7370841093384824e-05, - "loss": 0.295, + "epoch": 2.488377121851011, + "grad_norm": 0.19794537127017975, + "learning_rate": 2.6415704345311764e-05, + "loss": 0.3665, "step": 69045 }, { - "epoch": 2.43, - "learning_rate": 2.7368005195491176e-05, - "loss": 0.2831, + "epoch": 2.4885573215122356, + "grad_norm": 0.20642319321632385, + "learning_rate": 2.641279088464257e-05, + "loss": 0.3978, "step": 69050 }, { - "epoch": 2.43, - "learning_rate": 2.736516926685054e-05, - "loss": 0.2635, + "epoch": 2.4887375211734604, + "grad_norm": 0.22682100534439087, + "learning_rate": 2.6409877404724363e-05, + "loss": 0.4194, "step": 69055 }, { - "epoch": 2.43, - "learning_rate": 2.736290050182495e-05, - "loss": 0.2612, + "epoch": 2.4889177208346847, + "grad_norm": 0.24420538544654846, + "learning_rate": 2.640696390559686e-05, + "loss": 0.3943, "step": 69060 }, { - "epoch": 2.43, - "learning_rate": 2.7360064517932533e-05, - "loss": 0.2668, + "epoch": 2.4890979204959094, + "grad_norm": 0.16333971917629242, + "learning_rate": 
2.6404050387299744e-05, + "loss": 0.3718, "step": 69065 }, { - "epoch": 2.43, - "learning_rate": 2.735722850339623e-05, - "loss": 0.2729, + "epoch": 2.489278120157134, + "grad_norm": 0.19951343536376953, + "learning_rate": 2.640113684987271e-05, + "loss": 0.3919, "step": 69070 }, { - "epoch": 2.43, - "learning_rate": 2.7354392458252876e-05, - "loss": 0.2574, + "epoch": 2.489458319818359, + "grad_norm": 0.18329237401485443, + "learning_rate": 2.6398223293355455e-05, + "loss": 0.408, "step": 69075 }, { - "epoch": 2.43, - "learning_rate": 2.7351556382539283e-05, - "loss": 0.2612, + "epoch": 2.489638519479583, + "grad_norm": 0.2317817211151123, + "learning_rate": 2.6395309717787686e-05, + "loss": 0.384, "step": 69080 }, { - "epoch": 2.43, - "learning_rate": 2.7348720276292284e-05, - "loss": 0.2877, + "epoch": 2.489818719140808, + "grad_norm": 0.20693840086460114, + "learning_rate": 2.6392396123209085e-05, + "loss": 0.3992, "step": 69085 }, { - "epoch": 2.43, - "learning_rate": 2.734588413954869e-05, - "loss": 0.2746, + "epoch": 2.4899989188020326, + "grad_norm": 0.18415316939353943, + "learning_rate": 2.6389482509659365e-05, + "loss": 0.3955, "step": 69090 }, { - "epoch": 2.43, - "learning_rate": 2.7343047972345347e-05, - "loss": 0.2714, + "epoch": 2.4901791184632573, + "grad_norm": 0.18562282621860504, + "learning_rate": 2.6386568877178204e-05, + "loss": 0.3756, "step": 69095 }, { - "epoch": 2.43, - "learning_rate": 2.7340211774719064e-05, - "loss": 0.2651, + "epoch": 2.490359318124482, + "grad_norm": 0.1960134655237198, + "learning_rate": 2.6383655225805326e-05, + "loss": 0.3731, "step": 69100 }, { - "epoch": 2.43, - "learning_rate": 2.7337375546706677e-05, - "loss": 0.2713, + "epoch": 2.4905395177857064, + "grad_norm": 0.22196051478385925, + "learning_rate": 2.6380741555580398e-05, + "loss": 0.3898, "step": 69105 }, { - "epoch": 2.43, - "learning_rate": 2.7334539288345012e-05, - "loss": 0.2508, + "epoch": 2.490719717446931, + "grad_norm": 0.19862577319145203, + "learning_rate": 2.6377827866543142e-05, + "loss": 0.371, "step": 69110 }, { - "epoch": 2.43, - "learning_rate": 2.7331702999670883e-05, - "loss": 0.2733, + "epoch": 2.490899917108156, + "grad_norm": 0.1888255774974823, + "learning_rate": 2.6374914158733238e-05, + "loss": 0.3924, "step": 69115 }, { - "epoch": 2.43, - "learning_rate": 2.7328866680721134e-05, - "loss": 0.2548, + "epoch": 2.4910801167693806, + "grad_norm": 0.18701927363872528, + "learning_rate": 2.6372000432190407e-05, + "loss": 0.369, "step": 69120 }, { - "epoch": 2.43, - "learning_rate": 2.7326030331532586e-05, - "loss": 0.2789, + "epoch": 2.4912603164306053, + "grad_norm": 0.25110724568367004, + "learning_rate": 2.636908668695433e-05, + "loss": 0.3578, "step": 69125 }, { - "epoch": 2.43, - "learning_rate": 2.732319395214207e-05, - "loss": 0.2904, + "epoch": 2.4914405160918296, + "grad_norm": 0.21141697466373444, + "learning_rate": 2.6366172923064714e-05, + "loss": 0.4043, "step": 69130 }, { - "epoch": 2.43, - "learning_rate": 2.7320357542586405e-05, - "loss": 0.2713, + "epoch": 2.4916207157530543, + "grad_norm": 0.23328730463981628, + "learning_rate": 2.6363259140561252e-05, + "loss": 0.4044, "step": 69135 }, { - "epoch": 2.43, - "learning_rate": 2.7317521102902433e-05, - "loss": 0.2806, + "epoch": 2.491800915414279, + "grad_norm": 0.17210394144058228, + "learning_rate": 2.6360345339483655e-05, + "loss": 0.3999, "step": 69140 }, { - "epoch": 2.43, - "learning_rate": 2.7314684633126964e-05, - "loss": 0.2691, + "epoch": 2.491981115075504, + "grad_norm": 
0.19488251209259033, + "learning_rate": 2.6357431519871612e-05, + "loss": 0.3879, "step": 69145 }, { - "epoch": 2.43, - "learning_rate": 2.7311848133296853e-05, - "loss": 0.2929, + "epoch": 2.4921613147367285, + "grad_norm": 0.23553623259067535, + "learning_rate": 2.635451768176483e-05, + "loss": 0.412, "step": 69150 }, { - "epoch": 2.43, - "learning_rate": 2.7309011603448914e-05, - "loss": 0.2708, + "epoch": 2.492341514397953, + "grad_norm": 0.21684016287326813, + "learning_rate": 2.6351603825203003e-05, + "loss": 0.425, "step": 69155 }, { - "epoch": 2.43, - "learning_rate": 2.730617504361998e-05, - "loss": 0.2812, + "epoch": 2.4925217140591776, + "grad_norm": 0.22947993874549866, + "learning_rate": 2.634868995022584e-05, + "loss": 0.3921, "step": 69160 }, { - "epoch": 2.43, - "learning_rate": 2.7303338453846884e-05, - "loss": 0.2603, + "epoch": 2.4927019137204023, + "grad_norm": 0.1686110943555832, + "learning_rate": 2.634577605687303e-05, + "loss": 0.3758, "step": 69165 }, { - "epoch": 2.43, - "learning_rate": 2.7300501834166452e-05, - "loss": 0.2747, + "epoch": 2.492882113381627, + "grad_norm": 0.2178329974412918, + "learning_rate": 2.6342862145184287e-05, + "loss": 0.3766, "step": 69170 }, { - "epoch": 2.43, - "learning_rate": 2.7297665184615522e-05, - "loss": 0.2567, + "epoch": 2.4930623130428513, + "grad_norm": 0.16872194409370422, + "learning_rate": 2.6339948215199304e-05, + "loss": 0.3641, "step": 69175 }, { - "epoch": 2.43, - "learning_rate": 2.7294828505230936e-05, - "loss": 0.2734, + "epoch": 2.493242512704076, + "grad_norm": 0.2486531138420105, + "learning_rate": 2.633703426695779e-05, + "loss": 0.4011, "step": 69180 }, { - "epoch": 2.43, - "learning_rate": 2.72919917960495e-05, - "loss": 0.2757, + "epoch": 2.4934227123653008, + "grad_norm": 0.1963934600353241, + "learning_rate": 2.6334120300499443e-05, + "loss": 0.3931, "step": 69185 }, { - "epoch": 2.43, - "learning_rate": 2.728915505710807e-05, - "loss": 0.2778, + "epoch": 2.4936029120265255, + "grad_norm": 0.20075222849845886, + "learning_rate": 2.6331206315863966e-05, + "loss": 0.3863, "step": 69190 }, { - "epoch": 2.43, - "learning_rate": 2.728631828844346e-05, - "loss": 0.2741, + "epoch": 2.4937831116877502, + "grad_norm": 0.19349929690361023, + "learning_rate": 2.6328292313091056e-05, + "loss": 0.3631, "step": 69195 }, { - "epoch": 2.43, - "learning_rate": 2.7283481490092516e-05, - "loss": 0.2572, + "epoch": 2.4939633113489745, + "grad_norm": 0.14709793031215668, + "learning_rate": 2.6325378292220428e-05, + "loss": 0.3709, "step": 69200 }, { - "epoch": 2.43, - "learning_rate": 2.728064466209207e-05, - "loss": 0.2774, + "epoch": 2.4941435110101993, + "grad_norm": 0.21333067119121552, + "learning_rate": 2.6322464253291775e-05, + "loss": 0.3988, "step": 69205 }, { - "epoch": 2.43, - "learning_rate": 2.7277807804478965e-05, - "loss": 0.2795, + "epoch": 2.494323710671424, + "grad_norm": 0.19537271559238434, + "learning_rate": 2.6319550196344793e-05, + "loss": 0.3719, "step": 69210 }, { - "epoch": 2.44, - "learning_rate": 2.7274970917290017e-05, - "loss": 0.2495, + "epoch": 2.4945039103326487, + "grad_norm": 0.22581617534160614, + "learning_rate": 2.63166361214192e-05, + "loss": 0.4023, "step": 69215 }, { - "epoch": 2.44, - "learning_rate": 2.7272134000562076e-05, - "loss": 0.2688, + "epoch": 2.494684109993873, + "grad_norm": 0.2060622274875641, + "learning_rate": 2.6313722028554692e-05, + "loss": 0.3808, "step": 69220 }, { - "epoch": 2.44, - "learning_rate": 2.7269297054331965e-05, - "loss": 0.2808, + "epoch": 
2.4948643096550978, + "grad_norm": 0.19554631412029266, + "learning_rate": 2.631080791779099e-05, + "loss": 0.4116, "step": 69225 }, { - "epoch": 2.44, - "learning_rate": 2.7266460078636534e-05, - "loss": 0.2556, + "epoch": 2.4950445093163225, + "grad_norm": 0.1858394294977188, + "learning_rate": 2.630789378916777e-05, + "loss": 0.3857, "step": 69230 }, { - "epoch": 2.44, - "learning_rate": 2.726362307351261e-05, - "loss": 0.2753, + "epoch": 2.4952247089775472, + "grad_norm": 0.23908282816410065, + "learning_rate": 2.6304979642724754e-05, + "loss": 0.3515, "step": 69235 }, { - "epoch": 2.44, - "learning_rate": 2.7260786038997045e-05, - "loss": 0.286, + "epoch": 2.495404908638772, + "grad_norm": 0.1817556619644165, + "learning_rate": 2.630206547850165e-05, + "loss": 0.4181, "step": 69240 }, { - "epoch": 2.44, - "learning_rate": 2.7257948975126647e-05, - "loss": 0.2793, + "epoch": 2.4955851082999962, + "grad_norm": 0.19453182816505432, + "learning_rate": 2.629915129653815e-05, + "loss": 0.3915, "step": 69245 }, { - "epoch": 2.44, - "learning_rate": 2.7255111881938273e-05, - "loss": 0.2538, + "epoch": 2.495765307961221, + "grad_norm": 0.18795980513095856, + "learning_rate": 2.6296237096873964e-05, + "loss": 0.3749, "step": 69250 }, { - "epoch": 2.44, - "learning_rate": 2.7252274759468754e-05, - "loss": 0.2577, + "epoch": 2.4959455076224457, + "grad_norm": 0.16127502918243408, + "learning_rate": 2.6293322879548792e-05, + "loss": 0.3953, "step": 69255 }, { - "epoch": 2.44, - "learning_rate": 2.7249437607754937e-05, - "loss": 0.2796, + "epoch": 2.4961257072836704, + "grad_norm": 0.20099008083343506, + "learning_rate": 2.629040864460236e-05, + "loss": 0.3947, "step": 69260 }, { - "epoch": 2.44, - "learning_rate": 2.7246600426833656e-05, - "loss": 0.272, + "epoch": 2.4963059069448947, + "grad_norm": 0.18468354642391205, + "learning_rate": 2.6287494392074352e-05, + "loss": 0.3996, "step": 69265 }, { - "epoch": 2.44, - "learning_rate": 2.7243763216741745e-05, - "loss": 0.2648, + "epoch": 2.4964861066061195, + "grad_norm": 0.190368190407753, + "learning_rate": 2.6284580122004482e-05, + "loss": 0.3393, "step": 69270 }, { - "epoch": 2.44, - "learning_rate": 2.724092597751606e-05, - "loss": 0.2778, + "epoch": 2.496666306267344, + "grad_norm": 0.2398798167705536, + "learning_rate": 2.6281665834432462e-05, + "loss": 0.396, "step": 69275 }, { - "epoch": 2.44, - "learning_rate": 2.723808870919341e-05, - "loss": 0.2485, + "epoch": 2.496846505928569, + "grad_norm": 0.21134978532791138, + "learning_rate": 2.6278751529397983e-05, + "loss": 0.3844, "step": 69280 }, { - "epoch": 2.44, - "learning_rate": 2.7235251411810657e-05, - "loss": 0.2654, + "epoch": 2.4970267055897937, + "grad_norm": 0.221641406416893, + "learning_rate": 2.6275837206940772e-05, + "loss": 0.3982, "step": 69285 }, { - "epoch": 2.44, - "learning_rate": 2.7232414085404643e-05, - "loss": 0.2599, + "epoch": 2.497206905251018, + "grad_norm": 0.2198064923286438, + "learning_rate": 2.6272922867100524e-05, + "loss": 0.4137, "step": 69290 }, { - "epoch": 2.44, - "learning_rate": 2.7229576730012207e-05, - "loss": 0.232, + "epoch": 2.4973871049122427, + "grad_norm": 0.26445555686950684, + "learning_rate": 2.627000850991695e-05, + "loss": 0.4162, "step": 69295 }, { - "epoch": 2.44, - "learning_rate": 2.7226739345670176e-05, - "loss": 0.2623, + "epoch": 2.4975673045734674, + "grad_norm": 0.2406071126461029, + "learning_rate": 2.6267094135429748e-05, + "loss": 0.3863, "step": 69300 }, { - "epoch": 2.44, - "learning_rate": 2.7223901932415415e-05, - "loss": 
0.2579, + "epoch": 2.497747504234692, + "grad_norm": 0.21584929525852203, + "learning_rate": 2.6264179743678642e-05, + "loss": 0.3893, "step": 69305 }, { - "epoch": 2.44, - "learning_rate": 2.7221064490284738e-05, - "loss": 0.2594, + "epoch": 2.4979277038959165, + "grad_norm": 0.19201698899269104, + "learning_rate": 2.6261265334703327e-05, + "loss": 0.4101, "step": 69310 }, { - "epoch": 2.44, - "learning_rate": 2.721822701931501e-05, - "loss": 0.2513, + "epoch": 2.498107903557141, + "grad_norm": 0.22103743255138397, + "learning_rate": 2.625835090854351e-05, + "loss": 0.4085, "step": 69315 }, { - "epoch": 2.44, - "learning_rate": 2.7215389519543068e-05, - "loss": 0.3011, + "epoch": 2.498288103218366, + "grad_norm": 0.24371977150440216, + "learning_rate": 2.625543646523892e-05, + "loss": 0.4247, "step": 69320 }, { - "epoch": 2.44, - "learning_rate": 2.7212551991005753e-05, - "loss": 0.256, + "epoch": 2.4984683028795907, + "grad_norm": 0.21865001320838928, + "learning_rate": 2.6252522004829243e-05, + "loss": 0.3971, "step": 69325 }, { - "epoch": 2.44, - "learning_rate": 2.7209714433739903e-05, - "loss": 0.2623, + "epoch": 2.4986485025408154, + "grad_norm": 0.18151873350143433, + "learning_rate": 2.6249607527354198e-05, + "loss": 0.4078, "step": 69330 }, { - "epoch": 2.44, - "learning_rate": 2.720687684778237e-05, - "loss": 0.2762, + "epoch": 2.4988287022020397, + "grad_norm": 0.19175831973552704, + "learning_rate": 2.6246693032853486e-05, + "loss": 0.4046, "step": 69335 }, { - "epoch": 2.44, - "learning_rate": 2.7204039233169992e-05, - "loss": 0.2626, + "epoch": 2.4990089018632644, + "grad_norm": 0.18904618918895721, + "learning_rate": 2.6243778521366835e-05, + "loss": 0.3663, "step": 69340 }, { - "epoch": 2.44, - "learning_rate": 2.720120158993963e-05, - "loss": 0.281, + "epoch": 2.499189101524489, + "grad_norm": 0.21790605783462524, + "learning_rate": 2.6240863992933936e-05, + "loss": 0.4033, "step": 69345 }, { - "epoch": 2.44, - "learning_rate": 2.71983639181281e-05, - "loss": 0.2703, + "epoch": 2.499369301185714, + "grad_norm": 0.20878206193447113, + "learning_rate": 2.6237949447594502e-05, + "loss": 0.4134, "step": 69350 }, { - "epoch": 2.44, - "learning_rate": 2.7195526217772278e-05, - "loss": 0.2765, + "epoch": 2.499549500846938, + "grad_norm": 0.2429923415184021, + "learning_rate": 2.6235034885388247e-05, + "loss": 0.3973, "step": 69355 }, { - "epoch": 2.44, - "learning_rate": 2.7192688488908984e-05, - "loss": 0.282, + "epoch": 2.499729700508163, + "grad_norm": 0.19370168447494507, + "learning_rate": 2.6232120306354884e-05, + "loss": 0.3918, "step": 69360 }, { - "epoch": 2.44, - "learning_rate": 2.7189850731575077e-05, - "loss": 0.2712, + "epoch": 2.4999099001693876, + "grad_norm": 0.22662904858589172, + "learning_rate": 2.6229205710534126e-05, + "loss": 0.37, "step": 69365 }, { - "epoch": 2.44, - "learning_rate": 2.71870129458074e-05, - "loss": 0.2931, + "epoch": 2.5000900998306124, + "grad_norm": 0.18225222826004028, + "learning_rate": 2.6226291097965668e-05, + "loss": 0.4206, "step": 69370 }, { - "epoch": 2.44, - "learning_rate": 2.7184175131642803e-05, - "loss": 0.2832, + "epoch": 2.500270299491837, + "grad_norm": 0.2310526967048645, + "learning_rate": 2.622337646868923e-05, + "loss": 0.4082, "step": 69375 }, { - "epoch": 2.44, - "learning_rate": 2.7181337289118137e-05, - "loss": 0.2733, + "epoch": 2.500450499153062, + "grad_norm": 0.20121249556541443, + "learning_rate": 2.6220461822744536e-05, + "loss": 0.407, "step": 69380 }, { - "epoch": 2.44, - "learning_rate": 
2.7178499418270238e-05, - "loss": 0.2858, + "epoch": 2.500630698814286, + "grad_norm": 0.24337324500083923, + "learning_rate": 2.6217547160171274e-05, + "loss": 0.4463, "step": 69385 }, { - "epoch": 2.44, - "learning_rate": 2.7175661519135958e-05, - "loss": 0.2905, + "epoch": 2.500810898475511, + "grad_norm": 0.21460457146167755, + "learning_rate": 2.6214632481009176e-05, + "loss": 0.3511, "step": 69390 }, { - "epoch": 2.44, - "learning_rate": 2.717282359175215e-05, - "loss": 0.2579, + "epoch": 2.5009910981367356, + "grad_norm": 0.19160525500774384, + "learning_rate": 2.621171778529794e-05, + "loss": 0.3843, "step": 69395 }, { - "epoch": 2.44, - "learning_rate": 2.7169985636155663e-05, - "loss": 0.2739, + "epoch": 2.50117129779796, + "grad_norm": 0.20841006934642792, + "learning_rate": 2.6208803073077294e-05, + "loss": 0.4167, "step": 69400 }, { - "epoch": 2.44, - "learning_rate": 2.7167147652383334e-05, - "loss": 0.2675, + "epoch": 2.5013514974591846, + "grad_norm": 0.21460412442684174, + "learning_rate": 2.6205888344386927e-05, + "loss": 0.3881, "step": 69405 }, { - "epoch": 2.44, - "learning_rate": 2.716430964047203e-05, - "loss": 0.2827, + "epoch": 2.5015316971204093, + "grad_norm": 0.20934461057186127, + "learning_rate": 2.6202973599266573e-05, + "loss": 0.371, "step": 69410 }, { - "epoch": 2.44, - "learning_rate": 2.7161471600458588e-05, - "loss": 0.2544, + "epoch": 2.501711896781634, + "grad_norm": 0.23594777286052704, + "learning_rate": 2.620005883775593e-05, + "loss": 0.3876, "step": 69415 }, { - "epoch": 2.44, - "learning_rate": 2.7158633532379863e-05, - "loss": 0.301, + "epoch": 2.501892096442859, + "grad_norm": 0.21076056361198425, + "learning_rate": 2.6197144059894724e-05, + "loss": 0.4082, "step": 69420 }, { - "epoch": 2.44, - "learning_rate": 2.7155795436272696e-05, - "loss": 0.2817, + "epoch": 2.5020722961040835, + "grad_norm": 0.21848683059215546, + "learning_rate": 2.619422926572266e-05, + "loss": 0.3747, "step": 69425 }, { - "epoch": 2.44, - "learning_rate": 2.715295731217396e-05, - "loss": 0.2906, + "epoch": 2.502252495765308, + "grad_norm": 0.20957939326763153, + "learning_rate": 2.6191314455279453e-05, + "loss": 0.4058, "step": 69430 }, { - "epoch": 2.44, - "learning_rate": 2.715011916012048e-05, - "loss": 0.2861, + "epoch": 2.5024326954265326, + "grad_norm": 0.23492544889450073, + "learning_rate": 2.618839962860482e-05, + "loss": 0.383, "step": 69435 }, { - "epoch": 2.44, - "learning_rate": 2.714728098014913e-05, - "loss": 0.2622, + "epoch": 2.5026128950877573, + "grad_norm": 0.16732336580753326, + "learning_rate": 2.6185484785738467e-05, + "loss": 0.391, "step": 69440 }, { - "epoch": 2.44, - "learning_rate": 2.7144442772296742e-05, - "loss": 0.2833, + "epoch": 2.5027930947489816, + "grad_norm": 0.18726083636283875, + "learning_rate": 2.6182569926720117e-05, + "loss": 0.381, "step": 69445 }, { - "epoch": 2.44, - "learning_rate": 2.7141604536600185e-05, - "loss": 0.246, + "epoch": 2.5029732944102063, + "grad_norm": 0.201826810836792, + "learning_rate": 2.617965505158948e-05, + "loss": 0.3941, "step": 69450 }, { - "epoch": 2.44, - "learning_rate": 2.7138766273096296e-05, - "loss": 0.2596, + "epoch": 2.503153494071431, + "grad_norm": 0.20765550434589386, + "learning_rate": 2.617674016038627e-05, + "loss": 0.4062, "step": 69455 }, { - "epoch": 2.44, - "learning_rate": 2.7135927981821947e-05, - "loss": 0.2619, + "epoch": 2.503333693732656, + "grad_norm": 0.23174385726451874, + "learning_rate": 2.61738252531502e-05, + "loss": 0.3968, "step": 69460 }, { - "epoch": 2.44, - 
"learning_rate": 2.7133089662813977e-05, - "loss": 0.2508, + "epoch": 2.5035138933938805, + "grad_norm": 0.21468636393547058, + "learning_rate": 2.6170910329920994e-05, + "loss": 0.3938, "step": 69465 }, { - "epoch": 2.44, - "learning_rate": 2.7130251316109233e-05, - "loss": 0.2653, + "epoch": 2.5036940930551053, + "grad_norm": 0.23031465709209442, + "learning_rate": 2.6167995390738366e-05, + "loss": 0.4022, "step": 69470 }, { - "epoch": 2.44, - "learning_rate": 2.712741294174459e-05, - "loss": 0.2602, + "epoch": 2.5038742927163296, + "grad_norm": 0.19280219078063965, + "learning_rate": 2.6165080435642014e-05, + "loss": 0.3947, "step": 69475 }, { - "epoch": 2.44, - "learning_rate": 2.71245745397569e-05, - "loss": 0.2717, + "epoch": 2.5040544923775543, + "grad_norm": 0.20321400463581085, + "learning_rate": 2.616216546467168e-05, + "loss": 0.3962, "step": 69480 }, { - "epoch": 2.44, - "learning_rate": 2.7121736110182994e-05, - "loss": 0.2808, + "epoch": 2.504234692038779, + "grad_norm": 0.16147299110889435, + "learning_rate": 2.6159250477867053e-05, + "loss": 0.4046, "step": 69485 }, { - "epoch": 2.44, - "learning_rate": 2.711889765305975e-05, - "loss": 0.2695, + "epoch": 2.5044148917000038, + "grad_norm": 0.21102051436901093, + "learning_rate": 2.6156335475267874e-05, + "loss": 0.4235, "step": 69490 }, { - "epoch": 2.45, - "learning_rate": 2.711605916842402e-05, - "loss": 0.281, + "epoch": 2.504595091361228, + "grad_norm": 0.22560648620128632, + "learning_rate": 2.615342045691384e-05, + "loss": 0.4053, "step": 69495 }, { - "epoch": 2.45, - "learning_rate": 2.711322065631265e-05, - "loss": 0.2699, + "epoch": 2.5047752910224528, + "grad_norm": 0.17885218560695648, + "learning_rate": 2.615050542284468e-05, + "loss": 0.35, "step": 69500 }, { - "epoch": 2.45, - "eval_loss": 0.2653324604034424, - "eval_runtime": 10.5362, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 2.5047752910224528, + "eval_loss": 0.4340173304080963, + "eval_runtime": 3.5362, + "eval_samples_per_second": 28.279, + "eval_steps_per_second": 7.07, "step": 69500 }, { - "epoch": 2.45, - "learning_rate": 2.71103821167625e-05, - "loss": 0.2805, + "epoch": 2.5049554906836775, + "grad_norm": 0.23639249801635742, + "learning_rate": 2.6147590373100106e-05, + "loss": 0.4151, "step": 69505 }, { - "epoch": 2.45, - "learning_rate": 2.7107543549810433e-05, - "loss": 0.2772, + "epoch": 2.5051356903449022, + "grad_norm": 0.260985404253006, + "learning_rate": 2.6144675307719835e-05, + "loss": 0.4056, "step": 69510 }, { - "epoch": 2.45, - "learning_rate": 2.7104704955493303e-05, - "loss": 0.2752, + "epoch": 2.505315890006127, + "grad_norm": 0.21319666504859924, + "learning_rate": 2.614176022674359e-05, + "loss": 0.3559, "step": 69515 }, { - "epoch": 2.45, - "learning_rate": 2.7101866333847962e-05, - "loss": 0.2571, + "epoch": 2.5054960896673513, + "grad_norm": 0.20121723413467407, + "learning_rate": 2.613884513021107e-05, + "loss": 0.4013, "step": 69520 }, { - "epoch": 2.45, - "learning_rate": 2.709902768491128e-05, - "loss": 0.2747, + "epoch": 2.505676289328576, + "grad_norm": 0.1814257949590683, + "learning_rate": 2.613593001816201e-05, + "loss": 0.3781, "step": 69525 }, { - "epoch": 2.45, - "learning_rate": 2.7096189008720096e-05, - "loss": 0.262, + "epoch": 2.5058564889898007, + "grad_norm": 0.20692424476146698, + "learning_rate": 2.613301489063613e-05, + "loss": 0.3848, "step": 69530 }, { - "epoch": 2.45, - "learning_rate": 2.7093350305311288e-05, - "loss": 0.2717, + "epoch": 2.5060366886510255, + 
"grad_norm": 0.21745166182518005, + "learning_rate": 2.6130099747673136e-05, + "loss": 0.3875, "step": 69535 }, { - "epoch": 2.45, - "learning_rate": 2.7090511574721705e-05, - "loss": 0.2975, + "epoch": 2.5062168883122498, + "grad_norm": 0.19533394277095795, + "learning_rate": 2.612718458931276e-05, + "loss": 0.4273, "step": 69540 }, { - "epoch": 2.45, - "learning_rate": 2.708767281698821e-05, - "loss": 0.2871, + "epoch": 2.5063970879734745, + "grad_norm": 0.18785609304904938, + "learning_rate": 2.6124269415594698e-05, + "loss": 0.4146, "step": 69545 }, { - "epoch": 2.45, - "learning_rate": 2.708483403214765e-05, - "loss": 0.2548, + "epoch": 2.5065772876346992, + "grad_norm": 0.21127115190029144, + "learning_rate": 2.6121354226558692e-05, + "loss": 0.4068, "step": 69550 }, { - "epoch": 2.45, - "learning_rate": 2.7081995220236893e-05, - "loss": 0.269, + "epoch": 2.506757487295924, + "grad_norm": 0.2023439109325409, + "learning_rate": 2.611843902224445e-05, + "loss": 0.4257, "step": 69555 }, { - "epoch": 2.45, - "learning_rate": 2.707915638129281e-05, - "loss": 0.2799, + "epoch": 2.5069376869571487, + "grad_norm": 0.1987898051738739, + "learning_rate": 2.6115523802691695e-05, + "loss": 0.4159, "step": 69560 }, { - "epoch": 2.45, - "learning_rate": 2.7076317515352247e-05, - "loss": 0.2619, + "epoch": 2.5071178866183734, + "grad_norm": 0.2078784704208374, + "learning_rate": 2.6112608567940138e-05, + "loss": 0.3997, "step": 69565 }, { - "epoch": 2.45, - "learning_rate": 2.707347862245207e-05, - "loss": 0.2628, + "epoch": 2.5072980862795977, + "grad_norm": 0.27035263180732727, + "learning_rate": 2.610969331802951e-05, + "loss": 0.4235, "step": 69570 }, { - "epoch": 2.45, - "learning_rate": 2.7070639702629142e-05, - "loss": 0.2729, + "epoch": 2.5074782859408224, + "grad_norm": 0.22846317291259766, + "learning_rate": 2.610677805299953e-05, + "loss": 0.3812, "step": 69575 }, { - "epoch": 2.45, - "learning_rate": 2.706780075592032e-05, - "loss": 0.2924, + "epoch": 2.507658485602047, + "grad_norm": 0.20302127301692963, + "learning_rate": 2.6103862772889902e-05, + "loss": 0.3991, "step": 69580 }, { - "epoch": 2.45, - "learning_rate": 2.7064961782362462e-05, - "loss": 0.2756, + "epoch": 2.5078386852632715, + "grad_norm": 0.20338794589042664, + "learning_rate": 2.6100947477740367e-05, + "loss": 0.3879, "step": 69585 }, { - "epoch": 2.45, - "learning_rate": 2.7062122781992438e-05, - "loss": 0.2735, + "epoch": 2.508018884924496, + "grad_norm": 0.1619461178779602, + "learning_rate": 2.609803216759063e-05, + "loss": 0.4004, "step": 69590 }, { - "epoch": 2.45, - "learning_rate": 2.705928375484712e-05, - "loss": 0.2717, + "epoch": 2.508199084585721, + "grad_norm": 0.2222161740064621, + "learning_rate": 2.6095116842480417e-05, + "loss": 0.3937, "step": 69595 }, { - "epoch": 2.45, - "learning_rate": 2.7056444700963346e-05, - "loss": 0.258, + "epoch": 2.5083792842469457, + "grad_norm": 0.18493251502513885, + "learning_rate": 2.6092201502449455e-05, + "loss": 0.3873, "step": 69600 }, { - "epoch": 2.45, - "learning_rate": 2.7053605620377998e-05, - "loss": 0.2592, + "epoch": 2.5085594839081704, + "grad_norm": 0.21007417142391205, + "learning_rate": 2.6089286147537452e-05, + "loss": 0.3758, "step": 69605 }, { - "epoch": 2.45, - "learning_rate": 2.7050766513127934e-05, - "loss": 0.2745, + "epoch": 2.508739683569395, + "grad_norm": 0.2484482377767563, + "learning_rate": 2.608637077778414e-05, + "loss": 0.4411, "step": 69610 }, { - "epoch": 2.45, - "learning_rate": 2.7047927379250016e-05, - "loss": 0.274, + "epoch": 
2.5089198832306194, + "grad_norm": 0.2558552920818329, + "learning_rate": 2.608345539322924e-05, + "loss": 0.4278, "step": 69615 }, { - "epoch": 2.45, - "learning_rate": 2.7045088218781116e-05, - "loss": 0.2726, + "epoch": 2.509100082891844, + "grad_norm": 0.2762976586818695, + "learning_rate": 2.6080539993912467e-05, + "loss": 0.4425, "step": 69620 }, { - "epoch": 2.45, - "learning_rate": 2.7042249031758088e-05, - "loss": 0.2749, + "epoch": 2.509280282553069, + "grad_norm": 0.2021670788526535, + "learning_rate": 2.607762457987354e-05, + "loss": 0.3999, "step": 69625 }, { - "epoch": 2.45, - "learning_rate": 2.703940981821781e-05, - "loss": 0.2606, + "epoch": 2.509460482214293, + "grad_norm": 0.22645121812820435, + "learning_rate": 2.6074709151152193e-05, + "loss": 0.4255, "step": 69630 }, { - "epoch": 2.45, - "learning_rate": 2.7036570578197134e-05, - "loss": 0.2564, + "epoch": 2.509640681875518, + "grad_norm": 0.23237501084804535, + "learning_rate": 2.6071793707788138e-05, + "loss": 0.431, "step": 69635 }, { - "epoch": 2.45, - "learning_rate": 2.703373131173293e-05, - "loss": 0.2579, + "epoch": 2.5098208815367427, + "grad_norm": 0.22652441263198853, + "learning_rate": 2.6068878249821106e-05, + "loss": 0.3742, "step": 69640 }, { - "epoch": 2.45, - "learning_rate": 2.7030892018862066e-05, - "loss": 0.2936, + "epoch": 2.5100010811979674, + "grad_norm": 0.17889447510242462, + "learning_rate": 2.606596277729081e-05, + "loss": 0.3447, "step": 69645 }, { - "epoch": 2.45, - "learning_rate": 2.702805269962141e-05, - "loss": 0.2619, + "epoch": 2.510181280859192, + "grad_norm": 0.21000555157661438, + "learning_rate": 2.6063047290236974e-05, + "loss": 0.3839, "step": 69650 }, { - "epoch": 2.45, - "learning_rate": 2.7025213354047824e-05, - "loss": 0.2891, + "epoch": 2.510361480520417, + "grad_norm": 0.18811647593975067, + "learning_rate": 2.6060131788699343e-05, + "loss": 0.4068, "step": 69655 }, { - "epoch": 2.45, - "learning_rate": 2.7022373982178182e-05, - "loss": 0.2682, + "epoch": 2.510541680181641, + "grad_norm": 0.21040557324886322, + "learning_rate": 2.6057216272717605e-05, + "loss": 0.3839, "step": 69660 }, { - "epoch": 2.45, - "learning_rate": 2.7019534584049333e-05, - "loss": 0.2676, + "epoch": 2.510721879842866, + "grad_norm": 0.23716652393341064, + "learning_rate": 2.6054300742331498e-05, + "loss": 0.4053, "step": 69665 }, { - "epoch": 2.45, - "learning_rate": 2.7016695159698174e-05, - "loss": 0.2531, + "epoch": 2.5109020795040906, + "grad_norm": 0.2132147252559662, + "learning_rate": 2.6051385197580757e-05, + "loss": 0.3949, "step": 69670 }, { - "epoch": 2.45, - "learning_rate": 2.701385570916155e-05, - "loss": 0.271, + "epoch": 2.511082279165315, + "grad_norm": 0.20189203321933746, + "learning_rate": 2.6048469638505092e-05, + "loss": 0.4238, "step": 69675 }, { - "epoch": 2.45, - "learning_rate": 2.7011016232476344e-05, - "loss": 0.2571, + "epoch": 2.5112624788265396, + "grad_norm": 0.2348521649837494, + "learning_rate": 2.6045554065144234e-05, + "loss": 0.3931, "step": 69680 }, { - "epoch": 2.45, - "learning_rate": 2.7008176729679407e-05, - "loss": 0.2842, + "epoch": 2.5114426784877644, + "grad_norm": 0.1996840536594391, + "learning_rate": 2.60426384775379e-05, + "loss": 0.4236, "step": 69685 }, { - "epoch": 2.45, - "learning_rate": 2.7005337200807624e-05, - "loss": 0.2799, + "epoch": 2.511622878148989, + "grad_norm": 0.20005884766578674, + "learning_rate": 2.603972287572582e-05, + "loss": 0.4177, "step": 69690 }, { - "epoch": 2.45, - "learning_rate": 2.7002497645897855e-05, - "loss": 
0.266, + "epoch": 2.511803077810214, + "grad_norm": 0.19069145619869232, + "learning_rate": 2.603680725974772e-05, + "loss": 0.4015, "step": 69695 }, { - "epoch": 2.45, - "learning_rate": 2.6999658064986983e-05, - "loss": 0.2721, + "epoch": 2.5119832774714386, + "grad_norm": 0.211561918258667, + "learning_rate": 2.6033891629643314e-05, + "loss": 0.3864, "step": 69700 }, { - "epoch": 2.45, - "learning_rate": 2.6996818458111862e-05, - "loss": 0.2712, + "epoch": 2.512163477132663, + "grad_norm": 0.2280474752187729, + "learning_rate": 2.6030975985452344e-05, + "loss": 0.3813, "step": 69705 }, { - "epoch": 2.45, - "learning_rate": 2.6993978825309374e-05, - "loss": 0.2511, + "epoch": 2.5123436767938876, + "grad_norm": 0.1532917618751526, + "learning_rate": 2.602806032721452e-05, + "loss": 0.3638, "step": 69710 }, { - "epoch": 2.45, - "learning_rate": 2.6991139166616375e-05, - "loss": 0.2838, + "epoch": 2.5125238764551123, + "grad_norm": 0.22681502997875214, + "learning_rate": 2.602514465496958e-05, + "loss": 0.4054, "step": 69715 }, { - "epoch": 2.45, - "learning_rate": 2.698829948206975e-05, - "loss": 0.2664, + "epoch": 2.512704076116337, + "grad_norm": 0.18224921822547913, + "learning_rate": 2.6022228968757233e-05, + "loss": 0.36, "step": 69720 }, { - "epoch": 2.45, - "learning_rate": 2.6985459771706367e-05, - "loss": 0.2854, + "epoch": 2.5128842757775613, + "grad_norm": 0.18726272881031036, + "learning_rate": 2.6019313268617223e-05, + "loss": 0.3718, "step": 69725 }, { - "epoch": 2.45, - "learning_rate": 2.69826200355631e-05, - "loss": 0.2769, + "epoch": 2.513064475438786, + "grad_norm": 0.21877476572990417, + "learning_rate": 2.601639755458926e-05, + "loss": 0.398, "step": 69730 }, { - "epoch": 2.45, - "learning_rate": 2.697978027367682e-05, - "loss": 0.2712, + "epoch": 2.513244675100011, + "grad_norm": 0.2230885922908783, + "learning_rate": 2.6013481826713083e-05, + "loss": 0.3982, "step": 69735 }, { - "epoch": 2.45, - "learning_rate": 2.6976940486084394e-05, - "loss": 0.2626, + "epoch": 2.5134248747612356, + "grad_norm": 0.22673091292381287, + "learning_rate": 2.6010566085028408e-05, + "loss": 0.4168, "step": 69740 }, { - "epoch": 2.45, - "learning_rate": 2.6974100672822705e-05, - "loss": 0.2676, + "epoch": 2.5136050744224603, + "grad_norm": 0.19574376940727234, + "learning_rate": 2.6007650329574968e-05, + "loss": 0.3864, "step": 69745 }, { - "epoch": 2.45, - "learning_rate": 2.6971260833928607e-05, - "loss": 0.2453, + "epoch": 2.5137852740836846, + "grad_norm": 0.1650337278842926, + "learning_rate": 2.6004734560392487e-05, + "loss": 0.4042, "step": 69750 }, { - "epoch": 2.45, - "learning_rate": 2.6968420969439002e-05, - "loss": 0.2894, + "epoch": 2.5139654737449093, + "grad_norm": 0.19529931247234344, + "learning_rate": 2.6001818777520692e-05, + "loss": 0.3919, "step": 69755 }, { - "epoch": 2.45, - "learning_rate": 2.6965581079390734e-05, - "loss": 0.2795, + "epoch": 2.514145673406134, + "grad_norm": 0.21473079919815063, + "learning_rate": 2.5998902980999314e-05, + "loss": 0.3822, "step": 69760 }, { - "epoch": 2.45, - "learning_rate": 2.6962741163820703e-05, - "loss": 0.2667, + "epoch": 2.5143258730673588, + "grad_norm": 0.20333951711654663, + "learning_rate": 2.5995987170868068e-05, + "loss": 0.3926, "step": 69765 }, { - "epoch": 2.45, - "learning_rate": 2.6959901222765764e-05, - "loss": 0.2664, + "epoch": 2.514506072728583, + "grad_norm": 0.2425466775894165, + "learning_rate": 2.5993071347166693e-05, + "loss": 0.3759, "step": 69770 }, { - "epoch": 2.45, - "learning_rate": 
2.69570612562628e-05, - "loss": 0.2816, + "epoch": 2.514686272389808, + "grad_norm": 0.20622824132442474, + "learning_rate": 2.599015550993491e-05, + "loss": 0.3813, "step": 69775 }, { - "epoch": 2.46, - "learning_rate": 2.6954221264348684e-05, - "loss": 0.2869, + "epoch": 2.5148664720510325, + "grad_norm": 0.18303707242012024, + "learning_rate": 2.598723965921246e-05, + "loss": 0.4201, "step": 69780 }, { - "epoch": 2.46, - "learning_rate": 2.69513812470603e-05, - "loss": 0.2762, + "epoch": 2.5150466717122573, + "grad_norm": 0.1895645707845688, + "learning_rate": 2.5984323795039057e-05, + "loss": 0.362, "step": 69785 }, { - "epoch": 2.46, - "learning_rate": 2.694854120443451e-05, - "loss": 0.278, + "epoch": 2.515226871373482, + "grad_norm": 0.20545552670955658, + "learning_rate": 2.5981407917454427e-05, + "loss": 0.384, "step": 69790 }, { - "epoch": 2.46, - "learning_rate": 2.69457011365082e-05, - "loss": 0.2699, + "epoch": 2.5154070710347067, + "grad_norm": 0.20499677956104279, + "learning_rate": 2.5978492026498308e-05, + "loss": 0.3918, "step": 69795 }, { - "epoch": 2.46, - "learning_rate": 2.694286104331824e-05, - "loss": 0.2747, + "epoch": 2.515587270695931, + "grad_norm": 0.22519873082637787, + "learning_rate": 2.5975576122210422e-05, + "loss": 0.4266, "step": 69800 }, { - "epoch": 2.46, - "learning_rate": 2.694002092490151e-05, - "loss": 0.2802, + "epoch": 2.5157674703571558, + "grad_norm": 0.21015673875808716, + "learning_rate": 2.59726602046305e-05, + "loss": 0.3873, "step": 69805 }, { - "epoch": 2.46, - "learning_rate": 2.6937180781294885e-05, - "loss": 0.288, + "epoch": 2.5159476700183805, + "grad_norm": 0.19208891689777374, + "learning_rate": 2.5969744273798274e-05, + "loss": 0.397, "step": 69810 }, { - "epoch": 2.46, - "learning_rate": 2.6934340612535247e-05, - "loss": 0.2781, + "epoch": 2.516127869679605, + "grad_norm": 0.23260699212551117, + "learning_rate": 2.5966828329753467e-05, + "loss": 0.4168, "step": 69815 }, { - "epoch": 2.46, - "learning_rate": 2.6931500418659473e-05, - "loss": 0.284, + "epoch": 2.5163080693408295, + "grad_norm": 0.2245369702577591, + "learning_rate": 2.596391237253582e-05, + "loss": 0.4131, "step": 69820 }, { - "epoch": 2.46, - "learning_rate": 2.6928660199704435e-05, - "loss": 0.2531, + "epoch": 2.5164882690020542, + "grad_norm": 0.17223899066448212, + "learning_rate": 2.5960996402185043e-05, + "loss": 0.373, "step": 69825 }, { - "epoch": 2.46, - "learning_rate": 2.6925819955707014e-05, - "loss": 0.2725, + "epoch": 2.516668468663279, + "grad_norm": 0.23170410096645355, + "learning_rate": 2.595808041874088e-05, + "loss": 0.4081, "step": 69830 }, { - "epoch": 2.46, - "learning_rate": 2.6922979686704097e-05, - "loss": 0.2799, + "epoch": 2.5168486683245037, + "grad_norm": 0.21683074533939362, + "learning_rate": 2.5955164422243054e-05, + "loss": 0.4064, "step": 69835 }, { - "epoch": 2.46, - "learning_rate": 2.6920139392732552e-05, - "loss": 0.2975, + "epoch": 2.5170288679857284, + "grad_norm": 0.16481082141399384, + "learning_rate": 2.5952248412731305e-05, + "loss": 0.3402, "step": 69840 }, { - "epoch": 2.46, - "learning_rate": 2.691729907382926e-05, - "loss": 0.2886, + "epoch": 2.5172090676469527, + "grad_norm": 0.15683116018772125, + "learning_rate": 2.594933239024535e-05, + "loss": 0.4006, "step": 69845 }, { - "epoch": 2.46, - "learning_rate": 2.691445873003111e-05, - "loss": 0.2714, + "epoch": 2.5173892673081775, + "grad_norm": 0.18406714498996735, + "learning_rate": 2.5946416354824927e-05, + "loss": 0.3865, "step": 69850 }, { - "epoch": 2.46, - 
"learning_rate": 2.691161836137497e-05, - "loss": 0.2786, + "epoch": 2.517569466969402, + "grad_norm": 0.19561605155467987, + "learning_rate": 2.5943500306509765e-05, + "loss": 0.4131, "step": 69855 }, { - "epoch": 2.46, - "learning_rate": 2.6908777967897718e-05, - "loss": 0.2714, + "epoch": 2.5177496666306265, + "grad_norm": 0.19253993034362793, + "learning_rate": 2.5940584245339593e-05, + "loss": 0.3926, "step": 69860 }, { - "epoch": 2.46, - "learning_rate": 2.690593754963625e-05, - "loss": 0.2534, + "epoch": 2.5179298662918512, + "grad_norm": 0.24067455530166626, + "learning_rate": 2.5937668171354145e-05, + "loss": 0.3809, "step": 69865 }, { - "epoch": 2.46, - "learning_rate": 2.6903097106627447e-05, - "loss": 0.2817, + "epoch": 2.518110065953076, + "grad_norm": 0.17989800870418549, + "learning_rate": 2.5934752084593143e-05, + "loss": 0.3891, "step": 69870 }, { - "epoch": 2.46, - "learning_rate": 2.6900256638908174e-05, - "loss": 0.2838, + "epoch": 2.5182902656143007, + "grad_norm": 0.21168993413448334, + "learning_rate": 2.593183598509633e-05, + "loss": 0.3663, "step": 69875 }, { - "epoch": 2.46, - "learning_rate": 2.689741614651532e-05, - "loss": 0.2638, + "epoch": 2.5184704652755254, + "grad_norm": 0.19849541783332825, + "learning_rate": 2.5928919872903434e-05, + "loss": 0.4195, "step": 69880 }, { - "epoch": 2.46, - "learning_rate": 2.6894575629485762e-05, - "loss": 0.2688, + "epoch": 2.51865066493675, + "grad_norm": 0.22839267551898956, + "learning_rate": 2.592600374805418e-05, + "loss": 0.4412, "step": 69885 }, { - "epoch": 2.46, - "learning_rate": 2.68917350878564e-05, - "loss": 0.2442, + "epoch": 2.5188308645979745, + "grad_norm": 0.2030458301305771, + "learning_rate": 2.5923087610588305e-05, + "loss": 0.3982, "step": 69890 }, { - "epoch": 2.46, - "learning_rate": 2.68888945216641e-05, - "loss": 0.2397, + "epoch": 2.519011064259199, + "grad_norm": 0.161631777882576, + "learning_rate": 2.592017146054554e-05, + "loss": 0.3629, "step": 69895 }, { - "epoch": 2.46, - "learning_rate": 2.6886053930945755e-05, - "loss": 0.2594, + "epoch": 2.519191263920424, + "grad_norm": 0.21375270187854767, + "learning_rate": 2.5917255297965625e-05, + "loss": 0.3821, "step": 69900 }, { - "epoch": 2.46, - "learning_rate": 2.6883213315738232e-05, - "loss": 0.257, + "epoch": 2.519371463581648, + "grad_norm": 0.19490370154380798, + "learning_rate": 2.5914339122888272e-05, + "loss": 0.411, "step": 69905 }, { - "epoch": 2.46, - "learning_rate": 2.6880372676078423e-05, - "loss": 0.2749, + "epoch": 2.519551663242873, + "grad_norm": 0.2110588401556015, + "learning_rate": 2.591142293535323e-05, + "loss": 0.3623, "step": 69910 }, { - "epoch": 2.46, - "learning_rate": 2.6877532012003225e-05, - "loss": 0.2849, + "epoch": 2.5197318629040977, + "grad_norm": 0.21977630257606506, + "learning_rate": 2.5908506735400223e-05, + "loss": 0.4114, "step": 69915 }, { - "epoch": 2.46, - "learning_rate": 2.687469132354951e-05, - "loss": 0.2624, + "epoch": 2.5199120625653224, + "grad_norm": 0.20621120929718018, + "learning_rate": 2.5905590523068995e-05, + "loss": 0.3517, "step": 69920 }, { - "epoch": 2.46, - "learning_rate": 2.6871850610754156e-05, - "loss": 0.2645, + "epoch": 2.520092262226547, + "grad_norm": 0.20464351773262024, + "learning_rate": 2.5902674298399273e-05, + "loss": 0.3803, "step": 69925 }, { - "epoch": 2.46, - "learning_rate": 2.6869009873654066e-05, - "loss": 0.2821, + "epoch": 2.520272461887772, + "grad_norm": 0.20108066499233246, + "learning_rate": 2.5899758061430777e-05, + "loss": 0.4085, "step": 69930 }, { - 
"epoch": 2.46, - "learning_rate": 2.6866169112286105e-05, - "loss": 0.2655, + "epoch": 2.520452661548996, + "grad_norm": 0.2749893367290497, + "learning_rate": 2.5896841812203265e-05, + "loss": 0.3936, "step": 69935 }, { - "epoch": 2.46, - "learning_rate": 2.686332832668717e-05, - "loss": 0.2726, + "epoch": 2.520632861210221, + "grad_norm": 0.21574847400188446, + "learning_rate": 2.589392555075645e-05, + "loss": 0.3946, "step": 69940 }, { - "epoch": 2.46, - "learning_rate": 2.6860487516894144e-05, - "loss": 0.2955, + "epoch": 2.5208130608714456, + "grad_norm": 0.17388522624969482, + "learning_rate": 2.5891009277130073e-05, + "loss": 0.4072, "step": 69945 }, { - "epoch": 2.46, - "learning_rate": 2.685764668294392e-05, - "loss": 0.2693, + "epoch": 2.52099326053267, + "grad_norm": 0.21994316577911377, + "learning_rate": 2.5888092991363867e-05, + "loss": 0.4174, "step": 69950 }, { - "epoch": 2.46, - "learning_rate": 2.6854805824873376e-05, - "loss": 0.2529, + "epoch": 2.5211734601938947, + "grad_norm": 0.17600910365581512, + "learning_rate": 2.5885176693497558e-05, + "loss": 0.3881, "step": 69955 }, { - "epoch": 2.46, - "learning_rate": 2.6851964942719394e-05, - "loss": 0.2596, + "epoch": 2.5213536598551194, + "grad_norm": 0.19850102066993713, + "learning_rate": 2.5882260383570906e-05, + "loss": 0.3902, "step": 69960 }, { - "epoch": 2.46, - "learning_rate": 2.684912403651888e-05, - "loss": 0.2836, + "epoch": 2.521533859516344, + "grad_norm": 0.20933623611927032, + "learning_rate": 2.587934406162361e-05, + "loss": 0.3793, "step": 69965 }, { - "epoch": 2.46, - "learning_rate": 2.684628310630869e-05, - "loss": 0.2747, + "epoch": 2.521714059177569, + "grad_norm": 0.2151513695716858, + "learning_rate": 2.5876427727695433e-05, + "loss": 0.3761, "step": 69970 }, { - "epoch": 2.46, - "learning_rate": 2.6843442152125748e-05, - "loss": 0.2686, + "epoch": 2.5218942588387936, + "grad_norm": 0.19325150549411774, + "learning_rate": 2.5873511381826087e-05, + "loss": 0.3629, "step": 69975 }, { - "epoch": 2.46, - "learning_rate": 2.6840601174006918e-05, - "loss": 0.2522, + "epoch": 2.522074458500018, + "grad_norm": 0.1916101574897766, + "learning_rate": 2.5870595024055328e-05, + "loss": 0.3827, "step": 69980 }, { - "epoch": 2.46, - "learning_rate": 2.6837760171989094e-05, - "loss": 0.2775, + "epoch": 2.5222546581612426, + "grad_norm": 0.17457756400108337, + "learning_rate": 2.586767865442288e-05, + "loss": 0.4084, "step": 69985 }, { - "epoch": 2.46, - "learning_rate": 2.6834919146109167e-05, - "loss": 0.2693, + "epoch": 2.5224348578224673, + "grad_norm": 0.19067886471748352, + "learning_rate": 2.586476227296847e-05, + "loss": 0.3825, "step": 69990 }, { - "epoch": 2.46, - "learning_rate": 2.683207809640402e-05, - "loss": 0.2827, + "epoch": 2.522615057483692, + "grad_norm": 0.18429118394851685, + "learning_rate": 2.586184587973185e-05, + "loss": 0.3885, "step": 69995 }, { - "epoch": 2.46, - "learning_rate": 2.6829237022910552e-05, - "loss": 0.238, + "epoch": 2.5227952571449164, + "grad_norm": 0.18464790284633636, + "learning_rate": 2.5858929474752734e-05, + "loss": 0.4124, "step": 70000 }, { - "epoch": 2.46, - "eval_loss": 0.26445090770721436, - "eval_runtime": 10.5622, - "eval_samples_per_second": 9.468, - "eval_steps_per_second": 9.468, + "epoch": 2.5227952571449164, + "eval_loss": 0.43379154801368713, + "eval_runtime": 3.5327, + "eval_samples_per_second": 28.307, + "eval_steps_per_second": 7.077, "step": 70000 }, { - "epoch": 2.46, - "learning_rate": 2.682639592566565e-05, - "loss": 0.2826, + "epoch": 
2.522975456806141, + "grad_norm": 0.20850814878940582, + "learning_rate": 2.5856013058070888e-05, + "loss": 0.3697, "step": 70005 }, { - "epoch": 2.46, - "learning_rate": 2.6823554804706192e-05, - "loss": 0.2633, + "epoch": 2.523155656467366, + "grad_norm": 0.23533137142658234, + "learning_rate": 2.5853096629726022e-05, + "loss": 0.4432, "step": 70010 }, { - "epoch": 2.46, - "learning_rate": 2.6820713660069087e-05, - "loss": 0.2642, + "epoch": 2.5233358561285906, + "grad_norm": 0.22087322175502777, + "learning_rate": 2.5850180189757878e-05, + "loss": 0.3875, "step": 70015 }, { - "epoch": 2.46, - "learning_rate": 2.68178724917912e-05, - "loss": 0.2651, + "epoch": 2.5235160557898153, + "grad_norm": 0.23013746738433838, + "learning_rate": 2.584726373820619e-05, + "loss": 0.3855, "step": 70020 }, { - "epoch": 2.46, - "learning_rate": 2.681503129990945e-05, - "loss": 0.2732, + "epoch": 2.5236962554510396, + "grad_norm": 0.17639127373695374, + "learning_rate": 2.5844347275110702e-05, + "loss": 0.3714, "step": 70025 }, { - "epoch": 2.46, - "learning_rate": 2.6812190084460704e-05, - "loss": 0.2791, + "epoch": 2.5238764551122643, + "grad_norm": 0.18888616561889648, + "learning_rate": 2.5841430800511145e-05, + "loss": 0.4043, "step": 70030 }, { - "epoch": 2.46, - "learning_rate": 2.6809348845481873e-05, - "loss": 0.2664, + "epoch": 2.524056654773489, + "grad_norm": 0.2257877141237259, + "learning_rate": 2.5838514314447255e-05, + "loss": 0.3873, "step": 70035 }, { - "epoch": 2.46, - "learning_rate": 2.6806507583009837e-05, - "loss": 0.2579, + "epoch": 2.524236854434714, + "grad_norm": 0.20586729049682617, + "learning_rate": 2.5835597816958774e-05, + "loss": 0.406, "step": 70040 }, { - "epoch": 2.46, - "learning_rate": 2.680366629708149e-05, - "loss": 0.2769, + "epoch": 2.524417054095938, + "grad_norm": 0.2077612429857254, + "learning_rate": 2.583268130808543e-05, + "loss": 0.3753, "step": 70045 }, { - "epoch": 2.46, - "learning_rate": 2.680082498773372e-05, - "loss": 0.2742, + "epoch": 2.524597253757163, + "grad_norm": 0.17256224155426025, + "learning_rate": 2.5829764787866974e-05, + "loss": 0.35, "step": 70050 }, { - "epoch": 2.46, - "learning_rate": 2.6797983655003433e-05, - "loss": 0.2812, + "epoch": 2.5247774534183876, + "grad_norm": 0.28003284335136414, + "learning_rate": 2.5826848256343116e-05, + "loss": 0.4164, "step": 70055 }, { - "epoch": 2.46, - "learning_rate": 2.6795142298927507e-05, - "loss": 0.2794, + "epoch": 2.5249576530796123, + "grad_norm": 0.234117329120636, + "learning_rate": 2.582393171355363e-05, + "loss": 0.3949, "step": 70060 }, { - "epoch": 2.47, - "learning_rate": 2.6792300919542845e-05, - "loss": 0.2661, + "epoch": 2.525137852740837, + "grad_norm": 0.21253302693367004, + "learning_rate": 2.582101515953822e-05, + "loss": 0.3964, "step": 70065 }, { - "epoch": 2.47, - "learning_rate": 2.6789459516886334e-05, - "loss": 0.2733, + "epoch": 2.5253180524020618, + "grad_norm": 0.2189071625471115, + "learning_rate": 2.5818098594336636e-05, + "loss": 0.3859, "step": 70070 }, { - "epoch": 2.47, - "learning_rate": 2.678661809099487e-05, - "loss": 0.2739, + "epoch": 2.525498252063286, + "grad_norm": 0.2182706594467163, + "learning_rate": 2.5815182017988626e-05, + "loss": 0.3934, "step": 70075 }, { - "epoch": 2.47, - "learning_rate": 2.6783776641905346e-05, - "loss": 0.2775, + "epoch": 2.5256784517245108, + "grad_norm": 0.1897723227739334, + "learning_rate": 2.5812265430533916e-05, + "loss": 0.4036, "step": 70080 }, { - "epoch": 2.47, - "learning_rate": 2.6780935169654657e-05, - "loss": 
0.2773, + "epoch": 2.5258586513857355, + "grad_norm": 0.20511159300804138, + "learning_rate": 2.5809348832012253e-05, + "loss": 0.4152, "step": 70085 }, { - "epoch": 2.47, - "learning_rate": 2.6778093674279708e-05, - "loss": 0.2615, + "epoch": 2.52603885104696, + "grad_norm": 0.21636447310447693, + "learning_rate": 2.5806432222463356e-05, + "loss": 0.379, "step": 70090 }, { - "epoch": 2.47, - "learning_rate": 2.6775252155817375e-05, - "loss": 0.3045, + "epoch": 2.5262190507081845, + "grad_norm": 0.2109370082616806, + "learning_rate": 2.580351560192698e-05, + "loss": 0.386, "step": 70095 }, { - "epoch": 2.47, - "learning_rate": 2.6772410614304572e-05, - "loss": 0.2739, + "epoch": 2.5263992503694093, + "grad_norm": 0.20028775930404663, + "learning_rate": 2.5800598970442862e-05, + "loss": 0.4101, "step": 70100 }, { - "epoch": 2.47, - "learning_rate": 2.6769569049778175e-05, - "loss": 0.2818, + "epoch": 2.526579450030634, + "grad_norm": 0.2018347680568695, + "learning_rate": 2.5797682328050744e-05, + "loss": 0.3857, "step": 70105 }, { - "epoch": 2.47, - "learning_rate": 2.6766727462275098e-05, - "loss": 0.2558, + "epoch": 2.5267596496918587, + "grad_norm": 0.2127305567264557, + "learning_rate": 2.579476567479035e-05, + "loss": 0.3927, "step": 70110 }, { - "epoch": 2.47, - "learning_rate": 2.6763885851832228e-05, - "loss": 0.2704, + "epoch": 2.5269398493530835, + "grad_norm": 0.19696669280529022, + "learning_rate": 2.579184901070143e-05, + "loss": 0.4074, "step": 70115 }, { - "epoch": 2.47, - "learning_rate": 2.6761044218486457e-05, - "loss": 0.2763, + "epoch": 2.5271200490143078, + "grad_norm": 0.2948433458805084, + "learning_rate": 2.578893233582372e-05, + "loss": 0.3763, "step": 70120 }, { - "epoch": 2.47, - "learning_rate": 2.6758202562274693e-05, - "loss": 0.2631, + "epoch": 2.5273002486755325, + "grad_norm": 0.1763259321451187, + "learning_rate": 2.578601565019696e-05, + "loss": 0.3824, "step": 70125 }, { - "epoch": 2.47, - "learning_rate": 2.6755360883233826e-05, - "loss": 0.2597, + "epoch": 2.5274804483367572, + "grad_norm": 0.22030363976955414, + "learning_rate": 2.5783098953860883e-05, + "loss": 0.424, "step": 70130 }, { - "epoch": 2.47, - "learning_rate": 2.675251918140075e-05, - "loss": 0.2679, + "epoch": 2.5276606479979815, + "grad_norm": 0.2222214639186859, + "learning_rate": 2.5780182246855245e-05, + "loss": 0.421, "step": 70135 }, { - "epoch": 2.47, - "learning_rate": 2.6749677456812376e-05, - "loss": 0.2464, + "epoch": 2.5278408476592062, + "grad_norm": 0.2385231852531433, + "learning_rate": 2.5777265529219767e-05, + "loss": 0.3931, "step": 70140 }, { - "epoch": 2.47, - "learning_rate": 2.6746835709505586e-05, - "loss": 0.2559, + "epoch": 2.528021047320431, + "grad_norm": 0.1919146180152893, + "learning_rate": 2.577434880099421e-05, + "loss": 0.3618, "step": 70145 }, { - "epoch": 2.47, - "learning_rate": 2.674399393951729e-05, - "loss": 0.2623, + "epoch": 2.5282012469816557, + "grad_norm": 0.21808387339115143, + "learning_rate": 2.5771432062218286e-05, + "loss": 0.4243, "step": 70150 }, { - "epoch": 2.47, - "learning_rate": 2.6741152146884386e-05, - "loss": 0.2735, + "epoch": 2.5283814466428804, + "grad_norm": 0.1588166505098343, + "learning_rate": 2.576851531293176e-05, + "loss": 0.3825, "step": 70155 }, { - "epoch": 2.47, - "learning_rate": 2.6738310331643755e-05, - "loss": 0.2737, + "epoch": 2.528561646304105, + "grad_norm": 0.20755480229854584, + "learning_rate": 2.576559855317435e-05, + "loss": 0.4111, "step": 70160 }, { - "epoch": 2.47, - "learning_rate": 
2.6735468493832322e-05, - "loss": 0.2854, + "epoch": 2.5287418459653295, + "grad_norm": 0.22904981672763824, + "learning_rate": 2.5762681782985816e-05, + "loss": 0.3904, "step": 70165 }, { - "epoch": 2.47, - "learning_rate": 2.6732626633486973e-05, - "loss": 0.2754, + "epoch": 2.528922045626554, + "grad_norm": 0.18714340031147003, + "learning_rate": 2.575976500240589e-05, + "loss": 0.3692, "step": 70170 }, { - "epoch": 2.47, - "learning_rate": 2.6729784750644605e-05, - "loss": 0.2867, + "epoch": 2.529102245287779, + "grad_norm": 0.2257447987794876, + "learning_rate": 2.5756848211474306e-05, + "loss": 0.398, "step": 70175 }, { - "epoch": 2.47, - "learning_rate": 2.672694284534213e-05, - "loss": 0.2767, + "epoch": 2.5292824449490032, + "grad_norm": 0.21989066898822784, + "learning_rate": 2.575393141023082e-05, + "loss": 0.3885, "step": 70180 }, { - "epoch": 2.47, - "learning_rate": 2.672410091761643e-05, - "loss": 0.2667, + "epoch": 2.529462644610228, + "grad_norm": 0.21285291016101837, + "learning_rate": 2.5751014598715155e-05, + "loss": 0.3877, "step": 70185 }, { - "epoch": 2.47, - "learning_rate": 2.672125896750441e-05, - "loss": 0.2848, + "epoch": 2.5296428442714527, + "grad_norm": 0.20206022262573242, + "learning_rate": 2.5748097776967073e-05, + "loss": 0.4259, "step": 70190 }, { - "epoch": 2.47, - "learning_rate": 2.671841699504299e-05, - "loss": 0.2816, + "epoch": 2.5298230439326774, + "grad_norm": 0.18147850036621094, + "learning_rate": 2.5745180945026298e-05, + "loss": 0.3914, "step": 70195 }, { - "epoch": 2.47, - "learning_rate": 2.6715575000269055e-05, - "loss": 0.2844, + "epoch": 2.530003243593902, + "grad_norm": 0.18609163165092468, + "learning_rate": 2.574226410293258e-05, + "loss": 0.4179, "step": 70200 }, { - "epoch": 2.47, - "learning_rate": 2.6712732983219514e-05, - "loss": 0.2748, + "epoch": 2.530183443255127, + "grad_norm": 0.1647646427154541, + "learning_rate": 2.573934725072565e-05, + "loss": 0.3955, "step": 70205 }, { - "epoch": 2.47, - "learning_rate": 2.6709890943931252e-05, - "loss": 0.2603, + "epoch": 2.530363642916351, + "grad_norm": 0.21977296471595764, + "learning_rate": 2.573643038844526e-05, + "loss": 0.3912, "step": 70210 }, { - "epoch": 2.47, - "learning_rate": 2.670704888244119e-05, - "loss": 0.2777, + "epoch": 2.530543842577576, + "grad_norm": 0.18793781101703644, + "learning_rate": 2.5733513516131153e-05, + "loss": 0.3975, "step": 70215 }, { - "epoch": 2.47, - "learning_rate": 2.670420679878622e-05, - "loss": 0.2673, + "epoch": 2.5307240422388007, + "grad_norm": 0.18933290243148804, + "learning_rate": 2.573059663382306e-05, + "loss": 0.4017, "step": 70220 }, { - "epoch": 2.47, - "learning_rate": 2.6701364693003255e-05, - "loss": 0.2878, + "epoch": 2.5309042419000254, + "grad_norm": 0.17556174099445343, + "learning_rate": 2.5727679741560734e-05, + "loss": 0.4003, "step": 70225 }, { - "epoch": 2.47, - "learning_rate": 2.669852256512918e-05, - "loss": 0.2912, + "epoch": 2.5310844415612497, + "grad_norm": 0.18023037910461426, + "learning_rate": 2.5724762839383915e-05, + "loss": 0.4288, "step": 70230 }, { - "epoch": 2.47, - "learning_rate": 2.669568041520092e-05, - "loss": 0.2746, + "epoch": 2.5312646412224744, + "grad_norm": 0.1804884523153305, + "learning_rate": 2.5721845927332333e-05, + "loss": 0.3854, "step": 70235 }, { - "epoch": 2.47, - "learning_rate": 2.6692838243255364e-05, - "loss": 0.2432, + "epoch": 2.531444840883699, + "grad_norm": 0.26398801803588867, + "learning_rate": 2.5718929005445746e-05, + "loss": 0.4287, "step": 70240 }, { - "epoch": 2.47, 
- "learning_rate": 2.6689996049329413e-05, - "loss": 0.2665, + "epoch": 2.531625040544924, + "grad_norm": 0.2065039724111557, + "learning_rate": 2.5716012073763883e-05, + "loss": 0.397, "step": 70245 }, { - "epoch": 2.47, - "learning_rate": 2.6687153833459982e-05, - "loss": 0.2876, + "epoch": 2.5318052402061486, + "grad_norm": 0.1797478199005127, + "learning_rate": 2.5713095132326515e-05, + "loss": 0.3692, "step": 70250 }, { - "epoch": 2.47, - "learning_rate": 2.668431159568397e-05, - "loss": 0.3013, + "epoch": 2.531985439867373, + "grad_norm": 0.19305941462516785, + "learning_rate": 2.5710178181173344e-05, + "loss": 0.3386, "step": 70255 }, { - "epoch": 2.47, - "learning_rate": 2.668146933603828e-05, - "loss": 0.2851, + "epoch": 2.5321656395285976, + "grad_norm": 0.2119571417570114, + "learning_rate": 2.5707261220344143e-05, + "loss": 0.4244, "step": 70260 }, { - "epoch": 2.47, - "learning_rate": 2.6678627054559824e-05, - "loss": 0.2648, + "epoch": 2.5323458391898224, + "grad_norm": 0.21070972084999084, + "learning_rate": 2.5704344249878637e-05, + "loss": 0.3988, "step": 70265 }, { - "epoch": 2.47, - "learning_rate": 2.6675784751285494e-05, - "loss": 0.2773, + "epoch": 2.532526038851047, + "grad_norm": 0.2026742547750473, + "learning_rate": 2.570142726981658e-05, + "loss": 0.4084, "step": 70270 }, { - "epoch": 2.47, - "learning_rate": 2.6672942426252212e-05, - "loss": 0.2921, + "epoch": 2.5327062385122714, + "grad_norm": 0.16865421831607819, + "learning_rate": 2.569851028019771e-05, + "loss": 0.3662, "step": 70275 }, { - "epoch": 2.47, - "learning_rate": 2.667010007949687e-05, - "loss": 0.2451, + "epoch": 2.532886438173496, + "grad_norm": 0.23617038130760193, + "learning_rate": 2.5695593281061774e-05, + "loss": 0.3986, "step": 70280 }, { - "epoch": 2.47, - "learning_rate": 2.666725771105638e-05, - "loss": 0.2775, + "epoch": 2.533066637834721, + "grad_norm": 0.2046816498041153, + "learning_rate": 2.5692676272448517e-05, + "loss": 0.3803, "step": 70285 }, { - "epoch": 2.47, - "learning_rate": 2.6664415320967644e-05, - "loss": 0.2334, + "epoch": 2.5332468374959456, + "grad_norm": 0.18710558116436005, + "learning_rate": 2.5689759254397683e-05, + "loss": 0.3968, "step": 70290 }, { - "epoch": 2.47, - "learning_rate": 2.6661572909267574e-05, - "loss": 0.2681, + "epoch": 2.5334270371571703, + "grad_norm": 0.21419739723205566, + "learning_rate": 2.5686842226949008e-05, + "loss": 0.4224, "step": 70295 }, { - "epoch": 2.47, - "learning_rate": 2.6658730475993077e-05, - "loss": 0.2791, + "epoch": 2.533607236818395, + "grad_norm": 0.18646739423274994, + "learning_rate": 2.568392519014224e-05, + "loss": 0.4011, "step": 70300 }, { - "epoch": 2.47, - "learning_rate": 2.6655888021181052e-05, - "loss": 0.2672, + "epoch": 2.5337874364796193, + "grad_norm": 0.19873277842998505, + "learning_rate": 2.5681008144017128e-05, + "loss": 0.3635, "step": 70305 }, { - "epoch": 2.47, - "learning_rate": 2.665304554486842e-05, - "loss": 0.2755, + "epoch": 2.533967636140844, + "grad_norm": 0.18021991848945618, + "learning_rate": 2.5678091088613408e-05, + "loss": 0.3867, "step": 70310 }, { - "epoch": 2.47, - "learning_rate": 2.665020304709208e-05, - "loss": 0.2966, + "epoch": 2.534147835802069, + "grad_norm": 0.17604483664035797, + "learning_rate": 2.5675174023970826e-05, + "loss": 0.3857, "step": 70315 }, { - "epoch": 2.47, - "learning_rate": 2.6647360527888943e-05, - "loss": 0.2693, + "epoch": 2.534328035463293, + "grad_norm": 0.2240837961435318, + "learning_rate": 2.567225695012913e-05, + "loss": 0.3711, "step": 70320 }, 
{ - "epoch": 2.47, - "learning_rate": 2.6644517987295903e-05, - "loss": 0.2733, + "epoch": 2.534508235124518, + "grad_norm": 0.1983606368303299, + "learning_rate": 2.566933986712806e-05, + "loss": 0.3828, "step": 70325 }, { - "epoch": 2.47, - "learning_rate": 2.6641675425349894e-05, - "loss": 0.263, + "epoch": 2.5346884347857426, + "grad_norm": 0.22788797318935394, + "learning_rate": 2.566642277500738e-05, + "loss": 0.3859, "step": 70330 }, { - "epoch": 2.47, - "learning_rate": 2.6638832842087807e-05, - "loss": 0.2887, + "epoch": 2.5348686344469673, + "grad_norm": 0.2181435376405716, + "learning_rate": 2.5663505673806805e-05, + "loss": 0.4207, "step": 70335 }, { - "epoch": 2.47, - "learning_rate": 2.663599023754656e-05, - "loss": 0.2671, + "epoch": 2.535048834108192, + "grad_norm": 0.19292283058166504, + "learning_rate": 2.5660588563566097e-05, + "loss": 0.4102, "step": 70340 }, { - "epoch": 2.47, - "learning_rate": 2.6633147611763047e-05, - "loss": 0.2613, + "epoch": 2.5352290337694168, + "grad_norm": 0.18651723861694336, + "learning_rate": 2.5657671444324994e-05, + "loss": 0.4028, "step": 70345 }, { - "epoch": 2.48, - "learning_rate": 2.6630304964774195e-05, - "loss": 0.2824, + "epoch": 2.535409233430641, + "grad_norm": 0.20241175591945648, + "learning_rate": 2.5654754316123248e-05, + "loss": 0.4, "step": 70350 }, { - "epoch": 2.48, - "learning_rate": 2.6627462296616906e-05, - "loss": 0.2821, + "epoch": 2.535589433091866, + "grad_norm": 0.19530391693115234, + "learning_rate": 2.5651837179000605e-05, + "loss": 0.3573, "step": 70355 }, { - "epoch": 2.48, - "learning_rate": 2.6624619607328093e-05, - "loss": 0.2637, + "epoch": 2.5357696327530905, + "grad_norm": 0.1831565499305725, + "learning_rate": 2.5648920032996794e-05, + "loss": 0.3696, "step": 70360 }, { - "epoch": 2.48, - "learning_rate": 2.6621776896944666e-05, - "loss": 0.283, + "epoch": 2.535949832414315, + "grad_norm": 0.22338707745075226, + "learning_rate": 2.5646002878151586e-05, + "loss": 0.3642, "step": 70365 }, { - "epoch": 2.48, - "learning_rate": 2.661893416550354e-05, - "loss": 0.2689, + "epoch": 2.5361300320755396, + "grad_norm": 0.23138903081417084, + "learning_rate": 2.5643669147936305e-05, + "loss": 0.3546, "step": 70370 }, { - "epoch": 2.48, - "learning_rate": 2.661609141304161e-05, - "loss": 0.2667, + "epoch": 2.5363102317367643, + "grad_norm": 0.18024463951587677, + "learning_rate": 2.5640751977276717e-05, + "loss": 0.377, "step": 70375 }, { - "epoch": 2.48, - "learning_rate": 2.66132486395958e-05, - "loss": 0.2465, + "epoch": 2.536490431397989, + "grad_norm": 0.19997356832027435, + "learning_rate": 2.5637834797887005e-05, + "loss": 0.3983, "step": 70380 }, { - "epoch": 2.48, - "learning_rate": 2.661040584520302e-05, - "loss": 0.2643, + "epoch": 2.5366706310592138, + "grad_norm": 0.20950496196746826, + "learning_rate": 2.5634917609806917e-05, + "loss": 0.3991, "step": 70385 }, { - "epoch": 2.48, - "learning_rate": 2.660756302990019e-05, - "loss": 0.2812, + "epoch": 2.5368508307204385, + "grad_norm": 0.20799300074577332, + "learning_rate": 2.56320004130762e-05, + "loss": 0.3923, "step": 70390 }, { - "epoch": 2.48, - "learning_rate": 2.66047201937242e-05, - "loss": 0.2553, + "epoch": 2.5370310303816628, + "grad_norm": 0.23899616301059723, + "learning_rate": 2.5629083207734595e-05, + "loss": 0.4264, "step": 70395 }, { - "epoch": 2.48, - "learning_rate": 2.660187733671199e-05, - "loss": 0.2678, + "epoch": 2.5372112300428875, + "grad_norm": 0.18154381215572357, + "learning_rate": 2.5626165993821866e-05, + "loss": 0.3814, 
"step": 70400 }, { - "epoch": 2.48, - "learning_rate": 2.6599034458900452e-05, - "loss": 0.2585, + "epoch": 2.5373914297041122, + "grad_norm": 0.1529223769903183, + "learning_rate": 2.5623248771377733e-05, + "loss": 0.3888, "step": 70405 }, { - "epoch": 2.48, - "learning_rate": 2.65961915603265e-05, - "loss": 0.2741, + "epoch": 2.5375716293653365, + "grad_norm": 0.18600383400917053, + "learning_rate": 2.5620331540441956e-05, + "loss": 0.3829, "step": 70410 }, { - "epoch": 2.48, - "learning_rate": 2.6593348641027056e-05, - "loss": 0.2839, + "epoch": 2.5377518290265613, + "grad_norm": 0.17343242466449738, + "learning_rate": 2.5617414301054288e-05, + "loss": 0.3795, "step": 70415 }, { - "epoch": 2.48, - "learning_rate": 2.6590505701039032e-05, - "loss": 0.2588, + "epoch": 2.537932028687786, + "grad_norm": 0.33770492672920227, + "learning_rate": 2.5614497053254464e-05, + "loss": 0.3925, "step": 70420 }, { - "epoch": 2.48, - "learning_rate": 2.6587662740399338e-05, - "loss": 0.2524, + "epoch": 2.5381122283490107, + "grad_norm": 0.20434433221817017, + "learning_rate": 2.5611579797082252e-05, + "loss": 0.3664, "step": 70425 }, { - "epoch": 2.48, - "learning_rate": 2.6584819759144892e-05, - "loss": 0.2595, + "epoch": 2.5382924280102355, + "grad_norm": 0.2171066850423813, + "learning_rate": 2.5608662532577367e-05, + "loss": 0.403, "step": 70430 }, { - "epoch": 2.48, - "learning_rate": 2.65819767573126e-05, - "loss": 0.2641, + "epoch": 2.53847262767146, + "grad_norm": 0.21163609623908997, + "learning_rate": 2.5605745259779578e-05, + "loss": 0.4073, "step": 70435 }, { - "epoch": 2.48, - "learning_rate": 2.6579133734939383e-05, - "loss": 0.2606, + "epoch": 2.5386528273326845, + "grad_norm": 0.19679218530654907, + "learning_rate": 2.5602827978728626e-05, + "loss": 0.3859, "step": 70440 }, { - "epoch": 2.48, - "learning_rate": 2.6576290692062162e-05, - "loss": 0.2499, + "epoch": 2.5388330269939092, + "grad_norm": 0.19141094386577606, + "learning_rate": 2.5599910689464263e-05, + "loss": 0.3611, "step": 70445 }, { - "epoch": 2.48, - "learning_rate": 2.657344762871784e-05, - "loss": 0.2609, + "epoch": 2.539013226655134, + "grad_norm": 0.18265005946159363, + "learning_rate": 2.559699339202623e-05, + "loss": 0.4121, "step": 70450 }, { - "epoch": 2.48, - "learning_rate": 2.6570604544943346e-05, - "loss": 0.2638, + "epoch": 2.5391934263163582, + "grad_norm": 0.1961100548505783, + "learning_rate": 2.559407608645428e-05, + "loss": 0.387, "step": 70455 }, { - "epoch": 2.48, - "learning_rate": 2.656776144077558e-05, - "loss": 0.2473, + "epoch": 2.539373625977583, + "grad_norm": 0.1709757298231125, + "learning_rate": 2.559115877278816e-05, + "loss": 0.3952, "step": 70460 }, { - "epoch": 2.48, - "learning_rate": 2.6564918316251463e-05, - "loss": 0.2855, + "epoch": 2.5395538256388077, + "grad_norm": 0.193019837141037, + "learning_rate": 2.558824145106761e-05, + "loss": 0.4121, "step": 70465 }, { - "epoch": 2.48, - "learning_rate": 2.6562075171407914e-05, - "loss": 0.2692, + "epoch": 2.5397340253000324, + "grad_norm": 0.23720510303974152, + "learning_rate": 2.5585324121332394e-05, + "loss": 0.3731, "step": 70470 }, { - "epoch": 2.48, - "learning_rate": 2.6559232006281853e-05, - "loss": 0.2746, + "epoch": 2.539914224961257, + "grad_norm": 0.21066009998321533, + "learning_rate": 2.558240678362224e-05, + "loss": 0.4199, "step": 70475 }, { - "epoch": 2.48, - "learning_rate": 2.6556388820910183e-05, - "loss": 0.2747, + "epoch": 2.540094424622482, + "grad_norm": 0.23059865832328796, + "learning_rate": 2.557948943797691e-05, 
+ "loss": 0.3951, "step": 70480 }, { - "epoch": 2.48, - "learning_rate": 2.6553545615329838e-05, - "loss": 0.2568, + "epoch": 2.540274624283706, + "grad_norm": 0.19941085577011108, + "learning_rate": 2.5576572084436153e-05, + "loss": 0.4155, "step": 70485 }, { - "epoch": 2.48, - "learning_rate": 2.655070238957772e-05, - "loss": 0.2693, + "epoch": 2.540454823944931, + "grad_norm": 0.20750625431537628, + "learning_rate": 2.5573654723039704e-05, + "loss": 0.3913, "step": 70490 }, { - "epoch": 2.48, - "learning_rate": 2.6547859143690762e-05, - "loss": 0.265, + "epoch": 2.5406350236061557, + "grad_norm": 0.1879345029592514, + "learning_rate": 2.5570737353827323e-05, + "loss": 0.4201, "step": 70495 }, { - "epoch": 2.48, - "learning_rate": 2.6545015877705865e-05, - "loss": 0.2696, + "epoch": 2.5408152232673804, + "grad_norm": 0.217643141746521, + "learning_rate": 2.5567819976838752e-05, + "loss": 0.3992, "step": 70500 }, { - "epoch": 2.48, - "eval_loss": 0.2644537389278412, - "eval_runtime": 10.5368, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 2.5408152232673804, + "eval_loss": 0.4338361620903015, + "eval_runtime": 3.5352, + "eval_samples_per_second": 28.287, + "eval_steps_per_second": 7.072, "step": 70500 }, { - "epoch": 2.48, - "learning_rate": 2.6542172591659962e-05, - "loss": 0.2577, + "epoch": 2.5409954229286047, + "grad_norm": 0.24064627289772034, + "learning_rate": 2.556490259211376e-05, + "loss": 0.3785, "step": 70505 }, { - "epoch": 2.48, - "learning_rate": 2.6539329285589963e-05, - "loss": 0.2593, + "epoch": 2.5411756225898294, + "grad_norm": 0.18428386747837067, + "learning_rate": 2.5561985199692062e-05, + "loss": 0.4018, "step": 70510 }, { - "epoch": 2.48, - "learning_rate": 2.653648595953278e-05, - "loss": 0.2707, + "epoch": 2.541355822251054, + "grad_norm": 0.20441985130310059, + "learning_rate": 2.5559067799613434e-05, + "loss": 0.3607, "step": 70515 }, { - "epoch": 2.48, - "learning_rate": 2.6533642613525345e-05, - "loss": 0.2658, + "epoch": 2.541536021912279, + "grad_norm": 0.20763367414474487, + "learning_rate": 2.555615039191761e-05, + "loss": 0.3537, "step": 70520 }, { - "epoch": 2.48, - "learning_rate": 2.6530799247604577e-05, - "loss": 0.2745, + "epoch": 2.5417162215735036, + "grad_norm": 0.2506287693977356, + "learning_rate": 2.5553232976644352e-05, + "loss": 0.382, "step": 70525 }, { - "epoch": 2.48, - "learning_rate": 2.6527955861807385e-05, - "loss": 0.2485, + "epoch": 2.541896421234728, + "grad_norm": 0.2257665991783142, + "learning_rate": 2.55503155538334e-05, + "loss": 0.4206, "step": 70530 }, { - "epoch": 2.48, - "learning_rate": 2.6525112456170692e-05, - "loss": 0.2742, + "epoch": 2.5420766208959527, + "grad_norm": 0.2143712192773819, + "learning_rate": 2.5547398123524495e-05, + "loss": 0.3677, "step": 70535 }, { - "epoch": 2.48, - "learning_rate": 2.6522269030731416e-05, - "loss": 0.2785, + "epoch": 2.5422568205571774, + "grad_norm": 0.21773605048656464, + "learning_rate": 2.554448068575741e-05, + "loss": 0.4063, "step": 70540 }, { - "epoch": 2.48, - "learning_rate": 2.651942558552648e-05, - "loss": 0.2837, + "epoch": 2.542437020218402, + "grad_norm": 0.19206026196479797, + "learning_rate": 2.5541563240571877e-05, + "loss": 0.4034, "step": 70545 }, { - "epoch": 2.48, - "learning_rate": 2.651658212059281e-05, - "loss": 0.2881, + "epoch": 2.5426172198796264, + "grad_norm": 0.19149447977542877, + "learning_rate": 2.553864578800765e-05, + "loss": 0.377, "step": 70550 }, { - "epoch": 2.48, - "learning_rate": 2.6513738635967316e-05, - 
"loss": 0.2577, + "epoch": 2.542797419540851, + "grad_norm": 0.18062478303909302, + "learning_rate": 2.553572832810447e-05, + "loss": 0.3932, "step": 70555 }, { - "epoch": 2.48, - "learning_rate": 2.6510895131686932e-05, - "loss": 0.2778, + "epoch": 2.542977619202076, + "grad_norm": 0.1797667294740677, + "learning_rate": 2.55328108609021e-05, + "loss": 0.3771, "step": 70560 }, { - "epoch": 2.48, - "learning_rate": 2.650805160778856e-05, - "loss": 0.2704, + "epoch": 2.5431578188633006, + "grad_norm": 0.25208544731140137, + "learning_rate": 2.5529893386440295e-05, + "loss": 0.4192, "step": 70565 }, { - "epoch": 2.48, - "learning_rate": 2.6505208064309134e-05, - "loss": 0.2713, + "epoch": 2.5433380185245253, + "grad_norm": 0.2537004351615906, + "learning_rate": 2.552697590475878e-05, + "loss": 0.3734, "step": 70570 }, { - "epoch": 2.48, - "learning_rate": 2.650236450128557e-05, - "loss": 0.2803, + "epoch": 2.54351821818575, + "grad_norm": 0.27715036273002625, + "learning_rate": 2.5524058415897328e-05, + "loss": 0.3997, "step": 70575 }, { - "epoch": 2.48, - "learning_rate": 2.6499520918754805e-05, - "loss": 0.2946, + "epoch": 2.5436984178469744, + "grad_norm": 0.19086778163909912, + "learning_rate": 2.552114091989568e-05, + "loss": 0.3765, "step": 70580 }, { - "epoch": 2.48, - "learning_rate": 2.6496677316753738e-05, - "loss": 0.2721, + "epoch": 2.543878617508199, + "grad_norm": 0.22903601825237274, + "learning_rate": 2.5518223416793592e-05, + "loss": 0.4433, "step": 70585 }, { - "epoch": 2.48, - "learning_rate": 2.6493833695319308e-05, - "loss": 0.2775, + "epoch": 2.544058817169424, + "grad_norm": 0.21384993195533752, + "learning_rate": 2.5515305906630805e-05, + "loss": 0.4386, "step": 70590 }, { - "epoch": 2.48, - "learning_rate": 2.649099005448843e-05, - "loss": 0.2534, + "epoch": 2.544239016830648, + "grad_norm": 0.2099718600511551, + "learning_rate": 2.5512388389447074e-05, + "loss": 0.4044, "step": 70595 }, { - "epoch": 2.48, - "learning_rate": 2.648814639429803e-05, - "loss": 0.2831, + "epoch": 2.544419216491873, + "grad_norm": 0.20232221484184265, + "learning_rate": 2.5509470865282155e-05, + "loss": 0.4041, "step": 70600 }, { - "epoch": 2.48, - "learning_rate": 2.6485302714785027e-05, - "loss": 0.2819, + "epoch": 2.5445994161530976, + "grad_norm": 0.19650457799434662, + "learning_rate": 2.550655333417579e-05, + "loss": 0.3656, "step": 70605 }, { - "epoch": 2.48, - "learning_rate": 2.6482459015986348e-05, - "loss": 0.256, + "epoch": 2.5447796158143223, + "grad_norm": 0.25365570187568665, + "learning_rate": 2.550363579616774e-05, + "loss": 0.3818, "step": 70610 }, { - "epoch": 2.48, - "learning_rate": 2.6479615297938914e-05, - "loss": 0.2702, + "epoch": 2.544959815475547, + "grad_norm": 0.19008086621761322, + "learning_rate": 2.5500718251297746e-05, + "loss": 0.4, "step": 70615 }, { - "epoch": 2.48, - "learning_rate": 2.6476771560679657e-05, - "loss": 0.2729, + "epoch": 2.545140015136772, + "grad_norm": 0.17244011163711548, + "learning_rate": 2.5497800699605563e-05, + "loss": 0.4169, "step": 70620 }, { - "epoch": 2.48, - "learning_rate": 2.647392780424549e-05, - "loss": 0.2365, + "epoch": 2.545320214797996, + "grad_norm": 0.21005220711231232, + "learning_rate": 2.5494883141130938e-05, + "loss": 0.3926, "step": 70625 }, { - "epoch": 2.48, - "learning_rate": 2.6471084028673344e-05, - "loss": 0.2691, + "epoch": 2.545500414459221, + "grad_norm": 0.15076076984405518, + "learning_rate": 2.5491965575913635e-05, + "loss": 0.3612, "step": 70630 }, { - "epoch": 2.49, - "learning_rate": 
2.6468240234000134e-05, - "loss": 0.2795, + "epoch": 2.5456806141204456, + "grad_norm": 0.2208922952413559, + "learning_rate": 2.548904800399339e-05, + "loss": 0.4058, "step": 70635 }, { - "epoch": 2.49, - "learning_rate": 2.64653964202628e-05, - "loss": 0.2887, + "epoch": 2.54586081378167, + "grad_norm": 0.2186022400856018, + "learning_rate": 2.548613042540996e-05, + "loss": 0.4301, "step": 70640 }, { - "epoch": 2.49, - "learning_rate": 2.6462552587498256e-05, - "loss": 0.2906, + "epoch": 2.5460410134428946, + "grad_norm": 0.2868672311306, + "learning_rate": 2.5483212840203097e-05, + "loss": 0.3947, "step": 70645 }, { - "epoch": 2.49, - "learning_rate": 2.645970873574343e-05, - "loss": 0.2476, + "epoch": 2.5462212131041193, + "grad_norm": 0.22895467281341553, + "learning_rate": 2.5480295248412555e-05, + "loss": 0.4076, "step": 70650 }, { - "epoch": 2.49, - "learning_rate": 2.6456864865035246e-05, - "loss": 0.2553, + "epoch": 2.546401412765344, + "grad_norm": 0.23754490911960602, + "learning_rate": 2.5477377650078087e-05, + "loss": 0.3852, "step": 70655 }, { - "epoch": 2.49, - "learning_rate": 2.6454020975410636e-05, - "loss": 0.2771, + "epoch": 2.5465816124265688, + "grad_norm": 0.17783097922801971, + "learning_rate": 2.5474460045239435e-05, + "loss": 0.408, "step": 70660 }, { - "epoch": 2.49, - "learning_rate": 2.6451177066906525e-05, - "loss": 0.2717, + "epoch": 2.5467618120877935, + "grad_norm": 0.21512363851070404, + "learning_rate": 2.5471542433936358e-05, + "loss": 0.4015, "step": 70665 }, { - "epoch": 2.49, - "learning_rate": 2.6448333139559834e-05, - "loss": 0.2877, + "epoch": 2.546942011749018, + "grad_norm": 0.2190975397825241, + "learning_rate": 2.546862481620861e-05, + "loss": 0.3845, "step": 70670 }, { - "epoch": 2.49, - "learning_rate": 2.644548919340749e-05, - "loss": 0.2463, + "epoch": 2.5471222114102425, + "grad_norm": 0.19571487605571747, + "learning_rate": 2.5465707192095927e-05, + "loss": 0.4478, "step": 70675 }, { - "epoch": 2.49, - "learning_rate": 2.644264522848642e-05, - "loss": 0.2666, + "epoch": 2.5473024110714673, + "grad_norm": 0.2442859262228012, + "learning_rate": 2.546278956163809e-05, + "loss": 0.3756, "step": 70680 }, { - "epoch": 2.49, - "learning_rate": 2.6439801244833552e-05, - "loss": 0.2872, + "epoch": 2.5474826107326916, + "grad_norm": 0.20132243633270264, + "learning_rate": 2.5459871924874822e-05, + "loss": 0.3883, "step": 70685 }, { - "epoch": 2.49, - "learning_rate": 2.6436957242485817e-05, - "loss": 0.255, + "epoch": 2.5476628103939163, + "grad_norm": 0.22069872915744781, + "learning_rate": 2.54569542818459e-05, + "loss": 0.4054, "step": 70690 }, { - "epoch": 2.49, - "learning_rate": 2.643411322148014e-05, - "loss": 0.2743, + "epoch": 2.547843010055141, + "grad_norm": 0.1703534722328186, + "learning_rate": 2.5454036632591055e-05, + "loss": 0.4059, "step": 70695 }, { - "epoch": 2.49, - "learning_rate": 2.643126918185344e-05, - "loss": 0.2633, + "epoch": 2.5480232097163658, + "grad_norm": 0.1875164955854416, + "learning_rate": 2.5451118977150053e-05, + "loss": 0.3853, "step": 70700 }, { - "epoch": 2.49, - "learning_rate": 2.6428425123642664e-05, - "loss": 0.2505, + "epoch": 2.5482034093775905, + "grad_norm": 0.22562925517559052, + "learning_rate": 2.544820131556264e-05, + "loss": 0.4122, "step": 70705 }, { - "epoch": 2.49, - "learning_rate": 2.6425581046884718e-05, - "loss": 0.2791, + "epoch": 2.5483836090388152, + "grad_norm": 0.1860308051109314, + "learning_rate": 2.5445283647868574e-05, + "loss": 0.3849, "step": 70710 }, { - "epoch": 2.49, - 
"learning_rate": 2.642273695161655e-05, - "loss": 0.2804, + "epoch": 2.5485638087000395, + "grad_norm": 0.21737992763519287, + "learning_rate": 2.54423659741076e-05, + "loss": 0.4249, "step": 70715 }, { - "epoch": 2.49, - "learning_rate": 2.6419892837875077e-05, - "loss": 0.2931, + "epoch": 2.5487440083612642, + "grad_norm": 0.2317013442516327, + "learning_rate": 2.543944829431948e-05, + "loss": 0.3782, "step": 70720 }, { - "epoch": 2.49, - "learning_rate": 2.6417048705697232e-05, - "loss": 0.276, + "epoch": 2.548924208022489, + "grad_norm": 0.183481365442276, + "learning_rate": 2.543653060854396e-05, + "loss": 0.4066, "step": 70725 }, { - "epoch": 2.49, - "learning_rate": 2.641420455511994e-05, - "loss": 0.2728, + "epoch": 2.5491044076837137, + "grad_norm": 0.2388121783733368, + "learning_rate": 2.5433612916820798e-05, + "loss": 0.3918, "step": 70730 }, { - "epoch": 2.49, - "learning_rate": 2.641136038618014e-05, - "loss": 0.2851, + "epoch": 2.549284607344938, + "grad_norm": 0.22414498031139374, + "learning_rate": 2.5430695219189738e-05, + "loss": 0.4132, "step": 70735 }, { - "epoch": 2.49, - "learning_rate": 2.640851619891475e-05, - "loss": 0.2897, + "epoch": 2.5494648070061627, + "grad_norm": 0.18682889640331268, + "learning_rate": 2.5427777515690543e-05, + "loss": 0.4106, "step": 70740 }, { - "epoch": 2.49, - "learning_rate": 2.6405671993360716e-05, - "loss": 0.2527, + "epoch": 2.5496450066673875, + "grad_norm": 0.18605414032936096, + "learning_rate": 2.5424859806362954e-05, + "loss": 0.3922, "step": 70745 }, { - "epoch": 2.49, - "learning_rate": 2.6402827769554943e-05, - "loss": 0.2513, + "epoch": 2.549825206328612, + "grad_norm": 0.19950343668460846, + "learning_rate": 2.542194209124675e-05, + "loss": 0.3529, "step": 70750 }, { - "epoch": 2.49, - "learning_rate": 2.6399983527534388e-05, - "loss": 0.2765, + "epoch": 2.550005405989837, + "grad_norm": 0.22607798874378204, + "learning_rate": 2.541902437038165e-05, + "loss": 0.3833, "step": 70755 }, { - "epoch": 2.49, - "learning_rate": 2.6397139267335963e-05, - "loss": 0.2954, + "epoch": 2.5501856056510612, + "grad_norm": 0.24269573390483856, + "learning_rate": 2.5416106643807434e-05, + "loss": 0.4602, "step": 70760 }, { - "epoch": 2.49, - "learning_rate": 2.6394294988996604e-05, - "loss": 0.2749, + "epoch": 2.550365805312286, + "grad_norm": 0.22042149305343628, + "learning_rate": 2.541318891156384e-05, + "loss": 0.385, "step": 70765 }, { - "epoch": 2.49, - "learning_rate": 2.6391450692553243e-05, - "loss": 0.2695, + "epoch": 2.5505460049735107, + "grad_norm": 0.21216388046741486, + "learning_rate": 2.541027117369063e-05, + "loss": 0.4038, "step": 70770 }, { - "epoch": 2.49, - "learning_rate": 2.6388606378042812e-05, - "loss": 0.2888, + "epoch": 2.5507262046347354, + "grad_norm": 0.21207298338413239, + "learning_rate": 2.5407353430227554e-05, + "loss": 0.4195, "step": 70775 }, { - "epoch": 2.49, - "learning_rate": 2.638576204550225e-05, - "loss": 0.2716, + "epoch": 2.5509064042959597, + "grad_norm": 0.219013512134552, + "learning_rate": 2.540443568121436e-05, + "loss": 0.4065, "step": 70780 }, { - "epoch": 2.49, - "learning_rate": 2.6382917694968474e-05, - "loss": 0.2771, + "epoch": 2.5510866039571845, + "grad_norm": 0.23736967146396637, + "learning_rate": 2.5401517926690816e-05, + "loss": 0.3827, "step": 70785 }, { - "epoch": 2.49, - "learning_rate": 2.638007332647843e-05, - "loss": 0.2758, + "epoch": 2.551266803618409, + "grad_norm": 0.20479167997837067, + "learning_rate": 2.5398600166696668e-05, + "loss": 0.4108, "step": 70790 }, { - 
"epoch": 2.49, - "learning_rate": 2.6377228940069032e-05, - "loss": 0.2839, + "epoch": 2.551447003279634, + "grad_norm": 0.24934591352939606, + "learning_rate": 2.539568240127167e-05, + "loss": 0.3904, "step": 70795 }, { - "epoch": 2.49, - "learning_rate": 2.6374384535777235e-05, - "loss": 0.2812, + "epoch": 2.5516272029408587, + "grad_norm": 0.20699314773082733, + "learning_rate": 2.5392764630455574e-05, + "loss": 0.4098, "step": 70800 }, { - "epoch": 2.49, - "learning_rate": 2.6371540113639952e-05, - "loss": 0.2791, + "epoch": 2.5518074026020834, + "grad_norm": 0.22364868223667145, + "learning_rate": 2.5389846854288135e-05, + "loss": 0.4066, "step": 70805 }, { - "epoch": 2.49, - "learning_rate": 2.6368695673694133e-05, - "loss": 0.2697, + "epoch": 2.5519876022633077, + "grad_norm": 0.23530325293540955, + "learning_rate": 2.5386929072809107e-05, + "loss": 0.4167, "step": 70810 }, { - "epoch": 2.49, - "learning_rate": 2.6365851215976695e-05, - "loss": 0.3063, + "epoch": 2.5521678019245324, + "grad_norm": 0.2264833003282547, + "learning_rate": 2.5384011286058245e-05, + "loss": 0.3679, "step": 70815 }, { - "epoch": 2.49, - "learning_rate": 2.6363006740524583e-05, - "loss": 0.2718, + "epoch": 2.552348001585757, + "grad_norm": 0.20546197891235352, + "learning_rate": 2.5381093494075308e-05, + "loss": 0.3812, "step": 70820 }, { - "epoch": 2.49, - "learning_rate": 2.636016224737472e-05, - "loss": 0.2651, + "epoch": 2.5525282012469814, + "grad_norm": 0.1657361537218094, + "learning_rate": 2.5378175696900042e-05, + "loss": 0.3598, "step": 70825 }, { - "epoch": 2.49, - "learning_rate": 2.6357317736564058e-05, - "loss": 0.2578, + "epoch": 2.552708400908206, + "grad_norm": 0.24184440076351166, + "learning_rate": 2.5375257894572207e-05, + "loss": 0.3985, "step": 70830 }, { - "epoch": 2.49, - "learning_rate": 2.6354473208129516e-05, - "loss": 0.2959, + "epoch": 2.552888600569431, + "grad_norm": 0.22822557389736176, + "learning_rate": 2.537234008713156e-05, + "loss": 0.3811, "step": 70835 }, { - "epoch": 2.49, - "learning_rate": 2.635162866210803e-05, - "loss": 0.2696, + "epoch": 2.5530688002306556, + "grad_norm": 0.2141774296760559, + "learning_rate": 2.5369422274617844e-05, + "loss": 0.4142, "step": 70840 }, { - "epoch": 2.49, - "learning_rate": 2.634878409853654e-05, - "loss": 0.2828, + "epoch": 2.5532489998918804, + "grad_norm": 0.16231632232666016, + "learning_rate": 2.5366504457070826e-05, + "loss": 0.3693, "step": 70845 }, { - "epoch": 2.49, - "learning_rate": 2.6345939517451968e-05, - "loss": 0.2892, + "epoch": 2.553429199553105, + "grad_norm": 0.1983339488506317, + "learning_rate": 2.536358663453025e-05, + "loss": 0.4081, "step": 70850 }, { - "epoch": 2.49, - "learning_rate": 2.6343094918891264e-05, - "loss": 0.268, + "epoch": 2.5536093992143294, + "grad_norm": 0.21507583558559418, + "learning_rate": 2.5360668807035887e-05, + "loss": 0.3717, "step": 70855 }, { - "epoch": 2.49, - "learning_rate": 2.634025030289136e-05, - "loss": 0.28, + "epoch": 2.553789598875554, + "grad_norm": 0.2187241017818451, + "learning_rate": 2.535775097462747e-05, + "loss": 0.3672, "step": 70860 }, { - "epoch": 2.49, - "learning_rate": 2.6337405669489183e-05, - "loss": 0.301, + "epoch": 2.553969798536779, + "grad_norm": 0.20706336200237274, + "learning_rate": 2.5354833137344773e-05, + "loss": 0.407, "step": 70865 }, { - "epoch": 2.49, - "learning_rate": 2.633456101872168e-05, - "loss": 0.2413, + "epoch": 2.554149998198003, + "grad_norm": 0.21873357892036438, + "learning_rate": 2.5351915295227535e-05, + "loss": 0.4204, 
"step": 70870 }, { - "epoch": 2.49, - "learning_rate": 2.633171635062578e-05, - "loss": 0.2937, + "epoch": 2.554330197859228, + "grad_norm": 0.1820431351661682, + "learning_rate": 2.534899744831553e-05, + "loss": 0.3873, "step": 70875 }, { - "epoch": 2.49, - "learning_rate": 2.6328871665238425e-05, - "loss": 0.2646, + "epoch": 2.5545103975204526, + "grad_norm": 0.17599505186080933, + "learning_rate": 2.5346079596648497e-05, + "loss": 0.3794, "step": 70880 }, { - "epoch": 2.49, - "learning_rate": 2.632602696259654e-05, - "loss": 0.2688, + "epoch": 2.5546905971816773, + "grad_norm": 0.2262696623802185, + "learning_rate": 2.534316174026619e-05, + "loss": 0.4126, "step": 70885 }, { - "epoch": 2.49, - "learning_rate": 2.632318224273707e-05, - "loss": 0.2705, + "epoch": 2.554870796842902, + "grad_norm": 0.23906999826431274, + "learning_rate": 2.534024387920838e-05, + "loss": 0.3999, "step": 70890 }, { - "epoch": 2.49, - "learning_rate": 2.6320337505696956e-05, - "loss": 0.2516, + "epoch": 2.555050996504127, + "grad_norm": 0.16976365447044373, + "learning_rate": 2.5337326013514817e-05, + "loss": 0.3864, "step": 70895 }, { - "epoch": 2.49, - "learning_rate": 2.6317492751513128e-05, - "loss": 0.2815, + "epoch": 2.555231196165351, + "grad_norm": 0.2087818831205368, + "learning_rate": 2.5334408143225246e-05, + "loss": 0.4226, "step": 70900 }, { - "epoch": 2.49, - "learning_rate": 2.6314647980222524e-05, - "loss": 0.2841, + "epoch": 2.555411395826576, + "grad_norm": 0.21390338242053986, + "learning_rate": 2.533149026837942e-05, + "loss": 0.3674, "step": 70905 }, { - "epoch": 2.49, - "learning_rate": 2.631180319186208e-05, - "loss": 0.2748, + "epoch": 2.5555915954878006, + "grad_norm": 0.22753164172172546, + "learning_rate": 2.532857238901712e-05, + "loss": 0.4407, "step": 70910 }, { - "epoch": 2.49, - "learning_rate": 2.630895838646874e-05, - "loss": 0.2425, + "epoch": 2.555771795149025, + "grad_norm": 0.179393008351326, + "learning_rate": 2.5325654505178074e-05, + "loss": 0.4156, "step": 70915 }, { - "epoch": 2.5, - "learning_rate": 2.630611356407943e-05, - "loss": 0.2694, + "epoch": 2.5559519948102496, + "grad_norm": 0.22798627614974976, + "learning_rate": 2.53233201949103e-05, + "loss": 0.4018, "step": 70920 }, { - "epoch": 2.5, - "learning_rate": 2.6303268724731112e-05, - "loss": 0.2638, + "epoch": 2.5561321944714743, + "grad_norm": 0.203562393784523, + "learning_rate": 2.5320402303113327e-05, + "loss": 0.4037, "step": 70925 }, { - "epoch": 2.5, - "learning_rate": 2.6300423868460688e-05, - "loss": 0.2543, + "epoch": 2.556312394132699, + "grad_norm": 0.22589832544326782, + "learning_rate": 2.531748440695092e-05, + "loss": 0.3967, "step": 70930 }, { - "epoch": 2.5, - "learning_rate": 2.6297578995305133e-05, - "loss": 0.2726, + "epoch": 2.556492593793924, + "grad_norm": 0.23674431443214417, + "learning_rate": 2.5314566506462856e-05, + "loss": 0.3992, "step": 70935 }, { - "epoch": 2.5, - "learning_rate": 2.629473410530136e-05, - "loss": 0.2665, + "epoch": 2.5566727934551485, + "grad_norm": 0.20412121713161469, + "learning_rate": 2.5311648601688876e-05, + "loss": 0.4023, "step": 70940 }, { - "epoch": 2.5, - "learning_rate": 2.629188919848633e-05, - "loss": 0.2788, + "epoch": 2.556852993116373, + "grad_norm": 0.17974796891212463, + "learning_rate": 2.5308730692668754e-05, + "loss": 0.3787, "step": 70945 }, { - "epoch": 2.5, - "learning_rate": 2.6289044274896963e-05, - "loss": 0.2554, + "epoch": 2.5570331927775976, + "grad_norm": 0.16718092560768127, + "learning_rate": 2.5305812779442232e-05, + "loss": 
0.407, "step": 70950 }, { - "epoch": 2.5, - "learning_rate": 2.62861993345702e-05, - "loss": 0.2959, + "epoch": 2.5572133924388223, + "grad_norm": 0.16526854038238525, + "learning_rate": 2.530289486204907e-05, + "loss": 0.3996, "step": 70955 }, { - "epoch": 2.5, - "learning_rate": 2.628335437754299e-05, - "loss": 0.2694, + "epoch": 2.5573935921000466, + "grad_norm": 0.2542569935321808, + "learning_rate": 2.529997694052903e-05, + "loss": 0.3735, "step": 70960 }, { - "epoch": 2.5, - "learning_rate": 2.6280509403852277e-05, - "loss": 0.2644, + "epoch": 2.5575737917612713, + "grad_norm": 0.21731750667095184, + "learning_rate": 2.5297059014921857e-05, + "loss": 0.381, "step": 70965 }, { - "epoch": 2.5, - "learning_rate": 2.627766441353498e-05, - "loss": 0.2655, + "epoch": 2.557753991422496, + "grad_norm": 0.18733660876750946, + "learning_rate": 2.5294141085267315e-05, + "loss": 0.3858, "step": 70970 }, { - "epoch": 2.5, - "learning_rate": 2.6274819406628066e-05, - "loss": 0.2675, + "epoch": 2.5579341910837208, + "grad_norm": 0.23402203619480133, + "learning_rate": 2.5291223151605147e-05, + "loss": 0.4118, "step": 70975 }, { - "epoch": 2.5, - "learning_rate": 2.6271974383168453e-05, - "loss": 0.2728, + "epoch": 2.5581143907449455, + "grad_norm": 0.19820962846279144, + "learning_rate": 2.5288305213975132e-05, + "loss": 0.3861, "step": 70980 }, { - "epoch": 2.5, - "learning_rate": 2.6269129343193093e-05, - "loss": 0.246, + "epoch": 2.5582945904061702, + "grad_norm": 0.2706112861633301, + "learning_rate": 2.5285387272417015e-05, + "loss": 0.4072, "step": 70985 }, { - "epoch": 2.5, - "learning_rate": 2.6266284286738924e-05, - "loss": 0.2662, + "epoch": 2.5584747900673945, + "grad_norm": 0.21151851117610931, + "learning_rate": 2.5282469326970543e-05, + "loss": 0.3774, "step": 70990 }, { - "epoch": 2.5, - "learning_rate": 2.6263439213842893e-05, - "loss": 0.2847, + "epoch": 2.5586549897286193, + "grad_norm": 0.1977061629295349, + "learning_rate": 2.5279551377675488e-05, + "loss": 0.4204, "step": 70995 }, { - "epoch": 2.5, - "learning_rate": 2.6260594124541932e-05, - "loss": 0.2974, + "epoch": 2.558835189389844, + "grad_norm": 0.19445201754570007, + "learning_rate": 2.5276633424571594e-05, + "loss": 0.3868, "step": 71000 }, { - "epoch": 2.5, - "eval_loss": 0.26469552516937256, - "eval_runtime": 10.5298, - "eval_samples_per_second": 9.497, - "eval_steps_per_second": 9.497, + "epoch": 2.558835189389844, + "eval_loss": 0.43283751606941223, + "eval_runtime": 3.5328, + "eval_samples_per_second": 28.306, + "eval_steps_per_second": 7.077, "step": 71000 }, { - "epoch": 2.5, - "learning_rate": 2.6257749018872983e-05, - "loss": 0.2703, + "epoch": 2.5590153890510687, + "grad_norm": 0.17734093964099884, + "learning_rate": 2.5273715467698633e-05, + "loss": 0.3773, "step": 71005 }, { - "epoch": 2.5, - "learning_rate": 2.6254903896872996e-05, - "loss": 0.2498, + "epoch": 2.559195588712293, + "grad_norm": 0.23937411606311798, + "learning_rate": 2.527079750709634e-05, + "loss": 0.4013, "step": 71010 }, { - "epoch": 2.5, - "learning_rate": 2.6252058758578907e-05, - "loss": 0.2616, + "epoch": 2.5593757883735178, + "grad_norm": 0.1706862598657608, + "learning_rate": 2.5267879542804484e-05, + "loss": 0.3689, "step": 71015 }, { - "epoch": 2.5, - "learning_rate": 2.6249213604027667e-05, - "loss": 0.2757, + "epoch": 2.5595559880347425, + "grad_norm": 0.20813287794589996, + "learning_rate": 2.526496157486283e-05, + "loss": 0.3894, "step": 71020 }, { - "epoch": 2.5, - "learning_rate": 2.6246368433256206e-05, - "loss": 0.2803, + 
"epoch": 2.5597361876959672, + "grad_norm": 0.24396944046020508, + "learning_rate": 2.526204360331112e-05, + "loss": 0.3836, "step": 71025 }, { - "epoch": 2.5, - "learning_rate": 2.6243523246301477e-05, - "loss": 0.2634, + "epoch": 2.559916387357192, + "grad_norm": 0.18609154224395752, + "learning_rate": 2.525912562818912e-05, + "loss": 0.3683, "step": 71030 }, { - "epoch": 2.5, - "learning_rate": 2.6240678043200416e-05, - "loss": 0.2533, + "epoch": 2.5600965870184162, + "grad_norm": 0.24493709206581116, + "learning_rate": 2.525620764953658e-05, + "loss": 0.3732, "step": 71035 }, { - "epoch": 2.5, - "learning_rate": 2.6237832823989967e-05, - "loss": 0.2823, + "epoch": 2.560276786679641, + "grad_norm": 0.19621089100837708, + "learning_rate": 2.525328966739326e-05, + "loss": 0.3917, "step": 71040 }, { - "epoch": 2.5, - "learning_rate": 2.623498758870707e-05, - "loss": 0.2766, + "epoch": 2.5604569863408657, + "grad_norm": 0.20630283653736115, + "learning_rate": 2.525037168179892e-05, + "loss": 0.4063, "step": 71045 }, { - "epoch": 2.5, - "learning_rate": 2.6232142337388684e-05, - "loss": 0.3033, + "epoch": 2.5606371860020904, + "grad_norm": 0.2241489142179489, + "learning_rate": 2.524745369279331e-05, + "loss": 0.4282, "step": 71050 }, { - "epoch": 2.5, - "learning_rate": 2.6229297070071734e-05, - "loss": 0.2837, + "epoch": 2.5608173856633147, + "grad_norm": 0.20686471462249756, + "learning_rate": 2.52445357004162e-05, + "loss": 0.4066, "step": 71055 }, { - "epoch": 2.5, - "learning_rate": 2.6226451786793177e-05, - "loss": 0.2547, + "epoch": 2.5609975853245395, + "grad_norm": 0.2341819703578949, + "learning_rate": 2.5241617704707328e-05, + "loss": 0.4269, "step": 71060 }, { - "epoch": 2.5, - "learning_rate": 2.6223606487589946e-05, - "loss": 0.2853, + "epoch": 2.561177784985764, + "grad_norm": 0.17863284051418304, + "learning_rate": 2.523869970570647e-05, + "loss": 0.4245, "step": 71065 }, { - "epoch": 2.5, - "learning_rate": 2.6220761172498993e-05, - "loss": 0.2503, + "epoch": 2.561357984646989, + "grad_norm": 0.31622689962387085, + "learning_rate": 2.523578170345337e-05, + "loss": 0.437, "step": 71070 }, { - "epoch": 2.5, - "learning_rate": 2.6217915841557255e-05, - "loss": 0.2546, + "epoch": 2.5615381843082137, + "grad_norm": 0.24306558072566986, + "learning_rate": 2.5232863697987796e-05, + "loss": 0.4115, "step": 71075 }, { - "epoch": 2.5, - "learning_rate": 2.6215070494801693e-05, - "loss": 0.2663, + "epoch": 2.5617183839694384, + "grad_norm": 0.23051370680332184, + "learning_rate": 2.5229945689349487e-05, + "loss": 0.4229, "step": 71080 }, { - "epoch": 2.5, - "learning_rate": 2.6212225132269234e-05, - "loss": 0.2875, + "epoch": 2.5618985836306627, + "grad_norm": 0.22686095535755157, + "learning_rate": 2.5227027677578224e-05, + "loss": 0.3949, "step": 71085 }, { - "epoch": 2.5, - "learning_rate": 2.6209379753996833e-05, - "loss": 0.2916, + "epoch": 2.5620787832918874, + "grad_norm": 0.24122454226016998, + "learning_rate": 2.5224109662713752e-05, + "loss": 0.382, "step": 71090 }, { - "epoch": 2.5, - "learning_rate": 2.6206534360021427e-05, - "loss": 0.286, + "epoch": 2.562258982953112, + "grad_norm": 0.19796620309352875, + "learning_rate": 2.5221191644795822e-05, + "loss": 0.3849, "step": 71095 }, { - "epoch": 2.5, - "learning_rate": 2.6203688950379978e-05, - "loss": 0.2548, + "epoch": 2.5624391826143365, + "grad_norm": 0.20853497087955475, + "learning_rate": 2.5218273623864202e-05, + "loss": 0.3628, "step": 71100 }, { - "epoch": 2.5, - "learning_rate": 2.6200843525109418e-05, - "loss": 
0.2594, + "epoch": 2.562619382275561, + "grad_norm": 0.24205127358436584, + "learning_rate": 2.5215355599958647e-05, + "loss": 0.4302, "step": 71105 }, { - "epoch": 2.5, - "learning_rate": 2.61979980842467e-05, - "loss": 0.2408, + "epoch": 2.562799581936786, + "grad_norm": 0.18552717566490173, + "learning_rate": 2.521243757311892e-05, + "loss": 0.3786, "step": 71110 }, { - "epoch": 2.5, - "learning_rate": 2.619515262782876e-05, - "loss": 0.2577, + "epoch": 2.5629797815980107, + "grad_norm": 0.15225747227668762, + "learning_rate": 2.5209519543384763e-05, + "loss": 0.3853, "step": 71115 }, { - "epoch": 2.5, - "learning_rate": 2.6192307155892547e-05, - "loss": 0.2733, + "epoch": 2.5631599812592354, + "grad_norm": 0.2108543962240219, + "learning_rate": 2.5206601510795948e-05, + "loss": 0.4087, "step": 71120 }, { - "epoch": 2.5, - "learning_rate": 2.6189461668475018e-05, - "loss": 0.2338, + "epoch": 2.56334018092046, + "grad_norm": 0.26248225569725037, + "learning_rate": 2.520368347539222e-05, + "loss": 0.3905, "step": 71125 }, { - "epoch": 2.5, - "learning_rate": 2.618661616561311e-05, - "loss": 0.2518, + "epoch": 2.5635203805816844, + "grad_norm": 0.18176917731761932, + "learning_rate": 2.5200765437213347e-05, + "loss": 0.3922, "step": 71130 }, { - "epoch": 2.5, - "learning_rate": 2.618377064734378e-05, - "loss": 0.2524, + "epoch": 2.563700580242909, + "grad_norm": 0.1985873579978943, + "learning_rate": 2.519784739629909e-05, + "loss": 0.3832, "step": 71135 }, { - "epoch": 2.5, - "learning_rate": 2.618092511370396e-05, - "loss": 0.244, + "epoch": 2.563880779904134, + "grad_norm": 0.217198446393013, + "learning_rate": 2.519492935268919e-05, + "loss": 0.4287, "step": 71140 }, { - "epoch": 2.5, - "learning_rate": 2.6178079564730613e-05, - "loss": 0.2627, + "epoch": 2.564060979565358, + "grad_norm": 0.18571309745311737, + "learning_rate": 2.5192011306423424e-05, + "loss": 0.3855, "step": 71145 }, { - "epoch": 2.5, - "learning_rate": 2.617523400046067e-05, - "loss": 0.2476, + "epoch": 2.564241179226583, + "grad_norm": 0.22361566126346588, + "learning_rate": 2.518909325754154e-05, + "loss": 0.419, "step": 71150 }, { - "epoch": 2.5, - "learning_rate": 2.6172388420931093e-05, - "loss": 0.2441, + "epoch": 2.5644213788878076, + "grad_norm": 0.16923829913139343, + "learning_rate": 2.51861752060833e-05, + "loss": 0.3755, "step": 71155 }, { - "epoch": 2.5, - "learning_rate": 2.6169542826178822e-05, - "loss": 0.2498, + "epoch": 2.5646015785490324, + "grad_norm": 0.19118502736091614, + "learning_rate": 2.518325715208845e-05, + "loss": 0.3829, "step": 71160 }, { - "epoch": 2.5, - "learning_rate": 2.6166697216240816e-05, - "loss": 0.2706, + "epoch": 2.564781778210257, + "grad_norm": 0.2314773052930832, + "learning_rate": 2.5180339095596755e-05, + "loss": 0.3592, "step": 71165 }, { - "epoch": 2.5, - "learning_rate": 2.616385159115401e-05, - "loss": 0.275, + "epoch": 2.564961977871482, + "grad_norm": 0.2128879874944687, + "learning_rate": 2.517742103664799e-05, + "loss": 0.3965, "step": 71170 }, { - "epoch": 2.5, - "learning_rate": 2.6161005950955358e-05, - "loss": 0.2578, + "epoch": 2.565142177532706, + "grad_norm": 0.20926570892333984, + "learning_rate": 2.5174502975281887e-05, + "loss": 0.3829, "step": 71175 }, { - "epoch": 2.5, - "learning_rate": 2.6158160295681804e-05, - "loss": 0.2837, + "epoch": 2.565322377193931, + "grad_norm": 0.2084009349346161, + "learning_rate": 2.517158491153822e-05, + "loss": 0.3937, "step": 71180 }, { - "epoch": 2.5, - "learning_rate": 2.6155314625370315e-05, - "loss": 0.2616, + 
"epoch": 2.5655025768551556, + "grad_norm": 0.1534343957901001, + "learning_rate": 2.5168666845456733e-05, + "loss": 0.3473, "step": 71185 }, { - "epoch": 2.5, - "learning_rate": 2.615246894005782e-05, - "loss": 0.2713, + "epoch": 2.56568277651638, + "grad_norm": 0.16337619721889496, + "learning_rate": 2.5165748777077197e-05, + "loss": 0.3902, "step": 71190 }, { - "epoch": 2.5, - "learning_rate": 2.6149623239781272e-05, - "loss": 0.2844, + "epoch": 2.5658629761776046, + "grad_norm": 0.23396508395671844, + "learning_rate": 2.516283070643937e-05, + "loss": 0.4145, "step": 71195 }, { - "epoch": 2.51, - "learning_rate": 2.6146777524577627e-05, - "loss": 0.283, + "epoch": 2.5660431758388293, + "grad_norm": 0.22363629937171936, + "learning_rate": 2.5159912633582998e-05, + "loss": 0.3752, "step": 71200 }, { - "epoch": 2.51, - "learning_rate": 2.6143931794483827e-05, - "loss": 0.2695, + "epoch": 2.566223375500054, + "grad_norm": 0.22132565081119537, + "learning_rate": 2.5156994558547857e-05, + "loss": 0.3862, "step": 71205 }, { - "epoch": 2.51, - "learning_rate": 2.6141086049536833e-05, - "loss": 0.2553, + "epoch": 2.566403575161279, + "grad_norm": 0.16280272603034973, + "learning_rate": 2.5154076481373694e-05, + "loss": 0.3692, "step": 71210 }, { - "epoch": 2.51, - "learning_rate": 2.6138240289773592e-05, - "loss": 0.2613, + "epoch": 2.5665837748225035, + "grad_norm": 0.17420467734336853, + "learning_rate": 2.5151158402100268e-05, + "loss": 0.3694, "step": 71215 }, { - "epoch": 2.51, - "learning_rate": 2.6135394515231037e-05, - "loss": 0.2718, + "epoch": 2.566763974483728, + "grad_norm": 0.22650916874408722, + "learning_rate": 2.514824032076733e-05, + "loss": 0.4037, "step": 71220 }, { - "epoch": 2.51, - "learning_rate": 2.6132548725946147e-05, - "loss": 0.2702, + "epoch": 2.5669441741449526, + "grad_norm": 0.22971905767917633, + "learning_rate": 2.514532223741466e-05, + "loss": 0.4182, "step": 71225 }, { - "epoch": 2.51, - "learning_rate": 2.6129702921955844e-05, - "loss": 0.2659, + "epoch": 2.5671243738061773, + "grad_norm": 0.19210931658744812, + "learning_rate": 2.5142404152081993e-05, + "loss": 0.3954, "step": 71230 }, { - "epoch": 2.51, - "learning_rate": 2.612685710329711e-05, - "loss": 0.2463, + "epoch": 2.567304573467402, + "grad_norm": 0.22867514193058014, + "learning_rate": 2.5139486064809097e-05, + "loss": 0.3992, "step": 71235 }, { - "epoch": 2.51, - "learning_rate": 2.6124011270006866e-05, - "loss": 0.2611, + "epoch": 2.5674847731286263, + "grad_norm": 0.19859769940376282, + "learning_rate": 2.5136567975635733e-05, + "loss": 0.4205, "step": 71240 }, { - "epoch": 2.51, - "learning_rate": 2.612116542212208e-05, - "loss": 0.2812, + "epoch": 2.567664972789851, + "grad_norm": 0.20891262590885162, + "learning_rate": 2.513364988460165e-05, + "loss": 0.4269, "step": 71245 }, { - "epoch": 2.51, - "learning_rate": 2.61183195596797e-05, - "loss": 0.2586, + "epoch": 2.567845172451076, + "grad_norm": 0.20854581892490387, + "learning_rate": 2.5130731791746627e-05, + "loss": 0.3829, "step": 71250 }, { - "epoch": 2.51, - "learning_rate": 2.611547368271668e-05, - "loss": 0.2627, + "epoch": 2.5680253721123005, + "grad_norm": 0.23713940382003784, + "learning_rate": 2.5127813697110398e-05, + "loss": 0.3889, "step": 71255 }, { - "epoch": 2.51, - "learning_rate": 2.611262779126996e-05, - "loss": 0.2599, + "epoch": 2.5682055717735253, + "grad_norm": 0.17710144817829132, + "learning_rate": 2.512489560073274e-05, + "loss": 0.3795, "step": 71260 }, { - "epoch": 2.51, - "learning_rate": 2.610978188537651e-05, 
- "loss": 0.2799, + "epoch": 2.5683857714347496, + "grad_norm": 0.16682571172714233, + "learning_rate": 2.5121977502653395e-05, + "loss": 0.3858, "step": 71265 }, { - "epoch": 2.51, - "learning_rate": 2.6106935965073275e-05, - "loss": 0.261, + "epoch": 2.5685659710959743, + "grad_norm": 0.2379252016544342, + "learning_rate": 2.511905940291214e-05, + "loss": 0.4112, "step": 71270 }, { - "epoch": 2.51, - "learning_rate": 2.6104090030397206e-05, - "loss": 0.2442, + "epoch": 2.568746170757199, + "grad_norm": 0.19735756516456604, + "learning_rate": 2.5116141301548714e-05, + "loss": 0.3893, "step": 71275 }, { - "epoch": 2.51, - "learning_rate": 2.6101244081385257e-05, - "loss": 0.2656, + "epoch": 2.5689263704184238, + "grad_norm": 0.2579612135887146, + "learning_rate": 2.5113223198602885e-05, + "loss": 0.3624, "step": 71280 }, { - "epoch": 2.51, - "learning_rate": 2.609839811807437e-05, - "loss": 0.263, + "epoch": 2.569106570079648, + "grad_norm": 0.2095087617635727, + "learning_rate": 2.5110305094114416e-05, + "loss": 0.377, "step": 71285 }, { - "epoch": 2.51, - "learning_rate": 2.6095552140501512e-05, - "loss": 0.251, + "epoch": 2.5692867697408728, + "grad_norm": 0.19634920358657837, + "learning_rate": 2.510738698812306e-05, + "loss": 0.3583, "step": 71290 }, { - "epoch": 2.51, - "learning_rate": 2.609270614870363e-05, - "loss": 0.2839, + "epoch": 2.5694669694020975, + "grad_norm": 0.2694518268108368, + "learning_rate": 2.5104468880668587e-05, + "loss": 0.4229, "step": 71295 }, { - "epoch": 2.51, - "learning_rate": 2.6089860142717682e-05, - "loss": 0.2422, + "epoch": 2.5696471690633222, + "grad_norm": 0.1848263442516327, + "learning_rate": 2.5101550771790733e-05, + "loss": 0.4172, "step": 71300 }, { - "epoch": 2.51, - "learning_rate": 2.6087014122580617e-05, - "loss": 0.2632, + "epoch": 2.569827368724547, + "grad_norm": 0.16099122166633606, + "learning_rate": 2.509863266152927e-05, + "loss": 0.3887, "step": 71305 }, { - "epoch": 2.51, - "learning_rate": 2.6084168088329384e-05, - "loss": 0.293, + "epoch": 2.5700075683857713, + "grad_norm": 0.2326761782169342, + "learning_rate": 2.509571454992396e-05, + "loss": 0.4158, "step": 71310 }, { - "epoch": 2.51, - "learning_rate": 2.608132204000095e-05, - "loss": 0.2771, + "epoch": 2.570187768046996, + "grad_norm": 0.23651954531669617, + "learning_rate": 2.509279643701456e-05, + "loss": 0.3646, "step": 71315 }, { - "epoch": 2.51, - "learning_rate": 2.6078475977632268e-05, - "loss": 0.2842, + "epoch": 2.5703679677082207, + "grad_norm": 0.21195195615291595, + "learning_rate": 2.5089878322840826e-05, + "loss": 0.3927, "step": 71320 }, { - "epoch": 2.51, - "learning_rate": 2.6075629901260272e-05, - "loss": 0.2775, + "epoch": 2.5705481673694455, + "grad_norm": 0.23943771421909332, + "learning_rate": 2.5086960207442512e-05, + "loss": 0.3641, "step": 71325 }, { - "epoch": 2.51, - "learning_rate": 2.6072783810921946e-05, - "loss": 0.2772, + "epoch": 2.5707283670306698, + "grad_norm": 0.2217801809310913, + "learning_rate": 2.5084042090859382e-05, + "loss": 0.3776, "step": 71330 }, { - "epoch": 2.51, - "learning_rate": 2.6069937706654218e-05, - "loss": 0.2903, + "epoch": 2.5709085666918945, + "grad_norm": 0.17442776262760162, + "learning_rate": 2.5081123973131205e-05, + "loss": 0.3989, "step": 71335 }, { - "epoch": 2.51, - "learning_rate": 2.606709158849405e-05, - "loss": 0.274, + "epoch": 2.5710887663531192, + "grad_norm": 0.21039848029613495, + "learning_rate": 2.5078205854297715e-05, + "loss": 0.4116, "step": 71340 }, { - "epoch": 2.51, - "learning_rate": 
2.606424545647841e-05, - "loss": 0.2756, + "epoch": 2.571268966014344, + "grad_norm": 0.1870678812265396, + "learning_rate": 2.5075287734398695e-05, + "loss": 0.3692, "step": 71345 }, { - "epoch": 2.51, - "learning_rate": 2.6061399310644246e-05, - "loss": 0.2881, + "epoch": 2.5714491656755687, + "grad_norm": 0.22971993684768677, + "learning_rate": 2.507236961347389e-05, + "loss": 0.4334, "step": 71350 }, { - "epoch": 2.51, - "learning_rate": 2.605855315102851e-05, - "loss": 0.2835, + "epoch": 2.5716293653367934, + "grad_norm": 0.1942625790834427, + "learning_rate": 2.5069451491563073e-05, + "loss": 0.3755, "step": 71355 }, { - "epoch": 2.51, - "learning_rate": 2.6055706977668147e-05, - "loss": 0.2657, + "epoch": 2.5718095649980177, + "grad_norm": 0.20081962645053864, + "learning_rate": 2.506653336870598e-05, + "loss": 0.381, "step": 71360 }, { - "epoch": 2.51, - "learning_rate": 2.6052860790600143e-05, - "loss": 0.2821, + "epoch": 2.5719897646592425, + "grad_norm": 0.23155467212200165, + "learning_rate": 2.506361524494239e-05, + "loss": 0.3836, "step": 71365 }, { - "epoch": 2.51, - "learning_rate": 2.6050014589861416e-05, - "loss": 0.2426, + "epoch": 2.572169964320467, + "grad_norm": 0.21292930841445923, + "learning_rate": 2.506069712031205e-05, + "loss": 0.3763, "step": 71370 }, { - "epoch": 2.51, - "learning_rate": 2.6047168375488956e-05, - "loss": 0.2741, + "epoch": 2.5723501639816915, + "grad_norm": 0.1886298656463623, + "learning_rate": 2.5057778994854724e-05, + "loss": 0.3779, "step": 71375 }, { - "epoch": 2.51, - "learning_rate": 2.60443221475197e-05, - "loss": 0.2739, + "epoch": 2.572530363642916, + "grad_norm": 0.211809903383255, + "learning_rate": 2.5054860868610173e-05, + "loss": 0.3735, "step": 71380 }, { - "epoch": 2.51, - "learning_rate": 2.6041475905990614e-05, - "loss": 0.2688, + "epoch": 2.572710563304141, + "grad_norm": 0.18282023072242737, + "learning_rate": 2.505194274161815e-05, + "loss": 0.3909, "step": 71385 }, { - "epoch": 2.51, - "learning_rate": 2.6038629650938644e-05, - "loss": 0.2555, + "epoch": 2.5728907629653657, + "grad_norm": 0.22584885358810425, + "learning_rate": 2.504902461391842e-05, + "loss": 0.4002, "step": 71390 }, { - "epoch": 2.51, - "learning_rate": 2.603578338240075e-05, - "loss": 0.2414, + "epoch": 2.5730709626265904, + "grad_norm": 0.18001751601696014, + "learning_rate": 2.504610648555074e-05, + "loss": 0.4059, "step": 71395 }, { - "epoch": 2.51, - "learning_rate": 2.6032937100413896e-05, - "loss": 0.2605, + "epoch": 2.573251162287815, + "grad_norm": 0.22520719468593597, + "learning_rate": 2.504318835655487e-05, + "loss": 0.4089, "step": 71400 }, { - "epoch": 2.51, - "learning_rate": 2.603009080501504e-05, - "loss": 0.2634, + "epoch": 2.5734313619490394, + "grad_norm": 0.17423519492149353, + "learning_rate": 2.5040270226970558e-05, + "loss": 0.3849, "step": 71405 }, { - "epoch": 2.51, - "learning_rate": 2.602724449624112e-05, - "loss": 0.2889, + "epoch": 2.573611561610264, + "grad_norm": 0.2317894846200943, + "learning_rate": 2.503735209683758e-05, + "loss": 0.3856, "step": 71410 }, { - "epoch": 2.51, - "learning_rate": 2.6024398174129122e-05, - "loss": 0.2832, + "epoch": 2.573791761271489, + "grad_norm": 0.2146165519952774, + "learning_rate": 2.5034433966195686e-05, + "loss": 0.3974, "step": 71415 }, { - "epoch": 2.51, - "learning_rate": 2.602155183871598e-05, - "loss": 0.2571, + "epoch": 2.573971960932713, + "grad_norm": 0.23812200129032135, + "learning_rate": 2.5031515835084636e-05, + "loss": 0.4132, "step": 71420 }, { - "epoch": 2.51, - 
"learning_rate": 2.601870549003866e-05, - "loss": 0.2695, + "epoch": 2.574152160593938, + "grad_norm": 0.229581817984581, + "learning_rate": 2.502859770354419e-05, + "loss": 0.3816, "step": 71425 }, { - "epoch": 2.51, - "learning_rate": 2.6015859128134124e-05, - "loss": 0.28, + "epoch": 2.5743323602551627, + "grad_norm": 0.2108045518398285, + "learning_rate": 2.5025679571614095e-05, + "loss": 0.3487, "step": 71430 }, { - "epoch": 2.51, - "learning_rate": 2.601301275303933e-05, - "loss": 0.2626, + "epoch": 2.5745125599163874, + "grad_norm": 0.21576263010501862, + "learning_rate": 2.502276143933413e-05, + "loss": 0.3742, "step": 71435 }, { - "epoch": 2.51, - "learning_rate": 2.6010166364791223e-05, - "loss": 0.2621, + "epoch": 2.574692759577612, + "grad_norm": 0.21896515786647797, + "learning_rate": 2.5019843306744045e-05, + "loss": 0.386, "step": 71440 }, { - "epoch": 2.51, - "learning_rate": 2.600731996342678e-05, - "loss": 0.2734, + "epoch": 2.574872959238837, + "grad_norm": 0.25227516889572144, + "learning_rate": 2.50169251738836e-05, + "loss": 0.3987, "step": 71445 }, { - "epoch": 2.51, - "learning_rate": 2.6004473548982945e-05, - "loss": 0.2753, + "epoch": 2.575053158900061, + "grad_norm": 0.24805094301700592, + "learning_rate": 2.5014007040792548e-05, + "loss": 0.3862, "step": 71450 }, { - "epoch": 2.51, - "learning_rate": 2.6001627121496687e-05, - "loss": 0.2534, + "epoch": 2.575233358561286, + "grad_norm": 0.1973733752965927, + "learning_rate": 2.5011088907510648e-05, + "loss": 0.3989, "step": 71455 }, { - "epoch": 2.51, - "learning_rate": 2.599878068100496e-05, - "loss": 0.289, + "epoch": 2.5754135582225106, + "grad_norm": 0.20170655846595764, + "learning_rate": 2.500817077407768e-05, + "loss": 0.3698, "step": 71460 }, { - "epoch": 2.51, - "learning_rate": 2.5995934227544728e-05, - "loss": 0.2631, + "epoch": 2.575593757883735, + "grad_norm": 0.22908920049667358, + "learning_rate": 2.5005252640533374e-05, + "loss": 0.3889, "step": 71465 }, { - "epoch": 2.51, - "learning_rate": 2.599308776115294e-05, - "loss": 0.2656, + "epoch": 2.5757739575449596, + "grad_norm": 0.17910641431808472, + "learning_rate": 2.500233450691751e-05, + "loss": 0.361, "step": 71470 }, { - "epoch": 2.51, - "learning_rate": 2.599024128186656e-05, - "loss": 0.263, + "epoch": 2.5759541572061844, + "grad_norm": 0.22365345060825348, + "learning_rate": 2.4999416373269836e-05, + "loss": 0.4144, "step": 71475 }, { - "epoch": 2.51, - "learning_rate": 2.5987394789722554e-05, - "loss": 0.2728, + "epoch": 2.576134356867409, + "grad_norm": 0.20202086865901947, + "learning_rate": 2.4996498239630105e-05, + "loss": 0.3748, "step": 71480 }, { - "epoch": 2.52, - "learning_rate": 2.5984548284757877e-05, - "loss": 0.2724, + "epoch": 2.576314556528634, + "grad_norm": 0.17802977561950684, + "learning_rate": 2.4993580106038096e-05, + "loss": 0.4047, "step": 71485 }, { - "epoch": 2.52, - "learning_rate": 2.598170176700949e-05, - "loss": 0.2847, + "epoch": 2.5764947561898586, + "grad_norm": 0.22806064784526825, + "learning_rate": 2.499066197253355e-05, + "loss": 0.388, "step": 71490 }, { - "epoch": 2.52, - "learning_rate": 2.597885523651435e-05, - "loss": 0.2776, + "epoch": 2.576674955851083, + "grad_norm": 0.1772589087486267, + "learning_rate": 2.4987743839156234e-05, + "loss": 0.3769, "step": 71495 }, { - "epoch": 2.52, - "learning_rate": 2.597600869330943e-05, - "loss": 0.2942, + "epoch": 2.5768551555123076, + "grad_norm": 0.2450006902217865, + "learning_rate": 2.4984825705945912e-05, + "loss": 0.3806, "step": 71500 }, { - "epoch": 
2.52, - "eval_loss": 0.2644185721874237, - "eval_runtime": 10.558, - "eval_samples_per_second": 9.471, - "eval_steps_per_second": 9.471, + "epoch": 2.5768551555123076, + "eval_loss": 0.4331854581832886, + "eval_runtime": 3.5274, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 7.087, "step": 71500 }, { - "epoch": 2.52, - "learning_rate": 2.5973162137431668e-05, - "loss": 0.2832, + "epoch": 2.5770353551735323, + "grad_norm": 0.1859433799982071, + "learning_rate": 2.4981907572942326e-05, + "loss": 0.3643, "step": 71505 }, { - "epoch": 2.52, - "learning_rate": 2.597031556891804e-05, - "loss": 0.2688, + "epoch": 2.577215554834757, + "grad_norm": 0.21410761773586273, + "learning_rate": 2.4978989440185254e-05, + "loss": 0.3997, "step": 71510 }, { - "epoch": 2.52, - "learning_rate": 2.5967468987805504e-05, - "loss": 0.2701, + "epoch": 2.5773957544959814, + "grad_norm": 0.19530677795410156, + "learning_rate": 2.4976071307714446e-05, + "loss": 0.403, "step": 71515 }, { - "epoch": 2.52, - "learning_rate": 2.5964622394131023e-05, - "loss": 0.2681, + "epoch": 2.577575954157206, + "grad_norm": 0.19589672982692719, + "learning_rate": 2.4973153175569657e-05, + "loss": 0.3772, "step": 71520 }, { - "epoch": 2.52, - "learning_rate": 2.5961775787931553e-05, - "loss": 0.2971, + "epoch": 2.577756153818431, + "grad_norm": 0.16859115660190582, + "learning_rate": 2.4970235043790657e-05, + "loss": 0.3551, "step": 71525 }, { - "epoch": 2.52, - "learning_rate": 2.5958929169244063e-05, - "loss": 0.312, + "epoch": 2.5779363534796556, + "grad_norm": 0.2540879547595978, + "learning_rate": 2.4967316912417204e-05, + "loss": 0.3946, "step": 71530 }, { - "epoch": 2.52, - "learning_rate": 2.5956082538105504e-05, - "loss": 0.2934, + "epoch": 2.5781165531408803, + "grad_norm": 0.1967131346464157, + "learning_rate": 2.4964398781489035e-05, + "loss": 0.4273, "step": 71535 }, { - "epoch": 2.52, - "learning_rate": 2.5953235894552847e-05, - "loss": 0.271, + "epoch": 2.5782967528021046, + "grad_norm": 0.2016594558954239, + "learning_rate": 2.496148065104594e-05, + "loss": 0.4031, "step": 71540 }, { - "epoch": 2.52, - "learning_rate": 2.5950389238623047e-05, - "loss": 0.2588, + "epoch": 2.5784769524633293, + "grad_norm": 0.22788941860198975, + "learning_rate": 2.495856252112765e-05, + "loss": 0.3956, "step": 71545 }, { - "epoch": 2.52, - "learning_rate": 2.5947542570353077e-05, - "loss": 0.2516, + "epoch": 2.578657152124554, + "grad_norm": 0.18338678777217865, + "learning_rate": 2.4955644391773954e-05, + "loss": 0.3795, "step": 71550 }, { - "epoch": 2.52, - "learning_rate": 2.594469588977988e-05, - "loss": 0.2648, + "epoch": 2.5788373517857788, + "grad_norm": 0.16157367825508118, + "learning_rate": 2.4952726263024588e-05, + "loss": 0.3509, "step": 71555 }, { - "epoch": 2.52, - "learning_rate": 2.594184919694044e-05, - "loss": 0.2894, + "epoch": 2.579017551447003, + "grad_norm": 0.1809634417295456, + "learning_rate": 2.4949808134919312e-05, + "loss": 0.3658, "step": 71560 }, { - "epoch": 2.52, - "learning_rate": 2.59390024918717e-05, - "loss": 0.2683, + "epoch": 2.579197751108228, + "grad_norm": 0.2137366533279419, + "learning_rate": 2.4946890007497898e-05, + "loss": 0.3747, "step": 71565 }, { - "epoch": 2.52, - "learning_rate": 2.593615577461064e-05, - "loss": 0.2838, + "epoch": 2.5793779507694525, + "grad_norm": 0.1947045475244522, + "learning_rate": 2.4943971880800093e-05, + "loss": 0.4341, "step": 71570 }, { - "epoch": 2.52, - "learning_rate": 2.5933309045194204e-05, - "loss": 0.2564, + "epoch": 2.5795581504306773, 
+ "grad_norm": 0.2671882212162018, + "learning_rate": 2.4941053754865658e-05, + "loss": 0.4122, "step": 71575 }, { - "epoch": 2.52, - "learning_rate": 2.5930462303659375e-05, - "loss": 0.2731, + "epoch": 2.579738350091902, + "grad_norm": 0.17521758377552032, + "learning_rate": 2.4938135629734356e-05, + "loss": 0.3967, "step": 71580 }, { - "epoch": 2.52, - "learning_rate": 2.59276155500431e-05, - "loss": 0.2804, + "epoch": 2.5799185497531267, + "grad_norm": 0.1912911832332611, + "learning_rate": 2.4935217505445947e-05, + "loss": 0.384, "step": 71585 }, { - "epoch": 2.52, - "learning_rate": 2.592476878438234e-05, - "loss": 0.267, + "epoch": 2.580098749414351, + "grad_norm": 0.19323128461837769, + "learning_rate": 2.4932299382040183e-05, + "loss": 0.4119, "step": 71590 }, { - "epoch": 2.52, - "learning_rate": 2.5921922006714085e-05, - "loss": 0.2696, + "epoch": 2.5802789490755758, + "grad_norm": 0.22955751419067383, + "learning_rate": 2.4929381259556835e-05, + "loss": 0.4337, "step": 71595 }, { - "epoch": 2.52, - "learning_rate": 2.5919075217075268e-05, - "loss": 0.2778, + "epoch": 2.5804591487368005, + "grad_norm": 0.21088866889476776, + "learning_rate": 2.492646313803564e-05, + "loss": 0.3519, "step": 71600 }, { - "epoch": 2.52, - "learning_rate": 2.5916228415502875e-05, - "loss": 0.2754, + "epoch": 2.580639348398025, + "grad_norm": 0.23416496813297272, + "learning_rate": 2.492354501751638e-05, + "loss": 0.4205, "step": 71605 }, { - "epoch": 2.52, - "learning_rate": 2.5913381602033847e-05, - "loss": 0.2525, + "epoch": 2.5808195480592495, + "grad_norm": 0.22170627117156982, + "learning_rate": 2.4920626898038806e-05, + "loss": 0.4031, "step": 71610 }, { - "epoch": 2.52, - "learning_rate": 2.591053477670517e-05, - "loss": 0.2565, + "epoch": 2.5809997477204742, + "grad_norm": 0.18647152185440063, + "learning_rate": 2.491770877964267e-05, + "loss": 0.3669, "step": 71615 }, { - "epoch": 2.52, - "learning_rate": 2.5907687939553794e-05, - "loss": 0.2581, + "epoch": 2.581179947381699, + "grad_norm": 0.24669143557548523, + "learning_rate": 2.4914790662367737e-05, + "loss": 0.4038, "step": 71620 }, { - "epoch": 2.52, - "learning_rate": 2.5904841090616695e-05, - "loss": 0.2581, + "epoch": 2.5813601470429237, + "grad_norm": 0.2305600941181183, + "learning_rate": 2.491187254625376e-05, + "loss": 0.3989, "step": 71625 }, { - "epoch": 2.52, - "learning_rate": 2.5901994229930824e-05, - "loss": 0.2522, + "epoch": 2.5815403467041484, + "grad_norm": 0.17700450122356415, + "learning_rate": 2.4908954431340513e-05, + "loss": 0.3687, "step": 71630 }, { - "epoch": 2.52, - "learning_rate": 2.589914735753316e-05, - "loss": 0.2773, + "epoch": 2.5817205463653727, + "grad_norm": 0.20750737190246582, + "learning_rate": 2.4906036317667744e-05, + "loss": 0.3824, "step": 71635 }, { - "epoch": 2.52, - "learning_rate": 2.5896300473460655e-05, - "loss": 0.2666, + "epoch": 2.5819007460265975, + "grad_norm": 0.2014245092868805, + "learning_rate": 2.49031182052752e-05, + "loss": 0.3933, "step": 71640 }, { - "epoch": 2.52, - "learning_rate": 2.5893453577750277e-05, - "loss": 0.2703, + "epoch": 2.582080945687822, + "grad_norm": 0.21627309918403625, + "learning_rate": 2.4900200094202663e-05, + "loss": 0.4101, "step": 71645 }, { - "epoch": 2.52, - "learning_rate": 2.5890606670438994e-05, - "loss": 0.2686, + "epoch": 2.5822611453490465, + "grad_norm": 0.24448803067207336, + "learning_rate": 2.4897281984489868e-05, + "loss": 0.3986, "step": 71650 }, { - "epoch": 2.52, - "learning_rate": 2.5887759751563777e-05, - "loss": 0.2582, + 
"epoch": 2.5824413450102712, + "grad_norm": 0.20513613522052765, + "learning_rate": 2.4894363876176602e-05, + "loss": 0.3929, "step": 71655 }, { - "epoch": 2.52, - "learning_rate": 2.588491282116158e-05, - "loss": 0.2581, + "epoch": 2.582621544671496, + "grad_norm": 0.16580136120319366, + "learning_rate": 2.48914457693026e-05, + "loss": 0.3893, "step": 71660 }, { - "epoch": 2.52, - "learning_rate": 2.5882065879269373e-05, - "loss": 0.259, + "epoch": 2.5828017443327207, + "grad_norm": 0.17390620708465576, + "learning_rate": 2.4888527663907627e-05, + "loss": 0.3854, "step": 71665 }, { - "epoch": 2.52, - "learning_rate": 2.5879218925924116e-05, - "loss": 0.2782, + "epoch": 2.5829819439939454, + "grad_norm": 0.21692463755607605, + "learning_rate": 2.4885609560031445e-05, + "loss": 0.3975, "step": 71670 }, { - "epoch": 2.52, - "learning_rate": 2.587637196116279e-05, - "loss": 0.2828, + "epoch": 2.58316214365517, + "grad_norm": 0.20750007033348083, + "learning_rate": 2.4882691457713813e-05, + "loss": 0.4188, "step": 71675 }, { - "epoch": 2.52, - "learning_rate": 2.5873524985022347e-05, - "loss": 0.2558, + "epoch": 2.5833423433163945, + "grad_norm": 0.1784081906080246, + "learning_rate": 2.4879773356994478e-05, + "loss": 0.3624, "step": 71680 }, { - "epoch": 2.52, - "learning_rate": 2.587067799753976e-05, - "loss": 0.2871, + "epoch": 2.583522542977619, + "grad_norm": 0.2542138993740082, + "learning_rate": 2.4876855257913217e-05, + "loss": 0.3848, "step": 71685 }, { - "epoch": 2.52, - "learning_rate": 2.586783099875199e-05, - "loss": 0.2673, + "epoch": 2.583702742638844, + "grad_norm": 0.19215676188468933, + "learning_rate": 2.4873937160509772e-05, + "loss": 0.3796, "step": 71690 }, { - "epoch": 2.52, - "learning_rate": 2.5864983988696007e-05, - "loss": 0.2785, + "epoch": 2.583882942300068, + "grad_norm": 0.23548319935798645, + "learning_rate": 2.4871019064823918e-05, + "loss": 0.4226, "step": 71695 }, { - "epoch": 2.52, - "learning_rate": 2.5862136967408773e-05, - "loss": 0.2656, + "epoch": 2.584063141961293, + "grad_norm": 0.23626303672790527, + "learning_rate": 2.48681009708954e-05, + "loss": 0.4119, "step": 71700 }, { - "epoch": 2.52, - "learning_rate": 2.585928993492727e-05, - "loss": 0.2634, + "epoch": 2.5842433416225177, + "grad_norm": 0.20605653524398804, + "learning_rate": 2.4865182878763975e-05, + "loss": 0.3605, "step": 71705 }, { - "epoch": 2.52, - "learning_rate": 2.5856442891288447e-05, - "loss": 0.2504, + "epoch": 2.5844235412837424, + "grad_norm": 0.23323751986026764, + "learning_rate": 2.4862264788469414e-05, + "loss": 0.4294, "step": 71710 }, { - "epoch": 2.52, - "learning_rate": 2.5853595836529277e-05, - "loss": 0.2628, + "epoch": 2.584603740944967, + "grad_norm": 0.19780725240707397, + "learning_rate": 2.4859346700051474e-05, + "loss": 0.3779, "step": 71715 }, { - "epoch": 2.52, - "learning_rate": 2.5850748770686728e-05, - "loss": 0.255, + "epoch": 2.584783940606192, + "grad_norm": 0.2039288431406021, + "learning_rate": 2.4856428613549892e-05, + "loss": 0.3828, "step": 71720 }, { - "epoch": 2.52, - "learning_rate": 2.5847901693797765e-05, - "loss": 0.2527, + "epoch": 2.584964140267416, + "grad_norm": 0.1562710702419281, + "learning_rate": 2.485351052900446e-05, + "loss": 0.3812, "step": 71725 }, { - "epoch": 2.52, - "learning_rate": 2.584505460589936e-05, - "loss": 0.2515, + "epoch": 2.585144339928641, + "grad_norm": 0.18016739189624786, + "learning_rate": 2.48505924464549e-05, + "loss": 0.3886, "step": 71730 }, { - "epoch": 2.52, - "learning_rate": 2.5842207507028478e-05, - 
"loss": 0.297, + "epoch": 2.5853245395898656, + "grad_norm": 0.1871825009584427, + "learning_rate": 2.4847674365941e-05, + "loss": 0.3554, "step": 71735 }, { - "epoch": 2.52, - "learning_rate": 2.5839360397222083e-05, - "loss": 0.2261, + "epoch": 2.5855047392510904, + "grad_norm": 0.2108829915523529, + "learning_rate": 2.4844756287502514e-05, + "loss": 0.37, "step": 71740 }, { - "epoch": 2.52, - "learning_rate": 2.583651327651715e-05, - "loss": 0.2677, + "epoch": 2.5856849389123147, + "grad_norm": 0.2470017671585083, + "learning_rate": 2.4841838211179175e-05, + "loss": 0.3934, "step": 71745 }, { - "epoch": 2.52, - "learning_rate": 2.583366614495064e-05, - "loss": 0.2555, + "epoch": 2.5858651385735394, + "grad_norm": 0.22116664052009583, + "learning_rate": 2.4838920137010776e-05, + "loss": 0.4159, "step": 71750 }, { - "epoch": 2.52, - "learning_rate": 2.5830819002559526e-05, - "loss": 0.2722, + "epoch": 2.586045338234764, + "grad_norm": 0.1974770575761795, + "learning_rate": 2.4836002065037056e-05, + "loss": 0.4125, "step": 71755 }, { - "epoch": 2.52, - "learning_rate": 2.582797184938078e-05, - "loss": 0.256, + "epoch": 2.586225537895989, + "grad_norm": 0.2147388607263565, + "learning_rate": 2.4833083995297772e-05, + "loss": 0.4346, "step": 71760 }, { - "epoch": 2.52, - "learning_rate": 2.5825124685451363e-05, - "loss": 0.2765, + "epoch": 2.5864057375572136, + "grad_norm": 0.21751351654529572, + "learning_rate": 2.483016592783269e-05, + "loss": 0.354, "step": 71765 }, { - "epoch": 2.53, - "learning_rate": 2.5822277510808246e-05, - "loss": 0.2664, + "epoch": 2.586585937218438, + "grad_norm": 0.1897956132888794, + "learning_rate": 2.4827247862681556e-05, + "loss": 0.3771, "step": 71770 }, { - "epoch": 2.53, - "learning_rate": 2.5819430325488393e-05, - "loss": 0.2628, + "epoch": 2.5867661368796626, + "grad_norm": 0.2261020839214325, + "learning_rate": 2.4824329799884144e-05, + "loss": 0.3636, "step": 71775 }, { - "epoch": 2.53, - "learning_rate": 2.5816583129528776e-05, - "loss": 0.2742, + "epoch": 2.5869463365408873, + "grad_norm": 0.20903125405311584, + "learning_rate": 2.4821411739480206e-05, + "loss": 0.4079, "step": 71780 }, { - "epoch": 2.53, - "learning_rate": 2.581373592296637e-05, - "loss": 0.2894, + "epoch": 2.587126536202112, + "grad_norm": 0.21350517868995667, + "learning_rate": 2.481849368150949e-05, + "loss": 0.3354, "step": 71785 }, { - "epoch": 2.53, - "learning_rate": 2.5810888705838136e-05, - "loss": 0.2772, + "epoch": 2.5873067358633364, + "grad_norm": 0.18678854405879974, + "learning_rate": 2.481557562601177e-05, + "loss": 0.4028, "step": 71790 }, { - "epoch": 2.53, - "learning_rate": 2.580804147818105e-05, - "loss": 0.2661, + "epoch": 2.587486935524561, + "grad_norm": 0.20988419651985168, + "learning_rate": 2.4812657573026797e-05, + "loss": 0.3798, "step": 71795 }, { - "epoch": 2.53, - "learning_rate": 2.5805194240032083e-05, - "loss": 0.2905, + "epoch": 2.587667135185786, + "grad_norm": 0.21831747889518738, + "learning_rate": 2.4809739522594318e-05, + "loss": 0.3999, "step": 71800 }, { - "epoch": 2.53, - "learning_rate": 2.580234699142819e-05, - "loss": 0.2824, + "epoch": 2.5878473348470106, + "grad_norm": 0.18378116190433502, + "learning_rate": 2.4806821474754112e-05, + "loss": 0.41, "step": 71805 }, { - "epoch": 2.53, - "learning_rate": 2.579949973240635e-05, - "loss": 0.2648, + "epoch": 2.5880275345082353, + "grad_norm": 0.22383366525173187, + "learning_rate": 2.4803903429545918e-05, + "loss": 0.3942, "step": 71810 }, { - "epoch": 2.53, - "learning_rate": 
2.5796652463003535e-05, - "loss": 0.2654, + "epoch": 2.5882077341694596, + "grad_norm": 0.15554063022136688, + "learning_rate": 2.480098538700951e-05, + "loss": 0.3572, "step": 71815 }, { - "epoch": 2.53, - "learning_rate": 2.579380518325672e-05, - "loss": 0.2823, + "epoch": 2.5883879338306843, + "grad_norm": 0.20337338745594025, + "learning_rate": 2.4798067347184638e-05, + "loss": 0.4069, "step": 71820 }, { - "epoch": 2.53, - "learning_rate": 2.5790957893202856e-05, - "loss": 0.2606, + "epoch": 2.588568133491909, + "grad_norm": 0.25683242082595825, + "learning_rate": 2.4795149310111047e-05, + "loss": 0.4472, "step": 71825 }, { - "epoch": 2.53, - "learning_rate": 2.5788110592878928e-05, - "loss": 0.2698, + "epoch": 2.588748333153134, + "grad_norm": 0.16255348920822144, + "learning_rate": 2.479223127582852e-05, + "loss": 0.3799, "step": 71830 }, { - "epoch": 2.53, - "learning_rate": 2.578526328232191e-05, - "loss": 0.273, + "epoch": 2.588928532814358, + "grad_norm": 0.23991474509239197, + "learning_rate": 2.478931324437679e-05, + "loss": 0.3806, "step": 71835 }, { - "epoch": 2.53, - "learning_rate": 2.5782415961568756e-05, - "loss": 0.2733, + "epoch": 2.589108732475583, + "grad_norm": 0.17780093848705292, + "learning_rate": 2.478639521579564e-05, + "loss": 0.371, "step": 71840 }, { - "epoch": 2.53, - "learning_rate": 2.5779568630656452e-05, - "loss": 0.2512, + "epoch": 2.5892889321368076, + "grad_norm": 0.21101722121238708, + "learning_rate": 2.478347719012481e-05, + "loss": 0.3911, "step": 71845 }, { - "epoch": 2.53, - "learning_rate": 2.5776721289621965e-05, - "loss": 0.264, + "epoch": 2.5894691317980323, + "grad_norm": 0.1497720628976822, + "learning_rate": 2.4780559167404054e-05, + "loss": 0.3663, "step": 71850 }, { - "epoch": 2.53, - "learning_rate": 2.5773873938502263e-05, - "loss": 0.2789, + "epoch": 2.589649331459257, + "grad_norm": 0.14714904129505157, + "learning_rate": 2.4777641147673144e-05, + "loss": 0.3627, "step": 71855 }, { - "epoch": 2.53, - "learning_rate": 2.577102657733432e-05, - "loss": 0.2529, + "epoch": 2.5898295311204818, + "grad_norm": 0.1989908665418625, + "learning_rate": 2.477472313097183e-05, + "loss": 0.3685, "step": 71860 }, { - "epoch": 2.53, - "learning_rate": 2.5768179206155098e-05, - "loss": 0.2685, + "epoch": 2.590009730781706, + "grad_norm": 0.2075403332710266, + "learning_rate": 2.4771805117339863e-05, + "loss": 0.3824, "step": 71865 }, { - "epoch": 2.53, - "learning_rate": 2.5765331825001577e-05, - "loss": 0.2743, + "epoch": 2.5901899304429308, + "grad_norm": 0.17004312574863434, + "learning_rate": 2.476888710681701e-05, + "loss": 0.3466, "step": 71870 }, { - "epoch": 2.53, - "learning_rate": 2.5762484433910732e-05, - "loss": 0.2568, + "epoch": 2.5903701301041555, + "grad_norm": 0.20493172109127045, + "learning_rate": 2.4765969099443025e-05, + "loss": 0.4133, "step": 71875 }, { - "epoch": 2.53, - "learning_rate": 2.5759637032919527e-05, - "loss": 0.2392, + "epoch": 2.59055032976538, + "grad_norm": 0.2063346803188324, + "learning_rate": 2.476305109525767e-05, + "loss": 0.4252, "step": 71880 }, { - "epoch": 2.53, - "learning_rate": 2.5756789622064937e-05, - "loss": 0.2603, + "epoch": 2.5907305294266045, + "grad_norm": 0.19341862201690674, + "learning_rate": 2.4760133094300697e-05, + "loss": 0.3719, "step": 71885 }, { - "epoch": 2.53, - "learning_rate": 2.5753942201383925e-05, - "loss": 0.2906, + "epoch": 2.5909107290878293, + "grad_norm": 0.17939716577529907, + "learning_rate": 2.475721509661186e-05, + "loss": 0.4111, "step": 71890 }, { - "epoch": 2.53, 
- "learning_rate": 2.5751094770913476e-05, - "loss": 0.2739, + "epoch": 2.591090928749054, + "grad_norm": 0.19604428112506866, + "learning_rate": 2.4754297102230923e-05, + "loss": 0.3756, "step": 71895 }, { - "epoch": 2.53, - "learning_rate": 2.5748247330690557e-05, - "loss": 0.2532, + "epoch": 2.5912711284102787, + "grad_norm": 0.20184142887592316, + "learning_rate": 2.4751379111197642e-05, + "loss": 0.3813, "step": 71900 }, { - "epoch": 2.53, - "learning_rate": 2.5745399880752143e-05, - "loss": 0.2563, + "epoch": 2.5914513280715035, + "grad_norm": 0.21380701661109924, + "learning_rate": 2.4748461123551768e-05, + "loss": 0.3822, "step": 71905 }, { - "epoch": 2.53, - "learning_rate": 2.5742552421135195e-05, - "loss": 0.2803, + "epoch": 2.5916315277327278, + "grad_norm": 0.22639767825603485, + "learning_rate": 2.4745543139333067e-05, + "loss": 0.4095, "step": 71910 }, { - "epoch": 2.53, - "learning_rate": 2.573970495187669e-05, - "loss": 0.2673, + "epoch": 2.5918117273939525, + "grad_norm": 0.21239355206489563, + "learning_rate": 2.4742625158581286e-05, + "loss": 0.4324, "step": 71915 }, { - "epoch": 2.53, - "learning_rate": 2.5736857473013616e-05, - "loss": 0.2969, + "epoch": 2.5919919270551772, + "grad_norm": 0.19603148102760315, + "learning_rate": 2.4739707181336193e-05, + "loss": 0.3851, "step": 71920 }, { - "epoch": 2.53, - "learning_rate": 2.573400998458293e-05, - "loss": 0.2601, + "epoch": 2.5921721267164015, + "grad_norm": 0.21165809035301208, + "learning_rate": 2.4736789207637543e-05, + "loss": 0.3982, "step": 71925 }, { - "epoch": 2.53, - "learning_rate": 2.5731162486621602e-05, - "loss": 0.2824, + "epoch": 2.5923523263776262, + "grad_norm": 0.22291170060634613, + "learning_rate": 2.473387123752508e-05, + "loss": 0.3934, "step": 71930 }, { - "epoch": 2.53, - "learning_rate": 2.5728314979166618e-05, - "loss": 0.2398, + "epoch": 2.592532526038851, + "grad_norm": 0.1911042034626007, + "learning_rate": 2.4730953271038575e-05, + "loss": 0.3858, "step": 71935 }, { - "epoch": 2.53, - "learning_rate": 2.572546746225494e-05, - "loss": 0.253, + "epoch": 2.5927127257000757, + "grad_norm": 0.2135748714208603, + "learning_rate": 2.472803530821778e-05, + "loss": 0.3912, "step": 71940 }, { - "epoch": 2.53, - "learning_rate": 2.572261993592354e-05, - "loss": 0.266, + "epoch": 2.5928929253613004, + "grad_norm": 0.20679445564746857, + "learning_rate": 2.472511734910245e-05, + "loss": 0.369, "step": 71945 }, { - "epoch": 2.53, - "learning_rate": 2.57197724002094e-05, - "loss": 0.2691, + "epoch": 2.593073125022525, + "grad_norm": 0.22252236306667328, + "learning_rate": 2.472219939373234e-05, + "loss": 0.4035, "step": 71950 }, { - "epoch": 2.53, - "learning_rate": 2.5716924855149495e-05, - "loss": 0.2848, + "epoch": 2.5932533246837495, + "grad_norm": 0.20789240300655365, + "learning_rate": 2.471928144214721e-05, + "loss": 0.3496, "step": 71955 }, { - "epoch": 2.53, - "learning_rate": 2.5714077300780792e-05, - "loss": 0.2687, + "epoch": 2.593433524344974, + "grad_norm": 0.24072077870368958, + "learning_rate": 2.4716363494386817e-05, + "loss": 0.3821, "step": 71960 }, { - "epoch": 2.53, - "learning_rate": 2.5711229737140263e-05, - "loss": 0.2578, + "epoch": 2.593613724006199, + "grad_norm": 0.21749582886695862, + "learning_rate": 2.471344555049092e-05, + "loss": 0.3826, "step": 71965 }, { - "epoch": 2.53, - "learning_rate": 2.5708382164264883e-05, - "loss": 0.247, + "epoch": 2.5937939236674232, + "grad_norm": 0.17097270488739014, + "learning_rate": 2.4710527610499265e-05, + "loss": 0.352, "step": 71970 
}, { - "epoch": 2.53, - "learning_rate": 2.5705534582191626e-05, - "loss": 0.2634, + "epoch": 2.593974123328648, + "grad_norm": 0.20925599336624146, + "learning_rate": 2.470760967445162e-05, + "loss": 0.3946, "step": 71975 }, { - "epoch": 2.53, - "learning_rate": 2.5702686990957474e-05, - "loss": 0.2676, + "epoch": 2.5941543229898727, + "grad_norm": 0.16980622708797455, + "learning_rate": 2.470469174238774e-05, + "loss": 0.3889, "step": 71980 }, { - "epoch": 2.53, - "learning_rate": 2.569983939059939e-05, - "loss": 0.2804, + "epoch": 2.5943345226510974, + "grad_norm": 0.1989830583333969, + "learning_rate": 2.4701773814347366e-05, + "loss": 0.3903, "step": 71985 }, { - "epoch": 2.53, - "learning_rate": 2.569699178115436e-05, - "loss": 0.2901, + "epoch": 2.594514722312322, + "grad_norm": 0.22819073498249054, + "learning_rate": 2.469885589037028e-05, + "loss": 0.3905, "step": 71990 }, { - "epoch": 2.53, - "learning_rate": 2.5694144162659344e-05, - "loss": 0.2714, + "epoch": 2.594694921973547, + "grad_norm": 0.23240187764167786, + "learning_rate": 2.4695937970496212e-05, + "loss": 0.3952, "step": 71995 }, { - "epoch": 2.53, - "learning_rate": 2.569129653515132e-05, - "loss": 0.2763, + "epoch": 2.594875121634771, + "grad_norm": 0.20458155870437622, + "learning_rate": 2.4693020054764936e-05, + "loss": 0.3712, "step": 72000 }, { - "epoch": 2.53, - "eval_loss": 0.26415836811065674, - "eval_runtime": 10.5519, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 2.594875121634771, + "eval_loss": 0.43242183327674866, + "eval_runtime": 3.5303, + "eval_samples_per_second": 28.327, + "eval_steps_per_second": 7.082, "step": 72000 }, { - "epoch": 2.53, - "learning_rate": 2.5688448898667274e-05, - "loss": 0.2591, + "epoch": 2.595055321295996, + "grad_norm": 0.2061101496219635, + "learning_rate": 2.4690102143216214e-05, + "loss": 0.3734, "step": 72005 }, { - "epoch": 2.53, - "learning_rate": 2.5685601253244172e-05, - "loss": 0.2776, + "epoch": 2.5952355209572207, + "grad_norm": 0.2840164303779602, + "learning_rate": 2.4687184235889768e-05, + "loss": 0.3939, "step": 72010 }, { - "epoch": 2.53, - "learning_rate": 2.5682753598918985e-05, - "loss": 0.2975, + "epoch": 2.5954157206184454, + "grad_norm": 0.3389698266983032, + "learning_rate": 2.468426633282539e-05, + "loss": 0.3925, "step": 72015 }, { - "epoch": 2.53, - "learning_rate": 2.56799059357287e-05, - "loss": 0.2368, + "epoch": 2.5955959202796697, + "grad_norm": 0.22343645989894867, + "learning_rate": 2.4681348434062825e-05, + "loss": 0.4075, "step": 72020 }, { - "epoch": 2.53, - "learning_rate": 2.5677058263710274e-05, - "loss": 0.2852, + "epoch": 2.5957761199408944, + "grad_norm": 0.23399618268013, + "learning_rate": 2.467843053964181e-05, + "loss": 0.3981, "step": 72025 }, { - "epoch": 2.53, - "learning_rate": 2.5674210582900698e-05, - "loss": 0.3003, + "epoch": 2.595956319602119, + "grad_norm": 0.25679734349250793, + "learning_rate": 2.4675512649602134e-05, + "loss": 0.3848, "step": 72030 }, { - "epoch": 2.53, - "learning_rate": 2.567136289333694e-05, - "loss": 0.2511, + "epoch": 2.596136519263344, + "grad_norm": 0.19120201468467712, + "learning_rate": 2.467259476398352e-05, + "loss": 0.3813, "step": 72035 }, { - "epoch": 2.53, - "learning_rate": 2.5668515195055988e-05, - "loss": 0.2747, + "epoch": 2.5963167189245686, + "grad_norm": 0.24785567820072174, + "learning_rate": 2.4669676882825754e-05, + "loss": 0.3926, "step": 72040 }, { - "epoch": 2.53, - "learning_rate": 2.566566748809479e-05, - "loss": 0.2807, + "epoch": 
2.596496918585793, + "grad_norm": 0.21031877398490906, + "learning_rate": 2.4666759006168572e-05, + "loss": 0.3782, "step": 72045 }, { - "epoch": 2.53, - "learning_rate": 2.5662819772490348e-05, - "loss": 0.272, + "epoch": 2.5966771182470176, + "grad_norm": 0.22502188384532928, + "learning_rate": 2.4663841134051727e-05, + "loss": 0.389, "step": 72050 }, { - "epoch": 2.54, - "learning_rate": 2.5659972048279623e-05, - "loss": 0.2606, + "epoch": 2.5968573179082424, + "grad_norm": 0.20378191769123077, + "learning_rate": 2.4660923266514986e-05, + "loss": 0.4028, "step": 72055 }, { - "epoch": 2.54, - "learning_rate": 2.5657124315499598e-05, - "loss": 0.2775, + "epoch": 2.597037517569467, + "grad_norm": 0.19002820551395416, + "learning_rate": 2.4658005403598098e-05, + "loss": 0.3727, "step": 72060 }, { - "epoch": 2.54, - "learning_rate": 2.5654276574187254e-05, - "loss": 0.2746, + "epoch": 2.5972177172306914, + "grad_norm": 0.19522058963775635, + "learning_rate": 2.4655087545340823e-05, + "loss": 0.3779, "step": 72065 }, { - "epoch": 2.54, - "learning_rate": 2.565142882437955e-05, - "loss": 0.2642, + "epoch": 2.597397916891916, + "grad_norm": 0.23693622648715973, + "learning_rate": 2.4652169691782914e-05, + "loss": 0.3623, "step": 72070 }, { - "epoch": 2.54, - "learning_rate": 2.564858106611348e-05, - "loss": 0.2627, + "epoch": 2.597578116553141, + "grad_norm": 0.18634121119976044, + "learning_rate": 2.464925184296412e-05, + "loss": 0.3996, "step": 72075 }, { - "epoch": 2.54, - "learning_rate": 2.5645733299426006e-05, - "loss": 0.2554, + "epoch": 2.5977583162143656, + "grad_norm": 0.18742458522319794, + "learning_rate": 2.4646333998924205e-05, + "loss": 0.3633, "step": 72080 }, { - "epoch": 2.54, - "learning_rate": 2.564288552435411e-05, - "loss": 0.2609, + "epoch": 2.5979385158755903, + "grad_norm": 0.21051351726055145, + "learning_rate": 2.4643416159702925e-05, + "loss": 0.4086, "step": 72085 }, { - "epoch": 2.54, - "learning_rate": 2.564003774093477e-05, - "loss": 0.2533, + "epoch": 2.598118715536815, + "grad_norm": 0.1712328940629959, + "learning_rate": 2.4640498325340022e-05, + "loss": 0.3798, "step": 72090 }, { - "epoch": 2.54, - "learning_rate": 2.563718994920497e-05, - "loss": 0.2824, + "epoch": 2.5982989151980393, + "grad_norm": 0.19666485488414764, + "learning_rate": 2.4637580495875267e-05, + "loss": 0.3856, "step": 72095 }, { - "epoch": 2.54, - "learning_rate": 2.5634342149201667e-05, - "loss": 0.2591, + "epoch": 2.598479114859264, + "grad_norm": 0.15711526572704315, + "learning_rate": 2.4634662671348403e-05, + "loss": 0.3996, "step": 72100 }, { - "epoch": 2.54, - "learning_rate": 2.5631494340961858e-05, - "loss": 0.2786, + "epoch": 2.598659314520489, + "grad_norm": 0.21674515306949615, + "learning_rate": 2.4631744851799192e-05, + "loss": 0.3851, "step": 72105 }, { - "epoch": 2.54, - "learning_rate": 2.5628646524522503e-05, - "loss": 0.2509, + "epoch": 2.598839514181713, + "grad_norm": 0.1570635885000229, + "learning_rate": 2.4628827037267397e-05, + "loss": 0.3768, "step": 72110 }, { - "epoch": 2.54, - "learning_rate": 2.5625798699920596e-05, - "loss": 0.2841, + "epoch": 2.599019713842938, + "grad_norm": 0.188796266913414, + "learning_rate": 2.462590922779274e-05, + "loss": 0.4152, "step": 72115 }, { - "epoch": 2.54, - "learning_rate": 2.5622950867193095e-05, - "loss": 0.2978, + "epoch": 2.5991999135041626, + "grad_norm": 0.22848844528198242, + "learning_rate": 2.4622991423415016e-05, + "loss": 0.3614, "step": 72120 }, { - "epoch": 2.54, - "learning_rate": 2.5620103026377003e-05, - 
"loss": 0.2747, + "epoch": 2.5993801131653873, + "grad_norm": 0.23881231248378754, + "learning_rate": 2.4620073624173952e-05, + "loss": 0.3662, "step": 72125 }, { - "epoch": 2.54, - "learning_rate": 2.561725517750927e-05, - "loss": 0.2687, + "epoch": 2.599560312826612, + "grad_norm": 0.1681346446275711, + "learning_rate": 2.461715583010931e-05, + "loss": 0.4039, "step": 72130 }, { - "epoch": 2.54, - "learning_rate": 2.5614407320626886e-05, - "loss": 0.2913, + "epoch": 2.5997405124878368, + "grad_norm": 0.19078193604946136, + "learning_rate": 2.461423804126085e-05, + "loss": 0.3721, "step": 72135 }, { - "epoch": 2.54, - "learning_rate": 2.5611559455766825e-05, - "loss": 0.2905, + "epoch": 2.599920712149061, + "grad_norm": 0.19204656779766083, + "learning_rate": 2.4611320257668318e-05, + "loss": 0.3735, "step": 72140 }, { - "epoch": 2.54, - "learning_rate": 2.5608711582966077e-05, - "loss": 0.2821, + "epoch": 2.600100911810286, + "grad_norm": 0.2286890149116516, + "learning_rate": 2.4608402479371475e-05, + "loss": 0.3929, "step": 72145 }, { - "epoch": 2.54, - "learning_rate": 2.5605863702261607e-05, - "loss": 0.2703, + "epoch": 2.6002811114715105, + "grad_norm": 0.18372632563114166, + "learning_rate": 2.4605484706410072e-05, + "loss": 0.3658, "step": 72150 }, { - "epoch": 2.54, - "learning_rate": 2.5603015813690396e-05, - "loss": 0.2875, + "epoch": 2.600461311132735, + "grad_norm": 0.1332695037126541, + "learning_rate": 2.460256693882386e-05, + "loss": 0.3649, "step": 72155 }, { - "epoch": 2.54, - "learning_rate": 2.560016791728942e-05, - "loss": 0.2575, + "epoch": 2.6006415107939596, + "grad_norm": 0.22151631116867065, + "learning_rate": 2.4599649176652602e-05, + "loss": 0.425, "step": 72160 }, { - "epoch": 2.54, - "learning_rate": 2.5597320013095656e-05, - "loss": 0.2838, + "epoch": 2.6008217104551843, + "grad_norm": 0.25563111901283264, + "learning_rate": 2.4596731419936044e-05, + "loss": 0.4232, "step": 72165 }, { - "epoch": 2.54, - "learning_rate": 2.559447210114609e-05, - "loss": 0.2722, + "epoch": 2.601001910116409, + "grad_norm": 0.19068589806556702, + "learning_rate": 2.4593813668713942e-05, + "loss": 0.3822, "step": 72170 }, { - "epoch": 2.54, - "learning_rate": 2.5591624181477693e-05, - "loss": 0.2918, + "epoch": 2.6011821097776338, + "grad_norm": 0.2579037845134735, + "learning_rate": 2.459089592302605e-05, + "loss": 0.3904, "step": 72175 }, { - "epoch": 2.54, - "learning_rate": 2.558877625412745e-05, - "loss": 0.2727, + "epoch": 2.6013623094388585, + "grad_norm": 0.1858731061220169, + "learning_rate": 2.458797818291212e-05, + "loss": 0.3894, "step": 72180 }, { - "epoch": 2.54, - "learning_rate": 2.558592831913233e-05, - "loss": 0.2597, + "epoch": 2.601542509100083, + "grad_norm": 0.2007628083229065, + "learning_rate": 2.458506044841191e-05, + "loss": 0.399, "step": 72185 }, { - "epoch": 2.54, - "learning_rate": 2.5583080376529318e-05, - "loss": 0.2616, + "epoch": 2.6017227087613075, + "grad_norm": 0.22041386365890503, + "learning_rate": 2.4582142719565173e-05, + "loss": 0.4187, "step": 72190 }, { - "epoch": 2.54, - "learning_rate": 2.558023242635539e-05, - "loss": 0.2793, + "epoch": 2.6019029084225322, + "grad_norm": 0.22453515231609344, + "learning_rate": 2.4579224996411655e-05, + "loss": 0.4086, "step": 72195 }, { - "epoch": 2.54, - "learning_rate": 2.5577384468647537e-05, - "loss": 0.2957, + "epoch": 2.6020831080837565, + "grad_norm": 0.2213824838399887, + "learning_rate": 2.457630727899112e-05, + "loss": 0.4141, "step": 72200 }, { - "epoch": 2.54, - "learning_rate": 
2.5574536503442715e-05, - "loss": 0.2607, + "epoch": 2.6022633077449813, + "grad_norm": 0.20723369717597961, + "learning_rate": 2.4573389567343323e-05, + "loss": 0.3806, "step": 72205 }, { - "epoch": 2.54, - "learning_rate": 2.5571688530777928e-05, - "loss": 0.2442, + "epoch": 2.602443507406206, + "grad_norm": 0.2115570306777954, + "learning_rate": 2.4570471861507994e-05, + "loss": 0.38, "step": 72210 }, { - "epoch": 2.54, - "learning_rate": 2.5568840550690127e-05, - "loss": 0.2868, + "epoch": 2.6026237070674307, + "grad_norm": 0.21702751517295837, + "learning_rate": 2.4567554161524917e-05, + "loss": 0.3766, "step": 72215 }, { - "epoch": 2.54, - "learning_rate": 2.5565992563216313e-05, - "loss": 0.2637, + "epoch": 2.6028039067286555, + "grad_norm": 0.22824609279632568, + "learning_rate": 2.4564636467433814e-05, + "loss": 0.3602, "step": 72220 }, { - "epoch": 2.54, - "learning_rate": 2.5563144568393456e-05, - "loss": 0.2579, + "epoch": 2.60298410638988, + "grad_norm": 0.2511116564273834, + "learning_rate": 2.4561718779274474e-05, + "loss": 0.3884, "step": 72225 }, { - "epoch": 2.54, - "learning_rate": 2.5560296566258546e-05, - "loss": 0.2757, + "epoch": 2.6031643060511045, + "grad_norm": 0.19371597468852997, + "learning_rate": 2.4558801097086627e-05, + "loss": 0.3869, "step": 72230 }, { - "epoch": 2.54, - "learning_rate": 2.5557448556848545e-05, - "loss": 0.2823, + "epoch": 2.6033445057123292, + "grad_norm": 0.27392515540122986, + "learning_rate": 2.455588342091002e-05, + "loss": 0.4111, "step": 72235 }, { - "epoch": 2.54, - "learning_rate": 2.555460054020045e-05, - "loss": 0.3012, + "epoch": 2.603524705373554, + "grad_norm": 0.20092566311359406, + "learning_rate": 2.4552965750784422e-05, + "loss": 0.4155, "step": 72240 }, { - "epoch": 2.54, - "learning_rate": 2.555175251635123e-05, - "loss": 0.2845, + "epoch": 2.6037049050347787, + "grad_norm": 0.18741832673549652, + "learning_rate": 2.4550048086749572e-05, + "loss": 0.4058, "step": 72245 }, { - "epoch": 2.54, - "learning_rate": 2.5548904485337864e-05, - "loss": 0.2942, + "epoch": 2.603885104696003, + "grad_norm": 0.20643602311611176, + "learning_rate": 2.454713042884524e-05, + "loss": 0.4002, "step": 72250 }, { - "epoch": 2.54, - "learning_rate": 2.5546056447197335e-05, - "loss": 0.2608, + "epoch": 2.6040653043572277, + "grad_norm": 0.1865357607603073, + "learning_rate": 2.4544212777111164e-05, + "loss": 0.3739, "step": 72255 }, { - "epoch": 2.54, - "learning_rate": 2.554320840196663e-05, - "loss": 0.2834, + "epoch": 2.6042455040184525, + "grad_norm": 0.165154829621315, + "learning_rate": 2.4541295131587098e-05, + "loss": 0.4122, "step": 72260 }, { - "epoch": 2.54, - "learning_rate": 2.5540360349682713e-05, - "loss": 0.2809, + "epoch": 2.604425703679677, + "grad_norm": 0.19582092761993408, + "learning_rate": 2.4538377492312797e-05, + "loss": 0.4369, "step": 72265 }, { - "epoch": 2.54, - "learning_rate": 2.5537512290382576e-05, - "loss": 0.2622, + "epoch": 2.604605903340902, + "grad_norm": 0.18715718388557434, + "learning_rate": 2.453545985932802e-05, + "loss": 0.3845, "step": 72270 }, { - "epoch": 2.54, - "learning_rate": 2.5534664224103193e-05, - "loss": 0.2804, + "epoch": 2.604786103002126, + "grad_norm": 0.22059041261672974, + "learning_rate": 2.4532542232672504e-05, + "loss": 0.3972, "step": 72275 }, { - "epoch": 2.54, - "learning_rate": 2.5531816150881554e-05, - "loss": 0.2637, + "epoch": 2.604966302663351, + "grad_norm": 0.22253043949604034, + "learning_rate": 2.4529624612386015e-05, + "loss": 0.4599, "step": 72280 }, { - "epoch": 
2.54, - "learning_rate": 2.552896807075463e-05, - "loss": 0.2903, + "epoch": 2.6051465023245757, + "grad_norm": 0.22209565341472626, + "learning_rate": 2.4526706998508296e-05, + "loss": 0.39, "step": 72285 }, { - "epoch": 2.54, - "learning_rate": 2.5526119983759404e-05, - "loss": 0.2552, + "epoch": 2.6053267019858004, + "grad_norm": 0.2502153515815735, + "learning_rate": 2.4523789391079103e-05, + "loss": 0.3973, "step": 72290 }, { - "epoch": 2.54, - "learning_rate": 2.552327188993286e-05, - "loss": 0.2634, + "epoch": 2.6055069016470247, + "grad_norm": 0.2121538519859314, + "learning_rate": 2.4520871790138196e-05, + "loss": 0.362, "step": 72295 }, { - "epoch": 2.54, - "learning_rate": 2.5520423789311972e-05, - "loss": 0.2673, + "epoch": 2.6056871013082494, + "grad_norm": 0.22222217917442322, + "learning_rate": 2.4517954195725305e-05, + "loss": 0.4204, "step": 72300 }, { - "epoch": 2.54, - "learning_rate": 2.551757568193372e-05, - "loss": 0.2947, + "epoch": 2.605867300969474, + "grad_norm": 0.1976117193698883, + "learning_rate": 2.4515036607880208e-05, + "loss": 0.3909, "step": 72305 }, { - "epoch": 2.54, - "learning_rate": 2.551472756783509e-05, - "loss": 0.258, + "epoch": 2.606047500630699, + "grad_norm": 0.16748464107513428, + "learning_rate": 2.451211902664264e-05, + "loss": 0.3622, "step": 72310 }, { - "epoch": 2.54, - "learning_rate": 2.5511879447053068e-05, - "loss": 0.2688, + "epoch": 2.6062277002919236, + "grad_norm": 0.15831992030143738, + "learning_rate": 2.4509201452052338e-05, + "loss": 0.3806, "step": 72315 }, { - "epoch": 2.54, - "learning_rate": 2.5509031319624627e-05, - "loss": 0.2813, + "epoch": 2.606407899953148, + "grad_norm": 0.18840594589710236, + "learning_rate": 2.4506283884149094e-05, + "loss": 0.4006, "step": 72320 }, { - "epoch": 2.54, - "learning_rate": 2.5506183185586752e-05, - "loss": 0.2619, + "epoch": 2.6065880996143727, + "grad_norm": 0.28430864214897156, + "learning_rate": 2.450336632297262e-05, + "loss": 0.4045, "step": 72325 }, { - "epoch": 2.54, - "learning_rate": 2.5503335044976405e-05, - "loss": 0.2896, + "epoch": 2.6067682992755974, + "grad_norm": 0.24897444248199463, + "learning_rate": 2.45004487685627e-05, + "loss": 0.4207, "step": 72330 }, { - "epoch": 2.54, - "learning_rate": 2.5500486897830607e-05, - "loss": 0.2547, + "epoch": 2.606948498936822, + "grad_norm": 0.2031540423631668, + "learning_rate": 2.4497531220959056e-05, + "loss": 0.3999, "step": 72335 }, { - "epoch": 2.55, - "learning_rate": 2.5497638744186304e-05, - "loss": 0.2922, + "epoch": 2.6071286985980464, + "grad_norm": 0.26375287771224976, + "learning_rate": 2.4494613680201456e-05, + "loss": 0.4072, "step": 72340 }, { - "epoch": 2.55, - "learning_rate": 2.5494790584080498e-05, - "loss": 0.2762, + "epoch": 2.607308898259271, + "grad_norm": 0.2009999305009842, + "learning_rate": 2.4491696146329648e-05, + "loss": 0.366, "step": 72345 }, { - "epoch": 2.55, - "learning_rate": 2.5491942417550154e-05, - "loss": 0.2727, + "epoch": 2.607489097920496, + "grad_norm": 0.24075284600257874, + "learning_rate": 2.448877861938338e-05, + "loss": 0.4361, "step": 72350 }, { - "epoch": 2.55, - "learning_rate": 2.548909424463226e-05, - "loss": 0.2848, + "epoch": 2.6076692975817206, + "grad_norm": 0.22679272294044495, + "learning_rate": 2.4485861099402402e-05, + "loss": 0.4212, "step": 72355 }, { - "epoch": 2.55, - "learning_rate": 2.5486246065363807e-05, - "loss": 0.2811, + "epoch": 2.6078494972429453, + "grad_norm": 0.18782539665699005, + "learning_rate": 2.4482943586426473e-05, + "loss": 0.3904, "step": 
72360 }, { - "epoch": 2.55, - "learning_rate": 2.5483397879781768e-05, - "loss": 0.2931, + "epoch": 2.60802969690417, + "grad_norm": 0.18936201930046082, + "learning_rate": 2.448002608049533e-05, + "loss": 0.3494, "step": 72365 }, { - "epoch": 2.55, - "learning_rate": 2.5480549687923123e-05, - "loss": 0.2764, + "epoch": 2.6082098965653944, + "grad_norm": 0.22821679711341858, + "learning_rate": 2.4477108581648734e-05, + "loss": 0.4238, "step": 72370 }, { - "epoch": 2.55, - "learning_rate": 2.5477701489824863e-05, - "loss": 0.2898, + "epoch": 2.608390096226619, + "grad_norm": 0.19849362969398499, + "learning_rate": 2.4474191089926438e-05, + "loss": 0.3797, "step": 72375 }, { - "epoch": 2.55, - "learning_rate": 2.5474853285523958e-05, - "loss": 0.2849, + "epoch": 2.608570295887844, + "grad_norm": 0.21136881411075592, + "learning_rate": 2.447127360536818e-05, + "loss": 0.3807, "step": 72380 }, { - "epoch": 2.55, - "learning_rate": 2.54720050750574e-05, - "loss": 0.267, + "epoch": 2.608750495549068, + "grad_norm": 0.23229160904884338, + "learning_rate": 2.446835612801372e-05, + "loss": 0.4021, "step": 72385 }, { - "epoch": 2.55, - "learning_rate": 2.5469156858462156e-05, - "loss": 0.2518, + "epoch": 2.608930695210293, + "grad_norm": 0.2442099153995514, + "learning_rate": 2.446543865790281e-05, + "loss": 0.4003, "step": 72390 }, { - "epoch": 2.55, - "learning_rate": 2.5466308635775238e-05, - "loss": 0.2578, + "epoch": 2.6091108948715176, + "grad_norm": 0.2331792116165161, + "learning_rate": 2.446252119507518e-05, + "loss": 0.4425, "step": 72395 }, { - "epoch": 2.55, - "learning_rate": 2.5463460407033597e-05, - "loss": 0.2703, + "epoch": 2.6092910945327423, + "grad_norm": 0.16892333328723907, + "learning_rate": 2.445960373957061e-05, + "loss": 0.3777, "step": 72400 }, { - "epoch": 2.55, - "learning_rate": 2.5460612172274234e-05, - "loss": 0.2511, + "epoch": 2.609471294193967, + "grad_norm": 0.18657229840755463, + "learning_rate": 2.445668629142882e-05, + "loss": 0.3742, "step": 72405 }, { - "epoch": 2.55, - "learning_rate": 2.5457763931534117e-05, - "loss": 0.2541, + "epoch": 2.609651493855192, + "grad_norm": 0.2221583127975464, + "learning_rate": 2.4453768850689587e-05, + "loss": 0.4194, "step": 72410 }, { - "epoch": 2.55, - "learning_rate": 2.5454915684850238e-05, - "loss": 0.2788, + "epoch": 2.609831693516416, + "grad_norm": 0.1793508529663086, + "learning_rate": 2.4450851417392645e-05, + "loss": 0.397, "step": 72415 }, { - "epoch": 2.55, - "learning_rate": 2.5452067432259585e-05, - "loss": 0.2648, + "epoch": 2.610011893177641, + "grad_norm": 0.1674244999885559, + "learning_rate": 2.444793399157774e-05, + "loss": 0.3605, "step": 72420 }, { - "epoch": 2.55, - "learning_rate": 2.5449219173799132e-05, - "loss": 0.2611, + "epoch": 2.6101920928388656, + "grad_norm": 0.18199168145656586, + "learning_rate": 2.4445016573284632e-05, + "loss": 0.3821, "step": 72425 }, { - "epoch": 2.55, - "learning_rate": 2.5446370909505868e-05, - "loss": 0.2631, + "epoch": 2.61037229250009, + "grad_norm": 0.20687104761600494, + "learning_rate": 2.444209916255306e-05, + "loss": 0.3999, "step": 72430 }, { - "epoch": 2.55, - "learning_rate": 2.5443522639416767e-05, - "loss": 0.2589, + "epoch": 2.6105524921613146, + "grad_norm": 0.21444140374660492, + "learning_rate": 2.4439181759422787e-05, + "loss": 0.3575, "step": 72435 }, { - "epoch": 2.55, - "learning_rate": 2.544067436356882e-05, - "loss": 0.2697, + "epoch": 2.6107326918225393, + "grad_norm": 0.18158109486103058, + "learning_rate": 2.4436264363933554e-05, + "loss": 
0.3997, "step": 72440 }, { - "epoch": 2.55, - "learning_rate": 2.5437826081999e-05, - "loss": 0.2398, + "epoch": 2.610912891483764, + "grad_norm": 0.23257912695407867, + "learning_rate": 2.4433346976125103e-05, + "loss": 0.4247, "step": 72445 }, { - "epoch": 2.55, - "learning_rate": 2.543497779474431e-05, - "loss": 0.2626, + "epoch": 2.6110930911449888, + "grad_norm": 0.24175713956356049, + "learning_rate": 2.4430429596037195e-05, + "loss": 0.407, "step": 72450 }, { - "epoch": 2.55, - "learning_rate": 2.5432129501841704e-05, - "loss": 0.2503, + "epoch": 2.6112732908062135, + "grad_norm": 0.24281500279903412, + "learning_rate": 2.442751222370957e-05, + "loss": 0.4246, "step": 72455 }, { - "epoch": 2.55, - "learning_rate": 2.5429281203328198e-05, - "loss": 0.2749, + "epoch": 2.611453490467438, + "grad_norm": 0.21452686190605164, + "learning_rate": 2.4424594859181978e-05, + "loss": 0.3739, "step": 72460 }, { - "epoch": 2.55, - "learning_rate": 2.542643289924075e-05, - "loss": 0.2486, + "epoch": 2.6116336901286625, + "grad_norm": 0.23282234370708466, + "learning_rate": 2.4421677502494175e-05, + "loss": 0.3948, "step": 72465 }, { - "epoch": 2.55, - "learning_rate": 2.5423584589616346e-05, - "loss": 0.2838, + "epoch": 2.6118138897898873, + "grad_norm": 0.24855384230613708, + "learning_rate": 2.44187601536859e-05, + "loss": 0.4206, "step": 72470 }, { - "epoch": 2.55, - "learning_rate": 2.542073627449198e-05, - "loss": 0.2734, + "epoch": 2.6119940894511116, + "grad_norm": 0.22032064199447632, + "learning_rate": 2.441584281279691e-05, + "loss": 0.3899, "step": 72475 }, { - "epoch": 2.55, - "learning_rate": 2.5417887953904633e-05, - "loss": 0.295, + "epoch": 2.6121742891123363, + "grad_norm": 0.28760719299316406, + "learning_rate": 2.4412925479866946e-05, + "loss": 0.3476, "step": 72480 }, { - "epoch": 2.55, - "learning_rate": 2.5415039627891287e-05, - "loss": 0.2504, + "epoch": 2.612354488773561, + "grad_norm": 0.22636237740516663, + "learning_rate": 2.4410008154935757e-05, + "loss": 0.3523, "step": 72485 }, { - "epoch": 2.55, - "learning_rate": 2.541219129648892e-05, - "loss": 0.2823, + "epoch": 2.6125346884347858, + "grad_norm": 0.19048437476158142, + "learning_rate": 2.4407090838043097e-05, + "loss": 0.3876, "step": 72490 }, { - "epoch": 2.55, - "learning_rate": 2.5409342959734522e-05, - "loss": 0.2523, + "epoch": 2.6127148880960105, + "grad_norm": 0.20822523534297943, + "learning_rate": 2.440417352922871e-05, + "loss": 0.4092, "step": 72495 }, { - "epoch": 2.55, - "learning_rate": 2.5406494617665084e-05, - "loss": 0.2634, + "epoch": 2.6128950877572352, + "grad_norm": 0.20144416391849518, + "learning_rate": 2.4401256228532334e-05, + "loss": 0.3772, "step": 72500 }, { - "epoch": 2.55, - "eval_loss": 0.26341068744659424, - "eval_runtime": 10.5346, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 2.6128950877572352, + "eval_loss": 0.432780385017395, + "eval_runtime": 3.5337, + "eval_samples_per_second": 28.299, + "eval_steps_per_second": 7.075, "step": 72500 }, { - "epoch": 2.55, - "learning_rate": 2.5403646270317573e-05, - "loss": 0.2763, + "epoch": 2.6130752874184595, + "grad_norm": 0.20547565817832947, + "learning_rate": 2.4398338935993742e-05, + "loss": 0.4123, "step": 72505 }, { - "epoch": 2.55, - "learning_rate": 2.540079791772898e-05, - "loss": 0.2665, + "epoch": 2.6132554870796842, + "grad_norm": 0.2339104562997818, + "learning_rate": 2.4395421651652646e-05, + "loss": 0.4156, "step": 72510 }, { - "epoch": 2.55, - "learning_rate": 2.5397949559936297e-05, - 
"loss": 0.2826, + "epoch": 2.613435686740909, + "grad_norm": 0.209024116396904, + "learning_rate": 2.439250437554883e-05, + "loss": 0.3592, "step": 72515 }, { - "epoch": 2.55, - "learning_rate": 2.5395101196976488e-05, - "loss": 0.2844, + "epoch": 2.6136158864021337, + "grad_norm": 0.20917484164237976, + "learning_rate": 2.438958710772202e-05, + "loss": 0.3925, "step": 72520 }, { - "epoch": 2.55, - "learning_rate": 2.539225282888656e-05, - "loss": 0.2834, + "epoch": 2.613796086063358, + "grad_norm": 0.23410077393054962, + "learning_rate": 2.438666984821196e-05, + "loss": 0.3747, "step": 72525 }, { - "epoch": 2.55, - "learning_rate": 2.5389404455703487e-05, - "loss": 0.2522, + "epoch": 2.6139762857245827, + "grad_norm": 0.18121454119682312, + "learning_rate": 2.4383752597058414e-05, + "loss": 0.3926, "step": 72530 }, { - "epoch": 2.55, - "learning_rate": 2.5386556077464258e-05, - "loss": 0.2763, + "epoch": 2.6141564853858075, + "grad_norm": 0.1945260465145111, + "learning_rate": 2.4380835354301117e-05, + "loss": 0.4095, "step": 72535 }, { - "epoch": 2.55, - "learning_rate": 2.5383707694205843e-05, - "loss": 0.2537, + "epoch": 2.614336685047032, + "grad_norm": 0.21491825580596924, + "learning_rate": 2.437791811997981e-05, + "loss": 0.4031, "step": 72540 }, { - "epoch": 2.55, - "learning_rate": 2.538085930596525e-05, - "loss": 0.2708, + "epoch": 2.614516884708257, + "grad_norm": 0.23276425898075104, + "learning_rate": 2.4375000894134257e-05, + "loss": 0.3753, "step": 72545 }, { - "epoch": 2.55, - "learning_rate": 2.5378010912779432e-05, - "loss": 0.2655, + "epoch": 2.6146970843694812, + "grad_norm": 0.20412658154964447, + "learning_rate": 2.4372083676804187e-05, + "loss": 0.4058, "step": 72550 }, { - "epoch": 2.55, - "learning_rate": 2.5375162514685408e-05, - "loss": 0.309, + "epoch": 2.614877284030706, + "grad_norm": 0.2030048817396164, + "learning_rate": 2.4369166468029367e-05, + "loss": 0.371, "step": 72555 }, { - "epoch": 2.55, - "learning_rate": 2.5372314111720135e-05, - "loss": 0.2705, + "epoch": 2.6150574836919307, + "grad_norm": 0.22809742391109467, + "learning_rate": 2.436624926784953e-05, + "loss": 0.3964, "step": 72560 }, { - "epoch": 2.55, - "learning_rate": 2.536946570392062e-05, - "loss": 0.2456, + "epoch": 2.6152376833531554, + "grad_norm": 0.20488815009593964, + "learning_rate": 2.4363332076304413e-05, + "loss": 0.3581, "step": 72565 }, { - "epoch": 2.55, - "learning_rate": 2.5366617291323825e-05, - "loss": 0.2688, + "epoch": 2.6154178830143797, + "grad_norm": 0.2278917282819748, + "learning_rate": 2.4360414893433784e-05, + "loss": 0.3737, "step": 72570 }, { - "epoch": 2.55, - "learning_rate": 2.5363768873966747e-05, - "loss": 0.2542, + "epoch": 2.6155980826756045, + "grad_norm": 0.2145715057849884, + "learning_rate": 2.435749771927738e-05, + "loss": 0.3794, "step": 72575 }, { - "epoch": 2.55, - "learning_rate": 2.5360920451886373e-05, - "loss": 0.2535, + "epoch": 2.615778282336829, + "grad_norm": 0.1873069554567337, + "learning_rate": 2.435458055387493e-05, + "loss": 0.3869, "step": 72580 }, { - "epoch": 2.55, - "learning_rate": 2.5358072025119687e-05, - "loss": 0.2642, + "epoch": 2.615958481998054, + "grad_norm": 0.16591453552246094, + "learning_rate": 2.4351663397266213e-05, + "loss": 0.3601, "step": 72585 }, { - "epoch": 2.55, - "learning_rate": 2.535522359370367e-05, - "loss": 0.272, + "epoch": 2.6161386816592787, + "grad_norm": 0.21023093163967133, + "learning_rate": 2.434874624949094e-05, + "loss": 0.41, "step": 72590 }, { - "epoch": 2.55, - "learning_rate": 
2.535237515767531e-05, - "loss": 0.2651, + "epoch": 2.6163188813205034, + "grad_norm": 0.17836223542690277, + "learning_rate": 2.4345829110588892e-05, + "loss": 0.3976, "step": 72595 }, { - "epoch": 2.55, - "learning_rate": 2.534952671707159e-05, - "loss": 0.2561, + "epoch": 2.6164990809817277, + "grad_norm": 0.2159523069858551, + "learning_rate": 2.4342911980599788e-05, + "loss": 0.4025, "step": 72600 }, { - "epoch": 2.55, - "learning_rate": 2.534667827192949e-05, - "loss": 0.2688, + "epoch": 2.6166792806429524, + "grad_norm": 0.23821072280406952, + "learning_rate": 2.4339994859563368e-05, + "loss": 0.4003, "step": 72605 }, { - "epoch": 2.55, - "learning_rate": 2.5343829822286008e-05, - "loss": 0.2615, + "epoch": 2.616859480304177, + "grad_norm": 0.2293204665184021, + "learning_rate": 2.433707774751941e-05, + "loss": 0.3821, "step": 72610 }, { - "epoch": 2.55, - "learning_rate": 2.5340981368178125e-05, - "loss": 0.2712, + "epoch": 2.6170396799654014, + "grad_norm": 0.2255268394947052, + "learning_rate": 2.433416064450763e-05, + "loss": 0.3848, "step": 72615 }, { - "epoch": 2.55, - "learning_rate": 2.533813290964282e-05, - "loss": 0.3022, + "epoch": 2.617219879626626, + "grad_norm": 0.18856073915958405, + "learning_rate": 2.433124355056778e-05, + "loss": 0.4097, "step": 72620 }, { - "epoch": 2.56, - "learning_rate": 2.5335284446717083e-05, - "loss": 0.2799, + "epoch": 2.617400079287851, + "grad_norm": 0.221920907497406, + "learning_rate": 2.432832646573961e-05, + "loss": 0.3756, "step": 72625 }, { - "epoch": 2.56, - "learning_rate": 2.533243597943789e-05, - "loss": 0.2795, + "epoch": 2.6175802789490756, + "grad_norm": 0.23313437402248383, + "learning_rate": 2.432540939006286e-05, + "loss": 0.4022, "step": 72630 }, { - "epoch": 2.56, - "learning_rate": 2.5329587507842246e-05, - "loss": 0.2702, + "epoch": 2.6177604786103004, + "grad_norm": 0.17918843030929565, + "learning_rate": 2.432249232357728e-05, + "loss": 0.394, "step": 72635 }, { - "epoch": 2.56, - "learning_rate": 2.5326739031967118e-05, - "loss": 0.2808, + "epoch": 2.617940678271525, + "grad_norm": 0.1900051385164261, + "learning_rate": 2.4319575266322607e-05, + "loss": 0.4176, "step": 72640 }, { - "epoch": 2.56, - "learning_rate": 2.53238905518495e-05, - "loss": 0.2706, + "epoch": 2.6181208779327494, + "grad_norm": 0.21642592549324036, + "learning_rate": 2.431665821833859e-05, + "loss": 0.4371, "step": 72645 }, { - "epoch": 2.56, - "learning_rate": 2.5321042067526383e-05, - "loss": 0.2738, + "epoch": 2.618301077593974, + "grad_norm": 0.21715706586837769, + "learning_rate": 2.4313741179664974e-05, + "loss": 0.3757, "step": 72650 }, { - "epoch": 2.56, - "learning_rate": 2.5318193579034737e-05, - "loss": 0.27, + "epoch": 2.618481277255199, + "grad_norm": 0.21024832129478455, + "learning_rate": 2.4310824150341497e-05, + "loss": 0.3762, "step": 72655 }, { - "epoch": 2.56, - "learning_rate": 2.531534508641156e-05, - "loss": 0.2633, + "epoch": 2.618661476916423, + "grad_norm": 0.2558997869491577, + "learning_rate": 2.4307907130407916e-05, + "loss": 0.4358, "step": 72660 }, { - "epoch": 2.56, - "learning_rate": 2.531249658969384e-05, - "loss": 0.2696, + "epoch": 2.618841676577648, + "grad_norm": 0.1785580962896347, + "learning_rate": 2.4304990119903966e-05, + "loss": 0.4231, "step": 72665 }, { - "epoch": 2.56, - "learning_rate": 2.5309648088918558e-05, - "loss": 0.2552, + "epoch": 2.6190218762388726, + "grad_norm": 0.18506887555122375, + "learning_rate": 2.430207311886938e-05, + "loss": 0.3657, "step": 72670 }, { - "epoch": 2.56, - 
"learning_rate": 2.5306799584122693e-05, - "loss": 0.2602, + "epoch": 2.6192020759000973, + "grad_norm": 0.1763153374195099, + "learning_rate": 2.429915612734392e-05, + "loss": 0.3529, "step": 72675 }, { - "epoch": 2.56, - "learning_rate": 2.5303951075343245e-05, - "loss": 0.2569, + "epoch": 2.619382275561322, + "grad_norm": 0.21150760352611542, + "learning_rate": 2.4296239145367333e-05, + "loss": 0.3704, "step": 72680 }, { - "epoch": 2.56, - "learning_rate": 2.5301102562617186e-05, - "loss": 0.2762, + "epoch": 2.619562475222547, + "grad_norm": 0.2270565927028656, + "learning_rate": 2.429332217297933e-05, + "loss": 0.3814, "step": 72685 }, { - "epoch": 2.56, - "learning_rate": 2.529825404598151e-05, - "loss": 0.2722, + "epoch": 2.619742674883771, + "grad_norm": 0.20172244310379028, + "learning_rate": 2.4290405210219694e-05, + "loss": 0.3692, "step": 72690 }, { - "epoch": 2.56, - "learning_rate": 2.5295405525473205e-05, - "loss": 0.2748, + "epoch": 2.619922874544996, + "grad_norm": 0.20030339062213898, + "learning_rate": 2.428748825712813e-05, + "loss": 0.3969, "step": 72695 }, { - "epoch": 2.56, - "learning_rate": 2.5292557001129252e-05, - "loss": 0.2611, + "epoch": 2.6201030742062206, + "grad_norm": 0.23000763356685638, + "learning_rate": 2.4284571313744424e-05, + "loss": 0.4319, "step": 72700 }, { - "epoch": 2.56, - "learning_rate": 2.5289708472986638e-05, - "loss": 0.2969, + "epoch": 2.620283273867445, + "grad_norm": 0.18827667832374573, + "learning_rate": 2.428165438010828e-05, + "loss": 0.3748, "step": 72705 }, { - "epoch": 2.56, - "learning_rate": 2.5286859941082347e-05, - "loss": 0.2757, + "epoch": 2.6204634735286696, + "grad_norm": 0.23635447025299072, + "learning_rate": 2.427873745625946e-05, + "loss": 0.4177, "step": 72710 }, { - "epoch": 2.56, - "learning_rate": 2.528401140545337e-05, - "loss": 0.259, + "epoch": 2.6206436731898943, + "grad_norm": 0.18466027081012726, + "learning_rate": 2.4275820542237703e-05, + "loss": 0.3716, "step": 72715 }, { - "epoch": 2.56, - "learning_rate": 2.5281162866136705e-05, - "loss": 0.2553, + "epoch": 2.620823872851119, + "grad_norm": 0.21398019790649414, + "learning_rate": 2.427290363808275e-05, + "loss": 0.3842, "step": 72720 }, { - "epoch": 2.56, - "learning_rate": 2.527831432316931e-05, - "loss": 0.252, + "epoch": 2.621004072512344, + "grad_norm": 0.24221466481685638, + "learning_rate": 2.426998674383434e-05, + "loss": 0.4314, "step": 72725 }, { - "epoch": 2.56, - "learning_rate": 2.52754657765882e-05, - "loss": 0.2745, + "epoch": 2.6211842721735685, + "grad_norm": 0.18980202078819275, + "learning_rate": 2.4267069859532228e-05, + "loss": 0.3927, "step": 72730 }, { - "epoch": 2.56, - "learning_rate": 2.527261722643034e-05, - "loss": 0.2559, + "epoch": 2.621364471834793, + "grad_norm": 0.2422516644001007, + "learning_rate": 2.4264152985216136e-05, + "loss": 0.398, "step": 72735 }, { - "epoch": 2.56, - "learning_rate": 2.5269768672732726e-05, - "loss": 0.2649, + "epoch": 2.6215446714960176, + "grad_norm": 0.18269525468349457, + "learning_rate": 2.4261236120925828e-05, + "loss": 0.4142, "step": 72740 }, { - "epoch": 2.56, - "learning_rate": 2.5266920115532338e-05, - "loss": 0.262, + "epoch": 2.6217248711572423, + "grad_norm": 0.17914442718029022, + "learning_rate": 2.4258319266701032e-05, + "loss": 0.3988, "step": 72745 }, { - "epoch": 2.56, - "learning_rate": 2.5264071554866176e-05, - "loss": 0.2882, + "epoch": 2.621905070818467, + "grad_norm": 0.22300316393375397, + "learning_rate": 2.4255402422581485e-05, + "loss": 0.3862, "step": 72750 }, { - 
"epoch": 2.56, - "learning_rate": 2.5261222990771218e-05, - "loss": 0.2757, + "epoch": 2.6220852704796913, + "grad_norm": 0.21765939891338348, + "learning_rate": 2.4252485588606947e-05, + "loss": 0.3836, "step": 72755 }, { - "epoch": 2.56, - "learning_rate": 2.525837442328445e-05, - "loss": 0.2497, + "epoch": 2.622265470140916, + "grad_norm": 0.2101963460445404, + "learning_rate": 2.424956876481715e-05, + "loss": 0.4349, "step": 72760 }, { - "epoch": 2.56, - "learning_rate": 2.5255525852442863e-05, - "loss": 0.2631, + "epoch": 2.6224456698021408, + "grad_norm": 0.20395536720752716, + "learning_rate": 2.4246651951251815e-05, + "loss": 0.3813, "step": 72765 }, { - "epoch": 2.56, - "learning_rate": 2.5252677278283432e-05, - "loss": 0.2461, + "epoch": 2.6226258694633655, + "grad_norm": 0.208245649933815, + "learning_rate": 2.4243735147950715e-05, + "loss": 0.3975, "step": 72770 }, { - "epoch": 2.56, - "learning_rate": 2.5249828700843165e-05, - "loss": 0.2635, + "epoch": 2.6228060691245902, + "grad_norm": 0.20807571709156036, + "learning_rate": 2.424081835495357e-05, + "loss": 0.3674, "step": 72775 }, { - "epoch": 2.56, - "learning_rate": 2.5246980120159026e-05, - "loss": 0.2524, + "epoch": 2.6229862687858145, + "grad_norm": 0.2246702015399933, + "learning_rate": 2.423790157230014e-05, + "loss": 0.3766, "step": 72780 }, { - "epoch": 2.56, - "learning_rate": 2.5244131536268024e-05, - "loss": 0.2744, + "epoch": 2.6231664684470393, + "grad_norm": 0.16798308491706848, + "learning_rate": 2.423498480003015e-05, + "loss": 0.3983, "step": 72785 }, { - "epoch": 2.56, - "learning_rate": 2.524128294920713e-05, - "loss": 0.2592, + "epoch": 2.623346668108264, + "grad_norm": 0.18455500900745392, + "learning_rate": 2.423206803818333e-05, + "loss": 0.3787, "step": 72790 }, { - "epoch": 2.56, - "learning_rate": 2.523843435901333e-05, - "loss": 0.2661, + "epoch": 2.6235268677694887, + "grad_norm": 0.22922283411026, + "learning_rate": 2.422915128679945e-05, + "loss": 0.4127, "step": 72795 }, { - "epoch": 2.56, - "learning_rate": 2.5235585765723624e-05, - "loss": 0.2602, + "epoch": 2.623707067430713, + "grad_norm": 0.19777072966098785, + "learning_rate": 2.422623454591823e-05, + "loss": 0.379, "step": 72800 }, { - "epoch": 2.56, - "learning_rate": 2.5232737169374993e-05, - "loss": 0.2876, + "epoch": 2.6238872670919378, + "grad_norm": 0.2073335349559784, + "learning_rate": 2.4223317815579414e-05, + "loss": 0.3929, "step": 72805 }, { - "epoch": 2.56, - "learning_rate": 2.5229888570004416e-05, - "loss": 0.2709, + "epoch": 2.6240674667531625, + "grad_norm": 0.19318436086177826, + "learning_rate": 2.4220401095822742e-05, + "loss": 0.3986, "step": 72810 }, { - "epoch": 2.56, - "learning_rate": 2.52270399676489e-05, - "loss": 0.2644, + "epoch": 2.6242476664143872, + "grad_norm": 0.20948757231235504, + "learning_rate": 2.421748438668795e-05, + "loss": 0.432, "step": 72815 }, { - "epoch": 2.56, - "learning_rate": 2.522419136234541e-05, - "loss": 0.2693, + "epoch": 2.624427866075612, + "grad_norm": 0.24110877513885498, + "learning_rate": 2.4214567688214788e-05, + "loss": 0.4188, "step": 72820 }, { - "epoch": 2.56, - "learning_rate": 2.5221342754130943e-05, - "loss": 0.2959, + "epoch": 2.6246080657368362, + "grad_norm": 0.24411988258361816, + "learning_rate": 2.4211651000442988e-05, + "loss": 0.3898, "step": 72825 }, { - "epoch": 2.56, - "learning_rate": 2.521849414304249e-05, - "loss": 0.2704, + "epoch": 2.624788265398061, + "grad_norm": 0.21839167177677155, + "learning_rate": 2.4208734323412284e-05, + "loss": 0.4199, 
"step": 72830 }, { - "epoch": 2.56, - "learning_rate": 2.5215645529117033e-05, - "loss": 0.2734, + "epoch": 2.6249684650592857, + "grad_norm": 0.2261667400598526, + "learning_rate": 2.420581765716243e-05, + "loss": 0.4064, "step": 72835 }, { - "epoch": 2.56, - "learning_rate": 2.521279691239156e-05, - "loss": 0.2744, + "epoch": 2.6251486647205104, + "grad_norm": 0.20573599636554718, + "learning_rate": 2.420290100173315e-05, + "loss": 0.3813, "step": 72840 }, { - "epoch": 2.56, - "learning_rate": 2.5209948292903063e-05, - "loss": 0.254, + "epoch": 2.6253288643817347, + "grad_norm": 0.17270725965499878, + "learning_rate": 2.4199984357164197e-05, + "loss": 0.3894, "step": 72845 }, { - "epoch": 2.56, - "learning_rate": 2.5207099670688516e-05, - "loss": 0.2688, + "epoch": 2.6255090640429595, + "grad_norm": 0.2366410195827484, + "learning_rate": 2.4197067723495298e-05, + "loss": 0.3717, "step": 72850 }, { - "epoch": 2.56, - "learning_rate": 2.5204251045784927e-05, - "loss": 0.2712, + "epoch": 2.625689263704184, + "grad_norm": 0.2237681895494461, + "learning_rate": 2.4194151100766193e-05, + "loss": 0.3762, "step": 72855 }, { - "epoch": 2.56, - "learning_rate": 2.5201402418229265e-05, - "loss": 0.2834, + "epoch": 2.625869463365409, + "grad_norm": 0.20603452622890472, + "learning_rate": 2.419123448901663e-05, + "loss": 0.4403, "step": 72860 }, { - "epoch": 2.56, - "learning_rate": 2.5198553788058533e-05, - "loss": 0.2676, + "epoch": 2.6260496630266337, + "grad_norm": 0.21571598947048187, + "learning_rate": 2.4188317888286345e-05, + "loss": 0.3842, "step": 72865 }, { - "epoch": 2.56, - "learning_rate": 2.5195705155309705e-05, - "loss": 0.2897, + "epoch": 2.6262298626878584, + "grad_norm": 0.20403259992599487, + "learning_rate": 2.4185401298615052e-05, + "loss": 0.3822, "step": 72870 }, { - "epoch": 2.56, - "learning_rate": 2.5192856520019774e-05, - "loss": 0.2422, + "epoch": 2.6264100623490827, + "grad_norm": 0.20591306686401367, + "learning_rate": 2.418248472004253e-05, + "loss": 0.4068, "step": 72875 }, { - "epoch": 2.56, - "learning_rate": 2.519000788222573e-05, - "loss": 0.2494, + "epoch": 2.6265902620103074, + "grad_norm": 0.233648881316185, + "learning_rate": 2.4179568152608476e-05, + "loss": 0.3983, "step": 72880 }, { - "epoch": 2.56, - "learning_rate": 2.518715924196456e-05, - "loss": 0.2354, + "epoch": 2.626770461671532, + "grad_norm": 0.21279099583625793, + "learning_rate": 2.4176651596352657e-05, + "loss": 0.3876, "step": 72885 }, { - "epoch": 2.56, - "learning_rate": 2.518431059927325e-05, - "loss": 0.2622, + "epoch": 2.6269506613327565, + "grad_norm": 0.24593648314476013, + "learning_rate": 2.417373505131481e-05, + "loss": 0.4046, "step": 72890 }, { - "epoch": 2.56, - "learning_rate": 2.518146195418879e-05, - "loss": 0.2665, + "epoch": 2.627130860993981, + "grad_norm": 0.18338236212730408, + "learning_rate": 2.4170818517534642e-05, + "loss": 0.3814, "step": 72895 }, { - "epoch": 2.56, - "learning_rate": 2.5178613306748166e-05, - "loss": 0.2761, + "epoch": 2.627311060655206, + "grad_norm": 0.19615016877651215, + "learning_rate": 2.416790199505193e-05, + "loss": 0.3829, "step": 72900 }, { - "epoch": 2.57, - "learning_rate": 2.5175764656988365e-05, - "loss": 0.2379, + "epoch": 2.6274912603164307, + "grad_norm": 0.17791742086410522, + "learning_rate": 2.4164985483906384e-05, + "loss": 0.3914, "step": 72905 }, { - "epoch": 2.57, - "learning_rate": 2.5172916004946372e-05, - "loss": 0.2582, + "epoch": 2.6276714599776554, + "grad_norm": 0.22163866460323334, + "learning_rate": 
2.416206898413775e-05, + "loss": 0.4144, "step": 72910 }, { - "epoch": 2.57, - "learning_rate": 2.5170067350659183e-05, - "loss": 0.2882, + "epoch": 2.62785165963888, + "grad_norm": 0.18475517630577087, + "learning_rate": 2.4159152495785765e-05, + "loss": 0.3815, "step": 72915 }, { - "epoch": 2.57, - "learning_rate": 2.5167218694163786e-05, - "loss": 0.28, + "epoch": 2.6280318593001044, + "grad_norm": 0.22653162479400635, + "learning_rate": 2.415623601889016e-05, + "loss": 0.403, "step": 72920 }, { - "epoch": 2.57, - "learning_rate": 2.5164370035497152e-05, - "loss": 0.2516, + "epoch": 2.628212058961329, + "grad_norm": 0.1823846399784088, + "learning_rate": 2.4153319553490677e-05, + "loss": 0.3878, "step": 72925 }, { - "epoch": 2.57, - "learning_rate": 2.516152137469629e-05, - "loss": 0.2771, + "epoch": 2.628392258622554, + "grad_norm": 0.21723927557468414, + "learning_rate": 2.4150403099627056e-05, + "loss": 0.4188, "step": 72930 }, { - "epoch": 2.57, - "learning_rate": 2.5158672711798174e-05, - "loss": 0.2699, + "epoch": 2.628572458283778, + "grad_norm": 0.2315875142812729, + "learning_rate": 2.414748665733902e-05, + "loss": 0.3892, "step": 72935 }, { - "epoch": 2.57, - "learning_rate": 2.5155824046839803e-05, - "loss": 0.2599, + "epoch": 2.628752657945003, + "grad_norm": 0.24299949407577515, + "learning_rate": 2.4144570226666325e-05, + "loss": 0.4066, "step": 72940 }, { - "epoch": 2.57, - "learning_rate": 2.5152975379858155e-05, - "loss": 0.2754, + "epoch": 2.6289328576062276, + "grad_norm": 0.19684675335884094, + "learning_rate": 2.414165380764869e-05, + "loss": 0.4139, "step": 72945 }, { - "epoch": 2.57, - "learning_rate": 2.5150126710890232e-05, - "loss": 0.2775, + "epoch": 2.6291130572674524, + "grad_norm": 0.232985258102417, + "learning_rate": 2.413873740032585e-05, + "loss": 0.3786, "step": 72950 }, { - "epoch": 2.57, - "learning_rate": 2.5147278039973005e-05, - "loss": 0.2599, + "epoch": 2.629293256928677, + "grad_norm": 0.20983463525772095, + "learning_rate": 2.413582100473755e-05, + "loss": 0.3665, "step": 72955 }, { - "epoch": 2.57, - "learning_rate": 2.514442936714347e-05, - "loss": 0.2474, + "epoch": 2.629473456589902, + "grad_norm": 0.1940142810344696, + "learning_rate": 2.4132904620923518e-05, + "loss": 0.3979, "step": 72960 }, { - "epoch": 2.57, - "learning_rate": 2.514158069243861e-05, - "loss": 0.2508, + "epoch": 2.629653656251126, + "grad_norm": 0.20459094643592834, + "learning_rate": 2.41299882489235e-05, + "loss": 0.3806, "step": 72965 }, { - "epoch": 2.57, - "learning_rate": 2.513873201589543e-05, - "loss": 0.2771, + "epoch": 2.629833855912351, + "grad_norm": 0.22843003273010254, + "learning_rate": 2.4127071888777227e-05, + "loss": 0.3792, "step": 72970 }, { - "epoch": 2.57, - "learning_rate": 2.5135883337550898e-05, - "loss": 0.2758, + "epoch": 2.6300140555735756, + "grad_norm": 0.25140973925590515, + "learning_rate": 2.4124155540524414e-05, + "loss": 0.4148, "step": 72975 }, { - "epoch": 2.57, - "learning_rate": 2.513303465744201e-05, - "loss": 0.2695, + "epoch": 2.6301942552348, + "grad_norm": 0.2235340029001236, + "learning_rate": 2.4121239204204833e-05, + "loss": 0.3699, "step": 72980 }, { - "epoch": 2.57, - "learning_rate": 2.5130185975605756e-05, - "loss": 0.2611, + "epoch": 2.6303744548960246, + "grad_norm": 0.2354481816291809, + "learning_rate": 2.4118322879858187e-05, + "loss": 0.4014, "step": 72985 }, { - "epoch": 2.57, - "learning_rate": 2.5127337292079122e-05, - "loss": 0.2607, + "epoch": 2.6305546545572493, + "grad_norm": 0.17458322644233704, + 
"learning_rate": 2.4115406567524217e-05, + "loss": 0.3573, "step": 72990 }, { - "epoch": 2.57, - "learning_rate": 2.51244886068991e-05, - "loss": 0.2707, + "epoch": 2.630734854218474, + "grad_norm": 0.1971106380224228, + "learning_rate": 2.4112490267242665e-05, + "loss": 0.3703, "step": 72995 }, { - "epoch": 2.57, - "learning_rate": 2.5121639920102675e-05, - "loss": 0.2887, + "epoch": 2.630915053879699, + "grad_norm": 0.2097444236278534, + "learning_rate": 2.410957397905326e-05, + "loss": 0.4161, "step": 73000 }, { - "epoch": 2.57, - "eval_loss": 0.2631213068962097, - "eval_runtime": 10.5493, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 2.630915053879699, + "eval_loss": 0.43243083357810974, + "eval_runtime": 3.5273, + "eval_samples_per_second": 28.35, + "eval_steps_per_second": 7.087, "step": 73000 }, { - "epoch": 2.57, - "learning_rate": 2.5118791231726835e-05, - "loss": 0.2693, + "epoch": 2.6310952535409236, + "grad_norm": 0.2296389639377594, + "learning_rate": 2.410665770299574e-05, + "loss": 0.386, "step": 73005 }, { - "epoch": 2.57, - "learning_rate": 2.5115942541808568e-05, - "loss": 0.2585, + "epoch": 2.631275453202148, + "grad_norm": 0.2250434011220932, + "learning_rate": 2.4103741439109835e-05, + "loss": 0.3863, "step": 73010 }, { - "epoch": 2.57, - "learning_rate": 2.5113093850384863e-05, - "loss": 0.2537, + "epoch": 2.6314556528633726, + "grad_norm": 0.20868182182312012, + "learning_rate": 2.410082518743528e-05, + "loss": 0.366, "step": 73015 }, { - "epoch": 2.57, - "learning_rate": 2.5110245157492707e-05, - "loss": 0.2874, + "epoch": 2.6316358525245973, + "grad_norm": 0.2829134166240692, + "learning_rate": 2.4097908948011804e-05, + "loss": 0.3913, "step": 73020 }, { - "epoch": 2.57, - "learning_rate": 2.5107396463169097e-05, - "loss": 0.2721, + "epoch": 2.631816052185822, + "grad_norm": 0.21935230493545532, + "learning_rate": 2.4094992720879144e-05, + "loss": 0.3815, "step": 73025 }, { - "epoch": 2.57, - "learning_rate": 2.510454776745101e-05, - "loss": 0.2768, + "epoch": 2.6319962518470463, + "grad_norm": 0.1936255842447281, + "learning_rate": 2.4092076506077036e-05, + "loss": 0.3789, "step": 73030 }, { - "epoch": 2.57, - "learning_rate": 2.5101699070375446e-05, - "loss": 0.2624, + "epoch": 2.632176451508271, + "grad_norm": 0.18805243074893951, + "learning_rate": 2.408916030364521e-05, + "loss": 0.3998, "step": 73035 }, { - "epoch": 2.57, - "learning_rate": 2.5098850371979375e-05, - "loss": 0.2713, + "epoch": 2.632356651169496, + "grad_norm": 0.19446292519569397, + "learning_rate": 2.4086244113623395e-05, + "loss": 0.4257, "step": 73040 }, { - "epoch": 2.57, - "learning_rate": 2.5096001672299806e-05, - "loss": 0.2588, + "epoch": 2.6325368508307205, + "grad_norm": 0.2560482621192932, + "learning_rate": 2.408332793605133e-05, + "loss": 0.387, "step": 73045 }, { - "epoch": 2.57, - "learning_rate": 2.5093152971373712e-05, - "loss": 0.2834, + "epoch": 2.6327170504919453, + "grad_norm": 0.1912938952445984, + "learning_rate": 2.4080411770968746e-05, + "loss": 0.4343, "step": 73050 }, { - "epoch": 2.57, - "learning_rate": 2.5090304269238096e-05, - "loss": 0.25, + "epoch": 2.6328972501531696, + "grad_norm": 0.2362576127052307, + "learning_rate": 2.4077495618415367e-05, + "loss": 0.3869, "step": 73055 }, { - "epoch": 2.57, - "learning_rate": 2.5087455565929936e-05, - "loss": 0.2558, + "epoch": 2.6330774498143943, + "grad_norm": 0.19982530176639557, + "learning_rate": 2.4074579478430942e-05, + "loss": 0.3916, "step": 73060 }, { - "epoch": 2.57, - 
"learning_rate": 2.508460686148622e-05, - "loss": 0.2671, + "epoch": 2.633257649475619, + "grad_norm": 0.2339782565832138, + "learning_rate": 2.407166335105518e-05, + "loss": 0.3988, "step": 73065 }, { - "epoch": 2.57, - "learning_rate": 2.5081758155943935e-05, - "loss": 0.2846, + "epoch": 2.6334378491368438, + "grad_norm": 0.2075488567352295, + "learning_rate": 2.4068747236327838e-05, + "loss": 0.4047, "step": 73070 }, { - "epoch": 2.57, - "learning_rate": 2.507890944934009e-05, - "loss": 0.2752, + "epoch": 2.633618048798068, + "grad_norm": 0.20350724458694458, + "learning_rate": 2.4065831134288635e-05, + "loss": 0.3793, "step": 73075 }, { - "epoch": 2.57, - "learning_rate": 2.5076060741711643e-05, - "loss": 0.2523, + "epoch": 2.633798248459293, + "grad_norm": 0.20746393501758575, + "learning_rate": 2.4062915044977284e-05, + "loss": 0.401, "step": 73080 }, { - "epoch": 2.57, - "learning_rate": 2.5073212033095605e-05, - "loss": 0.2534, + "epoch": 2.6339784481205175, + "grad_norm": 0.21754801273345947, + "learning_rate": 2.4059998968433553e-05, + "loss": 0.3765, "step": 73085 }, { - "epoch": 2.57, - "learning_rate": 2.5070363323528956e-05, - "loss": 0.2498, + "epoch": 2.6341586477817422, + "grad_norm": 0.16761621832847595, + "learning_rate": 2.4057082904697152e-05, + "loss": 0.393, "step": 73090 }, { - "epoch": 2.57, - "learning_rate": 2.5067514613048683e-05, - "loss": 0.271, + "epoch": 2.634338847442967, + "grad_norm": 0.23849821090698242, + "learning_rate": 2.4054166853807803e-05, + "loss": 0.4177, "step": 73095 }, { - "epoch": 2.57, - "learning_rate": 2.5064665901691775e-05, - "loss": 0.2751, + "epoch": 2.6345190471041917, + "grad_norm": 0.2702219784259796, + "learning_rate": 2.4051250815805253e-05, + "loss": 0.4136, "step": 73100 }, { - "epoch": 2.57, - "learning_rate": 2.506181718949523e-05, - "loss": 0.2606, + "epoch": 2.634699246765416, + "grad_norm": 0.22448520362377167, + "learning_rate": 2.4048334790729225e-05, + "loss": 0.423, "step": 73105 }, { - "epoch": 2.57, - "learning_rate": 2.5058968476496024e-05, - "loss": 0.2337, + "epoch": 2.6348794464266407, + "grad_norm": 0.24827606976032257, + "learning_rate": 2.4045418778619456e-05, + "loss": 0.4137, "step": 73110 }, { - "epoch": 2.57, - "learning_rate": 2.505611976273115e-05, - "loss": 0.2835, + "epoch": 2.6350596460878655, + "grad_norm": 0.2294238954782486, + "learning_rate": 2.4042502779515668e-05, + "loss": 0.3948, "step": 73115 }, { - "epoch": 2.57, - "learning_rate": 2.50532710482376e-05, - "loss": 0.2665, + "epoch": 2.6352398457490898, + "grad_norm": 0.2006445825099945, + "learning_rate": 2.4039586793457593e-05, + "loss": 0.3847, "step": 73120 }, { - "epoch": 2.57, - "learning_rate": 2.5050422333052366e-05, - "loss": 0.2758, + "epoch": 2.6354200454103145, + "grad_norm": 0.2071392983198166, + "learning_rate": 2.4036670820484964e-05, + "loss": 0.3991, "step": 73125 }, { - "epoch": 2.57, - "learning_rate": 2.504757361721242e-05, - "loss": 0.2706, + "epoch": 2.6356002450715392, + "grad_norm": 0.23408040404319763, + "learning_rate": 2.403375486063751e-05, + "loss": 0.4106, "step": 73130 }, { - "epoch": 2.57, - "learning_rate": 2.5044724900754775e-05, - "loss": 0.2624, + "epoch": 2.635780444732764, + "grad_norm": 0.2418873906135559, + "learning_rate": 2.4030838913954955e-05, + "loss": 0.4135, "step": 73135 }, { - "epoch": 2.57, - "learning_rate": 2.50418761837164e-05, - "loss": 0.2817, + "epoch": 2.6359606443939887, + "grad_norm": 0.18823467195034027, + "learning_rate": 2.4027922980477036e-05, + "loss": 0.3632, "step": 73140 }, { - 
"epoch": 2.57, - "learning_rate": 2.5039027466134292e-05, - "loss": 0.2912, + "epoch": 2.6361408440552134, + "grad_norm": 0.22256679832935333, + "learning_rate": 2.402500706024347e-05, + "loss": 0.3916, "step": 73145 }, { - "epoch": 2.57, - "learning_rate": 2.503617874804543e-05, - "loss": 0.2722, + "epoch": 2.6363210437164377, + "grad_norm": 0.2467431128025055, + "learning_rate": 2.4022091153294004e-05, + "loss": 0.4101, "step": 73150 }, { - "epoch": 2.57, - "learning_rate": 2.5033330029486818e-05, - "loss": 0.3058, + "epoch": 2.6365012433776625, + "grad_norm": 0.20016492903232574, + "learning_rate": 2.4019175259668362e-05, + "loss": 0.3946, "step": 73155 }, { - "epoch": 2.57, - "learning_rate": 2.503048131049544e-05, - "loss": 0.2577, + "epoch": 2.636681443038887, + "grad_norm": 0.2600384056568146, + "learning_rate": 2.4016259379406247e-05, + "loss": 0.4024, "step": 73160 }, { - "epoch": 2.57, - "learning_rate": 2.502763259110828e-05, - "loss": 0.2728, + "epoch": 2.6368616427001115, + "grad_norm": 0.2585957646369934, + "learning_rate": 2.401334351254743e-05, + "loss": 0.3854, "step": 73165 }, { - "epoch": 2.57, - "learning_rate": 2.5024783871362327e-05, - "loss": 0.2937, + "epoch": 2.637041842361336, + "grad_norm": 0.1720488965511322, + "learning_rate": 2.4010427659131604e-05, + "loss": 0.3705, "step": 73170 }, { - "epoch": 2.57, - "learning_rate": 2.5021935151294573e-05, - "loss": 0.2686, + "epoch": 2.637222042022561, + "grad_norm": 0.23503340780735016, + "learning_rate": 2.4007511819198503e-05, + "loss": 0.4219, "step": 73175 }, { - "epoch": 2.57, - "learning_rate": 2.5019086430942006e-05, - "loss": 0.2744, + "epoch": 2.6374022416837857, + "grad_norm": 0.21345233917236328, + "learning_rate": 2.4004595992787877e-05, + "loss": 0.4177, "step": 73180 }, { - "epoch": 2.57, - "learning_rate": 2.501623771034161e-05, - "loss": 0.2831, + "epoch": 2.6375824413450104, + "grad_norm": 0.2077561467885971, + "learning_rate": 2.400168017993942e-05, + "loss": 0.4072, "step": 73185 }, { - "epoch": 2.58, - "learning_rate": 2.5013388989530388e-05, - "loss": 0.2692, + "epoch": 2.637762641006235, + "grad_norm": 0.2376868724822998, + "learning_rate": 2.3998764380692896e-05, + "loss": 0.4042, "step": 73190 }, { - "epoch": 2.58, - "learning_rate": 2.5010540268545314e-05, - "loss": 0.2712, + "epoch": 2.6379428406674594, + "grad_norm": 0.24904000759124756, + "learning_rate": 2.3995848595088008e-05, + "loss": 0.423, "step": 73195 }, { - "epoch": 2.58, - "learning_rate": 2.5007691547423377e-05, - "loss": 0.2667, + "epoch": 2.638123040328684, + "grad_norm": 0.20672129094600677, + "learning_rate": 2.3992932823164483e-05, + "loss": 0.397, "step": 73200 }, { - "epoch": 2.58, - "learning_rate": 2.5004842826201574e-05, - "loss": 0.2671, + "epoch": 2.638303239989909, + "grad_norm": 0.17566558718681335, + "learning_rate": 2.3990017064962056e-05, + "loss": 0.4138, "step": 73205 }, { - "epoch": 2.58, - "learning_rate": 2.5001994104916888e-05, - "loss": 0.2742, + "epoch": 2.638483439651133, + "grad_norm": 0.19264771044254303, + "learning_rate": 2.398710132052045e-05, + "loss": 0.3775, "step": 73210 }, { - "epoch": 2.58, - "learning_rate": 2.4999145383606313e-05, - "loss": 0.2682, + "epoch": 2.638663639312358, + "grad_norm": 0.17429447174072266, + "learning_rate": 2.3984185589879395e-05, + "loss": 0.3963, "step": 73215 }, { - "epoch": 2.58, - "learning_rate": 2.4996296662306834e-05, - "loss": 0.2402, + "epoch": 2.6388438389735827, + "grad_norm": 0.19570399820804596, + "learning_rate": 2.3981269873078613e-05, + "loss": 0.4127, 
"step": 73220 }, { - "epoch": 2.58, - "learning_rate": 2.4993447941055446e-05, - "loss": 0.2721, + "epoch": 2.6390240386348074, + "grad_norm": 0.23415052890777588, + "learning_rate": 2.3978354170157828e-05, + "loss": 0.3912, "step": 73225 }, { - "epoch": 2.58, - "learning_rate": 2.4990599219889114e-05, - "loss": 0.2458, + "epoch": 2.639204238296032, + "grad_norm": 0.22329415380954742, + "learning_rate": 2.3975438481156772e-05, + "loss": 0.4042, "step": 73230 }, { - "epoch": 2.58, - "learning_rate": 2.498775049884486e-05, - "loss": 0.2727, + "epoch": 2.639384437957257, + "grad_norm": 0.17532110214233398, + "learning_rate": 2.3972522806115176e-05, + "loss": 0.4019, "step": 73235 }, { - "epoch": 2.58, - "learning_rate": 2.4984901777959655e-05, - "loss": 0.2571, + "epoch": 2.639564637618481, + "grad_norm": 0.2220272421836853, + "learning_rate": 2.3969607145072747e-05, + "loss": 0.4198, "step": 73240 }, { - "epoch": 2.58, - "learning_rate": 2.4982053057270483e-05, - "loss": 0.2896, + "epoch": 2.639744837279706, + "grad_norm": 0.2223370224237442, + "learning_rate": 2.3966691498069228e-05, + "loss": 0.4365, "step": 73245 }, { - "epoch": 2.58, - "learning_rate": 2.497920433681435e-05, - "loss": 0.276, + "epoch": 2.6399250369409306, + "grad_norm": 0.2924058437347412, + "learning_rate": 2.396377586514433e-05, + "loss": 0.3981, "step": 73250 }, { - "epoch": 2.58, - "learning_rate": 2.4976355616628236e-05, - "loss": 0.266, + "epoch": 2.6401052366021553, + "grad_norm": 0.22655771672725677, + "learning_rate": 2.396086024633779e-05, + "loss": 0.4427, "step": 73255 }, { - "epoch": 2.58, - "learning_rate": 2.4973506896749123e-05, - "loss": 0.291, + "epoch": 2.6402854362633796, + "grad_norm": 0.17349043488502502, + "learning_rate": 2.3957944641689335e-05, + "loss": 0.3575, "step": 73260 }, { - "epoch": 2.58, - "learning_rate": 2.4970658177214007e-05, - "loss": 0.2903, + "epoch": 2.6404656359246044, + "grad_norm": 0.18977303802967072, + "learning_rate": 2.3955029051238666e-05, + "loss": 0.3798, "step": 73265 }, { - "epoch": 2.58, - "learning_rate": 2.4967809458059873e-05, - "loss": 0.2851, + "epoch": 2.640645835585829, + "grad_norm": 0.2203160524368286, + "learning_rate": 2.3952113475025543e-05, + "loss": 0.3796, "step": 73270 }, { - "epoch": 2.58, - "learning_rate": 2.496496073932372e-05, - "loss": 0.2526, + "epoch": 2.640826035247054, + "grad_norm": 0.16892023384571075, + "learning_rate": 2.394919791308966e-05, + "loss": 0.4015, "step": 73275 }, { - "epoch": 2.58, - "learning_rate": 2.4962112021042523e-05, - "loss": 0.2813, + "epoch": 2.6410062349082786, + "grad_norm": 0.22856315970420837, + "learning_rate": 2.3946282365470755e-05, + "loss": 0.4011, "step": 73280 }, { - "epoch": 2.58, - "learning_rate": 2.495926330325327e-05, - "loss": 0.2818, + "epoch": 2.641186434569503, + "grad_norm": 0.21979033946990967, + "learning_rate": 2.3943366832208548e-05, + "loss": 0.4229, "step": 73285 }, { - "epoch": 2.58, - "learning_rate": 2.4956414585992963e-05, - "loss": 0.2748, + "epoch": 2.6413666342307276, + "grad_norm": 0.180423304438591, + "learning_rate": 2.3940451313342757e-05, + "loss": 0.3786, "step": 73290 }, { - "epoch": 2.58, - "learning_rate": 2.4953565869298586e-05, - "loss": 0.262, + "epoch": 2.6415468338919523, + "grad_norm": 0.2113245725631714, + "learning_rate": 2.393753580891312e-05, + "loss": 0.3818, "step": 73295 }, { - "epoch": 2.58, - "learning_rate": 2.495071715320713e-05, - "loss": 0.2837, + "epoch": 2.641727033553177, + "grad_norm": 0.21729253232479095, + "learning_rate": 2.393462031895935e-05, + 
"loss": 0.4212, "step": 73300 }, { - "epoch": 2.58, - "learning_rate": 2.4947868437755563e-05, - "loss": 0.2752, + "epoch": 2.6419072332144014, + "grad_norm": 0.20087195932865143, + "learning_rate": 2.393170484352117e-05, + "loss": 0.4411, "step": 73305 }, { - "epoch": 2.58, - "learning_rate": 2.4945019722980904e-05, - "loss": 0.2715, + "epoch": 2.642087432875626, + "grad_norm": 0.21028025448322296, + "learning_rate": 2.3928789382638305e-05, + "loss": 0.424, "step": 73310 }, { - "epoch": 2.58, - "learning_rate": 2.494217100892012e-05, - "loss": 0.273, + "epoch": 2.642267632536851, + "grad_norm": 0.18576066195964813, + "learning_rate": 2.392587393635048e-05, + "loss": 0.3946, "step": 73315 }, { - "epoch": 2.58, - "learning_rate": 2.4939322295610216e-05, - "loss": 0.2473, + "epoch": 2.6424478321980756, + "grad_norm": 0.18550072610378265, + "learning_rate": 2.392295850469741e-05, + "loss": 0.4047, "step": 73320 }, { - "epoch": 2.58, - "learning_rate": 2.493647358308816e-05, - "loss": 0.2795, + "epoch": 2.6426280318593003, + "grad_norm": 0.17761926352977753, + "learning_rate": 2.3920043087718826e-05, + "loss": 0.3817, "step": 73325 }, { - "epoch": 2.58, - "learning_rate": 2.4933624871390962e-05, - "loss": 0.2874, + "epoch": 2.6428082315205246, + "grad_norm": 0.20507977902889252, + "learning_rate": 2.3917127685454442e-05, + "loss": 0.409, "step": 73330 }, { - "epoch": 2.58, - "learning_rate": 2.4930776160555603e-05, - "loss": 0.2605, + "epoch": 2.6429884311817493, + "grad_norm": 0.24363189935684204, + "learning_rate": 2.391421229794399e-05, + "loss": 0.4085, "step": 73335 }, { - "epoch": 2.58, - "learning_rate": 2.492792745061906e-05, - "loss": 0.2756, + "epoch": 2.643168630842974, + "grad_norm": 0.17502768337726593, + "learning_rate": 2.3911296925227182e-05, + "loss": 0.4124, "step": 73340 }, { - "epoch": 2.58, - "learning_rate": 2.4925078741618338e-05, - "loss": 0.2642, + "epoch": 2.6433488305041988, + "grad_norm": 0.21647201478481293, + "learning_rate": 2.3908381567343736e-05, + "loss": 0.3596, "step": 73345 }, { - "epoch": 2.58, - "learning_rate": 2.4922230033590415e-05, - "loss": 0.2525, + "epoch": 2.643529030165423, + "grad_norm": 0.24254228174686432, + "learning_rate": 2.390546622433339e-05, + "loss": 0.4148, "step": 73350 }, { - "epoch": 2.58, - "learning_rate": 2.491938132657229e-05, - "loss": 0.2883, + "epoch": 2.643709229826648, + "grad_norm": 0.22495651245117188, + "learning_rate": 2.3902550896235855e-05, + "loss": 0.3867, "step": 73355 }, { - "epoch": 2.58, - "learning_rate": 2.4916532620600937e-05, - "loss": 0.2925, + "epoch": 2.6438894294878725, + "grad_norm": 0.23512765765190125, + "learning_rate": 2.3899635583090837e-05, + "loss": 0.4377, "step": 73360 }, { - "epoch": 2.58, - "learning_rate": 2.4913683915713362e-05, - "loss": 0.2917, + "epoch": 2.6440696291490973, + "grad_norm": 0.18619103729724884, + "learning_rate": 2.389672028493809e-05, + "loss": 0.4124, "step": 73365 }, { - "epoch": 2.58, - "learning_rate": 2.491083521194654e-05, - "loss": 0.2734, + "epoch": 2.644249828810322, + "grad_norm": 0.2616622745990753, + "learning_rate": 2.3893805001817298e-05, + "loss": 0.3899, "step": 73370 }, { - "epoch": 2.58, - "learning_rate": 2.4907986509337467e-05, - "loss": 0.2758, + "epoch": 2.6444300284715467, + "grad_norm": 0.23114174604415894, + "learning_rate": 2.3890889733768215e-05, + "loss": 0.4002, "step": 73375 }, { - "epoch": 2.58, - "learning_rate": 2.4905137807923116e-05, - "loss": 0.2821, + "epoch": 2.644610228132771, + "grad_norm": 0.19029228389263153, + "learning_rate": 
2.388797448083054e-05, + "loss": 0.3984, "step": 73380 }, { - "epoch": 2.58, - "learning_rate": 2.4902289107740508e-05, - "loss": 0.2439, + "epoch": 2.6447904277939958, + "grad_norm": 0.2512267529964447, + "learning_rate": 2.388505924304399e-05, + "loss": 0.4141, "step": 73385 }, { - "epoch": 2.58, - "learning_rate": 2.4899440408826602e-05, - "loss": 0.2789, + "epoch": 2.6449706274552205, + "grad_norm": 0.21497133374214172, + "learning_rate": 2.3882144020448297e-05, + "loss": 0.4142, "step": 73390 }, { - "epoch": 2.58, - "learning_rate": 2.4896591711218394e-05, - "loss": 0.2544, + "epoch": 2.645150827116445, + "grad_norm": 0.20400524139404297, + "learning_rate": 2.3879228813083175e-05, + "loss": 0.4061, "step": 73395 }, { - "epoch": 2.58, - "learning_rate": 2.4893743014952875e-05, - "loss": 0.2764, + "epoch": 2.6453310267776695, + "grad_norm": 0.1923159807920456, + "learning_rate": 2.387631362098834e-05, + "loss": 0.4065, "step": 73400 }, { - "epoch": 2.58, - "learning_rate": 2.489089432006703e-05, - "loss": 0.2583, + "epoch": 2.6455112264388942, + "grad_norm": 0.2456676959991455, + "learning_rate": 2.387339844420352e-05, + "loss": 0.3786, "step": 73405 }, { - "epoch": 2.58, - "learning_rate": 2.488804562659786e-05, - "loss": 0.2913, + "epoch": 2.645691426100119, + "grad_norm": 0.20096588134765625, + "learning_rate": 2.3870483282768422e-05, + "loss": 0.4352, "step": 73410 }, { - "epoch": 2.58, - "learning_rate": 2.4885196934582343e-05, - "loss": 0.2417, + "epoch": 2.6458716257613437, + "grad_norm": 0.19349773228168488, + "learning_rate": 2.3867568136722777e-05, + "loss": 0.4179, "step": 73415 }, { - "epoch": 2.58, - "learning_rate": 2.488234824405746e-05, - "loss": 0.28, + "epoch": 2.6460518254225684, + "grad_norm": 0.24120834469795227, + "learning_rate": 2.3864653006106298e-05, + "loss": 0.3935, "step": 73420 }, { - "epoch": 2.58, - "learning_rate": 2.487949955506021e-05, - "loss": 0.2668, + "epoch": 2.6462320250837927, + "grad_norm": 0.1798119693994522, + "learning_rate": 2.386173789095869e-05, + "loss": 0.3727, "step": 73425 }, { - "epoch": 2.58, - "learning_rate": 2.487665086762758e-05, - "loss": 0.2542, + "epoch": 2.6464122247450175, + "grad_norm": 0.2536999583244324, + "learning_rate": 2.3858822791319693e-05, + "loss": 0.3995, "step": 73430 }, { - "epoch": 2.58, - "learning_rate": 2.487380218179656e-05, - "loss": 0.2687, + "epoch": 2.646592424406242, + "grad_norm": 0.24796532094478607, + "learning_rate": 2.385590770722901e-05, + "loss": 0.4144, "step": 73435 }, { - "epoch": 2.58, - "learning_rate": 2.4870953497604126e-05, - "loss": 0.2709, + "epoch": 2.6467726240674665, + "grad_norm": 0.17443668842315674, + "learning_rate": 2.3852992638726368e-05, + "loss": 0.3669, "step": 73440 }, { - "epoch": 2.58, - "learning_rate": 2.4868104815087284e-05, - "loss": 0.2685, + "epoch": 2.6469528237286912, + "grad_norm": 0.2030688226222992, + "learning_rate": 2.385007758585148e-05, + "loss": 0.3755, "step": 73445 }, { - "epoch": 2.58, - "learning_rate": 2.486525613428301e-05, - "loss": 0.2622, + "epoch": 2.647133023389916, + "grad_norm": 0.1865500956773758, + "learning_rate": 2.3847162548644054e-05, + "loss": 0.3985, "step": 73450 }, { - "epoch": 2.58, - "learning_rate": 2.4862407455228303e-05, - "loss": 0.291, + "epoch": 2.6473132230511407, + "grad_norm": 0.16397204995155334, + "learning_rate": 2.3844247527143826e-05, + "loss": 0.3599, "step": 73455 }, { - "epoch": 2.58, - "learning_rate": 2.4859558777960135e-05, - "loss": 0.265, + "epoch": 2.6474934227123654, + "grad_norm": 0.16662755608558655, + 
"learning_rate": 2.3841332521390496e-05, + "loss": 0.3815, "step": 73460 }, { - "epoch": 2.58, - "learning_rate": 2.4856710102515513e-05, - "loss": 0.2648, + "epoch": 2.64767362237359, + "grad_norm": 0.2906810939311981, + "learning_rate": 2.383841753142378e-05, + "loss": 0.3871, "step": 73465 }, { - "epoch": 2.58, - "learning_rate": 2.4853861428931412e-05, - "loss": 0.3027, + "epoch": 2.6478538220348145, + "grad_norm": 0.21434706449508667, + "learning_rate": 2.383550255728341e-05, + "loss": 0.3876, "step": 73470 }, { - "epoch": 2.59, - "learning_rate": 2.4851012757244818e-05, - "loss": 0.3025, + "epoch": 2.648034021696039, + "grad_norm": 0.212555930018425, + "learning_rate": 2.3832587599009083e-05, + "loss": 0.4124, "step": 73475 }, { - "epoch": 2.59, - "learning_rate": 2.4848164087492735e-05, - "loss": 0.2561, + "epoch": 2.648214221357264, + "grad_norm": 0.26868897676467896, + "learning_rate": 2.3829672656640534e-05, + "loss": 0.4283, "step": 73480 }, { - "epoch": 2.59, - "learning_rate": 2.4845315419712135e-05, - "loss": 0.2634, + "epoch": 2.648394421018488, + "grad_norm": 0.20437636971473694, + "learning_rate": 2.3826757730217467e-05, + "loss": 0.3905, "step": 73485 }, { - "epoch": 2.59, - "learning_rate": 2.484246675394002e-05, - "loss": 0.2709, + "epoch": 2.648574620679713, + "grad_norm": 0.2129250317811966, + "learning_rate": 2.382384281977959e-05, + "loss": 0.4188, "step": 73490 }, { - "epoch": 2.59, - "learning_rate": 2.4839618090213357e-05, - "loss": 0.2787, + "epoch": 2.6487548203409377, + "grad_norm": 0.2285563051700592, + "learning_rate": 2.3820927925366634e-05, + "loss": 0.3996, "step": 73495 }, { - "epoch": 2.59, - "learning_rate": 2.4836769428569158e-05, - "loss": 0.2796, + "epoch": 2.6489350200021624, + "grad_norm": 0.20043610036373138, + "learning_rate": 2.3818013047018304e-05, + "loss": 0.403, "step": 73500 }, { - "epoch": 2.59, - "eval_loss": 0.2626335024833679, - "eval_runtime": 10.5465, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 2.6489350200021624, + "eval_loss": 0.4324740469455719, + "eval_runtime": 3.5359, + "eval_samples_per_second": 28.282, + "eval_steps_per_second": 7.07, "step": 73500 }, { - "epoch": 2.59, - "learning_rate": 2.4833920769044402e-05, - "loss": 0.2616, + "epoch": 2.649115219663387, + "grad_norm": 0.21913573145866394, + "learning_rate": 2.3815098184774318e-05, + "loss": 0.4078, "step": 73505 }, { - "epoch": 2.59, - "learning_rate": 2.483107211167607e-05, - "loss": 0.2657, + "epoch": 2.649295419324612, + "grad_norm": 0.1827915608882904, + "learning_rate": 2.3812183338674393e-05, + "loss": 0.4111, "step": 73510 }, { - "epoch": 2.59, - "learning_rate": 2.482822345650115e-05, - "loss": 0.245, + "epoch": 2.649475618985836, + "grad_norm": 0.2369619905948639, + "learning_rate": 2.3809268508758232e-05, + "loss": 0.3877, "step": 73515 }, { - "epoch": 2.59, - "learning_rate": 2.4825374803556643e-05, - "loss": 0.2641, + "epoch": 2.649655818647061, + "grad_norm": 0.19070202112197876, + "learning_rate": 2.3806353695065564e-05, + "loss": 0.4125, "step": 73520 }, { - "epoch": 2.59, - "learning_rate": 2.4822526152879533e-05, - "loss": 0.2623, + "epoch": 2.6498360183082856, + "grad_norm": 0.2124054878950119, + "learning_rate": 2.3803438897636095e-05, + "loss": 0.3944, "step": 73525 }, { - "epoch": 2.59, - "learning_rate": 2.4819677504506795e-05, - "loss": 0.2704, + "epoch": 2.6500162179695104, + "grad_norm": 0.22031430900096893, + "learning_rate": 2.3800524116509537e-05, + "loss": 0.3687, "step": 73530 }, { - "epoch": 2.59, - 
"learning_rate": 2.481682885847543e-05, - "loss": 0.2854, + "epoch": 2.6501964176307347, + "grad_norm": 0.20872965455055237, + "learning_rate": 2.379760935172561e-05, + "loss": 0.376, "step": 73535 }, { - "epoch": 2.59, - "learning_rate": 2.4813980214822418e-05, - "loss": 0.2653, + "epoch": 2.6503766172919594, + "grad_norm": 0.2545884847640991, + "learning_rate": 2.3794694603324026e-05, + "loss": 0.3869, "step": 73540 }, { - "epoch": 2.59, - "learning_rate": 2.4811131573584757e-05, - "loss": 0.2757, + "epoch": 2.650556816953184, + "grad_norm": 0.23649878799915314, + "learning_rate": 2.379177987134448e-05, + "loss": 0.3706, "step": 73545 }, { - "epoch": 2.59, - "learning_rate": 2.480828293479943e-05, - "loss": 0.2732, + "epoch": 2.650737016614409, + "grad_norm": 0.20820178091526031, + "learning_rate": 2.3788865155826716e-05, + "loss": 0.4016, "step": 73550 }, { - "epoch": 2.59, - "learning_rate": 2.4805434298503407e-05, - "loss": 0.2669, + "epoch": 2.6509172162756336, + "grad_norm": 0.2441217005252838, + "learning_rate": 2.378595045681041e-05, + "loss": 0.3979, "step": 73555 }, { - "epoch": 2.59, - "learning_rate": 2.4802585664733708e-05, - "loss": 0.2607, + "epoch": 2.651097415936858, + "grad_norm": 0.2145996391773224, + "learning_rate": 2.3783035774335313e-05, + "loss": 0.4015, "step": 73560 }, { - "epoch": 2.59, - "learning_rate": 2.4799737033527296e-05, - "loss": 0.2629, + "epoch": 2.6512776155980826, + "grad_norm": 0.2503126263618469, + "learning_rate": 2.3780121108441116e-05, + "loss": 0.4389, "step": 73565 }, { - "epoch": 2.59, - "learning_rate": 2.4797458130432597e-05, - "loss": 0.2736, + "epoch": 2.6514578152593073, + "grad_norm": 0.19581691920757294, + "learning_rate": 2.377720645916752e-05, + "loss": 0.3767, "step": 73570 }, { - "epoch": 2.59, - "learning_rate": 2.4794609503933324e-05, - "loss": 0.2633, + "epoch": 2.651638014920532, + "grad_norm": 0.19556576013565063, + "learning_rate": 2.377429182655426e-05, + "loss": 0.389, "step": 73575 }, { - "epoch": 2.59, - "learning_rate": 2.4791760880100916e-05, - "loss": 0.2567, + "epoch": 2.6518182145817564, + "grad_norm": 0.2089264839887619, + "learning_rate": 2.3771377210641035e-05, + "loss": 0.3602, "step": 73580 }, { - "epoch": 2.59, - "learning_rate": 2.4788912258972347e-05, - "loss": 0.2552, + "epoch": 2.651998414242981, + "grad_norm": 0.21499642729759216, + "learning_rate": 2.3768462611467552e-05, + "loss": 0.4022, "step": 73585 }, { - "epoch": 2.59, - "learning_rate": 2.4786063640584623e-05, - "loss": 0.2663, + "epoch": 2.652178613904206, + "grad_norm": 0.27903100848197937, + "learning_rate": 2.3765548029073535e-05, + "loss": 0.417, "step": 73590 }, { - "epoch": 2.59, - "learning_rate": 2.4783215024974717e-05, - "loss": 0.2864, + "epoch": 2.6523588135654306, + "grad_norm": 0.2061304748058319, + "learning_rate": 2.3762633463498677e-05, + "loss": 0.3962, "step": 73595 }, { - "epoch": 2.59, - "learning_rate": 2.4780366412179627e-05, - "loss": 0.2785, + "epoch": 2.6525390132266553, + "grad_norm": 0.23685584962368011, + "learning_rate": 2.375971891478271e-05, + "loss": 0.3888, "step": 73600 }, { - "epoch": 2.59, - "learning_rate": 2.4777517802236323e-05, - "loss": 0.2649, + "epoch": 2.65271921288788, + "grad_norm": 0.22389626502990723, + "learning_rate": 2.3756804382965324e-05, + "loss": 0.4113, "step": 73605 }, { - "epoch": 2.59, - "learning_rate": 2.4774669195181813e-05, - "loss": 0.2524, + "epoch": 2.6528994125491043, + "grad_norm": 0.1898953765630722, + "learning_rate": 2.375388986808624e-05, + "loss": 0.3737, "step": 73610 }, { 
- "epoch": 2.59, - "learning_rate": 2.4771820591053075e-05, - "loss": 0.2715, + "epoch": 2.653079612210329, + "grad_norm": 0.22866353392601013, + "learning_rate": 2.375097537018517e-05, + "loss": 0.3694, "step": 73615 }, { - "epoch": 2.59, - "learning_rate": 2.476897198988709e-05, - "loss": 0.2782, + "epoch": 2.653259811871554, + "grad_norm": 0.22122244536876678, + "learning_rate": 2.374806088930181e-05, + "loss": 0.4329, "step": 73620 }, { - "epoch": 2.59, - "learning_rate": 2.4766123391720855e-05, - "loss": 0.2799, + "epoch": 2.653440011532778, + "grad_norm": 0.17833849787712097, + "learning_rate": 2.3745146425475884e-05, + "loss": 0.4094, "step": 73625 }, { - "epoch": 2.59, - "learning_rate": 2.4763274796591352e-05, - "loss": 0.2584, + "epoch": 2.653620211194003, + "grad_norm": 0.2136964350938797, + "learning_rate": 2.37422319787471e-05, + "loss": 0.4217, "step": 73630 }, { - "epoch": 2.59, - "learning_rate": 2.4760426204535575e-05, - "loss": 0.2706, + "epoch": 2.6538004108552276, + "grad_norm": 0.18438690900802612, + "learning_rate": 2.3739317549155148e-05, + "loss": 0.4136, "step": 73635 }, { - "epoch": 2.59, - "learning_rate": 2.4757577615590507e-05, - "loss": 0.2862, + "epoch": 2.6539806105164523, + "grad_norm": 0.23062624037265778, + "learning_rate": 2.373640313673976e-05, + "loss": 0.3857, "step": 73640 }, { - "epoch": 2.59, - "learning_rate": 2.475472902979312e-05, - "loss": 0.2709, + "epoch": 2.654160810177677, + "grad_norm": 0.26239144802093506, + "learning_rate": 2.373348874154064e-05, + "loss": 0.4017, "step": 73645 }, { - "epoch": 2.59, - "learning_rate": 2.4751880447180427e-05, - "loss": 0.2287, + "epoch": 2.6543410098389018, + "grad_norm": 0.22294588387012482, + "learning_rate": 2.3730574363597476e-05, + "loss": 0.4033, "step": 73650 }, { - "epoch": 2.59, - "learning_rate": 2.47490318677894e-05, - "loss": 0.2704, + "epoch": 2.654521209500126, + "grad_norm": 0.17945276200771332, + "learning_rate": 2.3727660002950006e-05, + "loss": 0.3721, "step": 73655 }, { - "epoch": 2.59, - "learning_rate": 2.4746183291657022e-05, - "loss": 0.2665, + "epoch": 2.654701409161351, + "grad_norm": 0.22688071429729462, + "learning_rate": 2.3724745659637902e-05, + "loss": 0.3883, "step": 73660 }, { - "epoch": 2.59, - "learning_rate": 2.4743334718820298e-05, - "loss": 0.2821, + "epoch": 2.6548816088225755, + "grad_norm": 0.21472230553627014, + "learning_rate": 2.3721831333700913e-05, + "loss": 0.4231, "step": 73665 }, { - "epoch": 2.59, - "learning_rate": 2.4740486149316204e-05, - "loss": 0.2545, + "epoch": 2.6550618084838, + "grad_norm": 0.23527327179908752, + "learning_rate": 2.371891702517872e-05, + "loss": 0.3958, "step": 73670 }, { - "epoch": 2.59, - "learning_rate": 2.4737637583181716e-05, - "loss": 0.2813, + "epoch": 2.6552420081450245, + "grad_norm": 0.19936297833919525, + "learning_rate": 2.3716002734111024e-05, + "loss": 0.3937, "step": 73675 }, { - "epoch": 2.59, - "learning_rate": 2.4734789020453835e-05, - "loss": 0.2617, + "epoch": 2.6554222078062493, + "grad_norm": 0.19073469936847687, + "learning_rate": 2.371308846053755e-05, + "loss": 0.3815, "step": 73680 }, { - "epoch": 2.59, - "learning_rate": 2.4731940461169543e-05, - "loss": 0.2424, + "epoch": 2.655602407467474, + "grad_norm": 0.19665153324604034, + "learning_rate": 2.3710174204497997e-05, + "loss": 0.4036, "step": 73685 }, { - "epoch": 2.59, - "learning_rate": 2.4729091905365835e-05, - "loss": 0.277, + "epoch": 2.6557826071286987, + "grad_norm": 0.2125919610261917, + "learning_rate": 2.3707259966032064e-05, + "loss": 0.4332, 
"step": 73690 }, { - "epoch": 2.59, - "learning_rate": 2.4726243353079685e-05, - "loss": 0.2643, + "epoch": 2.6559628067899235, + "grad_norm": 0.16980135440826416, + "learning_rate": 2.370434574517947e-05, + "loss": 0.4054, "step": 73695 }, { - "epoch": 2.59, - "learning_rate": 2.472339480434808e-05, - "loss": 0.273, + "epoch": 2.6561430064511478, + "grad_norm": 0.16812558472156525, + "learning_rate": 2.370143154197991e-05, + "loss": 0.3967, "step": 73700 }, { - "epoch": 2.59, - "learning_rate": 2.472054625920802e-05, - "loss": 0.2446, + "epoch": 2.6563232061123725, + "grad_norm": 0.23627859354019165, + "learning_rate": 2.3698517356473098e-05, + "loss": 0.3897, "step": 73705 }, { - "epoch": 2.59, - "learning_rate": 2.4717697717696475e-05, - "loss": 0.2484, + "epoch": 2.6565034057735972, + "grad_norm": 0.2503635287284851, + "learning_rate": 2.3695603188698733e-05, + "loss": 0.3958, "step": 73710 }, { - "epoch": 2.59, - "learning_rate": 2.4714849179850448e-05, - "loss": 0.2785, + "epoch": 2.6566836054348215, + "grad_norm": 0.2157202512025833, + "learning_rate": 2.3692689038696518e-05, + "loss": 0.4087, "step": 73715 }, { - "epoch": 2.59, - "learning_rate": 2.47120006457069e-05, - "loss": 0.2813, + "epoch": 2.6568638050960462, + "grad_norm": 0.1902592033147812, + "learning_rate": 2.368977490650617e-05, + "loss": 0.389, "step": 73720 }, { - "epoch": 2.59, - "learning_rate": 2.470915211530285e-05, - "loss": 0.2686, + "epoch": 2.657044004757271, + "grad_norm": 0.188002347946167, + "learning_rate": 2.368686079216739e-05, + "loss": 0.3733, "step": 73725 }, { - "epoch": 2.59, - "learning_rate": 2.4706303588675267e-05, - "loss": 0.2724, + "epoch": 2.6572242044184957, + "grad_norm": 0.22462768852710724, + "learning_rate": 2.3683946695719857e-05, + "loss": 0.3922, "step": 73730 }, { - "epoch": 2.59, - "learning_rate": 2.4703455065861133e-05, - "loss": 0.268, + "epoch": 2.6574044040797205, + "grad_norm": 0.22756509482860565, + "learning_rate": 2.3681032617203317e-05, + "loss": 0.4083, "step": 73735 }, { - "epoch": 2.59, - "learning_rate": 2.4700606546897432e-05, - "loss": 0.2808, + "epoch": 2.657584603740945, + "grad_norm": 0.2529270052909851, + "learning_rate": 2.367811855665743e-05, + "loss": 0.3848, "step": 73740 }, { - "epoch": 2.59, - "learning_rate": 2.469775803182117e-05, - "loss": 0.2872, + "epoch": 2.6577648034021695, + "grad_norm": 0.2393999844789505, + "learning_rate": 2.3675204514121942e-05, + "loss": 0.3802, "step": 73745 }, { - "epoch": 2.59, - "learning_rate": 2.469490952066932e-05, - "loss": 0.2504, + "epoch": 2.657945003063394, + "grad_norm": 0.2106008678674698, + "learning_rate": 2.3672290489636533e-05, + "loss": 0.4216, "step": 73750 }, { - "epoch": 2.59, - "learning_rate": 2.469206101347887e-05, - "loss": 0.2543, + "epoch": 2.658125202724619, + "grad_norm": 0.19306325912475586, + "learning_rate": 2.3669376483240894e-05, + "loss": 0.3937, "step": 73755 }, { - "epoch": 2.6, - "learning_rate": 2.4689212510286793e-05, - "loss": 0.2908, + "epoch": 2.6583054023858437, + "grad_norm": 0.2211955040693283, + "learning_rate": 2.366646249497476e-05, + "loss": 0.4201, "step": 73760 }, { - "epoch": 2.6, - "learning_rate": 2.4686364011130094e-05, - "loss": 0.2758, + "epoch": 2.658485602047068, + "grad_norm": 0.20790930092334747, + "learning_rate": 2.366354852487781e-05, + "loss": 0.368, "step": 73765 }, { - "epoch": 2.6, - "learning_rate": 2.4683515516045756e-05, - "loss": 0.2654, + "epoch": 2.6586658017082927, + "grad_norm": 0.2273816168308258, + "learning_rate": 2.3660634572989747e-05, + "loss": 
0.3965, "step": 73770 }, { - "epoch": 2.6, - "learning_rate": 2.468066702507075e-05, - "loss": 0.2751, + "epoch": 2.6588460013695174, + "grad_norm": 0.22291794419288635, + "learning_rate": 2.3657720639350288e-05, + "loss": 0.4491, "step": 73775 }, { - "epoch": 2.6, - "learning_rate": 2.467781853824208e-05, - "loss": 0.2788, + "epoch": 2.659026201030742, + "grad_norm": 0.21958285570144653, + "learning_rate": 2.3654806723999117e-05, + "loss": 0.3876, "step": 73780 }, { - "epoch": 2.6, - "learning_rate": 2.4674970055596727e-05, - "loss": 0.2844, + "epoch": 2.659206400691967, + "grad_norm": 0.23919081687927246, + "learning_rate": 2.365189282697595e-05, + "loss": 0.3955, "step": 73785 }, { - "epoch": 2.6, - "learning_rate": 2.4672121577171667e-05, - "loss": 0.2669, + "epoch": 2.659386600353191, + "grad_norm": 0.2069309651851654, + "learning_rate": 2.3648978948320483e-05, + "loss": 0.4031, "step": 73790 }, { - "epoch": 2.6, - "learning_rate": 2.4669273103003886e-05, - "loss": 0.2827, + "epoch": 2.659566800014416, + "grad_norm": 0.22566114366054535, + "learning_rate": 2.3646065088072407e-05, + "loss": 0.3822, "step": 73795 }, { - "epoch": 2.6, - "learning_rate": 2.4666424633130393e-05, - "loss": 0.2853, + "epoch": 2.6597469996756407, + "grad_norm": 0.193036288022995, + "learning_rate": 2.364315124627144e-05, + "loss": 0.3603, "step": 73800 }, { - "epoch": 2.6, - "learning_rate": 2.466357616758815e-05, - "loss": 0.2893, + "epoch": 2.6599271993368654, + "grad_norm": 0.19962070882320404, + "learning_rate": 2.3640237422957275e-05, + "loss": 0.3869, "step": 73805 }, { - "epoch": 2.6, - "learning_rate": 2.4660727706414148e-05, - "loss": 0.2785, + "epoch": 2.6601073989980897, + "grad_norm": 0.20868006348609924, + "learning_rate": 2.3637323618169606e-05, + "loss": 0.3864, "step": 73810 }, { - "epoch": 2.6, - "learning_rate": 2.4657879249645366e-05, - "loss": 0.2621, + "epoch": 2.6602875986593144, + "grad_norm": 0.22715534269809723, + "learning_rate": 2.3634409831948144e-05, + "loss": 0.3757, "step": 73815 }, { - "epoch": 2.6, - "learning_rate": 2.4655030797318797e-05, - "loss": 0.2654, + "epoch": 2.660467798320539, + "grad_norm": 0.22947651147842407, + "learning_rate": 2.363149606433258e-05, + "loss": 0.3707, "step": 73820 }, { - "epoch": 2.6, - "learning_rate": 2.4652182349471437e-05, - "loss": 0.2726, + "epoch": 2.660647997981764, + "grad_norm": 0.17915059626102448, + "learning_rate": 2.3628582315362625e-05, + "loss": 0.4104, "step": 73825 }, { - "epoch": 2.6, - "learning_rate": 2.4649333906140256e-05, - "loss": 0.2621, + "epoch": 2.6608281976429886, + "grad_norm": 0.2261868566274643, + "learning_rate": 2.362566858507797e-05, + "loss": 0.3774, "step": 73830 }, { - "epoch": 2.6, - "learning_rate": 2.4646485467362234e-05, - "loss": 0.287, + "epoch": 2.661008397304213, + "grad_norm": 0.23218120634555817, + "learning_rate": 2.3622754873518303e-05, + "loss": 0.4169, "step": 73835 }, { - "epoch": 2.6, - "learning_rate": 2.4643637033174378e-05, - "loss": 0.2776, + "epoch": 2.6611885969654376, + "grad_norm": 0.2152547836303711, + "learning_rate": 2.3619841180723345e-05, + "loss": 0.4173, "step": 73840 }, { - "epoch": 2.6, - "learning_rate": 2.4640788603613652e-05, - "loss": 0.2683, + "epoch": 2.6613687966266624, + "grad_norm": 0.22858308255672455, + "learning_rate": 2.3616927506732773e-05, + "loss": 0.3866, "step": 73845 }, { - "epoch": 2.6, - "learning_rate": 2.4637940178717053e-05, - "loss": 0.2783, + "epoch": 2.661548996287887, + "grad_norm": 0.2074364870786667, + "learning_rate": 2.361401385158631e-05, + 
"loss": 0.3856, "step": 73850 }, { - "epoch": 2.6, - "learning_rate": 2.4635091758521554e-05, - "loss": 0.2617, + "epoch": 2.6617291959491114, + "grad_norm": 0.1993214190006256, + "learning_rate": 2.361110021532363e-05, + "loss": 0.4015, "step": 73855 }, { - "epoch": 2.6, - "learning_rate": 2.4632243343064158e-05, - "loss": 0.2614, + "epoch": 2.661909395610336, + "grad_norm": 0.22513341903686523, + "learning_rate": 2.360818659798444e-05, + "loss": 0.3876, "step": 73860 }, { - "epoch": 2.6, - "learning_rate": 2.4629394932381842e-05, - "loss": 0.2846, + "epoch": 2.662089595271561, + "grad_norm": 0.20720557868480682, + "learning_rate": 2.3605272999608442e-05, + "loss": 0.4208, "step": 73865 }, { - "epoch": 2.6, - "learning_rate": 2.462654652651158e-05, - "loss": 0.2668, + "epoch": 2.6622697949327856, + "grad_norm": 0.19978566467761993, + "learning_rate": 2.3602359420235333e-05, + "loss": 0.4022, "step": 73870 }, { - "epoch": 2.6, - "learning_rate": 2.4623698125490364e-05, - "loss": 0.2688, + "epoch": 2.6624499945940103, + "grad_norm": 0.23757880926132202, + "learning_rate": 2.3599445859904798e-05, + "loss": 0.3755, "step": 73875 }, { - "epoch": 2.6, - "learning_rate": 2.4620849729355186e-05, - "loss": 0.267, + "epoch": 2.662630194255235, + "grad_norm": 0.17947709560394287, + "learning_rate": 2.3596532318656547e-05, + "loss": 0.4252, "step": 73880 }, { - "epoch": 2.6, - "learning_rate": 2.4618001338143027e-05, - "loss": 0.2587, + "epoch": 2.6628103939164594, + "grad_norm": 0.22774136066436768, + "learning_rate": 2.3593618796530268e-05, + "loss": 0.3899, "step": 73885 }, { - "epoch": 2.6, - "learning_rate": 2.461515295189086e-05, - "loss": 0.2811, + "epoch": 2.662990593577684, + "grad_norm": 0.23940308392047882, + "learning_rate": 2.3590705293565663e-05, + "loss": 0.3636, "step": 73890 }, { - "epoch": 2.6, - "learning_rate": 2.4612304570635684e-05, - "loss": 0.2919, + "epoch": 2.663170793238909, + "grad_norm": 0.1792476326227188, + "learning_rate": 2.3587791809802427e-05, + "loss": 0.3771, "step": 73895 }, { - "epoch": 2.6, - "learning_rate": 2.4609456194414473e-05, - "loss": 0.2534, + "epoch": 2.663350992900133, + "grad_norm": 0.15978147089481354, + "learning_rate": 2.358487834528025e-05, + "loss": 0.4011, "step": 73900 }, { - "epoch": 2.6, - "learning_rate": 2.4606607823264225e-05, - "loss": 0.2737, + "epoch": 2.663531192561358, + "grad_norm": 0.20006360113620758, + "learning_rate": 2.3581964900038836e-05, + "loss": 0.3993, "step": 73905 }, { - "epoch": 2.6, - "learning_rate": 2.4603759457221904e-05, - "loss": 0.2857, + "epoch": 2.6637113922225826, + "grad_norm": 0.1868210732936859, + "learning_rate": 2.357905147411788e-05, + "loss": 0.3967, "step": 73910 }, { - "epoch": 2.6, - "learning_rate": 2.4600911096324513e-05, - "loss": 0.2629, + "epoch": 2.6638915918838073, + "grad_norm": 0.21746650338172913, + "learning_rate": 2.3576138067557058e-05, + "loss": 0.3769, "step": 73915 }, { - "epoch": 2.6, - "learning_rate": 2.4598062740609028e-05, - "loss": 0.2631, + "epoch": 2.664071791545032, + "grad_norm": 0.21393951773643494, + "learning_rate": 2.357322468039609e-05, + "loss": 0.4063, "step": 73920 }, { - "epoch": 2.6, - "learning_rate": 2.459521439011243e-05, - "loss": 0.2634, + "epoch": 2.6642519912062568, + "grad_norm": 0.21100027859210968, + "learning_rate": 2.3570311312674654e-05, + "loss": 0.4181, "step": 73925 }, { - "epoch": 2.6, - "learning_rate": 2.4592366044871708e-05, - "loss": 0.2748, + "epoch": 2.664432190867481, + "grad_norm": 0.20578476786613464, + "learning_rate": 
2.356739796443245e-05, + "loss": 0.396, "step": 73930 }, { - "epoch": 2.6, - "learning_rate": 2.4589517704923843e-05, - "loss": 0.2676, + "epoch": 2.664612390528706, + "grad_norm": 0.2200337052345276, + "learning_rate": 2.356448463570918e-05, + "loss": 0.3953, "step": 73935 }, { - "epoch": 2.6, - "learning_rate": 2.4586669370305825e-05, - "loss": 0.2539, + "epoch": 2.6647925901899305, + "grad_norm": 0.21242652833461761, + "learning_rate": 2.356157132654452e-05, + "loss": 0.4087, "step": 73940 }, { - "epoch": 2.6, - "learning_rate": 2.4583821041054634e-05, - "loss": 0.2718, + "epoch": 2.664972789851155, + "grad_norm": 0.2014140486717224, + "learning_rate": 2.3558658036978183e-05, + "loss": 0.3968, "step": 73945 }, { - "epoch": 2.6, - "learning_rate": 2.458097271720724e-05, - "loss": 0.2683, + "epoch": 2.6651529895123796, + "grad_norm": 0.2067977786064148, + "learning_rate": 2.355574476704984e-05, + "loss": 0.3766, "step": 73950 }, { - "epoch": 2.6, - "learning_rate": 2.4578124398800645e-05, - "loss": 0.2919, + "epoch": 2.6653331891736043, + "grad_norm": 0.18431727588176727, + "learning_rate": 2.35528315167992e-05, + "loss": 0.3723, "step": 73955 }, { - "epoch": 2.6, - "learning_rate": 2.457527608587183e-05, - "loss": 0.282, + "epoch": 2.665513388834829, + "grad_norm": 0.2466651052236557, + "learning_rate": 2.3549918286265947e-05, + "loss": 0.3828, "step": 73960 }, { - "epoch": 2.6, - "learning_rate": 2.457242777845778e-05, - "loss": 0.2822, + "epoch": 2.6656935884960538, + "grad_norm": 0.22020891308784485, + "learning_rate": 2.3547005075489776e-05, + "loss": 0.3619, "step": 73965 }, { - "epoch": 2.6, - "learning_rate": 2.456957947659546e-05, - "loss": 0.2758, + "epoch": 2.6658737881572785, + "grad_norm": 0.173415869474411, + "learning_rate": 2.3544091884510383e-05, + "loss": 0.3719, "step": 73970 }, { - "epoch": 2.6, - "learning_rate": 2.4566731180321877e-05, - "loss": 0.2454, + "epoch": 2.666053987818503, + "grad_norm": 0.17796094715595245, + "learning_rate": 2.3541178713367456e-05, + "loss": 0.4016, "step": 73975 }, { - "epoch": 2.6, - "learning_rate": 2.4563882889674e-05, - "loss": 0.2699, + "epoch": 2.6662341874797275, + "grad_norm": 0.235768124461174, + "learning_rate": 2.353826556210068e-05, + "loss": 0.3881, "step": 73980 }, { - "epoch": 2.6, - "learning_rate": 2.4561034604688822e-05, - "loss": 0.2696, + "epoch": 2.6664143871409522, + "grad_norm": 0.253508597612381, + "learning_rate": 2.3535352430749763e-05, + "loss": 0.4, "step": 73985 }, { - "epoch": 2.6, - "learning_rate": 2.4558186325403305e-05, - "loss": 0.276, + "epoch": 2.6665945868021765, + "grad_norm": 0.20636709034442902, + "learning_rate": 2.353243931935438e-05, + "loss": 0.3943, "step": 73990 }, { - "epoch": 2.6, - "learning_rate": 2.4555338051854465e-05, - "loss": 0.2672, + "epoch": 2.6667747864634013, + "grad_norm": 0.20948311686515808, + "learning_rate": 2.3529526227954225e-05, + "loss": 0.3992, "step": 73995 }, { - "epoch": 2.6, - "learning_rate": 2.4552489784079262e-05, - "loss": 0.286, + "epoch": 2.666954986124626, + "grad_norm": 0.19002006947994232, + "learning_rate": 2.3526613156588997e-05, + "loss": 0.3784, "step": 74000 }, { - "epoch": 2.6, - "eval_loss": 0.26184189319610596, - "eval_runtime": 10.5446, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 2.666954986124626, + "eval_loss": 0.4320438802242279, + "eval_runtime": 3.5274, + "eval_samples_per_second": 28.35, + "eval_steps_per_second": 7.087, "step": 74000 }, { - "epoch": 2.6, - "learning_rate": 2.4549641522114678e-05, - 
"loss": 0.2697, + "epoch": 2.6671351857858507, + "grad_norm": 0.2230537235736847, + "learning_rate": 2.3523700105298372e-05, + "loss": 0.3806, "step": 74005 }, { - "epoch": 2.6, - "learning_rate": 2.4546793265997708e-05, - "loss": 0.2528, + "epoch": 2.6673153854470755, + "grad_norm": 0.20656515657901764, + "learning_rate": 2.3520787074122052e-05, + "loss": 0.3979, "step": 74010 }, { - "epoch": 2.6, - "learning_rate": 2.4543945015765327e-05, - "loss": 0.2671, + "epoch": 2.6674955851083, + "grad_norm": 0.2289830446243286, + "learning_rate": 2.351787406309973e-05, + "loss": 0.4243, "step": 74015 }, { - "epoch": 2.6, - "learning_rate": 2.4541096771454525e-05, - "loss": 0.2972, + "epoch": 2.6676757847695245, + "grad_norm": 0.22350530326366425, + "learning_rate": 2.3514961072271068e-05, + "loss": 0.407, "step": 74020 }, { - "epoch": 2.6, - "learning_rate": 2.4538248533102265e-05, - "loss": 0.2878, + "epoch": 2.6678559844307492, + "grad_norm": 0.224583700299263, + "learning_rate": 2.351204810167579e-05, + "loss": 0.3582, "step": 74025 }, { - "epoch": 2.6, - "learning_rate": 2.4535400300745555e-05, - "loss": 0.282, + "epoch": 2.668036184091974, + "grad_norm": 0.24546033143997192, + "learning_rate": 2.3509135151353553e-05, + "loss": 0.4033, "step": 74030 }, { - "epoch": 2.6, - "learning_rate": 2.4532552074421363e-05, - "loss": 0.2478, + "epoch": 2.6682163837531987, + "grad_norm": 0.19116534292697906, + "learning_rate": 2.350622222134408e-05, + "loss": 0.4039, "step": 74035 }, { - "epoch": 2.6, - "learning_rate": 2.452970385416668e-05, - "loss": 0.2523, + "epoch": 2.668396583414423, + "grad_norm": 0.1550498604774475, + "learning_rate": 2.350330931168703e-05, + "loss": 0.3879, "step": 74040 }, { - "epoch": 2.61, - "learning_rate": 2.452685564001847e-05, - "loss": 0.2701, + "epoch": 2.6685767830756477, + "grad_norm": 0.2186582088470459, + "learning_rate": 2.3500396422422092e-05, + "loss": 0.3957, "step": 74045 }, { - "epoch": 2.61, - "learning_rate": 2.452400743201374e-05, - "loss": 0.2813, + "epoch": 2.6687569827368725, + "grad_norm": 0.21175815165042877, + "learning_rate": 2.3497483553588977e-05, + "loss": 0.3772, "step": 74050 }, { - "epoch": 2.61, - "learning_rate": 2.4521159230189456e-05, - "loss": 0.2408, + "epoch": 2.668937182398097, + "grad_norm": 0.25158798694610596, + "learning_rate": 2.3494570705227355e-05, + "loss": 0.3367, "step": 74055 }, { - "epoch": 2.61, - "learning_rate": 2.45183110345826e-05, - "loss": 0.2699, + "epoch": 2.669117382059322, + "grad_norm": 0.21257662773132324, + "learning_rate": 2.349165787737691e-05, + "loss": 0.3836, "step": 74060 }, { - "epoch": 2.61, - "learning_rate": 2.4515462845230158e-05, - "loss": 0.2767, + "epoch": 2.669297581720546, + "grad_norm": 0.22067958116531372, + "learning_rate": 2.348874507007734e-05, + "loss": 0.4208, "step": 74065 }, { - "epoch": 2.61, - "learning_rate": 2.4512614662169116e-05, - "loss": 0.2737, + "epoch": 2.669477781381771, + "grad_norm": 0.18371091783046722, + "learning_rate": 2.348583228336832e-05, + "loss": 0.3965, "step": 74070 }, { - "epoch": 2.61, - "learning_rate": 2.450976648543645e-05, - "loss": 0.2806, + "epoch": 2.6696579810429957, + "grad_norm": 0.2284574806690216, + "learning_rate": 2.3482919517289543e-05, + "loss": 0.3892, "step": 74075 }, { - "epoch": 2.61, - "learning_rate": 2.4506918315069148e-05, - "loss": 0.2699, + "epoch": 2.6698381807042204, + "grad_norm": 0.20297889411449432, + "learning_rate": 2.34800067718807e-05, + "loss": 0.4246, "step": 74080 }, { - "epoch": 2.61, - "learning_rate": 
2.4504070151104172e-05, - "loss": 0.262, + "epoch": 2.6700183803654447, + "grad_norm": 0.23391233384609222, + "learning_rate": 2.3477094047181463e-05, + "loss": 0.4102, "step": 74085 }, { - "epoch": 2.61, - "learning_rate": 2.4501221993578533e-05, - "loss": 0.2613, + "epoch": 2.6701985800266694, + "grad_norm": 0.1801353245973587, + "learning_rate": 2.347418134323153e-05, + "loss": 0.3542, "step": 74090 }, { - "epoch": 2.61, - "learning_rate": 2.4498373842529187e-05, - "loss": 0.2739, + "epoch": 2.670378779687894, + "grad_norm": 0.20619447529315948, + "learning_rate": 2.347126866007058e-05, + "loss": 0.3497, "step": 74095 }, { - "epoch": 2.61, - "learning_rate": 2.4495525697993134e-05, - "loss": 0.2618, + "epoch": 2.670558979349119, + "grad_norm": 0.15153087675571442, + "learning_rate": 2.3468355997738293e-05, + "loss": 0.4128, "step": 74100 }, { - "epoch": 2.61, - "learning_rate": 2.449267756000734e-05, - "loss": 0.261, + "epoch": 2.6707391790103436, + "grad_norm": 0.26451894640922546, + "learning_rate": 2.3465443356274365e-05, + "loss": 0.4019, "step": 74105 }, { - "epoch": 2.61, - "learning_rate": 2.4489829428608802e-05, - "loss": 0.2796, + "epoch": 2.6709193786715684, + "grad_norm": 0.24107202887535095, + "learning_rate": 2.3462530735718472e-05, + "loss": 0.4251, "step": 74110 }, { - "epoch": 2.61, - "learning_rate": 2.4486981303834485e-05, - "loss": 0.2656, + "epoch": 2.6710995783327927, + "grad_norm": 0.2847641110420227, + "learning_rate": 2.34596181361103e-05, + "loss": 0.4012, "step": 74115 }, { - "epoch": 2.61, - "learning_rate": 2.448413318572138e-05, - "loss": 0.2802, + "epoch": 2.6712797779940174, + "grad_norm": 0.25060930848121643, + "learning_rate": 2.3456705557489543e-05, + "loss": 0.4052, "step": 74120 }, { - "epoch": 2.61, - "learning_rate": 2.4481285074306468e-05, - "loss": 0.2774, + "epoch": 2.671459977655242, + "grad_norm": 0.2532309591770172, + "learning_rate": 2.3453792999895855e-05, + "loss": 0.3964, "step": 74125 }, { - "epoch": 2.61, - "learning_rate": 2.4478436969626727e-05, - "loss": 0.2624, + "epoch": 2.6716401773164664, + "grad_norm": 0.1897568553686142, + "learning_rate": 2.3450880463368955e-05, + "loss": 0.3838, "step": 74130 }, { - "epoch": 2.61, - "learning_rate": 2.4475588871719144e-05, - "loss": 0.2983, + "epoch": 2.671820376977691, + "grad_norm": 0.2430182695388794, + "learning_rate": 2.3447967947948503e-05, + "loss": 0.4141, "step": 74135 }, { - "epoch": 2.61, - "learning_rate": 2.4472740780620678e-05, - "loss": 0.2542, + "epoch": 2.672000576638916, + "grad_norm": 0.22214165329933167, + "learning_rate": 2.344505545367418e-05, + "loss": 0.4151, "step": 74140 }, { - "epoch": 2.61, - "learning_rate": 2.446989269636834e-05, - "loss": 0.2763, + "epoch": 2.6721807763001406, + "grad_norm": 0.19647006690502167, + "learning_rate": 2.344214298058568e-05, + "loss": 0.4073, "step": 74145 }, { - "epoch": 2.61, - "learning_rate": 2.446704461899909e-05, - "loss": 0.2834, + "epoch": 2.6723609759613653, + "grad_norm": 0.18476322293281555, + "learning_rate": 2.343923052872268e-05, + "loss": 0.3706, "step": 74150 }, { - "epoch": 2.61, - "learning_rate": 2.446419654854992e-05, - "loss": 0.2549, + "epoch": 2.67254117562259, + "grad_norm": 0.18611617386341095, + "learning_rate": 2.3436318098124864e-05, + "loss": 0.3995, "step": 74155 }, { - "epoch": 2.61, - "learning_rate": 2.4461348485057792e-05, - "loss": 0.2625, + "epoch": 2.6727213752838144, + "grad_norm": 0.2218163013458252, + "learning_rate": 2.3433405688831915e-05, + "loss": 0.3928, "step": 74160 }, { - "epoch": 2.61, 
- "learning_rate": 2.445850042855971e-05, - "loss": 0.2741, + "epoch": 2.672901574945039, + "grad_norm": 0.16507494449615479, + "learning_rate": 2.34304933008835e-05, + "loss": 0.3753, "step": 74165 }, { - "epoch": 2.61, - "learning_rate": 2.4455652379092644e-05, - "loss": 0.2667, + "epoch": 2.673081774606264, + "grad_norm": 0.19643135368824005, + "learning_rate": 2.3427580934319314e-05, + "loss": 0.4089, "step": 74170 }, { - "epoch": 2.61, - "learning_rate": 2.445280433669357e-05, - "loss": 0.2613, + "epoch": 2.673261974267488, + "grad_norm": 0.22430889308452606, + "learning_rate": 2.3424668589179037e-05, + "loss": 0.3765, "step": 74175 }, { - "epoch": 2.61, - "learning_rate": 2.4449956301399464e-05, - "loss": 0.2699, + "epoch": 2.673442173928713, + "grad_norm": 0.2428254932165146, + "learning_rate": 2.342175626550234e-05, + "loss": 0.3831, "step": 74180 }, { - "epoch": 2.61, - "learning_rate": 2.444710827324732e-05, - "loss": 0.2531, + "epoch": 2.6736223735899376, + "grad_norm": 0.2370181381702423, + "learning_rate": 2.3418843963328912e-05, + "loss": 0.3586, "step": 74185 }, { - "epoch": 2.61, - "learning_rate": 2.4444260252274116e-05, - "loss": 0.2748, + "epoch": 2.6738025732511623, + "grad_norm": 0.19188377261161804, + "learning_rate": 2.3415931682698427e-05, + "loss": 0.3616, "step": 74190 }, { - "epoch": 2.61, - "learning_rate": 2.4441412238516823e-05, - "loss": 0.2512, + "epoch": 2.673982772912387, + "grad_norm": 0.22248739004135132, + "learning_rate": 2.341301942365057e-05, + "loss": 0.4284, "step": 74195 }, { - "epoch": 2.61, - "learning_rate": 2.4438564232012412e-05, - "loss": 0.2867, + "epoch": 2.674162972573612, + "grad_norm": 0.2631896138191223, + "learning_rate": 2.3410107186225015e-05, + "loss": 0.4025, "step": 74200 }, { - "epoch": 2.61, - "learning_rate": 2.4435716232797882e-05, - "loss": 0.2848, + "epoch": 2.674343172234836, + "grad_norm": 0.19557088613510132, + "learning_rate": 2.3407194970461435e-05, + "loss": 0.4003, "step": 74205 }, { - "epoch": 2.61, - "learning_rate": 2.443286824091021e-05, - "loss": 0.2665, + "epoch": 2.674523371896061, + "grad_norm": 0.20168626308441162, + "learning_rate": 2.3404282776399523e-05, + "loss": 0.3501, "step": 74210 }, { - "epoch": 2.61, - "learning_rate": 2.443002025638637e-05, - "loss": 0.2785, + "epoch": 2.6747035715572856, + "grad_norm": 0.23849454522132874, + "learning_rate": 2.340137060407894e-05, + "loss": 0.3978, "step": 74215 }, { - "epoch": 2.61, - "learning_rate": 2.4427172279263333e-05, - "loss": 0.2872, + "epoch": 2.67488377121851, + "grad_norm": 0.2284935563802719, + "learning_rate": 2.3398458453539385e-05, + "loss": 0.3623, "step": 74220 }, { - "epoch": 2.61, - "learning_rate": 2.4424324309578096e-05, - "loss": 0.2626, + "epoch": 2.6750639708797346, + "grad_norm": 0.2417210191488266, + "learning_rate": 2.3395546324820526e-05, + "loss": 0.4028, "step": 74225 }, { - "epoch": 2.61, - "learning_rate": 2.4421476347367623e-05, - "loss": 0.2686, + "epoch": 2.6752441705409593, + "grad_norm": 0.18951061367988586, + "learning_rate": 2.3392634217962018e-05, + "loss": 0.3889, "step": 74230 }, { - "epoch": 2.61, - "learning_rate": 2.441862839266889e-05, - "loss": 0.2656, + "epoch": 2.675424370202184, + "grad_norm": 0.24638362228870392, + "learning_rate": 2.3389722133003577e-05, + "loss": 0.3558, "step": 74235 }, { - "epoch": 2.61, - "learning_rate": 2.44157804455189e-05, - "loss": 0.2722, + "epoch": 2.6756045698634088, + "grad_norm": 0.18396976590156555, + "learning_rate": 2.3386810069984856e-05, + "loss": 0.3805, "step": 74240 }, { 
- "epoch": 2.61, - "learning_rate": 2.441293250595461e-05, - "loss": 0.266, + "epoch": 2.6757847695246335, + "grad_norm": 0.20471645891666412, + "learning_rate": 2.3383898028945528e-05, + "loss": 0.4248, "step": 74245 }, { - "epoch": 2.61, - "learning_rate": 2.441008457401301e-05, - "loss": 0.2711, + "epoch": 2.675964969185858, + "grad_norm": 0.1948273777961731, + "learning_rate": 2.338098600992528e-05, + "loss": 0.3789, "step": 74250 }, { - "epoch": 2.61, - "learning_rate": 2.4407236649731056e-05, - "loss": 0.2852, + "epoch": 2.6761451688470825, + "grad_norm": 0.22328773140907288, + "learning_rate": 2.3378074012963783e-05, + "loss": 0.4053, "step": 74255 }, { - "epoch": 2.61, - "learning_rate": 2.4404388733145756e-05, - "loss": 0.2881, + "epoch": 2.6763253685083073, + "grad_norm": 0.25258857011795044, + "learning_rate": 2.3375162038100716e-05, + "loss": 0.4313, "step": 74260 }, { - "epoch": 2.61, - "learning_rate": 2.440154082429408e-05, - "loss": 0.2974, + "epoch": 2.676505568169532, + "grad_norm": 0.1960865557193756, + "learning_rate": 2.337225008537575e-05, + "loss": 0.3928, "step": 74265 }, { - "epoch": 2.61, - "learning_rate": 2.4398692923213e-05, - "loss": 0.294, + "epoch": 2.6766857678307563, + "grad_norm": 0.190299391746521, + "learning_rate": 2.3369338154828564e-05, + "loss": 0.3747, "step": 74270 }, { - "epoch": 2.61, - "learning_rate": 2.439584502993948e-05, - "loss": 0.2635, + "epoch": 2.676865967491981, + "grad_norm": 0.2319035530090332, + "learning_rate": 2.336642624649883e-05, + "loss": 0.3932, "step": 74275 }, { - "epoch": 2.61, - "learning_rate": 2.4392997144510533e-05, - "loss": 0.2973, + "epoch": 2.6770461671532058, + "grad_norm": 0.17021389305591583, + "learning_rate": 2.336351436042622e-05, + "loss": 0.362, "step": 74280 }, { - "epoch": 2.61, - "learning_rate": 2.439014926696311e-05, - "loss": 0.2617, + "epoch": 2.6772263668144305, + "grad_norm": 0.1885906457901001, + "learning_rate": 2.3360602496650406e-05, + "loss": 0.4038, "step": 74285 }, { - "epoch": 2.61, - "learning_rate": 2.4387301397334203e-05, - "loss": 0.2876, + "epoch": 2.6774065664756552, + "grad_norm": 0.2009781450033188, + "learning_rate": 2.3357690655211072e-05, + "loss": 0.4274, "step": 74290 }, { - "epoch": 2.61, - "learning_rate": 2.438445353566077e-05, - "loss": 0.247, + "epoch": 2.6775867661368795, + "grad_norm": 0.2083226591348648, + "learning_rate": 2.335477883614788e-05, + "loss": 0.4083, "step": 74295 }, { - "epoch": 2.61, - "learning_rate": 2.4381605681979817e-05, - "loss": 0.2682, + "epoch": 2.6777669657981042, + "grad_norm": 0.18861836194992065, + "learning_rate": 2.3351867039500513e-05, + "loss": 0.3834, "step": 74300 }, { - "epoch": 2.61, - "learning_rate": 2.4378757836328304e-05, - "loss": 0.2722, + "epoch": 2.677947165459329, + "grad_norm": 0.2695733904838562, + "learning_rate": 2.3348955265308642e-05, + "loss": 0.4182, "step": 74305 }, { - "epoch": 2.61, - "learning_rate": 2.4375909998743205e-05, - "loss": 0.259, + "epoch": 2.6781273651205537, + "grad_norm": 0.17389146983623505, + "learning_rate": 2.334604351361192e-05, + "loss": 0.4131, "step": 74310 }, { - "epoch": 2.61, - "learning_rate": 2.4373062169261498e-05, - "loss": 0.2706, + "epoch": 2.678307564781778, + "grad_norm": 0.18311482667922974, + "learning_rate": 2.3343131784450055e-05, + "loss": 0.4198, "step": 74315 }, { - "epoch": 2.61, - "learning_rate": 2.437021434792018e-05, - "loss": 0.2927, + "epoch": 2.6784877644430027, + "grad_norm": 0.2671164572238922, + "learning_rate": 2.334022007786269e-05, + "loss": 0.3622, "step": 
74320 }, { - "epoch": 2.61, - "learning_rate": 2.4367366534756213e-05, - "loss": 0.282, + "epoch": 2.6786679641042275, + "grad_norm": 0.2175154834985733, + "learning_rate": 2.3337308393889493e-05, + "loss": 0.4046, "step": 74325 }, { - "epoch": 2.62, - "learning_rate": 2.4364518729806572e-05, - "loss": 0.2638, + "epoch": 2.678848163765452, + "grad_norm": 0.22409701347351074, + "learning_rate": 2.3334396732570167e-05, + "loss": 0.3906, "step": 74330 }, { - "epoch": 2.62, - "learning_rate": 2.4361670933108224e-05, - "loss": 0.2674, + "epoch": 2.679028363426677, + "grad_norm": 0.203602135181427, + "learning_rate": 2.3331485093944344e-05, + "loss": 0.4466, "step": 74335 }, { - "epoch": 2.62, - "learning_rate": 2.435882314469817e-05, - "loss": 0.2617, + "epoch": 2.6792085630879012, + "grad_norm": 0.19964513182640076, + "learning_rate": 2.332857347805173e-05, + "loss": 0.4134, "step": 74340 }, { - "epoch": 2.62, - "learning_rate": 2.4355975364613372e-05, - "loss": 0.2523, + "epoch": 2.679388762749126, + "grad_norm": 0.1884617805480957, + "learning_rate": 2.3325661884931972e-05, + "loss": 0.3872, "step": 74345 }, { - "epoch": 2.62, - "learning_rate": 2.4353127592890804e-05, - "loss": 0.2723, + "epoch": 2.6795689624103507, + "grad_norm": 0.2019912749528885, + "learning_rate": 2.3322750314624747e-05, + "loss": 0.4068, "step": 74350 }, { - "epoch": 2.62, - "learning_rate": 2.435027982956746e-05, - "loss": 0.2797, + "epoch": 2.6797491620715754, + "grad_norm": 0.2299565076828003, + "learning_rate": 2.3319838767169725e-05, + "loss": 0.4014, "step": 74355 }, { - "epoch": 2.62, - "learning_rate": 2.43474320746803e-05, - "loss": 0.2813, + "epoch": 2.6799293617327997, + "grad_norm": 0.24402816593647003, + "learning_rate": 2.3316927242606575e-05, + "loss": 0.3874, "step": 74360 }, { - "epoch": 2.62, - "learning_rate": 2.43445843282663e-05, - "loss": 0.2605, + "epoch": 2.6801095613940245, + "grad_norm": 0.25074100494384766, + "learning_rate": 2.331401574097496e-05, + "loss": 0.3988, "step": 74365 }, { - "epoch": 2.62, - "learning_rate": 2.4341736590362445e-05, - "loss": 0.2604, + "epoch": 2.680289761055249, + "grad_norm": 0.19405324757099152, + "learning_rate": 2.3311104262314563e-05, + "loss": 0.3689, "step": 74370 }, { - "epoch": 2.62, - "learning_rate": 2.43388888610057e-05, - "loss": 0.2687, + "epoch": 2.680469960716474, + "grad_norm": 0.19997498393058777, + "learning_rate": 2.3308192806665034e-05, + "loss": 0.3886, "step": 74375 }, { - "epoch": 2.62, - "learning_rate": 2.4336041140233058e-05, - "loss": 0.2694, + "epoch": 2.6806501603776987, + "grad_norm": 0.22032709419727325, + "learning_rate": 2.3305281374066057e-05, + "loss": 0.4064, "step": 74380 }, { - "epoch": 2.62, - "learning_rate": 2.4333193428081483e-05, - "loss": 0.2787, + "epoch": 2.6808303600389234, + "grad_norm": 0.25573626160621643, + "learning_rate": 2.3302369964557292e-05, + "loss": 0.3774, "step": 74385 }, { - "epoch": 2.62, - "learning_rate": 2.4330345724587937e-05, - "loss": 0.2734, + "epoch": 2.6810105597001477, + "grad_norm": 0.23341487348079681, + "learning_rate": 2.3299458578178403e-05, + "loss": 0.3918, "step": 74390 }, { - "epoch": 2.62, - "learning_rate": 2.4327498029789422e-05, - "loss": 0.2889, + "epoch": 2.6811907593613724, + "grad_norm": 0.19004227221012115, + "learning_rate": 2.3296547214969066e-05, + "loss": 0.4126, "step": 74395 }, { - "epoch": 2.62, - "learning_rate": 2.4324650343722905e-05, - "loss": 0.2638, + "epoch": 2.681370959022597, + "grad_norm": 0.18891394138336182, + "learning_rate": 2.3293635874968954e-05, + 
"loss": 0.3896, "step": 74400 }, { - "epoch": 2.62, - "learning_rate": 2.4321802666425357e-05, - "loss": 0.2766, + "epoch": 2.6815511586838214, + "grad_norm": 0.20582923293113708, + "learning_rate": 2.32907245582177e-05, + "loss": 0.433, "step": 74405 }, { - "epoch": 2.62, - "learning_rate": 2.4318954997933743e-05, - "loss": 0.2545, + "epoch": 2.681731358345046, + "grad_norm": 0.22365623712539673, + "learning_rate": 2.328781326475501e-05, + "loss": 0.3804, "step": 74410 }, { - "epoch": 2.62, - "learning_rate": 2.4316107338285062e-05, - "loss": 0.2671, + "epoch": 2.681911558006271, + "grad_norm": 0.2613740563392639, + "learning_rate": 2.328490199462052e-05, + "loss": 0.3879, "step": 74415 }, { - "epoch": 2.62, - "learning_rate": 2.431325968751627e-05, - "loss": 0.2798, + "epoch": 2.6820917576674956, + "grad_norm": 0.16647237539291382, + "learning_rate": 2.3281990747853925e-05, + "loss": 0.3906, "step": 74420 }, { - "epoch": 2.62, - "learning_rate": 2.431041204566436e-05, - "loss": 0.2563, + "epoch": 2.6822719573287204, + "grad_norm": 0.21178102493286133, + "learning_rate": 2.3279079524494864e-05, + "loss": 0.4273, "step": 74425 }, { - "epoch": 2.62, - "learning_rate": 2.4307564412766276e-05, - "loss": 0.2845, + "epoch": 2.682452156989945, + "grad_norm": 0.22928111255168915, + "learning_rate": 2.327616832458301e-05, + "loss": 0.3926, "step": 74430 }, { - "epoch": 2.62, - "learning_rate": 2.4304716788859024e-05, - "loss": 0.2695, + "epoch": 2.6826323566511694, + "grad_norm": 0.29340752959251404, + "learning_rate": 2.327325714815803e-05, + "loss": 0.3821, "step": 74435 }, { - "epoch": 2.62, - "learning_rate": 2.4301869173979568e-05, - "loss": 0.2853, + "epoch": 2.682812556312394, + "grad_norm": 0.22144730389118195, + "learning_rate": 2.3270345995259586e-05, + "loss": 0.4207, "step": 74440 }, { - "epoch": 2.62, - "learning_rate": 2.4299021568164876e-05, - "loss": 0.2705, + "epoch": 2.682992755973619, + "grad_norm": 0.2856822907924652, + "learning_rate": 2.3267434865927345e-05, + "loss": 0.3741, "step": 74445 }, { - "epoch": 2.62, - "learning_rate": 2.429617397145193e-05, - "loss": 0.2638, + "epoch": 2.683172955634843, + "grad_norm": 0.16166545450687408, + "learning_rate": 2.326452376020097e-05, + "loss": 0.3838, "step": 74450 }, { - "epoch": 2.62, - "learning_rate": 2.4293326383877694e-05, - "loss": 0.2754, + "epoch": 2.683353155296068, + "grad_norm": 0.21917329728603363, + "learning_rate": 2.3261612678120118e-05, + "loss": 0.4113, "step": 74455 }, { - "epoch": 2.62, - "learning_rate": 2.4290478805479158e-05, - "loss": 0.2483, + "epoch": 2.6835333549572926, + "grad_norm": 0.25646519660949707, + "learning_rate": 2.325870161972446e-05, + "loss": 0.4198, "step": 74460 }, { - "epoch": 2.62, - "learning_rate": 2.4287631236293287e-05, - "loss": 0.25, + "epoch": 2.6837135546185173, + "grad_norm": 0.18003955483436584, + "learning_rate": 2.3255790585053654e-05, + "loss": 0.4351, "step": 74465 }, { - "epoch": 2.62, - "learning_rate": 2.4284783676357044e-05, - "loss": 0.2723, + "epoch": 2.683893754279742, + "grad_norm": 0.2252659797668457, + "learning_rate": 2.3252879574147363e-05, + "loss": 0.3775, "step": 74470 }, { - "epoch": 2.62, - "learning_rate": 2.4281936125707418e-05, - "loss": 0.2838, + "epoch": 2.684073953940967, + "grad_norm": 0.21669524908065796, + "learning_rate": 2.3249968587045253e-05, + "loss": 0.4371, "step": 74475 }, { - "epoch": 2.62, - "learning_rate": 2.4279088584381386e-05, - "loss": 0.2648, + "epoch": 2.684254153602191, + "grad_norm": 0.2279970496892929, + "learning_rate": 
2.3247057623786974e-05, + "loss": 0.4017, "step": 74480 }, { - "epoch": 2.62, - "learning_rate": 2.4276241052415898e-05, - "loss": 0.2948, + "epoch": 2.684434353263416, + "grad_norm": 0.1842222362756729, + "learning_rate": 2.3244146684412205e-05, + "loss": 0.3811, "step": 74485 }, { - "epoch": 2.62, - "learning_rate": 2.427339352984796e-05, - "loss": 0.2717, + "epoch": 2.6846145529246406, + "grad_norm": 0.24382725358009338, + "learning_rate": 2.3241235768960595e-05, + "loss": 0.423, "step": 74490 }, { - "epoch": 2.62, - "learning_rate": 2.4270546016714525e-05, - "loss": 0.2738, + "epoch": 2.684794752585865, + "grad_norm": 0.24431829154491425, + "learning_rate": 2.3238324877471804e-05, + "loss": 0.3774, "step": 74495 }, { - "epoch": 2.62, - "learning_rate": 2.4267698513052565e-05, - "loss": 0.2835, + "epoch": 2.6849749522470896, + "grad_norm": 0.22015175223350525, + "learning_rate": 2.3235414009985498e-05, + "loss": 0.4201, "step": 74500 }, { - "epoch": 2.62, - "eval_loss": 0.2619902491569519, - "eval_runtime": 10.556, - "eval_samples_per_second": 9.473, - "eval_steps_per_second": 9.473, + "epoch": 2.6849749522470896, + "eval_loss": 0.4314359724521637, + "eval_runtime": 3.5281, + "eval_samples_per_second": 28.344, + "eval_steps_per_second": 7.086, "step": 74500 }, { - "epoch": 2.62, - "learning_rate": 2.4264851018899058e-05, - "loss": 0.2681, + "epoch": 2.6851551519083143, + "grad_norm": 0.19529901444911957, + "learning_rate": 2.323250316654134e-05, + "loss": 0.4097, "step": 74505 }, { - "epoch": 2.62, - "learning_rate": 2.4262003534290972e-05, - "loss": 0.254, + "epoch": 2.685335351569539, + "grad_norm": 0.1773121953010559, + "learning_rate": 2.322959234717897e-05, + "loss": 0.4016, "step": 74510 }, { - "epoch": 2.62, - "learning_rate": 2.4259156059265292e-05, - "loss": 0.2792, + "epoch": 2.685515551230764, + "grad_norm": 0.24559174478054047, + "learning_rate": 2.322668155193808e-05, + "loss": 0.3864, "step": 74515 }, { - "epoch": 2.62, - "learning_rate": 2.4256308593858982e-05, - "loss": 0.2778, + "epoch": 2.6856957508919885, + "grad_norm": 0.214651420712471, + "learning_rate": 2.322377078085829e-05, + "loss": 0.4201, "step": 74520 }, { - "epoch": 2.62, - "learning_rate": 2.4253461138109004e-05, - "loss": 0.2737, + "epoch": 2.685875950553213, + "grad_norm": 0.19618816673755646, + "learning_rate": 2.3220860033979296e-05, + "loss": 0.4003, "step": 74525 }, { - "epoch": 2.62, - "learning_rate": 2.4250613692052352e-05, - "loss": 0.2618, + "epoch": 2.6860561502144376, + "grad_norm": 0.22414274513721466, + "learning_rate": 2.3217949311340733e-05, + "loss": 0.3972, "step": 74530 }, { - "epoch": 2.62, - "learning_rate": 2.424776625572598e-05, - "loss": 0.2901, + "epoch": 2.6862363498756623, + "grad_norm": 0.1811375916004181, + "learning_rate": 2.3215038612982265e-05, + "loss": 0.4129, "step": 74535 }, { - "epoch": 2.62, - "learning_rate": 2.4244918829166876e-05, - "loss": 0.275, + "epoch": 2.686416549536887, + "grad_norm": 0.21368905901908875, + "learning_rate": 2.3212127938943552e-05, + "loss": 0.3923, "step": 74540 }, { - "epoch": 2.62, - "learning_rate": 2.4242071412411994e-05, - "loss": 0.2533, + "epoch": 2.6865967491981113, + "grad_norm": 0.2091556340456009, + "learning_rate": 2.320921728926425e-05, + "loss": 0.3873, "step": 74545 }, { - "epoch": 2.62, - "learning_rate": 2.423922400549832e-05, - "loss": 0.2717, + "epoch": 2.686776948859336, + "grad_norm": 0.246855229139328, + "learning_rate": 2.3206306663984013e-05, + "loss": 0.4176, "step": 74550 }, { - "epoch": 2.62, - "learning_rate": 
2.423637660846282e-05, - "loss": 0.2673, + "epoch": 2.686957148520561, + "grad_norm": 0.20908492803573608, + "learning_rate": 2.3203396063142503e-05, + "loss": 0.3964, "step": 74555 }, { - "epoch": 2.62, - "learning_rate": 2.4233529221342473e-05, - "loss": 0.2636, + "epoch": 2.6871373481817855, + "grad_norm": 0.21159091591835022, + "learning_rate": 2.3200485486779367e-05, + "loss": 0.3901, "step": 74560 }, { - "epoch": 2.62, - "learning_rate": 2.4230681844174228e-05, - "loss": 0.2511, + "epoch": 2.6873175478430102, + "grad_norm": 0.20965726673603058, + "learning_rate": 2.3197574934934274e-05, + "loss": 0.3859, "step": 74565 }, { - "epoch": 2.62, - "learning_rate": 2.4227834476995086e-05, - "loss": 0.2544, + "epoch": 2.6874977475042345, + "grad_norm": 0.2037397176027298, + "learning_rate": 2.3194664407646876e-05, + "loss": 0.3793, "step": 74570 }, { - "epoch": 2.62, - "learning_rate": 2.4224987119842006e-05, - "loss": 0.276, + "epoch": 2.6876779471654593, + "grad_norm": 0.21732820570468903, + "learning_rate": 2.319175390495682e-05, + "loss": 0.4485, "step": 74575 }, { - "epoch": 2.62, - "learning_rate": 2.4222139772751952e-05, - "loss": 0.2681, + "epoch": 2.687858146826684, + "grad_norm": 0.19209305942058563, + "learning_rate": 2.3188843426903774e-05, + "loss": 0.3827, "step": 74580 }, { - "epoch": 2.62, - "learning_rate": 2.4219292435761895e-05, - "loss": 0.2883, + "epoch": 2.6880383464879087, + "grad_norm": 0.2674555480480194, + "learning_rate": 2.3185932973527386e-05, + "loss": 0.4225, "step": 74585 }, { - "epoch": 2.62, - "learning_rate": 2.4216445108908817e-05, - "loss": 0.2474, + "epoch": 2.688218546149133, + "grad_norm": 0.1785704642534256, + "learning_rate": 2.3183022544867298e-05, + "loss": 0.4047, "step": 74590 }, { - "epoch": 2.62, - "learning_rate": 2.4213597792229685e-05, - "loss": 0.2678, + "epoch": 2.6883987458103578, + "grad_norm": 0.18283165991306305, + "learning_rate": 2.318011214096319e-05, + "loss": 0.3662, "step": 74595 }, { - "epoch": 2.62, - "learning_rate": 2.4210750485761457e-05, - "loss": 0.271, + "epoch": 2.6885789454715825, + "grad_norm": 0.22287532687187195, + "learning_rate": 2.3177201761854686e-05, + "loss": 0.4041, "step": 74600 }, { - "epoch": 2.62, - "learning_rate": 2.4207903189541126e-05, - "loss": 0.2604, + "epoch": 2.6887591451328072, + "grad_norm": 0.2034824639558792, + "learning_rate": 2.317429140758147e-05, + "loss": 0.3986, "step": 74605 }, { - "epoch": 2.62, - "learning_rate": 2.420505590360565e-05, - "loss": 0.2676, + "epoch": 2.688939344794032, + "grad_norm": 0.23879247903823853, + "learning_rate": 2.317138107818318e-05, + "loss": 0.3647, "step": 74610 }, { - "epoch": 2.63, - "learning_rate": 2.4202208627991994e-05, - "loss": 0.2623, + "epoch": 2.6891195444552567, + "grad_norm": 0.2760174572467804, + "learning_rate": 2.3168470773699452e-05, + "loss": 0.3891, "step": 74615 }, { - "epoch": 2.63, - "learning_rate": 2.4199361362737127e-05, - "loss": 0.2597, + "epoch": 2.689299744116481, + "grad_norm": 0.20346973836421967, + "learning_rate": 2.3165560494169973e-05, + "loss": 0.4036, "step": 74620 }, { - "epoch": 2.63, - "learning_rate": 2.4196514107878038e-05, - "loss": 0.254, + "epoch": 2.6894799437777057, + "grad_norm": 0.27176687121391296, + "learning_rate": 2.3162650239634363e-05, + "loss": 0.4122, "step": 74625 }, { - "epoch": 2.63, - "learning_rate": 2.419366686345168e-05, - "loss": 0.2658, + "epoch": 2.6896601434389305, + "grad_norm": 0.232636496424675, + "learning_rate": 2.3159740010132304e-05, + "loss": 0.4164, "step": 74630 }, { - "epoch": 
2.63, - "learning_rate": 2.419081962949503e-05, - "loss": 0.2685, + "epoch": 2.6898403431001547, + "grad_norm": 0.1852894425392151, + "learning_rate": 2.3156829805703422e-05, + "loss": 0.4268, "step": 74635 }, { - "epoch": 2.63, - "learning_rate": 2.418797240604504e-05, - "loss": 0.2688, + "epoch": 2.6900205427613795, + "grad_norm": 0.1811005026102066, + "learning_rate": 2.3153919626387374e-05, + "loss": 0.4163, "step": 74640 }, { - "epoch": 2.63, - "learning_rate": 2.41851251931387e-05, - "loss": 0.2555, + "epoch": 2.690200742422604, + "grad_norm": 0.22391211986541748, + "learning_rate": 2.315100947222382e-05, + "loss": 0.3755, "step": 74645 }, { - "epoch": 2.63, - "learning_rate": 2.4182277990812976e-05, - "loss": 0.2566, + "epoch": 2.690380942083829, + "grad_norm": 0.19471041858196259, + "learning_rate": 2.31480993432524e-05, + "loss": 0.3779, "step": 74650 }, { - "epoch": 2.63, - "learning_rate": 2.4179430799104834e-05, - "loss": 0.2696, + "epoch": 2.6905611417450537, + "grad_norm": 0.21801535785198212, + "learning_rate": 2.3145189239512765e-05, + "loss": 0.3664, "step": 74655 }, { - "epoch": 2.63, - "learning_rate": 2.4176583618051236e-05, - "loss": 0.2613, + "epoch": 2.6907413414062784, + "grad_norm": 0.21162468194961548, + "learning_rate": 2.3142279161044575e-05, + "loss": 0.3548, "step": 74660 }, { - "epoch": 2.63, - "learning_rate": 2.4173736447689165e-05, - "loss": 0.2532, + "epoch": 2.6909215410675027, + "grad_norm": 0.20754042267799377, + "learning_rate": 2.3139369107887467e-05, + "loss": 0.3827, "step": 74665 }, { - "epoch": 2.63, - "learning_rate": 2.4170889288055572e-05, - "loss": 0.2818, + "epoch": 2.6911017407287274, + "grad_norm": 0.175797700881958, + "learning_rate": 2.3136459080081096e-05, + "loss": 0.3674, "step": 74670 }, { - "epoch": 2.63, - "learning_rate": 2.4168042139187447e-05, - "loss": 0.2913, + "epoch": 2.691281940389952, + "grad_norm": 0.19099979102611542, + "learning_rate": 2.3133549077665114e-05, + "loss": 0.411, "step": 74675 }, { - "epoch": 2.63, - "learning_rate": 2.4165195001121732e-05, - "loss": 0.2811, + "epoch": 2.6914621400511765, + "grad_norm": 0.20772726833820343, + "learning_rate": 2.313063910067915e-05, + "loss": 0.3753, "step": 74680 }, { - "epoch": 2.63, - "learning_rate": 2.416234787389542e-05, - "loss": 0.2603, + "epoch": 2.691642339712401, + "grad_norm": 0.24013733863830566, + "learning_rate": 2.312772914916288e-05, + "loss": 0.3797, "step": 74685 }, { - "epoch": 2.63, - "learning_rate": 2.4159500757545473e-05, - "loss": 0.2629, + "epoch": 2.691822539373626, + "grad_norm": 0.191719189286232, + "learning_rate": 2.312481922315594e-05, + "loss": 0.3634, "step": 74690 }, { - "epoch": 2.63, - "learning_rate": 2.4156653652108847e-05, - "loss": 0.2686, + "epoch": 2.6920027390348507, + "grad_norm": 0.1952929049730301, + "learning_rate": 2.312190932269796e-05, + "loss": 0.374, "step": 74695 }, { - "epoch": 2.63, - "learning_rate": 2.415380655762251e-05, - "loss": 0.2839, + "epoch": 2.6921829386960754, + "grad_norm": 0.19985808432102203, + "learning_rate": 2.3118999447828617e-05, + "loss": 0.3761, "step": 74700 }, { - "epoch": 2.63, - "learning_rate": 2.4150959474123454e-05, - "loss": 0.2787, + "epoch": 2.6923631383573, + "grad_norm": 0.22037459909915924, + "learning_rate": 2.311608959858753e-05, + "loss": 0.3537, "step": 74705 }, { - "epoch": 2.63, - "learning_rate": 2.4148112401648624e-05, - "loss": 0.2747, + "epoch": 2.6925433380185244, + "grad_norm": 0.25286975502967834, + "learning_rate": 2.311317977501437e-05, + "loss": 0.3598, "step": 74710 }, 
{ - "epoch": 2.63, - "learning_rate": 2.4145265340234983e-05, - "loss": 0.2692, + "epoch": 2.692723537679749, + "grad_norm": 0.21672144532203674, + "learning_rate": 2.3110269977148765e-05, + "loss": 0.3825, "step": 74715 }, { - "epoch": 2.63, - "learning_rate": 2.414241828991952e-05, - "loss": 0.2783, + "epoch": 2.692903737340974, + "grad_norm": 0.2222987562417984, + "learning_rate": 2.310736020503036e-05, + "loss": 0.3665, "step": 74720 }, { - "epoch": 2.63, - "learning_rate": 2.4139571250739183e-05, - "loss": 0.2737, + "epoch": 2.693083937002198, + "grad_norm": 0.1981443464756012, + "learning_rate": 2.3104450458698816e-05, + "loss": 0.4095, "step": 74725 }, { - "epoch": 2.63, - "learning_rate": 2.4136724222730957e-05, - "loss": 0.2638, + "epoch": 2.693264136663423, + "grad_norm": 0.19497603178024292, + "learning_rate": 2.3101540738193762e-05, + "loss": 0.4022, "step": 74730 }, { - "epoch": 2.63, - "learning_rate": 2.413387720593178e-05, - "loss": 0.2488, + "epoch": 2.6934443363246476, + "grad_norm": 0.20269820094108582, + "learning_rate": 2.3098631043554845e-05, + "loss": 0.3861, "step": 74735 }, { - "epoch": 2.63, - "learning_rate": 2.4131030200378653e-05, - "loss": 0.2688, + "epoch": 2.6936245359858724, + "grad_norm": 0.20779010653495789, + "learning_rate": 2.3095721374821716e-05, + "loss": 0.3832, "step": 74740 }, { - "epoch": 2.63, - "learning_rate": 2.4128183206108524e-05, - "loss": 0.2412, + "epoch": 2.693804735647097, + "grad_norm": 0.18731550872325897, + "learning_rate": 2.309281173203401e-05, + "loss": 0.3904, "step": 74745 }, { - "epoch": 2.63, - "learning_rate": 2.4125336223158358e-05, - "loss": 0.2759, + "epoch": 2.693984935308322, + "grad_norm": 0.2547980844974518, + "learning_rate": 2.3089902115231378e-05, + "loss": 0.3837, "step": 74750 }, { - "epoch": 2.63, - "learning_rate": 2.4122489251565117e-05, - "loss": 0.302, + "epoch": 2.694165134969546, + "grad_norm": 0.20064884424209595, + "learning_rate": 2.3086992524453462e-05, + "loss": 0.3908, "step": 74755 }, { - "epoch": 2.63, - "learning_rate": 2.4119642291365785e-05, - "loss": 0.2656, + "epoch": 2.694345334630771, + "grad_norm": 0.1967686116695404, + "learning_rate": 2.30840829597399e-05, + "loss": 0.423, "step": 74760 }, { - "epoch": 2.63, - "learning_rate": 2.411679534259732e-05, - "loss": 0.2767, + "epoch": 2.6945255342919956, + "grad_norm": 0.21911115944385529, + "learning_rate": 2.3081173421130336e-05, + "loss": 0.4434, "step": 74765 }, { - "epoch": 2.63, - "learning_rate": 2.4113948405296684e-05, - "loss": 0.2715, + "epoch": 2.6947057339532203, + "grad_norm": 0.20522983372211456, + "learning_rate": 2.307826390866442e-05, + "loss": 0.403, "step": 74770 }, { - "epoch": 2.63, - "learning_rate": 2.4111101479500835e-05, - "loss": 0.2581, + "epoch": 2.6948859336144446, + "grad_norm": 0.19616533815860748, + "learning_rate": 2.307535442238177e-05, + "loss": 0.3686, "step": 74775 }, { - "epoch": 2.63, - "learning_rate": 2.410825456524675e-05, - "loss": 0.2648, + "epoch": 2.6950661332756694, + "grad_norm": 0.2152736634016037, + "learning_rate": 2.3072444962322056e-05, + "loss": 0.4022, "step": 74780 }, { - "epoch": 2.63, - "learning_rate": 2.41054076625714e-05, - "loss": 0.2657, + "epoch": 2.695246332936894, + "grad_norm": 0.17979800701141357, + "learning_rate": 2.3069535528524902e-05, + "loss": 0.3845, "step": 74785 }, { - "epoch": 2.63, - "learning_rate": 2.410256077151174e-05, - "loss": 0.2561, + "epoch": 2.695426532598119, + "grad_norm": 0.2075699120759964, + "learning_rate": 2.3066626121029954e-05, + "loss": 0.4106, 
"step": 74790 }, { - "epoch": 2.63, - "learning_rate": 2.4099713892104727e-05, - "loss": 0.2544, + "epoch": 2.6956067322593436, + "grad_norm": 0.1615006923675537, + "learning_rate": 2.306371673987686e-05, + "loss": 0.3906, "step": 74795 }, { - "epoch": 2.63, - "learning_rate": 2.4096867024387345e-05, - "loss": 0.2728, + "epoch": 2.695786931920568, + "grad_norm": 0.2201845496892929, + "learning_rate": 2.306080738510523e-05, + "loss": 0.3822, "step": 74800 }, { - "epoch": 2.63, - "learning_rate": 2.4094020168396548e-05, - "loss": 0.2933, + "epoch": 2.6959671315817926, + "grad_norm": 0.17273396253585815, + "learning_rate": 2.3057898056754744e-05, + "loss": 0.401, "step": 74805 }, { - "epoch": 2.63, - "learning_rate": 2.4091173324169304e-05, - "loss": 0.2865, + "epoch": 2.6961473312430173, + "grad_norm": 0.1817549616098404, + "learning_rate": 2.3054988754865015e-05, + "loss": 0.3613, "step": 74810 }, { - "epoch": 2.63, - "learning_rate": 2.4088326491742567e-05, - "loss": 0.2505, + "epoch": 2.696327530904242, + "grad_norm": 0.18268698453903198, + "learning_rate": 2.3052079479475683e-05, + "loss": 0.4106, "step": 74815 }, { - "epoch": 2.63, - "learning_rate": 2.408547967115332e-05, - "loss": 0.2515, + "epoch": 2.6965077305654663, + "grad_norm": 0.22823894023895264, + "learning_rate": 2.3049170230626395e-05, + "loss": 0.3843, "step": 74820 }, { - "epoch": 2.63, - "learning_rate": 2.4082632862438514e-05, - "loss": 0.2667, + "epoch": 2.696687930226691, + "grad_norm": 0.22048458456993103, + "learning_rate": 2.304626100835678e-05, + "loss": 0.4166, "step": 74825 }, { - "epoch": 2.63, - "learning_rate": 2.4079786065635107e-05, - "loss": 0.2822, + "epoch": 2.696868129887916, + "grad_norm": 0.22666330635547638, + "learning_rate": 2.3043351812706486e-05, + "loss": 0.3873, "step": 74830 }, { - "epoch": 2.63, - "learning_rate": 2.4076939280780075e-05, - "loss": 0.261, + "epoch": 2.6970483295491405, + "grad_norm": 0.22461573779582977, + "learning_rate": 2.3040442643715142e-05, + "loss": 0.4052, "step": 74835 }, { - "epoch": 2.63, - "learning_rate": 2.4074092507910386e-05, - "loss": 0.2563, + "epoch": 2.6972285292103653, + "grad_norm": 0.2268482744693756, + "learning_rate": 2.3037533501422384e-05, + "loss": 0.4011, "step": 74840 }, { - "epoch": 2.63, - "learning_rate": 2.4071245747062996e-05, - "loss": 0.2818, + "epoch": 2.6974087288715896, + "grad_norm": 0.2219397872686386, + "learning_rate": 2.303462438586786e-05, + "loss": 0.387, "step": 74845 }, { - "epoch": 2.63, - "learning_rate": 2.4068398998274854e-05, - "loss": 0.2844, + "epoch": 2.6975889285328143, + "grad_norm": 0.21480488777160645, + "learning_rate": 2.3031715297091188e-05, + "loss": 0.4225, "step": 74850 }, { - "epoch": 2.63, - "learning_rate": 2.4065552261582952e-05, - "loss": 0.2628, + "epoch": 2.697769128194039, + "grad_norm": 0.1800355166196823, + "learning_rate": 2.302880623513202e-05, + "loss": 0.3738, "step": 74855 }, { - "epoch": 2.63, - "learning_rate": 2.406270553702423e-05, - "loss": 0.2968, + "epoch": 2.6979493278552638, + "grad_norm": 0.2048688679933548, + "learning_rate": 2.302589720002999e-05, + "loss": 0.3953, "step": 74860 }, { - "epoch": 2.63, - "learning_rate": 2.4059858824635663e-05, - "loss": 0.263, + "epoch": 2.698129527516488, + "grad_norm": 0.19619779288768768, + "learning_rate": 2.302298819182472e-05, + "loss": 0.3967, "step": 74865 }, { - "epoch": 2.63, - "learning_rate": 2.4057012124454205e-05, - "loss": 0.2644, + "epoch": 2.698309727177713, + "grad_norm": 0.22370363771915436, + "learning_rate": 2.302007921055586e-05, 
+ "loss": 0.4012, "step": 74870 }, { - "epoch": 2.63, - "learning_rate": 2.4054165436516825e-05, - "loss": 0.256, + "epoch": 2.6984899268389375, + "grad_norm": 0.2369144707918167, + "learning_rate": 2.301717025626304e-05, + "loss": 0.3758, "step": 74875 }, { - "epoch": 2.63, - "learning_rate": 2.4051318760860488e-05, - "loss": 0.2898, + "epoch": 2.6986701265001622, + "grad_norm": 0.18146106600761414, + "learning_rate": 2.301426132898588e-05, + "loss": 0.4064, "step": 74880 }, { - "epoch": 2.63, - "learning_rate": 2.4048472097522148e-05, - "loss": 0.2598, + "epoch": 2.698850326161387, + "grad_norm": 0.2013833373785019, + "learning_rate": 2.301135242876404e-05, + "loss": 0.3753, "step": 74885 }, { - "epoch": 2.63, - "learning_rate": 2.404562544653877e-05, - "loss": 0.2734, + "epoch": 2.6990305258226117, + "grad_norm": 0.18798790872097015, + "learning_rate": 2.3008443555637116e-05, + "loss": 0.4047, "step": 74890 }, { - "epoch": 2.64, - "learning_rate": 2.4042778807947313e-05, - "loss": 0.2798, + "epoch": 2.699210725483836, + "grad_norm": 0.23676003515720367, + "learning_rate": 2.3005534709644784e-05, + "loss": 0.3704, "step": 74895 }, { - "epoch": 2.64, - "learning_rate": 2.4039932181784754e-05, - "loss": 0.272, + "epoch": 2.6993909251450607, + "grad_norm": 0.20541468262672424, + "learning_rate": 2.300262589082665e-05, + "loss": 0.3796, "step": 74900 }, { - "epoch": 2.64, - "learning_rate": 2.403708556808804e-05, - "loss": 0.2886, + "epoch": 2.6995711248062855, + "grad_norm": 0.2497328221797943, + "learning_rate": 2.299971709922234e-05, + "loss": 0.3954, "step": 74905 }, { - "epoch": 2.64, - "learning_rate": 2.403423896689412e-05, - "loss": 0.2677, + "epoch": 2.6997513244675098, + "grad_norm": 0.2306727170944214, + "learning_rate": 2.2996808334871513e-05, + "loss": 0.4071, "step": 74910 }, { - "epoch": 2.64, - "learning_rate": 2.403139237823998e-05, - "loss": 0.2875, + "epoch": 2.6999315241287345, + "grad_norm": 0.2239367663860321, + "learning_rate": 2.2993899597813778e-05, + "loss": 0.3672, "step": 74915 }, { - "epoch": 2.64, - "learning_rate": 2.4028545802162578e-05, - "loss": 0.2836, + "epoch": 2.7001117237899592, + "grad_norm": 0.19781452417373657, + "learning_rate": 2.299099088808877e-05, + "loss": 0.3691, "step": 74920 }, { - "epoch": 2.64, - "learning_rate": 2.402569923869887e-05, - "loss": 0.2768, + "epoch": 2.700291923451184, + "grad_norm": 0.21767933666706085, + "learning_rate": 2.298808220573613e-05, + "loss": 0.4065, "step": 74925 }, { - "epoch": 2.64, - "learning_rate": 2.40228526878858e-05, - "loss": 0.3029, + "epoch": 2.7004721231124087, + "grad_norm": 0.18852004408836365, + "learning_rate": 2.298517355079547e-05, + "loss": 0.3676, "step": 74930 }, { - "epoch": 2.64, - "learning_rate": 2.4020006149760355e-05, - "loss": 0.26, + "epoch": 2.7006523227736334, + "grad_norm": 0.21407105028629303, + "learning_rate": 2.2982264923306435e-05, + "loss": 0.4, "step": 74935 }, { - "epoch": 2.64, - "learning_rate": 2.401715962435948e-05, - "loss": 0.2837, + "epoch": 2.7008325224348577, + "grad_norm": 0.1654965728521347, + "learning_rate": 2.2979356323308653e-05, + "loss": 0.4009, "step": 74940 }, { - "epoch": 2.64, - "learning_rate": 2.401431311172014e-05, - "loss": 0.2707, + "epoch": 2.7010127220960825, + "grad_norm": 0.1940935105085373, + "learning_rate": 2.2976447750841742e-05, + "loss": 0.379, "step": 74945 }, { - "epoch": 2.64, - "learning_rate": 2.4011466611879297e-05, - "loss": 0.2681, + "epoch": 2.701192921757307, + "grad_norm": 0.21042941510677338, + "learning_rate": 
2.2973539205945347e-05, + "loss": 0.437, "step": 74950 }, { - "epoch": 2.64, - "learning_rate": 2.400862012487391e-05, - "loss": 0.2783, + "epoch": 2.7013731214185315, + "grad_norm": 0.26631516218185425, + "learning_rate": 2.2970630688659086e-05, + "loss": 0.4402, "step": 74955 }, { - "epoch": 2.64, - "learning_rate": 2.400577365074094e-05, - "loss": 0.2826, + "epoch": 2.701553321079756, + "grad_norm": 0.23533588647842407, + "learning_rate": 2.2967722199022585e-05, + "loss": 0.3937, "step": 74960 }, { - "epoch": 2.64, - "learning_rate": 2.400292718951733e-05, - "loss": 0.2472, + "epoch": 2.701733520740981, + "grad_norm": 0.22266995906829834, + "learning_rate": 2.296481373707548e-05, + "loss": 0.3798, "step": 74965 }, { - "epoch": 2.64, - "learning_rate": 2.4000080741240066e-05, - "loss": 0.2662, + "epoch": 2.7019137204022057, + "grad_norm": 0.22777952253818512, + "learning_rate": 2.2961905302857387e-05, + "loss": 0.4028, "step": 74970 }, { - "epoch": 2.64, - "learning_rate": 2.399723430594609e-05, - "loss": 0.2496, + "epoch": 2.7020939200634304, + "grad_norm": 0.2449411302804947, + "learning_rate": 2.295899689640795e-05, + "loss": 0.3865, "step": 74975 }, { - "epoch": 2.64, - "learning_rate": 2.399438788367237e-05, - "loss": 0.2745, + "epoch": 2.702274119724655, + "grad_norm": 0.188294917345047, + "learning_rate": 2.2956088517766784e-05, + "loss": 0.4135, "step": 74980 }, { - "epoch": 2.64, - "learning_rate": 2.399154147445585e-05, - "loss": 0.2454, + "epoch": 2.7024543193858794, + "grad_norm": 0.19857686758041382, + "learning_rate": 2.2953180166973505e-05, + "loss": 0.3972, "step": 74985 }, { - "epoch": 2.64, - "learning_rate": 2.398869507833351e-05, - "loss": 0.2886, + "epoch": 2.702634519047104, + "grad_norm": 0.2035861313343048, + "learning_rate": 2.295027184406776e-05, + "loss": 0.3869, "step": 74990 }, { - "epoch": 2.64, - "learning_rate": 2.3985848695342294e-05, - "loss": 0.2567, + "epoch": 2.702814718708329, + "grad_norm": 0.20147527754306793, + "learning_rate": 2.2947363549089164e-05, + "loss": 0.3817, "step": 74995 }, { - "epoch": 2.64, - "learning_rate": 2.398300232551917e-05, - "loss": 0.2774, + "epoch": 2.702994918369553, + "grad_norm": 0.18223348259925842, + "learning_rate": 2.2944455282077337e-05, + "loss": 0.4237, "step": 75000 }, { - "epoch": 2.64, - "eval_loss": 0.2608758807182312, - "eval_runtime": 10.5423, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 2.702994918369553, + "eval_loss": 0.43115875124931335, + "eval_runtime": 3.5382, + "eval_samples_per_second": 28.263, + "eval_steps_per_second": 7.066, "step": 75000 }, { - "epoch": 2.64, - "learning_rate": 2.3980155968901078e-05, - "loss": 0.2513, + "epoch": 2.703175118030778, + "grad_norm": 0.1930094212293625, + "learning_rate": 2.2941547043071916e-05, + "loss": 0.3965, "step": 75005 }, { - "epoch": 2.64, - "learning_rate": 2.3977309625524998e-05, - "loss": 0.2618, + "epoch": 2.7033553176920027, + "grad_norm": 0.1886611133813858, + "learning_rate": 2.2938638832112507e-05, + "loss": 0.4005, "step": 75010 }, { - "epoch": 2.64, - "learning_rate": 2.397446329542788e-05, - "loss": 0.2612, + "epoch": 2.7035355173532274, + "grad_norm": 0.2501041293144226, + "learning_rate": 2.2935730649238753e-05, + "loss": 0.4165, "step": 75015 }, { - "epoch": 2.64, - "learning_rate": 2.3971616978646678e-05, - "loss": 0.2762, + "epoch": 2.703715717014452, + "grad_norm": 0.2163308709859848, + "learning_rate": 2.293282249449027e-05, + "loss": 0.4063, "step": 75020 }, { - "epoch": 2.64, - "learning_rate": 
2.396877067521835e-05, - "loss": 0.2579, + "epoch": 2.703895916675677, + "grad_norm": 0.20869512856006622, + "learning_rate": 2.2929914367906674e-05, + "loss": 0.4237, "step": 75025 }, { - "epoch": 2.64, - "learning_rate": 2.3965924385179853e-05, - "loss": 0.2517, + "epoch": 2.704076116336901, + "grad_norm": 0.22830967605113983, + "learning_rate": 2.2927006269527597e-05, + "loss": 0.3773, "step": 75030 }, { - "epoch": 2.64, - "learning_rate": 2.396307810856815e-05, - "loss": 0.2847, + "epoch": 2.704256315998126, + "grad_norm": 0.2236100733280182, + "learning_rate": 2.2924098199392658e-05, + "loss": 0.4216, "step": 75035 }, { - "epoch": 2.64, - "learning_rate": 2.39602318454202e-05, - "loss": 0.2795, + "epoch": 2.7044365156593506, + "grad_norm": 0.20641185343265533, + "learning_rate": 2.292119015754148e-05, + "loss": 0.3819, "step": 75040 }, { - "epoch": 2.64, - "learning_rate": 2.395738559577294e-05, - "loss": 0.2711, + "epoch": 2.7046167153205753, + "grad_norm": 0.2077999860048294, + "learning_rate": 2.2918282144013685e-05, + "loss": 0.4211, "step": 75045 }, { - "epoch": 2.64, - "learning_rate": 2.395453935966335e-05, - "loss": 0.2744, + "epoch": 2.7047969149817996, + "grad_norm": 0.21933238208293915, + "learning_rate": 2.291537415884889e-05, + "loss": 0.3865, "step": 75050 }, { - "epoch": 2.64, - "learning_rate": 2.3951693137128377e-05, - "loss": 0.2652, + "epoch": 2.7049771146430244, + "grad_norm": 0.1914396733045578, + "learning_rate": 2.291246620208672e-05, + "loss": 0.3647, "step": 75055 }, { - "epoch": 2.64, - "learning_rate": 2.394884692820497e-05, - "loss": 0.2746, + "epoch": 2.705157314304249, + "grad_norm": 0.24134762585163116, + "learning_rate": 2.2909558273766802e-05, + "loss": 0.3848, "step": 75060 }, { - "epoch": 2.64, - "learning_rate": 2.3946000732930105e-05, - "loss": 0.2683, + "epoch": 2.705337513965474, + "grad_norm": 0.22134195268154144, + "learning_rate": 2.290665037392873e-05, + "loss": 0.3949, "step": 75065 }, { - "epoch": 2.64, - "learning_rate": 2.3943154551340724e-05, - "loss": 0.2474, + "epoch": 2.7055177136266986, + "grad_norm": 0.17406292259693146, + "learning_rate": 2.2903742502612153e-05, + "loss": 0.3943, "step": 75070 }, { - "epoch": 2.64, - "learning_rate": 2.3940308383473788e-05, - "loss": 0.2929, + "epoch": 2.705697913287923, + "grad_norm": 0.20366698503494263, + "learning_rate": 2.2900834659856673e-05, + "loss": 0.3911, "step": 75075 }, { - "epoch": 2.64, - "learning_rate": 2.3937462229366234e-05, - "loss": 0.2724, + "epoch": 2.7058781129491476, + "grad_norm": 0.1711958944797516, + "learning_rate": 2.289792684570192e-05, + "loss": 0.3821, "step": 75080 }, { - "epoch": 2.64, - "learning_rate": 2.3934616089055038e-05, - "loss": 0.2569, + "epoch": 2.7060583126103723, + "grad_norm": 0.21541287004947662, + "learning_rate": 2.289501906018751e-05, + "loss": 0.4022, "step": 75085 }, { - "epoch": 2.64, - "learning_rate": 2.393176996257716e-05, - "loss": 0.2704, + "epoch": 2.706238512271597, + "grad_norm": 0.2087329775094986, + "learning_rate": 2.2892111303353046e-05, + "loss": 0.3882, "step": 75090 }, { - "epoch": 2.64, - "learning_rate": 2.3928923849969543e-05, - "loss": 0.2602, + "epoch": 2.7064187119328214, + "grad_norm": 0.2614741325378418, + "learning_rate": 2.288920357523817e-05, + "loss": 0.3778, "step": 75095 }, { - "epoch": 2.64, - "learning_rate": 2.392607775126913e-05, - "loss": 0.2738, + "epoch": 2.706598911594046, + "grad_norm": 0.286384642124176, + "learning_rate": 2.2886295875882484e-05, + "loss": 0.4149, "step": 75100 }, { - "epoch": 2.64, - 
"learning_rate": 2.3923231666512906e-05, - "loss": 0.2869, + "epoch": 2.706779111255271, + "grad_norm": 0.22412830591201782, + "learning_rate": 2.28833882053256e-05, + "loss": 0.4088, "step": 75105 }, { - "epoch": 2.64, - "learning_rate": 2.39203855957378e-05, - "loss": 0.2515, + "epoch": 2.7069593109164956, + "grad_norm": 0.1880389153957367, + "learning_rate": 2.2880480563607145e-05, + "loss": 0.4153, "step": 75110 }, { - "epoch": 2.64, - "learning_rate": 2.391753953898078e-05, - "loss": 0.2623, + "epoch": 2.7071395105777203, + "grad_norm": 0.19842107594013214, + "learning_rate": 2.287757295076673e-05, + "loss": 0.3912, "step": 75115 }, { - "epoch": 2.64, - "learning_rate": 2.3914693496278787e-05, - "loss": 0.2683, + "epoch": 2.707319710238945, + "grad_norm": 0.20793603360652924, + "learning_rate": 2.2874665366843977e-05, + "loss": 0.4447, "step": 75120 }, { - "epoch": 2.64, - "learning_rate": 2.3911847467668793e-05, - "loss": 0.2926, + "epoch": 2.7074999099001693, + "grad_norm": 0.18394573032855988, + "learning_rate": 2.2871757811878497e-05, + "loss": 0.3807, "step": 75125 }, { - "epoch": 2.64, - "learning_rate": 2.3909001453187747e-05, - "loss": 0.2708, + "epoch": 2.707680109561394, + "grad_norm": 0.2318859100341797, + "learning_rate": 2.2868850285909897e-05, + "loss": 0.407, "step": 75130 }, { - "epoch": 2.64, - "learning_rate": 2.3906155452872588e-05, - "loss": 0.2827, + "epoch": 2.7078603092226188, + "grad_norm": 0.19001048803329468, + "learning_rate": 2.286594278897781e-05, + "loss": 0.3766, "step": 75135 }, { - "epoch": 2.64, - "learning_rate": 2.3903309466760274e-05, - "loss": 0.3036, + "epoch": 2.708040508883843, + "grad_norm": 0.2057991772890091, + "learning_rate": 2.2863035321121836e-05, + "loss": 0.3665, "step": 75140 }, { - "epoch": 2.64, - "learning_rate": 2.3900463494887776e-05, - "loss": 0.2739, + "epoch": 2.708220708545068, + "grad_norm": 0.23362144827842712, + "learning_rate": 2.286012788238159e-05, + "loss": 0.3889, "step": 75145 }, { - "epoch": 2.64, - "learning_rate": 2.3897617537292036e-05, - "loss": 0.2748, + "epoch": 2.7084009082062925, + "grad_norm": 0.1938500553369522, + "learning_rate": 2.2857220472796688e-05, + "loss": 0.4234, "step": 75150 }, { - "epoch": 2.64, - "learning_rate": 2.3894771594010004e-05, - "loss": 0.2815, + "epoch": 2.7085811078675173, + "grad_norm": 0.17589519917964935, + "learning_rate": 2.285431309240674e-05, + "loss": 0.3794, "step": 75155 }, { - "epoch": 2.64, - "learning_rate": 2.3891925665078624e-05, - "loss": 0.2412, + "epoch": 2.708761307528742, + "grad_norm": 0.2224626988172531, + "learning_rate": 2.285140574125136e-05, + "loss": 0.3915, "step": 75160 }, { - "epoch": 2.64, - "learning_rate": 2.3889079750534865e-05, - "loss": 0.2747, + "epoch": 2.7089415071899667, + "grad_norm": 0.17219646275043488, + "learning_rate": 2.2848498419370174e-05, + "loss": 0.4047, "step": 75165 }, { - "epoch": 2.64, - "learning_rate": 2.388623385041568e-05, - "loss": 0.2702, + "epoch": 2.709121706851191, + "grad_norm": 0.26903530955314636, + "learning_rate": 2.284559112680276e-05, + "loss": 0.3869, "step": 75170 }, { - "epoch": 2.64, - "learning_rate": 2.3883387964758e-05, - "loss": 0.2612, + "epoch": 2.7093019065124158, + "grad_norm": 0.18370231986045837, + "learning_rate": 2.2842683863588766e-05, + "loss": 0.3703, "step": 75175 }, { - "epoch": 2.65, - "learning_rate": 2.3880542093598805e-05, - "loss": 0.2415, + "epoch": 2.7094821061736405, + "grad_norm": 0.20962488651275635, + "learning_rate": 2.2839776629767785e-05, + "loss": 0.3904, "step": 75180 }, { - 
"epoch": 2.65, - "learning_rate": 2.3877696236975034e-05, - "loss": 0.268, + "epoch": 2.709662305834865, + "grad_norm": 0.2684512734413147, + "learning_rate": 2.283686942537942e-05, + "loss": 0.3828, "step": 75185 }, { - "epoch": 2.65, - "learning_rate": 2.387485039492363e-05, - "loss": 0.2636, + "epoch": 2.7098425054960895, + "grad_norm": 0.20460106432437897, + "learning_rate": 2.2833962250463293e-05, + "loss": 0.4142, "step": 75190 }, { - "epoch": 2.65, - "learning_rate": 2.387200456748155e-05, - "loss": 0.2544, + "epoch": 2.7100227051573142, + "grad_norm": 0.21397659182548523, + "learning_rate": 2.2831055105059007e-05, + "loss": 0.3773, "step": 75195 }, { - "epoch": 2.65, - "learning_rate": 2.3869158754685753e-05, - "loss": 0.2749, + "epoch": 2.710202904818539, + "grad_norm": 0.20380306243896484, + "learning_rate": 2.282814798920619e-05, + "loss": 0.3858, "step": 75200 }, { - "epoch": 2.65, - "learning_rate": 2.3866312956573188e-05, - "loss": 0.2571, + "epoch": 2.7103831044797637, + "grad_norm": 0.2668623626232147, + "learning_rate": 2.282524090294443e-05, + "loss": 0.3665, "step": 75205 }, { - "epoch": 2.65, - "learning_rate": 2.38634671731808e-05, - "loss": 0.2724, + "epoch": 2.7105633041409884, + "grad_norm": 0.1861308068037033, + "learning_rate": 2.2822333846313332e-05, + "loss": 0.3891, "step": 75210 }, { - "epoch": 2.65, - "learning_rate": 2.386062140454553e-05, - "loss": 0.2596, + "epoch": 2.7107435038022127, + "grad_norm": 0.25733551383018494, + "learning_rate": 2.2819426819352525e-05, + "loss": 0.3773, "step": 75215 }, { - "epoch": 2.65, - "learning_rate": 2.385777565070435e-05, - "loss": 0.2694, + "epoch": 2.7109237034634375, + "grad_norm": 0.19054418802261353, + "learning_rate": 2.2816519822101596e-05, + "loss": 0.3619, "step": 75220 }, { - "epoch": 2.65, - "learning_rate": 2.3854929911694203e-05, - "loss": 0.2482, + "epoch": 2.711103903124662, + "grad_norm": 0.2656010687351227, + "learning_rate": 2.281361285460017e-05, + "loss": 0.3848, "step": 75225 }, { - "epoch": 2.65, - "learning_rate": 2.385208418755204e-05, - "loss": 0.2841, + "epoch": 2.7112841027858865, + "grad_norm": 0.21391278505325317, + "learning_rate": 2.281070591688784e-05, + "loss": 0.3942, "step": 75230 }, { - "epoch": 2.65, - "learning_rate": 2.384923847831479e-05, - "loss": 0.2664, + "epoch": 2.7114643024471112, + "grad_norm": 0.20552653074264526, + "learning_rate": 2.280779900900422e-05, + "loss": 0.3606, "step": 75235 }, { - "epoch": 2.65, - "learning_rate": 2.3846392784019436e-05, - "loss": 0.2859, + "epoch": 2.711644502108336, + "grad_norm": 0.2136029154062271, + "learning_rate": 2.2804892130988916e-05, + "loss": 0.4099, "step": 75240 }, { - "epoch": 2.65, - "learning_rate": 2.3843547104702903e-05, - "loss": 0.2856, + "epoch": 2.7118247017695607, + "grad_norm": 0.20137493312358856, + "learning_rate": 2.2801985282881532e-05, + "loss": 0.3833, "step": 75245 }, { - "epoch": 2.65, - "learning_rate": 2.3840701440402156e-05, - "loss": 0.2696, + "epoch": 2.7120049014307854, + "grad_norm": 0.17652560770511627, + "learning_rate": 2.279907846472167e-05, + "loss": 0.3546, "step": 75250 }, { - "epoch": 2.65, - "learning_rate": 2.3837855791154127e-05, - "loss": 0.2782, + "epoch": 2.71218510109201, + "grad_norm": 0.2107645869255066, + "learning_rate": 2.279617167654894e-05, + "loss": 0.3693, "step": 75255 }, { - "epoch": 2.65, - "learning_rate": 2.3835010156995783e-05, - "loss": 0.2685, + "epoch": 2.7123653007532345, + "grad_norm": 0.17879261076450348, + "learning_rate": 2.279326491840294e-05, + "loss": 0.4006, 
"step": 75260 }, { - "epoch": 2.65, - "learning_rate": 2.3832164537964066e-05, - "loss": 0.2724, + "epoch": 2.712545500414459, + "grad_norm": 0.1885097324848175, + "learning_rate": 2.279035819032328e-05, + "loss": 0.4, "step": 75265 }, { - "epoch": 2.65, - "learning_rate": 2.3829318934095918e-05, - "loss": 0.2717, + "epoch": 2.712725700075684, + "grad_norm": 0.19376368820667267, + "learning_rate": 2.2787451492349574e-05, + "loss": 0.3943, "step": 75270 }, { - "epoch": 2.65, - "learning_rate": 2.3826473345428285e-05, - "loss": 0.2621, + "epoch": 2.7129058997369087, + "grad_norm": 0.22324879467487335, + "learning_rate": 2.2784544824521392e-05, + "loss": 0.4092, "step": 75275 }, { - "epoch": 2.65, - "learning_rate": 2.3823627771998134e-05, - "loss": 0.2779, + "epoch": 2.713086099398133, + "grad_norm": 0.18715398013591766, + "learning_rate": 2.2781638186878375e-05, + "loss": 0.4188, "step": 75280 }, { - "epoch": 2.65, - "learning_rate": 2.38207822138424e-05, - "loss": 0.2559, + "epoch": 2.7132662990593577, + "grad_norm": 0.24565525352954865, + "learning_rate": 2.2778731579460105e-05, + "loss": 0.4029, "step": 75285 }, { - "epoch": 2.65, - "learning_rate": 2.381793667099802e-05, - "loss": 0.2871, + "epoch": 2.7134464987205824, + "grad_norm": 0.28024908900260925, + "learning_rate": 2.277582500230618e-05, + "loss": 0.3778, "step": 75290 }, { - "epoch": 2.65, - "learning_rate": 2.3815091143501968e-05, - "loss": 0.2464, + "epoch": 2.713626698381807, + "grad_norm": 0.1823398321866989, + "learning_rate": 2.2772918455456215e-05, + "loss": 0.3794, "step": 75295 }, { - "epoch": 2.65, - "learning_rate": 2.3812245631391168e-05, - "loss": 0.266, + "epoch": 2.713806898043032, + "grad_norm": 0.22716470062732697, + "learning_rate": 2.27700119389498e-05, + "loss": 0.3845, "step": 75300 }, { - "epoch": 2.65, - "learning_rate": 2.3809400134702582e-05, - "loss": 0.2678, + "epoch": 2.713987097704256, + "grad_norm": 0.21729101240634918, + "learning_rate": 2.2767105452826542e-05, + "loss": 0.3898, "step": 75305 }, { - "epoch": 2.65, - "learning_rate": 2.380655465347314e-05, - "loss": 0.244, + "epoch": 2.714167297365481, + "grad_norm": 0.22123655676841736, + "learning_rate": 2.2764198997126043e-05, + "loss": 0.3791, "step": 75310 }, { - "epoch": 2.65, - "learning_rate": 2.380370918773981e-05, - "loss": 0.2402, + "epoch": 2.7143474970267056, + "grad_norm": 0.222514346241951, + "learning_rate": 2.2761292571887894e-05, + "loss": 0.3756, "step": 75315 }, { - "epoch": 2.65, - "learning_rate": 2.380086373753953e-05, - "loss": 0.2624, + "epoch": 2.7145276966879304, + "grad_norm": 0.23015157878398895, + "learning_rate": 2.2758386177151707e-05, + "loss": 0.3883, "step": 75320 }, { - "epoch": 2.65, - "learning_rate": 2.3798018302909235e-05, - "loss": 0.265, + "epoch": 2.7147078963491547, + "grad_norm": 0.23218706250190735, + "learning_rate": 2.2755479812957074e-05, + "loss": 0.3997, "step": 75325 }, { - "epoch": 2.65, - "learning_rate": 2.379517288388588e-05, - "loss": 0.2735, + "epoch": 2.7148880960103794, + "grad_norm": 0.22764332592487335, + "learning_rate": 2.2752573479343588e-05, + "loss": 0.3802, "step": 75330 }, { - "epoch": 2.65, - "learning_rate": 2.3792327480506412e-05, - "loss": 0.2528, + "epoch": 2.715068295671604, + "grad_norm": 0.21155443787574768, + "learning_rate": 2.2749667176350857e-05, + "loss": 0.4231, "step": 75335 }, { - "epoch": 2.65, - "learning_rate": 2.3789482092807784e-05, - "loss": 0.2571, + "epoch": 2.715248495332829, + "grad_norm": 0.19925571978092194, + "learning_rate": 2.2746760904018472e-05, + 
"loss": 0.3896, "step": 75340 }, { - "epoch": 2.65, - "learning_rate": 2.3786636720826934e-05, - "loss": 0.2701, + "epoch": 2.7154286949940536, + "grad_norm": 0.22160978615283966, + "learning_rate": 2.2743854662386035e-05, + "loss": 0.3815, "step": 75345 }, { - "epoch": 2.65, - "learning_rate": 2.378379136460079e-05, - "loss": 0.2734, + "epoch": 2.715608894655278, + "grad_norm": 0.22966960072517395, + "learning_rate": 2.2740948451493148e-05, + "loss": 0.3712, "step": 75350 }, { - "epoch": 2.65, - "learning_rate": 2.378094602416632e-05, - "loss": 0.2922, + "epoch": 2.7157890943165026, + "grad_norm": 0.1914483606815338, + "learning_rate": 2.2738042271379388e-05, + "loss": 0.3998, "step": 75355 }, { - "epoch": 2.65, - "learning_rate": 2.3778100699560474e-05, - "loss": 0.2715, + "epoch": 2.7159692939777274, + "grad_norm": 0.22551338374614716, + "learning_rate": 2.2735136122084374e-05, + "loss": 0.3848, "step": 75360 }, { - "epoch": 2.65, - "learning_rate": 2.377525539082018e-05, - "loss": 0.2577, + "epoch": 2.716149493638952, + "grad_norm": 0.2782670259475708, + "learning_rate": 2.2732230003647698e-05, + "loss": 0.4082, "step": 75365 }, { - "epoch": 2.65, - "learning_rate": 2.377241009798238e-05, - "loss": 0.2751, + "epoch": 2.7163296933001764, + "grad_norm": 0.21808646619319916, + "learning_rate": 2.272932391610893e-05, + "loss": 0.3989, "step": 75370 }, { - "epoch": 2.65, - "learning_rate": 2.376956482108403e-05, - "loss": 0.2759, + "epoch": 2.716509892961401, + "grad_norm": 0.22317920625209808, + "learning_rate": 2.2726417859507703e-05, + "loss": 0.4206, "step": 75375 }, { - "epoch": 2.65, - "learning_rate": 2.376671956016207e-05, - "loss": 0.2581, + "epoch": 2.716690092622626, + "grad_norm": 0.19897177815437317, + "learning_rate": 2.2723511833883574e-05, + "loss": 0.3832, "step": 75380 }, { - "epoch": 2.65, - "learning_rate": 2.376387431525345e-05, - "loss": 0.2553, + "epoch": 2.7168702922838506, + "grad_norm": 0.20371364057064056, + "learning_rate": 2.2720605839276173e-05, + "loss": 0.4227, "step": 75385 }, { - "epoch": 2.65, - "learning_rate": 2.3761029086395095e-05, - "loss": 0.2709, + "epoch": 2.7170504919450753, + "grad_norm": 0.2331681102514267, + "learning_rate": 2.2717699875725072e-05, + "loss": 0.3592, "step": 75390 }, { - "epoch": 2.65, - "learning_rate": 2.375818387362397e-05, - "loss": 0.2801, + "epoch": 2.7172306916063, + "grad_norm": 0.2340395599603653, + "learning_rate": 2.271479394326986e-05, + "loss": 0.4067, "step": 75395 }, { - "epoch": 2.65, - "learning_rate": 2.3755338676977012e-05, - "loss": 0.2996, + "epoch": 2.7174108912675243, + "grad_norm": 0.2065713256597519, + "learning_rate": 2.2711888041950143e-05, + "loss": 0.3794, "step": 75400 }, { - "epoch": 2.65, - "learning_rate": 2.3752493496491148e-05, - "loss": 0.2742, + "epoch": 2.717591090928749, + "grad_norm": 0.19182537496089935, + "learning_rate": 2.2708982171805512e-05, + "loss": 0.388, "step": 75405 }, { - "epoch": 2.65, - "learning_rate": 2.3749648332203346e-05, - "loss": 0.257, + "epoch": 2.717771290589974, + "grad_norm": 0.2015332728624344, + "learning_rate": 2.2706076332875546e-05, + "loss": 0.3948, "step": 75410 }, { - "epoch": 2.65, - "learning_rate": 2.374680318415053e-05, - "loss": 0.278, + "epoch": 2.717951490251198, + "grad_norm": 0.2081226408481598, + "learning_rate": 2.2703170525199856e-05, + "loss": 0.3549, "step": 75415 }, { - "epoch": 2.65, - "learning_rate": 2.3743958052369652e-05, - "loss": 0.2875, + "epoch": 2.718131689912423, + "grad_norm": 0.23630069196224213, + "learning_rate": 
2.2700264748818015e-05, + "loss": 0.4129, "step": 75420 }, { - "epoch": 2.65, - "learning_rate": 2.3741112936897646e-05, - "loss": 0.2619, + "epoch": 2.7183118895736476, + "grad_norm": 0.20923583209514618, + "learning_rate": 2.2697359003769627e-05, + "loss": 0.4091, "step": 75425 }, { - "epoch": 2.65, - "learning_rate": 2.3738267837771464e-05, - "loss": 0.2756, + "epoch": 2.7184920892348723, + "grad_norm": 0.2418445646762848, + "learning_rate": 2.2694453290094276e-05, + "loss": 0.4168, "step": 75430 }, { - "epoch": 2.65, - "learning_rate": 2.373542275502804e-05, - "loss": 0.2495, + "epoch": 2.718672288896097, + "grad_norm": 0.20278342068195343, + "learning_rate": 2.2691547607831547e-05, + "loss": 0.4111, "step": 75435 }, { - "epoch": 2.65, - "learning_rate": 2.3732577688704323e-05, - "loss": 0.2631, + "epoch": 2.7188524885573218, + "grad_norm": 0.2284698486328125, + "learning_rate": 2.2688641957021043e-05, + "loss": 0.3705, "step": 75440 }, { - "epoch": 2.65, - "learning_rate": 2.3729732638837238e-05, - "loss": 0.2677, + "epoch": 2.719032688218546, + "grad_norm": 0.21057669818401337, + "learning_rate": 2.2685736337702336e-05, + "loss": 0.3889, "step": 75445 }, { - "epoch": 2.65, - "learning_rate": 2.3726887605463748e-05, - "loss": 0.2637, + "epoch": 2.719212887879771, + "grad_norm": 0.23061637580394745, + "learning_rate": 2.268283074991503e-05, + "loss": 0.412, "step": 75450 }, { - "epoch": 2.65, - "learning_rate": 2.3724042588620786e-05, - "loss": 0.2702, + "epoch": 2.7193930875409955, + "grad_norm": 0.1986396461725235, + "learning_rate": 2.2679925193698713e-05, + "loss": 0.3599, "step": 75455 }, { - "epoch": 2.65, - "learning_rate": 2.3721197588345285e-05, - "loss": 0.2665, + "epoch": 2.71957328720222, + "grad_norm": 0.20687183737754822, + "learning_rate": 2.267701966909295e-05, + "loss": 0.3999, "step": 75460 }, { - "epoch": 2.66, - "learning_rate": 2.3718352604674187e-05, - "loss": 0.2869, + "epoch": 2.7197534868634445, + "grad_norm": 0.22581927478313446, + "learning_rate": 2.267411417613736e-05, + "loss": 0.3888, "step": 75465 }, { - "epoch": 2.66, - "learning_rate": 2.3715507637644437e-05, - "loss": 0.2535, + "epoch": 2.7199336865246693, + "grad_norm": 0.1779545694589615, + "learning_rate": 2.2671208714871507e-05, + "loss": 0.3675, "step": 75470 }, { - "epoch": 2.66, - "learning_rate": 2.371266268729298e-05, - "loss": 0.2708, + "epoch": 2.720113886185894, + "grad_norm": 0.19196881353855133, + "learning_rate": 2.2668303285334974e-05, + "loss": 0.3927, "step": 75475 }, { - "epoch": 2.66, - "learning_rate": 2.3709817753656748e-05, - "loss": 0.27, + "epoch": 2.7202940858471187, + "grad_norm": 0.21891996264457703, + "learning_rate": 2.2665397887567374e-05, + "loss": 0.3826, "step": 75480 }, { - "epoch": 2.66, - "learning_rate": 2.3706972836772673e-05, - "loss": 0.2495, + "epoch": 2.7204742855083435, + "grad_norm": 0.2428896129131317, + "learning_rate": 2.2662492521608263e-05, + "loss": 0.3847, "step": 75485 }, { - "epoch": 2.66, - "learning_rate": 2.3704127936677713e-05, - "loss": 0.2591, + "epoch": 2.7206544851695678, + "grad_norm": 0.21766485273838043, + "learning_rate": 2.2659587187497248e-05, + "loss": 0.408, "step": 75490 }, { - "epoch": 2.66, - "learning_rate": 2.3701283053408795e-05, - "loss": 0.2735, + "epoch": 2.7208346848307925, + "grad_norm": 0.26046082377433777, + "learning_rate": 2.2656681885273907e-05, + "loss": 0.3949, "step": 75495 }, { - "epoch": 2.66, - "learning_rate": 2.3698438187002863e-05, - "loss": 0.2667, + "epoch": 2.7210148844920172, + "grad_norm": 
0.2007281333208084, + "learning_rate": 2.2653776614977813e-05, + "loss": 0.3955, "step": 75500 }, { - "epoch": 2.66, - "eval_loss": 0.2609610855579376, - "eval_runtime": 10.5588, - "eval_samples_per_second": 9.471, - "eval_steps_per_second": 9.471, + "epoch": 2.7210148844920172, + "eval_loss": 0.4320552349090576, + "eval_runtime": 3.5362, + "eval_samples_per_second": 28.279, + "eval_steps_per_second": 7.07, "step": 75500 }, { - "epoch": 2.66, - "learning_rate": 2.3695593337496847e-05, - "loss": 0.2506, + "epoch": 2.7211950841532415, + "grad_norm": 0.29919010400772095, + "learning_rate": 2.265087137664856e-05, + "loss": 0.3914, "step": 75505 }, { - "epoch": 2.66, - "learning_rate": 2.36927485049277e-05, - "loss": 0.2631, + "epoch": 2.7213752838144663, + "grad_norm": 0.24532043933868408, + "learning_rate": 2.2647966170325733e-05, + "loss": 0.4182, "step": 75510 }, { - "epoch": 2.66, - "learning_rate": 2.3689903689332353e-05, - "loss": 0.2895, + "epoch": 2.721555483475691, + "grad_norm": 0.2970276176929474, + "learning_rate": 2.2645060996048904e-05, + "loss": 0.3703, "step": 75515 }, { - "epoch": 2.66, - "learning_rate": 2.368705889074773e-05, - "loss": 0.266, + "epoch": 2.7217356831369157, + "grad_norm": 0.19525529444217682, + "learning_rate": 2.2642155853857673e-05, + "loss": 0.4027, "step": 75520 }, { - "epoch": 2.66, - "learning_rate": 2.3684214109210788e-05, - "loss": 0.2753, + "epoch": 2.7219158827981405, + "grad_norm": 0.21934610605239868, + "learning_rate": 2.26392507437916e-05, + "loss": 0.4019, "step": 75525 }, { - "epoch": 2.66, - "learning_rate": 2.3681369344758464e-05, - "loss": 0.2822, + "epoch": 2.722096082459365, + "grad_norm": 0.20460151135921478, + "learning_rate": 2.263634566589029e-05, + "loss": 0.3934, "step": 75530 }, { - "epoch": 2.66, - "learning_rate": 2.367852459742769e-05, - "loss": 0.2874, + "epoch": 2.7222762821205895, + "grad_norm": 0.18550460040569305, + "learning_rate": 2.263344062019331e-05, + "loss": 0.4085, "step": 75535 }, { - "epoch": 2.66, - "learning_rate": 2.367567986725539e-05, - "loss": 0.2457, + "epoch": 2.722456481781814, + "grad_norm": 0.23762153089046478, + "learning_rate": 2.263053560674024e-05, + "loss": 0.4393, "step": 75540 }, { - "epoch": 2.66, - "learning_rate": 2.3672835154278534e-05, - "loss": 0.2878, + "epoch": 2.722636681443039, + "grad_norm": 0.20546315610408783, + "learning_rate": 2.2627630625570666e-05, + "loss": 0.3945, "step": 75545 }, { - "epoch": 2.66, - "learning_rate": 2.3669990458534023e-05, - "loss": 0.2828, + "epoch": 2.7228168811042637, + "grad_norm": 0.22104191780090332, + "learning_rate": 2.2624725676724175e-05, + "loss": 0.3628, "step": 75550 }, { - "epoch": 2.66, - "learning_rate": 2.3667145780058823e-05, - "loss": 0.263, + "epoch": 2.722997080765488, + "grad_norm": 0.21103298664093018, + "learning_rate": 2.2621820760240316e-05, + "loss": 0.3632, "step": 75555 }, { - "epoch": 2.66, - "learning_rate": 2.3664301118889842e-05, - "loss": 0.2772, + "epoch": 2.7231772804267127, + "grad_norm": 0.20641304552555084, + "learning_rate": 2.261891587615871e-05, + "loss": 0.4007, "step": 75560 }, { - "epoch": 2.66, - "learning_rate": 2.3661456475064043e-05, - "loss": 0.2477, + "epoch": 2.7233574800879374, + "grad_norm": 0.19765135645866394, + "learning_rate": 2.261601102451889e-05, + "loss": 0.3494, "step": 75565 }, { - "epoch": 2.66, - "learning_rate": 2.3658611848618346e-05, - "loss": 0.2818, + "epoch": 2.723537679749162, + "grad_norm": 0.22192902863025665, + "learning_rate": 2.261310620536048e-05, + "loss": 0.3931, "step": 75570 }, 
{ - "epoch": 2.66, - "learning_rate": 2.3655767239589687e-05, - "loss": 0.259, + "epoch": 2.723717879410387, + "grad_norm": 0.23835474252700806, + "learning_rate": 2.261020141872303e-05, + "loss": 0.3943, "step": 75575 }, { - "epoch": 2.66, - "learning_rate": 2.3652922648015e-05, - "loss": 0.2561, + "epoch": 2.723898079071611, + "grad_norm": 0.19383686780929565, + "learning_rate": 2.260729666464612e-05, + "loss": 0.3632, "step": 75580 }, { - "epoch": 2.66, - "learning_rate": 2.3650078073931237e-05, - "loss": 0.2811, + "epoch": 2.724078278732836, + "grad_norm": 0.18293000757694244, + "learning_rate": 2.260439194316933e-05, + "loss": 0.3706, "step": 75585 }, { - "epoch": 2.66, - "learning_rate": 2.3647233517375317e-05, - "loss": 0.2783, + "epoch": 2.7242584783940607, + "grad_norm": 0.27218204736709595, + "learning_rate": 2.260148725433224e-05, + "loss": 0.4146, "step": 75590 }, { - "epoch": 2.66, - "learning_rate": 2.364438897838418e-05, - "loss": 0.2699, + "epoch": 2.7244386780552854, + "grad_norm": 0.22335471212863922, + "learning_rate": 2.259858259817441e-05, + "loss": 0.3762, "step": 75595 }, { - "epoch": 2.66, - "learning_rate": 2.3641544456994747e-05, - "loss": 0.2716, + "epoch": 2.7246188777165097, + "grad_norm": 0.19925439357757568, + "learning_rate": 2.259567797473544e-05, + "loss": 0.3917, "step": 75600 }, { - "epoch": 2.66, - "learning_rate": 2.363869995324397e-05, - "loss": 0.2684, + "epoch": 2.7247990773777344, + "grad_norm": 0.18860496580600739, + "learning_rate": 2.2592773384054883e-05, + "loss": 0.3926, "step": 75605 }, { - "epoch": 2.66, - "learning_rate": 2.3635855467168783e-05, - "loss": 0.292, + "epoch": 2.724979277038959, + "grad_norm": 0.23276196420192719, + "learning_rate": 2.258986882617233e-05, + "loss": 0.4083, "step": 75610 }, { - "epoch": 2.66, - "learning_rate": 2.3633010998806113e-05, - "loss": 0.2756, + "epoch": 2.725159476700184, + "grad_norm": 0.21294569969177246, + "learning_rate": 2.258696430112735e-05, + "loss": 0.392, "step": 75615 }, { - "epoch": 2.66, - "learning_rate": 2.363016654819288e-05, - "loss": 0.2436, + "epoch": 2.7253396763614086, + "grad_norm": 0.21455052495002747, + "learning_rate": 2.25840598089595e-05, + "loss": 0.4136, "step": 75620 }, { - "epoch": 2.66, - "learning_rate": 2.362732211536605e-05, - "loss": 0.2806, + "epoch": 2.7255198760226333, + "grad_norm": 0.2718030512332916, + "learning_rate": 2.2581736238923366e-05, + "loss": 0.4269, "step": 75625 }, { - "epoch": 2.66, - "learning_rate": 2.3624477700362526e-05, - "loss": 0.271, + "epoch": 2.7257000756838576, + "grad_norm": 0.21406759321689606, + "learning_rate": 2.2578831806034116e-05, + "loss": 0.3959, "step": 75630 }, { - "epoch": 2.66, - "learning_rate": 2.362163330321925e-05, - "loss": 0.2794, + "epoch": 2.7258802753450824, + "grad_norm": 0.21770311892032623, + "learning_rate": 2.2575927406132795e-05, + "loss": 0.3844, "step": 75635 }, { - "epoch": 2.66, - "learning_rate": 2.3618788923973167e-05, - "loss": 0.2581, + "epoch": 2.726060475006307, + "grad_norm": 0.22125771641731262, + "learning_rate": 2.2573023039259013e-05, + "loss": 0.3554, "step": 75640 }, { - "epoch": 2.66, - "learning_rate": 2.36159445626612e-05, - "loss": 0.2853, + "epoch": 2.7262406746675314, + "grad_norm": 0.2546500861644745, + "learning_rate": 2.2570118705452317e-05, + "loss": 0.4081, "step": 75645 }, { - "epoch": 2.66, - "learning_rate": 2.3613100219320282e-05, - "loss": 0.2975, + "epoch": 2.726420874328756, + "grad_norm": 0.19551582634449005, + "learning_rate": 2.2567214404752273e-05, + "loss": 0.4039, 
"step": 75650 }, { - "epoch": 2.66, - "learning_rate": 2.3610255893987328e-05, - "loss": 0.2612, + "epoch": 2.726601073989981, + "grad_norm": 0.21097378432750702, + "learning_rate": 2.2564310137198474e-05, + "loss": 0.4068, "step": 75655 }, { - "epoch": 2.66, - "learning_rate": 2.3607411586699296e-05, - "loss": 0.2624, + "epoch": 2.7267812736512056, + "grad_norm": 0.20981697738170624, + "learning_rate": 2.2561405902830464e-05, + "loss": 0.3682, "step": 75660 }, { - "epoch": 2.66, - "learning_rate": 2.360456729749311e-05, - "loss": 0.2454, + "epoch": 2.7269614733124303, + "grad_norm": 0.22551362216472626, + "learning_rate": 2.2558501701687847e-05, + "loss": 0.3908, "step": 75665 }, { - "epoch": 2.66, - "learning_rate": 2.36017230264057e-05, - "loss": 0.2618, + "epoch": 2.727141672973655, + "grad_norm": 0.23293738067150116, + "learning_rate": 2.255559753381016e-05, + "loss": 0.3983, "step": 75670 }, { - "epoch": 2.66, - "learning_rate": 2.3598878773473982e-05, - "loss": 0.2853, + "epoch": 2.7273218726348794, + "grad_norm": 0.19451351463794708, + "learning_rate": 2.2552693399236978e-05, + "loss": 0.3788, "step": 75675 }, { - "epoch": 2.66, - "learning_rate": 2.3596034538734908e-05, - "loss": 0.2617, + "epoch": 2.727502072296104, + "grad_norm": 0.22846585512161255, + "learning_rate": 2.2549789298007884e-05, + "loss": 0.3828, "step": 75680 }, { - "epoch": 2.66, - "learning_rate": 2.35931903222254e-05, - "loss": 0.2671, + "epoch": 2.727682271957329, + "grad_norm": 0.20074519515037537, + "learning_rate": 2.2546885230162435e-05, + "loss": 0.3768, "step": 75685 }, { - "epoch": 2.66, - "learning_rate": 2.3590346123982394e-05, - "loss": 0.2818, + "epoch": 2.727862471618553, + "grad_norm": 0.23105072975158691, + "learning_rate": 2.2543981195740194e-05, + "loss": 0.4379, "step": 75690 }, { - "epoch": 2.66, - "learning_rate": 2.35875019440428e-05, - "loss": 0.2639, + "epoch": 2.728042671279778, + "grad_norm": 0.20044337213039398, + "learning_rate": 2.254107719478074e-05, + "loss": 0.3861, "step": 75695 }, { - "epoch": 2.66, - "learning_rate": 2.3584657782443575e-05, - "loss": 0.2507, + "epoch": 2.7282228709410026, + "grad_norm": 0.1897222101688385, + "learning_rate": 2.2538173227323626e-05, + "loss": 0.3739, "step": 75700 }, { - "epoch": 2.66, - "learning_rate": 2.3581813639221637e-05, - "loss": 0.2559, + "epoch": 2.7284030706022273, + "grad_norm": 0.19628849625587463, + "learning_rate": 2.253526929340843e-05, + "loss": 0.3837, "step": 75705 }, { - "epoch": 2.66, - "learning_rate": 2.3578969514413908e-05, - "loss": 0.2618, + "epoch": 2.728583270263452, + "grad_norm": 0.21223433315753937, + "learning_rate": 2.2532365393074715e-05, + "loss": 0.4052, "step": 75710 }, { - "epoch": 2.66, - "learning_rate": 2.3576125408057317e-05, - "loss": 0.2597, + "epoch": 2.7287634699246768, + "grad_norm": 0.19551092386245728, + "learning_rate": 2.2529461526362037e-05, + "loss": 0.4148, "step": 75715 }, { - "epoch": 2.66, - "learning_rate": 2.3573281320188813e-05, - "loss": 0.2762, + "epoch": 2.728943669585901, + "grad_norm": 0.1927630603313446, + "learning_rate": 2.2526557693309974e-05, + "loss": 0.39, "step": 75720 }, { - "epoch": 2.66, - "learning_rate": 2.3570437250845306e-05, - "loss": 0.2716, + "epoch": 2.729123869247126, + "grad_norm": 0.21203365921974182, + "learning_rate": 2.252365389395809e-05, + "loss": 0.4016, "step": 75725 }, { - "epoch": 2.66, - "learning_rate": 2.3567593200063733e-05, - "loss": 0.2634, + "epoch": 2.7293040689083505, + "grad_norm": 0.18629692494869232, + "learning_rate": 
2.252075012834592e-05, + "loss": 0.4215, "step": 75730 }, { - "epoch": 2.66, - "learning_rate": 2.3564749167881007e-05, - "loss": 0.2626, + "epoch": 2.729484268569575, + "grad_norm": 0.24014006555080414, + "learning_rate": 2.251784639651307e-05, + "loss": 0.4002, "step": 75735 }, { - "epoch": 2.66, - "learning_rate": 2.356190515433407e-05, - "loss": 0.2668, + "epoch": 2.7296644682307996, + "grad_norm": 0.17442671954631805, + "learning_rate": 2.2514942698499067e-05, + "loss": 0.4086, "step": 75740 }, { - "epoch": 2.66, - "learning_rate": 2.3559061159459854e-05, - "loss": 0.2505, + "epoch": 2.7298446678920243, + "grad_norm": 0.23989979922771454, + "learning_rate": 2.2512039034343504e-05, + "loss": 0.3988, "step": 75745 }, { - "epoch": 2.67, - "learning_rate": 2.355621718329528e-05, - "loss": 0.2781, + "epoch": 2.730024867553249, + "grad_norm": 0.23518751561641693, + "learning_rate": 2.250913540408592e-05, + "loss": 0.3873, "step": 75750 }, { - "epoch": 2.67, - "learning_rate": 2.3553373225877263e-05, - "loss": 0.2879, + "epoch": 2.7302050672144738, + "grad_norm": 0.23601964116096497, + "learning_rate": 2.2506231807765882e-05, + "loss": 0.3869, "step": 75755 }, { - "epoch": 2.67, - "learning_rate": 2.355052928724275e-05, - "loss": 0.2771, + "epoch": 2.7303852668756985, + "grad_norm": 0.269535630941391, + "learning_rate": 2.2503328245422957e-05, + "loss": 0.3967, "step": 75760 }, { - "epoch": 2.67, - "learning_rate": 2.3547685367428656e-05, - "loss": 0.2718, + "epoch": 2.730565466536923, + "grad_norm": 0.20272400975227356, + "learning_rate": 2.2500424717096702e-05, + "loss": 0.3563, "step": 75765 }, { - "epoch": 2.67, - "learning_rate": 2.354484146647191e-05, - "loss": 0.2845, + "epoch": 2.7307456661981475, + "grad_norm": 0.19777542352676392, + "learning_rate": 2.2497521222826667e-05, + "loss": 0.4028, "step": 75770 }, { - "epoch": 2.67, - "learning_rate": 2.3541997584409438e-05, - "loss": 0.2798, + "epoch": 2.7309258658593722, + "grad_norm": 0.18318623304367065, + "learning_rate": 2.2494617762652433e-05, + "loss": 0.3846, "step": 75775 }, { - "epoch": 2.67, - "learning_rate": 2.353915372127817e-05, - "loss": 0.2676, + "epoch": 2.731106065520597, + "grad_norm": 0.17845581471920013, + "learning_rate": 2.2491714336613534e-05, + "loss": 0.4036, "step": 75780 }, { - "epoch": 2.67, - "learning_rate": 2.3536309877115033e-05, - "loss": 0.2759, + "epoch": 2.7312862651818213, + "grad_norm": 0.20689518749713898, + "learning_rate": 2.248881094474955e-05, + "loss": 0.3367, "step": 75785 }, { - "epoch": 2.67, - "learning_rate": 2.3533466051956938e-05, - "loss": 0.2623, + "epoch": 2.731466464843046, + "grad_norm": 0.22084331512451172, + "learning_rate": 2.2485907587100034e-05, + "loss": 0.3819, "step": 75790 }, { - "epoch": 2.67, - "learning_rate": 2.3530622245840818e-05, - "loss": 0.2667, + "epoch": 2.7316466645042707, + "grad_norm": 0.2153303027153015, + "learning_rate": 2.248300426370453e-05, + "loss": 0.3989, "step": 75795 }, { - "epoch": 2.67, - "learning_rate": 2.352777845880361e-05, - "loss": 0.2681, + "epoch": 2.7318268641654955, + "grad_norm": 0.23561865091323853, + "learning_rate": 2.2480100974602613e-05, + "loss": 0.3838, "step": 75800 }, { - "epoch": 2.67, - "learning_rate": 2.352493469088223e-05, - "loss": 0.2697, + "epoch": 2.73200706382672, + "grad_norm": 0.24491235613822937, + "learning_rate": 2.247719771983384e-05, + "loss": 0.3881, "step": 75805 }, { - "epoch": 2.67, - "learning_rate": 2.3522659690333325e-05, - "loss": 0.2463, + "epoch": 2.7321872634879445, + "grad_norm": 
0.1986062377691269, + "learning_rate": 2.247429449943774e-05, + "loss": 0.3913, "step": 75810 }, { - "epoch": 2.67, - "learning_rate": 2.3519815956913475e-05, - "loss": 0.2921, + "epoch": 2.7323674631491692, + "grad_norm": 0.20403479039669037, + "learning_rate": 2.24713913134539e-05, + "loss": 0.3769, "step": 75815 }, { - "epoch": 2.67, - "learning_rate": 2.3516972242712848e-05, - "loss": 0.2793, + "epoch": 2.732547662810394, + "grad_norm": 0.232151597738266, + "learning_rate": 2.2468488161921858e-05, + "loss": 0.4194, "step": 75820 }, { - "epoch": 2.67, - "learning_rate": 2.3514128547768342e-05, - "loss": 0.2738, + "epoch": 2.7327278624716187, + "grad_norm": 0.19631794095039368, + "learning_rate": 2.2465585044881182e-05, + "loss": 0.4294, "step": 75825 }, { - "epoch": 2.67, - "learning_rate": 2.3511284872116906e-05, - "loss": 0.2991, + "epoch": 2.732908062132843, + "grad_norm": 0.20293517410755157, + "learning_rate": 2.2462681962371424e-05, + "loss": 0.3644, "step": 75830 }, { - "epoch": 2.67, - "learning_rate": 2.350844121579546e-05, - "loss": 0.2796, + "epoch": 2.7330882617940677, + "grad_norm": 0.19923901557922363, + "learning_rate": 2.2459778914432116e-05, + "loss": 0.3929, "step": 75835 }, { - "epoch": 2.67, - "learning_rate": 2.350559757884091e-05, - "loss": 0.2683, + "epoch": 2.7332684614552925, + "grad_norm": 0.24110738933086395, + "learning_rate": 2.2456875901102845e-05, + "loss": 0.3879, "step": 75840 }, { - "epoch": 2.67, - "learning_rate": 2.3502753961290204e-05, - "loss": 0.2594, + "epoch": 2.733448661116517, + "grad_norm": 0.21238499879837036, + "learning_rate": 2.245397292242313e-05, + "loss": 0.3906, "step": 75845 }, { - "epoch": 2.67, - "learning_rate": 2.3499910363180246e-05, - "loss": 0.2886, + "epoch": 2.733628860777742, + "grad_norm": 0.23390400409698486, + "learning_rate": 2.245106997843256e-05, + "loss": 0.4043, "step": 75850 }, { - "epoch": 2.67, - "learning_rate": 2.349706678454796e-05, - "loss": 0.2805, + "epoch": 2.733809060438966, + "grad_norm": 0.17990170419216156, + "learning_rate": 2.244816706917066e-05, + "loss": 0.4187, "step": 75855 }, { - "epoch": 2.67, - "learning_rate": 2.3494223225430266e-05, - "loss": 0.2628, + "epoch": 2.733989260100191, + "grad_norm": 0.20971287786960602, + "learning_rate": 2.244526419467699e-05, + "loss": 0.3834, "step": 75860 }, { - "epoch": 2.67, - "learning_rate": 2.3491379685864105e-05, - "loss": 0.272, + "epoch": 2.7341694597614157, + "grad_norm": 0.21100729703903198, + "learning_rate": 2.24423613549911e-05, + "loss": 0.3752, "step": 75865 }, { - "epoch": 2.67, - "learning_rate": 2.3488536165886384e-05, - "loss": 0.2463, + "epoch": 2.7343496594226404, + "grad_norm": 0.19430191814899445, + "learning_rate": 2.2439458550152544e-05, + "loss": 0.4059, "step": 75870 }, { - "epoch": 2.67, - "learning_rate": 2.348569266553402e-05, - "loss": 0.2638, + "epoch": 2.7345298590838647, + "grad_norm": 0.1902761608362198, + "learning_rate": 2.243655578020086e-05, + "loss": 0.4116, "step": 75875 }, { - "epoch": 2.67, - "learning_rate": 2.348284918484393e-05, - "loss": 0.2661, + "epoch": 2.7347100587450894, + "grad_norm": 0.20415811240673065, + "learning_rate": 2.2433653045175614e-05, + "loss": 0.3747, "step": 75880 }, { - "epoch": 2.67, - "learning_rate": 2.348000572385305e-05, - "loss": 0.2953, + "epoch": 2.734890258406314, + "grad_norm": 0.21149209141731262, + "learning_rate": 2.2430750345116346e-05, + "loss": 0.3851, "step": 75885 }, { - "epoch": 2.67, - "learning_rate": 2.34771622825983e-05, - "loss": 0.2886, + "epoch": 2.735070458067539, 
+ "grad_norm": 0.2101927250623703, + "learning_rate": 2.2427847680062612e-05, + "loss": 0.4146, "step": 75890 }, { - "epoch": 2.67, - "learning_rate": 2.3474318861116594e-05, - "loss": 0.2792, + "epoch": 2.7352506577287636, + "grad_norm": 0.18154318630695343, + "learning_rate": 2.2424945050053954e-05, + "loss": 0.3912, "step": 75895 }, { - "epoch": 2.67, - "learning_rate": 2.347147545944484e-05, - "loss": 0.2792, + "epoch": 2.7354308573899884, + "grad_norm": 0.2463860660791397, + "learning_rate": 2.2422042455129916e-05, + "loss": 0.3867, "step": 75900 }, { - "epoch": 2.67, - "learning_rate": 2.346863207761998e-05, - "loss": 0.2595, + "epoch": 2.7356110570512127, + "grad_norm": 0.23593199253082275, + "learning_rate": 2.2419139895330058e-05, + "loss": 0.4111, "step": 75905 }, { - "epoch": 2.67, - "learning_rate": 2.3465788715678922e-05, - "loss": 0.263, + "epoch": 2.7357912567124374, + "grad_norm": 0.24448728561401367, + "learning_rate": 2.2416237370693922e-05, + "loss": 0.4308, "step": 75910 }, { - "epoch": 2.67, - "learning_rate": 2.3462945373658588e-05, - "loss": 0.2568, + "epoch": 2.735971456373662, + "grad_norm": 0.21843284368515015, + "learning_rate": 2.2413334881261038e-05, + "loss": 0.3489, "step": 75915 }, { - "epoch": 2.67, - "learning_rate": 2.3460102051595885e-05, - "loss": 0.2462, + "epoch": 2.7361516560348864, + "grad_norm": 0.20660178363323212, + "learning_rate": 2.2410432427070975e-05, + "loss": 0.3702, "step": 75920 }, { - "epoch": 2.67, - "learning_rate": 2.345725874952775e-05, - "loss": 0.2756, + "epoch": 2.736331855696111, + "grad_norm": 0.19898898899555206, + "learning_rate": 2.240753000816326e-05, + "loss": 0.3856, "step": 75925 }, { - "epoch": 2.67, - "learning_rate": 2.3454415467491102e-05, - "loss": 0.2813, + "epoch": 2.736512055357336, + "grad_norm": 0.2137184739112854, + "learning_rate": 2.240462762457746e-05, + "loss": 0.3925, "step": 75930 }, { - "epoch": 2.67, - "learning_rate": 2.345157220552284e-05, - "loss": 0.259, + "epoch": 2.7366922550185606, + "grad_norm": 0.21520856022834778, + "learning_rate": 2.2401725276353103e-05, + "loss": 0.4215, "step": 75935 }, { - "epoch": 2.67, - "learning_rate": 2.3448728963659885e-05, - "loss": 0.2697, + "epoch": 2.7368724546797853, + "grad_norm": 0.22829484939575195, + "learning_rate": 2.2398822963529722e-05, + "loss": 0.3858, "step": 75940 }, { - "epoch": 2.67, - "learning_rate": 2.3445885741939175e-05, - "loss": 0.2627, + "epoch": 2.73705265434101, + "grad_norm": 0.19753465056419373, + "learning_rate": 2.2395920686146894e-05, + "loss": 0.418, "step": 75945 }, { - "epoch": 2.67, - "learning_rate": 2.3443042540397615e-05, - "loss": 0.2602, + "epoch": 2.7372328540022344, + "grad_norm": 0.19238927960395813, + "learning_rate": 2.2393018444244132e-05, + "loss": 0.4194, "step": 75950 }, { - "epoch": 2.67, - "learning_rate": 2.344019935907211e-05, - "loss": 0.2678, + "epoch": 2.737413053663459, + "grad_norm": 0.2262929379940033, + "learning_rate": 2.2390116237860985e-05, + "loss": 0.373, "step": 75955 }, { - "epoch": 2.67, - "learning_rate": 2.34373561979996e-05, - "loss": 0.2482, + "epoch": 2.737593253324684, + "grad_norm": 0.1645156592130661, + "learning_rate": 2.238721406703701e-05, + "loss": 0.3634, "step": 75960 }, { - "epoch": 2.67, - "learning_rate": 2.3434513057216984e-05, - "loss": 0.2637, + "epoch": 2.737773452985908, + "grad_norm": 0.20983080565929413, + "learning_rate": 2.2384311931811728e-05, + "loss": 0.3576, "step": 75965 }, { - "epoch": 2.67, - "learning_rate": 2.343166993676119e-05, - "loss": 0.2795, + "epoch": 
2.737953652647133, + "grad_norm": 0.21075214445590973, + "learning_rate": 2.238199024928914e-05, + "loss": 0.4086, "step": 75970 }, { - "epoch": 2.67, - "learning_rate": 2.3428826836669116e-05, - "loss": 0.277, + "epoch": 2.7381338523083576, + "grad_norm": 0.2087734490633011, + "learning_rate": 2.237908817824117e-05, + "loss": 0.428, "step": 75975 }, { - "epoch": 2.67, - "learning_rate": 2.34259837569777e-05, - "loss": 0.246, + "epoch": 2.7383140519695823, + "grad_norm": 0.22745364904403687, + "learning_rate": 2.2376186142902626e-05, + "loss": 0.4003, "step": 75980 }, { - "epoch": 2.67, - "learning_rate": 2.342314069772385e-05, - "loss": 0.2809, + "epoch": 2.738494251630807, + "grad_norm": 0.214681476354599, + "learning_rate": 2.2373284143313015e-05, + "loss": 0.3532, "step": 75985 }, { - "epoch": 2.67, - "learning_rate": 2.3420297658944472e-05, - "loss": 0.267, + "epoch": 2.738674451292032, + "grad_norm": 0.25850626826286316, + "learning_rate": 2.2370382179511915e-05, + "loss": 0.3884, "step": 75990 }, { - "epoch": 2.67, - "learning_rate": 2.341745464067649e-05, - "loss": 0.274, + "epoch": 2.738854650953256, + "grad_norm": 0.2193749099969864, + "learning_rate": 2.2367480251538842e-05, + "loss": 0.3821, "step": 75995 }, { - "epoch": 2.67, - "learning_rate": 2.3414611642956816e-05, - "loss": 0.2866, + "epoch": 2.739034850614481, + "grad_norm": 0.24161560833454132, + "learning_rate": 2.2364578359433345e-05, + "loss": 0.3848, "step": 76000 }, { - "epoch": 2.67, - "eval_loss": 0.2612769603729248, - "eval_runtime": 10.5493, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 2.739034850614481, + "eval_loss": 0.43177616596221924, + "eval_runtime": 3.531, + "eval_samples_per_second": 28.321, + "eval_steps_per_second": 7.08, "step": 76000 }, { - "epoch": 2.67, - "learning_rate": 2.341176866582237e-05, - "loss": 0.2699, + "epoch": 2.7392150502757056, + "grad_norm": 0.21785877645015717, + "learning_rate": 2.2361676503234963e-05, + "loss": 0.3629, "step": 76005 }, { - "epoch": 2.67, - "learning_rate": 2.3408925709310063e-05, - "loss": 0.2581, + "epoch": 2.73939524993693, + "grad_norm": 0.24404390156269073, + "learning_rate": 2.2358774682983213e-05, + "loss": 0.4146, "step": 76010 }, { - "epoch": 2.67, - "learning_rate": 2.3406082773456792e-05, - "loss": 0.2552, + "epoch": 2.7395754495981546, + "grad_norm": 0.20576035976409912, + "learning_rate": 2.235587289871766e-05, + "loss": 0.4005, "step": 76015 }, { - "epoch": 2.67, - "learning_rate": 2.340323985829949e-05, - "loss": 0.259, + "epoch": 2.7397556492593793, + "grad_norm": 0.1887839436531067, + "learning_rate": 2.2352971150477824e-05, + "loss": 0.3787, "step": 76020 }, { - "epoch": 2.67, - "learning_rate": 2.340039696387508e-05, - "loss": 0.2766, + "epoch": 2.739935848920604, + "grad_norm": 0.20635947585105896, + "learning_rate": 2.235006943830324e-05, + "loss": 0.3778, "step": 76025 }, { - "epoch": 2.67, - "learning_rate": 2.3397554090220456e-05, - "loss": 0.2715, + "epoch": 2.740116048581829, + "grad_norm": 0.21008598804473877, + "learning_rate": 2.234716776223345e-05, + "loss": 0.3746, "step": 76030 }, { - "epoch": 2.68, - "learning_rate": 2.3394711237372525e-05, - "loss": 0.2591, + "epoch": 2.7402962482430535, + "grad_norm": 0.2424369752407074, + "learning_rate": 2.234426612230798e-05, + "loss": 0.4026, "step": 76035 }, { - "epoch": 2.68, - "learning_rate": 2.3391868405368218e-05, - "loss": 0.2483, + "epoch": 2.740476447904278, + "grad_norm": 0.2240283191204071, + "learning_rate": 2.2341364518566378e-05, + "loss": 
0.4191, "step": 76040 }, { - "epoch": 2.68, - "learning_rate": 2.338902559424444e-05, - "loss": 0.2635, + "epoch": 2.7406566475655025, + "grad_norm": 0.22698961198329926, + "learning_rate": 2.2338462951048165e-05, + "loss": 0.4108, "step": 76045 }, { - "epoch": 2.68, - "learning_rate": 2.3386182804038108e-05, - "loss": 0.2694, + "epoch": 2.7408368472267273, + "grad_norm": 0.1691308468580246, + "learning_rate": 2.2335561419792876e-05, + "loss": 0.3827, "step": 76050 }, { - "epoch": 2.68, - "learning_rate": 2.3383340034786118e-05, - "loss": 0.2895, + "epoch": 2.741017046887952, + "grad_norm": 0.18437933921813965, + "learning_rate": 2.233265992484005e-05, + "loss": 0.3761, "step": 76055 }, { - "epoch": 2.68, - "learning_rate": 2.33804972865254e-05, - "loss": 0.2687, + "epoch": 2.7411972465491763, + "grad_norm": 0.23850589990615845, + "learning_rate": 2.232975846622922e-05, + "loss": 0.4086, "step": 76060 }, { - "epoch": 2.68, - "learning_rate": 2.3377654559292862e-05, - "loss": 0.279, + "epoch": 2.741377446210401, + "grad_norm": 0.20492541790008545, + "learning_rate": 2.2326857043999906e-05, + "loss": 0.3932, "step": 76065 }, { - "epoch": 2.68, - "learning_rate": 2.3374811853125393e-05, - "loss": 0.275, + "epoch": 2.7415576458716258, + "grad_norm": 0.219580739736557, + "learning_rate": 2.2323955658191653e-05, + "loss": 0.3816, "step": 76070 }, { - "epoch": 2.68, - "learning_rate": 2.3371969168059937e-05, - "loss": 0.2605, + "epoch": 2.7417378455328505, + "grad_norm": 0.23200367391109467, + "learning_rate": 2.232105430884398e-05, + "loss": 0.3838, "step": 76075 }, { - "epoch": 2.68, - "learning_rate": 2.336912650413338e-05, - "loss": 0.2477, + "epoch": 2.7419180451940752, + "grad_norm": 0.24806389212608337, + "learning_rate": 2.231815299599643e-05, + "loss": 0.4068, "step": 76080 }, { - "epoch": 2.68, - "learning_rate": 2.3366283861382648e-05, - "loss": 0.2602, + "epoch": 2.7420982448552995, + "grad_norm": 0.17643065750598907, + "learning_rate": 2.2315251719688536e-05, + "loss": 0.3776, "step": 76085 }, { - "epoch": 2.68, - "learning_rate": 2.3363441239844632e-05, - "loss": 0.2465, + "epoch": 2.7422784445165242, + "grad_norm": 0.2253817468881607, + "learning_rate": 2.231235047995979e-05, + "loss": 0.3983, "step": 76090 }, { - "epoch": 2.68, - "learning_rate": 2.3360598639556268e-05, - "loss": 0.2853, + "epoch": 2.742458644177749, + "grad_norm": 0.24909770488739014, + "learning_rate": 2.2309449276849775e-05, + "loss": 0.4178, "step": 76095 }, { - "epoch": 2.68, - "learning_rate": 2.3357756060554445e-05, - "loss": 0.2683, + "epoch": 2.7426388438389737, + "grad_norm": 0.21218940615653992, + "learning_rate": 2.2306548110397973e-05, + "loss": 0.3915, "step": 76100 }, { - "epoch": 2.68, - "learning_rate": 2.3354913502876072e-05, - "loss": 0.282, + "epoch": 2.742819043500198, + "grad_norm": 0.19259501993656158, + "learning_rate": 2.2303646980643948e-05, + "loss": 0.3977, "step": 76105 }, { - "epoch": 2.68, - "learning_rate": 2.335207096655806e-05, - "loss": 0.2867, + "epoch": 2.7429992431614227, + "grad_norm": 0.22517803311347961, + "learning_rate": 2.2300745887627206e-05, + "loss": 0.383, "step": 76110 }, { - "epoch": 2.68, - "learning_rate": 2.3349228451637327e-05, - "loss": 0.2511, + "epoch": 2.7431794428226475, + "grad_norm": 0.2479083389043808, + "learning_rate": 2.2297844831387265e-05, + "loss": 0.392, "step": 76115 }, { - "epoch": 2.68, - "learning_rate": 2.3346385958150778e-05, - "loss": 0.2747, + "epoch": 2.743359642483872, + "grad_norm": 0.2069377303123474, + "learning_rate": 
2.2294943811963682e-05, + "loss": 0.4116, "step": 76120 }, { - "epoch": 2.68, - "learning_rate": 2.3343543486135314e-05, - "loss": 0.2776, + "epoch": 2.743539842145097, + "grad_norm": 0.21161840856075287, + "learning_rate": 2.2292042829395964e-05, + "loss": 0.4078, "step": 76125 }, { - "epoch": 2.68, - "learning_rate": 2.3340701035627843e-05, - "loss": 0.2435, + "epoch": 2.7437200418063217, + "grad_norm": 0.24700872600078583, + "learning_rate": 2.228914188372363e-05, + "loss": 0.4206, "step": 76130 }, { - "epoch": 2.68, - "learning_rate": 2.333785860666528e-05, - "loss": 0.2765, + "epoch": 2.743900241467546, + "grad_norm": 0.19092321395874023, + "learning_rate": 2.2286240974986218e-05, + "loss": 0.4151, "step": 76135 }, { - "epoch": 2.68, - "learning_rate": 2.3335016199284528e-05, - "loss": 0.2888, + "epoch": 2.7440804411287707, + "grad_norm": 0.20470459759235382, + "learning_rate": 2.2283340103223243e-05, + "loss": 0.4057, "step": 76140 }, { - "epoch": 2.68, - "learning_rate": 2.3332173813522497e-05, - "loss": 0.2604, + "epoch": 2.7442606407899954, + "grad_norm": 0.18111029267311096, + "learning_rate": 2.2280439268474236e-05, + "loss": 0.3995, "step": 76145 }, { - "epoch": 2.68, - "learning_rate": 2.3329331449416077e-05, - "loss": 0.2646, + "epoch": 2.7444408404512197, + "grad_norm": 0.20974625647068024, + "learning_rate": 2.2277538470778722e-05, + "loss": 0.3997, "step": 76150 }, { - "epoch": 2.68, - "learning_rate": 2.3326489107002194e-05, - "loss": 0.2574, + "epoch": 2.7446210401124445, + "grad_norm": 0.24757510423660278, + "learning_rate": 2.227463771017621e-05, + "loss": 0.4023, "step": 76155 }, { - "epoch": 2.68, - "learning_rate": 2.3323646786317748e-05, - "loss": 0.2908, + "epoch": 2.744801239773669, + "grad_norm": 0.2829555869102478, + "learning_rate": 2.227173698670624e-05, + "loss": 0.3991, "step": 76160 }, { - "epoch": 2.68, - "learning_rate": 2.3320804487399647e-05, - "loss": 0.2683, + "epoch": 2.744981439434894, + "grad_norm": 0.1985725462436676, + "learning_rate": 2.2268836300408323e-05, + "loss": 0.4086, "step": 76165 }, { - "epoch": 2.68, - "learning_rate": 2.3317962210284784e-05, - "loss": 0.2552, + "epoch": 2.7451616390961187, + "grad_norm": 0.23659586906433105, + "learning_rate": 2.226593565132198e-05, + "loss": 0.3794, "step": 76170 }, { - "epoch": 2.68, - "learning_rate": 2.331511995501008e-05, - "loss": 0.2676, + "epoch": 2.7453418387573434, + "grad_norm": 0.21936027705669403, + "learning_rate": 2.2263035039486734e-05, + "loss": 0.3971, "step": 76175 }, { - "epoch": 2.68, - "learning_rate": 2.3312277721612434e-05, - "loss": 0.2623, + "epoch": 2.7455220384185677, + "grad_norm": 0.21939417719841003, + "learning_rate": 2.2260134464942108e-05, + "loss": 0.4245, "step": 76180 }, { - "epoch": 2.68, - "learning_rate": 2.330943551012874e-05, - "loss": 0.2738, + "epoch": 2.7457022380797924, + "grad_norm": 0.2699432969093323, + "learning_rate": 2.225723392772762e-05, + "loss": 0.4004, "step": 76185 }, { - "epoch": 2.68, - "learning_rate": 2.3306593320595918e-05, - "loss": 0.2851, + "epoch": 2.745882437741017, + "grad_norm": 0.20219436287879944, + "learning_rate": 2.2254333427882795e-05, + "loss": 0.4058, "step": 76190 }, { - "epoch": 2.68, - "learning_rate": 2.330375115305087e-05, - "loss": 0.2665, + "epoch": 2.7460626374022414, + "grad_norm": 0.2475823611021042, + "learning_rate": 2.2251432965447126e-05, + "loss": 0.4338, "step": 76195 }, { - "epoch": 2.68, - "learning_rate": 2.3300909007530493e-05, - "loss": 0.2861, + "epoch": 2.746242837063466, + "grad_norm": 
0.18280702829360962, + "learning_rate": 2.2248532540460173e-05, + "loss": 0.3821, "step": 76200 }, { - "epoch": 2.68, - "learning_rate": 2.3298066884071686e-05, - "loss": 0.2728, + "epoch": 2.746423036724691, + "grad_norm": 0.2569371461868286, + "learning_rate": 2.2245632152961425e-05, + "loss": 0.4106, "step": 76205 }, { - "epoch": 2.68, - "learning_rate": 2.329522478271137e-05, - "loss": 0.2538, + "epoch": 2.7466032363859156, + "grad_norm": 0.22354593873023987, + "learning_rate": 2.2242731802990396e-05, + "loss": 0.3885, "step": 76210 }, { - "epoch": 2.68, - "learning_rate": 2.3292382703486428e-05, - "loss": 0.2773, + "epoch": 2.7467834360471404, + "grad_norm": 0.17847710847854614, + "learning_rate": 2.223983149058662e-05, + "loss": 0.4149, "step": 76215 }, { - "epoch": 2.68, - "learning_rate": 2.3289540646433777e-05, - "loss": 0.2909, + "epoch": 2.746963635708365, + "grad_norm": 0.22239890694618225, + "learning_rate": 2.2236931215789604e-05, + "loss": 0.3831, "step": 76220 }, { - "epoch": 2.68, - "learning_rate": 2.3286698611590304e-05, - "loss": 0.2853, + "epoch": 2.7471438353695894, + "grad_norm": 0.22584442794322968, + "learning_rate": 2.2234030978638865e-05, + "loss": 0.3784, "step": 76225 }, { - "epoch": 2.68, - "learning_rate": 2.328385659899293e-05, - "loss": 0.2606, + "epoch": 2.747324035030814, + "grad_norm": 0.2300465852022171, + "learning_rate": 2.223113077917392e-05, + "loss": 0.3863, "step": 76230 }, { - "epoch": 2.68, - "learning_rate": 2.328101460867855e-05, - "loss": 0.2718, + "epoch": 2.747504234692039, + "grad_norm": 0.21806028485298157, + "learning_rate": 2.2228230617434276e-05, + "loss": 0.408, "step": 76235 }, { - "epoch": 2.68, - "learning_rate": 2.3278172640684056e-05, - "loss": 0.2591, + "epoch": 2.747684434353263, + "grad_norm": 0.26781561970710754, + "learning_rate": 2.222533049345946e-05, + "loss": 0.4062, "step": 76240 }, { - "epoch": 2.68, - "learning_rate": 2.327533069504635e-05, - "loss": 0.264, + "epoch": 2.747864634014488, + "grad_norm": 0.23583123087882996, + "learning_rate": 2.222243040728898e-05, + "loss": 0.3975, "step": 76245 }, { - "epoch": 2.68, - "learning_rate": 2.327248877180235e-05, - "loss": 0.2514, + "epoch": 2.7480448336757126, + "grad_norm": 0.1862766593694687, + "learning_rate": 2.221953035896234e-05, + "loss": 0.4025, "step": 76250 }, { - "epoch": 2.68, - "learning_rate": 2.3269646870988944e-05, - "loss": 0.2671, + "epoch": 2.7482250333369374, + "grad_norm": 0.21025675535202026, + "learning_rate": 2.2216630348519067e-05, + "loss": 0.4092, "step": 76255 }, { - "epoch": 2.68, - "learning_rate": 2.326680499264303e-05, - "loss": 0.2581, + "epoch": 2.748405232998162, + "grad_norm": 0.1862116903066635, + "learning_rate": 2.2213730375998663e-05, + "loss": 0.398, "step": 76260 }, { - "epoch": 2.68, - "learning_rate": 2.3263963136801505e-05, - "loss": 0.281, + "epoch": 2.748585432659387, + "grad_norm": 0.18690145015716553, + "learning_rate": 2.2210830441440646e-05, + "loss": 0.3963, "step": 76265 }, { - "epoch": 2.68, - "learning_rate": 2.3261121303501274e-05, - "loss": 0.2668, + "epoch": 2.748765632320611, + "grad_norm": 0.20919784903526306, + "learning_rate": 2.220793054488453e-05, + "loss": 0.4236, "step": 76270 }, { - "epoch": 2.68, - "learning_rate": 2.325827949277925e-05, - "loss": 0.2715, + "epoch": 2.748945831981836, + "grad_norm": 0.24846714735031128, + "learning_rate": 2.2205030686369805e-05, + "loss": 0.3915, "step": 76275 }, { - "epoch": 2.68, - "learning_rate": 2.3255437704672312e-05, - "loss": 0.2676, + "epoch": 2.7491260316430606, 
+ "grad_norm": 0.19307979941368103, + "learning_rate": 2.2202130865936006e-05, + "loss": 0.3978, "step": 76280 }, { - "epoch": 2.68, - "learning_rate": 2.3252595939217357e-05, - "loss": 0.3085, + "epoch": 2.7493062313042853, + "grad_norm": 0.2004070281982422, + "learning_rate": 2.2199231083622627e-05, + "loss": 0.4069, "step": 76285 }, { - "epoch": 2.68, - "learning_rate": 2.3249754196451304e-05, - "loss": 0.2605, + "epoch": 2.7494864309655096, + "grad_norm": 0.20964334905147552, + "learning_rate": 2.2196331339469187e-05, + "loss": 0.3888, "step": 76290 }, { - "epoch": 2.68, - "learning_rate": 2.324691247641103e-05, - "loss": 0.2552, + "epoch": 2.7496666306267343, + "grad_norm": 0.1950366199016571, + "learning_rate": 2.2193431633515194e-05, + "loss": 0.3997, "step": 76295 }, { - "epoch": 2.68, - "learning_rate": 2.3244070779133442e-05, - "loss": 0.2588, + "epoch": 2.749846830287959, + "grad_norm": 0.19967371225357056, + "learning_rate": 2.2190531965800138e-05, + "loss": 0.4061, "step": 76300 }, { - "epoch": 2.68, - "learning_rate": 2.324122910465545e-05, - "loss": 0.2745, + "epoch": 2.750027029949184, + "grad_norm": 0.22163501381874084, + "learning_rate": 2.2187632336363555e-05, + "loss": 0.3802, "step": 76305 }, { - "epoch": 2.68, - "learning_rate": 2.3238387453013934e-05, - "loss": 0.2781, + "epoch": 2.7502072296104085, + "grad_norm": 0.21866455674171448, + "learning_rate": 2.218473274524493e-05, + "loss": 0.4057, "step": 76310 }, { - "epoch": 2.68, - "learning_rate": 2.3235545824245797e-05, - "loss": 0.2418, + "epoch": 2.750387429271633, + "grad_norm": 0.20910955965518951, + "learning_rate": 2.2181833192483774e-05, + "loss": 0.4417, "step": 76315 }, { - "epoch": 2.69, - "learning_rate": 2.3232704218387923e-05, - "loss": 0.2643, + "epoch": 2.7505676289328576, + "grad_norm": 0.21740221977233887, + "learning_rate": 2.2178933678119598e-05, + "loss": 0.3984, "step": 76320 }, { - "epoch": 2.69, - "learning_rate": 2.3229862635477228e-05, - "loss": 0.2813, + "epoch": 2.7507478285940823, + "grad_norm": 0.28222358226776123, + "learning_rate": 2.21760342021919e-05, + "loss": 0.3966, "step": 76325 }, { - "epoch": 2.69, - "learning_rate": 2.3227021075550606e-05, - "loss": 0.2506, + "epoch": 2.750928028255307, + "grad_norm": 0.16867958009243011, + "learning_rate": 2.2173134764740196e-05, + "loss": 0.4051, "step": 76330 }, { - "epoch": 2.69, - "learning_rate": 2.3224179538644944e-05, - "loss": 0.2613, + "epoch": 2.7511082279165313, + "grad_norm": 0.26838427782058716, + "learning_rate": 2.217023536580398e-05, + "loss": 0.4368, "step": 76335 }, { - "epoch": 2.69, - "learning_rate": 2.322133802479713e-05, - "loss": 0.2741, + "epoch": 2.751288427577756, + "grad_norm": 0.21444910764694214, + "learning_rate": 2.2167336005422758e-05, + "loss": 0.3754, "step": 76340 }, { - "epoch": 2.69, - "learning_rate": 2.3218496534044082e-05, - "loss": 0.2694, + "epoch": 2.751468627238981, + "grad_norm": 0.23337894678115845, + "learning_rate": 2.2164436683636035e-05, + "loss": 0.4093, "step": 76345 }, { - "epoch": 2.69, - "learning_rate": 2.321565506642268e-05, - "loss": 0.2591, + "epoch": 2.7516488269002055, + "grad_norm": 0.2542593479156494, + "learning_rate": 2.216153740048332e-05, + "loss": 0.4148, "step": 76350 }, { - "epoch": 2.69, - "learning_rate": 2.321281362196982e-05, - "loss": 0.2826, + "epoch": 2.7518290265614302, + "grad_norm": 0.24587145447731018, + "learning_rate": 2.2158638156004098e-05, + "loss": 0.3793, "step": 76355 }, { - "epoch": 2.69, - "learning_rate": 2.320997220072239e-05, - "loss": 0.2798, + 
"epoch": 2.7520092262226545, + "grad_norm": 0.19757163524627686, + "learning_rate": 2.2155738950237887e-05, + "loss": 0.4132, "step": 76360 }, { - "epoch": 2.69, - "learning_rate": 2.3207130802717304e-05, - "loss": 0.278, + "epoch": 2.7521894258838793, + "grad_norm": 0.21361534297466278, + "learning_rate": 2.215283978322418e-05, + "loss": 0.3932, "step": 76365 }, { - "epoch": 2.69, - "learning_rate": 2.3204289427991437e-05, - "loss": 0.2806, + "epoch": 2.752369625545104, + "grad_norm": 0.17598572373390198, + "learning_rate": 2.214994065500248e-05, + "loss": 0.3777, "step": 76370 }, { - "epoch": 2.69, - "learning_rate": 2.3201448076581688e-05, - "loss": 0.2719, + "epoch": 2.7525498252063287, + "grad_norm": 0.17586344480514526, + "learning_rate": 2.2147041565612294e-05, + "loss": 0.39, "step": 76375 }, { - "epoch": 2.69, - "learning_rate": 2.319860674852494e-05, - "loss": 0.2532, + "epoch": 2.752730024867553, + "grad_norm": 0.17030631005764008, + "learning_rate": 2.2144142515093097e-05, + "loss": 0.3821, "step": 76380 }, { - "epoch": 2.69, - "learning_rate": 2.3195765443858116e-05, - "loss": 0.2692, + "epoch": 2.7529102245287778, + "grad_norm": 0.22826333343982697, + "learning_rate": 2.2141243503484426e-05, + "loss": 0.4265, "step": 76385 }, { - "epoch": 2.69, - "learning_rate": 2.319292416261808e-05, - "loss": 0.2773, + "epoch": 2.7530904241900025, + "grad_norm": 0.2388213574886322, + "learning_rate": 2.213834453082575e-05, + "loss": 0.3792, "step": 76390 }, { - "epoch": 2.69, - "learning_rate": 2.3190082904841737e-05, - "loss": 0.272, + "epoch": 2.7532706238512272, + "grad_norm": 0.19387775659561157, + "learning_rate": 2.213544559715657e-05, + "loss": 0.4057, "step": 76395 }, { - "epoch": 2.69, - "learning_rate": 2.318724167056596e-05, - "loss": 0.2646, + "epoch": 2.753450823512452, + "grad_norm": 0.22617575526237488, + "learning_rate": 2.2132546702516395e-05, + "loss": 0.4345, "step": 76400 }, { - "epoch": 2.69, - "learning_rate": 2.3184400459827666e-05, - "loss": 0.2476, + "epoch": 2.7536310231736767, + "grad_norm": 0.19259920716285706, + "learning_rate": 2.2129647846944708e-05, + "loss": 0.367, "step": 76405 }, { - "epoch": 2.69, - "learning_rate": 2.3181559272663737e-05, - "loss": 0.2573, + "epoch": 2.753811222834901, + "grad_norm": 0.20966337621212006, + "learning_rate": 2.2126749030481026e-05, + "loss": 0.3878, "step": 76410 }, { - "epoch": 2.69, - "learning_rate": 2.3178718109111052e-05, - "loss": 0.248, + "epoch": 2.7539914224961257, + "grad_norm": 0.22299322485923767, + "learning_rate": 2.2123850253164826e-05, + "loss": 0.4122, "step": 76415 }, { - "epoch": 2.69, - "learning_rate": 2.3175876969206524e-05, - "loss": 0.2775, + "epoch": 2.7541716221573505, + "grad_norm": 0.1778857707977295, + "learning_rate": 2.2120951515035605e-05, + "loss": 0.4066, "step": 76420 }, { - "epoch": 2.69, - "learning_rate": 2.3173035852987036e-05, - "loss": 0.2733, + "epoch": 2.7543518218185747, + "grad_norm": 0.22316712141036987, + "learning_rate": 2.2118052816132873e-05, + "loss": 0.4025, "step": 76425 }, { - "epoch": 2.69, - "learning_rate": 2.317019476048946e-05, - "loss": 0.2875, + "epoch": 2.7545320214797995, + "grad_norm": 0.19544607400894165, + "learning_rate": 2.2115154156496105e-05, + "loss": 0.4107, "step": 76430 }, { - "epoch": 2.69, - "learning_rate": 2.3167353691750703e-05, - "loss": 0.2397, + "epoch": 2.754712221141024, + "grad_norm": 0.25387707352638245, + "learning_rate": 2.2112255536164807e-05, + "loss": 0.3657, "step": 76435 }, { - "epoch": 2.69, - "learning_rate": 
2.316451264680765e-05, - "loss": 0.2967, + "epoch": 2.754892420802249, + "grad_norm": 0.21383005380630493, + "learning_rate": 2.2109356955178463e-05, + "loss": 0.3826, "step": 76440 }, { - "epoch": 2.69, - "learning_rate": 2.3161671625697197e-05, - "loss": 0.2505, + "epoch": 2.7550726204634737, + "grad_norm": 0.19232189655303955, + "learning_rate": 2.2106458413576573e-05, + "loss": 0.403, "step": 76445 }, { - "epoch": 2.69, - "learning_rate": 2.3158830628456225e-05, - "loss": 0.2745, + "epoch": 2.7552528201246984, + "grad_norm": 0.23429124057292938, + "learning_rate": 2.210355991139863e-05, + "loss": 0.3711, "step": 76450 }, { - "epoch": 2.69, - "learning_rate": 2.3155989655121614e-05, - "loss": 0.2636, + "epoch": 2.7554330197859227, + "grad_norm": 0.2584487795829773, + "learning_rate": 2.2100661448684123e-05, + "loss": 0.389, "step": 76455 }, { - "epoch": 2.69, - "learning_rate": 2.315314870573027e-05, - "loss": 0.2789, + "epoch": 2.7556132194471474, + "grad_norm": 0.2131696194410324, + "learning_rate": 2.2097763025472536e-05, + "loss": 0.3589, "step": 76460 }, { - "epoch": 2.69, - "learning_rate": 2.315030778031907e-05, - "loss": 0.2802, + "epoch": 2.755793419108372, + "grad_norm": 0.21315625309944153, + "learning_rate": 2.2094864641803372e-05, + "loss": 0.3698, "step": 76465 }, { - "epoch": 2.69, - "learning_rate": 2.3147466878924915e-05, - "loss": 0.2532, + "epoch": 2.7559736187695965, + "grad_norm": 0.2009851336479187, + "learning_rate": 2.209196629771612e-05, + "loss": 0.3787, "step": 76470 }, { - "epoch": 2.69, - "learning_rate": 2.3144626001584663e-05, - "loss": 0.2909, + "epoch": 2.756153818430821, + "grad_norm": 0.24308665096759796, + "learning_rate": 2.208906799325025e-05, + "loss": 0.3962, "step": 76475 }, { - "epoch": 2.69, - "learning_rate": 2.3141785148335234e-05, - "loss": 0.256, + "epoch": 2.756334018092046, + "grad_norm": 0.2490856796503067, + "learning_rate": 2.2086169728445276e-05, + "loss": 0.4196, "step": 76480 }, { - "epoch": 2.69, - "learning_rate": 2.3138944319213497e-05, - "loss": 0.2513, + "epoch": 2.7565142177532707, + "grad_norm": 0.20798978209495544, + "learning_rate": 2.2083271503340662e-05, + "loss": 0.3762, "step": 76485 }, { - "epoch": 2.69, - "learning_rate": 2.313610351425634e-05, - "loss": 0.2494, + "epoch": 2.7566944174144954, + "grad_norm": 0.21434953808784485, + "learning_rate": 2.2080373317975927e-05, + "loss": 0.4108, "step": 76490 }, { - "epoch": 2.69, - "learning_rate": 2.3133262733500646e-05, - "loss": 0.2747, + "epoch": 2.75687461707572, + "grad_norm": 0.19935797154903412, + "learning_rate": 2.2077475172390536e-05, + "loss": 0.4004, "step": 76495 }, { - "epoch": 2.69, - "learning_rate": 2.3130421976983314e-05, - "loss": 0.2887, + "epoch": 2.7570548167369444, + "grad_norm": 0.19653324782848358, + "learning_rate": 2.2074577066623974e-05, + "loss": 0.3803, "step": 76500 }, { - "epoch": 2.69, - "eval_loss": 0.2611083686351776, - "eval_runtime": 10.5437, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 2.7570548167369444, + "eval_loss": 0.4315706491470337, + "eval_runtime": 3.524, + "eval_samples_per_second": 28.377, + "eval_steps_per_second": 7.094, "step": 76500 }, { - "epoch": 2.69, - "learning_rate": 2.312758124474122e-05, - "loss": 0.2711, + "epoch": 2.757235016398169, + "grad_norm": 0.2254483997821808, + "learning_rate": 2.2071679000715733e-05, + "loss": 0.4047, "step": 76505 }, { - "epoch": 2.69, - "learning_rate": 2.3124740536811245e-05, - "loss": 0.2671, + "epoch": 2.757415216059394, + "grad_norm": 
0.20184098184108734, + "learning_rate": 2.2068780974705298e-05, + "loss": 0.386, "step": 76510 }, { - "epoch": 2.69, - "learning_rate": 2.3121899853230278e-05, - "loss": 0.2561, + "epoch": 2.757595415720618, + "grad_norm": 0.20205454528331757, + "learning_rate": 2.206588298863216e-05, + "loss": 0.3598, "step": 76515 }, { - "epoch": 2.69, - "learning_rate": 2.3119059194035202e-05, - "loss": 0.2709, + "epoch": 2.757775615381843, + "grad_norm": 0.18086735904216766, + "learning_rate": 2.2062985042535797e-05, + "loss": 0.3592, "step": 76520 }, { - "epoch": 2.69, - "learning_rate": 2.3116218559262912e-05, - "loss": 0.2708, + "epoch": 2.7579558150430676, + "grad_norm": 0.18240606784820557, + "learning_rate": 2.2060087136455687e-05, + "loss": 0.3759, "step": 76525 }, { - "epoch": 2.69, - "learning_rate": 2.3113377948950267e-05, - "loss": 0.2657, + "epoch": 2.7581360147042924, + "grad_norm": 0.19021601974964142, + "learning_rate": 2.205718927043133e-05, + "loss": 0.4057, "step": 76530 }, { - "epoch": 2.69, - "learning_rate": 2.311053736313418e-05, - "loss": 0.2494, + "epoch": 2.758316214365517, + "grad_norm": 0.19175371527671814, + "learning_rate": 2.2054291444502198e-05, + "loss": 0.3937, "step": 76535 }, { - "epoch": 2.69, - "learning_rate": 2.3107696801851513e-05, - "loss": 0.264, + "epoch": 2.758496414026742, + "grad_norm": 0.2313561886548996, + "learning_rate": 2.2051393658707766e-05, + "loss": 0.3928, "step": 76540 }, { - "epoch": 2.69, - "learning_rate": 2.3104856265139164e-05, - "loss": 0.2555, + "epoch": 2.758676613687966, + "grad_norm": 0.19368553161621094, + "learning_rate": 2.2048495913087535e-05, + "loss": 0.3741, "step": 76545 }, { - "epoch": 2.69, - "learning_rate": 2.3102015753033996e-05, - "loss": 0.2725, + "epoch": 2.758856813349191, + "grad_norm": 0.20152942836284637, + "learning_rate": 2.2045598207680968e-05, + "loss": 0.3813, "step": 76550 }, { - "epoch": 2.69, - "learning_rate": 2.3099175265572912e-05, - "loss": 0.2636, + "epoch": 2.7590370130104156, + "grad_norm": 0.2724568843841553, + "learning_rate": 2.204270054252756e-05, + "loss": 0.386, "step": 76555 }, { - "epoch": 2.69, - "learning_rate": 2.3096334802792787e-05, - "loss": 0.2668, + "epoch": 2.7592172126716403, + "grad_norm": 0.2629745602607727, + "learning_rate": 2.203980291766679e-05, + "loss": 0.4035, "step": 76560 }, { - "epoch": 2.69, - "learning_rate": 2.309349436473049e-05, - "loss": 0.2711, + "epoch": 2.7593974123328646, + "grad_norm": 0.19395357370376587, + "learning_rate": 2.2036905333138115e-05, + "loss": 0.3902, "step": 76565 }, { - "epoch": 2.69, - "learning_rate": 2.3090653951422914e-05, - "loss": 0.2713, + "epoch": 2.7595776119940894, + "grad_norm": 0.19974055886268616, + "learning_rate": 2.203400778898104e-05, + "loss": 0.3891, "step": 76570 }, { - "epoch": 2.69, - "learning_rate": 2.308781356290694e-05, - "loss": 0.2534, + "epoch": 2.759757811655314, + "grad_norm": 0.21690985560417175, + "learning_rate": 2.203111028523504e-05, + "loss": 0.3544, "step": 76575 }, { - "epoch": 2.69, - "learning_rate": 2.3084973199219452e-05, - "loss": 0.2498, + "epoch": 2.759938011316539, + "grad_norm": 0.16456058621406555, + "learning_rate": 2.2028212821939576e-05, + "loss": 0.378, "step": 76580 }, { - "epoch": 2.69, - "learning_rate": 2.308213286039732e-05, - "loss": 0.271, + "epoch": 2.7601182109777636, + "grad_norm": 0.17220036685466766, + "learning_rate": 2.202531539913415e-05, + "loss": 0.3982, "step": 76585 }, { - "epoch": 2.69, - "learning_rate": 2.3079292546477423e-05, - "loss": 0.2704, + "epoch": 
2.760298410638988, + "grad_norm": 0.2109745293855667, + "learning_rate": 2.202241801685821e-05, + "loss": 0.3956, "step": 76590 }, { - "epoch": 2.69, - "learning_rate": 2.3076452257496658e-05, - "loss": 0.2688, + "epoch": 2.7604786103002126, + "grad_norm": 0.18770918250083923, + "learning_rate": 2.2019520675151263e-05, + "loss": 0.3672, "step": 76595 }, { - "epoch": 2.7, - "learning_rate": 2.3073611993491885e-05, - "loss": 0.2491, + "epoch": 2.7606588099614373, + "grad_norm": 0.22950011491775513, + "learning_rate": 2.2016623374052766e-05, + "loss": 0.3903, "step": 76600 }, { - "epoch": 2.7, - "learning_rate": 2.30707717545e-05, - "loss": 0.2701, + "epoch": 2.760839009622662, + "grad_norm": 0.19520814716815948, + "learning_rate": 2.2013726113602192e-05, + "loss": 0.3848, "step": 76605 }, { - "epoch": 2.7, - "learning_rate": 2.3067931540557855e-05, - "loss": 0.27, + "epoch": 2.7610192092838863, + "grad_norm": 0.20134703814983368, + "learning_rate": 2.201082889383903e-05, + "loss": 0.4014, "step": 76610 }, { - "epoch": 2.7, - "learning_rate": 2.306509135170236e-05, - "loss": 0.2353, + "epoch": 2.761199408945111, + "grad_norm": 0.19556838274002075, + "learning_rate": 2.200793171480274e-05, + "loss": 0.4127, "step": 76615 }, { - "epoch": 2.7, - "learning_rate": 2.306225118797038e-05, - "loss": 0.2765, + "epoch": 2.761379608606336, + "grad_norm": 0.20522183179855347, + "learning_rate": 2.20050345765328e-05, + "loss": 0.4185, "step": 76620 }, { - "epoch": 2.7, - "learning_rate": 2.3059411049398784e-05, - "loss": 0.261, + "epoch": 2.7615598082675605, + "grad_norm": 0.20291735231876373, + "learning_rate": 2.2002137479068684e-05, + "loss": 0.3848, "step": 76625 }, { - "epoch": 2.7, - "learning_rate": 2.305657093602445e-05, - "loss": 0.2716, + "epoch": 2.7617400079287853, + "grad_norm": 0.18468672037124634, + "learning_rate": 2.1999240422449863e-05, + "loss": 0.4037, "step": 76630 }, { - "epoch": 2.7, - "learning_rate": 2.3053730847884274e-05, - "loss": 0.2438, + "epoch": 2.76192020759001, + "grad_norm": 0.19746728241443634, + "learning_rate": 2.1996343406715815e-05, + "loss": 0.3784, "step": 76635 }, { - "epoch": 2.7, - "learning_rate": 2.3050890785015122e-05, - "loss": 0.2526, + "epoch": 2.7621004072512343, + "grad_norm": 0.17763979732990265, + "learning_rate": 2.1993446431906007e-05, + "loss": 0.4116, "step": 76640 }, { - "epoch": 2.7, - "learning_rate": 2.3048050747453857e-05, - "loss": 0.2388, + "epoch": 2.762280606912459, + "grad_norm": 0.2670569121837616, + "learning_rate": 2.1990549498059906e-05, + "loss": 0.3857, "step": 76645 }, { - "epoch": 2.7, - "learning_rate": 2.3045210735237373e-05, - "loss": 0.2811, + "epoch": 2.7624608065736838, + "grad_norm": 0.21598556637763977, + "learning_rate": 2.198765260521699e-05, + "loss": 0.3965, "step": 76650 }, { - "epoch": 2.7, - "learning_rate": 2.304237074840254e-05, - "loss": 0.2666, + "epoch": 2.762641006234908, + "grad_norm": 0.17488926649093628, + "learning_rate": 2.1984755753416728e-05, + "loss": 0.3939, "step": 76655 }, { - "epoch": 2.7, - "learning_rate": 2.3039530786986237e-05, - "loss": 0.2566, + "epoch": 2.762821205896133, + "grad_norm": 0.2201049029827118, + "learning_rate": 2.1981858942698568e-05, + "loss": 0.4167, "step": 76660 }, { - "epoch": 2.7, - "learning_rate": 2.3036690851025324e-05, - "loss": 0.2593, + "epoch": 2.7630014055573575, + "grad_norm": 0.21279020607471466, + "learning_rate": 2.1978962173102015e-05, + "loss": 0.3794, "step": 76665 }, { - "epoch": 2.7, - "learning_rate": 2.303385094055669e-05, - "loss": 0.2807, + "epoch": 
2.7631816052185822, + "grad_norm": 0.1922799050807953, + "learning_rate": 2.1976065444666495e-05, + "loss": 0.4126, "step": 76670 }, { - "epoch": 2.7, - "learning_rate": 2.3031011055617217e-05, - "loss": 0.2642, + "epoch": 2.763361804879807, + "grad_norm": 0.19106921553611755, + "learning_rate": 2.197316875743152e-05, + "loss": 0.3927, "step": 76675 }, { - "epoch": 2.7, - "learning_rate": 2.3028171196243756e-05, - "loss": 0.2517, + "epoch": 2.7635420045410317, + "grad_norm": 0.25177741050720215, + "learning_rate": 2.1970272111436527e-05, + "loss": 0.3636, "step": 76680 }, { - "epoch": 2.7, - "learning_rate": 2.3025331362473187e-05, - "loss": 0.2673, + "epoch": 2.763722204202256, + "grad_norm": 0.21334508061408997, + "learning_rate": 2.196737550672098e-05, + "loss": 0.3626, "step": 76685 }, { - "epoch": 2.7, - "learning_rate": 2.30224915543424e-05, - "loss": 0.2477, + "epoch": 2.7639024038634807, + "grad_norm": 0.21705521643161774, + "learning_rate": 2.196447894332437e-05, + "loss": 0.377, "step": 76690 }, { - "epoch": 2.7, - "learning_rate": 2.301965177188826e-05, - "loss": 0.2783, + "epoch": 2.7640826035247055, + "grad_norm": 0.20139169692993164, + "learning_rate": 2.196158242128613e-05, + "loss": 0.4, "step": 76695 }, { - "epoch": 2.7, - "learning_rate": 2.3016812015147635e-05, - "loss": 0.2897, + "epoch": 2.7642628031859298, + "grad_norm": 0.17421554028987885, + "learning_rate": 2.195868594064576e-05, + "loss": 0.3998, "step": 76700 }, { - "epoch": 2.7, - "learning_rate": 2.3013972284157386e-05, - "loss": 0.2525, + "epoch": 2.7644430028471545, + "grad_norm": 0.1658887267112732, + "learning_rate": 2.1955789501442696e-05, + "loss": 0.3811, "step": 76705 }, { - "epoch": 2.7, - "learning_rate": 2.3011132578954407e-05, - "loss": 0.2556, + "epoch": 2.7646232025083792, + "grad_norm": 0.1670427918434143, + "learning_rate": 2.1952893103716408e-05, + "loss": 0.3959, "step": 76710 }, { - "epoch": 2.7, - "learning_rate": 2.3008292899575562e-05, - "loss": 0.2586, + "epoch": 2.764803402169604, + "grad_norm": 0.21571218967437744, + "learning_rate": 2.194999674750637e-05, + "loss": 0.4167, "step": 76715 }, { - "epoch": 2.7, - "learning_rate": 2.3005453246057725e-05, - "loss": 0.2679, + "epoch": 2.7649836018308287, + "grad_norm": 0.25938844680786133, + "learning_rate": 2.194710043285203e-05, + "loss": 0.3906, "step": 76720 }, { - "epoch": 2.7, - "learning_rate": 2.300261361843775e-05, - "loss": 0.2646, + "epoch": 2.7651638014920534, + "grad_norm": 0.1947462111711502, + "learning_rate": 2.1944204159792854e-05, + "loss": 0.3942, "step": 76725 }, { - "epoch": 2.7, - "learning_rate": 2.299977401675253e-05, - "loss": 0.259, + "epoch": 2.7653440011532777, + "grad_norm": 0.19629132747650146, + "learning_rate": 2.1941307928368305e-05, + "loss": 0.4012, "step": 76730 }, { - "epoch": 2.7, - "learning_rate": 2.2996934441038925e-05, - "loss": 0.2711, + "epoch": 2.7655242008145025, + "grad_norm": 0.20872166752815247, + "learning_rate": 2.1938411738617843e-05, + "loss": 0.4147, "step": 76735 }, { - "epoch": 2.7, - "learning_rate": 2.299409489133381e-05, - "loss": 0.2804, + "epoch": 2.765704400475727, + "grad_norm": 0.2229888141155243, + "learning_rate": 2.1935515590580934e-05, + "loss": 0.372, "step": 76740 }, { - "epoch": 2.7, - "learning_rate": 2.2991255367674043e-05, - "loss": 0.2622, + "epoch": 2.7658846001369515, + "grad_norm": 0.23434416949748993, + "learning_rate": 2.193261948429703e-05, + "loss": 0.404, "step": 76745 }, { - "epoch": 2.7, - "learning_rate": 2.298841587009651e-05, - "loss": 0.2763, + "epoch": 
2.766064799798176, + "grad_norm": 0.2255534827709198, + "learning_rate": 2.1929723419805582e-05, + "loss": 0.4228, "step": 76750 }, { - "epoch": 2.7, - "learning_rate": 2.298557639863807e-05, - "loss": 0.2788, + "epoch": 2.766244999459401, + "grad_norm": 0.21123521029949188, + "learning_rate": 2.192682739714607e-05, + "loss": 0.371, "step": 76755 }, { - "epoch": 2.7, - "learning_rate": 2.298273695333558e-05, - "loss": 0.275, + "epoch": 2.7664251991206257, + "grad_norm": 0.18391209840774536, + "learning_rate": 2.1923931416357944e-05, + "loss": 0.3993, "step": 76760 }, { - "epoch": 2.7, - "learning_rate": 2.2979897534225928e-05, - "loss": 0.2933, + "epoch": 2.7666053987818504, + "grad_norm": 0.19269075989723206, + "learning_rate": 2.1921035477480636e-05, + "loss": 0.3941, "step": 76765 }, { - "epoch": 2.7, - "learning_rate": 2.297705814134598e-05, - "loss": 0.2498, + "epoch": 2.766785598443075, + "grad_norm": 0.18515953421592712, + "learning_rate": 2.1918139580553644e-05, + "loss": 0.4063, "step": 76770 }, { - "epoch": 2.7, - "learning_rate": 2.2974218774732604e-05, - "loss": 0.2771, + "epoch": 2.7669657981042994, + "grad_norm": 0.21497590839862823, + "learning_rate": 2.1915243725616386e-05, + "loss": 0.4265, "step": 76775 }, { - "epoch": 2.7, - "learning_rate": 2.2971379434422647e-05, - "loss": 0.2682, + "epoch": 2.767145997765524, + "grad_norm": 0.19425266981124878, + "learning_rate": 2.1912347912708354e-05, + "loss": 0.405, "step": 76780 }, { - "epoch": 2.7, - "learning_rate": 2.2968540120453003e-05, - "loss": 0.2858, + "epoch": 2.767326197426749, + "grad_norm": 0.24012631177902222, + "learning_rate": 2.1909452141868975e-05, + "loss": 0.4209, "step": 76785 }, { - "epoch": 2.7, - "learning_rate": 2.296570083286052e-05, - "loss": 0.2574, + "epoch": 2.7675063970879736, + "grad_norm": 0.2343771904706955, + "learning_rate": 2.190655641313771e-05, + "loss": 0.3743, "step": 76790 }, { - "epoch": 2.7, - "learning_rate": 2.2962861571682075e-05, - "loss": 0.2707, + "epoch": 2.767686596749198, + "grad_norm": 0.23053786158561707, + "learning_rate": 2.1903660726554016e-05, + "loss": 0.4217, "step": 76795 }, { - "epoch": 2.7, - "learning_rate": 2.296002233695452e-05, - "loss": 0.2781, + "epoch": 2.7678667964104227, + "grad_norm": 0.19305402040481567, + "learning_rate": 2.1900765082157347e-05, + "loss": 0.3926, "step": 76800 }, { - "epoch": 2.7, - "learning_rate": 2.295718312871475e-05, - "loss": 0.2666, + "epoch": 2.7680469960716474, + "grad_norm": 0.18887373805046082, + "learning_rate": 2.1897869479987148e-05, + "loss": 0.3979, "step": 76805 }, { - "epoch": 2.7, - "learning_rate": 2.2954343946999602e-05, - "loss": 0.2449, + "epoch": 2.768227195732872, + "grad_norm": 0.2448185682296753, + "learning_rate": 2.1894973920082884e-05, + "loss": 0.4011, "step": 76810 }, { - "epoch": 2.7, - "learning_rate": 2.2951504791845945e-05, - "loss": 0.2578, + "epoch": 2.768407395394097, + "grad_norm": 0.2089599370956421, + "learning_rate": 2.189207840248399e-05, + "loss": 0.383, "step": 76815 }, { - "epoch": 2.7, - "learning_rate": 2.2948665663290644e-05, - "loss": 0.2534, + "epoch": 2.768587595055321, + "grad_norm": 0.24234536290168762, + "learning_rate": 2.1889182927229936e-05, + "loss": 0.3862, "step": 76820 }, { - "epoch": 2.7, - "learning_rate": 2.294582656137058e-05, - "loss": 0.2635, + "epoch": 2.768767794716546, + "grad_norm": 0.28005969524383545, + "learning_rate": 2.188628749436016e-05, + "loss": 0.405, "step": 76825 }, { - "epoch": 2.7, - "learning_rate": 2.29429874861226e-05, - "loss": 0.2841, + "epoch": 
2.7689479943777706, + "grad_norm": 0.20914676785469055, + "learning_rate": 2.188339210391411e-05, + "loss": 0.3951, "step": 76830 }, { - "epoch": 2.7, - "learning_rate": 2.2940148437583575e-05, - "loss": 0.2808, + "epoch": 2.7691281940389953, + "grad_norm": 0.20570850372314453, + "learning_rate": 2.1880496755931244e-05, + "loss": 0.4507, "step": 76835 }, { - "epoch": 2.7, - "learning_rate": 2.2937309415790353e-05, - "loss": 0.2633, + "epoch": 2.7693083937002196, + "grad_norm": 0.20999915897846222, + "learning_rate": 2.187760145045101e-05, + "loss": 0.4017, "step": 76840 }, { - "epoch": 2.7, - "learning_rate": 2.2934470420779815e-05, - "loss": 0.2678, + "epoch": 2.7694885933614444, + "grad_norm": 0.16798564791679382, + "learning_rate": 2.1874706187512836e-05, + "loss": 0.3827, "step": 76845 }, { - "epoch": 2.7, - "learning_rate": 2.2931631452588827e-05, - "loss": 0.2716, + "epoch": 2.769668793022669, + "grad_norm": 0.20106874406337738, + "learning_rate": 2.18718109671562e-05, + "loss": 0.4012, "step": 76850 }, { - "epoch": 2.7, - "learning_rate": 2.2928792511254235e-05, - "loss": 0.2717, + "epoch": 2.769848992683894, + "grad_norm": 0.20237289369106293, + "learning_rate": 2.1868915789420522e-05, + "loss": 0.3978, "step": 76855 }, { - "epoch": 2.7, - "learning_rate": 2.2925953596812898e-05, - "loss": 0.2703, + "epoch": 2.7700291923451186, + "grad_norm": 0.2165011614561081, + "learning_rate": 2.186602065434527e-05, + "loss": 0.4484, "step": 76860 }, { - "epoch": 2.7, - "learning_rate": 2.29231147093017e-05, - "loss": 0.2869, + "epoch": 2.770209392006343, + "grad_norm": 0.19253584742546082, + "learning_rate": 2.1863125561969883e-05, + "loss": 0.3802, "step": 76865 }, { - "epoch": 2.7, - "learning_rate": 2.292027584875748e-05, - "loss": 0.2913, + "epoch": 2.7703895916675676, + "grad_norm": 0.19273778796195984, + "learning_rate": 2.1860230512333784e-05, + "loss": 0.3859, "step": 76870 }, { - "epoch": 2.7, - "learning_rate": 2.291743701521711e-05, - "loss": 0.2903, + "epoch": 2.7705697913287923, + "grad_norm": 0.1797461360692978, + "learning_rate": 2.185733550547646e-05, + "loss": 0.3808, "step": 76875 }, { - "epoch": 2.7, - "learning_rate": 2.2914598208717446e-05, - "loss": 0.2753, + "epoch": 2.770749990990017, + "grad_norm": 0.2192516326904297, + "learning_rate": 2.185444054143731e-05, + "loss": 0.4253, "step": 76880 }, { - "epoch": 2.71, - "learning_rate": 2.2911759429295357e-05, - "loss": 0.274, + "epoch": 2.7709301906512414, + "grad_norm": 0.17714661359786987, + "learning_rate": 2.1851545620255816e-05, + "loss": 0.4122, "step": 76885 }, { - "epoch": 2.71, - "learning_rate": 2.2908920676987693e-05, - "loss": 0.2704, + "epoch": 2.771110390312466, + "grad_norm": 0.26212644577026367, + "learning_rate": 2.1848650741971395e-05, + "loss": 0.4195, "step": 76890 }, { - "epoch": 2.71, - "learning_rate": 2.2906081951831308e-05, - "loss": 0.2785, + "epoch": 2.771290589973691, + "grad_norm": 0.1727093756198883, + "learning_rate": 2.1845755906623498e-05, + "loss": 0.4085, "step": 76895 }, { - "epoch": 2.71, - "learning_rate": 2.2903243253863075e-05, - "loss": 0.2702, + "epoch": 2.7714707896349156, + "grad_norm": 0.18318147957324982, + "learning_rate": 2.184286111425157e-05, + "loss": 0.349, "step": 76900 }, { - "epoch": 2.71, - "learning_rate": 2.290040458311985e-05, - "loss": 0.2588, + "epoch": 2.7716509892961403, + "grad_norm": 0.23283013701438904, + "learning_rate": 2.1839966364895042e-05, + "loss": 0.3714, "step": 76905 }, { - "epoch": 2.71, - "learning_rate": 2.2897565939638487e-05, - "loss": 0.2606, 
+ "epoch": 2.771831188957365, + "grad_norm": 0.20274753868579865, + "learning_rate": 2.183707165859336e-05, + "loss": 0.3953, "step": 76910 }, { - "epoch": 2.71, - "learning_rate": 2.2894727323455835e-05, - "loss": 0.2743, + "epoch": 2.7720113886185893, + "grad_norm": 0.25053125619888306, + "learning_rate": 2.183417699538597e-05, + "loss": 0.4063, "step": 76915 }, { - "epoch": 2.71, - "learning_rate": 2.2891888734608775e-05, - "loss": 0.249, + "epoch": 2.772191588279814, + "grad_norm": 0.17475423216819763, + "learning_rate": 2.18312823753123e-05, + "loss": 0.386, "step": 76920 }, { - "epoch": 2.71, - "learning_rate": 2.288905017313414e-05, - "loss": 0.2775, + "epoch": 2.772371787941039, + "grad_norm": 0.19454878568649292, + "learning_rate": 2.18283877984118e-05, + "loss": 0.382, "step": 76925 }, { - "epoch": 2.71, - "learning_rate": 2.2886211639068806e-05, - "loss": 0.2406, + "epoch": 2.772551987602263, + "grad_norm": 0.2621554434299469, + "learning_rate": 2.1825493264723902e-05, + "loss": 0.4036, "step": 76930 }, { - "epoch": 2.71, - "learning_rate": 2.2883373132449608e-05, - "loss": 0.2914, + "epoch": 2.772732187263488, + "grad_norm": 0.211991548538208, + "learning_rate": 2.1822598774288034e-05, + "loss": 0.3894, "step": 76935 }, { - "epoch": 2.71, - "learning_rate": 2.2880534653313428e-05, - "loss": 0.2439, + "epoch": 2.7729123869247125, + "grad_norm": 0.20913714170455933, + "learning_rate": 2.181970432714365e-05, + "loss": 0.3712, "step": 76940 }, { - "epoch": 2.71, - "learning_rate": 2.2877696201697107e-05, - "loss": 0.2857, + "epoch": 2.7730925865859373, + "grad_norm": 0.22146856784820557, + "learning_rate": 2.1816809923330188e-05, + "loss": 0.3821, "step": 76945 }, { - "epoch": 2.71, - "learning_rate": 2.28748577776375e-05, - "loss": 0.2752, + "epoch": 2.773272786247162, + "grad_norm": 0.16859367489814758, + "learning_rate": 2.1813915562887054e-05, + "loss": 0.4161, "step": 76950 }, { - "epoch": 2.71, - "learning_rate": 2.287201938117146e-05, - "loss": 0.2735, + "epoch": 2.7734529859083867, + "grad_norm": 0.2146807461977005, + "learning_rate": 2.1811021245853724e-05, + "loss": 0.4039, "step": 76955 }, { - "epoch": 2.71, - "learning_rate": 2.2869181012335848e-05, - "loss": 0.2812, + "epoch": 2.773633185569611, + "grad_norm": 0.23717908561229706, + "learning_rate": 2.1808126972269594e-05, + "loss": 0.403, "step": 76960 }, { - "epoch": 2.71, - "learning_rate": 2.286634267116752e-05, - "loss": 0.254, + "epoch": 2.7738133852308358, + "grad_norm": 0.23532505333423615, + "learning_rate": 2.180523274217413e-05, + "loss": 0.3809, "step": 76965 }, { - "epoch": 2.71, - "learning_rate": 2.2863504357703328e-05, - "loss": 0.2942, + "epoch": 2.7739935848920605, + "grad_norm": 0.21510958671569824, + "learning_rate": 2.180233855560675e-05, + "loss": 0.3985, "step": 76970 }, { - "epoch": 2.71, - "learning_rate": 2.286066607198011e-05, - "loss": 0.2571, + "epoch": 2.774173784553285, + "grad_norm": 0.21763652563095093, + "learning_rate": 2.1799444412606873e-05, + "loss": 0.3586, "step": 76975 }, { - "epoch": 2.71, - "learning_rate": 2.2857827814034743e-05, - "loss": 0.2972, + "epoch": 2.7743539842145095, + "grad_norm": 0.23686784505844116, + "learning_rate": 2.1796550313213963e-05, + "loss": 0.4179, "step": 76980 }, { - "epoch": 2.71, - "learning_rate": 2.2854989583904074e-05, - "loss": 0.2408, + "epoch": 2.7745341838757343, + "grad_norm": 0.1995023488998413, + "learning_rate": 2.1793656257467432e-05, + "loss": 0.4106, "step": 76985 }, { - "epoch": 2.71, - "learning_rate": 2.2852151381624952e-05, - 
"loss": 0.261, + "epoch": 2.774714383536959, + "grad_norm": 0.20156660676002502, + "learning_rate": 2.179076224540671e-05, + "loss": 0.3956, "step": 76990 }, { - "epoch": 2.71, - "learning_rate": 2.2849313207234217e-05, - "loss": 0.2507, + "epoch": 2.7748945831981837, + "grad_norm": 0.19944174587726593, + "learning_rate": 2.1787868277071234e-05, + "loss": 0.3759, "step": 76995 }, { - "epoch": 2.71, - "learning_rate": 2.2846475060768743e-05, - "loss": 0.2842, + "epoch": 2.7750747828594085, + "grad_norm": 0.22275997698307037, + "learning_rate": 2.1784974352500423e-05, + "loss": 0.4041, "step": 77000 }, { - "epoch": 2.71, - "eval_loss": 0.2610674202442169, - "eval_runtime": 10.5451, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 2.7750747828594085, + "eval_loss": 0.4315272569656372, + "eval_runtime": 3.5286, + "eval_samples_per_second": 28.34, + "eval_steps_per_second": 7.085, "step": 77000 }, { - "epoch": 2.71, - "learning_rate": 2.2843636942265367e-05, - "loss": 0.2581, + "epoch": 2.7752549825206327, + "grad_norm": 0.19483999907970428, + "learning_rate": 2.178208047173372e-05, + "loss": 0.411, "step": 77005 }, { - "epoch": 2.71, - "learning_rate": 2.284079885176094e-05, - "loss": 0.2934, + "epoch": 2.7754351821818575, + "grad_norm": 0.18567782640457153, + "learning_rate": 2.177918663481055e-05, + "loss": 0.3731, "step": 77010 }, { - "epoch": 2.71, - "learning_rate": 2.2837960789292322e-05, - "loss": 0.2504, + "epoch": 2.775615381843082, + "grad_norm": 0.22382299602031708, + "learning_rate": 2.1776292841770333e-05, + "loss": 0.4029, "step": 77015 }, { - "epoch": 2.71, - "learning_rate": 2.2835122754896367e-05, - "loss": 0.2727, + "epoch": 2.7757955815043065, + "grad_norm": 0.20525696873664856, + "learning_rate": 2.1773399092652507e-05, + "loss": 0.3781, "step": 77020 }, { - "epoch": 2.71, - "learning_rate": 2.283228474860991e-05, - "loss": 0.2551, + "epoch": 2.7759757811655312, + "grad_norm": 0.21928441524505615, + "learning_rate": 2.1770505387496494e-05, + "loss": 0.3852, "step": 77025 }, { - "epoch": 2.71, - "learning_rate": 2.2829446770469796e-05, - "loss": 0.2866, + "epoch": 2.776155980826756, + "grad_norm": 0.25740548968315125, + "learning_rate": 2.1767611726341714e-05, + "loss": 0.4268, "step": 77030 }, { - "epoch": 2.71, - "learning_rate": 2.28266088205129e-05, - "loss": 0.2693, + "epoch": 2.7763361804879807, + "grad_norm": 0.21519418060779572, + "learning_rate": 2.17647181092276e-05, + "loss": 0.4104, "step": 77035 }, { - "epoch": 2.71, - "learning_rate": 2.2823770898776047e-05, - "loss": 0.2849, + "epoch": 2.7765163801492054, + "grad_norm": 0.19074714183807373, + "learning_rate": 2.1761824536193575e-05, + "loss": 0.4052, "step": 77040 }, { - "epoch": 2.71, - "learning_rate": 2.28209330052961e-05, - "loss": 0.2797, + "epoch": 2.77669657981043, + "grad_norm": 0.20910242199897766, + "learning_rate": 2.1758931007279067e-05, + "loss": 0.3869, "step": 77045 }, { - "epoch": 2.71, - "learning_rate": 2.2818095140109892e-05, - "loss": 0.2796, + "epoch": 2.7768767794716545, + "grad_norm": 0.22091153264045715, + "learning_rate": 2.1756037522523503e-05, + "loss": 0.4058, "step": 77050 }, { - "epoch": 2.71, - "learning_rate": 2.2815257303254296e-05, - "loss": 0.2764, + "epoch": 2.777056979132879, + "grad_norm": 0.18852129578590393, + "learning_rate": 2.175314408196628e-05, + "loss": 0.4099, "step": 77055 }, { - "epoch": 2.71, - "learning_rate": 2.281241949476614e-05, - "loss": 0.2723, + "epoch": 2.777237178794104, + "grad_norm": 0.23839035630226135, + 
"learning_rate": 2.1750250685646863e-05, + "loss": 0.3927, "step": 77060 }, { - "epoch": 2.71, - "learning_rate": 2.2809581714682268e-05, - "loss": 0.2533, + "epoch": 2.7774173784553287, + "grad_norm": 0.20940519869327545, + "learning_rate": 2.174735733360464e-05, + "loss": 0.3632, "step": 77065 }, { - "epoch": 2.71, - "learning_rate": 2.280674396303953e-05, - "loss": 0.2869, + "epoch": 2.777597578116553, + "grad_norm": 0.24172858893871307, + "learning_rate": 2.174446402587904e-05, + "loss": 0.3759, "step": 77070 }, { - "epoch": 2.71, - "learning_rate": 2.280390623987479e-05, - "loss": 0.2614, + "epoch": 2.7777777777777777, + "grad_norm": 0.2583145201206207, + "learning_rate": 2.1741570762509495e-05, + "loss": 0.3658, "step": 77075 }, { - "epoch": 2.71, - "learning_rate": 2.2801068545224876e-05, - "loss": 0.2631, + "epoch": 2.7779579774390024, + "grad_norm": 0.21428237855434418, + "learning_rate": 2.173867754353541e-05, + "loss": 0.4038, "step": 77080 }, { - "epoch": 2.71, - "learning_rate": 2.2798230879126636e-05, - "loss": 0.2867, + "epoch": 2.778138177100227, + "grad_norm": 0.1586180180311203, + "learning_rate": 2.173578436899622e-05, + "loss": 0.3844, "step": 77085 }, { - "epoch": 2.71, - "learning_rate": 2.2795393241616913e-05, - "loss": 0.2628, + "epoch": 2.778318376761452, + "grad_norm": 0.22997400164604187, + "learning_rate": 2.1732891238931334e-05, + "loss": 0.3767, "step": 77090 }, { - "epoch": 2.71, - "learning_rate": 2.279255563273256e-05, - "loss": 0.2489, + "epoch": 2.778498576422676, + "grad_norm": 0.22127170860767365, + "learning_rate": 2.1729998153380165e-05, + "loss": 0.3797, "step": 77095 }, { - "epoch": 2.71, - "learning_rate": 2.2789718052510424e-05, - "loss": 0.2667, + "epoch": 2.778678776083901, + "grad_norm": 0.25849655270576477, + "learning_rate": 2.1727105112382147e-05, + "loss": 0.4181, "step": 77100 }, { - "epoch": 2.71, - "learning_rate": 2.2786880500987336e-05, - "loss": 0.2879, + "epoch": 2.7788589757451256, + "grad_norm": 0.18317921459674835, + "learning_rate": 2.172421211597668e-05, + "loss": 0.3667, "step": 77105 }, { - "epoch": 2.71, - "learning_rate": 2.2784042978200142e-05, - "loss": 0.2719, + "epoch": 2.7790391754063504, + "grad_norm": 0.21405282616615295, + "learning_rate": 2.1721319164203195e-05, + "loss": 0.4073, "step": 77110 }, { - "epoch": 2.71, - "learning_rate": 2.2781205484185695e-05, - "loss": 0.2512, + "epoch": 2.7792193750675747, + "grad_norm": 0.18047797679901123, + "learning_rate": 2.1718426257101103e-05, + "loss": 0.4006, "step": 77115 }, { - "epoch": 2.71, - "learning_rate": 2.2778368018980827e-05, - "loss": 0.2795, + "epoch": 2.7793995747287994, + "grad_norm": 0.1952604204416275, + "learning_rate": 2.1715533394709807e-05, + "loss": 0.3855, "step": 77120 }, { - "epoch": 2.71, - "learning_rate": 2.2775530582622385e-05, - "loss": 0.2752, + "epoch": 2.779579774390024, + "grad_norm": 0.18268941342830658, + "learning_rate": 2.1712640577068743e-05, + "loss": 0.435, "step": 77125 }, { - "epoch": 2.71, - "learning_rate": 2.277269317514722e-05, - "loss": 0.2545, + "epoch": 2.779759974051249, + "grad_norm": 0.23162731528282166, + "learning_rate": 2.1709747804217324e-05, + "loss": 0.3652, "step": 77130 }, { - "epoch": 2.71, - "learning_rate": 2.276985579659217e-05, - "loss": 0.2642, + "epoch": 2.7799401737124736, + "grad_norm": 0.22689977288246155, + "learning_rate": 2.170685507619493e-05, + "loss": 0.3839, "step": 77135 }, { - "epoch": 2.71, - "learning_rate": 2.276701844699407e-05, - "loss": 0.2255, + "epoch": 2.7801203733736983, + "grad_norm": 
0.2621477544307709, + "learning_rate": 2.1703962393041015e-05, + "loss": 0.4038, "step": 77140 }, { - "epoch": 2.71, - "learning_rate": 2.2764181126389753e-05, - "loss": 0.2544, + "epoch": 2.7803005730349226, + "grad_norm": 0.22769121825695038, + "learning_rate": 2.1701069754794966e-05, + "loss": 0.4086, "step": 77145 }, { - "epoch": 2.71, - "learning_rate": 2.2761343834816072e-05, - "loss": 0.2894, + "epoch": 2.7804807726961474, + "grad_norm": 0.21255040168762207, + "learning_rate": 2.1698177161496205e-05, + "loss": 0.4162, "step": 77150 }, { - "epoch": 2.71, - "learning_rate": 2.2758506572309878e-05, - "loss": 0.2732, + "epoch": 2.780660972357372, + "grad_norm": 0.1877208799123764, + "learning_rate": 2.1695284613184154e-05, + "loss": 0.3944, "step": 77155 }, { - "epoch": 2.71, - "learning_rate": 2.2755669338907997e-05, - "loss": 0.2722, + "epoch": 2.7808411720185964, + "grad_norm": 0.24076056480407715, + "learning_rate": 2.1692392109898185e-05, + "loss": 0.4121, "step": 77160 }, { - "epoch": 2.71, - "learning_rate": 2.275283213464726e-05, - "loss": 0.2762, + "epoch": 2.781021371679821, + "grad_norm": 0.22234618663787842, + "learning_rate": 2.1689499651677754e-05, + "loss": 0.3533, "step": 77165 }, { - "epoch": 2.72, - "learning_rate": 2.2749994959564528e-05, - "loss": 0.2824, + "epoch": 2.781201571341046, + "grad_norm": 0.21065817773342133, + "learning_rate": 2.1686607238562245e-05, + "loss": 0.3904, "step": 77170 }, { - "epoch": 2.72, - "learning_rate": 2.274715781369662e-05, - "loss": 0.2778, + "epoch": 2.7813817710022706, + "grad_norm": 0.24153132736682892, + "learning_rate": 2.1683714870591067e-05, + "loss": 0.4196, "step": 77175 }, { - "epoch": 2.72, - "learning_rate": 2.2744320697080394e-05, - "loss": 0.2657, + "epoch": 2.7815619706634953, + "grad_norm": 0.25936198234558105, + "learning_rate": 2.1680822547803635e-05, + "loss": 0.4077, "step": 77180 }, { - "epoch": 2.72, - "learning_rate": 2.274148360975266e-05, - "loss": 0.2686, + "epoch": 2.78174217032472, + "grad_norm": 0.257414847612381, + "learning_rate": 2.1677930270239343e-05, + "loss": 0.4039, "step": 77185 }, { - "epoch": 2.72, - "learning_rate": 2.2738646551750288e-05, - "loss": 0.2729, + "epoch": 2.7819223699859443, + "grad_norm": 0.17230790853500366, + "learning_rate": 2.167503803793762e-05, + "loss": 0.3765, "step": 77190 }, { - "epoch": 2.72, - "learning_rate": 2.27358095231101e-05, - "loss": 0.2537, + "epoch": 2.782102569647169, + "grad_norm": 0.21409516036510468, + "learning_rate": 2.167214585093786e-05, + "loss": 0.3749, "step": 77195 }, { - "epoch": 2.72, - "learning_rate": 2.2732972523868926e-05, - "loss": 0.2587, + "epoch": 2.782282769308394, + "grad_norm": 0.17981880903244019, + "learning_rate": 2.1669253709279458e-05, + "loss": 0.4021, "step": 77200 }, { - "epoch": 2.72, - "learning_rate": 2.2730135554063607e-05, - "loss": 0.2661, + "epoch": 2.782462968969618, + "grad_norm": 0.1959773600101471, + "learning_rate": 2.166636161300184e-05, + "loss": 0.4146, "step": 77205 }, { - "epoch": 2.72, - "learning_rate": 2.272729861373099e-05, - "loss": 0.2629, + "epoch": 2.782643168630843, + "grad_norm": 0.30680885910987854, + "learning_rate": 2.1663469562144395e-05, + "loss": 0.3919, "step": 77210 }, { - "epoch": 2.72, - "learning_rate": 2.2724461702907905e-05, - "loss": 0.2707, + "epoch": 2.7828233682920676, + "grad_norm": 0.2044559270143509, + "learning_rate": 2.1660577556746527e-05, + "loss": 0.391, "step": 77215 }, { - "epoch": 2.72, - "learning_rate": 2.2721624821631183e-05, - "loss": 0.2674, + "epoch": 
2.7830035679532923, + "grad_norm": 0.2195388227701187, + "learning_rate": 2.165768559684765e-05, + "loss": 0.395, "step": 77220 }, { - "epoch": 2.72, - "learning_rate": 2.271878796993765e-05, - "loss": 0.2789, + "epoch": 2.783183767614517, + "grad_norm": 0.2089926302433014, + "learning_rate": 2.1654793682487157e-05, + "loss": 0.3878, "step": 77225 }, { - "epoch": 2.72, - "learning_rate": 2.271595114786416e-05, - "loss": 0.2739, + "epoch": 2.7833639672757418, + "grad_norm": 0.21790191531181335, + "learning_rate": 2.1651901813704452e-05, + "loss": 0.4098, "step": 77230 }, { - "epoch": 2.72, - "learning_rate": 2.271311435544754e-05, - "loss": 0.2704, + "epoch": 2.783544166936966, + "grad_norm": 0.22714762389659882, + "learning_rate": 2.1649009990538947e-05, + "loss": 0.3506, "step": 77235 }, { - "epoch": 2.72, - "learning_rate": 2.2710277592724616e-05, - "loss": 0.2693, + "epoch": 2.783724366598191, + "grad_norm": 0.18509992957115173, + "learning_rate": 2.1646118213030015e-05, + "loss": 0.3868, "step": 77240 }, { - "epoch": 2.72, - "learning_rate": 2.270744085973224e-05, - "loss": 0.2677, + "epoch": 2.7839045662594155, + "grad_norm": 0.1982908993959427, + "learning_rate": 2.1643226481217084e-05, + "loss": 0.4092, "step": 77245 }, { - "epoch": 2.72, - "learning_rate": 2.270460415650723e-05, - "loss": 0.2619, + "epoch": 2.78408476592064, + "grad_norm": 0.1664113849401474, + "learning_rate": 2.1640334795139545e-05, + "loss": 0.3934, "step": 77250 }, { - "epoch": 2.72, - "learning_rate": 2.270176748308642e-05, - "loss": 0.267, + "epoch": 2.7842649655818645, + "grad_norm": 0.22491081058979034, + "learning_rate": 2.163744315483678e-05, + "loss": 0.4087, "step": 77255 }, { - "epoch": 2.72, - "learning_rate": 2.269893083950664e-05, - "loss": 0.2798, + "epoch": 2.7844451652430893, + "grad_norm": 0.22320492565631866, + "learning_rate": 2.1634551560348213e-05, + "loss": 0.4185, "step": 77260 }, { - "epoch": 2.72, - "learning_rate": 2.2696094225804735e-05, - "loss": 0.2499, + "epoch": 2.784625364904314, + "grad_norm": 0.1935398429632187, + "learning_rate": 2.1631660011713218e-05, + "loss": 0.3915, "step": 77265 }, { - "epoch": 2.72, - "learning_rate": 2.269325764201753e-05, - "loss": 0.2957, + "epoch": 2.7848055645655387, + "grad_norm": 0.205779567360878, + "learning_rate": 2.1628768508971213e-05, + "loss": 0.4085, "step": 77270 }, { - "epoch": 2.72, - "learning_rate": 2.2690421088181852e-05, - "loss": 0.2505, + "epoch": 2.7849857642267635, + "grad_norm": 0.19906838238239288, + "learning_rate": 2.1625877052161586e-05, + "loss": 0.4113, "step": 77275 }, { - "epoch": 2.72, - "learning_rate": 2.2687584564334524e-05, - "loss": 0.2756, + "epoch": 2.7851659638879878, + "grad_norm": 0.23320987820625305, + "learning_rate": 2.1622985641323724e-05, + "loss": 0.3884, "step": 77280 }, { - "epoch": 2.72, - "learning_rate": 2.268474807051239e-05, - "loss": 0.2733, + "epoch": 2.7853461635492125, + "grad_norm": 0.22347044944763184, + "learning_rate": 2.1620094276497033e-05, + "loss": 0.3888, "step": 77285 }, { - "epoch": 2.72, - "learning_rate": 2.2681911606752286e-05, - "loss": 0.2585, + "epoch": 2.7855263632104372, + "grad_norm": 0.27736568450927734, + "learning_rate": 2.16172029577209e-05, + "loss": 0.4216, "step": 77290 }, { - "epoch": 2.72, - "learning_rate": 2.267907517309103e-05, - "loss": 0.279, + "epoch": 2.785706562871662, + "grad_norm": 0.2748807668685913, + "learning_rate": 2.1614311685034726e-05, + "loss": 0.4152, "step": 77295 }, { - "epoch": 2.72, - "learning_rate": 2.267623876956544e-05, - "loss": 
0.2715, + "epoch": 2.7858867625328863, + "grad_norm": 0.2487899661064148, + "learning_rate": 2.16114204584779e-05, + "loss": 0.4088, "step": 77300 }, { - "epoch": 2.72, - "learning_rate": 2.2673402396212374e-05, - "loss": 0.2629, + "epoch": 2.786066962194111, + "grad_norm": 0.21085642278194427, + "learning_rate": 2.1608529278089808e-05, + "loss": 0.3912, "step": 77305 }, { - "epoch": 2.72, - "learning_rate": 2.2670566053068637e-05, - "loss": 0.2589, + "epoch": 2.7862471618553357, + "grad_norm": 0.19442741572856903, + "learning_rate": 2.160563814390985e-05, + "loss": 0.3965, "step": 77310 }, { - "epoch": 2.72, - "learning_rate": 2.266772974017107e-05, - "loss": 0.2688, + "epoch": 2.7864273615165605, + "grad_norm": 0.19160184264183044, + "learning_rate": 2.1602747055977417e-05, + "loss": 0.37, "step": 77315 }, { - "epoch": 2.72, - "learning_rate": 2.2664893457556486e-05, - "loss": 0.2875, + "epoch": 2.786607561177785, + "grad_norm": 0.2595144212245941, + "learning_rate": 2.1599856014331895e-05, + "loss": 0.4168, "step": 77320 }, { - "epoch": 2.72, - "learning_rate": 2.266205720526173e-05, - "loss": 0.2793, + "epoch": 2.7867877608390095, + "grad_norm": 0.23235271871089935, + "learning_rate": 2.1596965019012682e-05, + "loss": 0.4176, "step": 77325 }, { - "epoch": 2.72, - "learning_rate": 2.265922098332362e-05, - "loss": 0.2463, + "epoch": 2.786967960500234, + "grad_norm": 0.1904749721288681, + "learning_rate": 2.1594074070059155e-05, + "loss": 0.3688, "step": 77330 }, { - "epoch": 2.72, - "learning_rate": 2.2656384791778977e-05, - "loss": 0.2618, + "epoch": 2.787148160161459, + "grad_norm": 0.22936037182807922, + "learning_rate": 2.1591183167510714e-05, + "loss": 0.3988, "step": 77335 }, { - "epoch": 2.72, - "learning_rate": 2.2653548630664632e-05, - "loss": 0.2857, + "epoch": 2.7873283598226837, + "grad_norm": 0.21783271431922913, + "learning_rate": 2.158829231140675e-05, + "loss": 0.4026, "step": 77340 }, { - "epoch": 2.72, - "learning_rate": 2.265071250001742e-05, - "loss": 0.2459, + "epoch": 2.787508559483908, + "grad_norm": 0.193325474858284, + "learning_rate": 2.1585401501786622e-05, + "loss": 0.4321, "step": 77345 }, { - "epoch": 2.72, - "learning_rate": 2.2647876399874156e-05, - "loss": 0.2834, + "epoch": 2.7876887591451327, + "grad_norm": 0.24753624200820923, + "learning_rate": 2.158251073868976e-05, + "loss": 0.4161, "step": 77350 }, { - "epoch": 2.72, - "learning_rate": 2.2645040330271658e-05, - "loss": 0.2687, + "epoch": 2.7878689588063574, + "grad_norm": 0.18600989878177643, + "learning_rate": 2.1579620022155516e-05, + "loss": 0.3775, "step": 77355 }, { - "epoch": 2.72, - "learning_rate": 2.2642204291246772e-05, - "loss": 0.271, + "epoch": 2.788049158467582, + "grad_norm": 0.19241438806056976, + "learning_rate": 2.1576729352223285e-05, + "loss": 0.395, "step": 77360 }, { - "epoch": 2.72, - "learning_rate": 2.26393682828363e-05, - "loss": 0.2495, + "epoch": 2.788229358128807, + "grad_norm": 0.2080182284116745, + "learning_rate": 2.1573838728932462e-05, + "loss": 0.3651, "step": 77365 }, { - "epoch": 2.72, - "learning_rate": 2.2636532305077083e-05, - "loss": 0.2624, + "epoch": 2.788409557790031, + "grad_norm": 0.23465201258659363, + "learning_rate": 2.157094815232241e-05, + "loss": 0.3976, "step": 77370 }, { - "epoch": 2.72, - "learning_rate": 2.2633696358005928e-05, - "loss": 0.2692, + "epoch": 2.788589757451256, + "grad_norm": 0.21152657270431519, + "learning_rate": 2.1568057622432536e-05, + "loss": 0.4053, "step": 77375 }, { - "epoch": 2.72, - "learning_rate": 
2.263086044165967e-05, - "loss": 0.2721, + "epoch": 2.7887699571124807, + "grad_norm": 0.20607246458530426, + "learning_rate": 2.1565167139302213e-05, + "loss": 0.3984, "step": 77380 }, { - "epoch": 2.72, - "learning_rate": 2.2628024556075134e-05, - "loss": 0.2617, + "epoch": 2.7889501567737054, + "grad_norm": 0.20786413550376892, + "learning_rate": 2.1562276702970816e-05, + "loss": 0.3772, "step": 77385 }, { - "epoch": 2.72, - "learning_rate": 2.262518870128913e-05, - "loss": 0.246, + "epoch": 2.7891303564349297, + "grad_norm": 0.21368098258972168, + "learning_rate": 2.155938631347774e-05, + "loss": 0.3946, "step": 77390 }, { - "epoch": 2.72, - "learning_rate": 2.2622352877338482e-05, - "loss": 0.274, + "epoch": 2.7893105560961544, + "grad_norm": 0.22205038368701935, + "learning_rate": 2.1556495970862356e-05, + "loss": 0.3638, "step": 77395 }, { - "epoch": 2.72, - "learning_rate": 2.2619517084260018e-05, - "loss": 0.2585, + "epoch": 2.789490755757379, + "grad_norm": 0.22499975562095642, + "learning_rate": 2.1553605675164047e-05, + "loss": 0.4032, "step": 77400 }, { - "epoch": 2.72, - "learning_rate": 2.261668132209056e-05, - "loss": 0.2608, + "epoch": 2.789670955418604, + "grad_norm": 0.2418082356452942, + "learning_rate": 2.1550715426422194e-05, + "loss": 0.3953, "step": 77405 }, { - "epoch": 2.72, - "learning_rate": 2.2613845590866924e-05, - "loss": 0.2521, + "epoch": 2.7898511550798286, + "grad_norm": 0.19211691617965698, + "learning_rate": 2.1547825224676173e-05, + "loss": 0.3994, "step": 77410 }, { - "epoch": 2.72, - "learning_rate": 2.2611009890625924e-05, - "loss": 0.2686, + "epoch": 2.7900313547410533, + "grad_norm": 0.2310890257358551, + "learning_rate": 2.154493506996537e-05, + "loss": 0.3642, "step": 77415 }, { - "epoch": 2.72, - "learning_rate": 2.2608174221404384e-05, - "loss": 0.2489, + "epoch": 2.7902115544022776, + "grad_norm": 0.24673838913440704, + "learning_rate": 2.1542044962329163e-05, + "loss": 0.4185, "step": 77420 }, { - "epoch": 2.72, - "learning_rate": 2.2605338583239138e-05, - "loss": 0.2762, + "epoch": 2.7903917540635024, + "grad_norm": 0.2144293338060379, + "learning_rate": 2.153915490180691e-05, + "loss": 0.4018, "step": 77425 }, { - "epoch": 2.72, - "learning_rate": 2.260250297616699e-05, - "loss": 0.249, + "epoch": 2.790571953724727, + "grad_norm": 0.1878860741853714, + "learning_rate": 2.1536264888438005e-05, + "loss": 0.3942, "step": 77430 }, { - "epoch": 2.72, - "learning_rate": 2.2599667400224748e-05, - "loss": 0.2607, + "epoch": 2.7907521533859514, + "grad_norm": 0.2370370328426361, + "learning_rate": 2.1533374922261835e-05, + "loss": 0.3942, "step": 77435 }, { - "epoch": 2.72, - "learning_rate": 2.2596831855449255e-05, - "loss": 0.2503, + "epoch": 2.790932353047176, + "grad_norm": 0.20887020230293274, + "learning_rate": 2.153048500331774e-05, + "loss": 0.4095, "step": 77440 }, { - "epoch": 2.72, - "learning_rate": 2.259399634187731e-05, - "loss": 0.2347, + "epoch": 2.791112552708401, + "grad_norm": 0.21834127604961395, + "learning_rate": 2.152759513164513e-05, + "loss": 0.379, "step": 77445 }, { - "epoch": 2.72, - "learning_rate": 2.2591160859545747e-05, - "loss": 0.2706, + "epoch": 2.7912927523696256, + "grad_norm": 0.16670918464660645, + "learning_rate": 2.1524705307283348e-05, + "loss": 0.3774, "step": 77450 }, { - "epoch": 2.73, - "learning_rate": 2.2588325408491356e-05, - "loss": 0.2669, + "epoch": 2.7914729520308503, + "grad_norm": 0.20966099202632904, + "learning_rate": 2.15218155302718e-05, + "loss": 0.3779, "step": 77455 }, { - "epoch": 2.73, 
- "learning_rate": 2.2585489988750984e-05, - "loss": 0.2725, + "epoch": 2.791653151692075, + "grad_norm": 0.1819358766078949, + "learning_rate": 2.1518925800649836e-05, + "loss": 0.4119, "step": 77460 }, { - "epoch": 2.73, - "learning_rate": 2.2582654600361433e-05, - "loss": 0.2924, + "epoch": 2.7918333513532994, + "grad_norm": 0.22907419502735138, + "learning_rate": 2.151603611845683e-05, + "loss": 0.3981, "step": 77465 }, { - "epoch": 2.73, - "learning_rate": 2.2579819243359507e-05, - "loss": 0.2495, + "epoch": 2.792013551014524, + "grad_norm": 0.2390015870332718, + "learning_rate": 2.1513146483732163e-05, + "loss": 0.3907, "step": 77470 }, { - "epoch": 2.73, - "learning_rate": 2.2576983917782046e-05, - "loss": 0.267, + "epoch": 2.792193750675749, + "grad_norm": 0.20397347211837769, + "learning_rate": 2.1510256896515195e-05, + "loss": 0.3836, "step": 77475 }, { - "epoch": 2.73, - "learning_rate": 2.2574148623665845e-05, - "loss": 0.2731, + "epoch": 2.792373950336973, + "grad_norm": 0.2034919261932373, + "learning_rate": 2.1507367356845304e-05, + "loss": 0.398, "step": 77480 }, { - "epoch": 2.73, - "learning_rate": 2.2571313361047734e-05, - "loss": 0.2742, + "epoch": 2.792554149998198, + "grad_norm": 0.1811286211013794, + "learning_rate": 2.150447786476186e-05, + "loss": 0.3666, "step": 77485 }, { - "epoch": 2.73, - "learning_rate": 2.2568478129964503e-05, - "loss": 0.2896, + "epoch": 2.7927343496594226, + "grad_norm": 0.23901815712451935, + "learning_rate": 2.1501588420304218e-05, + "loss": 0.4266, "step": 77490 }, { - "epoch": 2.73, - "learning_rate": 2.256564293045299e-05, - "loss": 0.2554, + "epoch": 2.7929145493206473, + "grad_norm": 0.23680728673934937, + "learning_rate": 2.1498699023511766e-05, + "loss": 0.419, "step": 77495 }, { - "epoch": 2.73, - "learning_rate": 2.2562807762550006e-05, - "loss": 0.2943, + "epoch": 2.793094748981872, + "grad_norm": 0.22944191098213196, + "learning_rate": 2.1495809674423865e-05, + "loss": 0.3911, "step": 77500 }, { - "epoch": 2.73, - "eval_loss": 0.2611740827560425, - "eval_runtime": 10.5438, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 2.793094748981872, + "eval_loss": 0.4314689338207245, + "eval_runtime": 3.521, + "eval_samples_per_second": 28.401, + "eval_steps_per_second": 7.1, "step": 77500 }, { - "epoch": 2.73, - "learning_rate": 2.255997262629235e-05, - "loss": 0.2799, + "epoch": 2.7932749486430968, + "grad_norm": 0.22808948159217834, + "learning_rate": 2.1492920373079867e-05, + "loss": 0.3854, "step": 77505 }, { - "epoch": 2.73, - "learning_rate": 2.2557137521716834e-05, - "loss": 0.2513, + "epoch": 2.793455148304321, + "grad_norm": 0.17345234751701355, + "learning_rate": 2.149003111951916e-05, + "loss": 0.3919, "step": 77510 }, { - "epoch": 2.73, - "learning_rate": 2.255430244886029e-05, - "loss": 0.2622, + "epoch": 2.793635347965546, + "grad_norm": 0.18954497575759888, + "learning_rate": 2.148714191378109e-05, + "loss": 0.3631, "step": 77515 }, { - "epoch": 2.73, - "learning_rate": 2.2551467407759515e-05, - "loss": 0.2908, + "epoch": 2.7938155476267705, + "grad_norm": 0.28941574692726135, + "learning_rate": 2.148425275590504e-05, + "loss": 0.396, "step": 77520 }, { - "epoch": 2.73, - "learning_rate": 2.2548632398451316e-05, - "loss": 0.2826, + "epoch": 2.793995747287995, + "grad_norm": 0.2047412395477295, + "learning_rate": 2.1481363645930367e-05, + "loss": 0.3768, "step": 77525 }, { - "epoch": 2.73, - "learning_rate": 2.2545797420972514e-05, - "loss": 0.2632, + "epoch": 2.7941759469492196, + 
"grad_norm": 0.17408131062984467, + "learning_rate": 2.147847458389642e-05, + "loss": 0.409, "step": 77530 }, { - "epoch": 2.73, - "learning_rate": 2.254296247535991e-05, - "loss": 0.2727, + "epoch": 2.7943561466104443, + "grad_norm": 0.23604151606559753, + "learning_rate": 2.147558556984259e-05, + "loss": 0.3933, "step": 77535 }, { - "epoch": 2.73, - "learning_rate": 2.2540127561650327e-05, - "loss": 0.2617, + "epoch": 2.794536346271669, + "grad_norm": 0.22431814670562744, + "learning_rate": 2.147269660380822e-05, + "loss": 0.4178, "step": 77540 }, { - "epoch": 2.73, - "learning_rate": 2.2537292679880567e-05, - "loss": 0.2768, + "epoch": 2.7947165459328938, + "grad_norm": 0.2082727700471878, + "learning_rate": 2.1469807685832672e-05, + "loss": 0.3803, "step": 77545 }, { - "epoch": 2.73, - "learning_rate": 2.2534457830087428e-05, - "loss": 0.2644, + "epoch": 2.7948967455941185, + "grad_norm": 0.2045706808567047, + "learning_rate": 2.146691881595531e-05, + "loss": 0.401, "step": 77550 }, { - "epoch": 2.73, - "learning_rate": 2.253162301230774e-05, - "loss": 0.2605, + "epoch": 2.795076945255343, + "grad_norm": 0.17748849093914032, + "learning_rate": 2.146402999421549e-05, + "loss": 0.3753, "step": 77555 }, { - "epoch": 2.73, - "learning_rate": 2.2528788226578292e-05, - "loss": 0.2768, + "epoch": 2.7952571449165675, + "grad_norm": 0.19480100274085999, + "learning_rate": 2.1461141220652593e-05, + "loss": 0.407, "step": 77560 }, { - "epoch": 2.73, - "learning_rate": 2.252595347293591e-05, - "loss": 0.2968, + "epoch": 2.7954373445777922, + "grad_norm": 0.2130713313817978, + "learning_rate": 2.1458252495305954e-05, + "loss": 0.368, "step": 77565 }, { - "epoch": 2.73, - "learning_rate": 2.252311875141738e-05, - "loss": 0.2626, + "epoch": 2.795617544239017, + "grad_norm": 0.22572968900203705, + "learning_rate": 2.145536381821493e-05, + "loss": 0.396, "step": 77570 }, { - "epoch": 2.73, - "learning_rate": 2.2520284062059527e-05, - "loss": 0.2552, + "epoch": 2.7957977439002413, + "grad_norm": 0.21448364853858948, + "learning_rate": 2.14524751894189e-05, + "loss": 0.3612, "step": 77575 }, { - "epoch": 2.73, - "learning_rate": 2.2517449404899153e-05, - "loss": 0.2794, + "epoch": 2.795977943561466, + "grad_norm": 0.2889059782028198, + "learning_rate": 2.1449586608957207e-05, + "loss": 0.4111, "step": 77580 }, { - "epoch": 2.73, - "learning_rate": 2.2514614779973055e-05, - "loss": 0.2895, + "epoch": 2.7961581432226907, + "grad_norm": 0.23263846337795258, + "learning_rate": 2.1446698076869203e-05, + "loss": 0.4039, "step": 77585 }, { - "epoch": 2.73, - "learning_rate": 2.251178018731805e-05, - "loss": 0.2437, + "epoch": 2.7963383428839155, + "grad_norm": 0.22178532183170319, + "learning_rate": 2.1443809593194253e-05, + "loss": 0.4065, "step": 77590 }, { - "epoch": 2.73, - "learning_rate": 2.2508945626970945e-05, - "loss": 0.2697, + "epoch": 2.79651854254514, + "grad_norm": 0.19402441382408142, + "learning_rate": 2.1440921157971706e-05, + "loss": 0.3759, "step": 77595 }, { - "epoch": 2.73, - "learning_rate": 2.2506111098968537e-05, - "loss": 0.259, + "epoch": 2.7966987422063645, + "grad_norm": 0.22122874855995178, + "learning_rate": 2.1438032771240925e-05, + "loss": 0.4054, "step": 77600 }, { - "epoch": 2.73, - "learning_rate": 2.2503276603347626e-05, - "loss": 0.26, + "epoch": 2.7968789418675892, + "grad_norm": 0.2299935668706894, + "learning_rate": 2.143514443304126e-05, + "loss": 0.3833, "step": 77605 }, { - "epoch": 2.73, - "learning_rate": 2.250044214014503e-05, - "loss": 0.261, + "epoch": 
2.797059141528814, + "grad_norm": 0.23036040365695953, + "learning_rate": 2.143225614341205e-05, + "loss": 0.4046, "step": 77610 }, { - "epoch": 2.73, - "learning_rate": 2.2497607709397543e-05, - "loss": 0.2581, + "epoch": 2.7972393411900387, + "grad_norm": 0.21337628364562988, + "learning_rate": 2.1429367902392668e-05, + "loss": 0.3539, "step": 77615 }, { - "epoch": 2.73, - "learning_rate": 2.2494773311141974e-05, - "loss": 0.2515, + "epoch": 2.797419540851263, + "grad_norm": 0.2017783671617508, + "learning_rate": 2.1426479710022463e-05, + "loss": 0.3555, "step": 77620 }, { - "epoch": 2.73, - "learning_rate": 2.2491938945415114e-05, - "loss": 0.2459, + "epoch": 2.7975997405124877, + "grad_norm": 0.2635432183742523, + "learning_rate": 2.142359156634076e-05, + "loss": 0.4249, "step": 77625 }, { - "epoch": 2.73, - "learning_rate": 2.2489104612253784e-05, - "loss": 0.254, + "epoch": 2.7977799401737125, + "grad_norm": 0.1856364905834198, + "learning_rate": 2.1420703471386952e-05, + "loss": 0.41, "step": 77630 }, { - "epoch": 2.73, - "learning_rate": 2.2486270311694775e-05, - "loss": 0.2882, + "epoch": 2.797960139834937, + "grad_norm": 0.1753203123807907, + "learning_rate": 2.1417815425200346e-05, + "loss": 0.3796, "step": 77635 }, { - "epoch": 2.73, - "learning_rate": 2.2483436043774884e-05, - "loss": 0.274, + "epoch": 2.798140339496162, + "grad_norm": 0.23167075216770172, + "learning_rate": 2.141492742782033e-05, + "loss": 0.3789, "step": 77640 }, { - "epoch": 2.73, - "learning_rate": 2.2480601808530914e-05, - "loss": 0.2703, + "epoch": 2.7983205391573867, + "grad_norm": 0.17792221903800964, + "learning_rate": 2.141203947928623e-05, + "loss": 0.3429, "step": 77645 }, { - "epoch": 2.73, - "learning_rate": 2.2477767605999676e-05, - "loss": 0.2312, + "epoch": 2.798500738818611, + "grad_norm": 0.23163169622421265, + "learning_rate": 2.1409151579637386e-05, + "loss": 0.4213, "step": 77650 }, { - "epoch": 2.73, - "learning_rate": 2.2474933436217964e-05, - "loss": 0.2711, + "epoch": 2.7986809384798357, + "grad_norm": 0.21783442795276642, + "learning_rate": 2.1406263728913163e-05, + "loss": 0.3908, "step": 77655 }, { - "epoch": 2.73, - "learning_rate": 2.2472099299222583e-05, - "loss": 0.2628, + "epoch": 2.7988611381410604, + "grad_norm": 0.2356899380683899, + "learning_rate": 2.1403375927152903e-05, + "loss": 0.3945, "step": 77660 }, { - "epoch": 2.73, - "learning_rate": 2.2469265195050308e-05, - "loss": 0.2846, + "epoch": 2.7990413378022847, + "grad_norm": 0.23855198919773102, + "learning_rate": 2.1400488174395946e-05, + "loss": 0.4182, "step": 77665 }, { - "epoch": 2.73, - "learning_rate": 2.2466431123737968e-05, - "loss": 0.2754, + "epoch": 2.7992215374635094, + "grad_norm": 0.20778150856494904, + "learning_rate": 2.1397600470681643e-05, + "loss": 0.3909, "step": 77670 }, { - "epoch": 2.73, - "learning_rate": 2.2463597085322352e-05, - "loss": 0.2636, + "epoch": 2.799401737124734, + "grad_norm": 0.19535931944847107, + "learning_rate": 2.1394712816049332e-05, + "loss": 0.3873, "step": 77675 }, { - "epoch": 2.73, - "learning_rate": 2.2460763079840257e-05, - "loss": 0.2799, + "epoch": 2.799581936785959, + "grad_norm": 0.21138401329517365, + "learning_rate": 2.1391825210538366e-05, + "loss": 0.4423, "step": 77680 }, { - "epoch": 2.73, - "learning_rate": 2.2457929107328473e-05, - "loss": 0.2776, + "epoch": 2.7997621364471836, + "grad_norm": 0.25198328495025635, + "learning_rate": 2.138893765418808e-05, + "loss": 0.417, "step": 77685 }, { - "epoch": 2.73, - "learning_rate": 2.245509516782381e-05, - 
"loss": 0.2927, + "epoch": 2.7999423361084084, + "grad_norm": 0.1913152039051056, + "learning_rate": 2.138605014703782e-05, + "loss": 0.3844, "step": 77690 }, { - "epoch": 2.73, - "learning_rate": 2.2452261261363056e-05, - "loss": 0.2628, + "epoch": 2.8001225357696327, + "grad_norm": 0.19458575546741486, + "learning_rate": 2.138316268912693e-05, + "loss": 0.3818, "step": 77695 }, { - "epoch": 2.73, - "learning_rate": 2.2449427387983004e-05, - "loss": 0.2494, + "epoch": 2.8003027354308574, + "grad_norm": 0.25212234258651733, + "learning_rate": 2.138027528049474e-05, + "loss": 0.3994, "step": 77700 }, { - "epoch": 2.73, - "learning_rate": 2.244659354772047e-05, - "loss": 0.2591, + "epoch": 2.800482935092082, + "grad_norm": 0.19638745486736298, + "learning_rate": 2.137738792118061e-05, + "loss": 0.3779, "step": 77705 }, { - "epoch": 2.73, - "learning_rate": 2.2443759740612234e-05, - "loss": 0.2602, + "epoch": 2.8006631347533064, + "grad_norm": 0.2299245297908783, + "learning_rate": 2.1374500611223867e-05, + "loss": 0.4019, "step": 77710 }, { - "epoch": 2.73, - "learning_rate": 2.2440925966695093e-05, - "loss": 0.2782, + "epoch": 2.800843334414531, + "grad_norm": 0.2781182527542114, + "learning_rate": 2.1371613350663837e-05, + "loss": 0.3782, "step": 77715 }, { - "epoch": 2.73, - "learning_rate": 2.2438092226005832e-05, - "loss": 0.267, + "epoch": 2.801023534075756, + "grad_norm": 0.2580137848854065, + "learning_rate": 2.1368726139539885e-05, + "loss": 0.4054, "step": 77720 }, { - "epoch": 2.73, - "learning_rate": 2.243525851858126e-05, - "loss": 0.2685, + "epoch": 2.8012037337369806, + "grad_norm": 0.2500860095024109, + "learning_rate": 2.1365838977891344e-05, + "loss": 0.4145, "step": 77725 }, { - "epoch": 2.73, - "learning_rate": 2.2432424844458168e-05, - "loss": 0.2624, + "epoch": 2.8013839333982054, + "grad_norm": 0.26811492443084717, + "learning_rate": 2.1362951865757523e-05, + "loss": 0.3767, "step": 77730 }, { - "epoch": 2.73, - "learning_rate": 2.2429591203673353e-05, - "loss": 0.2586, + "epoch": 2.80156413305943, + "grad_norm": 0.22250352799892426, + "learning_rate": 2.1360064803177793e-05, + "loss": 0.3912, "step": 77735 }, { - "epoch": 2.74, - "learning_rate": 2.242675759626359e-05, - "loss": 0.276, + "epoch": 2.8017443327206544, + "grad_norm": 0.16757109761238098, + "learning_rate": 2.1357177790191463e-05, + "loss": 0.3749, "step": 77740 }, { - "epoch": 2.74, - "learning_rate": 2.242392402226569e-05, - "loss": 0.2892, + "epoch": 2.801924532381879, + "grad_norm": 0.1849580705165863, + "learning_rate": 2.13542908268379e-05, + "loss": 0.3972, "step": 77745 }, { - "epoch": 2.74, - "learning_rate": 2.242109048171644e-05, - "loss": 0.2714, + "epoch": 2.802104732043104, + "grad_norm": 0.2086080014705658, + "learning_rate": 2.1351403913156403e-05, + "loss": 0.4013, "step": 77750 }, { - "epoch": 2.74, - "learning_rate": 2.241825697465263e-05, - "loss": 0.2524, + "epoch": 2.802284931704328, + "grad_norm": 0.22372741997241974, + "learning_rate": 2.1348517049186323e-05, + "loss": 0.4117, "step": 77755 }, { - "epoch": 2.74, - "learning_rate": 2.2415423501111046e-05, - "loss": 0.2848, + "epoch": 2.802465131365553, + "grad_norm": 0.18635831773281097, + "learning_rate": 2.1345630234966997e-05, + "loss": 0.3779, "step": 77760 }, { - "epoch": 2.74, - "learning_rate": 2.241259006112849e-05, - "loss": 0.2867, + "epoch": 2.8026453310267776, + "grad_norm": 0.19005261361598969, + "learning_rate": 2.134274347053775e-05, + "loss": 0.4248, "step": 77765 }, { - "epoch": 2.74, - "learning_rate": 
2.2409756654741753e-05, - "loss": 0.2579, + "epoch": 2.8028255306880023, + "grad_norm": 0.19459028542041779, + "learning_rate": 2.133985675593791e-05, + "loss": 0.4361, "step": 77770 }, { - "epoch": 2.74, - "learning_rate": 2.240692328198761e-05, - "loss": 0.2494, + "epoch": 2.803005730349227, + "grad_norm": 0.19851277768611908, + "learning_rate": 2.1336970091206814e-05, + "loss": 0.3814, "step": 77775 }, { - "epoch": 2.74, - "learning_rate": 2.2404089942902855e-05, - "loss": 0.2742, + "epoch": 2.803185930010452, + "grad_norm": 0.1843992918729782, + "learning_rate": 2.1334083476383794e-05, + "loss": 0.4244, "step": 77780 }, { - "epoch": 2.74, - "learning_rate": 2.240125663752429e-05, - "loss": 0.2523, + "epoch": 2.803366129671676, + "grad_norm": 0.24044600129127502, + "learning_rate": 2.133119691150818e-05, + "loss": 0.3871, "step": 77785 }, { - "epoch": 2.74, - "learning_rate": 2.2398423365888697e-05, - "loss": 0.2723, + "epoch": 2.803546329332901, + "grad_norm": 0.1807718724012375, + "learning_rate": 2.1328310396619296e-05, + "loss": 0.4099, "step": 77790 }, { - "epoch": 2.74, - "learning_rate": 2.239559012803286e-05, - "loss": 0.2681, + "epoch": 2.8037265289941256, + "grad_norm": 0.22799517214298248, + "learning_rate": 2.1325423931756463e-05, + "loss": 0.3899, "step": 77795 }, { - "epoch": 2.74, - "learning_rate": 2.239275692399356e-05, - "loss": 0.2579, + "epoch": 2.8039067286553503, + "grad_norm": 0.2450421303510666, + "learning_rate": 2.1322537516959026e-05, + "loss": 0.3654, "step": 77800 }, { - "epoch": 2.74, - "learning_rate": 2.23899237538076e-05, - "loss": 0.2741, + "epoch": 2.8040869283165746, + "grad_norm": 0.21578456461429596, + "learning_rate": 2.1319651152266313e-05, + "loss": 0.3766, "step": 77805 }, { - "epoch": 2.74, - "learning_rate": 2.2387090617511763e-05, - "loss": 0.2572, + "epoch": 2.8042671279777993, + "grad_norm": 0.22823040187358856, + "learning_rate": 2.1316764837717618e-05, + "loss": 0.4372, "step": 77810 }, { - "epoch": 2.74, - "learning_rate": 2.238425751514282e-05, - "loss": 0.2713, + "epoch": 2.804447327639024, + "grad_norm": 0.2035275250673294, + "learning_rate": 2.131387857335231e-05, + "loss": 0.3643, "step": 77815 }, { - "epoch": 2.74, - "learning_rate": 2.2381424446737585e-05, - "loss": 0.2708, + "epoch": 2.804627527300249, + "grad_norm": 0.23069792985916138, + "learning_rate": 2.1310992359209673e-05, + "loss": 0.3928, "step": 77820 }, { - "epoch": 2.74, - "learning_rate": 2.237859141233282e-05, - "loss": 0.2514, + "epoch": 2.8048077269614735, + "grad_norm": 0.18597543239593506, + "learning_rate": 2.130810619532907e-05, + "loss": 0.3802, "step": 77825 }, { - "epoch": 2.74, - "learning_rate": 2.237575841196532e-05, - "loss": 0.2613, + "epoch": 2.804987926622698, + "grad_norm": 0.14916902780532837, + "learning_rate": 2.1305220081749798e-05, + "loss": 0.3602, "step": 77830 }, { - "epoch": 2.74, - "learning_rate": 2.2372925445671866e-05, - "loss": 0.2606, + "epoch": 2.8051681262839225, + "grad_norm": 0.20437051355838776, + "learning_rate": 2.130233401851118e-05, + "loss": 0.3659, "step": 77835 }, { - "epoch": 2.74, - "learning_rate": 2.237009251348924e-05, - "loss": 0.2792, + "epoch": 2.8053483259451473, + "grad_norm": 0.25175103545188904, + "learning_rate": 2.1299448005652552e-05, + "loss": 0.3947, "step": 77840 }, { - "epoch": 2.74, - "learning_rate": 2.236725961545424e-05, - "loss": 0.2749, + "epoch": 2.805528525606372, + "grad_norm": 0.18190394341945648, + "learning_rate": 2.129656204321323e-05, + "loss": 0.3906, "step": 77845 }, { - "epoch": 2.74, - 
"learning_rate": 2.2364426751603637e-05, - "loss": 0.2729, + "epoch": 2.8057087252675963, + "grad_norm": 0.16877366602420807, + "learning_rate": 2.1293676131232526e-05, + "loss": 0.3896, "step": 77850 }, { - "epoch": 2.74, - "learning_rate": 2.23615939219742e-05, - "loss": 0.2763, + "epoch": 2.805888924928821, + "grad_norm": 0.19917549192905426, + "learning_rate": 2.1290790269749774e-05, + "loss": 0.3843, "step": 77855 }, { - "epoch": 2.74, - "learning_rate": 2.235876112660274e-05, - "loss": 0.2885, + "epoch": 2.8060691245900458, + "grad_norm": 0.23629820346832275, + "learning_rate": 2.128790445880428e-05, + "loss": 0.3683, "step": 77860 }, { - "epoch": 2.74, - "learning_rate": 2.235592836552603e-05, - "loss": 0.2678, + "epoch": 2.8062493242512705, + "grad_norm": 0.2526760697364807, + "learning_rate": 2.128501869843538e-05, + "loss": 0.3883, "step": 77865 }, { - "epoch": 2.74, - "learning_rate": 2.2353095638780845e-05, - "loss": 0.288, + "epoch": 2.8064295239124952, + "grad_norm": 0.22685180604457855, + "learning_rate": 2.1282132988682374e-05, + "loss": 0.3851, "step": 77870 }, { - "epoch": 2.74, - "learning_rate": 2.235026294640396e-05, - "loss": 0.2635, + "epoch": 2.8066097235737195, + "grad_norm": 0.24433378875255585, + "learning_rate": 2.1279247329584582e-05, + "loss": 0.4229, "step": 77875 }, { - "epoch": 2.74, - "learning_rate": 2.2347430288432172e-05, - "loss": 0.2696, + "epoch": 2.8067899232349443, + "grad_norm": 0.2599703371524811, + "learning_rate": 2.1276361721181332e-05, + "loss": 0.4136, "step": 77880 }, { - "epoch": 2.74, - "learning_rate": 2.2344597664902254e-05, - "loss": 0.2782, + "epoch": 2.806970122896169, + "grad_norm": 0.22800195217132568, + "learning_rate": 2.1273476163511924e-05, + "loss": 0.3937, "step": 77885 }, { - "epoch": 2.74, - "learning_rate": 2.2341765075850985e-05, - "loss": 0.2874, + "epoch": 2.8071503225573937, + "grad_norm": 0.26085367798805237, + "learning_rate": 2.127059065661569e-05, + "loss": 0.4329, "step": 77890 }, { - "epoch": 2.74, - "learning_rate": 2.2338932521315136e-05, - "loss": 0.2768, + "epoch": 2.807330522218618, + "grad_norm": 0.1912272721529007, + "learning_rate": 2.1267705200531933e-05, + "loss": 0.4017, "step": 77895 }, { - "epoch": 2.74, - "learning_rate": 2.2336100001331506e-05, - "loss": 0.2596, + "epoch": 2.8075107218798427, + "grad_norm": 0.2735271751880646, + "learning_rate": 2.126481979529997e-05, + "loss": 0.4036, "step": 77900 }, { - "epoch": 2.74, - "learning_rate": 2.2333267515936863e-05, - "loss": 0.2539, + "epoch": 2.8076909215410675, + "grad_norm": 0.2603655755519867, + "learning_rate": 2.1261934440959115e-05, + "loss": 0.4244, "step": 77905 }, { - "epoch": 2.74, - "learning_rate": 2.2330435065167975e-05, - "loss": 0.2689, + "epoch": 2.807871121202292, + "grad_norm": 0.21912328898906708, + "learning_rate": 2.1259049137548686e-05, + "loss": 0.4253, "step": 77910 }, { - "epoch": 2.74, - "learning_rate": 2.232760264906163e-05, - "loss": 0.256, + "epoch": 2.808051320863517, + "grad_norm": 0.19126076996326447, + "learning_rate": 2.1256163885107973e-05, + "loss": 0.4077, "step": 77915 }, { - "epoch": 2.74, - "learning_rate": 2.2324770267654603e-05, - "loss": 0.2695, + "epoch": 2.8082315205247417, + "grad_norm": 0.19802503287792206, + "learning_rate": 2.125327868367632e-05, + "loss": 0.3619, "step": 77920 }, { - "epoch": 2.74, - "learning_rate": 2.2321937920983675e-05, - "loss": 0.2547, + "epoch": 2.808411720185966, + "grad_norm": 0.2620820105075836, + "learning_rate": 2.1250393533293e-05, + "loss": 0.4068, "step": 77925 }, { - 
"epoch": 2.74, - "learning_rate": 2.2319105609085612e-05, - "loss": 0.2681, + "epoch": 2.8085919198471907, + "grad_norm": 0.269182413816452, + "learning_rate": 2.124750843399736e-05, + "loss": 0.4109, "step": 77930 }, { - "epoch": 2.74, - "learning_rate": 2.2316273331997204e-05, - "loss": 0.2799, + "epoch": 2.8087721195084154, + "grad_norm": 0.2248949408531189, + "learning_rate": 2.1244623385828687e-05, + "loss": 0.4336, "step": 77935 }, { - "epoch": 2.74, - "learning_rate": 2.2313441089755217e-05, - "loss": 0.2527, + "epoch": 2.8089523191696397, + "grad_norm": 0.2180756777524948, + "learning_rate": 2.1241738388826288e-05, + "loss": 0.3815, "step": 77940 }, { - "epoch": 2.74, - "learning_rate": 2.2310608882396425e-05, - "loss": 0.2718, + "epoch": 2.8091325188308645, + "grad_norm": 0.22202380001544952, + "learning_rate": 2.1238853443029476e-05, + "loss": 0.3617, "step": 77945 }, { - "epoch": 2.74, - "learning_rate": 2.2307776709957596e-05, - "loss": 0.2551, + "epoch": 2.809312718492089, + "grad_norm": 0.24584850668907166, + "learning_rate": 2.1235968548477564e-05, + "loss": 0.3859, "step": 77950 }, { - "epoch": 2.74, - "learning_rate": 2.2304944572475523e-05, - "loss": 0.2872, + "epoch": 2.809492918153314, + "grad_norm": 0.18127258121967316, + "learning_rate": 2.1233083705209845e-05, + "loss": 0.3916, "step": 77955 }, { - "epoch": 2.74, - "learning_rate": 2.230211246998697e-05, - "loss": 0.2838, + "epoch": 2.8096731178145387, + "grad_norm": 0.2721177637577057, + "learning_rate": 2.1230198913265635e-05, + "loss": 0.3872, "step": 77960 }, { - "epoch": 2.74, - "learning_rate": 2.2299280402528702e-05, - "loss": 0.275, + "epoch": 2.8098533174757634, + "grad_norm": 0.23304437100887299, + "learning_rate": 2.122731417268423e-05, + "loss": 0.4107, "step": 77965 }, { - "epoch": 2.74, - "learning_rate": 2.2296448370137498e-05, - "loss": 0.26, + "epoch": 2.8100335171369877, + "grad_norm": 0.21200516819953918, + "learning_rate": 2.122442948350494e-05, + "loss": 0.4215, "step": 77970 }, { - "epoch": 2.74, - "learning_rate": 2.229361637285013e-05, - "loss": 0.2651, + "epoch": 2.8102137167982124, + "grad_norm": 0.1965387612581253, + "learning_rate": 2.1221544845767074e-05, + "loss": 0.4209, "step": 77975 }, { - "epoch": 2.74, - "learning_rate": 2.229078441070338e-05, - "loss": 0.2758, + "epoch": 2.810393916459437, + "grad_norm": 0.19175685942173004, + "learning_rate": 2.121866025950992e-05, + "loss": 0.383, "step": 77980 }, { - "epoch": 2.74, - "learning_rate": 2.2287952483734004e-05, - "loss": 0.277, + "epoch": 2.8105741161206614, + "grad_norm": 0.2140149623155594, + "learning_rate": 2.1215775724772794e-05, + "loss": 0.4087, "step": 77985 }, { - "epoch": 2.74, - "learning_rate": 2.2285120591978766e-05, - "loss": 0.2619, + "epoch": 2.810754315781886, + "grad_norm": 0.239614799618721, + "learning_rate": 2.1212891241594996e-05, + "loss": 0.3841, "step": 77990 }, { - "epoch": 2.74, - "learning_rate": 2.228228873547446e-05, - "loss": 0.2605, + "epoch": 2.810934515443111, + "grad_norm": 0.24127493798732758, + "learning_rate": 2.1210006810015803e-05, + "loss": 0.375, "step": 77995 }, { - "epoch": 2.74, - "learning_rate": 2.227945691425784e-05, - "loss": 0.2698, + "epoch": 2.8111147151043356, + "grad_norm": 0.20837131142616272, + "learning_rate": 2.120712243007455e-05, + "loss": 0.4066, "step": 78000 }, { - "epoch": 2.74, - "eval_loss": 0.26091867685317993, - "eval_runtime": 10.5455, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 2.8111147151043356, + "eval_loss": 
0.4307841956615448, + "eval_runtime": 3.5376, + "eval_samples_per_second": 28.268, + "eval_steps_per_second": 7.067, "step": 78000 }, { - "epoch": 2.74, - "learning_rate": 2.2276625128365683e-05, - "loss": 0.2536, + "epoch": 2.8112949147655604, + "grad_norm": 0.21309497952461243, + "learning_rate": 2.1204238101810507e-05, + "loss": 0.3937, "step": 78005 }, { - "epoch": 2.74, - "learning_rate": 2.2273793377834744e-05, - "loss": 0.2584, + "epoch": 2.811475114426785, + "grad_norm": 0.23278546333312988, + "learning_rate": 2.1201353825262996e-05, + "loss": 0.4088, "step": 78010 }, { - "epoch": 2.74, - "learning_rate": 2.227096166270181e-05, - "loss": 0.251, + "epoch": 2.8116553140880094, + "grad_norm": 0.1590275913476944, + "learning_rate": 2.1198469600471308e-05, + "loss": 0.3719, "step": 78015 }, { - "epoch": 2.74, - "learning_rate": 2.2268129983003643e-05, - "loss": 0.2848, + "epoch": 2.811835513749234, + "grad_norm": 0.19892996549606323, + "learning_rate": 2.119558542747472e-05, + "loss": 0.3786, "step": 78020 }, { - "epoch": 2.75, - "learning_rate": 2.2265298338777003e-05, - "loss": 0.2954, + "epoch": 2.812015713410459, + "grad_norm": 0.22030261158943176, + "learning_rate": 2.1192701306312556e-05, + "loss": 0.3856, "step": 78025 }, { - "epoch": 2.75, - "learning_rate": 2.2262466730058652e-05, - "loss": 0.2684, + "epoch": 2.812195913071683, + "grad_norm": 0.2299405336380005, + "learning_rate": 2.11898172370241e-05, + "loss": 0.4055, "step": 78030 }, { - "epoch": 2.75, - "learning_rate": 2.2259635156885384e-05, - "loss": 0.2402, + "epoch": 2.812376112732908, + "grad_norm": 0.22635617852210999, + "learning_rate": 2.1186933219648636e-05, + "loss": 0.3831, "step": 78035 }, { - "epoch": 2.75, - "learning_rate": 2.225680361929394e-05, - "loss": 0.2574, + "epoch": 2.8125563123941326, + "grad_norm": 0.1920858919620514, + "learning_rate": 2.1184049254225477e-05, + "loss": 0.3861, "step": 78040 }, { - "epoch": 2.75, - "learning_rate": 2.2253972117321088e-05, - "loss": 0.2703, + "epoch": 2.8127365120553574, + "grad_norm": 0.256536066532135, + "learning_rate": 2.1181165340793902e-05, + "loss": 0.4304, "step": 78045 }, { - "epoch": 2.75, - "learning_rate": 2.2251140651003608e-05, - "loss": 0.2725, + "epoch": 2.812916711716582, + "grad_norm": 0.1779921054840088, + "learning_rate": 2.1178281479393213e-05, + "loss": 0.3844, "step": 78050 }, { - "epoch": 2.75, - "learning_rate": 2.224830922037825e-05, - "loss": 0.2775, + "epoch": 2.813096911377807, + "grad_norm": 0.20656806230545044, + "learning_rate": 2.11753976700627e-05, + "loss": 0.3907, "step": 78055 }, { - "epoch": 2.75, - "learning_rate": 2.2245477825481786e-05, - "loss": 0.2936, + "epoch": 2.813277111039031, + "grad_norm": 0.2381805181503296, + "learning_rate": 2.117251391284165e-05, + "loss": 0.4102, "step": 78060 }, { - "epoch": 2.75, - "learning_rate": 2.2242646466350966e-05, - "loss": 0.2614, + "epoch": 2.813457310700256, + "grad_norm": 0.2606278657913208, + "learning_rate": 2.1169630207769366e-05, + "loss": 0.3904, "step": 78065 }, { - "epoch": 2.75, - "learning_rate": 2.223981514302258e-05, - "loss": 0.2573, + "epoch": 2.8136375103614806, + "grad_norm": 0.24265384674072266, + "learning_rate": 2.116674655488512e-05, + "loss": 0.3964, "step": 78070 }, { - "epoch": 2.75, - "learning_rate": 2.223698385553337e-05, - "loss": 0.2834, + "epoch": 2.8138177100227053, + "grad_norm": 0.20753905177116394, + "learning_rate": 2.116386295422821e-05, + "loss": 0.395, "step": 78075 }, { - "epoch": 2.75, - "learning_rate": 2.2234152603920102e-05, - "loss": 
0.266, + "epoch": 2.8139979096839296, + "grad_norm": 0.19484953582286835, + "learning_rate": 2.116097940583793e-05, + "loss": 0.4158, "step": 78080 }, { - "epoch": 2.75, - "learning_rate": 2.2231321388219532e-05, - "loss": 0.2543, + "epoch": 2.8141781093451543, + "grad_norm": 0.17793631553649902, + "learning_rate": 2.1158095909753555e-05, + "loss": 0.404, "step": 78085 }, { - "epoch": 2.75, - "learning_rate": 2.222849020846844e-05, - "loss": 0.2598, + "epoch": 2.814358309006379, + "grad_norm": 0.226637065410614, + "learning_rate": 2.1155212466014383e-05, + "loss": 0.3846, "step": 78090 }, { - "epoch": 2.75, - "learning_rate": 2.222565906470358e-05, - "loss": 0.2721, + "epoch": 2.814538508667604, + "grad_norm": 0.24022680521011353, + "learning_rate": 2.1152329074659707e-05, + "loss": 0.3918, "step": 78095 }, { - "epoch": 2.75, - "learning_rate": 2.2222827956961707e-05, - "loss": 0.2614, + "epoch": 2.8147187083288285, + "grad_norm": 0.23119503259658813, + "learning_rate": 2.1149445735728784e-05, + "loss": 0.3771, "step": 78100 }, { - "epoch": 2.75, - "learning_rate": 2.221999688527957e-05, - "loss": 0.2784, + "epoch": 2.814898907990053, + "grad_norm": 0.1578037291765213, + "learning_rate": 2.1146562449260933e-05, + "loss": 0.3745, "step": 78105 }, { - "epoch": 2.75, - "learning_rate": 2.2217165849693948e-05, - "loss": 0.2608, + "epoch": 2.8150791076512776, + "grad_norm": 0.22053155303001404, + "learning_rate": 2.1143679215295407e-05, + "loss": 0.4288, "step": 78110 }, { - "epoch": 2.75, - "learning_rate": 2.2214334850241598e-05, - "loss": 0.2884, + "epoch": 2.8152593073125023, + "grad_norm": 0.21788404881954193, + "learning_rate": 2.114079603387152e-05, + "loss": 0.4265, "step": 78115 }, { - "epoch": 2.75, - "learning_rate": 2.2211503886959272e-05, - "loss": 0.2564, + "epoch": 2.815439506973727, + "grad_norm": 0.21442186832427979, + "learning_rate": 2.1137912905028537e-05, + "loss": 0.362, "step": 78120 }, { - "epoch": 2.75, - "learning_rate": 2.2208672959883726e-05, - "loss": 0.2679, + "epoch": 2.8156197066349513, + "grad_norm": 0.2007633000612259, + "learning_rate": 2.113502982880573e-05, + "loss": 0.3879, "step": 78125 }, { - "epoch": 2.75, - "learning_rate": 2.2205842069051727e-05, - "loss": 0.2764, + "epoch": 2.815799906296176, + "grad_norm": 0.23273354768753052, + "learning_rate": 2.113214680524241e-05, + "loss": 0.386, "step": 78130 }, { - "epoch": 2.75, - "learning_rate": 2.2203011214500026e-05, - "loss": 0.2784, + "epoch": 2.815980105957401, + "grad_norm": 0.23062081634998322, + "learning_rate": 2.1129263834377838e-05, + "loss": 0.3886, "step": 78135 }, { - "epoch": 2.75, - "learning_rate": 2.2200180396265384e-05, - "loss": 0.2653, + "epoch": 2.8161603056186255, + "grad_norm": 0.20802180469036102, + "learning_rate": 2.1126380916251287e-05, + "loss": 0.4092, "step": 78140 }, { - "epoch": 2.75, - "learning_rate": 2.2197349614384543e-05, - "loss": 0.2874, + "epoch": 2.8163405052798502, + "grad_norm": 0.27845528721809387, + "learning_rate": 2.1123498050902055e-05, + "loss": 0.4517, "step": 78145 }, { - "epoch": 2.75, - "learning_rate": 2.2194518868894283e-05, - "loss": 0.2918, + "epoch": 2.816520704941075, + "grad_norm": 0.20084762573242188, + "learning_rate": 2.1120615238369407e-05, + "loss": 0.4151, "step": 78150 }, { - "epoch": 2.75, - "learning_rate": 2.2191688159831346e-05, - "loss": 0.2566, + "epoch": 2.8167009046022993, + "grad_norm": 0.2194603532552719, + "learning_rate": 2.1117732478692627e-05, + "loss": 0.4013, "step": 78155 }, { - "epoch": 2.75, - "learning_rate": 
2.2188857487232477e-05, - "loss": 0.2439, + "epoch": 2.816881104263524, + "grad_norm": 0.18530698120594025, + "learning_rate": 2.111484977191099e-05, + "loss": 0.37, "step": 78160 }, { - "epoch": 2.75, - "learning_rate": 2.2186026851134445e-05, - "loss": 0.2777, + "epoch": 2.8170613039247487, + "grad_norm": 0.21759265661239624, + "learning_rate": 2.1111967118063772e-05, + "loss": 0.3906, "step": 78165 }, { - "epoch": 2.75, - "learning_rate": 2.2183196251574003e-05, - "loss": 0.2715, + "epoch": 2.817241503585973, + "grad_norm": 0.18584828078746796, + "learning_rate": 2.110908451719025e-05, + "loss": 0.3837, "step": 78170 }, { - "epoch": 2.75, - "learning_rate": 2.2180365688587906e-05, - "loss": 0.2752, + "epoch": 2.8174217032471978, + "grad_norm": 0.23392297327518463, + "learning_rate": 2.11062019693297e-05, + "loss": 0.4138, "step": 78175 }, { - "epoch": 2.75, - "learning_rate": 2.217753516221289e-05, - "loss": 0.2789, + "epoch": 2.8176019029084225, + "grad_norm": 0.21533243358135223, + "learning_rate": 2.110331947452139e-05, + "loss": 0.4038, "step": 78180 }, { - "epoch": 2.75, - "learning_rate": 2.217470467248573e-05, - "loss": 0.2584, + "epoch": 2.8177821025696472, + "grad_norm": 0.18144142627716064, + "learning_rate": 2.11004370328046e-05, + "loss": 0.4086, "step": 78185 }, { - "epoch": 2.75, - "learning_rate": 2.2171874219443163e-05, - "loss": 0.2661, + "epoch": 2.817962302230872, + "grad_norm": 0.1902519166469574, + "learning_rate": 2.10975546442186e-05, + "loss": 0.366, "step": 78190 }, { - "epoch": 2.75, - "learning_rate": 2.216904380312195e-05, - "loss": 0.2743, + "epoch": 2.8181425018920967, + "grad_norm": 0.259445458650589, + "learning_rate": 2.1094672308802666e-05, + "loss": 0.448, "step": 78195 }, { - "epoch": 2.75, - "learning_rate": 2.216621342355883e-05, - "loss": 0.2647, + "epoch": 2.818322701553321, + "grad_norm": 0.2690201997756958, + "learning_rate": 2.109179002659607e-05, + "loss": 0.393, "step": 78200 }, { - "epoch": 2.75, - "learning_rate": 2.2163383080790567e-05, - "loss": 0.2367, + "epoch": 2.8185029012145457, + "grad_norm": 0.17392581701278687, + "learning_rate": 2.108890779763806e-05, + "loss": 0.4098, "step": 78205 }, { - "epoch": 2.75, - "learning_rate": 2.216055277485391e-05, - "loss": 0.2617, + "epoch": 2.8186831008757705, + "grad_norm": 0.22515340149402618, + "learning_rate": 2.108602562196794e-05, + "loss": 0.392, "step": 78210 }, { - "epoch": 2.75, - "learning_rate": 2.2157722505785595e-05, - "loss": 0.2747, + "epoch": 2.8188633005369947, + "grad_norm": 0.27040407061576843, + "learning_rate": 2.1083143499624965e-05, + "loss": 0.4095, "step": 78215 }, { - "epoch": 2.75, - "learning_rate": 2.2154892273622377e-05, - "loss": 0.2815, + "epoch": 2.8190435001982195, + "grad_norm": 0.2060219943523407, + "learning_rate": 2.108026143064839e-05, + "loss": 0.4017, "step": 78220 }, { - "epoch": 2.75, - "learning_rate": 2.2152062078401018e-05, - "loss": 0.2643, + "epoch": 2.819223699859444, + "grad_norm": 0.2325987070798874, + "learning_rate": 2.10773794150775e-05, + "loss": 0.3988, "step": 78225 }, { - "epoch": 2.75, - "learning_rate": 2.2149231920158256e-05, - "loss": 0.2734, + "epoch": 2.819403899520669, + "grad_norm": 0.2505181133747101, + "learning_rate": 2.107449745295155e-05, + "loss": 0.3817, "step": 78230 }, { - "epoch": 2.75, - "learning_rate": 2.214640179893084e-05, - "loss": 0.2741, + "epoch": 2.8195840991818937, + "grad_norm": 0.1897851973772049, + "learning_rate": 2.107161554430982e-05, + "loss": 0.4017, "step": 78235 }, { - "epoch": 2.75, - 
"learning_rate": 2.21435717147555e-05, - "loss": 0.29, + "epoch": 2.8197642988431184, + "grad_norm": 0.19039909541606903, + "learning_rate": 2.1068733689191567e-05, + "loss": 0.399, "step": 78240 }, { - "epoch": 2.75, - "learning_rate": 2.214074166766901e-05, - "loss": 0.2804, + "epoch": 2.8199444985043427, + "grad_norm": 0.20617642998695374, + "learning_rate": 2.106585188763605e-05, + "loss": 0.4053, "step": 78245 }, { - "epoch": 2.75, - "learning_rate": 2.2137911657708107e-05, - "loss": 0.3018, + "epoch": 2.8201246981655674, + "grad_norm": 0.26967623829841614, + "learning_rate": 2.1062970139682546e-05, + "loss": 0.3708, "step": 78250 }, { - "epoch": 2.75, - "learning_rate": 2.2135081684909536e-05, - "loss": 0.2525, + "epoch": 2.820304897826792, + "grad_norm": 0.1973973661661148, + "learning_rate": 2.106008844537031e-05, + "loss": 0.4058, "step": 78255 }, { - "epoch": 2.75, - "learning_rate": 2.2132251749310032e-05, - "loss": 0.2621, + "epoch": 2.8204850974880165, + "grad_norm": 0.2256660908460617, + "learning_rate": 2.1057206804738602e-05, + "loss": 0.3945, "step": 78260 }, { - "epoch": 2.75, - "learning_rate": 2.2129421850946357e-05, - "loss": 0.2863, + "epoch": 2.820665297149241, + "grad_norm": 0.2466830313205719, + "learning_rate": 2.1054325217826694e-05, + "loss": 0.4094, "step": 78265 }, { - "epoch": 2.75, - "learning_rate": 2.2126591989855246e-05, - "loss": 0.2516, + "epoch": 2.820845496810466, + "grad_norm": 0.22766727209091187, + "learning_rate": 2.1051443684673832e-05, + "loss": 0.4063, "step": 78270 }, { - "epoch": 2.75, - "learning_rate": 2.2123762166073446e-05, - "loss": 0.2547, + "epoch": 2.8210256964716907, + "grad_norm": 0.19182877242565155, + "learning_rate": 2.1048562205319295e-05, + "loss": 0.3953, "step": 78275 }, { - "epoch": 2.75, - "learning_rate": 2.212093237963769e-05, - "loss": 0.2712, + "epoch": 2.8212058961329154, + "grad_norm": 0.2543238699436188, + "learning_rate": 2.1045680779802336e-05, + "loss": 0.3709, "step": 78280 }, { - "epoch": 2.75, - "learning_rate": 2.2118102630584743e-05, - "loss": 0.2611, + "epoch": 2.82138609579414, + "grad_norm": 0.16309164464473724, + "learning_rate": 2.1042799408162194e-05, + "loss": 0.3363, "step": 78285 }, { - "epoch": 2.75, - "learning_rate": 2.2115272918951328e-05, - "loss": 0.2859, + "epoch": 2.8215662954553644, + "grad_norm": 0.1783873587846756, + "learning_rate": 2.1039918090438156e-05, + "loss": 0.3677, "step": 78290 }, { - "epoch": 2.75, - "learning_rate": 2.2112443244774188e-05, - "loss": 0.2507, + "epoch": 2.821746495116589, + "grad_norm": 0.2764756679534912, + "learning_rate": 2.1037036826669463e-05, + "loss": 0.4055, "step": 78295 }, { - "epoch": 2.75, - "learning_rate": 2.210961360809007e-05, - "loss": 0.2666, + "epoch": 2.821926694777814, + "grad_norm": 0.23792728781700134, + "learning_rate": 2.103415561689538e-05, + "loss": 0.4224, "step": 78300 }, { - "epoch": 2.75, - "learning_rate": 2.2106784008935723e-05, - "loss": 0.2525, + "epoch": 2.8221068944390386, + "grad_norm": 0.18397817015647888, + "learning_rate": 2.1031274461155164e-05, + "loss": 0.3565, "step": 78305 }, { - "epoch": 2.76, - "learning_rate": 2.210395444734788e-05, - "loss": 0.2667, + "epoch": 2.822287094100263, + "grad_norm": 0.16802199184894562, + "learning_rate": 2.1028393359488048e-05, + "loss": 0.3974, "step": 78310 }, { - "epoch": 2.76, - "learning_rate": 2.2101124923363267e-05, - "loss": 0.2904, + "epoch": 2.8224672937614876, + "grad_norm": 0.20538485050201416, + "learning_rate": 2.1025512311933324e-05, + "loss": 0.3369, "step": 78315 }, { 
- "epoch": 2.76, - "learning_rate": 2.2098295437018648e-05, - "loss": 0.2897, + "epoch": 2.8226474934227124, + "grad_norm": 0.2252337634563446, + "learning_rate": 2.102263131853022e-05, + "loss": 0.3967, "step": 78320 }, { - "epoch": 2.76, - "learning_rate": 2.2095465988350745e-05, - "loss": 0.2825, + "epoch": 2.822827693083937, + "grad_norm": 0.2148447036743164, + "learning_rate": 2.101975037931798e-05, + "loss": 0.4041, "step": 78325 }, { - "epoch": 2.76, - "learning_rate": 2.2092636577396306e-05, - "loss": 0.29, + "epoch": 2.823007892745162, + "grad_norm": 0.24140605330467224, + "learning_rate": 2.1016869494335882e-05, + "loss": 0.365, "step": 78330 }, { - "epoch": 2.76, - "learning_rate": 2.2089807204192056e-05, - "loss": 0.2703, + "epoch": 2.823188092406386, + "grad_norm": 0.21468763053417206, + "learning_rate": 2.1013988663623165e-05, + "loss": 0.3998, "step": 78335 }, { - "epoch": 2.76, - "learning_rate": 2.208697786877475e-05, - "loss": 0.2609, + "epoch": 2.823368292067611, + "grad_norm": 0.18902160227298737, + "learning_rate": 2.101110788721908e-05, + "loss": 0.3891, "step": 78340 }, { - "epoch": 2.76, - "learning_rate": 2.208414857118112e-05, - "loss": 0.2636, + "epoch": 2.8235484917288356, + "grad_norm": 0.18640626966953278, + "learning_rate": 2.1008227165162877e-05, + "loss": 0.3851, "step": 78345 }, { - "epoch": 2.76, - "learning_rate": 2.208131931144789e-05, - "loss": 0.289, + "epoch": 2.8237286913900603, + "grad_norm": 0.21738696098327637, + "learning_rate": 2.1005346497493807e-05, + "loss": 0.4259, "step": 78350 }, { - "epoch": 2.76, - "learning_rate": 2.2078490089611808e-05, - "loss": 0.2759, + "epoch": 2.8239088910512846, + "grad_norm": 0.18691019713878632, + "learning_rate": 2.1002465884251116e-05, + "loss": 0.3784, "step": 78355 }, { - "epoch": 2.76, - "learning_rate": 2.2075660905709606e-05, - "loss": 0.2688, + "epoch": 2.8240890907125094, + "grad_norm": 0.17924901843070984, + "learning_rate": 2.0999585325474057e-05, + "loss": 0.3571, "step": 78360 }, { - "epoch": 2.76, - "learning_rate": 2.2072831759778025e-05, - "loss": 0.2762, + "epoch": 2.824269290373734, + "grad_norm": 0.20223569869995117, + "learning_rate": 2.0996704821201867e-05, + "loss": 0.3781, "step": 78365 }, { - "epoch": 2.76, - "learning_rate": 2.2070002651853797e-05, - "loss": 0.2652, + "epoch": 2.824449490034959, + "grad_norm": 0.2247609794139862, + "learning_rate": 2.09938243714738e-05, + "loss": 0.3776, "step": 78370 }, { - "epoch": 2.76, - "learning_rate": 2.206717358197364e-05, - "loss": 0.2634, + "epoch": 2.8246296896961836, + "grad_norm": 0.2532680034637451, + "learning_rate": 2.09909439763291e-05, + "loss": 0.4274, "step": 78375 }, { - "epoch": 2.76, - "learning_rate": 2.206434455017431e-05, - "loss": 0.2625, + "epoch": 2.824809889357408, + "grad_norm": 0.24293555319309235, + "learning_rate": 2.0988063635807022e-05, + "loss": 0.3768, "step": 78380 }, { - "epoch": 2.76, - "learning_rate": 2.206151555649253e-05, - "loss": 0.2498, + "epoch": 2.8249900890186326, + "grad_norm": 0.17587457597255707, + "learning_rate": 2.09851833499468e-05, + "loss": 0.3666, "step": 78385 }, { - "epoch": 2.76, - "learning_rate": 2.205868660096504e-05, - "loss": 0.2632, + "epoch": 2.8251702886798573, + "grad_norm": 0.20782296359539032, + "learning_rate": 2.0982303118787662e-05, + "loss": 0.4201, "step": 78390 }, { - "epoch": 2.76, - "learning_rate": 2.2055857683628554e-05, - "loss": 0.263, + "epoch": 2.825350488341082, + "grad_norm": 0.200978621840477, + "learning_rate": 2.0979422942368882e-05, + "loss": 0.3939, "step": 
78395 }, { - "epoch": 2.76, - "learning_rate": 2.2053028804519825e-05, - "loss": 0.2532, + "epoch": 2.8255306880023063, + "grad_norm": 0.18631309270858765, + "learning_rate": 2.097654282072968e-05, + "loss": 0.4075, "step": 78400 }, { - "epoch": 2.76, - "learning_rate": 2.2050199963675573e-05, - "loss": 0.2502, + "epoch": 2.825710887663531, + "grad_norm": 0.22163410484790802, + "learning_rate": 2.09736627539093e-05, + "loss": 0.3522, "step": 78405 }, { - "epoch": 2.76, - "learning_rate": 2.2047371161132523e-05, - "loss": 0.2625, + "epoch": 2.825891087324756, + "grad_norm": 0.18656250834465027, + "learning_rate": 2.0970782741946987e-05, + "loss": 0.4021, "step": 78410 }, { - "epoch": 2.76, - "learning_rate": 2.204454239692742e-05, - "loss": 0.2671, + "epoch": 2.8260712869859805, + "grad_norm": 0.23782800137996674, + "learning_rate": 2.096790278488197e-05, + "loss": 0.3833, "step": 78415 }, { - "epoch": 2.76, - "learning_rate": 2.204171367109699e-05, - "loss": 0.258, + "epoch": 2.8262514866472053, + "grad_norm": 0.21709121763706207, + "learning_rate": 2.0965022882753516e-05, + "loss": 0.3792, "step": 78420 }, { - "epoch": 2.76, - "learning_rate": 2.203888498367796e-05, - "loss": 0.2777, + "epoch": 2.82643168630843, + "grad_norm": 0.22497032582759857, + "learning_rate": 2.0962143035600833e-05, + "loss": 0.4084, "step": 78425 }, { - "epoch": 2.76, - "learning_rate": 2.2036056334707038e-05, - "loss": 0.269, + "epoch": 2.8266118859696543, + "grad_norm": 0.2216866910457611, + "learning_rate": 2.095926324346317e-05, + "loss": 0.406, "step": 78430 }, { - "epoch": 2.76, - "learning_rate": 2.2033227724220988e-05, - "loss": 0.2586, + "epoch": 2.826792085630879, + "grad_norm": 0.2360861450433731, + "learning_rate": 2.0956383506379764e-05, + "loss": 0.4031, "step": 78435 }, { - "epoch": 2.76, - "learning_rate": 2.2030399152256516e-05, - "loss": 0.2547, + "epoch": 2.8269722852921038, + "grad_norm": 0.2680504620075226, + "learning_rate": 2.0953503824389853e-05, + "loss": 0.369, "step": 78440 }, { - "epoch": 2.76, - "learning_rate": 2.202757061885036e-05, - "loss": 0.2568, + "epoch": 2.827152484953328, + "grad_norm": 0.21421213448047638, + "learning_rate": 2.0950624197532662e-05, + "loss": 0.3846, "step": 78445 }, { - "epoch": 2.76, - "learning_rate": 2.2024742124039228e-05, - "loss": 0.2524, + "epoch": 2.827332684614553, + "grad_norm": 0.1895037740468979, + "learning_rate": 2.0947744625847437e-05, + "loss": 0.4286, "step": 78450 }, { - "epoch": 2.76, - "learning_rate": 2.202191366785987e-05, - "loss": 0.2505, + "epoch": 2.8275128842757775, + "grad_norm": 0.20965850353240967, + "learning_rate": 2.0944865109373405e-05, + "loss": 0.408, "step": 78455 }, { - "epoch": 2.76, - "learning_rate": 2.2019085250349e-05, - "loss": 0.2746, + "epoch": 2.8276930839370022, + "grad_norm": 0.21087834239006042, + "learning_rate": 2.0941985648149804e-05, + "loss": 0.3806, "step": 78460 }, { - "epoch": 2.76, - "learning_rate": 2.201625687154334e-05, - "loss": 0.2663, + "epoch": 2.827873283598227, + "grad_norm": 0.19764740765094757, + "learning_rate": 2.093910624221586e-05, + "loss": 0.3897, "step": 78465 }, { - "epoch": 2.76, - "learning_rate": 2.201342853147961e-05, - "loss": 0.2835, + "epoch": 2.8280534832594517, + "grad_norm": 0.25473564863204956, + "learning_rate": 2.0936226891610806e-05, + "loss": 0.3687, "step": 78470 }, { - "epoch": 2.76, - "learning_rate": 2.2010600230194555e-05, - "loss": 0.2738, + "epoch": 2.828233682920676, + "grad_norm": 0.2085646688938141, + "learning_rate": 2.0933347596373876e-05, + "loss": 
0.4079, "step": 78475 }, { - "epoch": 2.76, - "learning_rate": 2.2007771967724887e-05, - "loss": 0.2533, + "epoch": 2.8284138825819007, + "grad_norm": 0.22956323623657227, + "learning_rate": 2.0930468356544294e-05, + "loss": 0.3709, "step": 78480 }, { - "epoch": 2.76, - "learning_rate": 2.200494374410732e-05, - "loss": 0.257, + "epoch": 2.8285940822431255, + "grad_norm": 0.1897376924753189, + "learning_rate": 2.09275891721613e-05, + "loss": 0.3705, "step": 78485 }, { - "epoch": 2.76, - "learning_rate": 2.2002115559378585e-05, - "loss": 0.2756, + "epoch": 2.8287742819043498, + "grad_norm": 0.25354906916618347, + "learning_rate": 2.0924710043264116e-05, + "loss": 0.4236, "step": 78490 }, { - "epoch": 2.76, - "learning_rate": 2.1999287413575403e-05, - "loss": 0.2491, + "epoch": 2.8289544815655745, + "grad_norm": 0.20143790543079376, + "learning_rate": 2.0921830969891955e-05, + "loss": 0.3469, "step": 78495 }, { - "epoch": 2.76, - "learning_rate": 2.1996459306734503e-05, - "loss": 0.2764, + "epoch": 2.8291346812267992, + "grad_norm": 0.22372838854789734, + "learning_rate": 2.0918951952084077e-05, + "loss": 0.3999, "step": 78500 }, { - "epoch": 2.76, - "eval_loss": 0.26042640209198, - "eval_runtime": 10.5472, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 2.8291346812267992, + "eval_loss": 0.43108808994293213, + "eval_runtime": 3.5275, + "eval_samples_per_second": 28.348, + "eval_steps_per_second": 7.087, "step": 78500 }, { - "epoch": 2.76, - "learning_rate": 2.1993631238892602e-05, - "loss": 0.2786, + "epoch": 2.829314880888024, + "grad_norm": 0.21421097218990326, + "learning_rate": 2.0916072989879678e-05, + "loss": 0.3838, "step": 78505 }, { - "epoch": 2.76, - "learning_rate": 2.19908032100864e-05, - "loss": 0.2912, + "epoch": 2.8294950805492487, + "grad_norm": 0.23731063306331635, + "learning_rate": 2.091319408331799e-05, + "loss": 0.3827, "step": 78510 }, { - "epoch": 2.76, - "learning_rate": 2.1987975220352655e-05, - "loss": 0.2569, + "epoch": 2.8296752802104734, + "grad_norm": 0.1949160099029541, + "learning_rate": 2.0910315232438248e-05, + "loss": 0.3793, "step": 78515 }, { - "epoch": 2.76, - "learning_rate": 2.1985147269728056e-05, - "loss": 0.2628, + "epoch": 2.8298554798716977, + "grad_norm": 0.27850422263145447, + "learning_rate": 2.0907436437279662e-05, + "loss": 0.3917, "step": 78520 }, { - "epoch": 2.76, - "learning_rate": 2.198231935824933e-05, - "loss": 0.2807, + "epoch": 2.8300356795329225, + "grad_norm": 0.19794079661369324, + "learning_rate": 2.090455769788147e-05, + "loss": 0.4164, "step": 78525 }, { - "epoch": 2.76, - "learning_rate": 2.1979491485953207e-05, - "loss": 0.2718, + "epoch": 2.830215879194147, + "grad_norm": 0.1963154375553131, + "learning_rate": 2.0901679014282882e-05, + "loss": 0.3783, "step": 78530 }, { - "epoch": 2.76, - "learning_rate": 2.19766636528764e-05, - "loss": 0.2648, + "epoch": 2.8303960788553715, + "grad_norm": 0.24002398550510406, + "learning_rate": 2.0898800386523123e-05, + "loss": 0.3993, "step": 78535 }, { - "epoch": 2.76, - "learning_rate": 2.1973835859055616e-05, - "loss": 0.2811, + "epoch": 2.830576278516596, + "grad_norm": 0.20864902436733246, + "learning_rate": 2.0895921814641416e-05, + "loss": 0.3889, "step": 78540 }, { - "epoch": 2.76, - "learning_rate": 2.1971008104527574e-05, - "loss": 0.2712, + "epoch": 2.830756478177821, + "grad_norm": 0.2638551890850067, + "learning_rate": 2.0893043298676986e-05, + "loss": 0.3931, "step": 78545 }, { - "epoch": 2.76, - "learning_rate": 2.1968180389329e-05, - "loss": 
0.2597, + "epoch": 2.8309366778390457, + "grad_norm": 0.20083938539028168, + "learning_rate": 2.0890164838669036e-05, + "loss": 0.3713, "step": 78550 }, { - "epoch": 2.76, - "learning_rate": 2.196535271349661e-05, - "loss": 0.2871, + "epoch": 2.8311168775002704, + "grad_norm": 0.19976069033145905, + "learning_rate": 2.08872864346568e-05, + "loss": 0.4218, "step": 78555 }, { - "epoch": 2.76, - "learning_rate": 2.1962525077067117e-05, - "loss": 0.2908, + "epoch": 2.831297077161495, + "grad_norm": 0.19148698449134827, + "learning_rate": 2.0884408086679485e-05, + "loss": 0.3993, "step": 78560 }, { - "epoch": 2.76, - "learning_rate": 2.195969748007722e-05, - "loss": 0.2651, + "epoch": 2.8314772768227194, + "grad_norm": 0.20389747619628906, + "learning_rate": 2.088152979477632e-05, + "loss": 0.3613, "step": 78565 }, { - "epoch": 2.76, - "learning_rate": 2.195686992256366e-05, - "loss": 0.2564, + "epoch": 2.831657476483944, + "grad_norm": 0.16990236937999725, + "learning_rate": 2.087865155898652e-05, + "loss": 0.3767, "step": 78570 }, { - "epoch": 2.76, - "learning_rate": 2.1954042404563134e-05, - "loss": 0.2597, + "epoch": 2.831837676145169, + "grad_norm": 0.195389986038208, + "learning_rate": 2.0875773379349273e-05, + "loss": 0.3924, "step": 78575 }, { - "epoch": 2.76, - "learning_rate": 2.1951214926112366e-05, - "loss": 0.2854, + "epoch": 2.8320178758063936, + "grad_norm": 0.19705142080783844, + "learning_rate": 2.087289525590383e-05, + "loss": 0.4067, "step": 78580 }, { - "epoch": 2.76, - "learning_rate": 2.1948387487248052e-05, - "loss": 0.2616, + "epoch": 2.832198075467618, + "grad_norm": 0.18426579236984253, + "learning_rate": 2.0870017188689394e-05, + "loss": 0.4144, "step": 78585 }, { - "epoch": 2.77, - "learning_rate": 2.1945560088006926e-05, - "loss": 0.2823, + "epoch": 2.8323782751288427, + "grad_norm": 0.23681855201721191, + "learning_rate": 2.086713917774516e-05, + "loss": 0.4152, "step": 78590 }, { - "epoch": 2.77, - "learning_rate": 2.1942732728425685e-05, - "loss": 0.261, + "epoch": 2.8325584747900674, + "grad_norm": 0.18488551676273346, + "learning_rate": 2.0864261223110372e-05, + "loss": 0.3904, "step": 78595 }, { - "epoch": 2.77, - "learning_rate": 2.1939905408541046e-05, - "loss": 0.2808, + "epoch": 2.832738674451292, + "grad_norm": 0.21363814175128937, + "learning_rate": 2.0861383324824206e-05, + "loss": 0.4118, "step": 78600 }, { - "epoch": 2.77, - "learning_rate": 2.193707812838971e-05, - "loss": 0.2677, + "epoch": 2.832918874112517, + "grad_norm": 0.24424757063388824, + "learning_rate": 2.0858505482925905e-05, + "loss": 0.3585, "step": 78605 }, { - "epoch": 2.77, - "learning_rate": 2.1934250888008408e-05, - "loss": 0.2687, + "epoch": 2.833099073773741, + "grad_norm": 0.20323701202869415, + "learning_rate": 2.0855627697454662e-05, + "loss": 0.3676, "step": 78610 }, { - "epoch": 2.77, - "learning_rate": 2.1931423687433835e-05, - "loss": 0.2635, + "epoch": 2.833279273434966, + "grad_norm": 0.1850632280111313, + "learning_rate": 2.0852749968449684e-05, + "loss": 0.3832, "step": 78615 }, { - "epoch": 2.77, - "learning_rate": 2.1928596526702706e-05, - "loss": 0.2494, + "epoch": 2.8334594730961906, + "grad_norm": 0.19925178587436676, + "learning_rate": 2.084987229595019e-05, + "loss": 0.3802, "step": 78620 }, { - "epoch": 2.77, - "learning_rate": 2.1925769405851715e-05, - "loss": 0.2584, + "epoch": 2.8336396727574154, + "grad_norm": 0.21943655610084534, + "learning_rate": 2.084699467999538e-05, + "loss": 0.3908, "step": 78625 }, { - "epoch": 2.77, - "learning_rate": 
2.1922942324917588e-05, - "loss": 0.2525, + "epoch": 2.8338198724186396, + "grad_norm": 0.2018982470035553, + "learning_rate": 2.0844117120624463e-05, + "loss": 0.3912, "step": 78630 }, { - "epoch": 2.77, - "learning_rate": 2.1920115283937033e-05, - "loss": 0.2502, + "epoch": 2.8340000720798644, + "grad_norm": 0.20863167941570282, + "learning_rate": 2.0841239617876647e-05, + "loss": 0.4028, "step": 78635 }, { - "epoch": 2.77, - "learning_rate": 2.191728828294674e-05, - "loss": 0.2859, + "epoch": 2.834180271741089, + "grad_norm": 0.19766663014888763, + "learning_rate": 2.0838362171791133e-05, + "loss": 0.3733, "step": 78640 }, { - "epoch": 2.77, - "learning_rate": 2.1914461321983436e-05, - "loss": 0.2472, + "epoch": 2.834360471402314, + "grad_norm": 0.24761301279067993, + "learning_rate": 2.083548478240713e-05, + "loss": 0.3987, "step": 78645 }, { - "epoch": 2.77, - "learning_rate": 2.191163440108382e-05, - "loss": 0.2693, + "epoch": 2.8345406710635386, + "grad_norm": 0.22107259929180145, + "learning_rate": 2.0832607449763843e-05, + "loss": 0.4007, "step": 78650 }, { - "epoch": 2.77, - "learning_rate": 2.190880752028459e-05, - "loss": 0.2925, + "epoch": 2.8347208707247633, + "grad_norm": 0.2298266589641571, + "learning_rate": 2.082973017390047e-05, + "loss": 0.3774, "step": 78655 }, { - "epoch": 2.77, - "learning_rate": 2.190598067962245e-05, - "loss": 0.2807, + "epoch": 2.8349010703859876, + "grad_norm": 0.2100599855184555, + "learning_rate": 2.0826852954856217e-05, + "loss": 0.3824, "step": 78660 }, { - "epoch": 2.77, - "learning_rate": 2.1903153879134126e-05, - "loss": 0.277, + "epoch": 2.8350812700472123, + "grad_norm": 0.19169162213802338, + "learning_rate": 2.0823975792670292e-05, + "loss": 0.3735, "step": 78665 }, { - "epoch": 2.77, - "learning_rate": 2.1900327118856305e-05, - "loss": 0.2835, + "epoch": 2.835261469708437, + "grad_norm": 0.2121647298336029, + "learning_rate": 2.0821098687381874e-05, + "loss": 0.4001, "step": 78670 }, { - "epoch": 2.77, - "learning_rate": 2.1897500398825695e-05, - "loss": 0.2835, + "epoch": 2.8354416693696614, + "grad_norm": 0.2263890504837036, + "learning_rate": 2.081822163903019e-05, + "loss": 0.4074, "step": 78675 }, { - "epoch": 2.77, - "learning_rate": 2.189467371907899e-05, - "loss": 0.2598, + "epoch": 2.835621869030886, + "grad_norm": 0.2616461515426636, + "learning_rate": 2.0815344647654413e-05, + "loss": 0.4573, "step": 78680 }, { - "epoch": 2.77, - "learning_rate": 2.1891847079652903e-05, - "loss": 0.2599, + "epoch": 2.835802068692111, + "grad_norm": 0.2435300052165985, + "learning_rate": 2.081246771329377e-05, + "loss": 0.3558, "step": 78685 }, { - "epoch": 2.77, - "learning_rate": 2.1889020480584137e-05, - "loss": 0.2563, + "epoch": 2.8359822683533356, + "grad_norm": 0.20610463619232178, + "learning_rate": 2.080959083598744e-05, + "loss": 0.4027, "step": 78690 }, { - "epoch": 2.77, - "learning_rate": 2.1886193921909394e-05, - "loss": 0.2537, + "epoch": 2.8361624680145603, + "grad_norm": 0.23141418397426605, + "learning_rate": 2.0806714015774613e-05, + "loss": 0.3734, "step": 78695 }, { - "epoch": 2.77, - "learning_rate": 2.1883367403665356e-05, - "loss": 0.2813, + "epoch": 2.836342667675785, + "grad_norm": 0.1831231713294983, + "learning_rate": 2.080383725269451e-05, + "loss": 0.4006, "step": 78700 }, { - "epoch": 2.77, - "learning_rate": 2.188054092588875e-05, - "loss": 0.2766, + "epoch": 2.8365228673370093, + "grad_norm": 0.23543448746204376, + "learning_rate": 2.0800960546786293e-05, + "loss": 0.4156, "step": 78705 }, { - "epoch": 2.77, 
- "learning_rate": 2.1877714488616262e-05, - "loss": 0.2852, + "epoch": 2.836703066998234, + "grad_norm": 0.1677248328924179, + "learning_rate": 2.0798083898089193e-05, + "loss": 0.3878, "step": 78710 }, { - "epoch": 2.77, - "learning_rate": 2.1874888091884593e-05, - "loss": 0.2666, + "epoch": 2.836883266659459, + "grad_norm": 0.20878414809703827, + "learning_rate": 2.0795207306642383e-05, + "loss": 0.3831, "step": 78715 }, { - "epoch": 2.77, - "learning_rate": 2.1872061735730436e-05, - "loss": 0.2682, + "epoch": 2.837063466320683, + "grad_norm": 0.24195106327533722, + "learning_rate": 2.0792330772485055e-05, + "loss": 0.4067, "step": 78720 }, { - "epoch": 2.77, - "learning_rate": 2.186923542019051e-05, - "loss": 0.2746, + "epoch": 2.837243665981908, + "grad_norm": 0.19112065434455872, + "learning_rate": 2.078945429565641e-05, + "loss": 0.4271, "step": 78725 }, { - "epoch": 2.77, - "learning_rate": 2.186640914530149e-05, - "loss": 0.2646, + "epoch": 2.8374238656431325, + "grad_norm": 0.2498737871646881, + "learning_rate": 2.0786577876195633e-05, + "loss": 0.3881, "step": 78730 }, { - "epoch": 2.77, - "learning_rate": 2.1863582911100083e-05, - "loss": 0.2447, + "epoch": 2.8376040653043573, + "grad_norm": 0.1864451766014099, + "learning_rate": 2.0783701514141916e-05, + "loss": 0.3809, "step": 78735 }, { - "epoch": 2.77, - "learning_rate": 2.1860756717622978e-05, - "loss": 0.2704, + "epoch": 2.837784264965582, + "grad_norm": 0.24682430922985077, + "learning_rate": 2.0780825209534448e-05, + "loss": 0.3981, "step": 78740 }, { - "epoch": 2.77, - "learning_rate": 2.185793056490689e-05, - "loss": 0.27, + "epoch": 2.8379644646268067, + "grad_norm": 0.2143888771533966, + "learning_rate": 2.077794896241242e-05, + "loss": 0.3755, "step": 78745 }, { - "epoch": 2.77, - "learning_rate": 2.1855104452988505e-05, - "loss": 0.2776, + "epoch": 2.838144664288031, + "grad_norm": 0.2417534738779068, + "learning_rate": 2.0775072772815023e-05, + "loss": 0.4129, "step": 78750 }, { - "epoch": 2.77, - "learning_rate": 2.18522783819045e-05, - "loss": 0.2765, + "epoch": 2.8383248639492558, + "grad_norm": 0.2023160457611084, + "learning_rate": 2.0772196640781444e-05, + "loss": 0.3503, "step": 78755 }, { - "epoch": 2.77, - "learning_rate": 2.18494523516916e-05, - "loss": 0.2562, + "epoch": 2.8385050636104805, + "grad_norm": 0.26836487650871277, + "learning_rate": 2.076932056635086e-05, + "loss": 0.433, "step": 78760 }, { - "epoch": 2.77, - "learning_rate": 2.1846626362386478e-05, - "loss": 0.3022, + "epoch": 2.838685263271705, + "grad_norm": 0.2009926289319992, + "learning_rate": 2.076644454956247e-05, + "loss": 0.452, "step": 78765 }, { - "epoch": 2.77, - "learning_rate": 2.184380041402584e-05, - "loss": 0.2736, + "epoch": 2.8388654629329295, + "grad_norm": 0.20589284598827362, + "learning_rate": 2.0763568590455458e-05, + "loss": 0.3995, "step": 78770 }, { - "epoch": 2.77, - "learning_rate": 2.1840974506646363e-05, - "loss": 0.2568, + "epoch": 2.8390456625941543, + "grad_norm": 0.23348768055438995, + "learning_rate": 2.0760692689068988e-05, + "loss": 0.3783, "step": 78775 }, { - "epoch": 2.77, - "learning_rate": 2.1838148640284763e-05, - "loss": 0.2483, + "epoch": 2.839225862255379, + "grad_norm": 0.2545948028564453, + "learning_rate": 2.0757816845442274e-05, + "loss": 0.423, "step": 78780 }, { - "epoch": 2.77, - "learning_rate": 2.1835322814977718e-05, - "loss": 0.2659, + "epoch": 2.8394060619166037, + "grad_norm": 0.19799551367759705, + "learning_rate": 2.075494105961447e-05, + "loss": 0.417, "step": 78785 }, { - 
"epoch": 2.77, - "learning_rate": 2.1832497030761916e-05, - "loss": 0.2842, + "epoch": 2.8395862615778285, + "grad_norm": 0.22839927673339844, + "learning_rate": 2.0752065331624788e-05, + "loss": 0.3828, "step": 78790 }, { - "epoch": 2.77, - "learning_rate": 2.182967128767405e-05, - "loss": 0.2506, + "epoch": 2.8397664612390527, + "grad_norm": 0.1921931505203247, + "learning_rate": 2.0749189661512387e-05, + "loss": 0.4218, "step": 78795 }, { - "epoch": 2.77, - "learning_rate": 2.1826845585750817e-05, - "loss": 0.2736, + "epoch": 2.8399466609002775, + "grad_norm": 0.20182907581329346, + "learning_rate": 2.074631404931645e-05, + "loss": 0.4186, "step": 78800 }, { - "epoch": 2.77, - "learning_rate": 2.1824019925028905e-05, - "loss": 0.2783, + "epoch": 2.840126860561502, + "grad_norm": 0.20706386864185333, + "learning_rate": 2.0743438495076164e-05, + "loss": 0.394, "step": 78805 }, { - "epoch": 2.77, - "learning_rate": 2.1821194305545002e-05, - "loss": 0.2795, + "epoch": 2.840307060222727, + "grad_norm": 0.23786523938179016, + "learning_rate": 2.0740562998830706e-05, + "loss": 0.3553, "step": 78810 }, { - "epoch": 2.77, - "learning_rate": 2.1818368727335784e-05, - "loss": 0.2575, + "epoch": 2.8404872598839512, + "grad_norm": 0.1973177045583725, + "learning_rate": 2.073768756061925e-05, + "loss": 0.3815, "step": 78815 }, { - "epoch": 2.77, - "learning_rate": 2.1815543190437963e-05, - "loss": 0.2811, + "epoch": 2.840667459545176, + "grad_norm": 0.21319210529327393, + "learning_rate": 2.0734812180480976e-05, + "loss": 0.3736, "step": 78820 }, { - "epoch": 2.77, - "learning_rate": 2.1812717694888207e-05, - "loss": 0.2559, + "epoch": 2.8408476592064007, + "grad_norm": 0.21359285712242126, + "learning_rate": 2.0731936858455057e-05, + "loss": 0.358, "step": 78825 }, { - "epoch": 2.77, - "learning_rate": 2.1809892240723224e-05, - "loss": 0.2639, + "epoch": 2.8410278588676254, + "grad_norm": 0.24287675321102142, + "learning_rate": 2.0729061594580677e-05, + "loss": 0.4399, "step": 78830 }, { - "epoch": 2.77, - "learning_rate": 2.180706682797967e-05, - "loss": 0.2542, + "epoch": 2.84120805852885, + "grad_norm": 0.2060524970293045, + "learning_rate": 2.0726186388897007e-05, + "loss": 0.3973, "step": 78835 }, { - "epoch": 2.77, - "learning_rate": 2.1804241456694262e-05, - "loss": 0.2483, + "epoch": 2.8413882581900745, + "grad_norm": 0.21056078374385834, + "learning_rate": 2.072331124144321e-05, + "loss": 0.3884, "step": 78840 }, { - "epoch": 2.77, - "learning_rate": 2.1801416126903667e-05, - "loss": 0.2842, + "epoch": 2.841568457851299, + "grad_norm": 0.22964583337306976, + "learning_rate": 2.0720436152258483e-05, + "loss": 0.3679, "step": 78845 }, { - "epoch": 2.77, - "learning_rate": 2.1798590838644584e-05, - "loss": 0.2807, + "epoch": 2.841748657512524, + "grad_norm": 0.1646958887577057, + "learning_rate": 2.0717561121381983e-05, + "loss": 0.3748, "step": 78850 }, { - "epoch": 2.77, - "learning_rate": 2.1795765591953678e-05, - "loss": 0.2798, + "epoch": 2.8419288571737487, + "grad_norm": 0.2664015591144562, + "learning_rate": 2.0714686148852873e-05, + "loss": 0.4059, "step": 78855 }, { - "epoch": 2.77, - "learning_rate": 2.179294038686765e-05, - "loss": 0.2552, + "epoch": 2.842109056834973, + "grad_norm": 0.22343656420707703, + "learning_rate": 2.0711811234710347e-05, + "loss": 0.4021, "step": 78860 }, { - "epoch": 2.77, - "learning_rate": 2.179011522342318e-05, - "loss": 0.268, + "epoch": 2.8422892564961977, + "grad_norm": 0.2661823332309723, + "learning_rate": 2.0708936378993545e-05, + "loss": 0.3696, 
"step": 78865 }, { - "epoch": 2.77, - "learning_rate": 2.1787290101656937e-05, - "loss": 0.2761, + "epoch": 2.8424694561574224, + "grad_norm": 0.25233519077301025, + "learning_rate": 2.0706061581741667e-05, + "loss": 0.4061, "step": 78870 }, { - "epoch": 2.78, - "learning_rate": 2.1784465021605627e-05, - "loss": 0.276, + "epoch": 2.842649655818647, + "grad_norm": 0.19629564881324768, + "learning_rate": 2.0703186842993878e-05, + "loss": 0.4031, "step": 78875 }, { - "epoch": 2.78, - "learning_rate": 2.1781639983305917e-05, - "loss": 0.2701, + "epoch": 2.842829855479872, + "grad_norm": 0.17642425000667572, + "learning_rate": 2.0700312162789316e-05, + "loss": 0.4057, "step": 78880 }, { - "epoch": 2.78, - "learning_rate": 2.1778814986794492e-05, - "loss": 0.2942, + "epoch": 2.843010055141096, + "grad_norm": 0.16491912305355072, + "learning_rate": 2.0697437541167182e-05, + "loss": 0.3817, "step": 78885 }, { - "epoch": 2.78, - "learning_rate": 2.177599003210802e-05, - "loss": 0.275, + "epoch": 2.843190254802321, + "grad_norm": 0.18968559801578522, + "learning_rate": 2.0694562978166617e-05, + "loss": 0.4262, "step": 78890 }, { - "epoch": 2.78, - "learning_rate": 2.1773165119283206e-05, - "loss": 0.2897, + "epoch": 2.8433704544635456, + "grad_norm": 0.23654945194721222, + "learning_rate": 2.0691688473826813e-05, + "loss": 0.4081, "step": 78895 }, { - "epoch": 2.78, - "learning_rate": 2.1770340248356714e-05, - "loss": 0.285, + "epoch": 2.8435506541247704, + "grad_norm": 0.21302512288093567, + "learning_rate": 2.068881402818691e-05, + "loss": 0.3762, "step": 78900 }, { - "epoch": 2.78, - "learning_rate": 2.176751541936522e-05, - "loss": 0.2668, + "epoch": 2.8437308537859947, + "grad_norm": 0.19468043744564056, + "learning_rate": 2.068593964128608e-05, + "loss": 0.4304, "step": 78905 }, { - "epoch": 2.78, - "learning_rate": 2.1764690632345407e-05, - "loss": 0.2558, + "epoch": 2.8439110534472194, + "grad_norm": 0.20956778526306152, + "learning_rate": 2.0683065313163493e-05, + "loss": 0.3958, "step": 78910 }, { - "epoch": 2.78, - "learning_rate": 2.1761865887333962e-05, - "loss": 0.2576, + "epoch": 2.844091253108444, + "grad_norm": 0.2271938920021057, + "learning_rate": 2.0680191043858303e-05, + "loss": 0.4014, "step": 78915 }, { - "epoch": 2.78, - "learning_rate": 2.1759041184367558e-05, - "loss": 0.29, + "epoch": 2.844271452769669, + "grad_norm": 0.2164512425661087, + "learning_rate": 2.0677316833409672e-05, + "loss": 0.4368, "step": 78920 }, { - "epoch": 2.78, - "learning_rate": 2.175621652348286e-05, - "loss": 0.2788, + "epoch": 2.8444516524308936, + "grad_norm": 0.2096407115459442, + "learning_rate": 2.0674442681856764e-05, + "loss": 0.3887, "step": 78925 }, { - "epoch": 2.78, - "learning_rate": 2.175339190471655e-05, - "loss": 0.2555, + "epoch": 2.8446318520921183, + "grad_norm": 0.199036106467247, + "learning_rate": 2.0671568589238734e-05, + "loss": 0.389, "step": 78930 }, { - "epoch": 2.78, - "learning_rate": 2.1750567328105308e-05, - "loss": 0.2706, + "epoch": 2.8448120517533426, + "grad_norm": 0.20308394730091095, + "learning_rate": 2.0668694555594746e-05, + "loss": 0.3894, "step": 78935 }, { - "epoch": 2.78, - "learning_rate": 2.1747742793685816e-05, - "loss": 0.2622, + "epoch": 2.8449922514145674, + "grad_norm": 0.2252362221479416, + "learning_rate": 2.0665820580963957e-05, + "loss": 0.3844, "step": 78940 }, { - "epoch": 2.78, - "learning_rate": 2.174491830149474e-05, - "loss": 0.2678, + "epoch": 2.845172451075792, + "grad_norm": 0.18750686943531036, + "learning_rate": 2.066294666538552e-05, 
+ "loss": 0.4232, "step": 78945 }, { - "epoch": 2.78, - "learning_rate": 2.174209385156874e-05, - "loss": 0.2656, + "epoch": 2.8453526507370164, + "grad_norm": 0.20952045917510986, + "learning_rate": 2.06600728088986e-05, + "loss": 0.4092, "step": 78950 }, { - "epoch": 2.78, - "learning_rate": 2.1739269443944515e-05, - "loss": 0.2589, + "epoch": 2.845532850398241, + "grad_norm": 0.24399419128894806, + "learning_rate": 2.0657199011542352e-05, + "loss": 0.4103, "step": 78955 }, { - "epoch": 2.78, - "learning_rate": 2.1736445078658726e-05, - "loss": 0.2621, + "epoch": 2.845713050059466, + "grad_norm": 0.20559486746788025, + "learning_rate": 2.065432527335591e-05, + "loss": 0.3815, "step": 78960 }, { - "epoch": 2.78, - "learning_rate": 2.1733620755748054e-05, - "loss": 0.2753, + "epoch": 2.8458932497206906, + "grad_norm": 0.24112515151500702, + "learning_rate": 2.0651451594378462e-05, + "loss": 0.4345, "step": 78965 }, { - "epoch": 2.78, - "learning_rate": 2.1730796475249148e-05, - "loss": 0.2798, + "epoch": 2.8460734493819153, + "grad_norm": 0.24008318781852722, + "learning_rate": 2.064857797464913e-05, + "loss": 0.4203, "step": 78970 }, { - "epoch": 2.78, - "learning_rate": 2.1727972237198708e-05, - "loss": 0.274, + "epoch": 2.84625364904314, + "grad_norm": 0.1976160705089569, + "learning_rate": 2.0645704414207096e-05, + "loss": 0.3979, "step": 78975 }, { - "epoch": 2.78, - "learning_rate": 2.1725148041633388e-05, - "loss": 0.2559, + "epoch": 2.8464338487043643, + "grad_norm": 0.21798163652420044, + "learning_rate": 2.064283091309149e-05, + "loss": 0.3961, "step": 78980 }, { - "epoch": 2.78, - "learning_rate": 2.1722323888589857e-05, - "loss": 0.279, + "epoch": 2.846614048365589, + "grad_norm": 0.30236032605171204, + "learning_rate": 2.0639957471341463e-05, + "loss": 0.3842, "step": 78985 }, { - "epoch": 2.78, - "learning_rate": 2.1719499778104796e-05, - "loss": 0.2681, + "epoch": 2.846794248026814, + "grad_norm": 0.2003178596496582, + "learning_rate": 2.0637084088996175e-05, + "loss": 0.4055, "step": 78990 }, { - "epoch": 2.78, - "learning_rate": 2.171667571021487e-05, - "loss": 0.2853, + "epoch": 2.846974447688038, + "grad_norm": 0.18531939387321472, + "learning_rate": 2.0634210766094775e-05, + "loss": 0.3818, "step": 78995 }, { - "epoch": 2.78, - "learning_rate": 2.171385168495675e-05, - "loss": 0.258, + "epoch": 2.847154647349263, + "grad_norm": 0.16672350466251373, + "learning_rate": 2.06313375026764e-05, + "loss": 0.3738, "step": 79000 }, { - "epoch": 2.78, - "eval_loss": 0.260810911655426, - "eval_runtime": 10.5622, - "eval_samples_per_second": 9.468, - "eval_steps_per_second": 9.468, + "epoch": 2.847154647349263, + "eval_loss": 0.4311943054199219, + "eval_runtime": 3.5347, + "eval_samples_per_second": 28.291, + "eval_steps_per_second": 7.073, "step": 79000 }, { - "epoch": 2.78, - "learning_rate": 2.1711027702367086e-05, - "loss": 0.2571, + "epoch": 2.8473348470104876, + "grad_norm": 0.1916167140007019, + "learning_rate": 2.0628464298780215e-05, + "loss": 0.39, "step": 79005 }, { - "epoch": 2.78, - "learning_rate": 2.1708203762482574e-05, - "loss": 0.2718, + "epoch": 2.8475150466717123, + "grad_norm": 0.4534687399864197, + "learning_rate": 2.0625591154445348e-05, + "loss": 0.3945, "step": 79010 }, { - "epoch": 2.78, - "learning_rate": 2.170537986533986e-05, - "loss": 0.2733, + "epoch": 2.847695246332937, + "grad_norm": 0.19475337862968445, + "learning_rate": 2.062271806971096e-05, + "loss": 0.4248, "step": 79015 }, { - "epoch": 2.78, - "learning_rate": 2.1702556010975623e-05, - 
"loss": 0.2939, + "epoch": 2.8478754459941618, + "grad_norm": 0.20240464806556702, + "learning_rate": 2.0619845044616195e-05, + "loss": 0.4073, "step": 79020 }, { - "epoch": 2.78, - "learning_rate": 2.169973219942651e-05, - "loss": 0.2649, + "epoch": 2.848055645655386, + "grad_norm": 0.21959447860717773, + "learning_rate": 2.0616972079200185e-05, + "loss": 0.4052, "step": 79025 }, { - "epoch": 2.78, - "learning_rate": 2.1696908430729214e-05, - "loss": 0.2827, + "epoch": 2.848235845316611, + "grad_norm": 0.21866481006145477, + "learning_rate": 2.061409917350209e-05, + "loss": 0.4036, "step": 79030 }, { - "epoch": 2.78, - "learning_rate": 2.169408470492038e-05, - "loss": 0.286, + "epoch": 2.8484160449778355, + "grad_norm": 0.19479642808437347, + "learning_rate": 2.0611226327561042e-05, + "loss": 0.3468, "step": 79035 }, { - "epoch": 2.78, - "learning_rate": 2.1691261022036674e-05, - "loss": 0.2421, + "epoch": 2.84859624463906, + "grad_norm": 0.24999023973941803, + "learning_rate": 2.060835354141618e-05, + "loss": 0.3691, "step": 79040 }, { - "epoch": 2.78, - "learning_rate": 2.168843738211476e-05, - "loss": 0.2687, + "epoch": 2.8487764443002845, + "grad_norm": 0.23853568732738495, + "learning_rate": 2.0605480815106656e-05, + "loss": 0.4224, "step": 79045 }, { - "epoch": 2.78, - "learning_rate": 2.1685613785191312e-05, - "loss": 0.2901, + "epoch": 2.8489566439615093, + "grad_norm": 0.2416432797908783, + "learning_rate": 2.0602608148671602e-05, + "loss": 0.3812, "step": 79050 }, { - "epoch": 2.78, - "learning_rate": 2.1682790231302988e-05, - "loss": 0.2639, + "epoch": 2.849136843622734, + "grad_norm": 0.18531100451946259, + "learning_rate": 2.0599735542150164e-05, + "loss": 0.4103, "step": 79055 }, { - "epoch": 2.78, - "learning_rate": 2.1679966720486445e-05, - "loss": 0.2533, + "epoch": 2.8493170432839587, + "grad_norm": 0.1804402768611908, + "learning_rate": 2.0596862995581485e-05, + "loss": 0.3648, "step": 79060 }, { - "epoch": 2.78, - "learning_rate": 2.1677143252778335e-05, - "loss": 0.2734, + "epoch": 2.8494972429451835, + "grad_norm": 0.21126359701156616, + "learning_rate": 2.0593990509004675e-05, + "loss": 0.3624, "step": 79065 }, { - "epoch": 2.78, - "learning_rate": 2.1674319828215337e-05, - "loss": 0.2918, + "epoch": 2.8496774426064078, + "grad_norm": 0.23747804760932922, + "learning_rate": 2.059111808245891e-05, + "loss": 0.4037, "step": 79070 }, { - "epoch": 2.78, - "learning_rate": 2.1671496446834108e-05, - "loss": 0.2567, + "epoch": 2.8498576422676325, + "grad_norm": 0.2273935079574585, + "learning_rate": 2.058824571598329e-05, + "loss": 0.4083, "step": 79075 }, { - "epoch": 2.78, - "learning_rate": 2.1668673108671306e-05, - "loss": 0.2541, + "epoch": 2.8500378419288572, + "grad_norm": 0.22935499250888824, + "learning_rate": 2.0585373409616985e-05, + "loss": 0.3983, "step": 79080 }, { - "epoch": 2.78, - "learning_rate": 2.1665849813763576e-05, - "loss": 0.2684, + "epoch": 2.850218041590082, + "grad_norm": 0.28681254386901855, + "learning_rate": 2.058250116339911e-05, + "loss": 0.39, "step": 79085 }, { - "epoch": 2.78, - "learning_rate": 2.16630265621476e-05, - "loss": 0.2823, + "epoch": 2.8503982412513063, + "grad_norm": 0.18276835978031158, + "learning_rate": 2.0579628977368792e-05, + "loss": 0.3573, "step": 79090 }, { - "epoch": 2.78, - "learning_rate": 2.166020335386002e-05, - "loss": 0.2759, + "epoch": 2.850578440912531, + "grad_norm": 0.19169297814369202, + "learning_rate": 2.0576756851565182e-05, + "loss": 0.4129, "step": 79095 }, { - "epoch": 2.78, - "learning_rate": 
2.1657380188937494e-05, - "loss": 0.284, + "epoch": 2.8507586405737557, + "grad_norm": 0.2185550034046173, + "learning_rate": 2.05738847860274e-05, + "loss": 0.4006, "step": 79100 }, { - "epoch": 2.78, - "learning_rate": 2.165455706741669e-05, - "loss": 0.2545, + "epoch": 2.8509388402349805, + "grad_norm": 0.24736441671848297, + "learning_rate": 2.0571012780794577e-05, + "loss": 0.3866, "step": 79105 }, { - "epoch": 2.78, - "learning_rate": 2.165173398933426e-05, - "loss": 0.2777, + "epoch": 2.851119039896205, + "grad_norm": 0.19851014018058777, + "learning_rate": 2.056814083590585e-05, + "loss": 0.4227, "step": 79110 }, { - "epoch": 2.78, - "learning_rate": 2.164891095472686e-05, - "loss": 0.2842, + "epoch": 2.8512992395574295, + "grad_norm": 0.19932295382022858, + "learning_rate": 2.0565268951400346e-05, + "loss": 0.3941, "step": 79115 }, { - "epoch": 2.78, - "learning_rate": 2.164608796363113e-05, - "loss": 0.2896, + "epoch": 2.851479439218654, + "grad_norm": 0.21272258460521698, + "learning_rate": 2.0562397127317197e-05, + "loss": 0.37, "step": 79120 }, { - "epoch": 2.78, - "learning_rate": 2.164326501608374e-05, - "loss": 0.2724, + "epoch": 2.851659638879879, + "grad_norm": 0.2207152098417282, + "learning_rate": 2.055952536369553e-05, + "loss": 0.3985, "step": 79125 }, { - "epoch": 2.78, - "learning_rate": 2.164044211212135e-05, - "loss": 0.2505, + "epoch": 2.8518398385411037, + "grad_norm": 0.2772730588912964, + "learning_rate": 2.0556653660574464e-05, + "loss": 0.385, "step": 79130 }, { - "epoch": 2.78, - "learning_rate": 2.1637619251780604e-05, - "loss": 0.2728, + "epoch": 2.852020038202328, + "grad_norm": 0.2133176475763321, + "learning_rate": 2.0553782017993135e-05, + "loss": 0.3741, "step": 79135 }, { - "epoch": 2.78, - "learning_rate": 2.163479643509815e-05, - "loss": 0.2859, + "epoch": 2.8522002378635527, + "grad_norm": 0.266476571559906, + "learning_rate": 2.055091043599067e-05, + "loss": 0.4071, "step": 79140 }, { - "epoch": 2.78, - "learning_rate": 2.163197366211065e-05, - "loss": 0.2734, + "epoch": 2.8523804375247774, + "grad_norm": 0.20669589936733246, + "learning_rate": 2.0548038914606174e-05, + "loss": 0.417, "step": 79145 }, { - "epoch": 2.78, - "learning_rate": 2.1629150932854752e-05, - "loss": 0.2739, + "epoch": 2.852560637186002, + "grad_norm": 0.21725420653820038, + "learning_rate": 2.0545167453878804e-05, + "loss": 0.3852, "step": 79150 }, { - "epoch": 2.78, - "learning_rate": 2.1626328247367112e-05, - "loss": 0.2747, + "epoch": 2.852740836847227, + "grad_norm": 0.23247067630290985, + "learning_rate": 2.0542296053847647e-05, + "loss": 0.399, "step": 79155 }, { - "epoch": 2.79, - "learning_rate": 2.1623505605684364e-05, - "loss": 0.2626, + "epoch": 2.8529210365084516, + "grad_norm": 0.2411661595106125, + "learning_rate": 2.0539424714551852e-05, + "loss": 0.4155, "step": 79160 }, { - "epoch": 2.79, - "learning_rate": 2.1620683007843184e-05, - "loss": 0.2693, + "epoch": 2.853101236169676, + "grad_norm": 0.2001282423734665, + "learning_rate": 2.053655343603054e-05, + "loss": 0.4257, "step": 79165 }, { - "epoch": 2.79, - "learning_rate": 2.1617860453880205e-05, - "loss": 0.2699, + "epoch": 2.8532814358309007, + "grad_norm": 0.21642963588237762, + "learning_rate": 2.0533682218322807e-05, + "loss": 0.4346, "step": 79170 }, { - "epoch": 2.79, - "learning_rate": 2.1615037943832076e-05, - "loss": 0.2775, + "epoch": 2.8534616354921254, + "grad_norm": 0.24576754868030548, + "learning_rate": 2.0530811061467802e-05, + "loss": 0.4105, "step": 79175 }, { - "epoch": 2.79, - 
"learning_rate": 2.1612215477735443e-05, - "loss": 0.2562, + "epoch": 2.8536418351533497, + "grad_norm": 0.16596747934818268, + "learning_rate": 2.052793996550463e-05, + "loss": 0.3769, "step": 79180 }, { - "epoch": 2.79, - "learning_rate": 2.160939305562697e-05, - "loss": 0.277, + "epoch": 2.8538220348145744, + "grad_norm": 0.20932526886463165, + "learning_rate": 2.05250689304724e-05, + "loss": 0.4079, "step": 79185 }, { - "epoch": 2.79, - "learning_rate": 2.1606570677543293e-05, - "loss": 0.2747, + "epoch": 2.854002234475799, + "grad_norm": 0.23779064416885376, + "learning_rate": 2.052219795641025e-05, + "loss": 0.4114, "step": 79190 }, { - "epoch": 2.79, - "learning_rate": 2.1603748343521062e-05, - "loss": 0.2458, + "epoch": 2.854182434137024, + "grad_norm": 0.17086093127727509, + "learning_rate": 2.0519327043357278e-05, + "loss": 0.3968, "step": 79195 }, { - "epoch": 2.79, - "learning_rate": 2.1600926053596904e-05, - "loss": 0.2618, + "epoch": 2.8543626337982486, + "grad_norm": 0.22263990342617035, + "learning_rate": 2.0516456191352612e-05, + "loss": 0.3955, "step": 79200 }, { - "epoch": 2.79, - "learning_rate": 2.159810380780749e-05, - "loss": 0.2707, + "epoch": 2.8545428334594733, + "grad_norm": 0.24070154130458832, + "learning_rate": 2.0513585400435363e-05, + "loss": 0.3586, "step": 79205 }, { - "epoch": 2.79, - "learning_rate": 2.1595281606189464e-05, - "loss": 0.2677, + "epoch": 2.8547230331206976, + "grad_norm": 0.18672329187393188, + "learning_rate": 2.0510714670644643e-05, + "loss": 0.3952, "step": 79210 }, { - "epoch": 2.79, - "learning_rate": 2.1592459448779446e-05, - "loss": 0.2631, + "epoch": 2.8549032327819224, + "grad_norm": 0.20222359895706177, + "learning_rate": 2.0507844002019564e-05, + "loss": 0.3989, "step": 79215 }, { - "epoch": 2.79, - "learning_rate": 2.1589637335614114e-05, - "loss": 0.2719, + "epoch": 2.855083432443147, + "grad_norm": 0.19654959440231323, + "learning_rate": 2.0504973394599247e-05, + "loss": 0.3831, "step": 79220 }, { - "epoch": 2.79, - "learning_rate": 2.1586815266730088e-05, - "loss": 0.2509, + "epoch": 2.8552636321043714, + "grad_norm": 0.24008645117282867, + "learning_rate": 2.050210284842279e-05, + "loss": 0.39, "step": 79225 }, { - "epoch": 2.79, - "learning_rate": 2.1583993242164012e-05, - "loss": 0.2838, + "epoch": 2.855443831765596, + "grad_norm": 0.20987041294574738, + "learning_rate": 2.0499232363529315e-05, + "loss": 0.3742, "step": 79230 }, { - "epoch": 2.79, - "learning_rate": 2.1581171261952535e-05, - "loss": 0.2725, + "epoch": 2.855624031426821, + "grad_norm": 0.22069920599460602, + "learning_rate": 2.0496361939957926e-05, + "loss": 0.4068, "step": 79235 }, { - "epoch": 2.79, - "learning_rate": 2.1578349326132293e-05, - "loss": 0.2576, + "epoch": 2.8558042310880456, + "grad_norm": 0.21462880074977875, + "learning_rate": 2.0493491577747738e-05, + "loss": 0.4358, "step": 79240 }, { - "epoch": 2.79, - "learning_rate": 2.157552743473994e-05, - "loss": 0.2681, + "epoch": 2.8559844307492703, + "grad_norm": 0.22585731744766235, + "learning_rate": 2.0490621276937853e-05, + "loss": 0.3979, "step": 79245 }, { - "epoch": 2.79, - "learning_rate": 2.15727055878121e-05, - "loss": 0.2652, + "epoch": 2.856164630410495, + "grad_norm": 0.20403482019901276, + "learning_rate": 2.048775103756737e-05, + "loss": 0.386, "step": 79250 }, { - "epoch": 2.79, - "learning_rate": 2.1569883785385413e-05, - "loss": 0.2611, + "epoch": 2.8563448300717194, + "grad_norm": 0.1699545532464981, + "learning_rate": 2.0484880859675422e-05, + "loss": 0.3828, "step": 79255 
}, { - "epoch": 2.79, - "learning_rate": 2.1567062027496538e-05, - "loss": 0.2867, + "epoch": 2.856525029732944, + "grad_norm": 0.15183869004249573, + "learning_rate": 2.0482010743301093e-05, + "loss": 0.3445, "step": 79260 }, { - "epoch": 2.79, - "learning_rate": 2.156424031418209e-05, - "loss": 0.2822, + "epoch": 2.856705229394169, + "grad_norm": 0.25381314754486084, + "learning_rate": 2.0479140688483485e-05, + "loss": 0.4077, "step": 79265 }, { - "epoch": 2.79, - "learning_rate": 2.1561418645478722e-05, - "loss": 0.2569, + "epoch": 2.856885429055393, + "grad_norm": 0.18916507065296173, + "learning_rate": 2.0476270695261716e-05, + "loss": 0.3801, "step": 79270 }, { - "epoch": 2.79, - "learning_rate": 2.155859702142306e-05, - "loss": 0.2762, + "epoch": 2.857065628716618, + "grad_norm": 0.18349602818489075, + "learning_rate": 2.0473400763674876e-05, + "loss": 0.4151, "step": 79275 }, { - "epoch": 2.79, - "learning_rate": 2.1555775442051756e-05, - "loss": 0.2956, + "epoch": 2.8572458283778426, + "grad_norm": 0.1902550309896469, + "learning_rate": 2.0470530893762087e-05, + "loss": 0.4191, "step": 79280 }, { - "epoch": 2.79, - "learning_rate": 2.1552953907401433e-05, - "loss": 0.2615, + "epoch": 2.8574260280390673, + "grad_norm": 0.23861198127269745, + "learning_rate": 2.046766108556243e-05, + "loss": 0.3875, "step": 79285 }, { - "epoch": 2.79, - "learning_rate": 2.1550132417508736e-05, - "loss": 0.2734, + "epoch": 2.857606227700292, + "grad_norm": 0.19769497215747833, + "learning_rate": 2.0464791339115014e-05, + "loss": 0.3807, "step": 79290 }, { - "epoch": 2.79, - "learning_rate": 2.1547310972410287e-05, - "loss": 0.2429, + "epoch": 2.857786427361517, + "grad_norm": 0.20650269091129303, + "learning_rate": 2.0461921654458938e-05, + "loss": 0.3896, "step": 79295 }, { - "epoch": 2.79, - "learning_rate": 2.154448957214274e-05, - "loss": 0.2657, + "epoch": 2.857966627022741, + "grad_norm": 0.19639898836612701, + "learning_rate": 2.0459052031633297e-05, + "loss": 0.3815, "step": 79300 }, { - "epoch": 2.79, - "learning_rate": 2.154166821674272e-05, - "loss": 0.2652, + "epoch": 2.858146826683966, + "grad_norm": 0.23616774380207062, + "learning_rate": 2.0456182470677198e-05, + "loss": 0.4199, "step": 79305 }, { - "epoch": 2.79, - "learning_rate": 2.1538846906246853e-05, - "loss": 0.2836, + "epoch": 2.8583270263451905, + "grad_norm": 0.17947378754615784, + "learning_rate": 2.0453312971629734e-05, + "loss": 0.3959, "step": 79310 }, { - "epoch": 2.79, - "learning_rate": 2.1536025640691774e-05, - "loss": 0.2665, + "epoch": 2.8585072260064153, + "grad_norm": 0.18461818993091583, + "learning_rate": 2.0450443534529995e-05, + "loss": 0.3596, "step": 79315 }, { - "epoch": 2.79, - "learning_rate": 2.1533204420114122e-05, - "loss": 0.2522, + "epoch": 2.8586874256676396, + "grad_norm": 0.19616423547267914, + "learning_rate": 2.044757415941709e-05, + "loss": 0.3676, "step": 79320 }, { - "epoch": 2.79, - "learning_rate": 2.1530383244550534e-05, - "loss": 0.2672, + "epoch": 2.8588676253288643, + "grad_norm": 0.18989914655685425, + "learning_rate": 2.0444704846330098e-05, + "loss": 0.4095, "step": 79325 }, { - "epoch": 2.79, - "learning_rate": 2.152756211403762e-05, - "loss": 0.2577, + "epoch": 2.859047824990089, + "grad_norm": 0.20153529942035675, + "learning_rate": 2.0441835595308122e-05, + "loss": 0.4075, "step": 79330 }, { - "epoch": 2.79, - "learning_rate": 2.1524741028612034e-05, - "loss": 0.2838, + "epoch": 2.8592280246513138, + "grad_norm": 0.1888132393360138, + "learning_rate": 2.0438966406390256e-05, + 
"loss": 0.4005, "step": 79335 }, { - "epoch": 2.79, - "learning_rate": 2.1521919988310392e-05, - "loss": 0.287, + "epoch": 2.8594082243125385, + "grad_norm": 0.21628032624721527, + "learning_rate": 2.0436097279615585e-05, + "loss": 0.4074, "step": 79340 }, { - "epoch": 2.79, - "learning_rate": 2.151909899316933e-05, - "loss": 0.2482, + "epoch": 2.859588423973763, + "grad_norm": 0.18922530114650726, + "learning_rate": 2.0433228215023213e-05, + "loss": 0.3821, "step": 79345 }, { - "epoch": 2.79, - "learning_rate": 2.151627804322546e-05, - "loss": 0.2592, + "epoch": 2.8597686236349875, + "grad_norm": 0.24143609404563904, + "learning_rate": 2.0430359212652224e-05, + "loss": 0.3769, "step": 79350 }, { - "epoch": 2.79, - "learning_rate": 2.151345713851544e-05, - "loss": 0.2645, + "epoch": 2.8599488232962123, + "grad_norm": 0.22710378468036652, + "learning_rate": 2.042749027254169e-05, + "loss": 0.4179, "step": 79355 }, { - "epoch": 2.79, - "learning_rate": 2.1510636279075878e-05, - "loss": 0.2633, + "epoch": 2.860129022957437, + "grad_norm": 0.23377621173858643, + "learning_rate": 2.0424621394730735e-05, + "loss": 0.3965, "step": 79360 }, { - "epoch": 2.79, - "learning_rate": 2.1507815464943397e-05, - "loss": 0.2484, + "epoch": 2.8603092226186613, + "grad_norm": 0.19734351336956024, + "learning_rate": 2.042175257925842e-05, + "loss": 0.3964, "step": 79365 }, { - "epoch": 2.79, - "learning_rate": 2.1504994696154633e-05, - "loss": 0.2675, + "epoch": 2.860489422279886, + "grad_norm": 0.21778082847595215, + "learning_rate": 2.0418883826163833e-05, + "loss": 0.4076, "step": 79370 }, { - "epoch": 2.79, - "learning_rate": 2.1502173972746215e-05, - "loss": 0.2762, + "epoch": 2.8606696219411107, + "grad_norm": 0.1940212845802307, + "learning_rate": 2.0416015135486074e-05, + "loss": 0.3652, "step": 79375 }, { - "epoch": 2.79, - "learning_rate": 2.149935329475476e-05, - "loss": 0.2842, + "epoch": 2.8608498216023355, + "grad_norm": 0.21918119490146637, + "learning_rate": 2.0413146507264216e-05, + "loss": 0.411, "step": 79380 }, { - "epoch": 2.79, - "learning_rate": 2.14965326622169e-05, - "loss": 0.2683, + "epoch": 2.86103002126356, + "grad_norm": 0.21737195551395416, + "learning_rate": 2.0410277941537352e-05, + "loss": 0.3977, "step": 79385 }, { - "epoch": 2.79, - "learning_rate": 2.149371207516924e-05, - "loss": 0.2686, + "epoch": 2.8612102209247845, + "grad_norm": 0.1796942949295044, + "learning_rate": 2.0407409438344566e-05, + "loss": 0.3913, "step": 79390 }, { - "epoch": 2.79, - "learning_rate": 2.1490891533648436e-05, - "loss": 0.2529, + "epoch": 2.8613904205860092, + "grad_norm": 0.20798861980438232, + "learning_rate": 2.040454099772493e-05, + "loss": 0.4107, "step": 79395 }, { - "epoch": 2.79, - "learning_rate": 2.148807103769108e-05, - "loss": 0.268, + "epoch": 2.861570620247234, + "grad_norm": 0.2642776370048523, + "learning_rate": 2.040167261971754e-05, + "loss": 0.3942, "step": 79400 }, { - "epoch": 2.79, - "learning_rate": 2.1485250587333817e-05, - "loss": 0.2525, + "epoch": 2.8617508199084587, + "grad_norm": 0.2090744972229004, + "learning_rate": 2.039880430436147e-05, + "loss": 0.3992, "step": 79405 }, { - "epoch": 2.79, - "learning_rate": 2.1482430182613248e-05, - "loss": 0.278, + "epoch": 2.861931019569683, + "grad_norm": 0.208735391497612, + "learning_rate": 2.0395936051695794e-05, + "loss": 0.37, "step": 79410 }, { - "epoch": 2.79, - "learning_rate": 2.1479609823566013e-05, - "loss": 0.2466, + "epoch": 2.8621112192309077, + "grad_norm": 0.2147226631641388, + "learning_rate": 
2.0393067861759604e-05, + "loss": 0.4038, "step": 79415 }, { - "epoch": 2.79, - "learning_rate": 2.1476789510228725e-05, - "loss": 0.2907, + "epoch": 2.8622914188921325, + "grad_norm": 0.21452739834785461, + "learning_rate": 2.0390199734591967e-05, + "loss": 0.4, "step": 79420 }, { - "epoch": 2.79, - "learning_rate": 2.1473969242638e-05, - "loss": 0.2831, + "epoch": 2.862471618553357, + "grad_norm": 0.21164734661579132, + "learning_rate": 2.0387331670231972e-05, + "loss": 0.3923, "step": 79425 }, { - "epoch": 2.79, - "learning_rate": 2.1471149020830454e-05, - "loss": 0.2742, + "epoch": 2.862651818214582, + "grad_norm": 0.25900834798812866, + "learning_rate": 2.0384463668718695e-05, + "loss": 0.4046, "step": 79430 }, { - "epoch": 2.79, - "learning_rate": 2.1468328844842722e-05, - "loss": 0.2506, + "epoch": 2.8628320178758067, + "grad_norm": 0.2033701092004776, + "learning_rate": 2.0381595730091187e-05, + "loss": 0.4171, "step": 79435 }, { - "epoch": 2.79, - "learning_rate": 2.1465508714711413e-05, - "loss": 0.2554, + "epoch": 2.863012217537031, + "grad_norm": 0.23494963347911835, + "learning_rate": 2.0378727854388557e-05, + "loss": 0.4213, "step": 79440 }, { - "epoch": 2.8, - "learning_rate": 2.146268863047314e-05, - "loss": 0.2887, + "epoch": 2.8631924171982557, + "grad_norm": 0.21937774121761322, + "learning_rate": 2.0375860041649874e-05, + "loss": 0.3904, "step": 79445 }, { - "epoch": 2.8, - "learning_rate": 2.145986859216452e-05, - "loss": 0.264, + "epoch": 2.8633726168594804, + "grad_norm": 0.18949083983898163, + "learning_rate": 2.0372992291914182e-05, + "loss": 0.3691, "step": 79450 }, { - "epoch": 2.8, - "learning_rate": 2.145704859982218e-05, - "loss": 0.2591, + "epoch": 2.8635528165207047, + "grad_norm": 0.23285529017448425, + "learning_rate": 2.037012460522059e-05, + "loss": 0.4134, "step": 79455 }, { - "epoch": 2.8, - "learning_rate": 2.145422865348273e-05, - "loss": 0.2317, + "epoch": 2.8637330161819294, + "grad_norm": 0.2178962379693985, + "learning_rate": 2.036725698160814e-05, + "loss": 0.3796, "step": 79460 }, { - "epoch": 2.8, - "learning_rate": 2.1451408753182772e-05, - "loss": 0.2664, + "epoch": 2.863913215843154, + "grad_norm": 0.20775160193443298, + "learning_rate": 2.0364389421115936e-05, + "loss": 0.3863, "step": 79465 }, { - "epoch": 2.8, - "learning_rate": 2.1448588898958947e-05, - "loss": 0.2619, + "epoch": 2.864093415504379, + "grad_norm": 0.20087379217147827, + "learning_rate": 2.0361521923783018e-05, + "loss": 0.3904, "step": 79470 }, { - "epoch": 2.8, - "learning_rate": 2.1445769090847854e-05, - "loss": 0.2431, + "epoch": 2.8642736151656036, + "grad_norm": 0.2299887239933014, + "learning_rate": 2.035865448964846e-05, + "loss": 0.4115, "step": 79475 }, { - "epoch": 2.8, - "learning_rate": 2.14429493288861e-05, - "loss": 0.249, + "epoch": 2.8644538148268284, + "grad_norm": 0.22085849940776825, + "learning_rate": 2.0355787118751346e-05, + "loss": 0.3658, "step": 79480 }, { - "epoch": 2.8, - "learning_rate": 2.1440129613110302e-05, - "loss": 0.2691, + "epoch": 2.8646340144880527, + "grad_norm": 0.22253592312335968, + "learning_rate": 2.0352919811130726e-05, + "loss": 0.4168, "step": 79485 }, { - "epoch": 2.8, - "learning_rate": 2.143730994355708e-05, - "loss": 0.2644, + "epoch": 2.8648142141492774, + "grad_norm": 0.20002683997154236, + "learning_rate": 2.035005256682568e-05, + "loss": 0.4205, "step": 79490 }, { - "epoch": 2.8, - "learning_rate": 2.1434490320263048e-05, - "loss": 0.2756, + "epoch": 2.864994413810502, + "grad_norm": 0.22108720242977142, + 
"learning_rate": 2.034718538587526e-05, + "loss": 0.4173, "step": 79495 }, { - "epoch": 2.8, - "learning_rate": 2.1431670743264806e-05, - "loss": 0.2753, + "epoch": 2.8651746134717264, + "grad_norm": 0.20260627567768097, + "learning_rate": 2.034431826831854e-05, + "loss": 0.4003, "step": 79500 }, { - "epoch": 2.8, - "eval_loss": 0.2603822350502014, - "eval_runtime": 10.5518, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 2.8651746134717264, + "eval_loss": 0.43116477131843567, + "eval_runtime": 3.5328, + "eval_samples_per_second": 28.306, + "eval_steps_per_second": 7.077, "step": 79500 }, { - "epoch": 2.8, - "learning_rate": 2.1428851212598958e-05, - "loss": 0.2806, + "epoch": 2.865354813132951, + "grad_norm": 0.20049650967121124, + "learning_rate": 2.0341451214194586e-05, + "loss": 0.3975, "step": 79505 }, { - "epoch": 2.8, - "learning_rate": 2.1426031728302126e-05, - "loss": 0.2657, + "epoch": 2.865535012794176, + "grad_norm": 0.21907275915145874, + "learning_rate": 2.0338584223542462e-05, + "loss": 0.4186, "step": 79510 }, { - "epoch": 2.8, - "learning_rate": 2.1423212290410922e-05, - "loss": 0.2625, + "epoch": 2.8657152124554006, + "grad_norm": 0.2359967678785324, + "learning_rate": 2.033571729640121e-05, + "loss": 0.3657, "step": 79515 }, { - "epoch": 2.8, - "learning_rate": 2.1420392898961956e-05, - "loss": 0.2688, + "epoch": 2.8658954121166254, + "grad_norm": 0.24726665019989014, + "learning_rate": 2.0332850432809922e-05, + "loss": 0.3807, "step": 79520 }, { - "epoch": 2.8, - "learning_rate": 2.141757355399181e-05, - "loss": 0.2591, + "epoch": 2.86607561177785, + "grad_norm": 0.24529293179512024, + "learning_rate": 2.0329983632807632e-05, + "loss": 0.4069, "step": 79525 }, { - "epoch": 2.8, - "learning_rate": 2.1414754255537126e-05, - "loss": 0.2542, + "epoch": 2.8662558114390744, + "grad_norm": 0.19809693098068237, + "learning_rate": 2.032711689643342e-05, + "loss": 0.3777, "step": 79530 }, { - "epoch": 2.8, - "learning_rate": 2.141193500363449e-05, - "loss": 0.2542, + "epoch": 2.866436011100299, + "grad_norm": 0.1797298640012741, + "learning_rate": 2.032425022372634e-05, + "loss": 0.3542, "step": 79535 }, { - "epoch": 2.8, - "learning_rate": 2.140911579832052e-05, - "loss": 0.2587, + "epoch": 2.866616210761524, + "grad_norm": 0.1798580288887024, + "learning_rate": 2.032138361472543e-05, + "loss": 0.3837, "step": 79540 }, { - "epoch": 2.8, - "learning_rate": 2.1406296639631798e-05, - "loss": 0.2797, + "epoch": 2.866796410422748, + "grad_norm": 0.2182854264974594, + "learning_rate": 2.0318517069469775e-05, + "loss": 0.406, "step": 79545 }, { - "epoch": 2.8, - "learning_rate": 2.140347752760496e-05, - "loss": 0.2718, + "epoch": 2.866976610083973, + "grad_norm": 0.19084390997886658, + "learning_rate": 2.0315650587998416e-05, + "loss": 0.3815, "step": 79550 }, { - "epoch": 2.8, - "learning_rate": 2.1400658462276595e-05, - "loss": 0.278, + "epoch": 2.8671568097451976, + "grad_norm": 0.20089589059352875, + "learning_rate": 2.0312784170350404e-05, + "loss": 0.3958, "step": 79555 }, { - "epoch": 2.8, - "learning_rate": 2.1397839443683305e-05, - "loss": 0.2467, + "epoch": 2.8673370094064223, + "grad_norm": 0.26835712790489197, + "learning_rate": 2.030991781656481e-05, + "loss": 0.3866, "step": 79560 }, { - "epoch": 2.8, - "learning_rate": 2.1395020471861688e-05, - "loss": 0.2569, + "epoch": 2.867517209067647, + "grad_norm": 0.1739826798439026, + "learning_rate": 2.030705152668066e-05, + "loss": 0.3778, "step": 79565 }, { - "epoch": 2.8, - "learning_rate": 
2.1392201546848368e-05, - "loss": 0.2563, + "epoch": 2.867697408728872, + "grad_norm": 0.23499102890491486, + "learning_rate": 2.0304185300737046e-05, + "loss": 0.3978, "step": 79570 }, { - "epoch": 2.8, - "learning_rate": 2.138938266867993e-05, - "loss": 0.2755, + "epoch": 2.867877608390096, + "grad_norm": 0.19982177019119263, + "learning_rate": 2.030131913877299e-05, + "loss": 0.4005, "step": 79575 }, { - "epoch": 2.8, - "learning_rate": 2.138656383739297e-05, - "loss": 0.2489, + "epoch": 2.868057808051321, + "grad_norm": 0.18712134659290314, + "learning_rate": 2.0298453040827544e-05, + "loss": 0.4348, "step": 79580 }, { - "epoch": 2.8, - "learning_rate": 2.1383745053024106e-05, - "loss": 0.2614, + "epoch": 2.8682380077125456, + "grad_norm": 0.26873958110809326, + "learning_rate": 2.0295587006939772e-05, + "loss": 0.3693, "step": 79585 }, { - "epoch": 2.8, - "learning_rate": 2.138092631560993e-05, - "loss": 0.2598, + "epoch": 2.8684182073737703, + "grad_norm": 0.1947907954454422, + "learning_rate": 2.0292721037148717e-05, + "loss": 0.3872, "step": 79590 }, { - "epoch": 2.8, - "learning_rate": 2.137810762518704e-05, - "loss": 0.2725, + "epoch": 2.8685984070349946, + "grad_norm": 0.16691918671131134, + "learning_rate": 2.0289855131493422e-05, + "loss": 0.3799, "step": 79595 }, { - "epoch": 2.8, - "learning_rate": 2.1375288981792026e-05, - "loss": 0.2691, + "epoch": 2.8687786066962193, + "grad_norm": 0.15523312985897064, + "learning_rate": 2.0286989290012945e-05, + "loss": 0.3837, "step": 79600 }, { - "epoch": 2.8, - "learning_rate": 2.1372470385461506e-05, - "loss": 0.2584, + "epoch": 2.868958806357444, + "grad_norm": 0.22626052796840668, + "learning_rate": 2.0284123512746317e-05, + "loss": 0.3807, "step": 79605 }, { - "epoch": 2.8, - "learning_rate": 2.136965183623207e-05, - "loss": 0.2645, + "epoch": 2.869139006018669, + "grad_norm": 0.25360921025276184, + "learning_rate": 2.0281257799732602e-05, + "loss": 0.401, "step": 79610 }, { - "epoch": 2.8, - "learning_rate": 2.1366833334140302e-05, - "loss": 0.2702, + "epoch": 2.8693192056798935, + "grad_norm": 0.21159762144088745, + "learning_rate": 2.0278392151010837e-05, + "loss": 0.4108, "step": 79615 }, { - "epoch": 2.8, - "learning_rate": 2.1364014879222807e-05, - "loss": 0.2578, + "epoch": 2.869499405341118, + "grad_norm": 0.19640955328941345, + "learning_rate": 2.0275526566620058e-05, + "loss": 0.3899, "step": 79620 }, { - "epoch": 2.8, - "learning_rate": 2.136119647151619e-05, - "loss": 0.2612, + "epoch": 2.8696796050023425, + "grad_norm": 0.20014135539531708, + "learning_rate": 2.0272661046599318e-05, + "loss": 0.3942, "step": 79625 }, { - "epoch": 2.8, - "learning_rate": 2.1358378111057046e-05, - "loss": 0.2815, + "epoch": 2.8698598046635673, + "grad_norm": 0.20158907771110535, + "learning_rate": 2.026979559098766e-05, + "loss": 0.367, "step": 79630 }, { - "epoch": 2.8, - "learning_rate": 2.1355559797881957e-05, - "loss": 0.2783, + "epoch": 2.870040004324792, + "grad_norm": 0.18245342373847961, + "learning_rate": 2.0266930199824108e-05, + "loss": 0.3979, "step": 79635 }, { - "epoch": 2.8, - "learning_rate": 2.1353305181410992e-05, - "loss": 0.2755, + "epoch": 2.8702202039860163, + "grad_norm": 0.2652152478694916, + "learning_rate": 2.0264064873147735e-05, + "loss": 0.3925, "step": 79640 }, { - "epoch": 2.8, - "learning_rate": 2.1350486953439432e-05, - "loss": 0.2867, + "epoch": 2.870400403647241, + "grad_norm": 0.22294139862060547, + "learning_rate": 2.026119961099754e-05, + "loss": 0.4227, "step": 79645 }, { - "epoch": 2.8, - 
"learning_rate": 2.134766877285438e-05, - "loss": 0.2578, + "epoch": 2.8705806033084658, + "grad_norm": 0.20929698646068573, + "learning_rate": 2.02583344134126e-05, + "loss": 0.3903, "step": 79650 }, { - "epoch": 2.8, - "learning_rate": 2.1344850639692447e-05, - "loss": 0.2673, + "epoch": 2.8707608029696905, + "grad_norm": 0.2380889505147934, + "learning_rate": 2.0255469280431932e-05, + "loss": 0.3933, "step": 79655 }, { - "epoch": 2.8, - "learning_rate": 2.1342032553990228e-05, - "loss": 0.2736, + "epoch": 2.8709410026309152, + "grad_norm": 0.21243393421173096, + "learning_rate": 2.025260421209457e-05, + "loss": 0.4065, "step": 79660 }, { - "epoch": 2.8, - "learning_rate": 2.13392145157843e-05, - "loss": 0.2522, + "epoch": 2.87112120229214, + "grad_norm": 0.2032131552696228, + "learning_rate": 2.0249739208439562e-05, + "loss": 0.373, "step": 79665 }, { - "epoch": 2.8, - "learning_rate": 2.1336396525111252e-05, - "loss": 0.266, + "epoch": 2.8713014019533643, + "grad_norm": 0.2194889783859253, + "learning_rate": 2.0246874269505934e-05, + "loss": 0.4264, "step": 79670 }, { - "epoch": 2.8, - "learning_rate": 2.133357858200769e-05, - "loss": 0.2527, + "epoch": 2.871481601614589, + "grad_norm": 0.20919016003608704, + "learning_rate": 2.0244009395332725e-05, + "loss": 0.4062, "step": 79675 }, { - "epoch": 2.8, - "learning_rate": 2.1330760686510188e-05, - "loss": 0.2538, + "epoch": 2.8716618012758137, + "grad_norm": 0.17519910633563995, + "learning_rate": 2.024114458595897e-05, + "loss": 0.3697, "step": 79680 }, { - "epoch": 2.8, - "learning_rate": 2.132794283865535e-05, - "loss": 0.2634, + "epoch": 2.871842000937038, + "grad_norm": 0.2320595383644104, + "learning_rate": 2.0238279841423693e-05, + "loss": 0.3617, "step": 79685 }, { - "epoch": 2.8, - "learning_rate": 2.1325125038479736e-05, - "loss": 0.2837, + "epoch": 2.8720222005982627, + "grad_norm": 0.20876125991344452, + "learning_rate": 2.0235415161765936e-05, + "loss": 0.4013, "step": 79690 }, { - "epoch": 2.8, - "learning_rate": 2.132230728601997e-05, - "loss": 0.2694, + "epoch": 2.8722024002594875, + "grad_norm": 0.2067393809556961, + "learning_rate": 2.0232550547024726e-05, + "loss": 0.4045, "step": 79695 }, { - "epoch": 2.8, - "learning_rate": 2.1319489581312614e-05, - "loss": 0.2669, + "epoch": 2.872382599920712, + "grad_norm": 0.2197972983121872, + "learning_rate": 2.0229685997239088e-05, + "loss": 0.3972, "step": 79700 }, { - "epoch": 2.8, - "learning_rate": 2.131667192439426e-05, - "loss": 0.2794, + "epoch": 2.872562799581937, + "grad_norm": 0.23758886754512787, + "learning_rate": 2.0226821512448057e-05, + "loss": 0.3835, "step": 79705 }, { - "epoch": 2.8, - "learning_rate": 2.1313854315301486e-05, - "loss": 0.2697, + "epoch": 2.8727429992431617, + "grad_norm": 0.20239755511283875, + "learning_rate": 2.0223957092690653e-05, + "loss": 0.3908, "step": 79710 }, { - "epoch": 2.8, - "learning_rate": 2.1311036754070897e-05, - "loss": 0.2478, + "epoch": 2.872923198904386, + "grad_norm": 0.23541153967380524, + "learning_rate": 2.0221092738005916e-05, + "loss": 0.3804, "step": 79715 }, { - "epoch": 2.8, - "learning_rate": 2.130821924073906e-05, - "loss": 0.2492, + "epoch": 2.8731033985656107, + "grad_norm": 0.19039641320705414, + "learning_rate": 2.021822844843287e-05, + "loss": 0.3887, "step": 79720 }, { - "epoch": 2.8, - "learning_rate": 2.1305401775342566e-05, - "loss": 0.2565, + "epoch": 2.8732835982268354, + "grad_norm": 0.20644930005073547, + "learning_rate": 2.0215364224010522e-05, + "loss": 0.3808, "step": 79725 }, { - "epoch": 2.81, 
- "learning_rate": 2.1302584357917985e-05, - "loss": 0.2698, + "epoch": 2.8734637978880597, + "grad_norm": 0.2427452951669693, + "learning_rate": 2.021250006477792e-05, + "loss": 0.4126, "step": 79730 }, { - "epoch": 2.81, - "learning_rate": 2.1299766988501913e-05, - "loss": 0.2696, + "epoch": 2.8736439975492845, + "grad_norm": 0.19568517804145813, + "learning_rate": 2.020963597077408e-05, + "loss": 0.4055, "step": 79735 }, { - "epoch": 2.81, - "learning_rate": 2.1296949667130935e-05, - "loss": 0.278, + "epoch": 2.873824197210509, + "grad_norm": 0.21443352103233337, + "learning_rate": 2.0206771942038008e-05, + "loss": 0.3971, "step": 79740 }, { - "epoch": 2.81, - "learning_rate": 2.1294132393841627e-05, - "loss": 0.2425, + "epoch": 2.874004396871734, + "grad_norm": 0.19338439404964447, + "learning_rate": 2.020390797860876e-05, + "loss": 0.3834, "step": 79745 }, { - "epoch": 2.81, - "learning_rate": 2.1291315168670553e-05, - "loss": 0.2866, + "epoch": 2.8741845965329587, + "grad_norm": 0.1852528303861618, + "learning_rate": 2.0201044080525314e-05, + "loss": 0.3668, "step": 79750 }, { - "epoch": 2.81, - "learning_rate": 2.1288497991654322e-05, - "loss": 0.2734, + "epoch": 2.8743647961941834, + "grad_norm": 0.2564043402671814, + "learning_rate": 2.0198180247826734e-05, + "loss": 0.4105, "step": 79755 }, { - "epoch": 2.81, - "learning_rate": 2.128568086282949e-05, - "loss": 0.2391, + "epoch": 2.8745449958554077, + "grad_norm": 0.227681502699852, + "learning_rate": 2.0195316480552013e-05, + "loss": 0.3855, "step": 79760 }, { - "epoch": 2.81, - "learning_rate": 2.128286378223264e-05, - "loss": 0.2667, + "epoch": 2.8747251955166324, + "grad_norm": 0.21400003135204315, + "learning_rate": 2.0192452778740166e-05, + "loss": 0.384, "step": 79765 }, { - "epoch": 2.81, - "learning_rate": 2.1280046749900364e-05, - "loss": 0.2573, + "epoch": 2.874905395177857, + "grad_norm": 0.17560800909996033, + "learning_rate": 2.018958914243023e-05, + "loss": 0.3919, "step": 79770 }, { - "epoch": 2.81, - "learning_rate": 2.127722976586923e-05, - "loss": 0.2598, + "epoch": 2.8750855948390814, + "grad_norm": 0.2656613886356354, + "learning_rate": 2.0186725571661207e-05, + "loss": 0.4083, "step": 79775 }, { - "epoch": 2.81, - "learning_rate": 2.1274412830175813e-05, - "loss": 0.264, + "epoch": 2.875265794500306, + "grad_norm": 0.23810866475105286, + "learning_rate": 2.018386206647211e-05, + "loss": 0.3885, "step": 79780 }, { - "epoch": 2.81, - "learning_rate": 2.127159594285668e-05, - "loss": 0.2813, + "epoch": 2.875445994161531, + "grad_norm": 0.23813889920711517, + "learning_rate": 2.0180998626901966e-05, + "loss": 0.3846, "step": 79785 }, { - "epoch": 2.81, - "learning_rate": 2.1268779103948423e-05, - "loss": 0.2484, + "epoch": 2.8756261938227556, + "grad_norm": 0.19781778752803802, + "learning_rate": 2.017813525298978e-05, + "loss": 0.385, "step": 79790 }, { - "epoch": 2.81, - "learning_rate": 2.1265962313487618e-05, - "loss": 0.2816, + "epoch": 2.8758063934839804, + "grad_norm": 0.23655954003334045, + "learning_rate": 2.017527194477457e-05, + "loss": 0.406, "step": 79795 }, { - "epoch": 2.81, - "learning_rate": 2.126314557151083e-05, - "loss": 0.2595, + "epoch": 2.875986593145205, + "grad_norm": 0.2517338991165161, + "learning_rate": 2.0172408702295347e-05, + "loss": 0.3935, "step": 79800 }, { - "epoch": 2.81, - "learning_rate": 2.1260328878054618e-05, - "loss": 0.2617, + "epoch": 2.8761667928064294, + "grad_norm": 0.19464854896068573, + "learning_rate": 2.0169545525591117e-05, + "loss": 0.41, "step": 79805 }, { - 
"epoch": 2.81, - "learning_rate": 2.1257512233155587e-05, - "loss": 0.2855, + "epoch": 2.876346992467654, + "grad_norm": 0.2122073471546173, + "learning_rate": 2.0166682414700896e-05, + "loss": 0.3577, "step": 79810 }, { - "epoch": 2.81, - "learning_rate": 2.1254695636850288e-05, - "loss": 0.2838, + "epoch": 2.876527192128879, + "grad_norm": 0.22305898368358612, + "learning_rate": 2.01638193696637e-05, + "loss": 0.4146, "step": 79815 }, { - "epoch": 2.81, - "learning_rate": 2.12518790891753e-05, - "loss": 0.2773, + "epoch": 2.8767073917901036, + "grad_norm": 0.2052910327911377, + "learning_rate": 2.0160956390518508e-05, + "loss": 0.3865, "step": 79820 }, { - "epoch": 2.81, - "learning_rate": 2.1249062590167186e-05, - "loss": 0.2413, + "epoch": 2.876887591451328, + "grad_norm": 0.23175741732120514, + "learning_rate": 2.0158093477304367e-05, + "loss": 0.3845, "step": 79825 }, { - "epoch": 2.81, - "learning_rate": 2.124624613986253e-05, - "loss": 0.2865, + "epoch": 2.8770677911125526, + "grad_norm": 0.14851652085781097, + "learning_rate": 2.0155230630060252e-05, + "loss": 0.3451, "step": 79830 }, { - "epoch": 2.81, - "learning_rate": 2.1243429738297895e-05, - "loss": 0.2846, + "epoch": 2.8772479907737774, + "grad_norm": 0.21722625195980072, + "learning_rate": 2.0152367848825195e-05, + "loss": 0.4043, "step": 79835 }, { - "epoch": 2.81, - "learning_rate": 2.124061338550984e-05, - "loss": 0.2647, + "epoch": 2.877428190435002, + "grad_norm": 0.17405612766742706, + "learning_rate": 2.014950513363818e-05, + "loss": 0.3895, "step": 79840 }, { - "epoch": 2.81, - "learning_rate": 2.1237797081534948e-05, - "loss": 0.2382, + "epoch": 2.877608390096227, + "grad_norm": 0.23024611175060272, + "learning_rate": 2.0146642484538217e-05, + "loss": 0.4022, "step": 79845 }, { - "epoch": 2.81, - "learning_rate": 2.123498082640978e-05, - "loss": 0.2995, + "epoch": 2.877788589757451, + "grad_norm": 0.23710598051548004, + "learning_rate": 2.0143779901564313e-05, + "loss": 0.3921, "step": 79850 }, { - "epoch": 2.81, - "learning_rate": 2.1232164620170908e-05, - "loss": 0.2746, + "epoch": 2.877968789418676, + "grad_norm": 0.19851794838905334, + "learning_rate": 2.014091738475547e-05, + "loss": 0.4004, "step": 79855 }, { - "epoch": 2.81, - "learning_rate": 2.1229348462854898e-05, - "loss": 0.2655, + "epoch": 2.8781489890799006, + "grad_norm": 0.19404460489749908, + "learning_rate": 2.013805493415068e-05, + "loss": 0.3714, "step": 79860 }, { - "epoch": 2.81, - "learning_rate": 2.1226532354498303e-05, - "loss": 0.2675, + "epoch": 2.8783291887411253, + "grad_norm": 0.1905554085969925, + "learning_rate": 2.0135192549788955e-05, + "loss": 0.3373, "step": 79865 }, { - "epoch": 2.81, - "learning_rate": 2.12237162951377e-05, - "loss": 0.2652, + "epoch": 2.8785093884023496, + "grad_norm": 0.20806780457496643, + "learning_rate": 2.0132330231709287e-05, + "loss": 0.387, "step": 79870 }, { - "epoch": 2.81, - "learning_rate": 2.1220900284809657e-05, - "loss": 0.2666, + "epoch": 2.8786895880635743, + "grad_norm": 0.2478901594877243, + "learning_rate": 2.012946797995068e-05, + "loss": 0.4016, "step": 79875 }, { - "epoch": 2.81, - "learning_rate": 2.1218084323550728e-05, - "loss": 0.2566, + "epoch": 2.878869787724799, + "grad_norm": 0.17782168090343475, + "learning_rate": 2.012660579455213e-05, + "loss": 0.4067, "step": 79880 }, { - "epoch": 2.81, - "learning_rate": 2.121526841139749e-05, - "loss": 0.2406, + "epoch": 2.879049987386024, + "grad_norm": 0.2104375660419464, + "learning_rate": 2.0123743675552624e-05, + "loss": 0.3679, 
"step": 79885 }, { - "epoch": 2.81, - "learning_rate": 2.1212452548386497e-05, - "loss": 0.2773, + "epoch": 2.8792301870472485, + "grad_norm": 0.16701386868953705, + "learning_rate": 2.0120881622991178e-05, + "loss": 0.3655, "step": 79890 }, { - "epoch": 2.81, - "learning_rate": 2.1209636734554307e-05, - "loss": 0.2602, + "epoch": 2.879410386708473, + "grad_norm": 0.24730798602104187, + "learning_rate": 2.0118019636906765e-05, + "loss": 0.3892, "step": 79895 }, { - "epoch": 2.81, - "learning_rate": 2.1206820969937485e-05, - "loss": 0.2835, + "epoch": 2.8795905863696976, + "grad_norm": 0.19160984456539154, + "learning_rate": 2.0115157717338396e-05, + "loss": 0.4107, "step": 79900 }, { - "epoch": 2.81, - "learning_rate": 2.1204005254572594e-05, - "loss": 0.2683, + "epoch": 2.8797707860309223, + "grad_norm": 0.22043399512767792, + "learning_rate": 2.0112295864325057e-05, + "loss": 0.3838, "step": 79905 }, { - "epoch": 2.81, - "learning_rate": 2.1201189588496197e-05, - "loss": 0.2697, + "epoch": 2.879950985692147, + "grad_norm": 0.1867411732673645, + "learning_rate": 2.0109434077905737e-05, + "loss": 0.4019, "step": 79910 }, { - "epoch": 2.81, - "learning_rate": 2.119837397174485e-05, - "loss": 0.2709, + "epoch": 2.8801311853533713, + "grad_norm": 0.23410554230213165, + "learning_rate": 2.0106572358119433e-05, + "loss": 0.398, "step": 79915 }, { - "epoch": 2.81, - "learning_rate": 2.1195558404355104e-05, - "loss": 0.2803, + "epoch": 2.880311385014596, + "grad_norm": 0.2368057370185852, + "learning_rate": 2.0103710705005142e-05, + "loss": 0.3796, "step": 79920 }, { - "epoch": 2.81, - "learning_rate": 2.1192742886363532e-05, - "loss": 0.2833, + "epoch": 2.880491584675821, + "grad_norm": 0.14740437269210815, + "learning_rate": 2.0100849118601824e-05, + "loss": 0.3645, "step": 79925 }, { - "epoch": 2.81, - "learning_rate": 2.1189927417806678e-05, - "loss": 0.2837, + "epoch": 2.8806717843370455, + "grad_norm": 0.18294650316238403, + "learning_rate": 2.0097987598948507e-05, + "loss": 0.3756, "step": 79930 }, { - "epoch": 2.81, - "learning_rate": 2.1187111998721114e-05, - "loss": 0.2748, + "epoch": 2.8808519839982702, + "grad_norm": 0.18560317158699036, + "learning_rate": 2.0095126146084145e-05, + "loss": 0.3951, "step": 79935 }, { - "epoch": 2.81, - "learning_rate": 2.1184296629143377e-05, - "loss": 0.2607, + "epoch": 2.881032183659495, + "grad_norm": 0.2644931375980377, + "learning_rate": 2.0092264760047758e-05, + "loss": 0.3868, "step": 79940 }, { - "epoch": 2.81, - "learning_rate": 2.1181481309110043e-05, - "loss": 0.2764, + "epoch": 2.8812123833207193, + "grad_norm": 0.197056844830513, + "learning_rate": 2.00894034408783e-05, + "loss": 0.3664, "step": 79945 }, { - "epoch": 2.81, - "learning_rate": 2.1178666038657653e-05, - "loss": 0.2747, + "epoch": 2.881392582981944, + "grad_norm": 0.2113136351108551, + "learning_rate": 2.0086542188614772e-05, + "loss": 0.3959, "step": 79950 }, { - "epoch": 2.81, - "learning_rate": 2.1175850817822774e-05, - "loss": 0.3014, + "epoch": 2.8815727826431687, + "grad_norm": 0.21340829133987427, + "learning_rate": 2.0083681003296158e-05, + "loss": 0.3783, "step": 79955 }, { - "epoch": 2.81, - "learning_rate": 2.1173035646641938e-05, - "loss": 0.2642, + "epoch": 2.881752982304393, + "grad_norm": 0.2382301241159439, + "learning_rate": 2.0080819884961437e-05, + "loss": 0.3865, "step": 79960 }, { - "epoch": 2.81, - "learning_rate": 2.1170220525151724e-05, - "loss": 0.2579, + "epoch": 2.8819331819656178, + "grad_norm": 0.2526952028274536, + "learning_rate": 
2.007795883364959e-05, + "loss": 0.3977, "step": 79965 }, { - "epoch": 2.81, - "learning_rate": 2.1167405453388673e-05, - "loss": 0.2858, + "epoch": 2.8821133816268425, + "grad_norm": 0.2137371152639389, + "learning_rate": 2.0075097849399603e-05, + "loss": 0.3798, "step": 79970 }, { - "epoch": 2.81, - "learning_rate": 2.1164590431389326e-05, - "loss": 0.268, + "epoch": 2.8822935812880672, + "grad_norm": 0.2008858621120453, + "learning_rate": 2.007223693225045e-05, + "loss": 0.4151, "step": 79975 }, { - "epoch": 2.81, - "learning_rate": 2.116177545919025e-05, - "loss": 0.2737, + "epoch": 2.882473780949292, + "grad_norm": 0.20356670022010803, + "learning_rate": 2.006937608224112e-05, + "loss": 0.4056, "step": 79980 }, { - "epoch": 2.81, - "learning_rate": 2.115896053682799e-05, - "loss": 0.2739, + "epoch": 2.8826539806105167, + "grad_norm": 0.23385298252105713, + "learning_rate": 2.006651529941059e-05, + "loss": 0.4071, "step": 79985 }, { - "epoch": 2.81, - "learning_rate": 2.11561456643391e-05, - "loss": 0.2873, + "epoch": 2.882834180271741, + "grad_norm": 0.24331295490264893, + "learning_rate": 2.0063654583797825e-05, + "loss": 0.3944, "step": 79990 }, { - "epoch": 2.81, - "learning_rate": 2.1153330841760115e-05, - "loss": 0.2966, + "epoch": 2.8830143799329657, + "grad_norm": 0.22254779934883118, + "learning_rate": 2.0060793935441818e-05, + "loss": 0.4021, "step": 79995 }, { - "epoch": 2.81, - "learning_rate": 2.1150516069127603e-05, - "loss": 0.2477, + "epoch": 2.8831945795941905, + "grad_norm": 0.24361281096935272, + "learning_rate": 2.0057933354381543e-05, + "loss": 0.4402, "step": 80000 }, { - "epoch": 2.81, - "eval_loss": 0.26057764887809753, - "eval_runtime": 10.5527, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 2.8831945795941905, + "eval_loss": 0.43133535981178284, + "eval_runtime": 3.5314, + "eval_samples_per_second": 28.317, + "eval_steps_per_second": 7.079, "step": 80000 }, { - "epoch": 2.81, - "learning_rate": 2.114770134647811e-05, - "loss": 0.261, + "epoch": 2.8833747792554147, + "grad_norm": 0.19643712043762207, + "learning_rate": 2.0055072840655952e-05, + "loss": 0.3874, "step": 80005 }, { - "epoch": 2.81, - "learning_rate": 2.1144886673848164e-05, - "loss": 0.2834, + "epoch": 2.8835549789166395, + "grad_norm": 0.23182834684848785, + "learning_rate": 2.005221239430405e-05, + "loss": 0.3392, "step": 80010 }, { - "epoch": 2.82, - "learning_rate": 2.114207205127432e-05, - "loss": 0.2665, + "epoch": 2.883735178577864, + "grad_norm": 0.19430822134017944, + "learning_rate": 2.004935201536478e-05, + "loss": 0.4039, "step": 80015 }, { - "epoch": 2.82, - "learning_rate": 2.1139257478793137e-05, - "loss": 0.2911, + "epoch": 2.883915378239089, + "grad_norm": 0.2064819037914276, + "learning_rate": 2.0046491703877143e-05, + "loss": 0.3976, "step": 80020 }, { - "epoch": 2.82, - "learning_rate": 2.1136442956441155e-05, - "loss": 0.2903, + "epoch": 2.8840955779003137, + "grad_norm": 0.22902020812034607, + "learning_rate": 2.0043631459880103e-05, + "loss": 0.3979, "step": 80025 }, { - "epoch": 2.82, - "learning_rate": 2.1133628484254906e-05, - "loss": 0.2478, + "epoch": 2.8842757775615384, + "grad_norm": 0.26856574416160583, + "learning_rate": 2.004077128341261e-05, + "loss": 0.3935, "step": 80030 }, { - "epoch": 2.82, - "learning_rate": 2.1130814062270944e-05, - "loss": 0.2896, + "epoch": 2.8844559772227627, + "grad_norm": 0.2205887734889984, + "learning_rate": 2.0037911174513663e-05, + "loss": 0.3823, "step": 80035 }, { - "epoch": 2.82, - 
"learning_rate": 2.112799969052581e-05, - "loss": 0.2863, + "epoch": 2.8846361768839874, + "grad_norm": 0.17064271867275238, + "learning_rate": 2.003505113322221e-05, + "loss": 0.348, "step": 80040 }, { - "epoch": 2.82, - "learning_rate": 2.112518536905606e-05, - "loss": 0.2933, + "epoch": 2.884816376545212, + "grad_norm": 0.2369140088558197, + "learning_rate": 2.003219115957722e-05, + "loss": 0.3831, "step": 80045 }, { - "epoch": 2.82, - "learning_rate": 2.112237109789822e-05, - "loss": 0.2573, + "epoch": 2.8849965762064365, + "grad_norm": 0.19578613340854645, + "learning_rate": 2.0029331253617666e-05, + "loss": 0.3514, "step": 80050 }, { - "epoch": 2.82, - "learning_rate": 2.1119556877088824e-05, - "loss": 0.2869, + "epoch": 2.885176775867661, + "grad_norm": 0.22336263954639435, + "learning_rate": 2.0026471415382507e-05, + "loss": 0.4095, "step": 80055 }, { - "epoch": 2.82, - "learning_rate": 2.111674270666444e-05, - "loss": 0.2613, + "epoch": 2.885356975528886, + "grad_norm": 0.20325559377670288, + "learning_rate": 2.0023611644910716e-05, + "loss": 0.412, "step": 80060 }, { - "epoch": 2.82, - "learning_rate": 2.1113928586661583e-05, - "loss": 0.2761, + "epoch": 2.8855371751901107, + "grad_norm": 0.21424368023872375, + "learning_rate": 2.0020751942241255e-05, + "loss": 0.3845, "step": 80065 }, { - "epoch": 2.82, - "learning_rate": 2.111111451711681e-05, - "loss": 0.2667, + "epoch": 2.8857173748513354, + "grad_norm": 0.2809605300426483, + "learning_rate": 2.001789230741308e-05, + "loss": 0.3961, "step": 80070 }, { - "epoch": 2.82, - "learning_rate": 2.1108300498066638e-05, - "loss": 0.267, + "epoch": 2.88589757451256, + "grad_norm": 0.18413975834846497, + "learning_rate": 2.0015032740465165e-05, + "loss": 0.4107, "step": 80075 }, { - "epoch": 2.82, - "learning_rate": 2.1105486529547633e-05, - "loss": 0.2622, + "epoch": 2.8860777741737844, + "grad_norm": 0.28514033555984497, + "learning_rate": 2.0012173241436456e-05, + "loss": 0.3854, "step": 80080 }, { - "epoch": 2.82, - "learning_rate": 2.1102672611596318e-05, - "loss": 0.2929, + "epoch": 2.886257973835009, + "grad_norm": 0.27392441034317017, + "learning_rate": 2.0009313810365925e-05, + "loss": 0.3835, "step": 80085 }, { - "epoch": 2.82, - "learning_rate": 2.1099858744249226e-05, - "loss": 0.252, + "epoch": 2.886438173496234, + "grad_norm": 0.1907772570848465, + "learning_rate": 2.000645444729253e-05, + "loss": 0.3915, "step": 80090 }, { - "epoch": 2.82, - "learning_rate": 2.1097044927542892e-05, - "loss": 0.2905, + "epoch": 2.8866183731574586, + "grad_norm": 0.2062581181526184, + "learning_rate": 2.0003595152255218e-05, + "loss": 0.3623, "step": 80095 }, { - "epoch": 2.82, - "learning_rate": 2.109423116151387e-05, - "loss": 0.2739, + "epoch": 2.886798572818683, + "grad_norm": 0.20222362875938416, + "learning_rate": 2.000073592529296e-05, + "loss": 0.369, "step": 80100 }, { - "epoch": 2.82, - "learning_rate": 2.109141744619868e-05, - "loss": 0.2775, + "epoch": 2.8869787724799076, + "grad_norm": 0.21699479222297668, + "learning_rate": 1.9997876766444716e-05, + "loss": 0.3965, "step": 80105 }, { - "epoch": 2.82, - "learning_rate": 2.1088603781633847e-05, - "loss": 0.2554, + "epoch": 2.8871589721411324, + "grad_norm": 0.24619553983211517, + "learning_rate": 1.999501767574941e-05, + "loss": 0.4102, "step": 80110 }, { - "epoch": 2.82, - "learning_rate": 2.108579016785593e-05, - "loss": 0.2898, + "epoch": 2.887339171802357, + "grad_norm": 0.21754342317581177, + "learning_rate": 1.9992158653246042e-05, + "loss": 0.3494, "step": 80115 }, { - 
"epoch": 2.82, - "learning_rate": 2.108297660490144e-05, - "loss": 0.2458, + "epoch": 2.887519371463582, + "grad_norm": 0.22781917452812195, + "learning_rate": 1.9989299698973525e-05, + "loss": 0.3932, "step": 80120 }, { - "epoch": 2.82, - "learning_rate": 2.1080163092806927e-05, - "loss": 0.2766, + "epoch": 2.887699571124806, + "grad_norm": 0.17550964653491974, + "learning_rate": 1.9986440812970844e-05, + "loss": 0.3504, "step": 80125 }, { - "epoch": 2.82, - "learning_rate": 2.1077349631608894e-05, - "loss": 0.2797, + "epoch": 2.887879770786031, + "grad_norm": 0.20450954139232635, + "learning_rate": 1.998358199527693e-05, + "loss": 0.4168, "step": 80130 }, { - "epoch": 2.82, - "learning_rate": 2.107453622134391e-05, - "loss": 0.2439, + "epoch": 2.8880599704472556, + "grad_norm": 0.20474529266357422, + "learning_rate": 1.9980723245930737e-05, + "loss": 0.3699, "step": 80135 }, { - "epoch": 2.82, - "learning_rate": 2.107172286204848e-05, - "loss": 0.2781, + "epoch": 2.8882401701084803, + "grad_norm": 0.22211550176143646, + "learning_rate": 1.9977864564971225e-05, + "loss": 0.3711, "step": 80140 }, { - "epoch": 2.82, - "learning_rate": 2.1068909553759138e-05, - "loss": 0.2773, + "epoch": 2.8884203697697046, + "grad_norm": 0.2340330183506012, + "learning_rate": 1.9975005952437336e-05, + "loss": 0.3975, "step": 80145 }, { - "epoch": 2.82, - "learning_rate": 2.106609629651241e-05, - "loss": 0.2765, + "epoch": 2.8886005694309294, + "grad_norm": 0.211472749710083, + "learning_rate": 1.9972147408368008e-05, + "loss": 0.3676, "step": 80150 }, { - "epoch": 2.82, - "learning_rate": 2.106328309034484e-05, - "loss": 0.2547, + "epoch": 2.888780769092154, + "grad_norm": 0.17939423024654388, + "learning_rate": 1.9969288932802205e-05, + "loss": 0.3778, "step": 80155 }, { - "epoch": 2.82, - "learning_rate": 2.106046993529294e-05, - "loss": 0.2779, + "epoch": 2.888960968753379, + "grad_norm": 0.21015068888664246, + "learning_rate": 1.996643052577886e-05, + "loss": 0.3659, "step": 80160 }, { - "epoch": 2.82, - "learning_rate": 2.1057656831393246e-05, - "loss": 0.2747, + "epoch": 2.8891411684146036, + "grad_norm": 0.20937329530715942, + "learning_rate": 1.9963572187336935e-05, + "loss": 0.3988, "step": 80165 }, { - "epoch": 2.82, - "learning_rate": 2.1054843778682272e-05, - "loss": 0.2883, + "epoch": 2.8893213680758283, + "grad_norm": 0.17730148136615753, + "learning_rate": 1.996071391751536e-05, + "loss": 0.3907, "step": 80170 }, { - "epoch": 2.82, - "learning_rate": 2.105203077719655e-05, - "loss": 0.2727, + "epoch": 2.8895015677370526, + "grad_norm": 0.18675543367862701, + "learning_rate": 1.995785571635308e-05, + "loss": 0.4226, "step": 80175 }, { - "epoch": 2.82, - "learning_rate": 2.1049217826972617e-05, - "loss": 0.2562, + "epoch": 2.8896817673982773, + "grad_norm": 0.20549610257148743, + "learning_rate": 1.995499758388904e-05, + "loss": 0.3824, "step": 80180 }, { - "epoch": 2.82, - "learning_rate": 2.1046404928046983e-05, - "loss": 0.2653, + "epoch": 2.889861967059502, + "grad_norm": 0.21189923584461212, + "learning_rate": 1.9952139520162186e-05, + "loss": 0.402, "step": 80185 }, { - "epoch": 2.82, - "learning_rate": 2.1043592080456168e-05, - "loss": 0.2759, + "epoch": 2.8900421667207263, + "grad_norm": 0.24374066293239594, + "learning_rate": 1.9949281525211446e-05, + "loss": 0.4361, "step": 80190 }, { - "epoch": 2.82, - "learning_rate": 2.104077928423671e-05, - "loss": 0.2759, + "epoch": 2.890222366381951, + "grad_norm": 0.2334618866443634, + "learning_rate": 1.9946423599075774e-05, + "loss": 0.3988, 
"step": 80195 }, { - "epoch": 2.82, - "learning_rate": 2.103796653942512e-05, - "loss": 0.2461, + "epoch": 2.890402566043176, + "grad_norm": 0.25280308723449707, + "learning_rate": 1.9943565741794095e-05, + "loss": 0.4341, "step": 80200 }, { - "epoch": 2.82, - "learning_rate": 2.103515384605793e-05, - "loss": 0.262, + "epoch": 2.8905827657044005, + "grad_norm": 0.21369872987270355, + "learning_rate": 1.994070795340536e-05, + "loss": 0.365, "step": 80205 }, { - "epoch": 2.82, - "learning_rate": 2.1032341204171645e-05, - "loss": 0.2633, + "epoch": 2.8907629653656253, + "grad_norm": 0.23660530149936676, + "learning_rate": 1.9937850233948505e-05, + "loss": 0.3767, "step": 80210 }, { - "epoch": 2.82, - "learning_rate": 2.10295286138028e-05, - "loss": 0.275, + "epoch": 2.89094316502685, + "grad_norm": 0.2095620483160019, + "learning_rate": 1.9934992583462443e-05, + "loss": 0.3988, "step": 80215 }, { - "epoch": 2.82, - "learning_rate": 2.1026716074987912e-05, - "loss": 0.271, + "epoch": 2.8911233646880743, + "grad_norm": 0.22344286739826202, + "learning_rate": 1.9932135001986144e-05, + "loss": 0.4129, "step": 80220 }, { - "epoch": 2.82, - "learning_rate": 2.102390358776349e-05, - "loss": 0.2527, + "epoch": 2.891303564349299, + "grad_norm": 0.18746274709701538, + "learning_rate": 1.9929277489558517e-05, + "loss": 0.41, "step": 80225 }, { - "epoch": 2.82, - "learning_rate": 2.102109115216606e-05, - "loss": 0.2725, + "epoch": 2.8914837640105238, + "grad_norm": 0.17481876909732819, + "learning_rate": 1.9926420046218503e-05, + "loss": 0.3326, "step": 80230 }, { - "epoch": 2.82, - "learning_rate": 2.1018278768232146e-05, - "loss": 0.2702, + "epoch": 2.891663963671748, + "grad_norm": 0.185255765914917, + "learning_rate": 1.9923562672005033e-05, + "loss": 0.4074, "step": 80235 }, { - "epoch": 2.82, - "learning_rate": 2.101546643599826e-05, - "loss": 0.2668, + "epoch": 2.891844163332973, + "grad_norm": 0.16853371262550354, + "learning_rate": 1.9920705366957035e-05, + "loss": 0.3553, "step": 80240 }, { - "epoch": 2.82, - "learning_rate": 2.1012654155500904e-05, - "loss": 0.2583, + "epoch": 2.8920243629941975, + "grad_norm": 0.25939464569091797, + "learning_rate": 1.991784813111345e-05, + "loss": 0.3817, "step": 80245 }, { - "epoch": 2.82, - "learning_rate": 2.1009841926776616e-05, - "loss": 0.2684, + "epoch": 2.8922045626554223, + "grad_norm": 0.20100079476833344, + "learning_rate": 1.9914990964513196e-05, + "loss": 0.3873, "step": 80250 }, { - "epoch": 2.82, - "learning_rate": 2.1007029749861897e-05, - "loss": 0.2692, + "epoch": 2.892384762316647, + "grad_norm": 0.16807907819747925, + "learning_rate": 1.9912133867195203e-05, + "loss": 0.387, "step": 80255 }, { - "epoch": 2.82, - "learning_rate": 2.1004217624793272e-05, - "loss": 0.2727, + "epoch": 2.8925649619778717, + "grad_norm": 0.2342177778482437, + "learning_rate": 1.990927683919841e-05, + "loss": 0.3891, "step": 80260 }, { - "epoch": 2.82, - "learning_rate": 2.1001405551607234e-05, - "loss": 0.2685, + "epoch": 2.892745161639096, + "grad_norm": 0.27248871326446533, + "learning_rate": 1.9906419880561725e-05, + "loss": 0.4153, "step": 80265 }, { - "epoch": 2.82, - "learning_rate": 2.0998593530340322e-05, - "loss": 0.2716, + "epoch": 2.8929253613003207, + "grad_norm": 0.33067587018013, + "learning_rate": 1.9903562991324088e-05, + "loss": 0.4321, "step": 80270 }, { - "epoch": 2.82, - "learning_rate": 2.0995781561029035e-05, - "loss": 0.2605, + "epoch": 2.8931055609615455, + "grad_norm": 0.22115959227085114, + "learning_rate": 1.990070617152442e-05, + 
"loss": 0.3908, "step": 80275 }, { - "epoch": 2.82, - "learning_rate": 2.099296964370988e-05, - "loss": 0.2623, + "epoch": 2.8932857606227698, + "grad_norm": 0.23916247487068176, + "learning_rate": 1.9897849421201636e-05, + "loss": 0.4196, "step": 80280 }, { - "epoch": 2.82, - "learning_rate": 2.0990157778419375e-05, - "loss": 0.2637, + "epoch": 2.8934659602839945, + "grad_norm": 0.2263944149017334, + "learning_rate": 1.9894992740394674e-05, + "loss": 0.3994, "step": 80285 }, { - "epoch": 2.82, - "learning_rate": 2.0987345965194028e-05, - "loss": 0.2739, + "epoch": 2.8936461599452192, + "grad_norm": 0.2417251467704773, + "learning_rate": 1.9892136129142452e-05, + "loss": 0.3894, "step": 80290 }, { - "epoch": 2.83, - "learning_rate": 2.0984534204070354e-05, - "loss": 0.2722, + "epoch": 2.893826359606444, + "grad_norm": 0.18903349339962006, + "learning_rate": 1.988927958748387e-05, + "loss": 0.3851, "step": 80295 }, { - "epoch": 2.83, - "learning_rate": 2.098172249508486e-05, - "loss": 0.2858, + "epoch": 2.8940065592676687, + "grad_norm": 0.23545511066913605, + "learning_rate": 1.988642311545788e-05, + "loss": 0.3862, "step": 80300 }, { - "epoch": 2.83, - "learning_rate": 2.0978910838274038e-05, - "loss": 0.262, + "epoch": 2.8941867589288934, + "grad_norm": 0.27978673577308655, + "learning_rate": 1.9883566713103368e-05, + "loss": 0.4131, "step": 80305 }, { - "epoch": 2.83, - "learning_rate": 2.0976099233674413e-05, - "loss": 0.2545, + "epoch": 2.8943669585901177, + "grad_norm": 0.171412855386734, + "learning_rate": 1.988071038045928e-05, + "loss": 0.3753, "step": 80310 }, { - "epoch": 2.83, - "learning_rate": 2.0973287681322497e-05, - "loss": 0.2886, + "epoch": 2.8945471582513425, + "grad_norm": 0.18040184676647186, + "learning_rate": 1.987785411756453e-05, + "loss": 0.3975, "step": 80315 }, { - "epoch": 2.83, - "learning_rate": 2.0970476181254784e-05, - "loss": 0.2829, + "epoch": 2.894727357912567, + "grad_norm": 0.2326379269361496, + "learning_rate": 1.9874997924458007e-05, + "loss": 0.4205, "step": 80320 }, { - "epoch": 2.83, - "learning_rate": 2.0967664733507768e-05, - "loss": 0.2694, + "epoch": 2.894907557573792, + "grad_norm": 0.24374447762966156, + "learning_rate": 1.987214180117866e-05, + "loss": 0.4147, "step": 80325 }, { - "epoch": 2.83, - "learning_rate": 2.0964853338117983e-05, - "loss": 0.2816, + "epoch": 2.895087757235016, + "grad_norm": 0.20812088251113892, + "learning_rate": 1.9869285747765387e-05, + "loss": 0.3989, "step": 80330 }, { - "epoch": 2.83, - "learning_rate": 2.0962041995121912e-05, - "loss": 0.28, + "epoch": 2.895267956896241, + "grad_norm": 0.22509217262268066, + "learning_rate": 1.9866429764257092e-05, + "loss": 0.3967, "step": 80335 }, { - "epoch": 2.83, - "learning_rate": 2.0959230704556066e-05, - "loss": 0.2756, + "epoch": 2.8954481565574657, + "grad_norm": 0.244933620095253, + "learning_rate": 1.9863573850692706e-05, + "loss": 0.3987, "step": 80340 }, { - "epoch": 2.83, - "learning_rate": 2.0956419466456945e-05, - "loss": 0.2736, + "epoch": 2.8956283562186904, + "grad_norm": 0.17772509157657623, + "learning_rate": 1.986071800711113e-05, + "loss": 0.4069, "step": 80345 }, { - "epoch": 2.83, - "learning_rate": 2.095360828086106e-05, - "loss": 0.2538, + "epoch": 2.895808555879915, + "grad_norm": 0.21300829946994781, + "learning_rate": 1.9857862233551274e-05, + "loss": 0.405, "step": 80350 }, { - "epoch": 2.83, - "learning_rate": 2.0950797147804903e-05, - "loss": 0.2854, + "epoch": 2.8959887555411394, + "grad_norm": 0.20396198332309723, + "learning_rate": 
1.9855006530052055e-05, + "loss": 0.3692, "step": 80355 }, { - "epoch": 2.83, - "learning_rate": 2.0947986067324962e-05, - "loss": 0.2723, + "epoch": 2.896168955202364, + "grad_norm": 0.2265167236328125, + "learning_rate": 1.985215089665237e-05, + "loss": 0.387, "step": 80360 }, { - "epoch": 2.83, - "learning_rate": 2.0945175039457765e-05, - "loss": 0.2605, + "epoch": 2.896349154863589, + "grad_norm": 0.2407812774181366, + "learning_rate": 1.9849295333391134e-05, + "loss": 0.3945, "step": 80365 }, { - "epoch": 2.83, - "learning_rate": 2.0942364064239795e-05, - "loss": 0.2816, + "epoch": 2.8965293545248136, + "grad_norm": 0.19000792503356934, + "learning_rate": 1.984643984030726e-05, + "loss": 0.3897, "step": 80370 }, { - "epoch": 2.83, - "learning_rate": 2.093955314170756e-05, - "loss": 0.27, + "epoch": 2.896709554186038, + "grad_norm": 0.21029935777187347, + "learning_rate": 1.9843584417439633e-05, + "loss": 0.4148, "step": 80375 }, { - "epoch": 2.83, - "learning_rate": 2.0936742271897536e-05, - "loss": 0.2619, + "epoch": 2.8968897538472627, + "grad_norm": 0.21043239533901215, + "learning_rate": 1.9840729064827173e-05, + "loss": 0.3786, "step": 80380 }, { - "epoch": 2.83, - "learning_rate": 2.0933931454846245e-05, - "loss": 0.2715, + "epoch": 2.8970699535084874, + "grad_norm": 0.24211741983890533, + "learning_rate": 1.983787378250878e-05, + "loss": 0.4162, "step": 80385 }, { - "epoch": 2.83, - "learning_rate": 2.093112069059017e-05, - "loss": 0.2664, + "epoch": 2.897250153169712, + "grad_norm": 0.20542925596237183, + "learning_rate": 1.9835018570523363e-05, + "loss": 0.4269, "step": 80390 }, { - "epoch": 2.83, - "learning_rate": 2.0928309979165824e-05, - "loss": 0.2681, + "epoch": 2.897430352830937, + "grad_norm": 0.22718103229999542, + "learning_rate": 1.983216342890982e-05, + "loss": 0.3968, "step": 80395 }, { - "epoch": 2.83, - "learning_rate": 2.0925499320609674e-05, - "loss": 0.2559, + "epoch": 2.897610552492161, + "grad_norm": 0.20570969581604004, + "learning_rate": 1.9829308357707037e-05, + "loss": 0.4045, "step": 80400 }, { - "epoch": 2.83, - "learning_rate": 2.092268871495824e-05, - "loss": 0.2453, + "epoch": 2.897790752153386, + "grad_norm": 0.26240652799606323, + "learning_rate": 1.982645335695394e-05, + "loss": 0.3815, "step": 80405 }, { - "epoch": 2.83, - "learning_rate": 2.0919878162248008e-05, - "loss": 0.2544, + "epoch": 2.8979709518146106, + "grad_norm": 0.23519177734851837, + "learning_rate": 1.982359842668941e-05, + "loss": 0.4191, "step": 80410 }, { - "epoch": 2.83, - "learning_rate": 2.091706766251546e-05, - "loss": 0.2612, + "epoch": 2.8981511514758354, + "grad_norm": 0.23879577219486237, + "learning_rate": 1.9820743566952348e-05, + "loss": 0.4106, "step": 80415 }, { - "epoch": 2.83, - "learning_rate": 2.09142572157971e-05, - "loss": 0.2587, + "epoch": 2.8983313511370596, + "grad_norm": 0.22001482546329498, + "learning_rate": 1.981788877778165e-05, + "loss": 0.3793, "step": 80420 }, { - "epoch": 2.83, - "learning_rate": 2.091144682212942e-05, - "loss": 0.2638, + "epoch": 2.8985115507982844, + "grad_norm": 0.1924242228269577, + "learning_rate": 1.9815034059216214e-05, + "loss": 0.4246, "step": 80425 }, { - "epoch": 2.83, - "learning_rate": 2.090863648154891e-05, - "loss": 0.2679, + "epoch": 2.898691750459509, + "grad_norm": 0.2032802700996399, + "learning_rate": 1.981217941129494e-05, + "loss": 0.3554, "step": 80430 }, { - "epoch": 2.83, - "learning_rate": 2.0905826194092064e-05, - "loss": 0.2748, + "epoch": 2.898871950120734, + "grad_norm": 0.2488231360912323, + 
"learning_rate": 1.9809324834056713e-05, + "loss": 0.4157, "step": 80435 }, { - "epoch": 2.83, - "learning_rate": 2.090301595979535e-05, - "loss": 0.2592, + "epoch": 2.8990521497819586, + "grad_norm": 0.20548661053180695, + "learning_rate": 1.980647032754043e-05, + "loss": 0.3964, "step": 80440 }, { - "epoch": 2.83, - "learning_rate": 2.0900205778695285e-05, - "loss": 0.2609, + "epoch": 2.8992323494431833, + "grad_norm": 0.20527927577495575, + "learning_rate": 1.9803615891784987e-05, + "loss": 0.3843, "step": 80445 }, { - "epoch": 2.83, - "learning_rate": 2.0897395650828343e-05, - "loss": 0.2677, + "epoch": 2.8994125491044076, + "grad_norm": 0.20890048146247864, + "learning_rate": 1.980076152682927e-05, + "loss": 0.418, "step": 80450 }, { - "epoch": 2.83, - "learning_rate": 2.0894585576231012e-05, - "loss": 0.2443, + "epoch": 2.8995927487656323, + "grad_norm": 0.23384051024913788, + "learning_rate": 1.9797907232712166e-05, + "loss": 0.368, "step": 80455 }, { - "epoch": 2.83, - "learning_rate": 2.0891775554939792e-05, - "loss": 0.2822, + "epoch": 2.899772948426857, + "grad_norm": 0.20361943542957306, + "learning_rate": 1.9795053009472574e-05, + "loss": 0.4239, "step": 80460 }, { - "epoch": 2.83, - "learning_rate": 2.0888965586991156e-05, - "loss": 0.2614, + "epoch": 2.8999531480880814, + "grad_norm": 0.21155259013175964, + "learning_rate": 1.9792198857149375e-05, + "loss": 0.4109, "step": 80465 }, { - "epoch": 2.83, - "learning_rate": 2.0886155672421588e-05, - "loss": 0.2685, + "epoch": 2.900133347749306, + "grad_norm": 0.24116067588329315, + "learning_rate": 1.978934477578146e-05, + "loss": 0.4045, "step": 80470 }, { - "epoch": 2.83, - "learning_rate": 2.0883345811267574e-05, - "loss": 0.2603, + "epoch": 2.900313547410531, + "grad_norm": 0.20510059595108032, + "learning_rate": 1.9786490765407713e-05, + "loss": 0.4006, "step": 80475 }, { - "epoch": 2.83, - "learning_rate": 2.0880536003565605e-05, - "loss": 0.2708, + "epoch": 2.9004937470717556, + "grad_norm": 0.2083190232515335, + "learning_rate": 1.9783636826067015e-05, + "loss": 0.3923, "step": 80480 }, { - "epoch": 2.83, - "learning_rate": 2.087772624935217e-05, - "loss": 0.2762, + "epoch": 2.9006739467329803, + "grad_norm": 0.22528444230556488, + "learning_rate": 1.9780782957798263e-05, + "loss": 0.3764, "step": 80485 }, { - "epoch": 2.83, - "learning_rate": 2.0874916548663738e-05, - "loss": 0.2803, + "epoch": 2.900854146394205, + "grad_norm": 0.22572170197963715, + "learning_rate": 1.9777929160640325e-05, + "loss": 0.3875, "step": 80490 }, { - "epoch": 2.83, - "learning_rate": 2.0872106901536786e-05, - "loss": 0.2708, + "epoch": 2.9010343460554293, + "grad_norm": 0.1767890602350235, + "learning_rate": 1.97750754346321e-05, + "loss": 0.3535, "step": 80495 }, { - "epoch": 2.83, - "learning_rate": 2.086929730800782e-05, - "loss": 0.2658, + "epoch": 2.901214545716654, + "grad_norm": 0.24823251366615295, + "learning_rate": 1.977222177981246e-05, + "loss": 0.4085, "step": 80500 }, { - "epoch": 2.83, - "eval_loss": 0.26011475920677185, - "eval_runtime": 10.5627, - "eval_samples_per_second": 9.467, - "eval_steps_per_second": 9.467, + "epoch": 2.901214545716654, + "eval_loss": 0.43115851283073425, + "eval_runtime": 3.533, + "eval_samples_per_second": 28.304, + "eval_steps_per_second": 7.076, "step": 80500 }, { - "epoch": 2.83, - "learning_rate": 2.0866487768113297e-05, - "loss": 0.2571, + "epoch": 2.901394745377879, + "grad_norm": 0.284284383058548, + "learning_rate": 1.9769368196220275e-05, + "loss": 0.4208, "step": 80505 }, { - "epoch": 2.83, 
- "learning_rate": 2.0863678281889713e-05, - "loss": 0.2468, + "epoch": 2.901574945039103, + "grad_norm": 0.22723108530044556, + "learning_rate": 1.976651468389445e-05, + "loss": 0.415, "step": 80510 }, { - "epoch": 2.83, - "learning_rate": 2.0860868849373528e-05, - "loss": 0.2418, + "epoch": 2.901755144700328, + "grad_norm": 0.2633325159549713, + "learning_rate": 1.9763661242873845e-05, + "loss": 0.4283, "step": 80515 }, { - "epoch": 2.83, - "learning_rate": 2.0858059470601243e-05, - "loss": 0.242, + "epoch": 2.9019353443615525, + "grad_norm": 0.2002272605895996, + "learning_rate": 1.9760807873197336e-05, + "loss": 0.3805, "step": 80520 }, { - "epoch": 2.83, - "learning_rate": 2.085525014560933e-05, - "loss": 0.2763, + "epoch": 2.9021155440227773, + "grad_norm": 0.1985533982515335, + "learning_rate": 1.975795457490381e-05, + "loss": 0.3822, "step": 80525 }, { - "epoch": 2.83, - "learning_rate": 2.0852440874434256e-05, - "loss": 0.2448, + "epoch": 2.902295743684002, + "grad_norm": 0.18595746159553528, + "learning_rate": 1.9755101348032136e-05, + "loss": 0.3732, "step": 80530 }, { - "epoch": 2.83, - "learning_rate": 2.0849631657112496e-05, - "loss": 0.2852, + "epoch": 2.9024759433452267, + "grad_norm": 0.22532545030117035, + "learning_rate": 1.975224819262119e-05, + "loss": 0.3716, "step": 80535 }, { - "epoch": 2.83, - "learning_rate": 2.0846822493680548e-05, - "loss": 0.2857, + "epoch": 2.902656143006451, + "grad_norm": 0.20030663907527924, + "learning_rate": 1.974939510870985e-05, + "loss": 0.3823, "step": 80540 }, { - "epoch": 2.83, - "learning_rate": 2.084401338417487e-05, - "loss": 0.3082, + "epoch": 2.9028363426676758, + "grad_norm": 0.2480957806110382, + "learning_rate": 1.974654209633698e-05, + "loss": 0.3838, "step": 80545 }, { - "epoch": 2.83, - "learning_rate": 2.0841204328631942e-05, - "loss": 0.2949, + "epoch": 2.9030165423289005, + "grad_norm": 0.23294273018836975, + "learning_rate": 1.9743689155541458e-05, + "loss": 0.412, "step": 80550 }, { - "epoch": 2.83, - "learning_rate": 2.0838395327088223e-05, - "loss": 0.2487, + "epoch": 2.903196741990125, + "grad_norm": 0.2072293609380722, + "learning_rate": 1.974083628636216e-05, + "loss": 0.4121, "step": 80555 }, { - "epoch": 2.83, - "learning_rate": 2.0835586379580203e-05, - "loss": 0.2753, + "epoch": 2.9033769416513495, + "grad_norm": 0.20918940007686615, + "learning_rate": 1.973798348883794e-05, + "loss": 0.3944, "step": 80560 }, { - "epoch": 2.83, - "learning_rate": 2.0832777486144357e-05, - "loss": 0.275, + "epoch": 2.9035571413125743, + "grad_norm": 0.24100269377231598, + "learning_rate": 1.973513076300768e-05, + "loss": 0.416, "step": 80565 }, { - "epoch": 2.83, - "learning_rate": 2.0829968646817135e-05, - "loss": 0.277, + "epoch": 2.903737340973799, + "grad_norm": 0.1925951987504959, + "learning_rate": 1.9732278108910243e-05, + "loss": 0.3716, "step": 80570 }, { - "epoch": 2.83, - "learning_rate": 2.0827159861635034e-05, - "loss": 0.2595, + "epoch": 2.9039175406350237, + "grad_norm": 0.21889275312423706, + "learning_rate": 1.97294255265845e-05, + "loss": 0.4215, "step": 80575 }, { - "epoch": 2.84, - "learning_rate": 2.0824351130634513e-05, - "loss": 0.2627, + "epoch": 2.9040977402962485, + "grad_norm": 0.20681817829608917, + "learning_rate": 1.972657301606932e-05, + "loss": 0.4319, "step": 80580 }, { - "epoch": 2.84, - "learning_rate": 2.0821542453852032e-05, - "loss": 0.2589, + "epoch": 2.9042779399574727, + "grad_norm": 0.23390960693359375, + "learning_rate": 1.9723720577403546e-05, + "loss": 0.4255, "step": 80585 }, { - 
"epoch": 2.84, - "learning_rate": 2.081873383132407e-05, - "loss": 0.2754, + "epoch": 2.9044581396186975, + "grad_norm": 0.21536414325237274, + "learning_rate": 1.9720868210626067e-05, + "loss": 0.3939, "step": 80590 }, { - "epoch": 2.84, - "learning_rate": 2.08159252630871e-05, - "loss": 0.2601, + "epoch": 2.904638339279922, + "grad_norm": 0.245896577835083, + "learning_rate": 1.9718015915775743e-05, + "loss": 0.3782, "step": 80595 }, { - "epoch": 2.84, - "learning_rate": 2.081311674917758e-05, - "loss": 0.2757, + "epoch": 2.904818538941147, + "grad_norm": 0.20185402035713196, + "learning_rate": 1.9715163692891416e-05, + "loss": 0.4153, "step": 80600 }, { - "epoch": 2.84, - "learning_rate": 2.0810308289631984e-05, - "loss": 0.2631, + "epoch": 2.9049987386023712, + "grad_norm": 0.19402766227722168, + "learning_rate": 1.971231154201197e-05, + "loss": 0.4159, "step": 80605 }, { - "epoch": 2.84, - "learning_rate": 2.0807499884486763e-05, - "loss": 0.2876, + "epoch": 2.905178938263596, + "grad_norm": 0.23426468670368195, + "learning_rate": 1.9709459463176243e-05, + "loss": 0.3907, "step": 80610 }, { - "epoch": 2.84, - "learning_rate": 2.0804691533778398e-05, - "loss": 0.2869, + "epoch": 2.9053591379248207, + "grad_norm": 0.2931303381919861, + "learning_rate": 1.9706607456423122e-05, + "loss": 0.432, "step": 80615 }, { - "epoch": 2.84, - "learning_rate": 2.0801883237543355e-05, - "loss": 0.2653, + "epoch": 2.9055393375860454, + "grad_norm": 0.25184187293052673, + "learning_rate": 1.9703755521791445e-05, + "loss": 0.3872, "step": 80620 }, { - "epoch": 2.84, - "learning_rate": 2.0799074995818087e-05, - "loss": 0.2704, + "epoch": 2.90571953724727, + "grad_norm": 0.2202758938074112, + "learning_rate": 1.970090365932007e-05, + "loss": 0.359, "step": 80625 }, { - "epoch": 2.84, - "learning_rate": 2.0796266808639058e-05, - "loss": 0.275, + "epoch": 2.9058997369084945, + "grad_norm": 0.2390233725309372, + "learning_rate": 1.969805186904786e-05, + "loss": 0.3898, "step": 80630 }, { - "epoch": 2.84, - "learning_rate": 2.0793458676042737e-05, - "loss": 0.252, + "epoch": 2.906079936569719, + "grad_norm": 0.1913604885339737, + "learning_rate": 1.9695200151013666e-05, + "loss": 0.3645, "step": 80635 }, { - "epoch": 2.84, - "learning_rate": 2.079065059806558e-05, - "loss": 0.2659, + "epoch": 2.906260136230944, + "grad_norm": 0.22707746922969818, + "learning_rate": 1.9692348505256335e-05, + "loss": 0.3945, "step": 80640 }, { - "epoch": 2.84, - "learning_rate": 2.0787842574744058e-05, - "loss": 0.265, + "epoch": 2.9064403358921687, + "grad_norm": 0.2409844696521759, + "learning_rate": 1.968949693181474e-05, + "loss": 0.3892, "step": 80645 }, { - "epoch": 2.84, - "learning_rate": 2.0785034606114613e-05, - "loss": 0.2435, + "epoch": 2.906620535553393, + "grad_norm": 0.21036100387573242, + "learning_rate": 1.968664543072771e-05, + "loss": 0.3957, "step": 80650 }, { - "epoch": 2.84, - "learning_rate": 2.078222669221373e-05, - "loss": 0.2861, + "epoch": 2.9068007352146177, + "grad_norm": 0.2081831395626068, + "learning_rate": 1.9683794002034115e-05, + "loss": 0.3975, "step": 80655 }, { - "epoch": 2.84, - "learning_rate": 2.0779418833077848e-05, - "loss": 0.2312, + "epoch": 2.9069809348758424, + "grad_norm": 0.17619097232818604, + "learning_rate": 1.96809426457728e-05, + "loss": 0.3864, "step": 80660 }, { - "epoch": 2.84, - "learning_rate": 2.077661102874343e-05, - "loss": 0.2818, + "epoch": 2.907161134537067, + "grad_norm": 0.25311923027038574, + "learning_rate": 1.9678091361982602e-05, + "loss": 0.3982, "step": 
80665 }, { - "epoch": 2.84, - "learning_rate": 2.0773803279246924e-05, - "loss": 0.2593, + "epoch": 2.907341334198292, + "grad_norm": 0.25830528140068054, + "learning_rate": 1.967524015070239e-05, + "loss": 0.4033, "step": 80670 }, { - "epoch": 2.84, - "learning_rate": 2.0770995584624815e-05, - "loss": 0.259, + "epoch": 2.9075215338595166, + "grad_norm": 0.20592638850212097, + "learning_rate": 1.967238901197099e-05, + "loss": 0.3828, "step": 80675 }, { - "epoch": 2.84, - "learning_rate": 2.0768187944913537e-05, - "loss": 0.2681, + "epoch": 2.907701733520741, + "grad_norm": 0.19659774005413055, + "learning_rate": 1.9669537945827265e-05, + "loss": 0.3768, "step": 80680 }, { - "epoch": 2.84, - "learning_rate": 2.076538036014955e-05, - "loss": 0.2568, + "epoch": 2.9078819331819656, + "grad_norm": 0.24102912843227386, + "learning_rate": 1.9666686952310057e-05, + "loss": 0.4266, "step": 80685 }, { - "epoch": 2.84, - "learning_rate": 2.07625728303693e-05, - "loss": 0.2833, + "epoch": 2.9080621328431904, + "grad_norm": 0.18135298788547516, + "learning_rate": 1.9663836031458195e-05, + "loss": 0.4413, "step": 80690 }, { - "epoch": 2.84, - "learning_rate": 2.075976535560925e-05, - "loss": 0.2708, + "epoch": 2.9082423325044147, + "grad_norm": 0.2321069985628128, + "learning_rate": 1.9660985183310543e-05, + "loss": 0.4058, "step": 80695 }, { - "epoch": 2.84, - "learning_rate": 2.075695793590586e-05, - "loss": 0.2695, + "epoch": 2.9084225321656394, + "grad_norm": 0.21384575963020325, + "learning_rate": 1.9658134407905935e-05, + "loss": 0.382, "step": 80700 }, { - "epoch": 2.84, - "learning_rate": 2.0754150571295563e-05, - "loss": 0.2673, + "epoch": 2.908602731826864, + "grad_norm": 0.2350618541240692, + "learning_rate": 1.9655283705283205e-05, + "loss": 0.4001, "step": 80705 }, { - "epoch": 2.84, - "learning_rate": 2.0751343261814838e-05, - "loss": 0.2859, + "epoch": 2.908782931488089, + "grad_norm": 0.19745108485221863, + "learning_rate": 1.9652433075481202e-05, + "loss": 0.425, "step": 80710 }, { - "epoch": 2.84, - "learning_rate": 2.0748536007500116e-05, - "loss": 0.2642, + "epoch": 2.9089631311493136, + "grad_norm": 0.21076776087284088, + "learning_rate": 1.9649582518538757e-05, + "loss": 0.4067, "step": 80715 }, { - "epoch": 2.84, - "learning_rate": 2.074572880838785e-05, - "loss": 0.2722, + "epoch": 2.9091433308105383, + "grad_norm": 0.20827296376228333, + "learning_rate": 1.9646732034494726e-05, + "loss": 0.3839, "step": 80720 }, { - "epoch": 2.84, - "learning_rate": 2.0742921664514486e-05, - "loss": 0.2735, + "epoch": 2.9093235304717626, + "grad_norm": 0.18359017372131348, + "learning_rate": 1.964388162338793e-05, + "loss": 0.3977, "step": 80725 }, { - "epoch": 2.84, - "learning_rate": 2.0740114575916484e-05, - "loss": 0.2475, + "epoch": 2.9095037301329874, + "grad_norm": 0.23850063979625702, + "learning_rate": 1.9641031285257205e-05, + "loss": 0.4165, "step": 80730 }, { - "epoch": 2.84, - "learning_rate": 2.073730754263029e-05, - "loss": 0.2844, + "epoch": 2.909683929794212, + "grad_norm": 0.2121899276971817, + "learning_rate": 1.96381810201414e-05, + "loss": 0.3906, "step": 80735 }, { - "epoch": 2.84, - "learning_rate": 2.073450056469235e-05, - "loss": 0.2879, + "epoch": 2.9098641294554364, + "grad_norm": 0.26047274470329285, + "learning_rate": 1.9635330828079335e-05, + "loss": 0.3992, "step": 80740 }, { - "epoch": 2.84, - "learning_rate": 2.07316936421391e-05, - "loss": 0.2833, + "epoch": 2.910044329116661, + "grad_norm": 0.16215106844902039, + "learning_rate": 1.9632480709109845e-05, + 
"loss": 0.4113, "step": 80745 }, { - "epoch": 2.84, - "learning_rate": 2.0728886775006997e-05, - "loss": 0.2413, + "epoch": 2.910224528777886, + "grad_norm": 0.2457340657711029, + "learning_rate": 1.9629630663271776e-05, + "loss": 0.4222, "step": 80750 }, { - "epoch": 2.84, - "learning_rate": 2.072607996333249e-05, - "loss": 0.2701, + "epoch": 2.9104047284391106, + "grad_norm": 0.24519392848014832, + "learning_rate": 1.962678069060394e-05, + "loss": 0.4321, "step": 80755 }, { - "epoch": 2.84, - "learning_rate": 2.072327320715202e-05, - "loss": 0.2796, + "epoch": 2.9105849281003353, + "grad_norm": 0.169399693608284, + "learning_rate": 1.9623930791145184e-05, + "loss": 0.3873, "step": 80760 }, { - "epoch": 2.84, - "learning_rate": 2.072046650650202e-05, - "loss": 0.249, + "epoch": 2.91076512776156, + "grad_norm": 0.20469214022159576, + "learning_rate": 1.9621080964934326e-05, + "loss": 0.3688, "step": 80765 }, { - "epoch": 2.84, - "learning_rate": 2.071765986141895e-05, - "loss": 0.2712, + "epoch": 2.9109453274227843, + "grad_norm": 0.21101103723049164, + "learning_rate": 1.9618231212010195e-05, + "loss": 0.4403, "step": 80770 }, { - "epoch": 2.84, - "learning_rate": 2.0714853271939235e-05, - "loss": 0.2685, + "epoch": 2.911125527084009, + "grad_norm": 0.193876713514328, + "learning_rate": 1.9615381532411632e-05, + "loss": 0.3991, "step": 80775 }, { - "epoch": 2.84, - "learning_rate": 2.071204673809934e-05, - "loss": 0.2792, + "epoch": 2.911305726745234, + "grad_norm": 0.1711498647928238, + "learning_rate": 1.9612531926177453e-05, + "loss": 0.3954, "step": 80780 }, { - "epoch": 2.84, - "learning_rate": 2.0709240259935676e-05, - "loss": 0.2435, + "epoch": 2.911485926406458, + "grad_norm": 0.2698494791984558, + "learning_rate": 1.960968239334647e-05, + "loss": 0.4084, "step": 80785 }, { - "epoch": 2.84, - "learning_rate": 2.070643383748471e-05, - "loss": 0.2449, + "epoch": 2.911666126067683, + "grad_norm": 0.22966143488883972, + "learning_rate": 1.9606832933957536e-05, + "loss": 0.3625, "step": 80790 }, { - "epoch": 2.84, - "learning_rate": 2.070362747078287e-05, - "loss": 0.2518, + "epoch": 2.9118463257289076, + "grad_norm": 0.22466062009334564, + "learning_rate": 1.9603983548049444e-05, + "loss": 0.3633, "step": 80795 }, { - "epoch": 2.84, - "learning_rate": 2.0700821159866597e-05, - "loss": 0.2533, + "epoch": 2.9120265253901323, + "grad_norm": 0.23133142292499542, + "learning_rate": 1.9601134235661047e-05, + "loss": 0.3741, "step": 80800 }, { - "epoch": 2.84, - "learning_rate": 2.0698014904772323e-05, - "loss": 0.2424, + "epoch": 2.912206725051357, + "grad_norm": 0.22116337716579437, + "learning_rate": 1.9598284996831145e-05, + "loss": 0.3696, "step": 80805 }, { - "epoch": 2.84, - "learning_rate": 2.0695208705536493e-05, - "loss": 0.2789, + "epoch": 2.9123869247125818, + "grad_norm": 0.22359761595726013, + "learning_rate": 1.9595435831598558e-05, + "loss": 0.4236, "step": 80810 }, { - "epoch": 2.84, - "learning_rate": 2.0692402562195544e-05, - "loss": 0.2933, + "epoch": 2.912567124373806, + "grad_norm": 0.24041809141635895, + "learning_rate": 1.9592586740002116e-05, + "loss": 0.3961, "step": 80815 }, { - "epoch": 2.84, - "learning_rate": 2.06895964747859e-05, - "loss": 0.2417, + "epoch": 2.912747324035031, + "grad_norm": 0.23116619884967804, + "learning_rate": 1.9589737722080637e-05, + "loss": 0.3997, "step": 80820 }, { - "epoch": 2.84, - "learning_rate": 2.0686790443344013e-05, - "loss": 0.271, + "epoch": 2.9129275236962555, + "grad_norm": 0.23118247091770172, + "learning_rate": 
1.9586888777872925e-05, + "loss": 0.389, "step": 80825 }, { - "epoch": 2.84, - "learning_rate": 2.0683984467906306e-05, - "loss": 0.2566, + "epoch": 2.9131077233574802, + "grad_norm": 0.2595424950122833, + "learning_rate": 1.9584039907417812e-05, + "loss": 0.4273, "step": 80830 }, { - "epoch": 2.84, - "learning_rate": 2.0681178548509223e-05, - "loss": 0.2505, + "epoch": 2.9132879230187045, + "grad_norm": 0.1928250938653946, + "learning_rate": 1.95811911107541e-05, + "loss": 0.3858, "step": 80835 }, { - "epoch": 2.84, - "learning_rate": 2.0678372685189178e-05, - "loss": 0.2572, + "epoch": 2.9134681226799293, + "grad_norm": 0.22600649297237396, + "learning_rate": 1.957834238792062e-05, + "loss": 0.402, "step": 80840 }, { - "epoch": 2.84, - "learning_rate": 2.0675566877982626e-05, - "loss": 0.2604, + "epoch": 2.913648322341154, + "grad_norm": 0.1833333820104599, + "learning_rate": 1.9575493738956168e-05, + "loss": 0.4063, "step": 80845 }, { - "epoch": 2.84, - "learning_rate": 2.067276112692599e-05, - "loss": 0.2816, + "epoch": 2.9138285220023787, + "grad_norm": 0.21622666716575623, + "learning_rate": 1.9572645163899563e-05, + "loss": 0.4199, "step": 80850 }, { - "epoch": 2.84, - "learning_rate": 2.0669955432055695e-05, - "loss": 0.2688, + "epoch": 2.9140087216636035, + "grad_norm": 0.21689878404140472, + "learning_rate": 1.9569796662789623e-05, + "loss": 0.4067, "step": 80855 }, { - "epoch": 2.84, - "learning_rate": 2.066714979340817e-05, - "loss": 0.2857, + "epoch": 2.9141889213248278, + "grad_norm": 0.21402768790721893, + "learning_rate": 1.9566948235665144e-05, + "loss": 0.3951, "step": 80860 }, { - "epoch": 2.85, - "learning_rate": 2.066434421101985e-05, - "loss": 0.2708, + "epoch": 2.9143691209860525, + "grad_norm": 0.24621036648750305, + "learning_rate": 1.956409988256495e-05, + "loss": 0.3993, "step": 80865 }, { - "epoch": 2.85, - "learning_rate": 2.0661538684927175e-05, - "loss": 0.2721, + "epoch": 2.9145493206472772, + "grad_norm": 0.21831317245960236, + "learning_rate": 1.9561251603527846e-05, + "loss": 0.3955, "step": 80870 }, { - "epoch": 2.85, - "learning_rate": 2.0658733215166555e-05, - "loss": 0.2632, + "epoch": 2.914729520308502, + "grad_norm": 0.20578089356422424, + "learning_rate": 1.9558403398592625e-05, + "loss": 0.4011, "step": 80875 }, { - "epoch": 2.85, - "learning_rate": 2.0655927801774412e-05, - "loss": 0.2533, + "epoch": 2.9149097199697263, + "grad_norm": 0.18618744611740112, + "learning_rate": 1.955555526779811e-05, + "loss": 0.4194, "step": 80880 }, { - "epoch": 2.85, - "learning_rate": 2.06531224447872e-05, - "loss": 0.2479, + "epoch": 2.915089919630951, + "grad_norm": 0.21536029875278473, + "learning_rate": 1.9552707211183107e-05, + "loss": 0.3765, "step": 80885 }, { - "epoch": 2.85, - "learning_rate": 2.065031714424132e-05, - "loss": 0.2733, + "epoch": 2.9152701192921757, + "grad_norm": 0.20636074244976044, + "learning_rate": 1.95498592287864e-05, + "loss": 0.3903, "step": 80890 }, { - "epoch": 2.85, - "learning_rate": 2.0647511900173208e-05, - "loss": 0.2774, + "epoch": 2.9154503189534005, + "grad_norm": 0.201945498585701, + "learning_rate": 1.9547011320646817e-05, + "loss": 0.3891, "step": 80895 }, { - "epoch": 2.85, - "learning_rate": 2.0644706712619276e-05, - "loss": 0.2727, + "epoch": 2.915630518614625, + "grad_norm": 0.16355274617671967, + "learning_rate": 1.9544163486803134e-05, + "loss": 0.4209, "step": 80900 }, { - "epoch": 2.85, - "learning_rate": 2.064190158161597e-05, - "loss": 0.2752, + "epoch": 2.9158107182758495, + "grad_norm": 
0.23773586750030518, + "learning_rate": 1.9541315727294188e-05, + "loss": 0.4007, "step": 80905 }, { - "epoch": 2.85, - "learning_rate": 2.0639096507199693e-05, - "loss": 0.2747, + "epoch": 2.915990917937074, + "grad_norm": 0.1974399834871292, + "learning_rate": 1.953846804215875e-05, + "loss": 0.3999, "step": 80910 }, { - "epoch": 2.85, - "learning_rate": 2.0636291489406878e-05, - "loss": 0.2662, + "epoch": 2.916171117598299, + "grad_norm": 0.24811916053295135, + "learning_rate": 1.9535620431435623e-05, + "loss": 0.3971, "step": 80915 }, { - "epoch": 2.85, - "learning_rate": 2.0633486528273934e-05, - "loss": 0.2834, + "epoch": 2.9163513172595237, + "grad_norm": 0.20471271872520447, + "learning_rate": 1.9532772895163616e-05, + "loss": 0.4142, "step": 80920 }, { - "epoch": 2.85, - "learning_rate": 2.06306816238373e-05, - "loss": 0.2694, + "epoch": 2.916531516920748, + "grad_norm": 0.19927844405174255, + "learning_rate": 1.9529925433381517e-05, + "loss": 0.354, "step": 80925 }, { - "epoch": 2.85, - "learning_rate": 2.0627876776133386e-05, - "loss": 0.261, + "epoch": 2.9167117165819727, + "grad_norm": 0.18068204820156097, + "learning_rate": 1.9527078046128118e-05, + "loss": 0.3941, "step": 80930 }, { - "epoch": 2.85, - "learning_rate": 2.0625071985198597e-05, - "loss": 0.2763, + "epoch": 2.9168919162431974, + "grad_norm": 0.21484220027923584, + "learning_rate": 1.952423073344223e-05, + "loss": 0.3866, "step": 80935 }, { - "epoch": 2.85, - "learning_rate": 2.062226725106938e-05, - "loss": 0.2674, + "epoch": 2.917072115904422, + "grad_norm": 0.21851830184459686, + "learning_rate": 1.9521383495362634e-05, + "loss": 0.3903, "step": 80940 }, { - "epoch": 2.85, - "learning_rate": 2.0619462573782132e-05, - "loss": 0.2511, + "epoch": 2.917252315565647, + "grad_norm": 0.2002904862165451, + "learning_rate": 1.9518536331928127e-05, + "loss": 0.3791, "step": 80945 }, { - "epoch": 2.85, - "learning_rate": 2.061665795337328e-05, - "loss": 0.2441, + "epoch": 2.9174325152268716, + "grad_norm": 0.23152127861976624, + "learning_rate": 1.9515689243177508e-05, + "loss": 0.4406, "step": 80950 }, { - "epoch": 2.85, - "learning_rate": 2.0613853389879224e-05, - "loss": 0.2774, + "epoch": 2.917612714888096, + "grad_norm": 0.1651298850774765, + "learning_rate": 1.9512842229149553e-05, + "loss": 0.3888, "step": 80955 }, { - "epoch": 2.85, - "learning_rate": 2.06110488833364e-05, - "loss": 0.2761, + "epoch": 2.9177929145493207, + "grad_norm": 0.17865732312202454, + "learning_rate": 1.950999528988306e-05, + "loss": 0.3686, "step": 80960 }, { - "epoch": 2.85, - "learning_rate": 2.0608244433781216e-05, - "loss": 0.2804, + "epoch": 2.9179731142105454, + "grad_norm": 0.2451142817735672, + "learning_rate": 1.9507148425416832e-05, + "loss": 0.423, "step": 80965 }, { - "epoch": 2.85, - "learning_rate": 2.0605440041250078e-05, - "loss": 0.2761, + "epoch": 2.9181533138717697, + "grad_norm": 0.16584070026874542, + "learning_rate": 1.9504301635789623e-05, + "loss": 0.3668, "step": 80970 }, { - "epoch": 2.85, - "learning_rate": 2.0602635705779403e-05, - "loss": 0.2573, + "epoch": 2.9183335135329944, + "grad_norm": 0.22449831664562225, + "learning_rate": 1.9501454921040256e-05, + "loss": 0.3977, "step": 80975 }, { - "epoch": 2.85, - "learning_rate": 2.0599831427405612e-05, - "loss": 0.2798, + "epoch": 2.918513713194219, + "grad_norm": 0.21804749965667725, + "learning_rate": 1.949860828120749e-05, + "loss": 0.3792, "step": 80980 }, { - "epoch": 2.85, - "learning_rate": 2.059702720616511e-05, - "loss": 0.2702, + "epoch": 
2.918693912855444, + "grad_norm": 0.20953933894634247, + "learning_rate": 1.9495761716330133e-05, + "loss": 0.4228, "step": 80985 }, { - "epoch": 2.85, - "learning_rate": 2.0594223042094308e-05, - "loss": 0.2639, + "epoch": 2.9188741125166686, + "grad_norm": 0.20794197916984558, + "learning_rate": 1.949291522644695e-05, + "loss": 0.4034, "step": 80990 }, { - "epoch": 2.85, - "learning_rate": 2.059141893522961e-05, - "loss": 0.2854, + "epoch": 2.9190543121778934, + "grad_norm": 0.2192896008491516, + "learning_rate": 1.949006881159673e-05, + "loss": 0.4113, "step": 80995 }, { - "epoch": 2.85, - "learning_rate": 2.0588614885607434e-05, - "loss": 0.252, + "epoch": 2.9192345118391176, + "grad_norm": 0.18000654876232147, + "learning_rate": 1.9487222471818255e-05, + "loss": 0.4156, "step": 81000 }, { - "epoch": 2.85, - "eval_loss": 0.26019665598869324, - "eval_runtime": 10.5615, - "eval_samples_per_second": 9.468, - "eval_steps_per_second": 9.468, + "epoch": 2.9192345118391176, + "eval_loss": 0.4301445782184601, + "eval_runtime": 3.5355, + "eval_samples_per_second": 28.285, + "eval_steps_per_second": 7.071, "step": 81000 }, { - "epoch": 2.85, - "learning_rate": 2.058581089326419e-05, - "loss": 0.284, + "epoch": 2.9194147115003424, + "grad_norm": 0.292233943939209, + "learning_rate": 1.9484376207150314e-05, + "loss": 0.3844, "step": 81005 }, { - "epoch": 2.85, - "learning_rate": 2.0583006958236283e-05, - "loss": 0.2698, + "epoch": 2.919594911161567, + "grad_norm": 0.21577784419059753, + "learning_rate": 1.948153001763167e-05, + "loss": 0.3826, "step": 81010 }, { - "epoch": 2.85, - "learning_rate": 2.0580203080560112e-05, - "loss": 0.2365, + "epoch": 2.9197751108227914, + "grad_norm": 0.2999797463417053, + "learning_rate": 1.9478683903301116e-05, + "loss": 0.4494, "step": 81015 }, { - "epoch": 2.85, - "learning_rate": 2.05773992602721e-05, - "loss": 0.2765, + "epoch": 2.919955310484016, + "grad_norm": 0.24274799227714539, + "learning_rate": 1.9475837864197418e-05, + "loss": 0.3721, "step": 81020 }, { - "epoch": 2.85, - "learning_rate": 2.0574595497408637e-05, - "loss": 0.2547, + "epoch": 2.920135510145241, + "grad_norm": 0.2308402806520462, + "learning_rate": 1.947299190035937e-05, + "loss": 0.4185, "step": 81025 }, { - "epoch": 2.85, - "learning_rate": 2.0571791792006147e-05, - "loss": 0.2475, + "epoch": 2.9203157098064656, + "grad_norm": 0.2198919653892517, + "learning_rate": 1.9470146011825733e-05, + "loss": 0.3646, "step": 81030 }, { - "epoch": 2.85, - "learning_rate": 2.0568988144101005e-05, - "loss": 0.2706, + "epoch": 2.9204959094676903, + "grad_norm": 0.21217882633209229, + "learning_rate": 1.946730019863528e-05, + "loss": 0.3997, "step": 81035 }, { - "epoch": 2.85, - "learning_rate": 2.0566184553729645e-05, - "loss": 0.276, + "epoch": 2.920676109128915, + "grad_norm": 0.17091058194637299, + "learning_rate": 1.94644544608268e-05, + "loss": 0.3784, "step": 81040 }, { - "epoch": 2.85, - "learning_rate": 2.0563381020928458e-05, - "loss": 0.2681, + "epoch": 2.9208563087901394, + "grad_norm": 0.23678135871887207, + "learning_rate": 1.9461608798439055e-05, + "loss": 0.3841, "step": 81045 }, { - "epoch": 2.85, - "learning_rate": 2.056057754573383e-05, - "loss": 0.2537, + "epoch": 2.921036508451364, + "grad_norm": 0.2015039473772049, + "learning_rate": 1.945876321151081e-05, + "loss": 0.4026, "step": 81050 }, { - "epoch": 2.85, - "learning_rate": 2.0557774128182184e-05, - "loss": 0.2597, + "epoch": 2.921216708112589, + "grad_norm": 0.2211407572031021, + "learning_rate": 1.945591770008085e-05, + 
"loss": 0.3967, "step": 81055 }, { - "epoch": 2.85, - "learning_rate": 2.055497076830992e-05, - "loss": 0.2749, + "epoch": 2.921396907773813, + "grad_norm": 0.21739064157009125, + "learning_rate": 1.945307226418793e-05, + "loss": 0.3956, "step": 81060 }, { - "epoch": 2.85, - "learning_rate": 2.0552167466153428e-05, - "loss": 0.2832, + "epoch": 2.921577107435038, + "grad_norm": 0.2019902616739273, + "learning_rate": 1.945022690387084e-05, + "loss": 0.4075, "step": 81065 }, { - "epoch": 2.85, - "learning_rate": 2.05493642217491e-05, - "loss": 0.2556, + "epoch": 2.9217573070962626, + "grad_norm": 0.1831779032945633, + "learning_rate": 1.9447381619168332e-05, + "loss": 0.3816, "step": 81070 }, { - "epoch": 2.85, - "learning_rate": 2.054656103513336e-05, - "loss": 0.2678, + "epoch": 2.9219375067574873, + "grad_norm": 0.21984632313251495, + "learning_rate": 1.944453641011916e-05, + "loss": 0.4448, "step": 81075 }, { - "epoch": 2.85, - "learning_rate": 2.054375790634258e-05, - "loss": 0.2551, + "epoch": 2.922117706418712, + "grad_norm": 0.19661669433116913, + "learning_rate": 1.9441691276762123e-05, + "loss": 0.41, "step": 81080 }, { - "epoch": 2.85, - "learning_rate": 2.0540954835413176e-05, - "loss": 0.2738, + "epoch": 2.922297906079937, + "grad_norm": 0.2180146425962448, + "learning_rate": 1.9438846219135948e-05, + "loss": 0.3727, "step": 81085 }, { - "epoch": 2.85, - "learning_rate": 2.0538151822381522e-05, - "loss": 0.2672, + "epoch": 2.922478105741161, + "grad_norm": 0.2767757475376129, + "learning_rate": 1.9436001237279432e-05, + "loss": 0.4416, "step": 81090 }, { - "epoch": 2.85, - "learning_rate": 2.0535348867284036e-05, - "loss": 0.247, + "epoch": 2.922658305402386, + "grad_norm": 0.20873478055000305, + "learning_rate": 1.9433156331231314e-05, + "loss": 0.3978, "step": 81095 }, { - "epoch": 2.85, - "learning_rate": 2.05325459701571e-05, - "loss": 0.2663, + "epoch": 2.9228385050636105, + "grad_norm": 0.26107144355773926, + "learning_rate": 1.9430311501030362e-05, + "loss": 0.3829, "step": 81100 }, { - "epoch": 2.85, - "learning_rate": 2.052974313103711e-05, - "loss": 0.2895, + "epoch": 2.9230187047248353, + "grad_norm": 0.267535924911499, + "learning_rate": 1.9427466746715342e-05, + "loss": 0.4117, "step": 81105 }, { - "epoch": 2.85, - "learning_rate": 2.0526940349960455e-05, - "loss": 0.3062, + "epoch": 2.9231989043860596, + "grad_norm": 0.21252234280109406, + "learning_rate": 1.942462206832501e-05, + "loss": 0.379, "step": 81110 }, { - "epoch": 2.85, - "learning_rate": 2.052413762696354e-05, - "loss": 0.2912, + "epoch": 2.9233791040472843, + "grad_norm": 0.1903425008058548, + "learning_rate": 1.9421777465898114e-05, + "loss": 0.352, "step": 81115 }, { - "epoch": 2.85, - "learning_rate": 2.0521334962082748e-05, - "loss": 0.2731, + "epoch": 2.923559303708509, + "grad_norm": 0.1975475549697876, + "learning_rate": 1.9418932939473426e-05, + "loss": 0.3898, "step": 81120 }, { - "epoch": 2.85, - "learning_rate": 2.051853235535447e-05, - "loss": 0.2838, + "epoch": 2.9237395033697338, + "grad_norm": 0.22255592048168182, + "learning_rate": 1.941608848908969e-05, + "loss": 0.3482, "step": 81125 }, { - "epoch": 2.85, - "learning_rate": 2.0515729806815084e-05, - "loss": 0.2635, + "epoch": 2.9239197030309585, + "grad_norm": 0.24407097697257996, + "learning_rate": 1.941324411478568e-05, + "loss": 0.3865, "step": 81130 }, { - "epoch": 2.85, - "learning_rate": 2.0512927316500995e-05, - "loss": 0.2871, + "epoch": 2.924099902692183, + "grad_norm": 0.18102699518203735, + "learning_rate": 
1.9410399816600128e-05, + "loss": 0.3786, "step": 81135 }, { - "epoch": 2.85, - "learning_rate": 2.05101248844486e-05, - "loss": 0.2514, + "epoch": 2.9242801023534075, + "grad_norm": 0.23479129374027252, + "learning_rate": 1.9407555594571796e-05, + "loss": 0.3847, "step": 81140 }, { - "epoch": 2.85, - "learning_rate": 2.050732251069427e-05, - "loss": 0.2808, + "epoch": 2.9244603020146323, + "grad_norm": 0.20247110724449158, + "learning_rate": 1.9404711448739442e-05, + "loss": 0.3794, "step": 81145 }, { - "epoch": 2.86, - "learning_rate": 2.0504520195274384e-05, - "loss": 0.2765, + "epoch": 2.924640501675857, + "grad_norm": 0.2692834436893463, + "learning_rate": 1.9401867379141815e-05, + "loss": 0.4258, "step": 81150 }, { - "epoch": 2.86, - "learning_rate": 2.0501717938225355e-05, - "loss": 0.2493, + "epoch": 2.9248207013370813, + "grad_norm": 0.20718295872211456, + "learning_rate": 1.9399023385817644e-05, + "loss": 0.4251, "step": 81155 }, { - "epoch": 2.86, - "learning_rate": 2.0498915739583547e-05, - "loss": 0.2634, + "epoch": 2.925000900998306, + "grad_norm": 0.18877676129341125, + "learning_rate": 1.939617946880571e-05, + "loss": 0.3919, "step": 81160 }, { - "epoch": 2.86, - "learning_rate": 2.0496113599385353e-05, - "loss": 0.2576, + "epoch": 2.9251811006595307, + "grad_norm": 0.2178078591823578, + "learning_rate": 1.939333562814473e-05, + "loss": 0.4156, "step": 81165 }, { - "epoch": 2.86, - "learning_rate": 2.0493311517667153e-05, - "loss": 0.2958, + "epoch": 2.9253613003207555, + "grad_norm": 0.17405816912651062, + "learning_rate": 1.9390491863873482e-05, + "loss": 0.371, "step": 81170 }, { - "epoch": 2.86, - "learning_rate": 2.049050949446534e-05, - "loss": 0.2549, + "epoch": 2.92554149998198, + "grad_norm": 0.20669405162334442, + "learning_rate": 1.9387648176030697e-05, + "loss": 0.389, "step": 81175 }, { - "epoch": 2.86, - "learning_rate": 2.048770752981629e-05, - "loss": 0.2545, + "epoch": 2.925721699643205, + "grad_norm": 0.20877471566200256, + "learning_rate": 1.9384804564655106e-05, + "loss": 0.3857, "step": 81180 }, { - "epoch": 2.86, - "learning_rate": 2.048490562375637e-05, - "loss": 0.2702, + "epoch": 2.9259018993044292, + "grad_norm": 0.2562907934188843, + "learning_rate": 1.9381961029785485e-05, + "loss": 0.396, "step": 81185 }, { - "epoch": 2.86, - "learning_rate": 2.0482103776321983e-05, - "loss": 0.291, + "epoch": 2.926082098965654, + "grad_norm": 0.2052457183599472, + "learning_rate": 1.9379117571460545e-05, + "loss": 0.3769, "step": 81190 }, { - "epoch": 2.86, - "learning_rate": 2.047930198754951e-05, - "loss": 0.2739, + "epoch": 2.9262622986268787, + "grad_norm": 0.26629289984703064, + "learning_rate": 1.9376274189719034e-05, + "loss": 0.4164, "step": 81195 }, { - "epoch": 2.86, - "learning_rate": 2.047650025747532e-05, - "loss": 0.2484, + "epoch": 2.926442498288103, + "grad_norm": 0.23448923230171204, + "learning_rate": 1.9373430884599707e-05, + "loss": 0.3989, "step": 81200 }, { - "epoch": 2.86, - "learning_rate": 2.0473698586135776e-05, - "loss": 0.286, + "epoch": 2.9266226979493277, + "grad_norm": 0.2015734761953354, + "learning_rate": 1.937058765614129e-05, + "loss": 0.4129, "step": 81205 }, { - "epoch": 2.86, - "learning_rate": 2.047089697356729e-05, - "loss": 0.2554, + "epoch": 2.9268028976105525, + "grad_norm": 0.3036739230155945, + "learning_rate": 1.936774450438253e-05, + "loss": 0.3919, "step": 81210 }, { - "epoch": 2.86, - "learning_rate": 2.046809541980621e-05, - "loss": 0.303, + "epoch": 2.926983097271777, + "grad_norm": 0.19649726152420044, + 
"learning_rate": 1.9364901429362164e-05, + "loss": 0.4088, "step": 81215 }, { - "epoch": 2.86, - "learning_rate": 2.0465293924888936e-05, - "loss": 0.2859, + "epoch": 2.927163296933002, + "grad_norm": 0.2458265870809555, + "learning_rate": 1.9362058431118917e-05, + "loss": 0.4255, "step": 81220 }, { - "epoch": 2.86, - "learning_rate": 2.046249248885182e-05, - "loss": 0.2765, + "epoch": 2.9273434965942267, + "grad_norm": 0.22339041531085968, + "learning_rate": 1.935921550969154e-05, + "loss": 0.3925, "step": 81225 }, { - "epoch": 2.86, - "learning_rate": 2.0459691111731258e-05, - "loss": 0.2501, + "epoch": 2.927523696255451, + "grad_norm": 0.18713003396987915, + "learning_rate": 1.9356372665118754e-05, + "loss": 0.3988, "step": 81230 }, { - "epoch": 2.86, - "learning_rate": 2.0456889793563617e-05, - "loss": 0.2619, + "epoch": 2.9277038959166757, + "grad_norm": 0.2202412635087967, + "learning_rate": 1.9353529897439298e-05, + "loss": 0.3597, "step": 81235 }, { - "epoch": 2.86, - "learning_rate": 2.045408853438526e-05, - "loss": 0.2544, + "epoch": 2.9278840955779004, + "grad_norm": 0.21688272058963776, + "learning_rate": 1.9350687206691904e-05, + "loss": 0.4228, "step": 81240 }, { - "epoch": 2.86, - "learning_rate": 2.0451287334232567e-05, - "loss": 0.2714, + "epoch": 2.9280642952391247, + "grad_norm": 0.22464942932128906, + "learning_rate": 1.9347844592915303e-05, + "loss": 0.4061, "step": 81245 }, { - "epoch": 2.86, - "learning_rate": 2.0448486193141914e-05, - "loss": 0.2519, + "epoch": 2.9282444949003494, + "grad_norm": 0.18823909759521484, + "learning_rate": 1.934500205614823e-05, + "loss": 0.4211, "step": 81250 }, { - "epoch": 2.86, - "learning_rate": 2.044568511114967e-05, - "loss": 0.2771, + "epoch": 2.928424694561574, + "grad_norm": 0.1837102770805359, + "learning_rate": 1.9342159596429414e-05, + "loss": 0.3719, "step": 81255 }, { - "epoch": 2.86, - "learning_rate": 2.0442884088292204e-05, - "loss": 0.2701, + "epoch": 2.928604894222799, + "grad_norm": 0.19186070561408997, + "learning_rate": 1.933931721379756e-05, + "loss": 0.3568, "step": 81260 }, { - "epoch": 2.86, - "learning_rate": 2.0440083124605873e-05, - "loss": 0.2639, + "epoch": 2.9287850938840236, + "grad_norm": 0.2203979641199112, + "learning_rate": 1.933647490829143e-05, + "loss": 0.3601, "step": 81265 }, { - "epoch": 2.86, - "learning_rate": 2.0437282220127067e-05, - "loss": 0.2588, + "epoch": 2.9289652935452484, + "grad_norm": 0.1754385232925415, + "learning_rate": 1.9333632679949713e-05, + "loss": 0.3799, "step": 81270 }, { - "epoch": 2.86, - "learning_rate": 2.0434481374892148e-05, - "loss": 0.2643, + "epoch": 2.9291454932064727, + "grad_norm": 0.24749146401882172, + "learning_rate": 1.9330790528811177e-05, + "loss": 0.3833, "step": 81275 }, { - "epoch": 2.86, - "learning_rate": 2.043168058893747e-05, - "loss": 0.2891, + "epoch": 2.9293256928676974, + "grad_norm": 0.19745169579982758, + "learning_rate": 1.9327948454914514e-05, + "loss": 0.3694, "step": 81280 }, { - "epoch": 2.86, - "learning_rate": 2.0428879862299414e-05, - "loss": 0.2688, + "epoch": 2.929505892528922, + "grad_norm": 0.24451091885566711, + "learning_rate": 1.932510645829845e-05, + "loss": 0.3911, "step": 81285 }, { - "epoch": 2.86, - "learning_rate": 2.0426079195014347e-05, - "loss": 0.2552, + "epoch": 2.9296860921901464, + "grad_norm": 0.22178997099399567, + "learning_rate": 1.9322264539001716e-05, + "loss": 0.3695, "step": 81290 }, { - "epoch": 2.86, - "learning_rate": 2.042327858711862e-05, - "loss": 0.2722, + "epoch": 2.929866291851371, + 
"grad_norm": 0.2586722671985626, + "learning_rate": 1.9319422697063027e-05, + "loss": 0.402, "step": 81295 }, { - "epoch": 2.86, - "learning_rate": 2.0420478038648604e-05, - "loss": 0.2542, + "epoch": 2.930046491512596, + "grad_norm": 0.21260590851306915, + "learning_rate": 1.9316580932521102e-05, + "loss": 0.4434, "step": 81300 }, { - "epoch": 2.86, - "learning_rate": 2.0417677549640662e-05, - "loss": 0.2934, + "epoch": 2.9302266911738206, + "grad_norm": 0.21419310569763184, + "learning_rate": 1.9313739245414665e-05, + "loss": 0.3775, "step": 81305 }, { - "epoch": 2.86, - "learning_rate": 2.0414877120131163e-05, - "loss": 0.2756, + "epoch": 2.9304068908350454, + "grad_norm": 0.18965986371040344, + "learning_rate": 1.9310897635782426e-05, + "loss": 0.4149, "step": 81310 }, { - "epoch": 2.86, - "learning_rate": 2.0412076750156463e-05, - "loss": 0.276, + "epoch": 2.93058709049627, + "grad_norm": 0.2388392835855484, + "learning_rate": 1.930805610366311e-05, + "loss": 0.4024, "step": 81315 }, { - "epoch": 2.86, - "learning_rate": 2.040927643975291e-05, - "loss": 0.2806, + "epoch": 2.9307672901574944, + "grad_norm": 0.21854186058044434, + "learning_rate": 1.9305214649095426e-05, + "loss": 0.3543, "step": 81320 }, { - "epoch": 2.86, - "learning_rate": 2.0406476188956892e-05, - "loss": 0.2896, + "epoch": 2.930947489818719, + "grad_norm": 0.18977004289627075, + "learning_rate": 1.930237327211809e-05, + "loss": 0.3614, "step": 81325 }, { - "epoch": 2.86, - "learning_rate": 2.0403675997804748e-05, - "loss": 0.2642, + "epoch": 2.931127689479944, + "grad_norm": 0.20082025229930878, + "learning_rate": 1.9299531972769813e-05, + "loss": 0.3993, "step": 81330 }, { - "epoch": 2.86, - "learning_rate": 2.0400875866332845e-05, - "loss": 0.2652, + "epoch": 2.9313078891411686, + "grad_norm": 0.20026427507400513, + "learning_rate": 1.9296690751089312e-05, + "loss": 0.3545, "step": 81335 }, { - "epoch": 2.86, - "learning_rate": 2.0398075794577527e-05, - "loss": 0.2599, + "epoch": 2.931488088802393, + "grad_norm": 0.22359101474285126, + "learning_rate": 1.9293849607115292e-05, + "loss": 0.3945, "step": 81340 }, { - "epoch": 2.86, - "learning_rate": 2.0395275782575172e-05, - "loss": 0.2457, + "epoch": 2.9316682884636176, + "grad_norm": 0.2114863246679306, + "learning_rate": 1.929100854088647e-05, + "loss": 0.3842, "step": 81345 }, { - "epoch": 2.86, - "learning_rate": 2.0392475830362122e-05, - "loss": 0.274, + "epoch": 2.9318484881248423, + "grad_norm": 0.22665932774543762, + "learning_rate": 1.928816755244155e-05, + "loss": 0.3776, "step": 81350 }, { - "epoch": 2.86, - "learning_rate": 2.0389675937974745e-05, - "loss": 0.2751, + "epoch": 2.932028687786067, + "grad_norm": 0.27405086159706116, + "learning_rate": 1.928532664181924e-05, + "loss": 0.4077, "step": 81355 }, { - "epoch": 2.86, - "learning_rate": 2.0386876105449372e-05, - "loss": 0.2649, + "epoch": 2.932208887447292, + "grad_norm": 0.16725726425647736, + "learning_rate": 1.9282485809058254e-05, + "loss": 0.3516, "step": 81360 }, { - "epoch": 2.86, - "learning_rate": 2.0384076332822387e-05, - "loss": 0.2685, + "epoch": 2.932389087108516, + "grad_norm": 0.22895494103431702, + "learning_rate": 1.927964505419728e-05, + "loss": 0.3507, "step": 81365 }, { - "epoch": 2.86, - "learning_rate": 2.0381276620130126e-05, - "loss": 0.2657, + "epoch": 2.932569286769741, + "grad_norm": 0.2224702686071396, + "learning_rate": 1.9276804377275047e-05, + "loss": 0.3727, "step": 81370 }, { - "epoch": 2.86, - "learning_rate": 2.0378476967408942e-05, - "loss": 0.282, + "epoch": 
2.9327494864309656, + "grad_norm": 0.2331514060497284, + "learning_rate": 1.927396377833024e-05, + "loss": 0.4005, "step": 81375 }, { - "epoch": 2.86, - "learning_rate": 2.0375677374695188e-05, - "loss": 0.283, + "epoch": 2.9329296860921903, + "grad_norm": 0.18143446743488312, + "learning_rate": 1.9271123257401568e-05, + "loss": 0.3625, "step": 81380 }, { - "epoch": 2.86, - "learning_rate": 2.0372877842025214e-05, - "loss": 0.273, + "epoch": 2.9331098857534146, + "grad_norm": 0.20633216202259064, + "learning_rate": 1.9268282814527737e-05, + "loss": 0.3695, "step": 81385 }, { - "epoch": 2.86, - "learning_rate": 2.0370078369435378e-05, - "loss": 0.2746, + "epoch": 2.9332900854146393, + "grad_norm": 0.2796785235404968, + "learning_rate": 1.9265442449747432e-05, + "loss": 0.4057, "step": 81390 }, { - "epoch": 2.86, - "learning_rate": 2.0367278956962012e-05, - "loss": 0.2806, + "epoch": 2.933470285075864, + "grad_norm": 0.23631520569324493, + "learning_rate": 1.9262602163099375e-05, + "loss": 0.3928, "step": 81395 }, { - "epoch": 2.86, - "learning_rate": 2.036447960464149e-05, - "loss": 0.2817, + "epoch": 2.933650484737089, + "grad_norm": 0.2131415605545044, + "learning_rate": 1.925976195462225e-05, + "loss": 0.3849, "step": 81400 }, { - "epoch": 2.86, - "learning_rate": 2.036168031251014e-05, - "loss": 0.2665, + "epoch": 2.9338306843983135, + "grad_norm": 0.2613097131252289, + "learning_rate": 1.925692182435475e-05, + "loss": 0.418, "step": 81405 }, { - "epoch": 2.86, - "learning_rate": 2.0358881080604313e-05, - "loss": 0.2534, + "epoch": 2.934010884059538, + "grad_norm": 0.24537603557109833, + "learning_rate": 1.9254081772335586e-05, + "loss": 0.4091, "step": 81410 }, { - "epoch": 2.86, - "learning_rate": 2.035608190896035e-05, - "loss": 0.2562, + "epoch": 2.9341910837207625, + "grad_norm": 0.19642150402069092, + "learning_rate": 1.9251241798603447e-05, + "loss": 0.4022, "step": 81415 }, { - "epoch": 2.86, - "learning_rate": 2.035328279761461e-05, - "loss": 0.2575, + "epoch": 2.9343712833819873, + "grad_norm": 0.1668914407491684, + "learning_rate": 1.9248401903197015e-05, + "loss": 0.3614, "step": 81420 }, { - "epoch": 2.86, - "learning_rate": 2.0350483746603437e-05, - "loss": 0.292, + "epoch": 2.934551483043212, + "grad_norm": 0.20156939327716827, + "learning_rate": 1.9245562086155004e-05, + "loss": 0.4329, "step": 81425 }, { - "epoch": 2.86, - "learning_rate": 2.0347684755963164e-05, - "loss": 0.2702, + "epoch": 2.9347316827044363, + "grad_norm": 0.22902165353298187, + "learning_rate": 1.9242722347516088e-05, + "loss": 0.3377, "step": 81430 }, { - "epoch": 2.87, - "learning_rate": 2.034488582573013e-05, - "loss": 0.2691, + "epoch": 2.934911882365661, + "grad_norm": 0.1990087330341339, + "learning_rate": 1.9239882687318972e-05, + "loss": 0.4288, "step": 81435 }, { - "epoch": 2.87, - "learning_rate": 2.034208695594069e-05, - "loss": 0.2523, + "epoch": 2.9350920820268858, + "grad_norm": 0.1762835681438446, + "learning_rate": 1.9237043105602342e-05, + "loss": 0.3982, "step": 81440 }, { - "epoch": 2.87, - "learning_rate": 2.0339288146631185e-05, - "loss": 0.3037, + "epoch": 2.9352722816881105, + "grad_norm": 0.23856672644615173, + "learning_rate": 1.923420360240487e-05, + "loss": 0.4047, "step": 81445 }, { - "epoch": 2.87, - "learning_rate": 2.0336489397837956e-05, - "loss": 0.264, + "epoch": 2.9354524813493352, + "grad_norm": 0.19141538441181183, + "learning_rate": 1.923136417776527e-05, + "loss": 0.4154, "step": 81450 }, { - "epoch": 2.87, - "learning_rate": 2.0333690709597326e-05, - "loss": 
0.2505, + "epoch": 2.93563268101056, + "grad_norm": 0.21707025170326233, + "learning_rate": 1.9228524831722206e-05, + "loss": 0.3691, "step": 81455 }, { - "epoch": 2.87, - "learning_rate": 2.0330892081945657e-05, - "loss": 0.2608, + "epoch": 2.9358128806717843, + "grad_norm": 0.2077445536851883, + "learning_rate": 1.922568556431438e-05, + "loss": 0.3898, "step": 81460 }, { - "epoch": 2.87, - "learning_rate": 2.032809351491927e-05, - "loss": 0.2412, + "epoch": 2.935993080333009, + "grad_norm": 0.17210866510868073, + "learning_rate": 1.9222846375580478e-05, + "loss": 0.3616, "step": 81465 }, { - "epoch": 2.87, - "learning_rate": 2.0325295008554517e-05, - "loss": 0.2593, + "epoch": 2.9361732799942337, + "grad_norm": 0.1976180225610733, + "learning_rate": 1.922000726555916e-05, + "loss": 0.3905, "step": 81470 }, { - "epoch": 2.87, - "learning_rate": 2.032249656288772e-05, - "loss": 0.2561, + "epoch": 2.936353479655458, + "grad_norm": 0.2516196072101593, + "learning_rate": 1.9217168234289136e-05, + "loss": 0.4011, "step": 81475 }, { - "epoch": 2.87, - "learning_rate": 2.0319698177955228e-05, - "loss": 0.2668, + "epoch": 2.9365336793166827, + "grad_norm": 0.2320529818534851, + "learning_rate": 1.9214329281809074e-05, + "loss": 0.3951, "step": 81480 }, { - "epoch": 2.87, - "learning_rate": 2.0316899853793374e-05, - "loss": 0.2754, + "epoch": 2.9367138789779075, + "grad_norm": 0.19517144560813904, + "learning_rate": 1.921149040815765e-05, + "loss": 0.3724, "step": 81485 }, { - "epoch": 2.87, - "learning_rate": 2.0314101590438484e-05, - "loss": 0.2649, + "epoch": 2.936894078639132, + "grad_norm": 0.21140725910663605, + "learning_rate": 1.9208651613373553e-05, + "loss": 0.3686, "step": 81490 }, { - "epoch": 2.87, - "learning_rate": 2.0311303387926888e-05, - "loss": 0.2693, + "epoch": 2.937074278300357, + "grad_norm": 0.19208674132823944, + "learning_rate": 1.9205812897495448e-05, + "loss": 0.3837, "step": 81495 }, { - "epoch": 2.87, - "learning_rate": 2.0308505246294942e-05, - "loss": 0.2814, + "epoch": 2.9372544779615817, + "grad_norm": 0.19858016073703766, + "learning_rate": 1.920297426056203e-05, + "loss": 0.3975, "step": 81500 }, { - "epoch": 2.87, - "eval_loss": 0.2588658034801483, - "eval_runtime": 10.5545, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, + "epoch": 2.9372544779615817, + "eval_loss": 0.43074142932891846, + "eval_runtime": 3.5337, + "eval_samples_per_second": 28.299, + "eval_steps_per_second": 7.075, "step": 81500 }, { - "epoch": 2.87, - "learning_rate": 2.0305707165578963e-05, - "loss": 0.2613, + "epoch": 2.937434677622806, + "grad_norm": 0.20274978876113892, + "learning_rate": 1.920013570261196e-05, + "loss": 0.3892, "step": 81505 }, { - "epoch": 2.87, - "learning_rate": 2.030290914581527e-05, - "loss": 0.2867, + "epoch": 2.9376148772840307, + "grad_norm": 0.2162855863571167, + "learning_rate": 1.919729722368392e-05, + "loss": 0.4381, "step": 81510 }, { - "epoch": 2.87, - "learning_rate": 2.0300111187040222e-05, - "loss": 0.2575, + "epoch": 2.9377950769452554, + "grad_norm": 0.212267205119133, + "learning_rate": 1.9194458823816583e-05, + "loss": 0.3685, "step": 81515 }, { - "epoch": 2.87, - "learning_rate": 2.0297313289290122e-05, - "loss": 0.2701, + "epoch": 2.9379752766064797, + "grad_norm": 0.18356037139892578, + "learning_rate": 1.919162050304862e-05, + "loss": 0.3908, "step": 81520 }, { - "epoch": 2.87, - "learning_rate": 2.0294515452601316e-05, - "loss": 0.2571, + "epoch": 2.9381554762677045, + "grad_norm": 0.23290087282657623, + "learning_rate": 
1.91887822614187e-05, + "loss": 0.4314, "step": 81525 }, { - "epoch": 2.87, - "learning_rate": 2.0291717677010115e-05, - "loss": 0.2659, + "epoch": 2.938335675928929, + "grad_norm": 0.2154729664325714, + "learning_rate": 1.9185944098965502e-05, + "loss": 0.3872, "step": 81530 }, { - "epoch": 2.87, - "learning_rate": 2.0288919962552873e-05, - "loss": 0.2722, + "epoch": 2.938515875590154, + "grad_norm": 0.19503213465213776, + "learning_rate": 1.918310601572768e-05, + "loss": 0.3756, "step": 81535 }, { - "epoch": 2.87, - "learning_rate": 2.0286122309265894e-05, - "loss": 0.2492, + "epoch": 2.9386960752513787, + "grad_norm": 0.22458268702030182, + "learning_rate": 1.9180268011743925e-05, + "loss": 0.3672, "step": 81540 }, { - "epoch": 2.87, - "learning_rate": 2.0283324717185505e-05, - "loss": 0.2694, + "epoch": 2.9388762749126034, + "grad_norm": 0.20593036711215973, + "learning_rate": 1.9177430087052892e-05, + "loss": 0.4, "step": 81545 }, { - "epoch": 2.87, - "learning_rate": 2.0280527186348035e-05, - "loss": 0.2685, + "epoch": 2.9390564745738277, + "grad_norm": 0.21392624080181122, + "learning_rate": 1.917459224169323e-05, + "loss": 0.3966, "step": 81550 }, { - "epoch": 2.87, - "learning_rate": 2.0277729716789817e-05, - "loss": 0.2735, + "epoch": 2.9392366742350524, + "grad_norm": 0.19831807911396027, + "learning_rate": 1.917175447570364e-05, + "loss": 0.3983, "step": 81555 }, { - "epoch": 2.87, - "learning_rate": 2.027493230854717e-05, - "loss": 0.2833, + "epoch": 2.939416873896277, + "grad_norm": 0.20181818306446075, + "learning_rate": 1.916891678912276e-05, + "loss": 0.3699, "step": 81560 }, { - "epoch": 2.87, - "learning_rate": 2.027213496165641e-05, - "loss": 0.3045, + "epoch": 2.9395970735575014, + "grad_norm": 0.19284601509571075, + "learning_rate": 1.916607918198926e-05, + "loss": 0.3628, "step": 81565 }, { - "epoch": 2.87, - "learning_rate": 2.0269337676153852e-05, - "loss": 0.2866, + "epoch": 2.939777273218726, + "grad_norm": 0.20783284306526184, + "learning_rate": 1.91632416543418e-05, + "loss": 0.4283, "step": 81570 }, { - "epoch": 2.87, - "learning_rate": 2.026654045207583e-05, - "loss": 0.2906, + "epoch": 2.939957472879951, + "grad_norm": 0.20689019560813904, + "learning_rate": 1.916040420621904e-05, + "loss": 0.3616, "step": 81575 }, { - "epoch": 2.87, - "learning_rate": 2.026374328945867e-05, - "loss": 0.2755, + "epoch": 2.9401376725411756, + "grad_norm": 0.3156082332134247, + "learning_rate": 1.9157566837659645e-05, + "loss": 0.4052, "step": 81580 }, { - "epoch": 2.87, - "learning_rate": 2.0260946188338677e-05, - "loss": 0.271, + "epoch": 2.9403178722024004, + "grad_norm": 0.23160193860530853, + "learning_rate": 1.9154729548702272e-05, + "loss": 0.3514, "step": 81585 }, { - "epoch": 2.87, - "learning_rate": 2.0258149148752168e-05, - "loss": 0.2736, + "epoch": 2.940498071863625, + "grad_norm": 0.18180207908153534, + "learning_rate": 1.9151892339385573e-05, + "loss": 0.4118, "step": 81590 }, { - "epoch": 2.87, - "learning_rate": 2.0255352170735477e-05, - "loss": 0.2743, + "epoch": 2.9406782715248494, + "grad_norm": 0.17559707164764404, + "learning_rate": 1.9149055209748214e-05, + "loss": 0.3459, "step": 81595 }, { - "epoch": 2.87, - "learning_rate": 2.0252555254324905e-05, - "loss": 0.2799, + "epoch": 2.940858471186074, + "grad_norm": 0.2852272689342499, + "learning_rate": 1.9146218159828845e-05, + "loss": 0.4089, "step": 81600 }, { - "epoch": 2.87, - "learning_rate": 2.024975839955678e-05, - "loss": 0.2507, + "epoch": 2.941038670847299, + "grad_norm": 0.19265016913414001, + 
"learning_rate": 1.914338118966611e-05, + "loss": 0.4113, "step": 81605 }, { - "epoch": 2.87, - "learning_rate": 2.0246961606467402e-05, - "loss": 0.2396, + "epoch": 2.9412188705085236, + "grad_norm": 0.27521640062332153, + "learning_rate": 1.9140544299298687e-05, + "loss": 0.4314, "step": 81610 }, { - "epoch": 2.87, - "learning_rate": 2.0244164875093106e-05, - "loss": 0.2552, + "epoch": 2.941399070169748, + "grad_norm": 0.22525955736637115, + "learning_rate": 1.91377074887652e-05, + "loss": 0.4097, "step": 81615 }, { - "epoch": 2.87, - "learning_rate": 2.0241368205470194e-05, - "loss": 0.2729, + "epoch": 2.9415792698309726, + "grad_norm": 0.1709366887807846, + "learning_rate": 1.9134870758104324e-05, + "loss": 0.3707, "step": 81620 }, { - "epoch": 2.87, - "learning_rate": 2.023857159763497e-05, - "loss": 0.2491, + "epoch": 2.9417594694921974, + "grad_norm": 0.20338067412376404, + "learning_rate": 1.9132034107354693e-05, + "loss": 0.4064, "step": 81625 }, { - "epoch": 2.87, - "learning_rate": 2.0235775051623756e-05, - "loss": 0.2673, + "epoch": 2.941939669153422, + "grad_norm": 0.21200573444366455, + "learning_rate": 1.912919753655496e-05, + "loss": 0.3752, "step": 81630 }, { - "epoch": 2.87, - "learning_rate": 2.0232978567472875e-05, - "loss": 0.2717, + "epoch": 2.942119868814647, + "grad_norm": 0.19498129189014435, + "learning_rate": 1.9126361045743778e-05, + "loss": 0.3777, "step": 81635 }, { - "epoch": 2.87, - "learning_rate": 2.0230182145218624e-05, - "loss": 0.2596, + "epoch": 2.942300068475871, + "grad_norm": 0.24946102499961853, + "learning_rate": 1.9123524634959794e-05, + "loss": 0.4022, "step": 81640 }, { - "epoch": 2.87, - "learning_rate": 2.02273857848973e-05, - "loss": 0.2645, + "epoch": 2.942480268137096, + "grad_norm": 0.25150594115257263, + "learning_rate": 1.9120688304241636e-05, + "loss": 0.3575, "step": 81645 }, { - "epoch": 2.87, - "learning_rate": 2.022458948654524e-05, - "loss": 0.2759, + "epoch": 2.9426604677983206, + "grad_norm": 0.23567576706409454, + "learning_rate": 1.9117852053627978e-05, + "loss": 0.4141, "step": 81650 }, { - "epoch": 2.87, - "learning_rate": 2.0221793250198738e-05, - "loss": 0.2445, + "epoch": 2.9428406674595453, + "grad_norm": 0.218300923705101, + "learning_rate": 1.911501588315743e-05, + "loss": 0.3508, "step": 81655 }, { - "epoch": 2.87, - "learning_rate": 2.02189970758941e-05, - "loss": 0.2694, + "epoch": 2.9430208671207696, + "grad_norm": 0.24794436991214752, + "learning_rate": 1.9112179792868662e-05, + "loss": 0.3964, "step": 81660 }, { - "epoch": 2.87, - "learning_rate": 2.0216200963667624e-05, - "loss": 0.2725, + "epoch": 2.9432010667819943, + "grad_norm": 0.1559266299009323, + "learning_rate": 1.9109343782800305e-05, + "loss": 0.4062, "step": 81665 }, { - "epoch": 2.87, - "learning_rate": 2.021340491355564e-05, - "loss": 0.266, + "epoch": 2.943381266443219, + "grad_norm": 0.25583502650260925, + "learning_rate": 1.910650785299099e-05, + "loss": 0.3751, "step": 81670 }, { - "epoch": 2.87, - "learning_rate": 2.0210608925594438e-05, - "loss": 0.249, + "epoch": 2.943561466104444, + "grad_norm": 0.2357344627380371, + "learning_rate": 1.9103672003479372e-05, + "loss": 0.3932, "step": 81675 }, { - "epoch": 2.87, - "learning_rate": 2.0207812999820316e-05, - "loss": 0.2599, + "epoch": 2.9437416657656685, + "grad_norm": 0.2820563018321991, + "learning_rate": 1.9100836234304076e-05, + "loss": 0.3736, "step": 81680 }, { - "epoch": 2.87, - "learning_rate": 2.0205017136269587e-05, - "loss": 0.2606, + "epoch": 2.9439218654268933, + "grad_norm": 
0.227774977684021, + "learning_rate": 1.9098000545503745e-05, + "loss": 0.4056, "step": 81685 }, { - "epoch": 2.87, - "learning_rate": 2.0202221334978548e-05, - "loss": 0.2802, + "epoch": 2.9441020650881176, + "grad_norm": 0.23130254447460175, + "learning_rate": 1.909516493711702e-05, + "loss": 0.3757, "step": 81690 }, { - "epoch": 2.87, - "learning_rate": 2.0199425595983505e-05, - "loss": 0.247, + "epoch": 2.9442822647493423, + "grad_norm": 0.24160918593406677, + "learning_rate": 1.909232940918252e-05, + "loss": 0.4112, "step": 81695 }, { - "epoch": 2.87, - "learning_rate": 2.019662991932076e-05, - "loss": 0.2425, + "epoch": 2.944462464410567, + "grad_norm": 0.19820939004421234, + "learning_rate": 1.9089493961738896e-05, + "loss": 0.3657, "step": 81700 }, { - "epoch": 2.87, - "learning_rate": 2.0193834305026603e-05, - "loss": 0.2506, + "epoch": 2.9446426640717913, + "grad_norm": 0.21389785408973694, + "learning_rate": 1.9086658594824774e-05, + "loss": 0.3894, "step": 81705 }, { - "epoch": 2.87, - "learning_rate": 2.0191038753137344e-05, - "loss": 0.2518, + "epoch": 2.944822863733016, + "grad_norm": 0.24125300347805023, + "learning_rate": 1.9083823308478776e-05, + "loss": 0.4037, "step": 81710 }, { - "epoch": 2.87, - "learning_rate": 2.0188243263689276e-05, - "loss": 0.2504, + "epoch": 2.945003063394241, + "grad_norm": 0.23977577686309814, + "learning_rate": 1.9080988102739543e-05, + "loss": 0.4222, "step": 81715 }, { - "epoch": 2.88, - "learning_rate": 2.018544783671871e-05, - "loss": 0.2751, + "epoch": 2.9451832630554655, + "grad_norm": 0.2184176743030548, + "learning_rate": 1.90781529776457e-05, + "loss": 0.3853, "step": 81720 }, { - "epoch": 2.88, - "learning_rate": 2.018265247226191e-05, - "loss": 0.2701, + "epoch": 2.9453634627166903, + "grad_norm": 0.21365883946418762, + "learning_rate": 1.9075884935661265e-05, + "loss": 0.3671, "step": 81725 }, { - "epoch": 2.88, - "learning_rate": 2.0179857170355207e-05, - "loss": 0.2809, + "epoch": 2.945543662377915, + "grad_norm": 0.2217915952205658, + "learning_rate": 1.9073049955826478e-05, + "loss": 0.3974, "step": 81730 }, { - "epoch": 2.88, - "learning_rate": 2.0177061931034875e-05, - "loss": 0.2608, + "epoch": 2.9457238620391393, + "grad_norm": 0.2563817799091339, + "learning_rate": 1.9070215056745234e-05, + "loss": 0.4093, "step": 81735 }, { - "epoch": 2.88, - "learning_rate": 2.0174266754337217e-05, - "loss": 0.286, + "epoch": 2.945904061700364, + "grad_norm": 0.21083958446979523, + "learning_rate": 1.906738023845615e-05, + "loss": 0.3654, "step": 81740 }, { - "epoch": 2.88, - "learning_rate": 2.017147164029853e-05, - "loss": 0.2641, + "epoch": 2.9460842613615887, + "grad_norm": 0.2113693654537201, + "learning_rate": 1.906454550099787e-05, + "loss": 0.3854, "step": 81745 }, { - "epoch": 2.88, - "learning_rate": 2.01686765889551e-05, - "loss": 0.2808, + "epoch": 2.946264461022813, + "grad_norm": 0.2557770311832428, + "learning_rate": 1.9061710844409007e-05, + "loss": 0.3904, "step": 81750 }, { - "epoch": 2.88, - "learning_rate": 2.016588160034322e-05, - "loss": 0.2879, + "epoch": 2.9464446606840378, + "grad_norm": 0.23025763034820557, + "learning_rate": 1.9058876268728188e-05, + "loss": 0.3876, "step": 81755 }, { - "epoch": 2.88, - "learning_rate": 2.0163086674499175e-05, - "loss": 0.2807, + "epoch": 2.9466248603452625, + "grad_norm": 0.28059622645378113, + "learning_rate": 1.9056041773994025e-05, + "loss": 0.4097, "step": 81760 }, { - "epoch": 2.88, - "learning_rate": 2.016029181145927e-05, - "loss": 0.2345, + "epoch": 
2.9468050600064872, + "grad_norm": 0.19796496629714966, + "learning_rate": 1.905320736024514e-05, + "loss": 0.4118, "step": 81765 }, { - "epoch": 2.88, - "learning_rate": 2.0157497011259785e-05, - "loss": 0.2527, + "epoch": 2.946985259667712, + "grad_norm": 0.19060267508029938, + "learning_rate": 1.9050373027520157e-05, + "loss": 0.3784, "step": 81770 }, { - "epoch": 2.88, - "learning_rate": 2.015470227393701e-05, - "loss": 0.2662, + "epoch": 2.9471654593289367, + "grad_norm": 0.1780025213956833, + "learning_rate": 1.9047538775857694e-05, + "loss": 0.384, "step": 81775 }, { - "epoch": 2.88, - "learning_rate": 2.0151907599527225e-05, - "loss": 0.2849, + "epoch": 2.947345658990161, + "grad_norm": 0.20874053239822388, + "learning_rate": 1.9044704605296353e-05, + "loss": 0.3901, "step": 81780 }, { - "epoch": 2.88, - "learning_rate": 2.0149112988066734e-05, - "loss": 0.2453, + "epoch": 2.9475258586513857, + "grad_norm": 0.18418976664543152, + "learning_rate": 1.9041870515874767e-05, + "loss": 0.365, "step": 81785 }, { - "epoch": 2.88, - "learning_rate": 2.014631843959181e-05, - "loss": 0.2547, + "epoch": 2.9477060583126105, + "grad_norm": 0.2519192695617676, + "learning_rate": 1.903903650763153e-05, + "loss": 0.3867, "step": 81790 }, { - "epoch": 2.88, - "learning_rate": 2.014352395413875e-05, - "loss": 0.2393, + "epoch": 2.9478862579738347, + "grad_norm": 0.20696429908275604, + "learning_rate": 1.903620258060528e-05, + "loss": 0.3652, "step": 81795 }, { - "epoch": 2.88, - "learning_rate": 2.0140729531743816e-05, - "loss": 0.265, + "epoch": 2.9480664576350595, + "grad_norm": 0.25634706020355225, + "learning_rate": 1.903336873483461e-05, + "loss": 0.366, "step": 81800 }, { - "epoch": 2.88, - "learning_rate": 2.013793517244332e-05, - "loss": 0.2536, + "epoch": 2.948246657296284, + "grad_norm": 0.17828556895256042, + "learning_rate": 1.9030534970358135e-05, + "loss": 0.438, "step": 81805 }, { - "epoch": 2.88, - "learning_rate": 2.013514087627353e-05, - "loss": 0.2445, + "epoch": 2.948426856957509, + "grad_norm": 0.21666783094406128, + "learning_rate": 1.9027701287214466e-05, + "loss": 0.4145, "step": 81810 }, { - "epoch": 2.88, - "learning_rate": 2.0132346643270725e-05, - "loss": 0.3005, + "epoch": 2.9486070566187337, + "grad_norm": 0.22079980373382568, + "learning_rate": 1.902486768544222e-05, + "loss": 0.372, "step": 81815 }, { - "epoch": 2.88, - "learning_rate": 2.012955247347119e-05, - "loss": 0.2731, + "epoch": 2.9487872562799584, + "grad_norm": 0.22830483317375183, + "learning_rate": 1.9022034165079977e-05, + "loss": 0.3996, "step": 81820 }, { - "epoch": 2.88, - "learning_rate": 2.0126758366911207e-05, - "loss": 0.2605, + "epoch": 2.9489674559411827, + "grad_norm": 0.20484139025211334, + "learning_rate": 1.901920072616638e-05, + "loss": 0.3791, "step": 81825 }, { - "epoch": 2.88, - "learning_rate": 2.012396432362706e-05, - "loss": 0.2733, + "epoch": 2.9491476556024074, + "grad_norm": 0.2294071912765503, + "learning_rate": 1.9016367368739994e-05, + "loss": 0.4122, "step": 81830 }, { - "epoch": 2.88, - "learning_rate": 2.0121170343655024e-05, - "loss": 0.2584, + "epoch": 2.949327855263632, + "grad_norm": 0.20010051131248474, + "learning_rate": 1.9013534092839463e-05, + "loss": 0.3743, "step": 81835 }, { - "epoch": 2.88, - "learning_rate": 2.0118376427031367e-05, - "loss": 0.2725, + "epoch": 2.949508054924857, + "grad_norm": 0.2721427381038666, + "learning_rate": 1.9010700898503362e-05, + "loss": 0.4054, "step": 81840 }, { - "epoch": 2.88, - "learning_rate": 2.0115582573792382e-05, - "loss": 
0.2574, + "epoch": 2.949688254586081, + "grad_norm": 0.21804888546466827, + "learning_rate": 1.90078677857703e-05, + "loss": 0.4004, "step": 81845 }, { - "epoch": 2.88, - "learning_rate": 2.0112788783974335e-05, - "loss": 0.2502, + "epoch": 2.949868454247306, + "grad_norm": 0.20582912862300873, + "learning_rate": 1.9005034754678887e-05, + "loss": 0.3752, "step": 81850 }, { - "epoch": 2.88, - "learning_rate": 2.01099950576135e-05, - "loss": 0.2637, + "epoch": 2.9500486539085307, + "grad_norm": 0.22233439981937408, + "learning_rate": 1.900220180526771e-05, + "loss": 0.3631, "step": 81855 }, { - "epoch": 2.88, - "learning_rate": 2.0107201394746165e-05, - "loss": 0.2621, + "epoch": 2.9502288535697554, + "grad_norm": 0.20850253105163574, + "learning_rate": 1.8999368937575367e-05, + "loss": 0.3808, "step": 81860 }, { - "epoch": 2.88, - "learning_rate": 2.0104407795408596e-05, - "loss": 0.2777, + "epoch": 2.95040905323098, + "grad_norm": 0.23026061058044434, + "learning_rate": 1.8996536151640472e-05, + "loss": 0.4305, "step": 81865 }, { - "epoch": 2.88, - "learning_rate": 2.0101614259637065e-05, - "loss": 0.2842, + "epoch": 2.9505892528922044, + "grad_norm": 0.2322373390197754, + "learning_rate": 1.89937034475016e-05, + "loss": 0.4094, "step": 81870 }, { - "epoch": 2.88, - "learning_rate": 2.0098820787467835e-05, - "loss": 0.2921, + "epoch": 2.950769452553429, + "grad_norm": 0.2333720475435257, + "learning_rate": 1.899087082519736e-05, + "loss": 0.4051, "step": 81875 }, { - "epoch": 2.88, - "learning_rate": 2.0096027378937193e-05, - "loss": 0.2932, + "epoch": 2.950949652214654, + "grad_norm": 0.2258276641368866, + "learning_rate": 1.898803828476634e-05, + "loss": 0.4222, "step": 81880 }, { - "epoch": 2.88, - "learning_rate": 2.0093234034081406e-05, - "loss": 0.2608, + "epoch": 2.9511298518758786, + "grad_norm": 0.21611566841602325, + "learning_rate": 1.898520582624713e-05, + "loss": 0.4085, "step": 81885 }, { - "epoch": 2.88, - "learning_rate": 2.0090440752936744e-05, - "loss": 0.2765, + "epoch": 2.951310051537103, + "grad_norm": 0.21131113171577454, + "learning_rate": 1.8982373449678336e-05, + "loss": 0.413, "step": 81890 }, { - "epoch": 2.88, - "learning_rate": 2.008764753553946e-05, - "loss": 0.2695, + "epoch": 2.9514902511983276, + "grad_norm": 0.21428248286247253, + "learning_rate": 1.8979541155098528e-05, + "loss": 0.3984, "step": 81895 }, { - "epoch": 2.88, - "learning_rate": 2.008485438192585e-05, - "loss": 0.2474, + "epoch": 2.9516704508595524, + "grad_norm": 0.1810792088508606, + "learning_rate": 1.8976708942546316e-05, + "loss": 0.4067, "step": 81900 }, { - "epoch": 2.88, - "learning_rate": 2.008206129213216e-05, - "loss": 0.2591, + "epoch": 2.951850650520777, + "grad_norm": 0.20499011874198914, + "learning_rate": 1.897387681206028e-05, + "loss": 0.3654, "step": 81905 }, { - "epoch": 2.88, - "learning_rate": 2.0079268266194668e-05, - "loss": 0.279, + "epoch": 2.952030850182002, + "grad_norm": 0.2227361649274826, + "learning_rate": 1.8971044763678995e-05, + "loss": 0.3786, "step": 81910 }, { - "epoch": 2.88, - "learning_rate": 2.0076475304149624e-05, - "loss": 0.263, + "epoch": 2.952211049843226, + "grad_norm": 0.19713594019412994, + "learning_rate": 1.8968212797441064e-05, + "loss": 0.3725, "step": 81915 }, { - "epoch": 2.88, - "learning_rate": 2.0073682406033315e-05, - "loss": 0.2706, + "epoch": 2.952391249504451, + "grad_norm": 0.250509113073349, + "learning_rate": 1.896538091338507e-05, + "loss": 0.3395, "step": 81920 }, { - "epoch": 2.88, - "learning_rate": 2.0070889571881993e-05, 
- "loss": 0.2994, + "epoch": 2.9525714491656756, + "grad_norm": 0.1757110208272934, + "learning_rate": 1.8962549111549578e-05, + "loss": 0.4056, "step": 81925 }, { - "epoch": 2.88, - "learning_rate": 2.0068096801731915e-05, - "loss": 0.2746, + "epoch": 2.9527516488269003, + "grad_norm": 0.23518085479736328, + "learning_rate": 1.8959717391973204e-05, + "loss": 0.381, "step": 81930 }, { - "epoch": 2.88, - "learning_rate": 2.0065304095619344e-05, - "loss": 0.2926, + "epoch": 2.9529318484881246, + "grad_norm": 0.2347138673067093, + "learning_rate": 1.8956885754694495e-05, + "loss": 0.4076, "step": 81935 }, { - "epoch": 2.88, - "learning_rate": 2.006251145358056e-05, - "loss": 0.2821, + "epoch": 2.9531120481493494, + "grad_norm": 0.19113247096538544, + "learning_rate": 1.8954054199752063e-05, + "loss": 0.3994, "step": 81940 }, { - "epoch": 2.88, - "learning_rate": 2.0059718875651812e-05, - "loss": 0.2838, + "epoch": 2.953292247810574, + "grad_norm": 0.1934749335050583, + "learning_rate": 1.8951222727184466e-05, + "loss": 0.381, "step": 81945 }, { - "epoch": 2.88, - "learning_rate": 2.0056926361869354e-05, - "loss": 0.2792, + "epoch": 2.953472447471799, + "grad_norm": 0.21083234250545502, + "learning_rate": 1.894839133703028e-05, + "loss": 0.3682, "step": 81950 }, { - "epoch": 2.88, - "learning_rate": 2.0054692397053082e-05, - "loss": 0.2797, + "epoch": 2.9536526471330236, + "grad_norm": 0.22241094708442688, + "learning_rate": 1.89455600293281e-05, + "loss": 0.3661, "step": 81955 }, { - "epoch": 2.88, - "learning_rate": 2.0051899998825325e-05, - "loss": 0.2649, + "epoch": 2.9538328467942483, + "grad_norm": 0.17992718517780304, + "learning_rate": 1.894272880411649e-05, + "loss": 0.4289, "step": 81960 }, { - "epoch": 2.88, - "learning_rate": 2.0049107664845383e-05, - "loss": 0.2662, + "epoch": 2.9540130464554726, + "grad_norm": 0.20856983959674835, + "learning_rate": 1.8939897661434022e-05, + "loss": 0.412, "step": 81965 }, { - "epoch": 2.88, - "learning_rate": 2.0046315395149516e-05, - "loss": 0.2687, + "epoch": 2.9541932461166973, + "grad_norm": 0.21314571797847748, + "learning_rate": 1.893706660131928e-05, + "loss": 0.3842, "step": 81970 }, { - "epoch": 2.88, - "learning_rate": 2.0043523189773983e-05, - "loss": 0.2927, + "epoch": 2.954373445777922, + "grad_norm": 0.19296015799045563, + "learning_rate": 1.8934235623810833e-05, + "loss": 0.3784, "step": 81975 }, { - "epoch": 2.88, - "learning_rate": 2.0040731048755038e-05, - "loss": 0.2563, + "epoch": 2.9545536454391463, + "grad_norm": 0.24017472565174103, + "learning_rate": 1.8931404728947248e-05, + "loss": 0.4024, "step": 81980 }, { - "epoch": 2.88, - "learning_rate": 2.003793897212892e-05, - "loss": 0.265, + "epoch": 2.954733845100371, + "grad_norm": 0.27945849299430847, + "learning_rate": 1.89285739167671e-05, + "loss": 0.4157, "step": 81985 }, { - "epoch": 2.88, - "learning_rate": 2.0035146959931905e-05, - "loss": 0.2949, + "epoch": 2.954914044761596, + "grad_norm": 0.23929205536842346, + "learning_rate": 1.8925743187308954e-05, + "loss": 0.385, "step": 81990 }, { - "epoch": 2.88, - "learning_rate": 2.003235501220023e-05, - "loss": 0.2663, + "epoch": 2.9550942444228205, + "grad_norm": 0.20242777466773987, + "learning_rate": 1.8922912540611387e-05, + "loss": 0.4123, "step": 81995 }, { - "epoch": 2.88, - "learning_rate": 2.0029563128970156e-05, - "loss": 0.2588, + "epoch": 2.9552744440840453, + "grad_norm": 0.2415585070848465, + "learning_rate": 1.8920081976712963e-05, + "loss": 0.3732, "step": 82000 }, { - "epoch": 2.88, - "eval_loss": 
0.2587546408176422, - "eval_runtime": 10.5627, - "eval_samples_per_second": 9.467, - "eval_steps_per_second": 9.467, + "epoch": 2.9552744440840453, + "eval_loss": 0.4305243194103241, + "eval_runtime": 3.5265, + "eval_samples_per_second": 28.356, + "eval_steps_per_second": 7.089, "step": 82000 }, { - "epoch": 2.89, - "learning_rate": 2.0026771310277916e-05, - "loss": 0.2722, + "epoch": 2.95545464374527, + "grad_norm": 0.16947853565216064, + "learning_rate": 1.8917251495652234e-05, + "loss": 0.38, "step": 82005 }, { - "epoch": 2.89, - "learning_rate": 2.002397955615979e-05, - "loss": 0.2583, + "epoch": 2.9556348434064943, + "grad_norm": 0.24770832061767578, + "learning_rate": 1.8914421097467787e-05, + "loss": 0.4288, "step": 82010 }, { - "epoch": 2.89, - "learning_rate": 2.0021187866651997e-05, - "loss": 0.2724, + "epoch": 2.955815043067719, + "grad_norm": 0.21834175288677216, + "learning_rate": 1.891159078219816e-05, + "loss": 0.3732, "step": 82015 }, { - "epoch": 2.89, - "learning_rate": 2.001839624179081e-05, - "loss": 0.246, + "epoch": 2.9559952427289438, + "grad_norm": 0.1806267648935318, + "learning_rate": 1.8908760549881943e-05, + "loss": 0.3953, "step": 82020 }, { - "epoch": 2.89, - "learning_rate": 2.001560468161245e-05, - "loss": 0.3009, + "epoch": 2.956175442390168, + "grad_norm": 0.26217013597488403, + "learning_rate": 1.890593040055768e-05, + "loss": 0.4025, "step": 82025 }, { - "epoch": 2.89, - "learning_rate": 2.001281318615319e-05, - "loss": 0.2928, + "epoch": 2.956355642051393, + "grad_norm": 0.15443582832813263, + "learning_rate": 1.8903100334263933e-05, + "loss": 0.3855, "step": 82030 }, { - "epoch": 2.89, - "learning_rate": 2.001002175544927e-05, - "loss": 0.2486, + "epoch": 2.9565358417126175, + "grad_norm": 0.21527697145938873, + "learning_rate": 1.8900270351039263e-05, + "loss": 0.3965, "step": 82035 }, { - "epoch": 2.89, - "learning_rate": 2.0007230389536916e-05, - "loss": 0.2804, + "epoch": 2.9567160413738423, + "grad_norm": 0.2215004861354828, + "learning_rate": 1.8897440450922234e-05, + "loss": 0.3823, "step": 82040 }, { - "epoch": 2.89, - "learning_rate": 2.0004439088452386e-05, - "loss": 0.2471, + "epoch": 2.956896241035067, + "grad_norm": 0.2365996241569519, + "learning_rate": 1.8894610633951392e-05, + "loss": 0.3848, "step": 82045 }, { - "epoch": 2.89, - "learning_rate": 2.0001647852231924e-05, - "loss": 0.2752, + "epoch": 2.9570764406962917, + "grad_norm": 0.2466432750225067, + "learning_rate": 1.88917809001653e-05, + "loss": 0.4159, "step": 82050 }, { - "epoch": 2.89, - "learning_rate": 1.9998856680911777e-05, - "loss": 0.2607, + "epoch": 2.957256640357516, + "grad_norm": 0.20089565217494965, + "learning_rate": 1.888895124960251e-05, + "loss": 0.3936, "step": 82055 }, { - "epoch": 2.89, - "learning_rate": 1.9996065574528168e-05, - "loss": 0.2797, + "epoch": 2.9574368400187407, + "grad_norm": 0.23963125050067902, + "learning_rate": 1.8886121682301576e-05, + "loss": 0.3979, "step": 82060 }, { - "epoch": 2.89, - "learning_rate": 1.9993274533117363e-05, - "loss": 0.2755, + "epoch": 2.9576170396799655, + "grad_norm": 0.2524862289428711, + "learning_rate": 1.8883292198301056e-05, + "loss": 0.3805, "step": 82065 }, { - "epoch": 2.89, - "learning_rate": 1.9990483556715587e-05, - "loss": 0.2648, + "epoch": 2.9577972393411898, + "grad_norm": 0.18411079049110413, + "learning_rate": 1.8880462797639487e-05, + "loss": 0.3945, "step": 82070 }, { - "epoch": 2.89, - "learning_rate": 1.9987692645359075e-05, - "loss": 0.2684, + "epoch": 2.9579774390024145, + "grad_norm": 
0.20663435757160187, + "learning_rate": 1.8877633480355434e-05, + "loss": 0.3686, "step": 82075 }, { - "epoch": 2.89, - "learning_rate": 1.9984901799084064e-05, - "loss": 0.2473, + "epoch": 2.9581576386636392, + "grad_norm": 0.20612770318984985, + "learning_rate": 1.8874804246487437e-05, + "loss": 0.4012, "step": 82080 }, { - "epoch": 2.89, - "learning_rate": 1.9982111017926812e-05, - "loss": 0.2687, + "epoch": 2.958337838324864, + "grad_norm": 0.18578004837036133, + "learning_rate": 1.887197509607404e-05, + "loss": 0.3866, "step": 82085 }, { - "epoch": 2.89, - "learning_rate": 1.9979320301923537e-05, - "loss": 0.2566, + "epoch": 2.9585180379860887, + "grad_norm": 0.19520944356918335, + "learning_rate": 1.8869146029153805e-05, + "loss": 0.4054, "step": 82090 }, { - "epoch": 2.89, - "learning_rate": 1.9976529651110483e-05, - "loss": 0.2715, + "epoch": 2.9586982376473134, + "grad_norm": 0.2129872888326645, + "learning_rate": 1.8866317045765264e-05, + "loss": 0.3716, "step": 82095 }, { - "epoch": 2.89, - "learning_rate": 1.9973739065523865e-05, - "loss": 0.2655, + "epoch": 2.9588784373085377, + "grad_norm": 0.2196713089942932, + "learning_rate": 1.8863488145946965e-05, + "loss": 0.3829, "step": 82100 }, { - "epoch": 2.89, - "learning_rate": 1.9970948545199943e-05, - "loss": 0.2656, + "epoch": 2.9590586369697625, + "grad_norm": 0.21013066172599792, + "learning_rate": 1.8860659329737467e-05, + "loss": 0.403, "step": 82105 }, { - "epoch": 2.89, - "learning_rate": 1.9968158090174946e-05, - "loss": 0.2702, + "epoch": 2.959238836630987, + "grad_norm": 0.19054462015628815, + "learning_rate": 1.8857830597175273e-05, + "loss": 0.3886, "step": 82110 }, { - "epoch": 2.89, - "learning_rate": 1.99653677004851e-05, - "loss": 0.2788, + "epoch": 2.959419036292212, + "grad_norm": 0.2173452377319336, + "learning_rate": 1.8855001948298966e-05, + "loss": 0.3946, "step": 82115 }, { - "epoch": 2.89, - "learning_rate": 1.9962577376166622e-05, - "loss": 0.2872, + "epoch": 2.959599235953436, + "grad_norm": 0.22703658044338226, + "learning_rate": 1.885217338314705e-05, + "loss": 0.3912, "step": 82120 }, { - "epoch": 2.89, - "learning_rate": 1.995978711725577e-05, - "loss": 0.2743, + "epoch": 2.959779435614661, + "grad_norm": 0.1947726011276245, + "learning_rate": 1.8849344901758102e-05, + "loss": 0.3932, "step": 82125 }, { - "epoch": 2.89, - "learning_rate": 1.9956996923788757e-05, - "loss": 0.2629, + "epoch": 2.9599596352758857, + "grad_norm": 0.23506419360637665, + "learning_rate": 1.884651650417063e-05, + "loss": 0.3539, "step": 82130 }, { - "epoch": 2.89, - "learning_rate": 1.9954206795801815e-05, - "loss": 0.2858, + "epoch": 2.9601398349371104, + "grad_norm": 0.23253336548805237, + "learning_rate": 1.8843688190423175e-05, + "loss": 0.3832, "step": 82135 }, { - "epoch": 2.89, - "learning_rate": 1.995141673333117e-05, - "loss": 0.2714, + "epoch": 2.960320034598335, + "grad_norm": 0.19187670946121216, + "learning_rate": 1.884085996055428e-05, + "loss": 0.3612, "step": 82140 }, { - "epoch": 2.89, - "learning_rate": 1.9948626736413055e-05, - "loss": 0.2657, + "epoch": 2.9605002342595594, + "grad_norm": 0.2828364670276642, + "learning_rate": 1.883803181460248e-05, + "loss": 0.3848, "step": 82145 }, { - "epoch": 2.89, - "learning_rate": 1.9945836805083695e-05, - "loss": 0.2455, + "epoch": 2.960680433920784, + "grad_norm": 0.19735077023506165, + "learning_rate": 1.8835203752606294e-05, + "loss": 0.3615, "step": 82150 }, { - "epoch": 2.89, - "learning_rate": 1.994304693937931e-05, - "loss": 0.2824, + "epoch": 
2.960860633582009, + "grad_norm": 0.2174484133720398, + "learning_rate": 1.8832375774604272e-05, + "loss": 0.3571, "step": 82155 }, { - "epoch": 2.89, - "learning_rate": 1.9940257139336122e-05, - "loss": 0.2578, + "epoch": 2.9610408332432336, + "grad_norm": 0.1863182634115219, + "learning_rate": 1.882954788063493e-05, + "loss": 0.3636, "step": 82160 }, { - "epoch": 2.89, - "learning_rate": 1.993746740499037e-05, - "loss": 0.2316, + "epoch": 2.961221032904458, + "grad_norm": 0.1913149356842041, + "learning_rate": 1.8826720070736804e-05, + "loss": 0.3881, "step": 82165 }, { - "epoch": 2.89, - "learning_rate": 1.9934677736378265e-05, - "loss": 0.2439, + "epoch": 2.9614012325656827, + "grad_norm": 0.21392560005187988, + "learning_rate": 1.8823892344948428e-05, + "loss": 0.3875, "step": 82170 }, { - "epoch": 2.89, - "learning_rate": 1.9931888133536027e-05, - "loss": 0.2705, + "epoch": 2.9615814322269074, + "grad_norm": 0.18154218792915344, + "learning_rate": 1.8821064703308315e-05, + "loss": 0.4115, "step": 82175 }, { - "epoch": 2.89, - "learning_rate": 1.9929098596499887e-05, - "loss": 0.2591, + "epoch": 2.961761631888132, + "grad_norm": 0.2097567617893219, + "learning_rate": 1.8818237145855004e-05, + "loss": 0.3887, "step": 82180 }, { - "epoch": 2.89, - "learning_rate": 1.9926309125306057e-05, - "loss": 0.2901, + "epoch": 2.961941831549357, + "grad_norm": 0.218689426779747, + "learning_rate": 1.881540967262702e-05, + "loss": 0.3997, "step": 82185 }, { - "epoch": 2.89, - "learning_rate": 1.9923519719990763e-05, - "loss": 0.2306, + "epoch": 2.9621220312105816, + "grad_norm": 0.19388610124588013, + "learning_rate": 1.8812582283662865e-05, + "loss": 0.3715, "step": 82190 }, { - "epoch": 2.89, - "learning_rate": 1.9920730380590212e-05, - "loss": 0.2629, + "epoch": 2.962302230871806, + "grad_norm": 0.19577103853225708, + "learning_rate": 1.88097549790011e-05, + "loss": 0.3694, "step": 82195 }, { - "epoch": 2.89, - "learning_rate": 1.9917941107140635e-05, - "loss": 0.2528, + "epoch": 2.9624824305330306, + "grad_norm": 0.21979494392871857, + "learning_rate": 1.8806927758680203e-05, + "loss": 0.4049, "step": 82200 }, { - "epoch": 2.89, - "learning_rate": 1.991515189967825e-05, - "loss": 0.2852, + "epoch": 2.9626626301942554, + "grad_norm": 0.19626237452030182, + "learning_rate": 1.880410062273873e-05, + "loss": 0.3738, "step": 82205 }, { - "epoch": 2.89, - "learning_rate": 1.9912362758239258e-05, - "loss": 0.2471, + "epoch": 2.9628428298554796, + "grad_norm": 0.21775542199611664, + "learning_rate": 1.8801273571215193e-05, + "loss": 0.3772, "step": 82210 }, { - "epoch": 2.89, - "learning_rate": 1.990957368285988e-05, - "loss": 0.2607, + "epoch": 2.9630230295167044, + "grad_norm": 0.16890302300453186, + "learning_rate": 1.8798446604148085e-05, + "loss": 0.3957, "step": 82215 }, { - "epoch": 2.89, - "learning_rate": 1.990678467357634e-05, - "loss": 0.2535, + "epoch": 2.963203229177929, + "grad_norm": 0.2454291582107544, + "learning_rate": 1.879561972157596e-05, + "loss": 0.367, "step": 82220 }, { - "epoch": 2.89, - "learning_rate": 1.9903995730424846e-05, - "loss": 0.2794, + "epoch": 2.963383428839154, + "grad_norm": 0.21003590524196625, + "learning_rate": 1.8792792923537304e-05, + "loss": 0.3988, "step": 82225 }, { - "epoch": 2.89, - "learning_rate": 1.9901206853441607e-05, - "loss": 0.2707, + "epoch": 2.9635636285003786, + "grad_norm": 0.21214927732944489, + "learning_rate": 1.878996621007064e-05, + "loss": 0.3633, "step": 82230 }, { - "epoch": 2.89, - "learning_rate": 1.989841804266283e-05, - "loss": 
0.2502, + "epoch": 2.9637438281616033, + "grad_norm": 0.24998506903648376, + "learning_rate": 1.878713958121449e-05, + "loss": 0.3908, "step": 82235 }, { - "epoch": 2.89, - "learning_rate": 1.9895629298124734e-05, - "loss": 0.2555, + "epoch": 2.9639240278228276, + "grad_norm": 0.21911552548408508, + "learning_rate": 1.8784313037007355e-05, + "loss": 0.3708, "step": 82240 }, { - "epoch": 2.89, - "learning_rate": 1.9892840619863535e-05, - "loss": 0.2574, + "epoch": 2.9641042274840523, + "grad_norm": 0.24015755951404572, + "learning_rate": 1.8781486577487763e-05, + "loss": 0.43, "step": 82245 }, { - "epoch": 2.89, - "learning_rate": 1.9890052007915435e-05, - "loss": 0.2736, + "epoch": 2.964284427145277, + "grad_norm": 0.17950226366519928, + "learning_rate": 1.8778660202694203e-05, + "loss": 0.3546, "step": 82250 }, { - "epoch": 2.89, - "learning_rate": 1.988726346231663e-05, - "loss": 0.2708, + "epoch": 2.9644646268065014, + "grad_norm": 0.14863833785057068, + "learning_rate": 1.877583391266519e-05, + "loss": 0.3852, "step": 82255 }, { - "epoch": 2.89, - "learning_rate": 1.988447498310335e-05, - "loss": 0.2763, + "epoch": 2.964644826467726, + "grad_norm": 0.207986518740654, + "learning_rate": 1.8773007707439245e-05, + "loss": 0.3883, "step": 82260 }, { - "epoch": 2.89, - "learning_rate": 1.9881686570311788e-05, - "loss": 0.2744, + "epoch": 2.964825026128951, + "grad_norm": 0.1966986507177353, + "learning_rate": 1.877018158705486e-05, + "loss": 0.3868, "step": 82265 }, { - "epoch": 2.89, - "learning_rate": 1.987889822397816e-05, - "loss": 0.2648, + "epoch": 2.9650052257901756, + "grad_norm": 0.19852174818515778, + "learning_rate": 1.876735555155054e-05, + "loss": 0.429, "step": 82270 }, { - "epoch": 2.89, - "learning_rate": 1.987610994413865e-05, - "loss": 0.2605, + "epoch": 2.9651854254514003, + "grad_norm": 0.2330932915210724, + "learning_rate": 1.87645296009648e-05, + "loss": 0.4144, "step": 82275 }, { - "epoch": 2.89, - "learning_rate": 1.9873321730829486e-05, - "loss": 0.264, + "epoch": 2.965365625112625, + "grad_norm": 0.2199668288230896, + "learning_rate": 1.8761703735336134e-05, + "loss": 0.4, "step": 82280 }, { - "epoch": 2.9, - "learning_rate": 1.987053358408686e-05, - "loss": 0.2632, + "epoch": 2.9655458247738493, + "grad_norm": 0.24273623526096344, + "learning_rate": 1.8758877954703047e-05, + "loss": 0.3756, "step": 82285 }, { - "epoch": 2.9, - "learning_rate": 1.9867745503946968e-05, - "loss": 0.2622, + "epoch": 2.965726024435074, + "grad_norm": 0.2339378446340561, + "learning_rate": 1.8756052259104048e-05, + "loss": 0.4029, "step": 82290 }, { - "epoch": 2.9, - "learning_rate": 1.9864957490446018e-05, - "loss": 0.2775, + "epoch": 2.965906224096299, + "grad_norm": 0.1760188341140747, + "learning_rate": 1.8753226648577613e-05, + "loss": 0.3748, "step": 82295 }, { - "epoch": 2.9, - "learning_rate": 1.9862169543620222e-05, - "loss": 0.2779, + "epoch": 2.966086423757523, + "grad_norm": 0.24585866928100586, + "learning_rate": 1.875040112316227e-05, + "loss": 0.4141, "step": 82300 }, { - "epoch": 2.9, - "learning_rate": 1.9859381663505763e-05, - "loss": 0.2398, + "epoch": 2.966266623418748, + "grad_norm": 0.2293357104063034, + "learning_rate": 1.8747575682896483e-05, + "loss": 0.3367, "step": 82305 }, { - "epoch": 2.9, - "learning_rate": 1.9856593850138837e-05, - "loss": 0.2947, + "epoch": 2.9664468230799725, + "grad_norm": 0.19390980899333954, + "learning_rate": 1.874475032781879e-05, + "loss": 0.3877, "step": 82310 }, { - "epoch": 2.9, - "learning_rate": 1.9853806103555662e-05, - 
"loss": 0.2575, + "epoch": 2.9666270227411973, + "grad_norm": 0.20741982758045197, + "learning_rate": 1.874192505796765e-05, + "loss": 0.4016, "step": 82315 }, { - "epoch": 2.9, - "learning_rate": 1.9851018423792416e-05, - "loss": 0.2662, + "epoch": 2.966807222402422, + "grad_norm": 0.21530795097351074, + "learning_rate": 1.873909987338157e-05, + "loss": 0.3675, "step": 82320 }, { - "epoch": 2.9, - "learning_rate": 1.984823081088531e-05, - "loss": 0.2516, + "epoch": 2.9669874220636467, + "grad_norm": 0.2282307744026184, + "learning_rate": 1.8736274774099044e-05, + "loss": 0.3606, "step": 82325 }, { - "epoch": 2.9, - "learning_rate": 1.9845443264870522e-05, - "loss": 0.2661, + "epoch": 2.967167621724871, + "grad_norm": 0.22964274883270264, + "learning_rate": 1.873344976015856e-05, + "loss": 0.3933, "step": 82330 }, { - "epoch": 2.9, - "learning_rate": 1.9842655785784265e-05, - "loss": 0.2357, + "epoch": 2.9673478213860958, + "grad_norm": 0.2597173750400543, + "learning_rate": 1.873062483159861e-05, + "loss": 0.3803, "step": 82335 }, { - "epoch": 2.9, - "learning_rate": 1.9839868373662722e-05, - "loss": 0.2718, + "epoch": 2.9675280210473205, + "grad_norm": 0.19982276856899261, + "learning_rate": 1.8727799988457682e-05, + "loss": 0.4066, "step": 82340 }, { - "epoch": 2.9, - "learning_rate": 1.9837081028542088e-05, - "loss": 0.271, + "epoch": 2.9677082207085452, + "grad_norm": 0.19702062010765076, + "learning_rate": 1.8724975230774265e-05, + "loss": 0.3729, "step": 82345 }, { - "epoch": 2.9, - "learning_rate": 1.983429375045855e-05, - "loss": 0.2681, + "epoch": 2.9678884203697695, + "grad_norm": 0.19733615219593048, + "learning_rate": 1.872215055858685e-05, + "loss": 0.4182, "step": 82350 }, { - "epoch": 2.9, - "learning_rate": 1.9831506539448302e-05, - "loss": 0.2809, + "epoch": 2.9680686200309943, + "grad_norm": 0.21504078805446625, + "learning_rate": 1.8719325971933912e-05, + "loss": 0.3605, "step": 82355 }, { - "epoch": 2.9, - "learning_rate": 1.9828719395547545e-05, - "loss": 0.2644, + "epoch": 2.968248819692219, + "grad_norm": 0.19036151468753815, + "learning_rate": 1.8716501470853942e-05, + "loss": 0.4204, "step": 82360 }, { - "epoch": 2.9, - "learning_rate": 1.9825932318792454e-05, - "loss": 0.2748, + "epoch": 2.9684290193534437, + "grad_norm": 0.2703343331813812, + "learning_rate": 1.8713677055385425e-05, + "loss": 0.3874, "step": 82365 }, { - "epoch": 2.9, - "learning_rate": 1.9823145309219213e-05, - "loss": 0.2707, + "epoch": 2.9686092190146685, + "grad_norm": 0.20502404868602753, + "learning_rate": 1.8710852725566847e-05, + "loss": 0.3794, "step": 82370 }, { - "epoch": 2.9, - "learning_rate": 1.9820358366864026e-05, - "loss": 0.2676, + "epoch": 2.9687894186758927, + "grad_norm": 0.2101486623287201, + "learning_rate": 1.8708028481436672e-05, + "loss": 0.4003, "step": 82375 }, { - "epoch": 2.9, - "learning_rate": 1.9817571491763076e-05, - "loss": 0.249, + "epoch": 2.9689696183371175, + "grad_norm": 0.19613702595233917, + "learning_rate": 1.8705204323033398e-05, + "loss": 0.3774, "step": 82380 }, { - "epoch": 2.9, - "learning_rate": 1.9814784683952544e-05, - "loss": 0.2791, + "epoch": 2.969149817998342, + "grad_norm": 0.22090743482112885, + "learning_rate": 1.870238025039549e-05, + "loss": 0.4098, "step": 82385 }, { - "epoch": 2.9, - "learning_rate": 1.9811997943468605e-05, - "loss": 0.2651, + "epoch": 2.969330017659567, + "grad_norm": 0.2624460756778717, + "learning_rate": 1.869955626356144e-05, + "loss": 0.3689, "step": 82390 }, { - "epoch": 2.9, - "learning_rate": 
1.9809211270347465e-05, - "loss": 0.2612, + "epoch": 2.9695102173207912, + "grad_norm": 0.2443309724330902, + "learning_rate": 1.869673236256972e-05, + "loss": 0.4143, "step": 82395 }, { - "epoch": 2.9, - "learning_rate": 1.980642466462529e-05, - "loss": 0.27, + "epoch": 2.969690416982016, + "grad_norm": 0.2217528223991394, + "learning_rate": 1.8693908547458782e-05, + "loss": 0.386, "step": 82400 }, { - "epoch": 2.9, - "learning_rate": 1.980363812633827e-05, - "loss": 0.2555, + "epoch": 2.9698706166432407, + "grad_norm": 0.19909748435020447, + "learning_rate": 1.869108481826714e-05, + "loss": 0.4024, "step": 82405 }, { - "epoch": 2.9, - "learning_rate": 1.980085165552258e-05, - "loss": 0.2816, + "epoch": 2.9700508163044654, + "grad_norm": 0.23337341845035553, + "learning_rate": 1.8688261175033238e-05, + "loss": 0.4011, "step": 82410 }, { - "epoch": 2.9, - "learning_rate": 1.9798065252214415e-05, - "loss": 0.2741, + "epoch": 2.97023101596569, + "grad_norm": 0.20584703981876373, + "learning_rate": 1.868543761779555e-05, + "loss": 0.4166, "step": 82415 }, { - "epoch": 2.9, - "learning_rate": 1.979527891644994e-05, - "loss": 0.2858, + "epoch": 2.9704112156269145, + "grad_norm": 0.17910712957382202, + "learning_rate": 1.8682614146592558e-05, + "loss": 0.3962, "step": 82420 }, { - "epoch": 2.9, - "learning_rate": 1.979249264826533e-05, - "loss": 0.263, + "epoch": 2.970591415288139, + "grad_norm": 0.16632866859436035, + "learning_rate": 1.8679790761462717e-05, + "loss": 0.388, "step": 82425 }, { - "epoch": 2.9, - "learning_rate": 1.9789706447696785e-05, - "loss": 0.2587, + "epoch": 2.970771614949364, + "grad_norm": 0.24808615446090698, + "learning_rate": 1.8676967462444513e-05, + "loss": 0.3801, "step": 82430 }, { - "epoch": 2.9, - "learning_rate": 1.9786920314780462e-05, - "loss": 0.2738, + "epoch": 2.9709518146105887, + "grad_norm": 0.2281518578529358, + "learning_rate": 1.8674144249576403e-05, + "loss": 0.363, "step": 82435 }, { - "epoch": 2.9, - "learning_rate": 1.9784134249552544e-05, - "loss": 0.2599, + "epoch": 2.971132014271813, + "grad_norm": 0.14790934324264526, + "learning_rate": 1.8671321122896847e-05, + "loss": 0.3396, "step": 82440 }, { - "epoch": 2.9, - "learning_rate": 1.9781348252049196e-05, - "loss": 0.2548, + "epoch": 2.9713122139330377, + "grad_norm": 0.2927576005458832, + "learning_rate": 1.866849808244432e-05, + "loss": 0.3867, "step": 82445 }, { - "epoch": 2.9, - "learning_rate": 1.977856232230661e-05, - "loss": 0.2498, + "epoch": 2.9714924135942624, + "grad_norm": 0.24208620190620422, + "learning_rate": 1.866567512825728e-05, + "loss": 0.4101, "step": 82450 }, { - "epoch": 2.9, - "learning_rate": 1.9775776460360952e-05, - "loss": 0.2721, + "epoch": 2.971672613255487, + "grad_norm": 0.19803206622600555, + "learning_rate": 1.8662852260374183e-05, + "loss": 0.3986, "step": 82455 }, { - "epoch": 2.9, - "learning_rate": 1.97729906662484e-05, - "loss": 0.2808, + "epoch": 2.971852812916712, + "grad_norm": 0.22639377415180206, + "learning_rate": 1.866002947883351e-05, + "loss": 0.4211, "step": 82460 }, { - "epoch": 2.9, - "learning_rate": 1.9770204940005103e-05, - "loss": 0.2835, + "epoch": 2.9720330125779366, + "grad_norm": 0.20173002779483795, + "learning_rate": 1.865720678367369e-05, + "loss": 0.3984, "step": 82465 }, { - "epoch": 2.9, - "learning_rate": 1.976741928166726e-05, - "loss": 0.2708, + "epoch": 2.972213212239161, + "grad_norm": 0.28004732728004456, + "learning_rate": 1.8654384174933214e-05, + "loss": 0.4313, "step": 82470 }, { - "epoch": 2.9, - "learning_rate": 
1.976463369127103e-05, - "loss": 0.2586, + "epoch": 2.9723934119003856, + "grad_norm": 0.19623763859272003, + "learning_rate": 1.865156165265053e-05, + "loss": 0.408, "step": 82475 }, { - "epoch": 2.9, - "learning_rate": 1.9761848168852575e-05, - "loss": 0.2569, + "epoch": 2.9725736115616104, + "grad_norm": 0.17043015360832214, + "learning_rate": 1.864873921686407e-05, + "loss": 0.3979, "step": 82480 }, { - "epoch": 2.9, - "learning_rate": 1.975906271444807e-05, - "loss": 0.2628, + "epoch": 2.9727538112228347, + "grad_norm": 0.2070167362689972, + "learning_rate": 1.864591686761233e-05, + "loss": 0.3852, "step": 82485 }, { - "epoch": 2.9, - "learning_rate": 1.9756277328093685e-05, - "loss": 0.3001, + "epoch": 2.9729340108840594, + "grad_norm": 0.21379338204860687, + "learning_rate": 1.864309460493372e-05, + "loss": 0.4143, "step": 82490 }, { - "epoch": 2.9, - "learning_rate": 1.9753492009825587e-05, - "loss": 0.2645, + "epoch": 2.973114210545284, + "grad_norm": 0.2483922243118286, + "learning_rate": 1.864027242886673e-05, + "loss": 0.4423, "step": 82495 }, { - "epoch": 2.9, - "learning_rate": 1.9750706759679934e-05, - "loss": 0.2419, + "epoch": 2.973294410206509, + "grad_norm": 0.2155703455209732, + "learning_rate": 1.86374503394498e-05, + "loss": 0.4044, "step": 82500 }, { - "epoch": 2.9, - "eval_loss": 0.25884824991226196, - "eval_runtime": 10.5461, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 2.973294410206509, + "eval_loss": 0.4298676550388336, + "eval_runtime": 3.5326, + "eval_samples_per_second": 28.308, + "eval_steps_per_second": 7.077, "step": 82500 }, { - "epoch": 2.9, - "learning_rate": 1.9747921577692884e-05, - "loss": 0.266, + "epoch": 2.9734746098677336, + "grad_norm": 0.24112021923065186, + "learning_rate": 1.863462833672136e-05, + "loss": 0.3763, "step": 82505 }, { - "epoch": 2.9, - "learning_rate": 1.9745136463900626e-05, - "loss": 0.2801, + "epoch": 2.9736548095289583, + "grad_norm": 0.19381298124790192, + "learning_rate": 1.8631806420719896e-05, + "loss": 0.3785, "step": 82510 }, { - "epoch": 2.9, - "learning_rate": 1.97423514183393e-05, - "loss": 0.2743, + "epoch": 2.9738350091901826, + "grad_norm": 0.21174710988998413, + "learning_rate": 1.862898459148383e-05, + "loss": 0.3688, "step": 82515 }, { - "epoch": 2.9, - "learning_rate": 1.9739566441045072e-05, - "loss": 0.2633, + "epoch": 2.9740152088514074, + "grad_norm": 0.18652644753456116, + "learning_rate": 1.862616284905161e-05, + "loss": 0.3974, "step": 82520 }, { - "epoch": 2.9, - "learning_rate": 1.9736781532054115e-05, - "loss": 0.2905, + "epoch": 2.974195408512632, + "grad_norm": 0.24465037882328033, + "learning_rate": 1.8623341193461696e-05, + "loss": 0.4212, "step": 82525 }, { - "epoch": 2.9, - "learning_rate": 1.9733996691402583e-05, - "loss": 0.2409, + "epoch": 2.9743756081738564, + "grad_norm": 0.19063496589660645, + "learning_rate": 1.8620519624752512e-05, + "loss": 0.3724, "step": 82530 }, { - "epoch": 2.9, - "learning_rate": 1.9731211919126634e-05, - "loss": 0.275, + "epoch": 2.974555807835081, + "grad_norm": 0.21079134941101074, + "learning_rate": 1.8617698142962523e-05, + "loss": 0.401, "step": 82535 }, { - "epoch": 2.9, - "learning_rate": 1.9728427215262415e-05, - "loss": 0.2609, + "epoch": 2.974736007496306, + "grad_norm": 0.22930555045604706, + "learning_rate": 1.861487674813016e-05, + "loss": 0.3471, "step": 82540 }, { - "epoch": 2.9, - "learning_rate": 1.97256425798461e-05, - "loss": 0.272, + "epoch": 2.9749162071575306, + "grad_norm": 0.22107750177383423, + 
"learning_rate": 1.8612055440293856e-05, + "loss": 0.3809, "step": 82545 }, { - "epoch": 2.9, - "learning_rate": 1.9722858012913843e-05, - "loss": 0.2581, + "epoch": 2.9750964068187553, + "grad_norm": 0.21329925954341888, + "learning_rate": 1.8609234219492072e-05, + "loss": 0.3947, "step": 82550 }, { - "epoch": 2.9, - "learning_rate": 1.9720073514501802e-05, - "loss": 0.2868, + "epoch": 2.97527660647998, + "grad_norm": 0.1962546557188034, + "learning_rate": 1.860641308576323e-05, + "loss": 0.3516, "step": 82555 }, { - "epoch": 2.9, - "learning_rate": 1.971728908464611e-05, - "loss": 0.2457, + "epoch": 2.9754568061412043, + "grad_norm": 0.19978949427604675, + "learning_rate": 1.8603592039145767e-05, + "loss": 0.3896, "step": 82560 }, { - "epoch": 2.9, - "learning_rate": 1.9714504723382952e-05, - "loss": 0.2821, + "epoch": 2.975637005802429, + "grad_norm": 0.23347966372966766, + "learning_rate": 1.860077107967813e-05, + "loss": 0.3658, "step": 82565 }, { - "epoch": 2.91, - "learning_rate": 1.9711720430748466e-05, - "loss": 0.2635, + "epoch": 2.975817205463654, + "grad_norm": 0.19242475926876068, + "learning_rate": 1.8597950207398745e-05, + "loss": 0.4014, "step": 82570 }, { - "epoch": 2.91, - "learning_rate": 1.9708936206778803e-05, - "loss": 0.2541, + "epoch": 2.975997405124878, + "grad_norm": 0.20746643841266632, + "learning_rate": 1.8595129422346054e-05, + "loss": 0.3827, "step": 82575 }, { - "epoch": 2.91, - "learning_rate": 1.9706152051510106e-05, - "loss": 0.2585, + "epoch": 2.976177604786103, + "grad_norm": 0.20090548694133759, + "learning_rate": 1.8592308724558485e-05, + "loss": 0.3629, "step": 82580 }, { - "epoch": 2.91, - "learning_rate": 1.970336796497855e-05, - "loss": 0.2627, + "epoch": 2.9763578044473276, + "grad_norm": 0.17902597784996033, + "learning_rate": 1.8589488114074456e-05, + "loss": 0.3842, "step": 82585 }, { - "epoch": 2.91, - "learning_rate": 1.9700583947220268e-05, - "loss": 0.2771, + "epoch": 2.9765380041085523, + "grad_norm": 0.2532462477684021, + "learning_rate": 1.8586667590932426e-05, + "loss": 0.4095, "step": 82590 }, { - "epoch": 2.91, - "learning_rate": 1.969779999827141e-05, - "loss": 0.2966, + "epoch": 2.976718203769777, + "grad_norm": 0.26254743337631226, + "learning_rate": 1.8583847155170803e-05, + "loss": 0.3585, "step": 82595 }, { - "epoch": 2.91, - "learning_rate": 1.9695016118168117e-05, - "loss": 0.2555, + "epoch": 2.9768984034310018, + "grad_norm": 0.21005457639694214, + "learning_rate": 1.858102680682802e-05, + "loss": 0.3766, "step": 82600 }, { - "epoch": 2.91, - "learning_rate": 1.969223230694655e-05, - "loss": 0.2502, + "epoch": 2.977078603092226, + "grad_norm": 0.23283547163009644, + "learning_rate": 1.85782065459425e-05, + "loss": 0.3884, "step": 82605 }, { - "epoch": 2.91, - "learning_rate": 1.9689448564642855e-05, - "loss": 0.238, + "epoch": 2.977258802753451, + "grad_norm": 0.21179614961147308, + "learning_rate": 1.8575386372552673e-05, + "loss": 0.3647, "step": 82610 }, { - "epoch": 2.91, - "learning_rate": 1.9686664891293167e-05, - "loss": 0.275, + "epoch": 2.9774390024146755, + "grad_norm": 0.1830110400915146, + "learning_rate": 1.857256628669697e-05, + "loss": 0.3991, "step": 82615 }, { - "epoch": 2.91, - "learning_rate": 1.9683881286933625e-05, - "loss": 0.2512, + "epoch": 2.9776192020759003, + "grad_norm": 0.24650508165359497, + "learning_rate": 1.8569746288413802e-05, + "loss": 0.363, "step": 82620 }, { - "epoch": 2.91, - "learning_rate": 1.9681097751600387e-05, - "loss": 0.2571, + "epoch": 2.9777994017371245, + "grad_norm": 
0.23117853701114655, + "learning_rate": 1.856692637774159e-05, + "loss": 0.3747, "step": 82625 }, { - "epoch": 2.91, - "learning_rate": 1.967831428532959e-05, - "loss": 0.2597, + "epoch": 2.9779796013983493, + "grad_norm": 0.18374405801296234, + "learning_rate": 1.8564106554718767e-05, + "loss": 0.3854, "step": 82630 }, { - "epoch": 2.91, - "learning_rate": 1.9675530888157366e-05, - "loss": 0.2538, + "epoch": 2.978159801059574, + "grad_norm": 0.1933712363243103, + "learning_rate": 1.856128681938375e-05, + "loss": 0.3719, "step": 82635 }, { - "epoch": 2.91, - "learning_rate": 1.9672747560119877e-05, - "loss": 0.2731, + "epoch": 2.9783400007207987, + "grad_norm": 0.24124352633953094, + "learning_rate": 1.8558467171774946e-05, + "loss": 0.4093, "step": 82640 }, { - "epoch": 2.91, - "learning_rate": 1.9669964301253252e-05, - "loss": 0.2618, + "epoch": 2.9785202003820235, + "grad_norm": 0.22895336151123047, + "learning_rate": 1.8555647611930788e-05, + "loss": 0.3979, "step": 82645 }, { - "epoch": 2.91, - "learning_rate": 1.9667181111593618e-05, - "loss": 0.2696, + "epoch": 2.9787004000432478, + "grad_norm": 0.240083709359169, + "learning_rate": 1.8552828139889673e-05, + "loss": 0.403, "step": 82650 }, { - "epoch": 2.91, - "learning_rate": 1.966439799117712e-05, - "loss": 0.2296, + "epoch": 2.9788805997044725, + "grad_norm": 0.18165172636508942, + "learning_rate": 1.8550008755690034e-05, + "loss": 0.3757, "step": 82655 }, { - "epoch": 2.91, - "learning_rate": 1.966161494003991e-05, - "loss": 0.263, + "epoch": 2.9790607993656972, + "grad_norm": 0.2198115587234497, + "learning_rate": 1.8547189459370275e-05, + "loss": 0.374, "step": 82660 }, { - "epoch": 2.91, - "learning_rate": 1.9658831958218107e-05, - "loss": 0.2833, + "epoch": 2.979240999026922, + "grad_norm": 0.19602136313915253, + "learning_rate": 1.8544370250968808e-05, + "loss": 0.3852, "step": 82665 }, { - "epoch": 2.91, - "learning_rate": 1.9656049045747853e-05, - "loss": 0.2817, + "epoch": 2.9794211986881463, + "grad_norm": 0.24065439403057098, + "learning_rate": 1.854155113052405e-05, + "loss": 0.3971, "step": 82670 }, { - "epoch": 2.91, - "learning_rate": 1.965326620266527e-05, - "loss": 0.2714, + "epoch": 2.979601398349371, + "grad_norm": 0.20084550976753235, + "learning_rate": 1.853873209807441e-05, + "loss": 0.3652, "step": 82675 }, { - "epoch": 2.91, - "learning_rate": 1.9650483429006512e-05, - "loss": 0.2619, + "epoch": 2.9797815980105957, + "grad_norm": 0.18546749651432037, + "learning_rate": 1.8535913153658284e-05, + "loss": 0.3908, "step": 82680 }, { - "epoch": 2.91, - "learning_rate": 1.96477007248077e-05, - "loss": 0.2632, + "epoch": 2.9799617976718205, + "grad_norm": 0.19629612565040588, + "learning_rate": 1.8533094297314108e-05, + "loss": 0.3711, "step": 82685 }, { - "epoch": 2.91, - "learning_rate": 1.9644918090104968e-05, - "loss": 0.2504, + "epoch": 2.980141997333045, + "grad_norm": 0.19778327643871307, + "learning_rate": 1.8530275529080248e-05, + "loss": 0.4318, "step": 82690 }, { - "epoch": 2.91, - "learning_rate": 1.9642135524934434e-05, - "loss": 0.2637, + "epoch": 2.98032219699427, + "grad_norm": 0.19688297808170319, + "learning_rate": 1.8527456848995148e-05, + "loss": 0.3831, "step": 82695 }, { - "epoch": 2.91, - "learning_rate": 1.9639353029332255e-05, - "loss": 0.2801, + "epoch": 2.980502396655494, + "grad_norm": 0.22742833197116852, + "learning_rate": 1.852463825709719e-05, + "loss": 0.4061, "step": 82700 }, { - "epoch": 2.91, - "learning_rate": 1.9636570603334537e-05, - "loss": 0.2835, + "epoch": 
2.980682596316719, + "grad_norm": 0.20782898366451263, + "learning_rate": 1.852181975342478e-05, + "loss": 0.3998, "step": 82705 }, { - "epoch": 2.91, - "learning_rate": 1.9633788246977415e-05, - "loss": 0.2811, + "epoch": 2.9808627959779437, + "grad_norm": 0.20285415649414062, + "learning_rate": 1.8519001338016324e-05, + "loss": 0.3738, "step": 82710 }, { - "epoch": 2.91, - "learning_rate": 1.963100596029701e-05, - "loss": 0.2644, + "epoch": 2.981042995639168, + "grad_norm": 0.26227110624313354, + "learning_rate": 1.8516183010910216e-05, + "loss": 0.3849, "step": 82715 }, { - "epoch": 2.91, - "learning_rate": 1.9628223743329466e-05, - "loss": 0.2563, + "epoch": 2.9812231953003927, + "grad_norm": 0.20241287350654602, + "learning_rate": 1.8513364772144863e-05, + "loss": 0.3869, "step": 82720 }, { - "epoch": 2.91, - "learning_rate": 1.9625441596110894e-05, - "loss": 0.2486, + "epoch": 2.9814033949616174, + "grad_norm": 0.25886163115501404, + "learning_rate": 1.851054662175866e-05, + "loss": 0.43, "step": 82725 }, { - "epoch": 2.91, - "learning_rate": 1.9622659518677416e-05, - "loss": 0.2612, + "epoch": 2.981583594622842, + "grad_norm": 0.20351019501686096, + "learning_rate": 1.8507728559789996e-05, + "loss": 0.3872, "step": 82730 }, { - "epoch": 2.91, - "learning_rate": 1.9619877511065157e-05, - "loss": 0.2785, + "epoch": 2.981763794284067, + "grad_norm": 0.27192798256874084, + "learning_rate": 1.850491058627728e-05, + "loss": 0.3813, "step": 82735 }, { - "epoch": 2.91, - "learning_rate": 1.961709557331025e-05, - "loss": 0.2814, + "epoch": 2.9819439939452916, + "grad_norm": 0.19435738027095795, + "learning_rate": 1.85020927012589e-05, + "loss": 0.3965, "step": 82740 }, { - "epoch": 2.91, - "learning_rate": 1.9614313705448814e-05, - "loss": 0.2843, + "epoch": 2.982124193606516, + "grad_norm": 0.20769937336444855, + "learning_rate": 1.8499274904773245e-05, + "loss": 0.3737, "step": 82745 }, { - "epoch": 2.91, - "learning_rate": 1.961153190751695e-05, - "loss": 0.2801, + "epoch": 2.9823043932677407, + "grad_norm": 0.22676123678684235, + "learning_rate": 1.8496457196858714e-05, + "loss": 0.3807, "step": 82750 }, { - "epoch": 2.91, - "learning_rate": 1.9608750179550806e-05, - "loss": 0.2575, + "epoch": 2.9824845929289654, + "grad_norm": 0.18023423850536346, + "learning_rate": 1.849363957755369e-05, + "loss": 0.4336, "step": 82755 }, { - "epoch": 2.91, - "learning_rate": 1.960596852158648e-05, - "loss": 0.272, + "epoch": 2.9826647925901897, + "grad_norm": 0.2146742194890976, + "learning_rate": 1.849082204689657e-05, + "loss": 0.3597, "step": 82760 }, { - "epoch": 2.91, - "learning_rate": 1.960318693366011e-05, - "loss": 0.2707, + "epoch": 2.9828449922514144, + "grad_norm": 0.24750038981437683, + "learning_rate": 1.848800460492575e-05, + "loss": 0.3814, "step": 82765 }, { - "epoch": 2.91, - "learning_rate": 1.9600405415807786e-05, - "loss": 0.2527, + "epoch": 2.983025191912639, + "grad_norm": 0.20825496315956116, + "learning_rate": 1.8485187251679586e-05, + "loss": 0.4108, "step": 82770 }, { - "epoch": 2.91, - "learning_rate": 1.9597623968065647e-05, - "loss": 0.2772, + "epoch": 2.983205391573864, + "grad_norm": 0.23861125111579895, + "learning_rate": 1.8482369987196503e-05, + "loss": 0.4002, "step": 82775 }, { - "epoch": 2.91, - "learning_rate": 1.9594842590469807e-05, - "loss": 0.2358, + "epoch": 2.9833855912350886, + "grad_norm": 0.2297501564025879, + "learning_rate": 1.847955281151486e-05, + "loss": 0.4099, "step": 82780 }, { - "epoch": 2.91, - "learning_rate": 1.9592061283056363e-05, - "loss": 
0.2604, + "epoch": 2.9835657908963134, + "grad_norm": 0.2385878562927246, + "learning_rate": 1.847673572467304e-05, + "loss": 0.4512, "step": 82785 }, { - "epoch": 2.91, - "learning_rate": 1.958928004586144e-05, - "loss": 0.2802, + "epoch": 2.9837459905575376, + "grad_norm": 0.21177738904953003, + "learning_rate": 1.8473918726709442e-05, + "loss": 0.394, "step": 82790 }, { - "epoch": 2.91, - "learning_rate": 1.9586498878921153e-05, - "loss": 0.2797, + "epoch": 2.9839261902187624, + "grad_norm": 0.17175810039043427, + "learning_rate": 1.8471101817662428e-05, + "loss": 0.4103, "step": 82795 }, { - "epoch": 2.91, - "learning_rate": 1.9583717782271614e-05, - "loss": 0.2408, + "epoch": 2.984106389879987, + "grad_norm": 0.22006794810295105, + "learning_rate": 1.8468284997570404e-05, + "loss": 0.3797, "step": 82800 }, { - "epoch": 2.91, - "learning_rate": 1.9580936755948932e-05, - "loss": 0.2661, + "epoch": 2.9842865895412114, + "grad_norm": 0.20512345433235168, + "learning_rate": 1.8465468266471724e-05, + "loss": 0.4287, "step": 82805 }, { - "epoch": 2.91, - "learning_rate": 1.95781557999892e-05, - "loss": 0.2735, + "epoch": 2.984466789202436, + "grad_norm": 0.24116021394729614, + "learning_rate": 1.846265162440477e-05, + "loss": 0.3939, "step": 82810 }, { - "epoch": 2.91, - "learning_rate": 1.9575374914428548e-05, - "loss": 0.2756, + "epoch": 2.984646988863661, + "grad_norm": 0.21826866269111633, + "learning_rate": 1.8459835071407928e-05, + "loss": 0.3739, "step": 82815 }, { - "epoch": 2.91, - "learning_rate": 1.9572594099303086e-05, - "loss": 0.2808, + "epoch": 2.9848271885248856, + "grad_norm": 0.20197410881519318, + "learning_rate": 1.8457018607519567e-05, + "loss": 0.4008, "step": 82820 }, { - "epoch": 2.91, - "learning_rate": 1.956981335464891e-05, - "loss": 0.2507, + "epoch": 2.9850073881861103, + "grad_norm": 0.21542875468730927, + "learning_rate": 1.845420223277805e-05, + "loss": 0.3851, "step": 82825 }, { - "epoch": 2.91, - "learning_rate": 1.956703268050212e-05, - "loss": 0.2484, + "epoch": 2.985187587847335, + "grad_norm": 0.19995024800300598, + "learning_rate": 1.8451385947221773e-05, + "loss": 0.3678, "step": 82830 }, { - "epoch": 2.91, - "learning_rate": 1.9564252076898842e-05, - "loss": 0.2662, + "epoch": 2.9853677875085594, + "grad_norm": 0.20630434155464172, + "learning_rate": 1.8448569750889083e-05, + "loss": 0.3832, "step": 82835 }, { - "epoch": 2.91, - "learning_rate": 1.956147154387516e-05, - "loss": 0.2683, + "epoch": 2.985547987169784, + "grad_norm": 0.20057667791843414, + "learning_rate": 1.844575364381837e-05, + "loss": 0.394, "step": 82840 }, { - "epoch": 2.91, - "learning_rate": 1.9558691081467193e-05, - "loss": 0.2644, + "epoch": 2.985728186831009, + "grad_norm": 0.2470291405916214, + "learning_rate": 1.8442937626047995e-05, + "loss": 0.3905, "step": 82845 }, { - "epoch": 2.91, - "learning_rate": 1.9555910689711025e-05, - "loss": 0.256, + "epoch": 2.9859083864922336, + "grad_norm": 0.20389066636562347, + "learning_rate": 1.844012169761632e-05, + "loss": 0.4083, "step": 82850 }, { - "epoch": 2.92, - "learning_rate": 1.955313036864278e-05, - "loss": 0.2545, + "epoch": 2.986088586153458, + "grad_norm": 0.26393190026283264, + "learning_rate": 1.843730585856172e-05, + "loss": 0.4205, "step": 82855 }, { - "epoch": 2.92, - "learning_rate": 1.955035011829855e-05, - "loss": 0.2803, + "epoch": 2.9862687858146826, + "grad_norm": 0.2350768744945526, + "learning_rate": 1.8434490108922566e-05, + "loss": 0.3914, "step": 82860 }, { - "epoch": 2.92, - "learning_rate": 
1.9547569938714415e-05, - "loss": 0.2853, + "epoch": 2.9864489854759073, + "grad_norm": 0.20153537392616272, + "learning_rate": 1.8431674448737195e-05, + "loss": 0.3954, "step": 82865 }, { - "epoch": 2.92, - "learning_rate": 1.9544789829926507e-05, - "loss": 0.2946, + "epoch": 2.986629185137132, + "grad_norm": 0.22236128151416779, + "learning_rate": 1.8428858878044007e-05, + "loss": 0.4122, "step": 82870 }, { - "epoch": 2.92, - "learning_rate": 1.95420097919709e-05, - "loss": 0.2517, + "epoch": 2.986809384798357, + "grad_norm": 0.18074600398540497, + "learning_rate": 1.8426043396881326e-05, + "loss": 0.3938, "step": 82875 }, { - "epoch": 2.92, - "learning_rate": 1.953922982488371e-05, - "loss": 0.2771, + "epoch": 2.986989584459581, + "grad_norm": 0.21660077571868896, + "learning_rate": 1.8423228005287546e-05, + "loss": 0.4038, "step": 82880 }, { - "epoch": 2.92, - "learning_rate": 1.9536449928701007e-05, - "loss": 0.2789, + "epoch": 2.987169784120806, + "grad_norm": 0.23685570061206818, + "learning_rate": 1.8420412703301004e-05, + "loss": 0.3751, "step": 82885 }, { - "epoch": 2.92, - "learning_rate": 1.9533670103458912e-05, - "loss": 0.2807, + "epoch": 2.9873499837820305, + "grad_norm": 0.20489926636219025, + "learning_rate": 1.8417597490960066e-05, + "loss": 0.4195, "step": 82890 }, { - "epoch": 2.92, - "learning_rate": 1.9530890349193502e-05, - "loss": 0.2674, + "epoch": 2.9875301834432553, + "grad_norm": 0.23835612833499908, + "learning_rate": 1.8414782368303088e-05, + "loss": 0.3734, "step": 82895 }, { - "epoch": 2.92, - "learning_rate": 1.9528110665940882e-05, - "loss": 0.2566, + "epoch": 2.9877103831044796, + "grad_norm": 0.24969185888767242, + "learning_rate": 1.841196733536842e-05, + "loss": 0.3929, "step": 82900 }, { - "epoch": 2.92, - "learning_rate": 1.9525331053737128e-05, - "loss": 0.2781, + "epoch": 2.9878905827657043, + "grad_norm": 0.22482214868068695, + "learning_rate": 1.8409152392194427e-05, + "loss": 0.4001, "step": 82905 }, { - "epoch": 2.92, - "learning_rate": 1.9522551512618354e-05, - "loss": 0.2592, + "epoch": 2.988070782426929, + "grad_norm": 0.19898438453674316, + "learning_rate": 1.8406337538819462e-05, + "loss": 0.3767, "step": 82910 }, { - "epoch": 2.92, - "learning_rate": 1.9519772042620637e-05, - "loss": 0.2896, + "epoch": 2.9882509820881538, + "grad_norm": 0.2599036991596222, + "learning_rate": 1.840352277528186e-05, + "loss": 0.4169, "step": 82915 }, { - "epoch": 2.92, - "learning_rate": 1.9516992643780068e-05, - "loss": 0.2845, + "epoch": 2.9884311817493785, + "grad_norm": 0.1831098049879074, + "learning_rate": 1.840070810161999e-05, + "loss": 0.3763, "step": 82920 }, { - "epoch": 2.92, - "learning_rate": 1.9514213316132732e-05, - "loss": 0.256, + "epoch": 2.988611381410603, + "grad_norm": 0.26548436284065247, + "learning_rate": 1.8397893517872196e-05, + "loss": 0.3815, "step": 82925 }, { - "epoch": 2.92, - "learning_rate": 1.951143405971472e-05, - "loss": 0.2472, + "epoch": 2.9887915810718275, + "grad_norm": 0.24817430973052979, + "learning_rate": 1.839507902407682e-05, + "loss": 0.4, "step": 82930 }, { - "epoch": 2.92, - "learning_rate": 1.9508654874562127e-05, - "loss": 0.2716, + "epoch": 2.9889717807330523, + "grad_norm": NaN, + "learning_rate": 1.839282749383203e-05, + "loss": 0.4199, "step": 82935 }, { - "epoch": 2.92, - "learning_rate": 1.950587576071103e-05, - "loss": 0.2728, + "epoch": 2.989151980394277, + "grad_norm": 0.2185213267803192, + "learning_rate": 1.839001316204766e-05, + "loss": 0.3663, "step": 82940 }, { - "epoch": 2.92, - 
"learning_rate": 1.9503096718197507e-05, - "loss": 0.279, + "epoch": 2.9893321800555013, + "grad_norm": 0.24561047554016113, + "learning_rate": 1.8387198920323063e-05, + "loss": 0.4042, "step": 82945 }, { - "epoch": 2.92, - "learning_rate": 1.9500317747057662e-05, - "loss": 0.2568, + "epoch": 2.989512379716726, + "grad_norm": 0.1906471997499466, + "learning_rate": 1.8384384768696615e-05, + "loss": 0.3998, "step": 82950 }, { - "epoch": 2.92, - "learning_rate": 1.9497538847327557e-05, - "loss": 0.2763, + "epoch": 2.9896925793779507, + "grad_norm": 0.2308565229177475, + "learning_rate": 1.8381570707206637e-05, + "loss": 0.3625, "step": 82955 }, { - "epoch": 2.92, - "learning_rate": 1.9494760019043292e-05, - "loss": 0.2828, + "epoch": 2.9898727790391755, + "grad_norm": 0.2136189490556717, + "learning_rate": 1.837875673589147e-05, + "loss": 0.3868, "step": 82960 }, { - "epoch": 2.92, - "learning_rate": 1.949198126224093e-05, - "loss": 0.3088, + "epoch": 2.9900529787004, + "grad_norm": 0.21687231957912445, + "learning_rate": 1.8375942854789475e-05, + "loss": 0.3923, "step": 82965 }, { - "epoch": 2.92, - "learning_rate": 1.948920257695657e-05, - "loss": 0.253, + "epoch": 2.990233178361625, + "grad_norm": 0.24081559479236603, + "learning_rate": 1.837312906393896e-05, + "loss": 0.3877, "step": 82970 }, { - "epoch": 2.92, - "learning_rate": 1.9486423963226284e-05, - "loss": 0.2667, + "epoch": 2.9904133780228492, + "grad_norm": 0.19234254956245422, + "learning_rate": 1.8370315363378293e-05, + "loss": 0.4074, "step": 82975 }, { - "epoch": 2.92, - "learning_rate": 1.9483645421086137e-05, - "loss": 0.2419, + "epoch": 2.990593577684074, + "grad_norm": 0.22776223719120026, + "learning_rate": 1.8367501753145792e-05, + "loss": 0.4108, "step": 82980 }, { - "epoch": 2.92, - "learning_rate": 1.9480866950572227e-05, - "loss": 0.2721, + "epoch": 2.9907737773452987, + "grad_norm": 0.23385420441627502, + "learning_rate": 1.836468823327979e-05, + "loss": 0.3966, "step": 82985 }, { - "epoch": 2.92, - "learning_rate": 1.9478088551720626e-05, - "loss": 0.2733, + "epoch": 2.990953977006523, + "grad_norm": 0.25053873658180237, + "learning_rate": 1.836187480381863e-05, + "loss": 0.4033, "step": 82990 }, { - "epoch": 2.92, - "learning_rate": 1.9475310224567405e-05, - "loss": 0.2441, + "epoch": 2.9911341766677477, + "grad_norm": 0.22640636563301086, + "learning_rate": 1.835906146480064e-05, + "loss": 0.3687, "step": 82995 }, { - "epoch": 2.92, - "learning_rate": 1.9472531969148632e-05, - "loss": 0.2591, + "epoch": 2.9913143763289725, + "grad_norm": 0.2172614485025406, + "learning_rate": 1.835624821626415e-05, + "loss": 0.3921, "step": 83000 }, { - "epoch": 2.92, - "eval_loss": 0.25857260823249817, - "eval_runtime": 10.5923, - "eval_samples_per_second": 9.441, - "eval_steps_per_second": 9.441, + "epoch": 2.9913143763289725, + "eval_loss": 0.4294629693031311, + "eval_runtime": 3.5292, + "eval_samples_per_second": 28.335, + "eval_steps_per_second": 7.084, "step": 83000 }, { - "epoch": 2.92, - "learning_rate": 1.9469753785500397e-05, - "loss": 0.2793, + "epoch": 2.991494575990197, + "grad_norm": 0.17745208740234375, + "learning_rate": 1.8353435058247496e-05, + "loss": 0.3833, "step": 83005 }, { - "epoch": 2.92, - "learning_rate": 1.946697567365876e-05, - "loss": 0.2883, + "epoch": 2.991674775651422, + "grad_norm": 0.25056445598602295, + "learning_rate": 1.8350621990788998e-05, + "loss": 0.4054, "step": 83010 }, { - "epoch": 2.92, - "learning_rate": 1.9464197633659802e-05, - "loss": 0.2625, + "epoch": 2.9918549753126467, + 
"grad_norm": 0.20053359866142273, + "learning_rate": 1.834780901392699e-05, + "loss": 0.4106, "step": 83015 }, { - "epoch": 2.92, - "learning_rate": 1.9461419665539582e-05, - "loss": 0.2913, + "epoch": 2.992035174973871, + "grad_norm": 0.18675009906291962, + "learning_rate": 1.83449961276998e-05, + "loss": 0.3792, "step": 83020 }, { - "epoch": 2.92, - "learning_rate": 1.9458641769334184e-05, - "loss": 0.2658, + "epoch": 2.9922153746350957, + "grad_norm": 0.22878988087177277, + "learning_rate": 1.834218333214574e-05, + "loss": 0.4279, "step": 83025 }, { - "epoch": 2.92, - "learning_rate": 1.945586394507967e-05, - "loss": 0.2692, + "epoch": 2.9923955742963204, + "grad_norm": 0.20670390129089355, + "learning_rate": 1.833937062730315e-05, + "loss": 0.3876, "step": 83030 }, { - "epoch": 2.92, - "learning_rate": 1.945308619281211e-05, - "loss": 0.2514, + "epoch": 2.9925757739575447, + "grad_norm": 0.18799588084220886, + "learning_rate": 1.8336558013210355e-05, + "loss": 0.3937, "step": 83035 }, { - "epoch": 2.92, - "learning_rate": 1.9450308512567562e-05, - "loss": 0.2617, + "epoch": 2.9927559736187694, + "grad_norm": 0.19332025945186615, + "learning_rate": 1.8333745489905646e-05, + "loss": 0.4095, "step": 83040 }, { - "epoch": 2.92, - "learning_rate": 1.9447530904382108e-05, - "loss": 0.2585, + "epoch": 2.992936173279994, + "grad_norm": 0.22102457284927368, + "learning_rate": 1.8330933057427378e-05, + "loss": 0.3956, "step": 83045 }, { - "epoch": 2.92, - "learning_rate": 1.9444753368291808e-05, - "loss": 0.2485, + "epoch": 2.993116372941219, + "grad_norm": 0.2692544758319855, + "learning_rate": 1.8328120715813845e-05, + "loss": 0.403, "step": 83050 }, { - "epoch": 2.92, - "learning_rate": 1.9441975904332727e-05, - "loss": 0.2593, + "epoch": 2.9932965726024436, + "grad_norm": 0.20256653428077698, + "learning_rate": 1.832530846510339e-05, + "loss": 0.3552, "step": 83055 }, { - "epoch": 2.92, - "learning_rate": 1.9439198512540913e-05, - "loss": 0.2562, + "epoch": 2.9934767722636684, + "grad_norm": 0.2464403659105301, + "learning_rate": 1.8322496305334312e-05, + "loss": 0.3758, "step": 83060 }, { - "epoch": 2.92, - "learning_rate": 1.9436421192952445e-05, - "loss": 0.2747, + "epoch": 2.9936569719248927, + "grad_norm": 0.20932310819625854, + "learning_rate": 1.831968423654492e-05, + "loss": 0.3809, "step": 83065 }, { - "epoch": 2.92, - "learning_rate": 1.943364394560339e-05, - "loss": 0.2603, + "epoch": 2.9938371715861174, + "grad_norm": 0.19015516340732574, + "learning_rate": 1.8316872258773543e-05, + "loss": 0.4073, "step": 83070 }, { - "epoch": 2.92, - "learning_rate": 1.94308667705298e-05, - "loss": 0.2622, + "epoch": 2.994017371247342, + "grad_norm": 0.2418580800294876, + "learning_rate": 1.831406037205849e-05, + "loss": 0.432, "step": 83075 }, { - "epoch": 2.92, - "learning_rate": 1.9428089667767722e-05, - "loss": 0.2696, + "epoch": 2.9941975709085664, + "grad_norm": 0.23939929902553558, + "learning_rate": 1.831124857643806e-05, + "loss": 0.4024, "step": 83080 }, { - "epoch": 2.92, - "learning_rate": 1.942531263735324e-05, - "loss": 0.2567, + "epoch": 2.994377770569791, + "grad_norm": 0.259090781211853, + "learning_rate": 1.8308436871950584e-05, + "loss": 0.3824, "step": 83085 }, { - "epoch": 2.92, - "learning_rate": 1.942253567932239e-05, - "loss": 0.2608, + "epoch": 2.994557970231016, + "grad_norm": 0.2107851803302765, + "learning_rate": 1.8305625258634353e-05, + "loss": 0.4087, "step": 83090 }, { - "epoch": 2.92, - "learning_rate": 1.941975879371124e-05, - "loss": 0.2979, + "epoch": 
2.9947381698922406, + "grad_norm": 0.22065068781375885, + "learning_rate": 1.8302813736527686e-05, + "loss": 0.4059, "step": 83095 }, { - "epoch": 2.92, - "learning_rate": 1.941698198055585e-05, - "loss": 0.2658, + "epoch": 2.9949183695534654, + "grad_norm": 0.21330714225769043, + "learning_rate": 1.8300002305668884e-05, + "loss": 0.3922, "step": 83100 }, { - "epoch": 2.92, - "learning_rate": 1.941420523989227e-05, - "loss": 0.2573, + "epoch": 2.99509856921469, + "grad_norm": 0.19063615798950195, + "learning_rate": 1.829719096609625e-05, + "loss": 0.3862, "step": 83105 }, { - "epoch": 2.92, - "learning_rate": 1.941142857175656e-05, - "loss": 0.2538, + "epoch": 2.9952787688759144, + "grad_norm": 0.2311249077320099, + "learning_rate": 1.8294379717848095e-05, + "loss": 0.3986, "step": 83110 }, { - "epoch": 2.92, - "learning_rate": 1.940865197618475e-05, - "loss": 0.2743, + "epoch": 2.995458968537139, + "grad_norm": 0.18495197594165802, + "learning_rate": 1.8291568560962723e-05, + "loss": 0.4199, "step": 83115 }, { - "epoch": 2.92, - "learning_rate": 1.940587545321291e-05, - "loss": 0.2432, + "epoch": 2.995639168198364, + "grad_norm": 0.1809265911579132, + "learning_rate": 1.828875749547842e-05, + "loss": 0.4077, "step": 83120 }, { - "epoch": 2.92, - "learning_rate": 1.9403099002877108e-05, - "loss": 0.2777, + "epoch": 2.9958193678595886, + "grad_norm": 0.2279941588640213, + "learning_rate": 1.82859465214335e-05, + "loss": 0.4244, "step": 83125 }, { - "epoch": 2.92, - "learning_rate": 1.940032262521337e-05, - "loss": 0.2655, + "epoch": 2.995999567520813, + "grad_norm": 0.23656921088695526, + "learning_rate": 1.8283135638866263e-05, + "loss": 0.3996, "step": 83130 }, { - "epoch": 2.92, - "learning_rate": 1.939754632025774e-05, - "loss": 0.2827, + "epoch": 2.9961797671820376, + "grad_norm": 0.2122529149055481, + "learning_rate": 1.8280324847815e-05, + "loss": 0.4041, "step": 83135 }, { - "epoch": 2.93, - "learning_rate": 1.9394770088046294e-05, - "loss": 0.265, + "epoch": 2.9963599668432623, + "grad_norm": 0.21249407529830933, + "learning_rate": 1.8277514148318014e-05, + "loss": 0.3736, "step": 83140 }, { - "epoch": 2.93, - "learning_rate": 1.939199392861506e-05, - "loss": 0.2621, + "epoch": 2.996540166504487, + "grad_norm": 0.2389555722475052, + "learning_rate": 1.8274703540413584e-05, + "loss": 0.3884, "step": 83145 }, { - "epoch": 2.93, - "learning_rate": 1.9389217842000088e-05, - "loss": 0.2781, + "epoch": 2.996720366165712, + "grad_norm": 0.24687263369560242, + "learning_rate": 1.8271893024140034e-05, + "loss": 0.4034, "step": 83150 }, { - "epoch": 2.93, - "learning_rate": 1.938644182823742e-05, - "loss": 0.2644, + "epoch": 2.996900565826936, + "grad_norm": 0.19258186221122742, + "learning_rate": 1.826908259953562e-05, + "loss": 0.3823, "step": 83155 }, { - "epoch": 2.93, - "learning_rate": 1.9383665887363113e-05, - "loss": 0.2719, + "epoch": 2.997080765488161, + "grad_norm": 0.21487314999103546, + "learning_rate": 1.826627226663867e-05, + "loss": 0.4198, "step": 83160 }, { - "epoch": 2.93, - "learning_rate": 1.93808900194132e-05, - "loss": 0.2746, + "epoch": 2.9972609651493856, + "grad_norm": 0.17476046085357666, + "learning_rate": 1.826346202548745e-05, + "loss": 0.3885, "step": 83165 }, { - "epoch": 2.93, - "learning_rate": 1.9378114224423726e-05, - "loss": 0.2669, + "epoch": 2.9974411648106103, + "grad_norm": 0.22289486229419708, + "learning_rate": 1.826065187612025e-05, + "loss": 0.3922, "step": 83170 }, { - "epoch": 2.93, - "learning_rate": 1.937533850243073e-05, - "loss": 0.2738, + 
"epoch": 2.9976213644718346, + "grad_norm": 0.21170969307422638, + "learning_rate": 1.825784181857537e-05, + "loss": 0.3817, "step": 83175 }, { - "epoch": 2.93, - "learning_rate": 1.9372562853470263e-05, - "loss": 0.2719, + "epoch": 2.9978015641330593, + "grad_norm": 0.17165972292423248, + "learning_rate": 1.825503185289109e-05, + "loss": 0.3939, "step": 83180 }, { - "epoch": 2.93, - "learning_rate": 1.9369787277578365e-05, - "loss": 0.2708, + "epoch": 2.997981763794284, + "grad_norm": 0.2255578637123108, + "learning_rate": 1.8252221979105687e-05, + "loss": 0.3985, "step": 83185 }, { - "epoch": 2.93, - "learning_rate": 1.9367011774791062e-05, - "loss": 0.2549, + "epoch": 2.998161963455509, + "grad_norm": 0.23678001761436462, + "learning_rate": 1.824941219725746e-05, + "loss": 0.4077, "step": 83190 }, { - "epoch": 2.93, - "learning_rate": 1.9364236345144393e-05, - "loss": 0.2744, + "epoch": 2.9983421631167335, + "grad_norm": 0.32480308413505554, + "learning_rate": 1.824660250738468e-05, + "loss": 0.4023, "step": 83195 }, { - "epoch": 2.93, - "learning_rate": 1.93614609886744e-05, - "loss": 0.3034, + "epoch": 2.9985223627779583, + "grad_norm": 0.18831786513328552, + "learning_rate": 1.8243792909525643e-05, + "loss": 0.3703, "step": 83200 }, { - "epoch": 2.93, - "learning_rate": 1.9358685705417133e-05, - "loss": 0.2712, + "epoch": 2.9987025624391825, + "grad_norm": 0.22749020159244537, + "learning_rate": 1.8240983403718614e-05, + "loss": 0.3943, "step": 83205 }, { - "epoch": 2.93, - "learning_rate": 1.935591049540861e-05, - "loss": 0.2648, + "epoch": 2.9988827621004073, + "grad_norm": 0.2319597750902176, + "learning_rate": 1.8238173990001874e-05, + "loss": 0.3893, "step": 83210 }, { - "epoch": 2.93, - "learning_rate": 1.935313535868486e-05, - "loss": 0.2597, + "epoch": 2.999062961761632, + "grad_norm": 0.2220054417848587, + "learning_rate": 1.8235364668413705e-05, + "loss": 0.3845, "step": 83215 }, { - "epoch": 2.93, - "learning_rate": 1.9350360295281937e-05, - "loss": 0.2538, + "epoch": 2.9992431614228563, + "grad_norm": 0.29346784949302673, + "learning_rate": 1.8232555438992395e-05, + "loss": 0.4334, "step": 83220 }, { - "epoch": 2.93, - "learning_rate": 1.9347585305235858e-05, - "loss": 0.2593, + "epoch": 2.999423361084081, + "grad_norm": 0.19180335104465485, + "learning_rate": 1.8229746301776187e-05, + "loss": 0.3889, "step": 83225 }, { - "epoch": 2.93, - "learning_rate": 1.934481038858266e-05, - "loss": 0.2686, + "epoch": 2.9996035607453058, + "grad_norm": 0.2530876100063324, + "learning_rate": 1.8226937256803396e-05, + "loss": 0.3928, "step": 83230 }, { - "epoch": 2.93, - "learning_rate": 1.9342035545358366e-05, - "loss": 0.2657, + "epoch": 2.9997837604065305, + "grad_norm": 0.1911705732345581, + "learning_rate": 1.8224128304112252e-05, + "loss": 0.4052, "step": 83235 }, { - "epoch": 2.93, - "learning_rate": 1.933926077559902e-05, - "loss": 0.2594, + "epoch": 2.9999639600677552, + "grad_norm": 0.21257303655147552, + "learning_rate": 1.822131944374106e-05, + "loss": 0.3966, "step": 83240 }, { - "epoch": 2.93, - "learning_rate": 1.9336486079340643e-05, - "loss": 0.2823, + "epoch": 3.0001441597289795, + "grad_norm": 0.24478384852409363, + "learning_rate": 1.8218510675728085e-05, + "loss": 0.3972, "step": 83245 }, { - "epoch": 2.93, - "learning_rate": 1.933371145661925e-05, - "loss": 0.2676, + "epoch": 3.0003243593902043, + "grad_norm": 0.21207980811595917, + "learning_rate": 1.821570200011157e-05, + "loss": 0.3956, "step": 83250 }, { - "epoch": 2.93, - "learning_rate": 
1.9330936907470886e-05, - "loss": 0.2641, + "epoch": 3.000504559051429, + "grad_norm": 0.19810383021831512, + "learning_rate": 1.8212893416929817e-05, + "loss": 0.3571, "step": 83255 }, { - "epoch": 2.93, - "learning_rate": 1.9328162431931575e-05, - "loss": 0.2608, + "epoch": 3.0006847587126537, + "grad_norm": 0.21242433786392212, + "learning_rate": 1.8210084926221078e-05, + "loss": 0.3709, "step": 83260 }, { - "epoch": 2.93, - "learning_rate": 1.932538803003734e-05, - "loss": 0.2673, + "epoch": 3.0008649583738785, + "grad_norm": 0.21151970326900482, + "learning_rate": 1.820727652802361e-05, + "loss": 0.3948, "step": 83265 }, { - "epoch": 2.93, - "learning_rate": 1.9322613701824188e-05, - "loss": 0.2695, + "epoch": 3.0010451580351027, + "grad_norm": 0.27678802609443665, + "learning_rate": 1.8204468222375687e-05, + "loss": 0.3665, "step": 83270 }, { - "epoch": 2.93, - "learning_rate": 1.9319839447328167e-05, - "loss": 0.2779, + "epoch": 3.0012253576963275, + "grad_norm": 0.2054269164800644, + "learning_rate": 1.8201660009315563e-05, + "loss": 0.3515, "step": 83275 }, { - "epoch": 2.93, - "learning_rate": 1.9317065266585283e-05, - "loss": 0.2721, + "epoch": 3.001405557357552, + "grad_norm": 0.27250099182128906, + "learning_rate": 1.8198851888881513e-05, + "loss": 0.3725, "step": 83280 }, { - "epoch": 2.93, - "learning_rate": 1.9314291159631566e-05, - "loss": 0.2656, + "epoch": 3.001585757018777, + "grad_norm": 0.22905905544757843, + "learning_rate": 1.8196043861111788e-05, + "loss": 0.3983, "step": 83285 }, { - "epoch": 2.93, - "learning_rate": 1.931151712650302e-05, - "loss": 0.2559, + "epoch": 3.0017659566800012, + "grad_norm": 0.19048994779586792, + "learning_rate": 1.819323592604464e-05, + "loss": 0.3538, "step": 83290 }, { - "epoch": 2.93, - "learning_rate": 1.9308743167235683e-05, - "loss": 0.2672, + "epoch": 3.001946156341226, + "grad_norm": 0.2080589234828949, + "learning_rate": 1.8190428083718346e-05, + "loss": 0.3769, "step": 83295 }, { - "epoch": 2.93, - "learning_rate": 1.9305969281865568e-05, - "loss": 0.2542, + "epoch": 3.0021263560024507, + "grad_norm": 0.20519669353961945, + "learning_rate": 1.8187620334171147e-05, + "loss": 0.38, "step": 83300 }, { - "epoch": 2.93, - "learning_rate": 1.9303195470428687e-05, - "loss": 0.2666, + "epoch": 3.0023065556636754, + "grad_norm": 0.20568309724330902, + "learning_rate": 1.81848126774413e-05, + "loss": 0.4157, "step": 83305 }, { - "epoch": 2.93, - "learning_rate": 1.9300421732961055e-05, - "loss": 0.2709, + "epoch": 3.0024867553249, + "grad_norm": 0.1868577003479004, + "learning_rate": 1.8182005113567064e-05, + "loss": 0.3971, "step": 83310 }, { - "epoch": 2.93, - "learning_rate": 1.9297648069498692e-05, - "loss": 0.2806, + "epoch": 3.0026669549861245, + "grad_norm": 0.21570152044296265, + "learning_rate": 1.817919764258668e-05, + "loss": 0.3962, "step": 83315 }, { - "epoch": 2.93, - "learning_rate": 1.9294874480077613e-05, - "loss": 0.2487, + "epoch": 3.002847154647349, + "grad_norm": 0.2413526475429535, + "learning_rate": 1.817639026453842e-05, + "loss": 0.3816, "step": 83320 }, { - "epoch": 2.93, - "learning_rate": 1.9292100964733835e-05, - "loss": 0.2662, + "epoch": 3.003027354308574, + "grad_norm": 0.2983512282371521, + "learning_rate": 1.8173582979460517e-05, + "loss": 0.4202, "step": 83325 }, { - "epoch": 2.93, - "learning_rate": 1.928932752350335e-05, - "loss": 0.2892, + "epoch": 3.0032075539697987, + "grad_norm": 0.18553189933300018, + "learning_rate": 1.8170775787391213e-05, + "loss": 0.3656, "step": 83330 }, { - "epoch": 
2.93, - "learning_rate": 1.9286554156422188e-05, - "loss": 0.2582, + "epoch": 3.0033877536310234, + "grad_norm": 0.2300480157136917, + "learning_rate": 1.8167968688368787e-05, + "loss": 0.3999, "step": 83335 }, { - "epoch": 2.93, - "learning_rate": 1.928378086352636e-05, - "loss": 0.2557, + "epoch": 3.0035679532922477, + "grad_norm": 0.18656012415885925, + "learning_rate": 1.8165161682431444e-05, + "loss": 0.3956, "step": 83340 }, { - "epoch": 2.93, - "learning_rate": 1.928100764485186e-05, - "loss": 0.269, + "epoch": 3.0037481529534724, + "grad_norm": 0.18259282410144806, + "learning_rate": 1.8162354769617468e-05, + "loss": 0.3691, "step": 83345 }, { - "epoch": 2.93, - "learning_rate": 1.927823450043472e-05, - "loss": 0.2733, + "epoch": 3.003928352614697, + "grad_norm": 0.21580132842063904, + "learning_rate": 1.815954794996508e-05, + "loss": 0.3893, "step": 83350 }, { - "epoch": 2.93, - "learning_rate": 1.927546143031093e-05, - "loss": 0.267, + "epoch": 3.004108552275922, + "grad_norm": 0.20566125214099884, + "learning_rate": 1.815674122351252e-05, + "loss": 0.3643, "step": 83355 }, { - "epoch": 2.93, - "learning_rate": 1.92726884345165e-05, - "loss": 0.2704, + "epoch": 3.004288751937146, + "grad_norm": 0.2558181583881378, + "learning_rate": 1.815393459029804e-05, + "loss": 0.3879, "step": 83360 }, { - "epoch": 2.93, - "learning_rate": 1.9269915513087437e-05, - "loss": 0.277, + "epoch": 3.004468951598371, + "grad_norm": 0.1984751671552658, + "learning_rate": 1.815112805035988e-05, + "loss": 0.4131, "step": 83365 }, { - "epoch": 2.93, - "learning_rate": 1.926714266605974e-05, - "loss": 0.2227, + "epoch": 3.0046491512595956, + "grad_norm": 0.19502590596675873, + "learning_rate": 1.8148321603736263e-05, + "loss": 0.3646, "step": 83370 }, { - "epoch": 2.93, - "learning_rate": 1.9264369893469424e-05, - "loss": 0.2612, + "epoch": 3.0048293509208204, + "grad_norm": 0.22936676442623138, + "learning_rate": 1.8145515250465446e-05, + "loss": 0.3653, "step": 83375 }, { - "epoch": 2.93, - "learning_rate": 1.9261597195352486e-05, - "loss": 0.2579, + "epoch": 3.005009550582045, + "grad_norm": 0.2669048309326172, + "learning_rate": 1.814270899058565e-05, + "loss": 0.4142, "step": 83380 }, { - "epoch": 2.93, - "learning_rate": 1.9258824571744917e-05, - "loss": 0.2713, + "epoch": 3.0051897502432694, + "grad_norm": 0.21119844913482666, + "learning_rate": 1.8139902824135124e-05, + "loss": 0.4132, "step": 83385 }, { - "epoch": 2.93, - "learning_rate": 1.9256052022682738e-05, - "loss": 0.2766, + "epoch": 3.005369949904494, + "grad_norm": 0.20271877944469452, + "learning_rate": 1.8137096751152093e-05, + "loss": 0.3699, "step": 83390 }, { - "epoch": 2.93, - "learning_rate": 1.925327954820193e-05, - "loss": 0.27, + "epoch": 3.005550149565719, + "grad_norm": 0.21155229210853577, + "learning_rate": 1.8134290771674784e-05, + "loss": 0.3365, "step": 83395 }, { - "epoch": 2.93, - "learning_rate": 1.925050714833851e-05, - "loss": 0.2737, + "epoch": 3.0057303492269436, + "grad_norm": 0.21775977313518524, + "learning_rate": 1.813148488574144e-05, + "loss": 0.3988, "step": 83400 }, { - "epoch": 2.93, - "learning_rate": 1.924773482312845e-05, - "loss": 0.2652, + "epoch": 3.005910548888168, + "grad_norm": 0.2257860004901886, + "learning_rate": 1.8128679093390282e-05, + "loss": 0.3699, "step": 83405 }, { - "epoch": 2.93, - "learning_rate": 1.9244962572607776e-05, - "loss": 0.2849, + "epoch": 3.0060907485493926, + "grad_norm": 0.1963503360748291, + "learning_rate": 1.8125873394659543e-05, + "loss": 0.358, "step": 83410 }, { - 
"epoch": 2.93, - "learning_rate": 1.9242190396812472e-05, - "loss": 0.2707, + "epoch": 3.0062709482106174, + "grad_norm": 0.17363089323043823, + "learning_rate": 1.8123067789587443e-05, + "loss": 0.3867, "step": 83415 }, { - "epoch": 2.93, - "learning_rate": 1.9239418295778518e-05, - "loss": 0.2855, + "epoch": 3.006451147871842, + "grad_norm": 0.22728854417800903, + "learning_rate": 1.8120262278212216e-05, + "loss": 0.3821, "step": 83420 }, { - "epoch": 2.94, - "learning_rate": 1.923664626954192e-05, - "loss": 0.2633, + "epoch": 3.006631347533067, + "grad_norm": 0.2198934406042099, + "learning_rate": 1.8117456860572085e-05, + "loss": 0.3799, "step": 83425 }, { - "epoch": 2.94, - "learning_rate": 1.9233874318138688e-05, - "loss": 0.2519, + "epoch": 3.006811547194291, + "grad_norm": 0.27746328711509705, + "learning_rate": 1.811465153670528e-05, + "loss": 0.3917, "step": 83430 }, { - "epoch": 2.94, - "learning_rate": 1.9231102441604793e-05, - "loss": 0.256, + "epoch": 3.006991746855516, + "grad_norm": 0.25935912132263184, + "learning_rate": 1.811184630665e-05, + "loss": 0.3629, "step": 83435 }, { - "epoch": 2.94, - "learning_rate": 1.9228330639976223e-05, - "loss": 0.2413, + "epoch": 3.0071719465167406, + "grad_norm": 0.2055579125881195, + "learning_rate": 1.810904117044449e-05, + "loss": 0.361, "step": 83440 }, { - "epoch": 2.94, - "learning_rate": 1.9225558913288982e-05, - "loss": 0.2396, + "epoch": 3.0073521461779653, + "grad_norm": 0.2863004207611084, + "learning_rate": 1.810623612812696e-05, + "loss": 0.3877, "step": 83445 }, { - "epoch": 2.94, - "learning_rate": 1.922278726157905e-05, - "loss": 0.2843, + "epoch": 3.0075323458391896, + "grad_norm": 0.21766206622123718, + "learning_rate": 1.810343117973562e-05, + "loss": 0.3755, "step": 83450 }, { - "epoch": 2.94, - "learning_rate": 1.922001568488242e-05, - "loss": 0.256, + "epoch": 3.0077125455004143, + "grad_norm": 0.21764706075191498, + "learning_rate": 1.81006263253087e-05, + "loss": 0.4227, "step": 83455 }, { - "epoch": 2.94, - "learning_rate": 1.9217244183235073e-05, - "loss": 0.2692, + "epoch": 3.007892745161639, + "grad_norm": 0.25549066066741943, + "learning_rate": 1.8097821564884408e-05, + "loss": 0.4138, "step": 83460 }, { - "epoch": 2.94, - "learning_rate": 1.9214472756673008e-05, - "loss": 0.2827, + "epoch": 3.008072944822864, + "grad_norm": 0.22530651092529297, + "learning_rate": 1.8095016898500962e-05, + "loss": 0.4006, "step": 83465 }, { - "epoch": 2.94, - "learning_rate": 1.92117014052322e-05, - "loss": 0.2706, + "epoch": 3.0082531444840885, + "grad_norm": 0.20702116191387177, + "learning_rate": 1.8092212326196578e-05, + "loss": 0.3831, "step": 83470 }, { - "epoch": 2.94, - "learning_rate": 1.920893012894863e-05, - "loss": 0.2947, + "epoch": 3.008433344145313, + "grad_norm": 0.21814846992492676, + "learning_rate": 1.8089407848009457e-05, + "loss": 0.3723, "step": 83475 }, { - "epoch": 2.94, - "learning_rate": 1.920615892785828e-05, - "loss": 0.278, + "epoch": 3.0086135438065376, + "grad_norm": 0.19958257675170898, + "learning_rate": 1.808660346397782e-05, + "loss": 0.3971, "step": 83480 }, { - "epoch": 2.94, - "learning_rate": 1.9203387801997148e-05, - "loss": 0.2691, + "epoch": 3.0087937434677623, + "grad_norm": 0.2572198510169983, + "learning_rate": 1.8083799174139876e-05, + "loss": 0.3909, "step": 83485 }, { - "epoch": 2.94, - "learning_rate": 1.9200616751401207e-05, - "loss": 0.2807, + "epoch": 3.008973943128987, + "grad_norm": 0.1767335683107376, + "learning_rate": 1.8080994978533822e-05, + "loss": 0.3866, "step": 
83490 }, { - "epoch": 2.94, - "learning_rate": 1.919784577610643e-05, - "loss": 0.2716, + "epoch": 3.0091541427902118, + "grad_norm": 0.22340673208236694, + "learning_rate": 1.8078190877197876e-05, + "loss": 0.3917, "step": 83495 }, { - "epoch": 2.94, - "learning_rate": 1.91950748761488e-05, - "loss": 0.2561, + "epoch": 3.009334342451436, + "grad_norm": 0.21768926084041595, + "learning_rate": 1.8075386870170233e-05, + "loss": 0.3831, "step": 83500 }, { - "epoch": 2.94, - "eval_loss": 0.2586362063884735, - "eval_runtime": 10.5563, - "eval_samples_per_second": 9.473, - "eval_steps_per_second": 9.473, + "epoch": 3.009334342451436, + "eval_loss": 0.4312828481197357, + "eval_runtime": 3.5249, + "eval_samples_per_second": 28.369, + "eval_steps_per_second": 7.092, "step": 83500 }, { - "epoch": 2.94, - "learning_rate": 1.9192304051564296e-05, - "loss": 0.2662, + "epoch": 3.009514542112661, + "grad_norm": 0.23713305592536926, + "learning_rate": 1.8072582957489108e-05, + "loss": 0.3742, "step": 83505 }, { - "epoch": 2.94, - "learning_rate": 1.9189533302388906e-05, - "loss": 0.2656, + "epoch": 3.0096947417738855, + "grad_norm": 0.18255163729190826, + "learning_rate": 1.806977913919271e-05, + "loss": 0.407, "step": 83510 }, { - "epoch": 2.94, - "learning_rate": 1.9186762628658594e-05, - "loss": 0.2826, + "epoch": 3.0098749414351103, + "grad_norm": 0.20699362456798553, + "learning_rate": 1.806697541531921e-05, + "loss": 0.348, "step": 83515 }, { - "epoch": 2.94, - "learning_rate": 1.9183992030409333e-05, - "loss": 0.2547, + "epoch": 3.0100551410963345, + "grad_norm": 0.22750404477119446, + "learning_rate": 1.806417178590684e-05, + "loss": 0.3584, "step": 83520 }, { - "epoch": 2.94, - "learning_rate": 1.9181221507677104e-05, - "loss": 0.2424, + "epoch": 3.0102353407575593, + "grad_norm": 0.2072783261537552, + "learning_rate": 1.8061368250993777e-05, + "loss": 0.42, "step": 83525 }, { - "epoch": 2.94, - "learning_rate": 1.917845106049788e-05, - "loss": 0.2857, + "epoch": 3.010415540418784, + "grad_norm": 0.18794316053390503, + "learning_rate": 1.8058564810618233e-05, + "loss": 0.3754, "step": 83530 }, { - "epoch": 2.94, - "learning_rate": 1.917568068890764e-05, - "loss": 0.2393, + "epoch": 3.0105957400800087, + "grad_norm": 0.15637893974781036, + "learning_rate": 1.805576146481841e-05, + "loss": 0.3791, "step": 83535 }, { - "epoch": 2.94, - "learning_rate": 1.917291039294233e-05, - "loss": 0.2903, + "epoch": 3.0107759397412335, + "grad_norm": 0.23354141414165497, + "learning_rate": 1.8052958213632477e-05, + "loss": 0.3636, "step": 83540 }, { - "epoch": 2.94, - "learning_rate": 1.9170140172637953e-05, - "loss": 0.2738, + "epoch": 3.0109561394024578, + "grad_norm": 0.20593926310539246, + "learning_rate": 1.8050155057098656e-05, + "loss": 0.35, "step": 83545 }, { - "epoch": 2.94, - "learning_rate": 1.916737002803046e-05, - "loss": 0.2613, + "epoch": 3.0111363390636825, + "grad_norm": 0.26360243558883667, + "learning_rate": 1.8047351995255123e-05, + "loss": 0.3946, "step": 83550 }, { - "epoch": 2.94, - "learning_rate": 1.9164599959155823e-05, - "loss": 0.2761, + "epoch": 3.0113165387249072, + "grad_norm": 0.20182831585407257, + "learning_rate": 1.8044549028140068e-05, + "loss": 0.3674, "step": 83555 }, { - "epoch": 2.94, - "learning_rate": 1.9161829966050003e-05, - "loss": 0.27, + "epoch": 3.011496738386132, + "grad_norm": 0.1877097487449646, + "learning_rate": 1.804174615579169e-05, + "loss": 0.3906, "step": 83560 }, { - "epoch": 2.94, - "learning_rate": 1.9159060048748984e-05, - "loss": 0.2818, + "epoch": 
3.0116769380473563, + "grad_norm": 0.23039346933364868, + "learning_rate": 1.8038943378248165e-05, + "loss": 0.3893, "step": 83565 }, { - "epoch": 2.94, - "learning_rate": 1.9156290207288723e-05, - "loss": 0.2836, + "epoch": 3.011857137708581, + "grad_norm": 0.25139108300209045, + "learning_rate": 1.80361406955477e-05, + "loss": 0.3811, "step": 83570 }, { - "epoch": 2.94, - "learning_rate": 1.9153520441705168e-05, - "loss": 0.2686, + "epoch": 3.0120373373698057, + "grad_norm": 0.2260846644639969, + "learning_rate": 1.8033338107728465e-05, + "loss": 0.3805, "step": 83575 }, { - "epoch": 2.94, - "learning_rate": 1.915075075203431e-05, - "loss": 0.2763, + "epoch": 3.0122175370310305, + "grad_norm": 0.21495521068572998, + "learning_rate": 1.803053561482865e-05, + "loss": 0.3935, "step": 83580 }, { - "epoch": 2.94, - "learning_rate": 1.9147981138312093e-05, - "loss": 0.2754, + "epoch": 3.012397736692255, + "grad_norm": 0.24546730518341064, + "learning_rate": 1.802773321688644e-05, + "loss": 0.3861, "step": 83585 }, { - "epoch": 2.94, - "learning_rate": 1.914521160057449e-05, - "loss": 0.2634, + "epoch": 3.0125779363534795, + "grad_norm": 0.2514590322971344, + "learning_rate": 1.8024930913940015e-05, + "loss": 0.3718, "step": 83590 }, { - "epoch": 2.94, - "learning_rate": 1.914244213885744e-05, - "loss": 0.2856, + "epoch": 3.012758136014704, + "grad_norm": 0.23116329312324524, + "learning_rate": 1.8022128706027547e-05, + "loss": 0.354, "step": 83595 }, { - "epoch": 2.94, - "learning_rate": 1.9139672753196936e-05, - "loss": 0.2785, + "epoch": 3.012938335675929, + "grad_norm": 0.17475327849388123, + "learning_rate": 1.8019326593187232e-05, + "loss": 0.3732, "step": 83600 }, { - "epoch": 2.94, - "learning_rate": 1.9136903443628916e-05, - "loss": 0.255, + "epoch": 3.0131185353371537, + "grad_norm": 0.2183002233505249, + "learning_rate": 1.801652457545724e-05, + "loss": 0.3881, "step": 83605 }, { - "epoch": 2.94, - "learning_rate": 1.913413421018934e-05, - "loss": 0.2541, + "epoch": 3.0132987349983784, + "grad_norm": 0.23634588718414307, + "learning_rate": 1.8013722652875747e-05, + "loss": 0.4003, "step": 83610 }, { - "epoch": 2.94, - "learning_rate": 1.9131365052914154e-05, - "loss": 0.2687, + "epoch": 3.0134789346596027, + "grad_norm": 0.22611938416957855, + "learning_rate": 1.801092082548094e-05, + "loss": 0.3966, "step": 83615 }, { - "epoch": 2.94, - "learning_rate": 1.912859597183934e-05, - "loss": 0.2711, + "epoch": 3.0136591343208274, + "grad_norm": 0.23099973797798157, + "learning_rate": 1.8008119093310965e-05, + "loss": 0.3739, "step": 83620 }, { - "epoch": 2.94, - "learning_rate": 1.9125826967000836e-05, - "loss": 0.2686, + "epoch": 3.013839333982052, + "grad_norm": 0.2718352675437927, + "learning_rate": 1.8005317456404034e-05, + "loss": 0.3796, "step": 83625 }, { - "epoch": 2.94, - "learning_rate": 1.91230580384346e-05, - "loss": 0.2434, + "epoch": 3.014019533643277, + "grad_norm": 0.18968239426612854, + "learning_rate": 1.800251591479829e-05, + "loss": 0.3575, "step": 83630 }, { - "epoch": 2.94, - "learning_rate": 1.912028918617657e-05, - "loss": 0.3043, + "epoch": 3.014199733304501, + "grad_norm": 0.1754741370677948, + "learning_rate": 1.7999714468531906e-05, + "loss": 0.374, "step": 83635 }, { - "epoch": 2.94, - "learning_rate": 1.911752041026271e-05, - "loss": 0.2653, + "epoch": 3.014379932965726, + "grad_norm": 0.25998741388320923, + "learning_rate": 1.7996913117643064e-05, + "loss": 0.3882, "step": 83640 }, { - "epoch": 2.94, - "learning_rate": 1.9114751710728983e-05, - "loss": 
0.2592, + "epoch": 3.0145601326269507, + "grad_norm": 0.25412383675575256, + "learning_rate": 1.799411186216992e-05, + "loss": 0.3873, "step": 83645 }, { - "epoch": 2.94, - "learning_rate": 1.9111983087611328e-05, - "loss": 0.2862, + "epoch": 3.0147403322881754, + "grad_norm": 0.23785263299942017, + "learning_rate": 1.7991310702150647e-05, + "loss": 0.3314, "step": 83650 }, { - "epoch": 2.94, - "learning_rate": 1.9109214540945677e-05, - "loss": 0.2584, + "epoch": 3.0149205319494, + "grad_norm": 0.21944360435009003, + "learning_rate": 1.7988509637623414e-05, + "loss": 0.3885, "step": 83655 }, { - "epoch": 2.94, - "learning_rate": 1.9106446070768007e-05, - "loss": 0.2845, + "epoch": 3.0151007316106244, + "grad_norm": 0.20874114334583282, + "learning_rate": 1.7985708668626373e-05, + "loss": 0.3503, "step": 83660 }, { - "epoch": 2.94, - "learning_rate": 1.9103677677114244e-05, - "loss": 0.264, + "epoch": 3.015280931271849, + "grad_norm": 0.19543851912021637, + "learning_rate": 1.7982907795197695e-05, + "loss": 0.3552, "step": 83665 }, { - "epoch": 2.94, - "learning_rate": 1.9100909360020343e-05, - "loss": 0.2732, + "epoch": 3.015461130933074, + "grad_norm": 0.17719921469688416, + "learning_rate": 1.7980107017375543e-05, + "loss": 0.3681, "step": 83670 }, { - "epoch": 2.94, - "learning_rate": 1.909814111952224e-05, - "loss": 0.2679, + "epoch": 3.0156413305942986, + "grad_norm": 0.20741024613380432, + "learning_rate": 1.7977306335198067e-05, + "loss": 0.3569, "step": 83675 }, { - "epoch": 2.94, - "learning_rate": 1.9095372955655894e-05, - "loss": 0.267, + "epoch": 3.015821530255523, + "grad_norm": 0.2016208916902542, + "learning_rate": 1.797450574870344e-05, + "loss": 0.3684, "step": 83680 }, { - "epoch": 2.94, - "learning_rate": 1.9092604868457238e-05, - "loss": 0.2704, + "epoch": 3.0160017299167476, + "grad_norm": 0.23120005428791046, + "learning_rate": 1.7971705257929806e-05, + "loss": 0.3627, "step": 83685 }, { - "epoch": 2.94, - "learning_rate": 1.9089836857962203e-05, - "loss": 0.2713, + "epoch": 3.0161819295779724, + "grad_norm": 0.22338317334651947, + "learning_rate": 1.796890486291533e-05, + "loss": 0.3849, "step": 83690 }, { - "epoch": 2.94, - "learning_rate": 1.908706892420675e-05, - "loss": 0.2662, + "epoch": 3.016362129239197, + "grad_norm": 0.2729624807834625, + "learning_rate": 1.7966104563698165e-05, + "loss": 0.3771, "step": 83695 }, { - "epoch": 2.94, - "learning_rate": 1.9084301067226815e-05, - "loss": 0.2738, + "epoch": 3.016542328900422, + "grad_norm": 0.24947573244571686, + "learning_rate": 1.796330436031646e-05, + "loss": 0.3744, "step": 83700 }, { - "epoch": 2.94, - "learning_rate": 1.9081533287058328e-05, - "loss": 0.2564, + "epoch": 3.016722528561646, + "grad_norm": 0.22686618566513062, + "learning_rate": 1.7960504252808374e-05, + "loss": 0.3786, "step": 83705 }, { - "epoch": 2.95, - "learning_rate": 1.9078765583737223e-05, - "loss": 0.2507, + "epoch": 3.016902728222871, + "grad_norm": 0.2134973108768463, + "learning_rate": 1.7957704241212063e-05, + "loss": 0.4183, "step": 83710 }, { - "epoch": 2.95, - "learning_rate": 1.9075997957299456e-05, - "loss": 0.2706, + "epoch": 3.0170829278840956, + "grad_norm": 0.17952316999435425, + "learning_rate": 1.795490432556565e-05, + "loss": 0.3963, "step": 83715 }, { - "epoch": 2.95, - "learning_rate": 1.9073230407780947e-05, - "loss": 0.2618, + "epoch": 3.0172631275453203, + "grad_norm": 0.22514955699443817, + "learning_rate": 1.795210450590732e-05, + "loss": 0.3789, "step": 83720 }, { - "epoch": 2.95, - "learning_rate": 
1.9070462935217638e-05, - "loss": 0.277, + "epoch": 3.017443327206545, + "grad_norm": 0.19981583952903748, + "learning_rate": 1.794930478227518e-05, + "loss": 0.3763, "step": 83725 }, { - "epoch": 2.95, - "learning_rate": 1.9067695539645453e-05, - "loss": 0.2748, + "epoch": 3.0176235268677694, + "grad_norm": 0.2752089500427246, + "learning_rate": 1.7946505154707416e-05, + "loss": 0.3775, "step": 83730 }, { - "epoch": 2.95, - "learning_rate": 1.906492822110034e-05, - "loss": 0.2872, + "epoch": 3.017803726528994, + "grad_norm": 0.21153953671455383, + "learning_rate": 1.7943705623242153e-05, + "loss": 0.3939, "step": 83735 }, { - "epoch": 2.95, - "learning_rate": 1.9062160979618224e-05, - "loss": 0.256, + "epoch": 3.017983926190219, + "grad_norm": 0.22683672606945038, + "learning_rate": 1.794090618791752e-05, + "loss": 0.3739, "step": 83740 }, { - "epoch": 2.95, - "learning_rate": 1.9059393815235026e-05, - "loss": 0.2739, + "epoch": 3.0181641258514436, + "grad_norm": 0.2554776668548584, + "learning_rate": 1.793810684877169e-05, + "loss": 0.4071, "step": 83745 }, { - "epoch": 2.95, - "learning_rate": 1.9056626727986686e-05, - "loss": 0.2743, + "epoch": 3.018344325512668, + "grad_norm": 0.2107311338186264, + "learning_rate": 1.7935307605842775e-05, + "loss": 0.3646, "step": 83750 }, { - "epoch": 2.95, - "learning_rate": 1.9053859717909132e-05, - "loss": 0.2674, + "epoch": 3.0185245251738926, + "grad_norm": 0.22267073392868042, + "learning_rate": 1.793250845916893e-05, + "loss": 0.4121, "step": 83755 }, { - "epoch": 2.95, - "learning_rate": 1.9051092785038297e-05, - "loss": 0.2634, + "epoch": 3.0187047248351173, + "grad_norm": 0.2603355646133423, + "learning_rate": 1.7929709408788292e-05, + "loss": 0.371, "step": 83760 }, { - "epoch": 2.95, - "learning_rate": 1.9048325929410103e-05, - "loss": 0.2678, + "epoch": 3.018884924496342, + "grad_norm": 0.22211819887161255, + "learning_rate": 1.792691045473899e-05, + "loss": 0.353, "step": 83765 }, { - "epoch": 2.95, - "learning_rate": 1.904555915106046e-05, - "loss": 0.2741, + "epoch": 3.019065124157567, + "grad_norm": 0.1939542144536972, + "learning_rate": 1.792411159705917e-05, + "loss": 0.3708, "step": 83770 }, { - "epoch": 2.95, - "learning_rate": 1.9042792450025313e-05, - "loss": 0.2709, + "epoch": 3.019245323818791, + "grad_norm": 0.23032665252685547, + "learning_rate": 1.7921312835786952e-05, + "loss": 0.3856, "step": 83775 }, { - "epoch": 2.95, - "learning_rate": 1.904002582634059e-05, - "loss": 0.2552, + "epoch": 3.019425523480016, + "grad_norm": 0.22775132954120636, + "learning_rate": 1.7918514170960478e-05, + "loss": 0.3811, "step": 83780 }, { - "epoch": 2.95, - "learning_rate": 1.9037259280042197e-05, - "loss": 0.2854, + "epoch": 3.0196057231412405, + "grad_norm": 0.2607553005218506, + "learning_rate": 1.791571560261788e-05, + "loss": 0.3733, "step": 83785 }, { - "epoch": 2.95, - "learning_rate": 1.9034492811166056e-05, - "loss": 0.2707, + "epoch": 3.0197859228024653, + "grad_norm": 0.22313840687274933, + "learning_rate": 1.791291713079728e-05, + "loss": 0.4006, "step": 83790 }, { - "epoch": 2.95, - "learning_rate": 1.9031726419748103e-05, - "loss": 0.2613, + "epoch": 3.0199661224636896, + "grad_norm": 0.22880607843399048, + "learning_rate": 1.791011875553682e-05, + "loss": 0.4368, "step": 83795 }, { - "epoch": 2.95, - "learning_rate": 1.9028960105824245e-05, - "loss": 0.27, + "epoch": 3.0201463221249143, + "grad_norm": 0.2502082288265228, + "learning_rate": 1.790732047687462e-05, + "loss": 0.3866, "step": 83800 }, { - "epoch": 2.95, - 
"learning_rate": 1.9026193869430404e-05, - "loss": 0.2898, + "epoch": 3.020326521786139, + "grad_norm": 0.23791562020778656, + "learning_rate": 1.7904522294848792e-05, + "loss": 0.3749, "step": 83805 }, { - "epoch": 2.95, - "learning_rate": 1.9023427710602504e-05, - "loss": 0.261, + "epoch": 3.0205067214473638, + "grad_norm": 0.22234022617340088, + "learning_rate": 1.790172420949749e-05, + "loss": 0.3662, "step": 83810 }, { - "epoch": 2.95, - "learning_rate": 1.9020661629376456e-05, - "loss": 0.2876, + "epoch": 3.0206869211085885, + "grad_norm": 0.169847309589386, + "learning_rate": 1.7898926220858817e-05, + "loss": 0.361, "step": 83815 }, { - "epoch": 2.95, - "learning_rate": 1.9017895625788178e-05, - "loss": 0.2652, + "epoch": 3.020867120769813, + "grad_norm": 0.1907045543193817, + "learning_rate": 1.7896128328970886e-05, + "loss": 0.3964, "step": 83820 }, { - "epoch": 2.95, - "learning_rate": 1.9015129699873573e-05, - "loss": 0.2626, + "epoch": 3.0210473204310375, + "grad_norm": 0.21488279104232788, + "learning_rate": 1.789333053387185e-05, + "loss": 0.3986, "step": 83825 }, { - "epoch": 2.95, - "learning_rate": 1.9012363851668576e-05, - "loss": 0.2728, + "epoch": 3.0212275200922623, + "grad_norm": 0.193211168050766, + "learning_rate": 1.7890532835599793e-05, + "loss": 0.3955, "step": 83830 }, { - "epoch": 2.95, - "learning_rate": 1.900959808120908e-05, - "loss": 0.2711, + "epoch": 3.021407719753487, + "grad_norm": 0.2113611400127411, + "learning_rate": 1.7887735234192864e-05, + "loss": 0.3767, "step": 83835 }, { - "epoch": 2.95, - "learning_rate": 1.900683238853101e-05, - "loss": 0.2751, + "epoch": 3.0215879194147117, + "grad_norm": 0.2665266990661621, + "learning_rate": 1.788493772968916e-05, + "loss": 0.3705, "step": 83840 }, { - "epoch": 2.95, - "learning_rate": 1.9004066773670265e-05, - "loss": 0.2988, + "epoch": 3.021768119075936, + "grad_norm": 0.24903304874897003, + "learning_rate": 1.78821403221268e-05, + "loss": 0.4004, "step": 83845 }, { - "epoch": 2.95, - "learning_rate": 1.900130123666277e-05, - "loss": 0.2605, + "epoch": 3.0219483187371607, + "grad_norm": 0.2464405745267868, + "learning_rate": 1.7879343011543905e-05, + "loss": 0.3908, "step": 83850 }, { - "epoch": 2.95, - "learning_rate": 1.8998535777544424e-05, - "loss": 0.2607, + "epoch": 3.0221285183983855, + "grad_norm": 0.2581247389316559, + "learning_rate": 1.7876545797978576e-05, + "loss": 0.3983, "step": 83855 }, { - "epoch": 2.95, - "learning_rate": 1.8995770396351134e-05, - "loss": 0.2659, + "epoch": 3.02230871805961, + "grad_norm": 0.3195907473564148, + "learning_rate": 1.7873748681468932e-05, + "loss": 0.3965, "step": 83860 }, { - "epoch": 2.95, - "learning_rate": 1.89930050931188e-05, - "loss": 0.2775, + "epoch": 3.0224889177208345, + "grad_norm": 0.22984401881694794, + "learning_rate": 1.787095166205309e-05, + "loss": 0.3762, "step": 83865 }, { - "epoch": 2.95, - "learning_rate": 1.8990239867883348e-05, - "loss": 0.2859, + "epoch": 3.0226691173820592, + "grad_norm": 0.22616103291511536, + "learning_rate": 1.7868154739769144e-05, + "loss": 0.3776, "step": 83870 }, { - "epoch": 2.95, - "learning_rate": 1.898747472068067e-05, - "loss": 0.2673, + "epoch": 3.022849317043284, + "grad_norm": 0.17865821719169617, + "learning_rate": 1.7865357914655212e-05, + "loss": 0.3676, "step": 83875 }, { - "epoch": 2.95, - "learning_rate": 1.8984709651546667e-05, - "loss": 0.2706, + "epoch": 3.0230295167045087, + "grad_norm": 0.21580396592617035, + "learning_rate": 1.78625611867494e-05, + "loss": 0.3549, "step": 83880 }, { - 
"epoch": 2.95, - "learning_rate": 1.8981944660517243e-05, - "loss": 0.27, + "epoch": 3.0232097163657334, + "grad_norm": 0.22355122864246368, + "learning_rate": 1.7859764556089804e-05, + "loss": 0.4141, "step": 83885 }, { - "epoch": 2.95, - "learning_rate": 1.89791797476283e-05, - "loss": 0.2736, + "epoch": 3.0233899160269577, + "grad_norm": 0.2217842936515808, + "learning_rate": 1.7856968022714535e-05, + "loss": 0.387, "step": 83890 }, { - "epoch": 2.95, - "learning_rate": 1.8976414912915748e-05, - "loss": 0.2677, + "epoch": 3.0235701156881825, + "grad_norm": 0.29023489356040955, + "learning_rate": 1.78541715866617e-05, + "loss": 0.3755, "step": 83895 }, { - "epoch": 2.95, - "learning_rate": 1.897365015641548e-05, - "loss": 0.2639, + "epoch": 3.023750315349407, + "grad_norm": 0.19948424398899078, + "learning_rate": 1.785137524796938e-05, + "loss": 0.3434, "step": 83900 }, { - "epoch": 2.95, - "learning_rate": 1.897088547816338e-05, - "loss": 0.2631, + "epoch": 3.023930515010632, + "grad_norm": 0.1968270093202591, + "learning_rate": 1.7848579006675707e-05, + "loss": 0.3828, "step": 83905 }, { - "epoch": 2.95, - "learning_rate": 1.896812087819537e-05, - "loss": 0.2578, + "epoch": 3.024110714671856, + "grad_norm": 0.20622436702251434, + "learning_rate": 1.784578286281874e-05, + "loss": 0.3828, "step": 83910 }, { - "epoch": 2.95, - "learning_rate": 1.896535635654733e-05, - "loss": 0.2558, + "epoch": 3.024290914333081, + "grad_norm": 0.25507232546806335, + "learning_rate": 1.7842986816436613e-05, + "loss": 0.3875, "step": 83915 }, { - "epoch": 2.95, - "learning_rate": 1.8962591913255158e-05, - "loss": 0.2719, + "epoch": 3.0244711139943057, + "grad_norm": 0.18650664389133453, + "learning_rate": 1.7840190867567403e-05, + "loss": 0.3826, "step": 83920 }, { - "epoch": 2.95, - "learning_rate": 1.895982754835476e-05, - "loss": 0.2821, + "epoch": 3.0246513136555304, + "grad_norm": 0.23238393664360046, + "learning_rate": 1.7837395016249198e-05, + "loss": 0.3914, "step": 83925 }, { - "epoch": 2.95, - "learning_rate": 1.895706326188202e-05, - "loss": 0.2708, + "epoch": 3.024831513316755, + "grad_norm": 0.23915240168571472, + "learning_rate": 1.7834599262520102e-05, + "loss": 0.3834, "step": 83930 }, { - "epoch": 2.95, - "learning_rate": 1.895429905387283e-05, - "loss": 0.282, + "epoch": 3.0250117129779794, + "grad_norm": 0.2193489819765091, + "learning_rate": 1.78318036064182e-05, + "loss": 0.4019, "step": 83935 }, { - "epoch": 2.95, - "learning_rate": 1.895153492436307e-05, - "loss": 0.2764, + "epoch": 3.025191912639204, + "grad_norm": 0.2331564873456955, + "learning_rate": 1.7829008047981594e-05, + "loss": 0.3714, "step": 83940 }, { - "epoch": 2.95, - "learning_rate": 1.8948770873388654e-05, - "loss": 0.2581, + "epoch": 3.025372112300429, + "grad_norm": 0.22607645392417908, + "learning_rate": 1.7826212587248364e-05, + "loss": 0.4046, "step": 83945 }, { - "epoch": 2.95, - "learning_rate": 1.894600690098546e-05, - "loss": 0.2626, + "epoch": 3.0255523119616536, + "grad_norm": 0.235866516828537, + "learning_rate": 1.7823417224256594e-05, + "loss": 0.3759, "step": 83950 }, { - "epoch": 2.95, - "learning_rate": 1.894324300718938e-05, - "loss": 0.2693, + "epoch": 3.025732511622878, + "grad_norm": 0.23294222354888916, + "learning_rate": 1.782062195904438e-05, + "loss": 0.3864, "step": 83955 }, { - "epoch": 2.95, - "learning_rate": 1.8940479192036286e-05, - "loss": 0.2833, + "epoch": 3.0259127112841027, + "grad_norm": 0.22986343502998352, + "learning_rate": 1.78178267916498e-05, + "loss": 0.3758, "step": 83960 
}, { - "epoch": 2.95, - "learning_rate": 1.893771545556209e-05, - "loss": 0.2659, + "epoch": 3.0260929109453274, + "grad_norm": 0.17476344108581543, + "learning_rate": 1.7815031722110935e-05, + "loss": 0.3921, "step": 83965 }, { - "epoch": 2.95, - "learning_rate": 1.8934951797802654e-05, - "loss": 0.2795, + "epoch": 3.026273110606552, + "grad_norm": 0.23283426463603973, + "learning_rate": 1.7812236750465876e-05, + "loss": 0.4051, "step": 83970 }, { - "epoch": 2.95, - "learning_rate": 1.893218821879388e-05, - "loss": 0.2528, + "epoch": 3.026453310267777, + "grad_norm": 0.19704559445381165, + "learning_rate": 1.7809441876752695e-05, + "loss": 0.3977, "step": 83975 }, { - "epoch": 2.95, - "learning_rate": 1.892942471857163e-05, - "loss": 0.2465, + "epoch": 3.026633509929001, + "grad_norm": 0.1991409957408905, + "learning_rate": 1.7806647101009484e-05, + "loss": 0.3977, "step": 83980 }, { - "epoch": 2.95, - "learning_rate": 1.8926661297171816e-05, - "loss": 0.2652, + "epoch": 3.026813709590226, + "grad_norm": 0.2306070774793625, + "learning_rate": 1.780385242327431e-05, + "loss": 0.3909, "step": 83985 }, { - "epoch": 2.96, - "learning_rate": 1.8923897954630298e-05, - "loss": 0.2615, + "epoch": 3.0269939092514506, + "grad_norm": 0.25500649213790894, + "learning_rate": 1.780105784358525e-05, + "loss": 0.4118, "step": 83990 }, { - "epoch": 2.96, - "learning_rate": 1.8921134690982957e-05, - "loss": 0.2867, + "epoch": 3.0271741089126754, + "grad_norm": 0.19943642616271973, + "learning_rate": 1.7798263361980388e-05, + "loss": 0.366, "step": 83995 }, { - "epoch": 2.96, - "learning_rate": 1.8918371506265666e-05, - "loss": 0.2851, + "epoch": 3.0273543085739, + "grad_norm": 0.22723056375980377, + "learning_rate": 1.77954689784978e-05, + "loss": 0.4048, "step": 84000 }, { - "epoch": 2.96, - "eval_loss": 0.2586404085159302, - "eval_runtime": 10.5437, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 3.0273543085739, + "eval_loss": 0.4311674237251282, + "eval_runtime": 3.5287, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 7.085, "step": 84000 }, { - "epoch": 2.96, - "learning_rate": 1.891560840051433e-05, - "loss": 0.2446, + "epoch": 3.0275345082351244, + "grad_norm": 0.20490515232086182, + "learning_rate": 1.7792674693175534e-05, + "loss": 0.3958, "step": 84005 }, { - "epoch": 2.96, - "learning_rate": 1.8912845373764804e-05, - "loss": 0.2668, + "epoch": 3.027714707896349, + "grad_norm": 0.24055179953575134, + "learning_rate": 1.77898805060517e-05, + "loss": 0.3762, "step": 84010 }, { - "epoch": 2.96, - "learning_rate": 1.8910082426052973e-05, - "loss": 0.2509, + "epoch": 3.027894907557574, + "grad_norm": 0.25771138072013855, + "learning_rate": 1.778708641716433e-05, + "loss": 0.4009, "step": 84015 }, { - "epoch": 2.96, - "learning_rate": 1.8907319557414696e-05, - "loss": 0.2543, + "epoch": 3.0280751072187986, + "grad_norm": 0.19675886631011963, + "learning_rate": 1.7784292426551525e-05, + "loss": 0.3787, "step": 84020 }, { - "epoch": 2.96, - "learning_rate": 1.8904556767885865e-05, - "loss": 0.2615, + "epoch": 3.028255306880023, + "grad_norm": 0.2063799649477005, + "learning_rate": 1.7781498534251334e-05, + "loss": 0.428, "step": 84025 }, { - "epoch": 2.96, - "learning_rate": 1.890179405750235e-05, - "loss": 0.2538, + "epoch": 3.0284355065412476, + "grad_norm": 0.23454803228378296, + "learning_rate": 1.7778704740301823e-05, + "loss": 0.3933, "step": 84030 }, { - "epoch": 2.96, - "learning_rate": 1.889903142630001e-05, - "loss": 0.2779, + "epoch": 
3.0286157062024723, + "grad_norm": 0.20957069098949432, + "learning_rate": 1.7775911044741067e-05, + "loss": 0.3725, "step": 84035 }, { - "epoch": 2.96, - "learning_rate": 1.8896268874314743e-05, - "loss": 0.2719, + "epoch": 3.028795905863697, + "grad_norm": 0.24163176119327545, + "learning_rate": 1.7773117447607128e-05, + "loss": 0.3617, "step": 84040 }, { - "epoch": 2.96, - "learning_rate": 1.8893506401582397e-05, - "loss": 0.261, + "epoch": 3.028976105524922, + "grad_norm": 0.19776524603366852, + "learning_rate": 1.7770323948938056e-05, + "loss": 0.3595, "step": 84045 }, { - "epoch": 2.96, - "learning_rate": 1.889074400813884e-05, - "loss": 0.2607, + "epoch": 3.029156305186146, + "grad_norm": 0.21786345541477203, + "learning_rate": 1.7767530548771926e-05, + "loss": 0.4257, "step": 84050 }, { - "epoch": 2.96, - "learning_rate": 1.8887981694019946e-05, - "loss": 0.2538, + "epoch": 3.029336504847371, + "grad_norm": 0.2272718995809555, + "learning_rate": 1.7764737247146783e-05, + "loss": 0.3771, "step": 84055 }, { - "epoch": 2.96, - "learning_rate": 1.8885219459261588e-05, - "loss": 0.2449, + "epoch": 3.0295167045085956, + "grad_norm": 0.2050115466117859, + "learning_rate": 1.7761944044100707e-05, + "loss": 0.3718, "step": 84060 }, { - "epoch": 2.96, - "learning_rate": 1.8882457303899626e-05, - "loss": 0.2915, + "epoch": 3.0296969041698203, + "grad_norm": 0.26552021503448486, + "learning_rate": 1.7759150939671735e-05, + "loss": 0.3983, "step": 84065 }, { - "epoch": 2.96, - "learning_rate": 1.8879695227969923e-05, - "loss": 0.2925, + "epoch": 3.0298771038310446, + "grad_norm": 0.22904445230960846, + "learning_rate": 1.775635793389793e-05, + "loss": 0.377, "step": 84070 }, { - "epoch": 2.96, - "learning_rate": 1.8876933231508337e-05, - "loss": 0.2451, + "epoch": 3.0300573034922693, + "grad_norm": 0.1924734264612198, + "learning_rate": 1.7753565026817348e-05, + "loss": 0.3702, "step": 84075 }, { - "epoch": 2.96, - "learning_rate": 1.887417131455074e-05, - "loss": 0.272, + "epoch": 3.030237503153494, + "grad_norm": 0.2412857860326767, + "learning_rate": 1.775077221846805e-05, + "loss": 0.3691, "step": 84080 }, { - "epoch": 2.96, - "learning_rate": 1.8871409477132997e-05, - "loss": 0.265, + "epoch": 3.030417702814719, + "grad_norm": 0.2012851983308792, + "learning_rate": 1.7747979508888053e-05, + "loss": 0.3935, "step": 84085 }, { - "epoch": 2.96, - "learning_rate": 1.8868647719290966e-05, - "loss": 0.2835, + "epoch": 3.0305979024759435, + "grad_norm": 0.21327096223831177, + "learning_rate": 1.774518689811545e-05, + "loss": 0.3731, "step": 84090 }, { - "epoch": 2.96, - "learning_rate": 1.886588604106049e-05, - "loss": 0.2811, + "epoch": 3.030778102137168, + "grad_norm": 0.19363613426685333, + "learning_rate": 1.7742394386188255e-05, + "loss": 0.3753, "step": 84095 }, { - "epoch": 2.96, - "learning_rate": 1.8863124442477453e-05, - "loss": 0.2649, + "epoch": 3.0309583017983925, + "grad_norm": 0.19046692550182343, + "learning_rate": 1.7739601973144547e-05, + "loss": 0.3963, "step": 84100 }, { - "epoch": 2.96, - "learning_rate": 1.8860362923577694e-05, - "loss": 0.2909, + "epoch": 3.0311385014596173, + "grad_norm": 0.2194364368915558, + "learning_rate": 1.7736809659022354e-05, + "loss": 0.374, "step": 84105 }, { - "epoch": 2.96, - "learning_rate": 1.8857601484397082e-05, - "loss": 0.2592, + "epoch": 3.031318701120842, + "grad_norm": 0.1966681331396103, + "learning_rate": 1.7734017443859715e-05, + "loss": 0.3419, "step": 84110 }, { - "epoch": 2.96, - "learning_rate": 1.885484012497146e-05, - "loss": 
0.2774, + "epoch": 3.0314989007820667, + "grad_norm": 0.2259567826986313, + "learning_rate": 1.7731225327694688e-05, + "loss": 0.3931, "step": 84115 }, { - "epoch": 2.96, - "learning_rate": 1.8852078845336694e-05, - "loss": 0.2713, + "epoch": 3.031679100443291, + "grad_norm": 0.21452829241752625, + "learning_rate": 1.7728433310565302e-05, + "loss": 0.3811, "step": 84120 }, { - "epoch": 2.96, - "learning_rate": 1.8849317645528636e-05, - "loss": 0.2646, + "epoch": 3.0318593001045158, + "grad_norm": 0.20977729558944702, + "learning_rate": 1.7725641392509614e-05, + "loss": 0.3805, "step": 84125 }, { - "epoch": 2.96, - "learning_rate": 1.8846556525583127e-05, - "loss": 0.2542, + "epoch": 3.0320394997657405, + "grad_norm": 0.2387772798538208, + "learning_rate": 1.772284957356565e-05, + "loss": 0.3516, "step": 84130 }, { - "epoch": 2.96, - "learning_rate": 1.8843795485536027e-05, - "loss": 0.2764, + "epoch": 3.0322196994269652, + "grad_norm": 0.21212005615234375, + "learning_rate": 1.772005785377145e-05, + "loss": 0.3751, "step": 84135 }, { - "epoch": 2.96, - "learning_rate": 1.884103452542319e-05, - "loss": 0.2772, + "epoch": 3.0323998990881895, + "grad_norm": 0.22513705492019653, + "learning_rate": 1.7717266233165054e-05, + "loss": 0.3696, "step": 84140 }, { - "epoch": 2.96, - "learning_rate": 1.883827364528046e-05, - "loss": 0.281, + "epoch": 3.0325800987494143, + "grad_norm": 0.25777748227119446, + "learning_rate": 1.7714474711784496e-05, + "loss": 0.3954, "step": 84145 }, { - "epoch": 2.96, - "learning_rate": 1.883551284514368e-05, - "loss": 0.2541, + "epoch": 3.032760298410639, + "grad_norm": 0.20755411684513092, + "learning_rate": 1.77116832896678e-05, + "loss": 0.3365, "step": 84150 }, { - "epoch": 2.96, - "learning_rate": 1.8832752125048713e-05, - "loss": 0.2678, + "epoch": 3.0329404980718637, + "grad_norm": 0.18738418817520142, + "learning_rate": 1.770889196685302e-05, + "loss": 0.3949, "step": 84155 }, { - "epoch": 2.96, - "learning_rate": 1.8829991485031387e-05, - "loss": 0.2587, + "epoch": 3.0331206977330885, + "grad_norm": 0.21156027913093567, + "learning_rate": 1.7706100743378168e-05, + "loss": 0.3566, "step": 84160 }, { - "epoch": 2.96, - "learning_rate": 1.8827230925127566e-05, - "loss": 0.2556, + "epoch": 3.0333008973943127, + "grad_norm": 0.28246161341667175, + "learning_rate": 1.770330961928129e-05, + "loss": 0.3625, "step": 84165 }, { - "epoch": 2.96, - "learning_rate": 1.8824470445373065e-05, - "loss": 0.2801, + "epoch": 3.0334810970555375, + "grad_norm": 0.22130875289440155, + "learning_rate": 1.77005185946004e-05, + "loss": 0.3402, "step": 84170 }, { - "epoch": 2.96, - "learning_rate": 1.882171004580376e-05, - "loss": 0.2753, + "epoch": 3.033661296716762, + "grad_norm": 0.22155120968818665, + "learning_rate": 1.7697727669373525e-05, + "loss": 0.375, "step": 84175 }, { - "epoch": 2.96, - "learning_rate": 1.881894972645548e-05, - "loss": 0.2691, + "epoch": 3.033841496377987, + "grad_norm": 0.22286799550056458, + "learning_rate": 1.7694936843638707e-05, + "loss": 0.3858, "step": 84180 }, { - "epoch": 2.96, - "learning_rate": 1.8816189487364056e-05, - "loss": 0.2618, + "epoch": 3.0340216960392112, + "grad_norm": 0.1948540061712265, + "learning_rate": 1.7692146117433965e-05, + "loss": 0.3905, "step": 84185 }, { - "epoch": 2.96, - "learning_rate": 1.8813429328565342e-05, - "loss": 0.244, + "epoch": 3.034201895700436, + "grad_norm": 0.228069469332695, + "learning_rate": 1.76893554907973e-05, + "loss": 0.41, "step": 84190 }, { - "epoch": 2.96, - "learning_rate": 
1.8810669250095163e-05, - "loss": 0.2633, + "epoch": 3.0343820953616607, + "grad_norm": 0.21206477284431458, + "learning_rate": 1.7686564963766765e-05, + "loss": 0.3649, "step": 84195 }, { - "epoch": 2.96, - "learning_rate": 1.8807909251989376e-05, - "loss": 0.2525, + "epoch": 3.0345622950228854, + "grad_norm": 0.2614726722240448, + "learning_rate": 1.768377453638035e-05, + "loss": 0.377, "step": 84200 }, { - "epoch": 2.96, - "learning_rate": 1.8805149334283807e-05, - "loss": 0.2616, + "epoch": 3.03474249468411, + "grad_norm": 0.21512839198112488, + "learning_rate": 1.768098420867611e-05, + "loss": 0.3931, "step": 84205 }, { - "epoch": 2.96, - "learning_rate": 1.880238949701428e-05, - "loss": 0.2629, + "epoch": 3.0349226943453345, + "grad_norm": 0.22823360562324524, + "learning_rate": 1.7678193980692036e-05, + "loss": 0.368, "step": 84210 }, { - "epoch": 2.96, - "learning_rate": 1.8799629740216643e-05, - "loss": 0.2652, + "epoch": 3.035102894006559, + "grad_norm": 0.2092510610818863, + "learning_rate": 1.767540385246615e-05, + "loss": 0.3904, "step": 84215 }, { - "epoch": 2.96, - "learning_rate": 1.8796870063926733e-05, - "loss": 0.2691, + "epoch": 3.035283093667784, + "grad_norm": 0.22960545122623444, + "learning_rate": 1.767261382403647e-05, + "loss": 0.4228, "step": 84220 }, { - "epoch": 2.96, - "learning_rate": 1.8794110468180383e-05, - "loss": 0.2754, + "epoch": 3.0354632933290087, + "grad_norm": 0.2698182761669159, + "learning_rate": 1.7669823895441007e-05, + "loss": 0.3652, "step": 84225 }, { - "epoch": 2.96, - "learning_rate": 1.8791350953013402e-05, - "loss": 0.281, + "epoch": 3.0356434929902334, + "grad_norm": 0.27109652757644653, + "learning_rate": 1.7667034066717768e-05, + "loss": 0.3765, "step": 84230 }, { - "epoch": 2.96, - "learning_rate": 1.8788591518461647e-05, - "loss": 0.2688, + "epoch": 3.0358236926514577, + "grad_norm": 0.178133025765419, + "learning_rate": 1.766424433790478e-05, + "loss": 0.3916, "step": 84235 }, { - "epoch": 2.96, - "learning_rate": 1.8785832164560937e-05, - "loss": 0.2692, + "epoch": 3.0360038923126824, + "grad_norm": 0.26019546389579773, + "learning_rate": 1.7661454709040036e-05, + "loss": 0.4084, "step": 84240 }, { - "epoch": 2.96, - "learning_rate": 1.8783072891347105e-05, - "loss": 0.2761, + "epoch": 3.036184091973907, + "grad_norm": 0.21639469265937805, + "learning_rate": 1.7658665180161555e-05, + "loss": 0.3909, "step": 84245 }, { - "epoch": 2.96, - "learning_rate": 1.8780313698855964e-05, - "loss": 0.2574, + "epoch": 3.036364291635132, + "grad_norm": 0.2332758754491806, + "learning_rate": 1.7655875751307338e-05, + "loss": 0.3712, "step": 84250 }, { - "epoch": 2.96, - "learning_rate": 1.877755458712336e-05, - "loss": 0.2711, + "epoch": 3.036544491296356, + "grad_norm": 0.19095510244369507, + "learning_rate": 1.765308642251539e-05, + "loss": 0.3552, "step": 84255 }, { - "epoch": 2.96, - "learning_rate": 1.877479555618511e-05, - "loss": 0.2504, + "epoch": 3.036724690957581, + "grad_norm": 0.18508897721767426, + "learning_rate": 1.765029719382372e-05, + "loss": 0.4014, "step": 84260 }, { - "epoch": 2.96, - "learning_rate": 1.8772036606077025e-05, - "loss": 0.2722, + "epoch": 3.0369048906188056, + "grad_norm": 0.22159694135189056, + "learning_rate": 1.7647508065270335e-05, + "loss": 0.3882, "step": 84265 }, { - "epoch": 2.96, - "learning_rate": 1.8769277736834954e-05, - "loss": 0.2722, + "epoch": 3.0370850902800304, + "grad_norm": 0.22813962399959564, + "learning_rate": 1.764471903689321e-05, + "loss": 0.3698, "step": 84270 }, { - "epoch": 2.97, - 
"learning_rate": 1.87665189484947e-05, - "loss": 0.2855, + "epoch": 3.037265289941255, + "grad_norm": 0.21553412079811096, + "learning_rate": 1.7641930108730377e-05, + "loss": 0.3682, "step": 84275 }, { - "epoch": 2.97, - "learning_rate": 1.876376024109209e-05, - "loss": 0.2642, + "epoch": 3.0374454896024794, + "grad_norm": 0.2294461727142334, + "learning_rate": 1.7639141280819815e-05, + "loss": 0.3852, "step": 84280 }, { - "epoch": 2.97, - "learning_rate": 1.8761001614662938e-05, - "loss": 0.2801, + "epoch": 3.037625689263704, + "grad_norm": 0.240443617105484, + "learning_rate": 1.7636352553199537e-05, + "loss": 0.3859, "step": 84285 }, { - "epoch": 2.97, - "learning_rate": 1.8758243069243077e-05, - "loss": 0.2736, + "epoch": 3.037805888924929, + "grad_norm": 0.21389326453208923, + "learning_rate": 1.7633563925907533e-05, + "loss": 0.36, "step": 84290 }, { - "epoch": 2.97, - "learning_rate": 1.8755484604868315e-05, - "loss": 0.2919, + "epoch": 3.0379860885861536, + "grad_norm": 0.21817508339881897, + "learning_rate": 1.7630775398981777e-05, + "loss": 0.4133, "step": 84295 }, { - "epoch": 2.97, - "learning_rate": 1.8752726221574466e-05, - "loss": 0.2704, + "epoch": 3.038166288247378, + "grad_norm": 0.2270621359348297, + "learning_rate": 1.7627986972460298e-05, + "loss": 0.4165, "step": 84300 }, { - "epoch": 2.97, - "learning_rate": 1.8749967919397342e-05, - "loss": 0.2514, + "epoch": 3.0383464879086026, + "grad_norm": 0.24196819961071014, + "learning_rate": 1.762519864638106e-05, + "loss": 0.393, "step": 84305 }, { - "epoch": 2.97, - "learning_rate": 1.874720969837278e-05, - "loss": 0.2576, + "epoch": 3.0385266875698274, + "grad_norm": 0.2399672567844391, + "learning_rate": 1.7622410420782058e-05, + "loss": 0.4056, "step": 84310 }, { - "epoch": 2.97, - "learning_rate": 1.874445155853658e-05, - "loss": 0.2626, + "epoch": 3.038706887231052, + "grad_norm": 0.23226651549339294, + "learning_rate": 1.7619622295701296e-05, + "loss": 0.3552, "step": 84315 }, { - "epoch": 2.97, - "learning_rate": 1.8741693499924546e-05, - "loss": 0.269, + "epoch": 3.038887086892277, + "grad_norm": 0.2692866027355194, + "learning_rate": 1.761683427117674e-05, + "loss": 0.3849, "step": 84320 }, { - "epoch": 2.97, - "learning_rate": 1.8738935522572498e-05, - "loss": 0.2554, + "epoch": 3.039067286553501, + "grad_norm": 0.2093464732170105, + "learning_rate": 1.76140463472464e-05, + "loss": 0.3731, "step": 84325 }, { - "epoch": 2.97, - "learning_rate": 1.8736177626516246e-05, - "loss": 0.266, + "epoch": 3.039247486214726, + "grad_norm": 0.227401465177536, + "learning_rate": 1.7611258523948242e-05, + "loss": 0.3783, "step": 84330 }, { - "epoch": 2.97, - "learning_rate": 1.8733419811791604e-05, - "loss": 0.2702, + "epoch": 3.0394276858759506, + "grad_norm": 0.212887704372406, + "learning_rate": 1.7608470801320253e-05, + "loss": 0.354, "step": 84335 }, { - "epoch": 2.97, - "learning_rate": 1.873066207843438e-05, - "loss": 0.2851, + "epoch": 3.0396078855371753, + "grad_norm": 0.2043050229549408, + "learning_rate": 1.7605683179400423e-05, + "loss": 0.3788, "step": 84340 }, { - "epoch": 2.97, - "learning_rate": 1.8727904426480362e-05, - "loss": 0.2812, + "epoch": 3.0397880851984, + "grad_norm": 0.24479785561561584, + "learning_rate": 1.7602895658226725e-05, + "loss": 0.3753, "step": 84345 }, { - "epoch": 2.97, - "learning_rate": 1.8725146855965385e-05, - "loss": 0.2573, + "epoch": 3.0399682848596243, + "grad_norm": 0.2218136191368103, + "learning_rate": 1.7600108237837144e-05, + "loss": 0.4084, "step": 84350 }, { - "epoch": 
2.97, - "learning_rate": 1.8722389366925232e-05, - "loss": 0.2504, + "epoch": 3.040148484520849, + "grad_norm": 0.23123931884765625, + "learning_rate": 1.7597320918269658e-05, + "loss": 0.4059, "step": 84355 }, { - "epoch": 2.97, - "learning_rate": 1.871963195939572e-05, - "loss": 0.2567, + "epoch": 3.040328684182074, + "grad_norm": 0.2367209494113922, + "learning_rate": 1.7594533699562234e-05, + "loss": 0.3901, "step": 84360 }, { - "epoch": 2.97, - "learning_rate": 1.8716874633412647e-05, - "loss": 0.2856, + "epoch": 3.0405088838432985, + "grad_norm": 0.22968047857284546, + "learning_rate": 1.759174658175286e-05, + "loss": 0.3823, "step": 84365 }, { - "epoch": 2.97, - "learning_rate": 1.871411738901182e-05, - "loss": 0.2759, + "epoch": 3.040689083504523, + "grad_norm": 0.23370803892612457, + "learning_rate": 1.758895956487951e-05, + "loss": 0.384, "step": 84370 }, { - "epoch": 2.97, - "learning_rate": 1.8711360226229035e-05, - "loss": 0.2461, + "epoch": 3.0408692831657476, + "grad_norm": 0.19679541885852814, + "learning_rate": 1.758617264898014e-05, + "loss": 0.3785, "step": 84375 }, { - "epoch": 2.97, - "learning_rate": 1.8708603145100085e-05, - "loss": 0.2644, + "epoch": 3.0410494828269723, + "grad_norm": 0.2017953097820282, + "learning_rate": 1.758338583409274e-05, + "loss": 0.3986, "step": 84380 }, { - "epoch": 2.97, - "learning_rate": 1.8705846145660782e-05, - "loss": 0.2525, + "epoch": 3.041229682488197, + "grad_norm": 0.1906876564025879, + "learning_rate": 1.7580599120255263e-05, + "loss": 0.3619, "step": 84385 }, { - "epoch": 2.97, - "learning_rate": 1.8703089227946924e-05, - "loss": 0.274, + "epoch": 3.0414098821494218, + "grad_norm": 0.1913730502128601, + "learning_rate": 1.75778125075057e-05, + "loss": 0.3926, "step": 84390 }, { - "epoch": 2.97, - "learning_rate": 1.87003323919943e-05, - "loss": 0.2611, + "epoch": 3.041590081810646, + "grad_norm": 0.24644358456134796, + "learning_rate": 1.7575025995881998e-05, + "loss": 0.3757, "step": 84395 }, { - "epoch": 2.97, - "learning_rate": 1.8697575637838703e-05, - "loss": 0.2785, + "epoch": 3.041770281471871, + "grad_norm": 0.2385040670633316, + "learning_rate": 1.7572239585422116e-05, + "loss": 0.4107, "step": 84400 }, { - "epoch": 2.97, - "learning_rate": 1.8694818965515938e-05, - "loss": 0.2823, + "epoch": 3.0419504811330955, + "grad_norm": 0.18101677298545837, + "learning_rate": 1.7569453276164054e-05, + "loss": 0.387, "step": 84405 }, { - "epoch": 2.97, - "learning_rate": 1.869206237506179e-05, - "loss": 0.2583, + "epoch": 3.0421306807943203, + "grad_norm": 0.21769458055496216, + "learning_rate": 1.756666706814574e-05, + "loss": 0.4045, "step": 84410 }, { - "epoch": 2.97, - "learning_rate": 1.8689305866512063e-05, - "loss": 0.2773, + "epoch": 3.0423108804555445, + "grad_norm": 0.2499932199716568, + "learning_rate": 1.7563880961405148e-05, + "loss": 0.3898, "step": 84415 }, { - "epoch": 2.97, - "learning_rate": 1.8686549439902528e-05, - "loss": 0.2528, + "epoch": 3.0424910801167693, + "grad_norm": 0.24783965945243835, + "learning_rate": 1.7561094955980247e-05, + "loss": 0.3607, "step": 84420 }, { - "epoch": 2.97, - "learning_rate": 1.8683793095268997e-05, - "loss": 0.2625, + "epoch": 3.042671279777994, + "grad_norm": 0.21402627229690552, + "learning_rate": 1.7558309051908976e-05, + "loss": 0.3521, "step": 84425 }, { - "epoch": 2.97, - "learning_rate": 1.8681036832647253e-05, - "loss": 0.274, + "epoch": 3.0428514794392187, + "grad_norm": 0.21035562455654144, + "learning_rate": 1.7555523249229312e-05, + "loss": 0.4272, "step": 84430 
}, { - "epoch": 2.97, - "learning_rate": 1.8678280652073074e-05, - "loss": 0.2494, + "epoch": 3.0430316791004435, + "grad_norm": 0.2314184308052063, + "learning_rate": 1.7552737547979197e-05, + "loss": 0.3946, "step": 84435 }, { - "epoch": 2.97, - "learning_rate": 1.867552455358225e-05, - "loss": 0.272, + "epoch": 3.0432118787616678, + "grad_norm": 0.18554693460464478, + "learning_rate": 1.754995194819659e-05, + "loss": 0.3658, "step": 84440 }, { - "epoch": 2.97, - "learning_rate": 1.8672768537210583e-05, - "loss": 0.2845, + "epoch": 3.0433920784228925, + "grad_norm": 0.20421580970287323, + "learning_rate": 1.754716644991945e-05, + "loss": 0.3572, "step": 84445 }, { - "epoch": 2.97, - "learning_rate": 1.8670012602993843e-05, - "loss": 0.2485, + "epoch": 3.0435722780841172, + "grad_norm": 0.29508501291275024, + "learning_rate": 1.7544381053185723e-05, + "loss": 0.4097, "step": 84450 }, { - "epoch": 2.97, - "learning_rate": 1.866725675096782e-05, - "loss": 0.267, + "epoch": 3.043752477745342, + "grad_norm": 0.24256087839603424, + "learning_rate": 1.7541595758033357e-05, + "loss": 0.3888, "step": 84455 }, { - "epoch": 2.97, - "learning_rate": 1.8664500981168286e-05, - "loss": 0.3005, + "epoch": 3.0439326774065663, + "grad_norm": 0.17925018072128296, + "learning_rate": 1.753881056450031e-05, + "loss": 0.3381, "step": 84460 }, { - "epoch": 2.97, - "learning_rate": 1.8661745293631036e-05, - "loss": 0.2624, + "epoch": 3.044112877067791, + "grad_norm": 0.2275160253047943, + "learning_rate": 1.7536025472624518e-05, + "loss": 0.4123, "step": 84465 }, { - "epoch": 2.97, - "learning_rate": 1.865898968839185e-05, - "loss": 0.255, + "epoch": 3.0442930767290157, + "grad_norm": 0.2509187161922455, + "learning_rate": 1.7533240482443942e-05, + "loss": 0.3866, "step": 84470 }, { - "epoch": 2.97, - "learning_rate": 1.8656234165486506e-05, - "loss": 0.27, + "epoch": 3.0444732763902405, + "grad_norm": 0.231712207198143, + "learning_rate": 1.753045559399652e-05, + "loss": 0.4006, "step": 84475 }, { - "epoch": 2.97, - "learning_rate": 1.865347872495077e-05, - "loss": 0.2593, + "epoch": 3.044653476051465, + "grad_norm": 0.23315437138080597, + "learning_rate": 1.7527670807320183e-05, + "loss": 0.4009, "step": 84480 }, { - "epoch": 2.97, - "learning_rate": 1.865072336682044e-05, - "loss": 0.2904, + "epoch": 3.0448336757126895, + "grad_norm": 0.24659566581249237, + "learning_rate": 1.7524886122452897e-05, + "loss": 0.4056, "step": 84485 }, { - "epoch": 2.97, - "learning_rate": 1.8647968091131273e-05, - "loss": 0.2726, + "epoch": 3.045013875373914, + "grad_norm": 0.22627463936805725, + "learning_rate": 1.7522101539432582e-05, + "loss": 0.3803, "step": 84490 }, { - "epoch": 2.97, - "learning_rate": 1.8645212897919067e-05, - "loss": 0.261, + "epoch": 3.045194075035139, + "grad_norm": 0.2701631486415863, + "learning_rate": 1.7519317058297188e-05, + "loss": 0.3936, "step": 84495 }, { - "epoch": 2.97, - "learning_rate": 1.864245778721957e-05, - "loss": 0.2694, + "epoch": 3.0453742746963637, + "grad_norm": 0.23009376227855682, + "learning_rate": 1.7516532679084652e-05, + "loss": 0.3687, "step": 84500 }, { - "epoch": 2.97, - "eval_loss": 0.2578907310962677, - "eval_runtime": 10.553, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 3.0453742746963637, + "eval_loss": 0.43199047446250916, + "eval_runtime": 3.5354, + "eval_samples_per_second": 28.286, + "eval_steps_per_second": 7.071, "step": 84500 }, { - "epoch": 2.97, - "learning_rate": 1.8639702759068578e-05, - "loss": 0.2505, + "epoch": 
3.0455544743575884, + "grad_norm": 0.24204762279987335, + "learning_rate": 1.7513748401832904e-05, + "loss": 0.3531, "step": 84505 }, { - "epoch": 2.97, - "learning_rate": 1.8636947813501853e-05, - "loss": 0.2453, + "epoch": 3.0457346740188127, + "grad_norm": 0.27944275736808777, + "learning_rate": 1.751096422657989e-05, + "loss": 0.3562, "step": 84510 }, { - "epoch": 2.97, - "learning_rate": 1.8634192950555163e-05, - "loss": 0.2729, + "epoch": 3.0459148736800374, + "grad_norm": 0.2434234917163849, + "learning_rate": 1.750818015336354e-05, + "loss": 0.406, "step": 84515 }, { - "epoch": 2.97, - "learning_rate": 1.863143817026428e-05, - "loss": 0.2606, + "epoch": 3.046095073341262, + "grad_norm": 0.18763190507888794, + "learning_rate": 1.7505396182221777e-05, + "loss": 0.3698, "step": 84520 }, { - "epoch": 2.97, - "learning_rate": 1.8628683472664988e-05, - "loss": 0.2258, + "epoch": 3.046275273002487, + "grad_norm": 0.21916723251342773, + "learning_rate": 1.750261231319255e-05, + "loss": 0.4321, "step": 84525 }, { - "epoch": 2.97, - "learning_rate": 1.8625928857793034e-05, - "loss": 0.2844, + "epoch": 3.046455472663711, + "grad_norm": 0.22559642791748047, + "learning_rate": 1.749982854631377e-05, + "loss": 0.3608, "step": 84530 }, { - "epoch": 2.97, - "learning_rate": 1.8623725225483194e-05, - "loss": 0.2798, + "epoch": 3.046635672324936, + "grad_norm": 0.20946092903614044, + "learning_rate": 1.749704488162338e-05, + "loss": 0.3531, "step": 84535 }, { - "epoch": 2.97, - "learning_rate": 1.8620970759610597e-05, - "loss": 0.2855, + "epoch": 3.0468158719861607, + "grad_norm": 0.2142314463853836, + "learning_rate": 1.74942613191593e-05, + "loss": 0.3761, "step": 84540 }, { - "epoch": 2.97, - "learning_rate": 1.861821637656549e-05, - "loss": 0.2655, + "epoch": 3.0469960716473854, + "grad_norm": 0.18263986706733704, + "learning_rate": 1.7491477858959453e-05, + "loss": 0.3924, "step": 84545 }, { - "epoch": 2.97, - "learning_rate": 1.8615462076383633e-05, - "loss": 0.279, + "epoch": 3.04717627130861, + "grad_norm": 0.19309696555137634, + "learning_rate": 1.748869450106177e-05, + "loss": 0.3453, "step": 84550 }, { - "epoch": 2.97, - "learning_rate": 1.8612707859100796e-05, - "loss": 0.2723, + "epoch": 3.0473564709698344, + "grad_norm": 0.2552222013473511, + "learning_rate": 1.7485911245504173e-05, + "loss": 0.3855, "step": 84555 }, { - "epoch": 2.98, - "learning_rate": 1.8609953724752745e-05, - "loss": 0.2618, + "epoch": 3.047536670631059, + "grad_norm": 0.2853567600250244, + "learning_rate": 1.7483128092324567e-05, + "loss": 0.4424, "step": 84560 }, { - "epoch": 2.98, - "learning_rate": 1.860719967337523e-05, - "loss": 0.2769, + "epoch": 3.047716870292284, + "grad_norm": 0.2120617926120758, + "learning_rate": 1.74803450415609e-05, + "loss": 0.3929, "step": 84565 }, { - "epoch": 2.98, - "learning_rate": 1.860444570500401e-05, - "loss": 0.276, + "epoch": 3.0478970699535086, + "grad_norm": 0.2228347659111023, + "learning_rate": 1.7477562093251066e-05, + "loss": 0.3744, "step": 84570 }, { - "epoch": 2.98, - "learning_rate": 1.8601691819674847e-05, - "loss": 0.2715, + "epoch": 3.048077269614733, + "grad_norm": 0.2142227590084076, + "learning_rate": 1.7474779247432998e-05, + "loss": 0.4072, "step": 84575 }, { - "epoch": 2.98, - "learning_rate": 1.8598938017423505e-05, - "loss": 0.2972, + "epoch": 3.0482574692759576, + "grad_norm": 0.23196277022361755, + "learning_rate": 1.7471996504144612e-05, + "loss": 0.3609, "step": 84580 }, { - "epoch": 2.98, - "learning_rate": 1.8596184298285725e-05, - "loss": 
0.2717, + "epoch": 3.0484376689371824, + "grad_norm": 0.20943579077720642, + "learning_rate": 1.74692138634238e-05, + "loss": 0.3989, "step": 84585 }, { - "epoch": 2.98, - "learning_rate": 1.859343066229728e-05, - "loss": 0.2735, + "epoch": 3.048617868598407, + "grad_norm": 0.28706827759742737, + "learning_rate": 1.7466431325308507e-05, + "loss": 0.3669, "step": 84590 }, { - "epoch": 2.98, - "learning_rate": 1.8590677109493918e-05, - "loss": 0.2694, + "epoch": 3.048798068259632, + "grad_norm": 0.20854423940181732, + "learning_rate": 1.7463648889836627e-05, + "loss": 0.3551, "step": 84595 }, { - "epoch": 2.98, - "learning_rate": 1.858792363991139e-05, - "loss": 0.2491, + "epoch": 3.048978267920856, + "grad_norm": 0.20572051405906677, + "learning_rate": 1.746086655704606e-05, + "loss": 0.3609, "step": 84600 }, { - "epoch": 2.98, - "learning_rate": 1.8585170253585436e-05, - "loss": 0.2665, + "epoch": 3.049158467582081, + "grad_norm": 0.1955312341451645, + "learning_rate": 1.7458084326974732e-05, + "loss": 0.3764, "step": 84605 }, { - "epoch": 2.98, - "learning_rate": 1.8582416950551823e-05, - "loss": 0.2561, + "epoch": 3.0493386672433056, + "grad_norm": 0.1977349817752838, + "learning_rate": 1.7455302199660544e-05, + "loss": 0.3836, "step": 84610 }, { - "epoch": 2.98, - "learning_rate": 1.8579663730846305e-05, - "loss": 0.2626, + "epoch": 3.0495188669045303, + "grad_norm": 0.2958627939224243, + "learning_rate": 1.7452520175141406e-05, + "loss": 0.4121, "step": 84615 }, { - "epoch": 2.98, - "learning_rate": 1.857691059450462e-05, - "loss": 0.2642, + "epoch": 3.049699066565755, + "grad_norm": 0.20005503296852112, + "learning_rate": 1.7449738253455223e-05, + "loss": 0.3998, "step": 84620 }, { - "epoch": 2.98, - "learning_rate": 1.857415754156251e-05, - "loss": 0.2656, + "epoch": 3.0498792662269794, + "grad_norm": 0.2881629168987274, + "learning_rate": 1.7446956434639884e-05, + "loss": 0.4127, "step": 84625 }, { - "epoch": 2.98, - "learning_rate": 1.857140457205574e-05, - "loss": 0.2666, + "epoch": 3.050059465888204, + "grad_norm": 0.2254873812198639, + "learning_rate": 1.7444174718733308e-05, + "loss": 0.4088, "step": 84630 }, { - "epoch": 2.98, - "learning_rate": 1.8568651686020037e-05, - "loss": 0.288, + "epoch": 3.050239665549429, + "grad_norm": 0.21899674832820892, + "learning_rate": 1.744139310577339e-05, + "loss": 0.3857, "step": 84635 }, { - "epoch": 2.98, - "learning_rate": 1.856589888349116e-05, - "loss": 0.2713, + "epoch": 3.0504198652106536, + "grad_norm": 0.23545971512794495, + "learning_rate": 1.743861159579802e-05, + "loss": 0.3672, "step": 84640 }, { - "epoch": 2.98, - "learning_rate": 1.856314616450483e-05, - "loss": 0.2631, + "epoch": 3.050600064871878, + "grad_norm": 0.23571446537971497, + "learning_rate": 1.743583018884511e-05, + "loss": 0.3931, "step": 84645 }, { - "epoch": 2.98, - "learning_rate": 1.8560393529096827e-05, - "loss": 0.2697, + "epoch": 3.0507802645331026, + "grad_norm": 0.23098576068878174, + "learning_rate": 1.7433048884952548e-05, + "loss": 0.3968, "step": 84650 }, { - "epoch": 2.98, - "learning_rate": 1.855764097730286e-05, - "loss": 0.2666, + "epoch": 3.0509604641943273, + "grad_norm": 0.2752792537212372, + "learning_rate": 1.7430267684158226e-05, + "loss": 0.3622, "step": 84655 }, { - "epoch": 2.98, - "learning_rate": 1.855488850915868e-05, - "loss": 0.2862, + "epoch": 3.051140663855552, + "grad_norm": 0.2383970022201538, + "learning_rate": 1.7427486586500053e-05, + "loss": 0.4099, "step": 84660 }, { - "epoch": 2.98, - "learning_rate": 
1.8552136124700014e-05, - "loss": 0.264, + "epoch": 3.051320863516777, + "grad_norm": 0.22088317573070526, + "learning_rate": 1.742470559201589e-05, + "loss": 0.382, "step": 84665 }, { - "epoch": 2.98, - "learning_rate": 1.8549383823962625e-05, - "loss": 0.2594, + "epoch": 3.051501063178001, + "grad_norm": 0.2405635267496109, + "learning_rate": 1.7421924700743668e-05, + "loss": 0.3674, "step": 84670 }, { - "epoch": 2.98, - "learning_rate": 1.854663160698224e-05, - "loss": 0.2513, + "epoch": 3.051681262839226, + "grad_norm": 0.2541629672050476, + "learning_rate": 1.741914391272124e-05, + "loss": 0.3796, "step": 84675 }, { - "epoch": 2.98, - "learning_rate": 1.8543879473794584e-05, - "loss": 0.2686, + "epoch": 3.0518614625004505, + "grad_norm": 0.18502090871334076, + "learning_rate": 1.7416363227986507e-05, + "loss": 0.4146, "step": 84680 }, { - "epoch": 2.98, - "learning_rate": 1.854112742443539e-05, - "loss": 0.2723, + "epoch": 3.0520416621616753, + "grad_norm": 0.20814743638038635, + "learning_rate": 1.741358264657737e-05, + "loss": 0.3825, "step": 84685 }, { - "epoch": 2.98, - "learning_rate": 1.8538375458940408e-05, - "loss": 0.2528, + "epoch": 3.0522218618228996, + "grad_norm": 0.22809316217899323, + "learning_rate": 1.7410802168531684e-05, + "loss": 0.4106, "step": 84690 }, { - "epoch": 2.98, - "learning_rate": 1.8535623577345364e-05, - "loss": 0.285, + "epoch": 3.0524020614841243, + "grad_norm": 0.2442580610513687, + "learning_rate": 1.7408021793887363e-05, + "loss": 0.3786, "step": 84695 }, { - "epoch": 2.98, - "learning_rate": 1.853287177968598e-05, - "loss": 0.2968, + "epoch": 3.052582261145349, + "grad_norm": 0.2511539161205292, + "learning_rate": 1.7405241522682276e-05, + "loss": 0.4042, "step": 84700 }, { - "epoch": 2.98, - "learning_rate": 1.8530120065998004e-05, - "loss": 0.2695, + "epoch": 3.0527624608065738, + "grad_norm": 0.20690898597240448, + "learning_rate": 1.740246135495429e-05, + "loss": 0.3782, "step": 84705 }, { - "epoch": 2.98, - "learning_rate": 1.852736843631715e-05, - "loss": 0.2702, + "epoch": 3.0529426604677985, + "grad_norm": 0.20105348527431488, + "learning_rate": 1.7399681290741308e-05, + "loss": 0.3946, "step": 84710 }, { - "epoch": 2.98, - "learning_rate": 1.852461689067915e-05, - "loss": 0.2808, + "epoch": 3.053122860129023, + "grad_norm": 0.2233029156923294, + "learning_rate": 1.739690133008119e-05, + "loss": 0.3783, "step": 84715 }, { - "epoch": 2.98, - "learning_rate": 1.852186542911973e-05, - "loss": 0.2427, + "epoch": 3.0533030597902475, + "grad_norm": 0.2557271122932434, + "learning_rate": 1.7394121473011825e-05, + "loss": 0.3672, "step": 84720 }, { - "epoch": 2.98, - "learning_rate": 1.8519114051674626e-05, - "loss": 0.2538, + "epoch": 3.0534832594514723, + "grad_norm": 0.2504826784133911, + "learning_rate": 1.739134171957108e-05, + "loss": 0.3653, "step": 84725 }, { - "epoch": 2.98, - "learning_rate": 1.8516362758379552e-05, - "loss": 0.2875, + "epoch": 3.053663459112697, + "grad_norm": 0.19489239156246185, + "learning_rate": 1.7388562069796827e-05, + "loss": 0.3771, "step": 84730 }, { - "epoch": 2.98, - "learning_rate": 1.851361154927024e-05, - "loss": 0.2583, + "epoch": 3.0538436587739217, + "grad_norm": 0.18892985582351685, + "learning_rate": 1.738578252372695e-05, + "loss": 0.3507, "step": 84735 }, { - "epoch": 2.98, - "learning_rate": 1.8510860424382392e-05, - "loss": 0.2762, + "epoch": 3.054023858435146, + "grad_norm": 0.2819010317325592, + "learning_rate": 1.7383003081399308e-05, + "loss": 0.3892, "step": 84740 }, { - "epoch": 2.98, - 
"learning_rate": 1.8508109383751754e-05, - "loss": 0.2963, + "epoch": 3.0542040580963707, + "grad_norm": 0.22074325382709503, + "learning_rate": 1.738022374285177e-05, + "loss": 0.3679, "step": 84745 }, { - "epoch": 2.98, - "learning_rate": 1.8505358427414037e-05, - "loss": 0.258, + "epoch": 3.0543842577575955, + "grad_norm": 0.23581378161907196, + "learning_rate": 1.7377444508122215e-05, + "loss": 0.3676, "step": 84750 }, { - "epoch": 2.98, - "learning_rate": 1.8502607555404966e-05, - "loss": 0.2605, + "epoch": 3.05456445741882, + "grad_norm": 0.23171649873256683, + "learning_rate": 1.73746653772485e-05, + "loss": 0.3905, "step": 84755 }, { - "epoch": 2.98, - "learning_rate": 1.8499856767760243e-05, - "loss": 0.2667, + "epoch": 3.0547446570800445, + "grad_norm": 0.22440069913864136, + "learning_rate": 1.7371886350268494e-05, + "loss": 0.3745, "step": 84760 }, { - "epoch": 2.98, - "learning_rate": 1.8497106064515605e-05, - "loss": 0.2566, + "epoch": 3.0549248567412692, + "grad_norm": 0.17826688289642334, + "learning_rate": 1.7369107427220066e-05, + "loss": 0.3795, "step": 84765 }, { - "epoch": 2.98, - "learning_rate": 1.8494355445706755e-05, - "loss": 0.2565, + "epoch": 3.055105056402494, + "grad_norm": 0.2648460268974304, + "learning_rate": 1.7366328608141057e-05, + "loss": 0.3735, "step": 84770 }, { - "epoch": 2.98, - "learning_rate": 1.8491604911369418e-05, - "loss": 0.2795, + "epoch": 3.0552852560637187, + "grad_norm": 0.18997584283351898, + "learning_rate": 1.7363549893069355e-05, + "loss": 0.3955, "step": 84775 }, { - "epoch": 2.98, - "learning_rate": 1.848885446153929e-05, - "loss": 0.2754, + "epoch": 3.0554654557249434, + "grad_norm": 0.18987253308296204, + "learning_rate": 1.7360771282042807e-05, + "loss": 0.383, "step": 84780 }, { - "epoch": 2.98, - "learning_rate": 1.8486104096252104e-05, - "loss": 0.2819, + "epoch": 3.0556456553861677, + "grad_norm": 0.23954762518405914, + "learning_rate": 1.7357992775099264e-05, + "loss": 0.407, "step": 84785 }, { - "epoch": 2.98, - "learning_rate": 1.848335381554357e-05, - "loss": 0.255, + "epoch": 3.0558258550473925, + "grad_norm": 0.19741961359977722, + "learning_rate": 1.7355214372276596e-05, + "loss": 0.3922, "step": 84790 }, { - "epoch": 2.98, - "learning_rate": 1.8480603619449383e-05, - "loss": 0.2634, + "epoch": 3.056006054708617, + "grad_norm": 0.20754849910736084, + "learning_rate": 1.7352436073612644e-05, + "loss": 0.3883, "step": 84795 }, { - "epoch": 2.98, - "learning_rate": 1.8477853508005255e-05, - "loss": 0.2802, + "epoch": 3.056186254369842, + "grad_norm": 0.2479812055826187, + "learning_rate": 1.7349657879145274e-05, + "loss": 0.3505, "step": 84800 }, { - "epoch": 2.98, - "learning_rate": 1.8475103481246912e-05, - "loss": 0.277, + "epoch": 3.056366454031066, + "grad_norm": 0.26793172955513, + "learning_rate": 1.734687978891234e-05, + "loss": 0.4124, "step": 84805 }, { - "epoch": 2.98, - "learning_rate": 1.8472353539210052e-05, - "loss": 0.2462, + "epoch": 3.056546653692291, + "grad_norm": 0.2121279090642929, + "learning_rate": 1.734410180295168e-05, + "loss": 0.3928, "step": 84810 }, { - "epoch": 2.98, - "learning_rate": 1.846960368193037e-05, - "loss": 0.2716, + "epoch": 3.0567268533535157, + "grad_norm": 0.2421426624059677, + "learning_rate": 1.7341323921301154e-05, + "loss": 0.3936, "step": 84815 }, { - "epoch": 2.98, - "learning_rate": 1.8466853909443588e-05, - "loss": 0.2653, + "epoch": 3.0569070530147404, + "grad_norm": 0.21488474309444427, + "learning_rate": 1.733854614399861e-05, + "loss": 0.3913, "step": 84820 }, { - 
"epoch": 2.98, - "learning_rate": 1.84641042217854e-05, - "loss": 0.281, + "epoch": 3.057087252675965, + "grad_norm": 0.18251733481884003, + "learning_rate": 1.733576847108188e-05, + "loss": 0.4056, "step": 84825 }, { - "epoch": 2.98, - "learning_rate": 1.846135461899151e-05, - "loss": 0.2615, + "epoch": 3.0572674523371894, + "grad_norm": 0.25495290756225586, + "learning_rate": 1.733299090258883e-05, + "loss": 0.3676, "step": 84830 }, { - "epoch": 2.98, - "learning_rate": 1.8458605101097618e-05, - "loss": 0.2775, + "epoch": 3.057447651998414, + "grad_norm": 0.23162417113780975, + "learning_rate": 1.733021343855729e-05, + "loss": 0.3768, "step": 84835 }, { - "epoch": 2.98, - "learning_rate": 1.8455855668139434e-05, - "loss": 0.2839, + "epoch": 3.057627851659639, + "grad_norm": 0.20512209832668304, + "learning_rate": 1.7327436079025112e-05, + "loss": 0.3293, "step": 84840 }, { - "epoch": 2.99, - "learning_rate": 1.8453106320152654e-05, - "loss": 0.2763, + "epoch": 3.0578080513208636, + "grad_norm": 0.197612926363945, + "learning_rate": 1.7324658824030133e-05, + "loss": 0.3904, "step": 84845 }, { - "epoch": 2.99, - "learning_rate": 1.8450357057172967e-05, - "loss": 0.2809, + "epoch": 3.0579882509820884, + "grad_norm": 0.29313725233078003, + "learning_rate": 1.7321881673610184e-05, + "loss": 0.396, "step": 84850 }, { - "epoch": 2.99, - "learning_rate": 1.8447607879236078e-05, - "loss": 0.2501, + "epoch": 3.0581684506433127, + "grad_norm": 0.2089136838912964, + "learning_rate": 1.7319104627803117e-05, + "loss": 0.3205, "step": 84855 }, { - "epoch": 2.99, - "learning_rate": 1.8444858786377682e-05, - "loss": 0.2753, + "epoch": 3.0583486503045374, + "grad_norm": 0.25626543164253235, + "learning_rate": 1.7316327686646767e-05, + "loss": 0.3811, "step": 84860 }, { - "epoch": 2.99, - "learning_rate": 1.844210977863348e-05, - "loss": 0.2645, + "epoch": 3.058528849965762, + "grad_norm": 0.2188102900981903, + "learning_rate": 1.731355085017895e-05, + "loss": 0.3782, "step": 84865 }, { - "epoch": 2.99, - "learning_rate": 1.8439360856039163e-05, - "loss": 0.3029, + "epoch": 3.058709049626987, + "grad_norm": 0.2118126004934311, + "learning_rate": 1.731077411843753e-05, + "loss": 0.3928, "step": 84870 }, { - "epoch": 2.99, - "learning_rate": 1.843661201863041e-05, - "loss": 0.2619, + "epoch": 3.058889249288211, + "grad_norm": 0.25317180156707764, + "learning_rate": 1.7307997491460306e-05, + "loss": 0.3693, "step": 84875 }, { - "epoch": 2.99, - "learning_rate": 1.8433863266442937e-05, - "loss": 0.2809, + "epoch": 3.059069448949436, + "grad_norm": 0.2602880299091339, + "learning_rate": 1.7305220969285148e-05, + "loss": 0.3928, "step": 84880 }, { - "epoch": 2.99, - "learning_rate": 1.8431114599512412e-05, - "loss": 0.2983, + "epoch": 3.0592496486106606, + "grad_norm": 0.23756670951843262, + "learning_rate": 1.7302444551949853e-05, + "loss": 0.3839, "step": 84885 }, { - "epoch": 2.99, - "learning_rate": 1.842836601787454e-05, - "loss": 0.2685, + "epoch": 3.0594298482718854, + "grad_norm": 0.2189938724040985, + "learning_rate": 1.729966823949226e-05, + "loss": 0.3825, "step": 84890 }, { - "epoch": 2.99, - "learning_rate": 1.8425617521564998e-05, - "loss": 0.2727, + "epoch": 3.05961004793311, + "grad_norm": 0.24415577948093414, + "learning_rate": 1.72968920319502e-05, + "loss": 0.3731, "step": 84895 }, { - "epoch": 2.99, - "learning_rate": 1.842286911061949e-05, - "loss": 0.2634, + "epoch": 3.0597902475943344, + "grad_norm": 0.1736556589603424, + "learning_rate": 1.7294115929361492e-05, + "loss": 0.3822, "step": 
84900 }, { - "epoch": 2.99, - "learning_rate": 1.8420120785073685e-05, - "loss": 0.2866, + "epoch": 3.059970447255559, + "grad_norm": 0.2100590169429779, + "learning_rate": 1.7291339931763962e-05, + "loss": 0.3883, "step": 84905 }, { - "epoch": 2.99, - "learning_rate": 1.841737254496328e-05, - "loss": 0.2641, + "epoch": 3.060150646916784, + "grad_norm": 0.2625679671764374, + "learning_rate": 1.7288564039195434e-05, + "loss": 0.3733, "step": 84910 }, { - "epoch": 2.99, - "learning_rate": 1.8414624390323946e-05, - "loss": 0.2716, + "epoch": 3.0603308465780086, + "grad_norm": 0.2331903725862503, + "learning_rate": 1.7285788251693723e-05, + "loss": 0.3834, "step": 84915 }, { - "epoch": 2.99, - "learning_rate": 1.8411876321191384e-05, - "loss": 0.2535, + "epoch": 3.060511046239233, + "grad_norm": 0.21731112897396088, + "learning_rate": 1.7283012569296665e-05, + "loss": 0.3721, "step": 84920 }, { - "epoch": 2.99, - "learning_rate": 1.8409128337601267e-05, - "loss": 0.2594, + "epoch": 3.0606912459004576, + "grad_norm": 0.22888799011707306, + "learning_rate": 1.728023699204206e-05, + "loss": 0.3532, "step": 84925 }, { - "epoch": 2.99, - "learning_rate": 1.8406380439589262e-05, - "loss": 0.2455, + "epoch": 3.0608714455616823, + "grad_norm": 0.2064668983221054, + "learning_rate": 1.727746151996773e-05, + "loss": 0.3789, "step": 84930 }, { - "epoch": 2.99, - "learning_rate": 1.8403632627191074e-05, - "loss": 0.2776, + "epoch": 3.061051645222907, + "grad_norm": 0.1944301277399063, + "learning_rate": 1.7274686153111496e-05, + "loss": 0.3762, "step": 84935 }, { - "epoch": 2.99, - "learning_rate": 1.840088490044236e-05, - "loss": 0.2514, + "epoch": 3.061231844884132, + "grad_norm": 0.2432466596364975, + "learning_rate": 1.7271910891511163e-05, + "loss": 0.3953, "step": 84940 }, { - "epoch": 2.99, - "learning_rate": 1.8398137259378816e-05, - "loss": 0.2845, + "epoch": 3.061412044545356, + "grad_norm": 0.1888858526945114, + "learning_rate": 1.726913573520455e-05, + "loss": 0.3832, "step": 84945 }, { - "epoch": 2.99, - "learning_rate": 1.8395389704036096e-05, - "loss": 0.2742, + "epoch": 3.061592244206581, + "grad_norm": 0.27773165702819824, + "learning_rate": 1.7266360684229473e-05, + "loss": 0.3922, "step": 84950 }, { - "epoch": 2.99, - "learning_rate": 1.8392642234449903e-05, - "loss": 0.2583, + "epoch": 3.0617724438678056, + "grad_norm": 0.1935848444700241, + "learning_rate": 1.7263585738623715e-05, + "loss": 0.3716, "step": 84955 }, { - "epoch": 2.99, - "learning_rate": 1.8389894850655895e-05, - "loss": 0.2654, + "epoch": 3.0619526435290303, + "grad_norm": 0.2068513184785843, + "learning_rate": 1.7260810898425126e-05, + "loss": 0.3646, "step": 84960 }, { - "epoch": 2.99, - "learning_rate": 1.8387147552689737e-05, - "loss": 0.2815, + "epoch": 3.0621328431902546, + "grad_norm": 0.22322344779968262, + "learning_rate": 1.725803616367148e-05, + "loss": 0.3975, "step": 84965 }, { - "epoch": 2.99, - "learning_rate": 1.838440034058711e-05, - "loss": 0.2674, + "epoch": 3.0623130428514793, + "grad_norm": 0.19500023126602173, + "learning_rate": 1.7255261534400585e-05, + "loss": 0.3771, "step": 84970 }, { - "epoch": 2.99, - "learning_rate": 1.8381653214383692e-05, - "loss": 0.2691, + "epoch": 3.062493242512704, + "grad_norm": 0.20665855705738068, + "learning_rate": 1.7252487010650266e-05, + "loss": 0.3912, "step": 84975 }, { - "epoch": 2.99, - "learning_rate": 1.8378906174115145e-05, - "loss": 0.2662, + "epoch": 3.062673442173929, + "grad_norm": 0.24489200115203857, + "learning_rate": 1.7249712592458294e-05, + 
"loss": 0.3861, "step": 84980 }, { - "epoch": 2.99, - "learning_rate": 1.8376159219817135e-05, - "loss": 0.2625, + "epoch": 3.0628536418351535, + "grad_norm": 0.22476324439048767, + "learning_rate": 1.7246938279862508e-05, + "loss": 0.35, "step": 84985 }, { - "epoch": 2.99, - "learning_rate": 1.837341235152533e-05, - "loss": 0.2713, + "epoch": 3.063033841496378, + "grad_norm": 0.2135080248117447, + "learning_rate": 1.7244164072900678e-05, + "loss": 0.3628, "step": 84990 }, { - "epoch": 2.99, - "learning_rate": 1.83706655692754e-05, - "loss": 0.2785, + "epoch": 3.0632140411576025, + "grad_norm": 0.2153581976890564, + "learning_rate": 1.7241389971610607e-05, + "loss": 0.3808, "step": 84995 }, { - "epoch": 2.99, - "learning_rate": 1.8367918873103012e-05, - "loss": 0.2897, + "epoch": 3.0633942408188273, + "grad_norm": 0.27249398827552795, + "learning_rate": 1.72386159760301e-05, + "loss": 0.3824, "step": 85000 }, { - "epoch": 2.99, - "eval_loss": 0.2575870752334595, - "eval_runtime": 10.5655, - "eval_samples_per_second": 9.465, - "eval_steps_per_second": 9.465, + "epoch": 3.0633942408188273, + "eval_loss": 0.43113651871681213, + "eval_runtime": 3.5309, + "eval_samples_per_second": 28.322, + "eval_steps_per_second": 7.08, "step": 85000 }, { - "epoch": 2.99, - "learning_rate": 1.8365172263043827e-05, - "loss": 0.2663, + "epoch": 3.063574440480052, + "grad_norm": 0.23285353183746338, + "learning_rate": 1.723584208619695e-05, + "loss": 0.3787, "step": 85005 }, { - "epoch": 2.99, - "learning_rate": 1.83624257391335e-05, - "loss": 0.2902, + "epoch": 3.0637546401412767, + "grad_norm": 0.21958500146865845, + "learning_rate": 1.7233068302148943e-05, + "loss": 0.4006, "step": 85010 }, { - "epoch": 2.99, - "learning_rate": 1.8359679301407705e-05, - "loss": 0.2739, + "epoch": 3.063934839802501, + "grad_norm": 0.26001232862472534, + "learning_rate": 1.7230294623923876e-05, + "loss": 0.3732, "step": 85015 }, { - "epoch": 2.99, - "learning_rate": 1.83569329499021e-05, - "loss": 0.2879, + "epoch": 3.0641150394637258, + "grad_norm": 0.20157134532928467, + "learning_rate": 1.722752105155954e-05, + "loss": 0.3474, "step": 85020 }, { - "epoch": 2.99, - "learning_rate": 1.8354186684652343e-05, - "loss": 0.2705, + "epoch": 3.0642952391249505, + "grad_norm": 0.2562786638736725, + "learning_rate": 1.722474758509373e-05, + "loss": 0.4045, "step": 85025 }, { - "epoch": 2.99, - "learning_rate": 1.8351440505694083e-05, - "loss": 0.2706, + "epoch": 3.0644754387861752, + "grad_norm": 0.22175267338752747, + "learning_rate": 1.722197422456423e-05, + "loss": 0.3797, "step": 85030 }, { - "epoch": 2.99, - "learning_rate": 1.8348694413063e-05, - "loss": 0.2817, + "epoch": 3.0646556384473995, + "grad_norm": 0.19869141280651093, + "learning_rate": 1.721920097000882e-05, + "loss": 0.3716, "step": 85035 }, { - "epoch": 2.99, - "learning_rate": 1.8345948406794733e-05, - "loss": 0.2729, + "epoch": 3.0648358381086243, + "grad_norm": 0.2057720124721527, + "learning_rate": 1.7216427821465292e-05, + "loss": 0.3932, "step": 85040 }, { - "epoch": 2.99, - "learning_rate": 1.8343202486924933e-05, - "loss": 0.2621, + "epoch": 3.065016037769849, + "grad_norm": 0.24935896694660187, + "learning_rate": 1.7213654778971436e-05, + "loss": 0.3798, "step": 85045 }, { - "epoch": 2.99, - "learning_rate": 1.8340456653489264e-05, - "loss": 0.2582, + "epoch": 3.0651962374310737, + "grad_norm": 0.2382826954126358, + "learning_rate": 1.7210881842565007e-05, + "loss": 0.3826, "step": 85050 }, { - "epoch": 2.99, - "learning_rate": 1.8337710906523384e-05, - 
"loss": 0.2626, + "epoch": 3.0653764370922985, + "grad_norm": 0.2057560682296753, + "learning_rate": 1.7208109012283824e-05, + "loss": 0.3825, "step": 85055 }, { - "epoch": 2.99, - "learning_rate": 1.8334965246062936e-05, - "loss": 0.2544, + "epoch": 3.0655566367535227, + "grad_norm": 0.20751681923866272, + "learning_rate": 1.720533628816563e-05, + "loss": 0.3988, "step": 85060 }, { - "epoch": 2.99, - "learning_rate": 1.833221967214356e-05, - "loss": 0.2565, + "epoch": 3.0657368364147475, + "grad_norm": 0.22527416050434113, + "learning_rate": 1.720256367024824e-05, + "loss": 0.3742, "step": 85065 }, { - "epoch": 2.99, - "learning_rate": 1.8329474184800932e-05, - "loss": 0.2458, + "epoch": 3.065917036075972, + "grad_norm": 0.24858717620372772, + "learning_rate": 1.71997911585694e-05, + "loss": 0.385, "step": 85070 }, { - "epoch": 2.99, - "learning_rate": 1.8326728784070676e-05, - "loss": 0.2674, + "epoch": 3.066097235737197, + "grad_norm": 0.2016977220773697, + "learning_rate": 1.7197018753166895e-05, + "loss": 0.3803, "step": 85075 }, { - "epoch": 2.99, - "learning_rate": 1.8323983469988458e-05, - "loss": 0.2548, + "epoch": 3.0662774353984212, + "grad_norm": 0.23507557809352875, + "learning_rate": 1.71942464540785e-05, + "loss": 0.3956, "step": 85080 }, { - "epoch": 2.99, - "learning_rate": 1.83212382425899e-05, - "loss": 0.279, + "epoch": 3.066457635059646, + "grad_norm": 0.18204432725906372, + "learning_rate": 1.7191474261341982e-05, + "loss": 0.4038, "step": 85085 }, { - "epoch": 2.99, - "learning_rate": 1.8318493101910676e-05, - "loss": 0.2629, + "epoch": 3.0666378347208707, + "grad_norm": 0.19573719799518585, + "learning_rate": 1.7188702174995115e-05, + "loss": 0.3751, "step": 85090 }, { - "epoch": 2.99, - "learning_rate": 1.8315748047986415e-05, - "loss": 0.2786, + "epoch": 3.0668180343820954, + "grad_norm": 0.2764735221862793, + "learning_rate": 1.7185930195075675e-05, + "loss": 0.3545, "step": 85095 }, { - "epoch": 2.99, - "learning_rate": 1.8313003080852755e-05, - "loss": 0.2567, + "epoch": 3.06699823404332, + "grad_norm": 0.2690604031085968, + "learning_rate": 1.718315832162142e-05, + "loss": 0.3845, "step": 85100 }, { - "epoch": 2.99, - "learning_rate": 1.8310258200545338e-05, - "loss": 0.2676, + "epoch": 3.0671784337045445, + "grad_norm": 0.22735117375850677, + "learning_rate": 1.718038655467012e-05, + "loss": 0.4114, "step": 85105 }, { - "epoch": 2.99, - "learning_rate": 1.830751340709982e-05, - "loss": 0.2571, + "epoch": 3.067358633365769, + "grad_norm": 0.25227391719818115, + "learning_rate": 1.7177614894259538e-05, + "loss": 0.4067, "step": 85110 }, { - "epoch": 2.99, - "learning_rate": 1.830476870055183e-05, - "loss": 0.2583, + "epoch": 3.067538833026994, + "grad_norm": 0.22190016508102417, + "learning_rate": 1.717484334042744e-05, + "loss": 0.4165, "step": 85115 }, { - "epoch": 2.99, - "learning_rate": 1.8302024080937008e-05, - "loss": 0.2619, + "epoch": 3.0677190326882187, + "grad_norm": 0.2443481832742691, + "learning_rate": 1.7172071893211583e-05, + "loss": 0.4004, "step": 85120 }, { - "epoch": 2.99, - "learning_rate": 1.8299279548290975e-05, - "loss": 0.2721, + "epoch": 3.0678992323494434, + "grad_norm": 0.19374127686023712, + "learning_rate": 1.716930055264973e-05, + "loss": 0.3809, "step": 85125 }, { - "epoch": 3.0, - "learning_rate": 1.8296535102649388e-05, - "loss": 0.2721, + "epoch": 3.0680794320106677, + "grad_norm": 0.21551166474819183, + "learning_rate": 1.716652931877965e-05, + "loss": 0.3959, "step": 85130 }, { - "epoch": 3.0, - "learning_rate": 
1.829379074404788e-05, - "loss": 0.2777, + "epoch": 3.0682596316718924, + "grad_norm": 0.2144090086221695, + "learning_rate": 1.7163758191639085e-05, + "loss": 0.4067, "step": 85135 }, { - "epoch": 3.0, - "learning_rate": 1.829104647252208e-05, - "loss": 0.2647, + "epoch": 3.068439831333117, + "grad_norm": 0.25093093514442444, + "learning_rate": 1.7160987171265798e-05, + "loss": 0.3913, "step": 85140 }, { - "epoch": 3.0, - "learning_rate": 1.8288302288107613e-05, - "loss": 0.2806, + "epoch": 3.068620030994342, + "grad_norm": 0.24009042978286743, + "learning_rate": 1.7158216257697545e-05, + "loss": 0.3445, "step": 85145 }, { - "epoch": 3.0, - "learning_rate": 1.8285558190840123e-05, - "loss": 0.2671, + "epoch": 3.068800230655566, + "grad_norm": 0.21893219649791718, + "learning_rate": 1.715544545097208e-05, + "loss": 0.3871, "step": 85150 }, { - "epoch": 3.0, - "learning_rate": 1.828281418075523e-05, - "loss": 0.2809, + "epoch": 3.068980430316791, + "grad_norm": 0.21242910623550415, + "learning_rate": 1.715267475112714e-05, + "loss": 0.3822, "step": 85155 }, { - "epoch": 3.0, - "learning_rate": 1.8280070257888565e-05, - "loss": 0.2558, + "epoch": 3.0691606299780156, + "grad_norm": 0.209553524851799, + "learning_rate": 1.7149904158200504e-05, + "loss": 0.379, "step": 85160 }, { - "epoch": 3.0, - "learning_rate": 1.8277326422275767e-05, - "loss": 0.2798, + "epoch": 3.0693408296392404, + "grad_norm": 0.2270086407661438, + "learning_rate": 1.7147133672229885e-05, + "loss": 0.3807, "step": 85165 }, { - "epoch": 3.0, - "learning_rate": 1.8274582673952458e-05, - "loss": 0.2723, + "epoch": 3.069521029300465, + "grad_norm": 0.1918162852525711, + "learning_rate": 1.7144363293253066e-05, + "loss": 0.3994, "step": 85170 }, { - "epoch": 3.0, - "learning_rate": 1.8271839012954258e-05, - "loss": 0.2397, + "epoch": 3.0697012289616894, + "grad_norm": 0.18857698142528534, + "learning_rate": 1.7141593021307774e-05, + "loss": 0.412, "step": 85175 }, { - "epoch": 3.0, - "learning_rate": 1.8269095439316786e-05, - "loss": 0.259, + "epoch": 3.069881428622914, + "grad_norm": 0.27960577607154846, + "learning_rate": 1.7138822856431746e-05, + "loss": 0.3907, "step": 85180 }, { - "epoch": 3.0, - "learning_rate": 1.8266351953075676e-05, - "loss": 0.2452, + "epoch": 3.070061628284139, + "grad_norm": 0.21300694346427917, + "learning_rate": 1.713605279866274e-05, + "loss": 0.3445, "step": 85185 }, { - "epoch": 3.0, - "learning_rate": 1.826360855426656e-05, - "loss": 0.2612, + "epoch": 3.0702418279453636, + "grad_norm": 0.2177317589521408, + "learning_rate": 1.7133282848038495e-05, + "loss": 0.3532, "step": 85190 }, { - "epoch": 3.0, - "learning_rate": 1.8260865242925042e-05, - "loss": 0.2567, + "epoch": 3.070422027606588, + "grad_norm": 0.2585134208202362, + "learning_rate": 1.713051300459674e-05, + "loss": 0.4105, "step": 85195 }, { - "epoch": 3.0, - "learning_rate": 1.8258122019086736e-05, - "loss": 0.2513, + "epoch": 3.0706022272678126, + "grad_norm": 0.2013564258813858, + "learning_rate": 1.712774326837523e-05, + "loss": 0.3809, "step": 85200 }, { - "epoch": 3.0, - "learning_rate": 1.825537888278729e-05, - "loss": 0.2751, + "epoch": 3.0707824269290374, + "grad_norm": 0.1960568130016327, + "learning_rate": 1.7124973639411686e-05, + "loss": 0.3597, "step": 85205 }, { - "epoch": 3.0, - "learning_rate": 1.8252635834062292e-05, - "loss": 0.2635, + "epoch": 3.070962626590262, + "grad_norm": 0.2271984964609146, + "learning_rate": 1.712220411774386e-05, + "loss": 0.3732, "step": 85210 }, { - "epoch": 3.0, - "learning_rate": 
1.824989287294738e-05, - "loss": 0.2698, + "epoch": 3.071142826251487, + "grad_norm": 0.2209077626466751, + "learning_rate": 1.7119434703409475e-05, + "loss": 0.3632, "step": 85215 }, { - "epoch": 3.0, - "learning_rate": 1.824714999947815e-05, - "loss": 0.2512, + "epoch": 3.071323025912711, + "grad_norm": 0.24356262385845184, + "learning_rate": 1.7116665396446262e-05, + "loss": 0.3656, "step": 85220 }, { - "epoch": 3.0, - "learning_rate": 1.8244407213690234e-05, - "loss": 0.287, + "epoch": 3.071503225573936, + "grad_norm": 0.2605443298816681, + "learning_rate": 1.7113896196891963e-05, + "loss": 0.3577, "step": 85225 }, { - "epoch": 3.0, - "learning_rate": 1.8241664515619245e-05, - "loss": 0.2691, + "epoch": 3.0716834252351606, + "grad_norm": 0.23770549893379211, + "learning_rate": 1.7111127104784305e-05, + "loss": 0.3835, "step": 85230 }, { - "epoch": 3.0, - "learning_rate": 1.8238921905300777e-05, - "loss": 0.2627, + "epoch": 3.0718636248963853, + "grad_norm": 0.225930318236351, + "learning_rate": 1.7108358120160997e-05, + "loss": 0.4269, "step": 85235 }, { - "epoch": 3.0, - "learning_rate": 1.8236179382770448e-05, - "loss": 0.2863, + "epoch": 3.0720438245576096, + "grad_norm": 0.24659954011440277, + "learning_rate": 1.7105589243059798e-05, + "loss": 0.3578, "step": 85240 }, { - "epoch": 3.0, - "learning_rate": 1.8233436948063885e-05, - "loss": 0.2653, + "epoch": 3.0722240242188343, + "grad_norm": 0.17880700528621674, + "learning_rate": 1.7102820473518404e-05, + "loss": 0.3679, "step": 85245 }, { - "epoch": 3.0, - "learning_rate": 1.823069460121668e-05, - "loss": 0.2855, + "epoch": 3.072404223880059, + "grad_norm": 0.21226419508457184, + "learning_rate": 1.7100051811574564e-05, + "loss": 0.3876, "step": 85250 }, { - "epoch": 3.0, - "learning_rate": 1.822795234226445e-05, - "loss": 0.289, + "epoch": 3.072584423541284, + "grad_norm": 0.23851273953914642, + "learning_rate": 1.7097283257265983e-05, + "loss": 0.3621, "step": 85255 }, { - "epoch": 3.0, - "learning_rate": 1.8225210171242778e-05, - "loss": 0.2794, + "epoch": 3.0727646232025085, + "grad_norm": 0.20797961950302124, + "learning_rate": 1.709451481063038e-05, + "loss": 0.3759, "step": 85260 }, { - "epoch": 3.0, - "learning_rate": 1.82224680881873e-05, - "loss": 0.2918, + "epoch": 3.072944822863733, + "grad_norm": 0.22600434720516205, + "learning_rate": 1.70917464717055e-05, + "loss": 0.3779, "step": 85265 }, { - "epoch": 3.0, - "learning_rate": 1.821972609313361e-05, - "loss": 0.2634, + "epoch": 3.0731250225249576, + "grad_norm": 0.2033650130033493, + "learning_rate": 1.7088978240529034e-05, + "loss": 0.3632, "step": 85270 }, { - "epoch": 3.0, - "learning_rate": 1.8216984186117297e-05, - "loss": 0.249, + "epoch": 3.0733052221861823, + "grad_norm": 0.22226615250110626, + "learning_rate": 1.70862101171387e-05, + "loss": 0.3842, "step": 85275 }, { - "epoch": 3.0, - "learning_rate": 1.8214242367173984e-05, - "loss": 0.2576, + "epoch": 3.073485421847407, + "grad_norm": 0.22112597525119781, + "learning_rate": 1.7083442101572235e-05, + "loss": 0.3838, "step": 85280 }, { - "epoch": 3.0, - "learning_rate": 1.821150063633926e-05, - "loss": 0.2529, + "epoch": 3.0736656215086318, + "grad_norm": 0.2128104269504547, + "learning_rate": 1.7080674193867325e-05, + "loss": 0.3375, "step": 85285 }, { - "epoch": 3.0, - "learning_rate": 1.820875899364873e-05, - "loss": 0.2693, + "epoch": 3.073845821169856, + "grad_norm": 0.2777286469936371, + "learning_rate": 1.7077906394061703e-05, + "loss": 0.3895, "step": 85290 }, { - "epoch": 3.0, - "learning_rate": 
1.820601743913798e-05, - "loss": 0.2434, + "epoch": 3.074026020831081, + "grad_norm": 0.22353100776672363, + "learning_rate": 1.7075138702193074e-05, + "loss": 0.3957, "step": 85295 }, { - "epoch": 3.0, - "learning_rate": 1.820327597284262e-05, - "loss": 0.2495, + "epoch": 3.0742062204923055, + "grad_norm": 0.2158191204071045, + "learning_rate": 1.7072371118299142e-05, + "loss": 0.4044, "step": 85300 }, { - "epoch": 3.0, - "learning_rate": 1.8200534594798244e-05, - "loss": 0.247, + "epoch": 3.0743864201535303, + "grad_norm": 0.2143658846616745, + "learning_rate": 1.7069603642417622e-05, + "loss": 0.3425, "step": 85305 }, { - "epoch": 3.0, - "learning_rate": 1.819779330504045e-05, - "loss": 0.2358, + "epoch": 3.0745666198147545, + "grad_norm": 0.21374057233333588, + "learning_rate": 1.7066836274586214e-05, + "loss": 0.3924, "step": 85310 }, { - "epoch": 3.0, - "learning_rate": 1.819505210360481e-05, - "loss": 0.2602, + "epoch": 3.0747468194759793, + "grad_norm": 0.21760691702365875, + "learning_rate": 1.7064069014842626e-05, + "loss": 0.4043, "step": 85315 }, { - "epoch": 3.0, - "learning_rate": 1.819231099052695e-05, - "loss": 0.2637, + "epoch": 3.074927019137204, + "grad_norm": 0.25695380568504333, + "learning_rate": 1.7061301863224566e-05, + "loss": 0.3793, "step": 85320 }, { - "epoch": 3.0, - "learning_rate": 1.818956996584244e-05, - "loss": 0.2622, + "epoch": 3.0751072187984287, + "grad_norm": 0.22985416650772095, + "learning_rate": 1.7058534819769724e-05, + "loss": 0.3659, "step": 85325 }, { - "epoch": 3.0, - "learning_rate": 1.818682902958688e-05, - "loss": 0.2495, + "epoch": 3.0752874184596535, + "grad_norm": 0.2104073315858841, + "learning_rate": 1.7055767884515815e-05, + "loss": 0.3891, "step": 85330 }, { - "epoch": 3.0, - "learning_rate": 1.8184088181795844e-05, - "loss": 0.2399, + "epoch": 3.0754676181208778, + "grad_norm": 0.22502779960632324, + "learning_rate": 1.7053001057500534e-05, + "loss": 0.3922, "step": 85335 }, { - "epoch": 3.0, - "learning_rate": 1.818134742250494e-05, - "loss": 0.2366, + "epoch": 3.0756478177821025, + "grad_norm": 0.26956361532211304, + "learning_rate": 1.705023433876156e-05, + "loss": 0.3993, "step": 85340 }, { - "epoch": 3.0, - "learning_rate": 1.8178606751749742e-05, - "loss": 0.2724, + "epoch": 3.0758280174433272, + "grad_norm": 0.22516946494579315, + "learning_rate": 1.704746772833662e-05, + "loss": 0.3681, "step": 85345 }, { - "epoch": 3.0, - "learning_rate": 1.817586616956585e-05, - "loss": 0.2627, + "epoch": 3.076008217104552, + "grad_norm": 0.2101748287677765, + "learning_rate": 1.7044701226263374e-05, + "loss": 0.3783, "step": 85350 }, { - "epoch": 3.0, - "learning_rate": 1.8173125675988822e-05, - "loss": 0.2537, + "epoch": 3.0761884167657767, + "grad_norm": 0.2022121697664261, + "learning_rate": 1.704193483257955e-05, + "loss": 0.4104, "step": 85355 }, { - "epoch": 3.0, - "learning_rate": 1.817038527105427e-05, - "loss": 0.2662, + "epoch": 3.076368616427001, + "grad_norm": 0.2616555690765381, + "learning_rate": 1.703916854732282e-05, + "loss": 0.395, "step": 85360 }, { - "epoch": 3.0, - "learning_rate": 1.8167644954797763e-05, - "loss": 0.2653, + "epoch": 3.0765488160882257, + "grad_norm": 0.2326449602842331, + "learning_rate": 1.7036402370530867e-05, + "loss": 0.3604, "step": 85365 }, { - "epoch": 3.0, - "learning_rate": 1.8164904727254883e-05, - "loss": 0.2675, + "epoch": 3.0767290157494505, + "grad_norm": 0.25773555040359497, + "learning_rate": 1.70336363022414e-05, + "loss": 0.3656, "step": 85370 }, { - "epoch": 3.0, - "learning_rate": 
1.8162164588461207e-05, - "loss": 0.2528, + "epoch": 3.076909215410675, + "grad_norm": 0.228920578956604, + "learning_rate": 1.7030870342492098e-05, + "loss": 0.3941, "step": 85375 }, { - "epoch": 3.0, - "learning_rate": 1.815942453845232e-05, - "loss": 0.265, + "epoch": 3.0770894150718995, + "grad_norm": 0.21083427965641022, + "learning_rate": 1.7028104491320636e-05, + "loss": 0.4001, "step": 85380 }, { - "epoch": 3.0, - "learning_rate": 1.81566845772638e-05, - "loss": 0.2637, + "epoch": 3.077269614733124, + "grad_norm": 0.20418711006641388, + "learning_rate": 1.7025338748764713e-05, + "loss": 0.37, "step": 85385 }, { - "epoch": 3.0, - "learning_rate": 1.815394470493121e-05, - "loss": 0.2915, + "epoch": 3.077449814394349, + "grad_norm": 0.2817372977733612, + "learning_rate": 1.7022573114862e-05, + "loss": 0.3804, "step": 85390 }, { - "epoch": 3.0, - "learning_rate": 1.8151204921490146e-05, - "loss": 0.2486, + "epoch": 3.0776300140555737, + "grad_norm": 0.2094438374042511, + "learning_rate": 1.7019807589650187e-05, + "loss": 0.3896, "step": 85395 }, { - "epoch": 3.0, - "learning_rate": 1.8148465226976175e-05, - "loss": 0.2617, + "epoch": 3.0778102137167984, + "grad_norm": 0.2494175136089325, + "learning_rate": 1.701704217316696e-05, + "loss": 0.3862, "step": 85400 }, { - "epoch": 3.0, - "learning_rate": 1.814572562142486e-05, - "loss": 0.2724, + "epoch": 3.0779904133780227, + "grad_norm": 0.19059158861637115, + "learning_rate": 1.7014276865449973e-05, + "loss": 0.3787, "step": 85405 }, { - "epoch": 3.0, - "learning_rate": 1.814298610487178e-05, - "loss": 0.2499, + "epoch": 3.0781706130392474, + "grad_norm": 0.21895138919353485, + "learning_rate": 1.7011511666536923e-05, + "loss": 0.3882, "step": 85410 }, { - "epoch": 3.01, - "learning_rate": 1.8140246677352514e-05, - "loss": 0.2662, + "epoch": 3.078350812700472, + "grad_norm": 0.20317794382572174, + "learning_rate": 1.700874657646549e-05, + "loss": 0.3827, "step": 85415 }, { - "epoch": 3.01, - "learning_rate": 1.8137507338902625e-05, - "loss": 0.2476, + "epoch": 3.078531012361697, + "grad_norm": 0.23056404292583466, + "learning_rate": 1.700598159527332e-05, + "loss": 0.3973, "step": 85420 }, { - "epoch": 3.01, - "learning_rate": 1.8134768089557675e-05, - "loss": 0.2447, + "epoch": 3.078711212022921, + "grad_norm": 0.22910591959953308, + "learning_rate": 1.7003216722998112e-05, + "loss": 0.3841, "step": 85425 }, { - "epoch": 3.01, - "learning_rate": 1.8132028929353235e-05, - "loss": 0.2333, + "epoch": 3.078891411684146, + "grad_norm": 0.22995778918266296, + "learning_rate": 1.700045195967752e-05, + "loss": 0.4064, "step": 85430 }, { - "epoch": 3.01, - "learning_rate": 1.8129289858324875e-05, - "loss": 0.271, + "epoch": 3.0790716113453707, + "grad_norm": 0.24501994252204895, + "learning_rate": 1.6997687305349234e-05, + "loss": 0.3691, "step": 85435 }, { - "epoch": 3.01, - "learning_rate": 1.8126550876508163e-05, - "loss": 0.2516, + "epoch": 3.0792518110065954, + "grad_norm": 0.19479656219482422, + "learning_rate": 1.699492276005091e-05, + "loss": 0.3536, "step": 85440 }, { - "epoch": 3.01, - "learning_rate": 1.812381198393866e-05, - "loss": 0.2651, + "epoch": 3.07943201066782, + "grad_norm": 0.24624988436698914, + "learning_rate": 1.69921583238202e-05, + "loss": 0.3855, "step": 85445 }, { - "epoch": 3.01, - "learning_rate": 1.8121073180651915e-05, - "loss": 0.2639, + "epoch": 3.0796122103290444, + "grad_norm": 0.23105396330356598, + "learning_rate": 1.6989393996694797e-05, + "loss": 0.3399, "step": 85450 }, { - "epoch": 3.01, - 
"learning_rate": 1.811833446668351e-05, - "loss": 0.2613, + "epoch": 3.079792409990269, + "grad_norm": 0.23639746010303497, + "learning_rate": 1.6986629778712344e-05, + "loss": 0.4176, "step": 85455 }, { - "epoch": 3.01, - "learning_rate": 1.8115595842068995e-05, - "loss": 0.2641, + "epoch": 3.079972609651494, + "grad_norm": 0.27155548334121704, + "learning_rate": 1.6983865669910504e-05, + "loss": 0.3482, "step": 85460 }, { - "epoch": 3.01, - "learning_rate": 1.8112857306843933e-05, - "loss": 0.2608, + "epoch": 3.0801528093127186, + "grad_norm": 0.2704147398471832, + "learning_rate": 1.6981101670326945e-05, + "loss": 0.3908, "step": 85465 }, { - "epoch": 3.01, - "learning_rate": 1.811011886104387e-05, - "loss": 0.2452, + "epoch": 3.080333008973943, + "grad_norm": 0.21449273824691772, + "learning_rate": 1.697833777999932e-05, + "loss": 0.3481, "step": 85470 }, { - "epoch": 3.01, - "learning_rate": 1.8107380504704386e-05, - "loss": 0.2699, + "epoch": 3.0805132086351676, + "grad_norm": 0.2628542482852936, + "learning_rate": 1.6975573998965298e-05, + "loss": 0.3956, "step": 85475 }, { - "epoch": 3.01, - "learning_rate": 1.810464223786102e-05, - "loss": 0.2613, + "epoch": 3.0806934082963924, + "grad_norm": 0.20272031426429749, + "learning_rate": 1.697281032726252e-05, + "loss": 0.3353, "step": 85480 }, { - "epoch": 3.01, - "learning_rate": 1.8101904060549328e-05, - "loss": 0.271, + "epoch": 3.080873607957617, + "grad_norm": 0.20484726130962372, + "learning_rate": 1.697004676492865e-05, + "loss": 0.3335, "step": 85485 }, { - "epoch": 3.01, - "learning_rate": 1.809916597280486e-05, - "loss": 0.2667, + "epoch": 3.081053807618842, + "grad_norm": 0.21416746079921722, + "learning_rate": 1.696728331200134e-05, + "loss": 0.3687, "step": 85490 }, { - "epoch": 3.01, - "learning_rate": 1.8096427974663183e-05, - "loss": 0.2599, + "epoch": 3.081234007280066, + "grad_norm": 0.2598028779029846, + "learning_rate": 1.696451996851824e-05, + "loss": 0.3842, "step": 85495 }, { - "epoch": 3.01, - "learning_rate": 1.809369006615984e-05, - "loss": 0.2713, + "epoch": 3.081414206941291, + "grad_norm": 0.23536814749240875, + "learning_rate": 1.6961756734516994e-05, + "loss": 0.3904, "step": 85500 }, { - "epoch": 3.01, - "eval_loss": 0.2571566104888916, - "eval_runtime": 10.5571, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 3.081414206941291, + "eval_loss": 0.4317238926887512, + "eval_runtime": 3.5362, + "eval_samples_per_second": 28.279, + "eval_steps_per_second": 7.07, "step": 85500 }, { - "epoch": 3.01, - "learning_rate": 1.8090952247330374e-05, - "loss": 0.2468, + "epoch": 3.0815944066025156, + "grad_norm": 0.17988234758377075, + "learning_rate": 1.695899361003526e-05, + "loss": 0.3702, "step": 85505 }, { - "epoch": 3.01, - "learning_rate": 1.8088214518210345e-05, - "loss": 0.2664, + "epoch": 3.0817746062637403, + "grad_norm": 0.22040648758411407, + "learning_rate": 1.695623059511068e-05, + "loss": 0.3716, "step": 85510 }, { - "epoch": 3.01, - "learning_rate": 1.808547687883529e-05, - "loss": 0.2476, + "epoch": 3.081954805924965, + "grad_norm": 0.23150260746479034, + "learning_rate": 1.69534676897809e-05, + "loss": 0.3891, "step": 85515 }, { - "epoch": 3.01, - "learning_rate": 1.808273932924077e-05, - "loss": 0.2681, + "epoch": 3.0821350055861894, + "grad_norm": 0.21168793737888336, + "learning_rate": 1.6950704894083575e-05, + "loss": 0.363, "step": 85520 }, { - "epoch": 3.01, - "learning_rate": 1.8080001869462308e-05, - "loss": 0.2499, + "epoch": 3.082315205247414, + "grad_norm": 
0.21894006431102753, + "learning_rate": 1.6947942208056316e-05, + "loss": 0.3604, "step": 85525 }, { - "epoch": 3.01, - "learning_rate": 1.807726449953547e-05, - "loss": 0.2518, + "epoch": 3.082495404908639, + "grad_norm": 0.23681262135505676, + "learning_rate": 1.6945179631736807e-05, + "loss": 0.3973, "step": 85530 }, { - "epoch": 3.01, - "learning_rate": 1.8074527219495793e-05, - "loss": 0.2667, + "epoch": 3.0826756045698636, + "grad_norm": 0.2256009578704834, + "learning_rate": 1.6942417165162648e-05, + "loss": 0.3661, "step": 85535 }, { - "epoch": 3.01, - "learning_rate": 1.807179002937881e-05, - "loss": 0.268, + "epoch": 3.082855804231088, + "grad_norm": 0.21623054146766663, + "learning_rate": 1.6939654808371515e-05, + "loss": 0.3936, "step": 85540 }, { - "epoch": 3.01, - "learning_rate": 1.8069052929220064e-05, - "loss": 0.2868, + "epoch": 3.0830360038923126, + "grad_norm": 0.2143433690071106, + "learning_rate": 1.693689256140101e-05, + "loss": 0.3677, "step": 85545 }, { - "epoch": 3.01, - "learning_rate": 1.8066315919055106e-05, - "loss": 0.2774, + "epoch": 3.0832162035535373, + "grad_norm": 0.2282250076532364, + "learning_rate": 1.693413042428878e-05, + "loss": 0.3712, "step": 85550 }, { - "epoch": 3.01, - "learning_rate": 1.8063578998919467e-05, - "loss": 0.2597, + "epoch": 3.083396403214762, + "grad_norm": 0.19568027555942535, + "learning_rate": 1.693136839707248e-05, + "loss": 0.3908, "step": 85555 }, { - "epoch": 3.01, - "learning_rate": 1.8060842168848686e-05, - "loss": 0.269, + "epoch": 3.083576602875987, + "grad_norm": 0.20706535875797272, + "learning_rate": 1.692860647978971e-05, + "loss": 0.388, "step": 85560 }, { - "epoch": 3.01, - "learning_rate": 1.805810542887828e-05, - "loss": 0.2514, + "epoch": 3.083756802537211, + "grad_norm": 0.23546600341796875, + "learning_rate": 1.6925844672478115e-05, + "loss": 0.3772, "step": 85565 }, { - "epoch": 3.01, - "learning_rate": 1.805536877904381e-05, - "loss": 0.2565, + "epoch": 3.083937002198436, + "grad_norm": 0.2380140721797943, + "learning_rate": 1.6923082975175325e-05, + "loss": 0.3468, "step": 85570 }, { - "epoch": 3.01, - "learning_rate": 1.8052632219380806e-05, - "loss": 0.2527, + "epoch": 3.0841172018596605, + "grad_norm": 0.23840586841106415, + "learning_rate": 1.6920321387918957e-05, + "loss": 0.3679, "step": 85575 }, { - "epoch": 3.01, - "learning_rate": 1.804989574992479e-05, - "loss": 0.2334, + "epoch": 3.0842974015208853, + "grad_norm": 0.2252638339996338, + "learning_rate": 1.6917559910746657e-05, + "loss": 0.4021, "step": 85580 }, { - "epoch": 3.01, - "learning_rate": 1.804715937071129e-05, - "loss": 0.2542, + "epoch": 3.0844776011821096, + "grad_norm": 0.20066937804222107, + "learning_rate": 1.6914798543696036e-05, + "loss": 0.4079, "step": 85585 }, { - "epoch": 3.01, - "learning_rate": 1.8044423081775853e-05, - "loss": 0.2429, + "epoch": 3.0846578008433343, + "grad_norm": 0.22038015723228455, + "learning_rate": 1.6912037286804717e-05, + "loss": 0.3824, "step": 85590 }, { - "epoch": 3.01, - "learning_rate": 1.8041686883153992e-05, - "loss": 0.2496, + "epoch": 3.084838000504559, + "grad_norm": 0.21279798448085785, + "learning_rate": 1.6909276140110324e-05, + "loss": 0.3655, "step": 85595 }, { - "epoch": 3.01, - "learning_rate": 1.8038950774881245e-05, - "loss": 0.2554, + "epoch": 3.0850182001657838, + "grad_norm": 0.24839600920677185, + "learning_rate": 1.6906515103650482e-05, + "loss": 0.3607, "step": 85600 }, { - "epoch": 3.01, - "learning_rate": 1.803621475699313e-05, - "loss": 0.239, + "epoch": 
3.0851983998270085, + "grad_norm": 0.23541690409183502, + "learning_rate": 1.69037541774628e-05, + "loss": 0.3618, "step": 85605 }, { - "epoch": 3.01, - "learning_rate": 1.8033478829525185e-05, - "loss": 0.2629, + "epoch": 3.085378599488233, + "grad_norm": 0.24776875972747803, + "learning_rate": 1.69009933615849e-05, + "loss": 0.3814, "step": 85610 }, { - "epoch": 3.01, - "learning_rate": 1.8030742992512924e-05, - "loss": 0.258, + "epoch": 3.0855587991494575, + "grad_norm": 0.19632692635059357, + "learning_rate": 1.6898232656054398e-05, + "loss": 0.413, "step": 85615 }, { - "epoch": 3.01, - "learning_rate": 1.8028007245991863e-05, - "loss": 0.2504, + "epoch": 3.0857389988106823, + "grad_norm": 0.24694256484508514, + "learning_rate": 1.6895472060908907e-05, + "loss": 0.387, "step": 85620 }, { - "epoch": 3.01, - "learning_rate": 1.802527158999754e-05, - "loss": 0.2538, + "epoch": 3.085919198471907, + "grad_norm": 0.21106529235839844, + "learning_rate": 1.689271157618605e-05, + "loss": 0.3476, "step": 85625 }, { - "epoch": 3.01, - "learning_rate": 1.8022536024565474e-05, - "loss": 0.251, + "epoch": 3.0860993981331317, + "grad_norm": 0.24675370752811432, + "learning_rate": 1.688995120192341e-05, + "loss": 0.396, "step": 85630 }, { - "epoch": 3.01, - "learning_rate": 1.801980054973118e-05, - "loss": 0.2519, + "epoch": 3.086279597794356, + "grad_norm": 0.20427581667900085, + "learning_rate": 1.688719093815863e-05, + "loss": 0.3869, "step": 85635 }, { - "epoch": 3.01, - "learning_rate": 1.8017065165530162e-05, - "loss": 0.2593, + "epoch": 3.0864597974555807, + "grad_norm": 0.2514338493347168, + "learning_rate": 1.68844307849293e-05, + "loss": 0.3824, "step": 85640 }, { - "epoch": 3.01, - "learning_rate": 1.8014329871997963e-05, - "loss": 0.3128, + "epoch": 3.0866399971168055, + "grad_norm": 0.25234454870224, + "learning_rate": 1.6881670742273023e-05, + "loss": 0.3886, "step": 85645 }, { - "epoch": 3.01, - "learning_rate": 1.8011594669170078e-05, - "loss": 0.2533, + "epoch": 3.08682019677803, + "grad_norm": 0.26698270440101624, + "learning_rate": 1.6878910810227417e-05, + "loss": 0.3554, "step": 85650 }, { - "epoch": 3.01, - "learning_rate": 1.800885955708204e-05, - "loss": 0.2519, + "epoch": 3.0870003964392545, + "grad_norm": 0.21970665454864502, + "learning_rate": 1.687615098883007e-05, + "loss": 0.3698, "step": 85655 }, { - "epoch": 3.01, - "learning_rate": 1.8006124535769345e-05, - "loss": 0.2527, + "epoch": 3.0871805961004792, + "grad_norm": 0.2174971103668213, + "learning_rate": 1.68733912781186e-05, + "loss": 0.3598, "step": 85660 }, { - "epoch": 3.01, - "learning_rate": 1.800338960526752e-05, - "loss": 0.2399, + "epoch": 3.087360795761704, + "grad_norm": 0.26610609889030457, + "learning_rate": 1.68706316781306e-05, + "loss": 0.3654, "step": 85665 }, { - "epoch": 3.01, - "learning_rate": 1.8000654765612074e-05, - "loss": 0.2729, + "epoch": 3.0875409954229287, + "grad_norm": 0.22846719622612, + "learning_rate": 1.6867872188903667e-05, + "loss": 0.3823, "step": 85670 }, { - "epoch": 3.01, - "learning_rate": 1.7997920016838504e-05, - "loss": 0.2545, + "epoch": 3.0877211950841534, + "grad_norm": 0.23386985063552856, + "learning_rate": 1.6865112810475403e-05, + "loss": 0.3609, "step": 85675 }, { - "epoch": 3.01, - "learning_rate": 1.7995185358982323e-05, - "loss": 0.2372, + "epoch": 3.0879013947453777, + "grad_norm": 0.19239924848079681, + "learning_rate": 1.6862353542883404e-05, + "loss": 0.3711, "step": 85680 }, { - "epoch": 3.01, - "learning_rate": 1.7992450792079056e-05, - "loss": 0.2608, + 
"epoch": 3.0880815944066025, + "grad_norm": 0.18704915046691895, + "learning_rate": 1.6859594386165255e-05, + "loss": 0.3522, "step": 85685 }, { - "epoch": 3.01, - "learning_rate": 1.7989716316164198e-05, - "loss": 0.2648, + "epoch": 3.088261794067827, + "grad_norm": 0.2616778016090393, + "learning_rate": 1.6856835340358563e-05, + "loss": 0.3941, "step": 85690 }, { - "epoch": 3.01, - "learning_rate": 1.7986981931273254e-05, - "loss": 0.2433, + "epoch": 3.088441993729052, + "grad_norm": 0.21047228574752808, + "learning_rate": 1.685407640550091e-05, + "loss": 0.3471, "step": 85695 }, { - "epoch": 3.02, - "learning_rate": 1.7984247637441714e-05, - "loss": 0.2697, + "epoch": 3.088622193390276, + "grad_norm": 0.23516976833343506, + "learning_rate": 1.685131758162989e-05, + "loss": 0.3674, "step": 85700 }, { - "epoch": 3.02, - "learning_rate": 1.79815134347051e-05, - "loss": 0.2455, + "epoch": 3.088802393051501, + "grad_norm": 0.23034684360027313, + "learning_rate": 1.6848558868783098e-05, + "loss": 0.394, "step": 85705 }, { - "epoch": 3.02, - "learning_rate": 1.797877932309892e-05, - "loss": 0.2586, + "epoch": 3.0889825927127257, + "grad_norm": 0.2383045256137848, + "learning_rate": 1.6845800266998098e-05, + "loss": 0.3637, "step": 85710 }, { - "epoch": 3.02, - "learning_rate": 1.7976045302658657e-05, - "loss": 0.282, + "epoch": 3.0891627923739504, + "grad_norm": 0.21178589761257172, + "learning_rate": 1.6843041776312503e-05, + "loss": 0.364, "step": 85715 }, { - "epoch": 3.02, - "learning_rate": 1.797331137341981e-05, - "loss": 0.2418, + "epoch": 3.089342992035175, + "grad_norm": 0.2509348392486572, + "learning_rate": 1.6840283396763872e-05, + "loss": 0.3614, "step": 85720 }, { - "epoch": 3.02, - "learning_rate": 1.7970577535417892e-05, - "loss": 0.2651, + "epoch": 3.0895231916963994, + "grad_norm": 0.24692773818969727, + "learning_rate": 1.683752512838981e-05, + "loss": 0.3893, "step": 85725 }, { - "epoch": 3.02, - "learning_rate": 1.7967843788688388e-05, - "loss": 0.2649, + "epoch": 3.089703391357624, + "grad_norm": 0.19897426664829254, + "learning_rate": 1.6834766971227893e-05, + "loss": 0.3856, "step": 85730 }, { - "epoch": 3.02, - "learning_rate": 1.7965110133266806e-05, - "loss": 0.2598, + "epoch": 3.089883591018849, + "grad_norm": 0.21682973206043243, + "learning_rate": 1.6832008925315677e-05, + "loss": 0.3699, "step": 85735 }, { - "epoch": 3.02, - "learning_rate": 1.7962376569188617e-05, - "loss": 0.2503, + "epoch": 3.0900637906800736, + "grad_norm": 0.27642059326171875, + "learning_rate": 1.682925099069078e-05, + "loss": 0.3887, "step": 85740 }, { - "epoch": 3.02, - "learning_rate": 1.7959643096489342e-05, - "loss": 0.2473, + "epoch": 3.090243990341298, + "grad_norm": 0.23973903059959412, + "learning_rate": 1.6826493167390746e-05, + "loss": 0.3442, "step": 85745 }, { - "epoch": 3.02, - "learning_rate": 1.795690971520447e-05, - "loss": 0.2483, + "epoch": 3.0904241900025227, + "grad_norm": 0.21967796981334686, + "learning_rate": 1.6823735455453155e-05, + "loss": 0.4142, "step": 85750 }, { - "epoch": 3.02, - "learning_rate": 1.7954176425369463e-05, - "loss": 0.2506, + "epoch": 3.0906043896637474, + "grad_norm": 0.1877850890159607, + "learning_rate": 1.6820977854915593e-05, + "loss": 0.3623, "step": 85755 }, { - "epoch": 3.02, - "learning_rate": 1.7951443227019843e-05, - "loss": 0.2643, + "epoch": 3.090784589324972, + "grad_norm": 0.22713260352611542, + "learning_rate": 1.681822036581562e-05, + "loss": 0.3917, "step": 85760 }, { - "epoch": 3.02, - "learning_rate": 1.7948710120191086e-05, 
- "loss": 0.257, + "epoch": 3.090964788986197, + "grad_norm": 0.2456943690776825, + "learning_rate": 1.681546298819081e-05, + "loss": 0.3744, "step": 85765 }, { - "epoch": 3.02, - "learning_rate": 1.794597710491869e-05, - "loss": 0.2715, + "epoch": 3.091144988647421, + "grad_norm": 0.2482386976480484, + "learning_rate": 1.6812705722078738e-05, + "loss": 0.3686, "step": 85770 }, { - "epoch": 3.02, - "learning_rate": 1.794324418123811e-05, - "loss": 0.2561, + "epoch": 3.091325188308646, + "grad_norm": 0.28733116388320923, + "learning_rate": 1.6809948567516955e-05, + "loss": 0.3872, "step": 85775 }, { - "epoch": 3.02, - "learning_rate": 1.7940511349184873e-05, - "loss": 0.2645, + "epoch": 3.0915053879698706, + "grad_norm": 0.266248881816864, + "learning_rate": 1.6807191524543045e-05, + "loss": 0.3957, "step": 85780 }, { - "epoch": 3.02, - "learning_rate": 1.7937778608794437e-05, - "loss": 0.2468, + "epoch": 3.0916855876310954, + "grad_norm": 0.21898359060287476, + "learning_rate": 1.6804434593194565e-05, + "loss": 0.3749, "step": 85785 }, { - "epoch": 3.02, - "learning_rate": 1.7935045960102298e-05, - "loss": 0.2588, + "epoch": 3.09186578729232, + "grad_norm": 0.22204652428627014, + "learning_rate": 1.6801677773509074e-05, + "loss": 0.3562, "step": 85790 }, { - "epoch": 3.02, - "learning_rate": 1.793231340314392e-05, - "loss": 0.2506, + "epoch": 3.0920459869535444, + "grad_norm": 0.228468656539917, + "learning_rate": 1.6798921065524138e-05, + "loss": 0.3875, "step": 85795 }, { - "epoch": 3.02, - "learning_rate": 1.7929580937954803e-05, - "loss": 0.2624, + "epoch": 3.092226186614769, + "grad_norm": 0.2888953685760498, + "learning_rate": 1.679616446927731e-05, + "loss": 0.3727, "step": 85800 }, { - "epoch": 3.02, - "learning_rate": 1.792684856457042e-05, - "loss": 0.2708, + "epoch": 3.092406386275994, + "grad_norm": 0.2353111207485199, + "learning_rate": 1.6793407984806153e-05, + "loss": 0.4015, "step": 85805 }, { - "epoch": 3.02, - "learning_rate": 1.7924116283026244e-05, - "loss": 0.2593, + "epoch": 3.0925865859372186, + "grad_norm": 0.21720938384532928, + "learning_rate": 1.6790651612148235e-05, + "loss": 0.3622, "step": 85810 }, { - "epoch": 3.02, - "learning_rate": 1.7921384093357753e-05, - "loss": 0.2432, + "epoch": 3.092766785598443, + "grad_norm": 0.25723299384117126, + "learning_rate": 1.6787895351341083e-05, + "loss": 0.4135, "step": 85815 }, { - "epoch": 3.02, - "learning_rate": 1.7918651995600423e-05, - "loss": 0.2573, + "epoch": 3.0929469852596676, + "grad_norm": 0.2206987887620926, + "learning_rate": 1.678513920242228e-05, + "loss": 0.4013, "step": 85820 }, { - "epoch": 3.02, - "learning_rate": 1.7915919989789733e-05, - "loss": 0.2652, + "epoch": 3.0931271849208923, + "grad_norm": 0.23632338643074036, + "learning_rate": 1.6782383165429364e-05, + "loss": 0.3843, "step": 85825 }, { - "epoch": 3.02, - "learning_rate": 1.7913188075961156e-05, - "loss": 0.2361, + "epoch": 3.093307384582117, + "grad_norm": 0.19661295413970947, + "learning_rate": 1.677962724039987e-05, + "loss": 0.3777, "step": 85830 }, { - "epoch": 3.02, - "learning_rate": 1.7910456254150152e-05, - "loss": 0.242, + "epoch": 3.093487584243342, + "grad_norm": 0.2105666697025299, + "learning_rate": 1.6776871427371384e-05, + "loss": 0.3652, "step": 85835 }, { - "epoch": 3.02, - "learning_rate": 1.790772452439221e-05, - "loss": 0.2871, + "epoch": 3.093667783904566, + "grad_norm": 0.21203243732452393, + "learning_rate": 1.677411572638142e-05, + "loss": 0.3687, "step": 85840 }, { - "epoch": 3.02, - "learning_rate": 
1.7904992886722787e-05, - "loss": 0.2457, + "epoch": 3.093847983565791, + "grad_norm": 0.20673683285713196, + "learning_rate": 1.6771360137467546e-05, + "loss": 0.3816, "step": 85845 }, { - "epoch": 3.02, - "learning_rate": 1.7902261341177363e-05, - "loss": 0.2623, + "epoch": 3.0940281832270156, + "grad_norm": 0.23549970984458923, + "learning_rate": 1.6768604660667293e-05, + "loss": 0.3927, "step": 85850 }, { - "epoch": 3.02, - "learning_rate": 1.7899529887791386e-05, - "loss": 0.2535, + "epoch": 3.0942083828882403, + "grad_norm": 0.18549276888370514, + "learning_rate": 1.6765849296018203e-05, + "loss": 0.3757, "step": 85855 }, { - "epoch": 3.02, - "learning_rate": 1.7896798526600345e-05, - "loss": 0.2648, + "epoch": 3.094388582549465, + "grad_norm": 0.2178041934967041, + "learning_rate": 1.6763094043557825e-05, + "loss": 0.3638, "step": 85860 }, { - "epoch": 3.02, - "learning_rate": 1.7894067257639686e-05, - "loss": 0.2569, + "epoch": 3.0945687822106893, + "grad_norm": 0.25327563285827637, + "learning_rate": 1.67603389033237e-05, + "loss": 0.3797, "step": 85865 }, { - "epoch": 3.02, - "learning_rate": 1.7891336080944882e-05, - "loss": 0.2785, + "epoch": 3.094748981871914, + "grad_norm": 0.24706877768039703, + "learning_rate": 1.6757583875353355e-05, + "loss": 0.3832, "step": 85870 }, { - "epoch": 3.02, - "learning_rate": 1.7888604996551394e-05, - "loss": 0.2462, + "epoch": 3.094929181533139, + "grad_norm": 0.20161230862140656, + "learning_rate": 1.675482895968434e-05, + "loss": 0.3857, "step": 85875 }, { - "epoch": 3.02, - "learning_rate": 1.7885874004494692e-05, - "loss": 0.265, + "epoch": 3.0951093811943635, + "grad_norm": 0.23628760874271393, + "learning_rate": 1.675207415635418e-05, + "loss": 0.3709, "step": 85880 }, { - "epoch": 3.02, - "learning_rate": 1.7883143104810223e-05, - "loss": 0.2593, + "epoch": 3.095289580855588, + "grad_norm": 0.2771933376789093, + "learning_rate": 1.6749319465400414e-05, + "loss": 0.3934, "step": 85885 }, { - "epoch": 3.02, - "learning_rate": 1.7880412297533444e-05, - "loss": 0.2675, + "epoch": 3.0954697805168125, + "grad_norm": 0.2511608898639679, + "learning_rate": 1.6746564886860577e-05, + "loss": 0.4084, "step": 85890 }, { - "epoch": 3.02, - "learning_rate": 1.7877681582699827e-05, - "loss": 0.2416, + "epoch": 3.0956499801780373, + "grad_norm": 0.2165483683347702, + "learning_rate": 1.6743810420772186e-05, + "loss": 0.3714, "step": 85895 }, { - "epoch": 3.02, - "learning_rate": 1.7874950960344818e-05, - "loss": 0.2507, + "epoch": 3.095830179839262, + "grad_norm": 0.22518706321716309, + "learning_rate": 1.674105606717279e-05, + "loss": 0.375, "step": 85900 }, { - "epoch": 3.02, - "learning_rate": 1.7872220430503878e-05, - "loss": 0.2235, + "epoch": 3.0960103795004867, + "grad_norm": 0.21256521344184875, + "learning_rate": 1.67383018260999e-05, + "loss": 0.3688, "step": 85905 }, { - "epoch": 3.02, - "learning_rate": 1.7869489993212445e-05, - "loss": 0.2617, + "epoch": 3.096190579161711, + "grad_norm": 0.2093464583158493, + "learning_rate": 1.673554769759105e-05, + "loss": 0.3451, "step": 85910 }, { - "epoch": 3.02, - "learning_rate": 1.7866759648506e-05, - "loss": 0.2712, + "epoch": 3.0963707788229358, + "grad_norm": 0.16468511521816254, + "learning_rate": 1.673279368168377e-05, + "loss": 0.4124, "step": 85915 }, { - "epoch": 3.02, - "learning_rate": 1.786402939641998e-05, - "loss": 0.2656, + "epoch": 3.0965509784841605, + "grad_norm": 0.21614669263362885, + "learning_rate": 1.673003977841556e-05, + "loss": 0.3886, "step": 85920 }, { - "epoch": 3.02, - 
"learning_rate": 1.7861299236989832e-05, - "loss": 0.2583, + "epoch": 3.0967311781453852, + "grad_norm": 0.25867360830307007, + "learning_rate": 1.672728598782397e-05, + "loss": 0.388, "step": 85925 }, { - "epoch": 3.02, - "learning_rate": 1.7858569170251e-05, - "loss": 0.2673, + "epoch": 3.0969113778066095, + "grad_norm": 0.25877588987350464, + "learning_rate": 1.6724532309946506e-05, + "loss": 0.4002, "step": 85930 }, { - "epoch": 3.02, - "learning_rate": 1.7855839196238956e-05, - "loss": 0.2758, + "epoch": 3.0970915774678343, + "grad_norm": 0.22398416697978973, + "learning_rate": 1.6721778744820678e-05, + "loss": 0.4284, "step": 85935 }, { - "epoch": 3.02, - "learning_rate": 1.7853109314989125e-05, - "loss": 0.2585, + "epoch": 3.097271777129059, + "grad_norm": 0.2191021740436554, + "learning_rate": 1.671902529248402e-05, + "loss": 0.4128, "step": 85940 }, { - "epoch": 3.02, - "learning_rate": 1.7850379526536958e-05, - "loss": 0.2618, + "epoch": 3.0974519767902837, + "grad_norm": 0.23091648519039154, + "learning_rate": 1.6716271952974033e-05, + "loss": 0.3747, "step": 85945 }, { - "epoch": 3.02, - "learning_rate": 1.78476498309179e-05, - "loss": 0.2605, + "epoch": 3.0976321764515085, + "grad_norm": 0.2016901671886444, + "learning_rate": 1.671351872632824e-05, + "loss": 0.3443, "step": 85950 }, { - "epoch": 3.02, - "learning_rate": 1.7844920228167397e-05, - "loss": 0.2557, + "epoch": 3.0978123761127327, + "grad_norm": 0.2278120517730713, + "learning_rate": 1.671076561258415e-05, + "loss": 0.4025, "step": 85955 }, { - "epoch": 3.02, - "learning_rate": 1.7842190718320896e-05, - "loss": 0.2569, + "epoch": 3.0979925757739575, + "grad_norm": 0.18883484601974487, + "learning_rate": 1.6708012611779273e-05, + "loss": 0.3807, "step": 85960 }, { - "epoch": 3.02, - "learning_rate": 1.783946130141383e-05, - "loss": 0.2488, + "epoch": 3.098172775435182, + "grad_norm": 0.2371702492237091, + "learning_rate": 1.670525972395112e-05, + "loss": 0.3861, "step": 85965 }, { - "epoch": 3.02, - "learning_rate": 1.7836731977481628e-05, - "loss": 0.2511, + "epoch": 3.098352975096407, + "grad_norm": 0.2216133028268814, + "learning_rate": 1.6702506949137197e-05, + "loss": 0.3951, "step": 85970 }, { - "epoch": 3.02, - "learning_rate": 1.7834002746559754e-05, - "loss": 0.2645, + "epoch": 3.0985331747576312, + "grad_norm": 0.25640398263931274, + "learning_rate": 1.669975428737501e-05, + "loss": 0.3929, "step": 85975 }, { - "epoch": 3.03, - "learning_rate": 1.783127360868362e-05, - "loss": 0.2593, + "epoch": 3.098713374418856, + "grad_norm": 0.22617505490779877, + "learning_rate": 1.669700173870206e-05, + "loss": 0.4057, "step": 85980 }, { - "epoch": 3.03, - "learning_rate": 1.7828544563888675e-05, - "loss": 0.2706, + "epoch": 3.0988935740800807, + "grad_norm": 0.25993672013282776, + "learning_rate": 1.6694249303155857e-05, + "loss": 0.3954, "step": 85985 }, { - "epoch": 3.03, - "learning_rate": 1.782581561221036e-05, - "loss": 0.2762, + "epoch": 3.0990737737413054, + "grad_norm": 0.23373205959796906, + "learning_rate": 1.6691496980773903e-05, + "loss": 0.4111, "step": 85990 }, { - "epoch": 3.03, - "learning_rate": 1.7823086753684103e-05, - "loss": 0.2745, + "epoch": 3.09925397340253, + "grad_norm": 0.19884000718593597, + "learning_rate": 1.668874477159369e-05, + "loss": 0.4071, "step": 85995 }, { - "epoch": 3.03, - "learning_rate": 1.7820357988345336e-05, - "loss": 0.2397, + "epoch": 3.0994341730637545, + "grad_norm": 0.23313197493553162, + "learning_rate": 1.6685992675652717e-05, + "loss": 0.3718, "step": 86000 }, { - 
"epoch": 3.03, - "eval_loss": 0.2575805187225342, - "eval_runtime": 10.5481, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 3.0994341730637545, + "eval_loss": 0.4313110411167145, + "eval_runtime": 3.5318, + "eval_samples_per_second": 28.314, + "eval_steps_per_second": 7.079, "step": 86000 }, { - "epoch": 3.03, - "learning_rate": 1.7817629316229474e-05, - "loss": 0.2688, + "epoch": 3.099614372724979, + "grad_norm": 0.22738061845302582, + "learning_rate": 1.6683240692988492e-05, + "loss": 0.3886, "step": 86005 }, { - "epoch": 3.03, - "learning_rate": 1.781490073737197e-05, - "loss": 0.2693, + "epoch": 3.099794572386204, + "grad_norm": 0.21752703189849854, + "learning_rate": 1.6680488823638508e-05, + "loss": 0.4185, "step": 86010 }, { - "epoch": 3.03, - "learning_rate": 1.781217225180825e-05, - "loss": 0.2262, + "epoch": 3.0999747720474287, + "grad_norm": 0.22272847592830658, + "learning_rate": 1.6677737067640232e-05, + "loss": 0.3903, "step": 86015 }, { - "epoch": 3.03, - "learning_rate": 1.7809443859573738e-05, - "loss": 0.2576, + "epoch": 3.1001549717086534, + "grad_norm": 0.2768046259880066, + "learning_rate": 1.6674985425031197e-05, + "loss": 0.3713, "step": 86020 }, { - "epoch": 3.03, - "learning_rate": 1.7806715560703845e-05, - "loss": 0.2587, + "epoch": 3.1003351713698777, + "grad_norm": 0.23231996595859528, + "learning_rate": 1.667223389584886e-05, + "loss": 0.3758, "step": 86025 }, { - "epoch": 3.03, - "learning_rate": 1.7803987355234023e-05, - "loss": 0.2747, + "epoch": 3.1005153710311024, + "grad_norm": 0.21626095473766327, + "learning_rate": 1.6669482480130734e-05, + "loss": 0.3965, "step": 86030 }, { - "epoch": 3.03, - "learning_rate": 1.7801259243199676e-05, - "loss": 0.2475, + "epoch": 3.100695570692327, + "grad_norm": 0.2107696682214737, + "learning_rate": 1.6666731177914292e-05, + "loss": 0.3717, "step": 86035 }, { - "epoch": 3.03, - "learning_rate": 1.779853122463624e-05, - "loss": 0.2707, + "epoch": 3.100875770353552, + "grad_norm": 0.24785815179347992, + "learning_rate": 1.666397998923702e-05, + "loss": 0.3698, "step": 86040 }, { - "epoch": 3.03, - "learning_rate": 1.7795803299579116e-05, - "loss": 0.2519, + "epoch": 3.101055970014776, + "grad_norm": 0.21663641929626465, + "learning_rate": 1.6661228914136414e-05, + "loss": 0.3634, "step": 86045 }, { - "epoch": 3.03, - "learning_rate": 1.7793075468063755e-05, - "loss": 0.2443, + "epoch": 3.101236169676001, + "grad_norm": 0.23445236682891846, + "learning_rate": 1.6658477952649946e-05, + "loss": 0.387, "step": 86050 }, { - "epoch": 3.03, - "learning_rate": 1.7790347730125555e-05, - "loss": 0.2526, + "epoch": 3.1014163693372256, + "grad_norm": 0.20155306160449982, + "learning_rate": 1.6655727104815104e-05, + "loss": 0.3826, "step": 86055 }, { - "epoch": 3.03, - "learning_rate": 1.7787620085799935e-05, - "loss": 0.2438, + "epoch": 3.1015965689984504, + "grad_norm": 0.20625048875808716, + "learning_rate": 1.6652976370669362e-05, + "loss": 0.388, "step": 86060 }, { - "epoch": 3.03, - "learning_rate": 1.7784892535122312e-05, - "loss": 0.2435, + "epoch": 3.101776768659675, + "grad_norm": 0.2746394872665405, + "learning_rate": 1.6650225750250197e-05, + "loss": 0.3988, "step": 86065 }, { - "epoch": 3.03, - "learning_rate": 1.778216507812811e-05, - "loss": 0.2692, + "epoch": 3.1019569683208994, + "grad_norm": 0.21470344066619873, + "learning_rate": 1.6647475243595096e-05, + "loss": 0.3931, "step": 86070 }, { - "epoch": 3.03, - "learning_rate": 1.777943771485274e-05, - "loss": 0.2492, + "epoch": 
3.102137167982124, + "grad_norm": 0.22546519339084625, + "learning_rate": 1.6644724850741528e-05, + "loss": 0.3552, "step": 86075 }, { - "epoch": 3.03, - "learning_rate": 1.7776710445331613e-05, - "loss": 0.2531, + "epoch": 3.102317367643349, + "grad_norm": 0.24429123103618622, + "learning_rate": 1.6641974571726958e-05, + "loss": 0.3673, "step": 86080 }, { - "epoch": 3.03, - "learning_rate": 1.777398326960013e-05, - "loss": 0.248, + "epoch": 3.1024975673045736, + "grad_norm": 0.17867647111415863, + "learning_rate": 1.6639224406588876e-05, + "loss": 0.3625, "step": 86085 }, { - "epoch": 3.03, - "learning_rate": 1.777125618769372e-05, - "loss": 0.2593, + "epoch": 3.102677766965798, + "grad_norm": 0.23398911952972412, + "learning_rate": 1.6636474355364746e-05, + "loss": 0.3595, "step": 86090 }, { - "epoch": 3.03, - "learning_rate": 1.7768529199647784e-05, - "loss": 0.2566, + "epoch": 3.1028579666270226, + "grad_norm": 0.2413238137960434, + "learning_rate": 1.6633724418092016e-05, + "loss": 0.3635, "step": 86095 }, { - "epoch": 3.03, - "learning_rate": 1.7765802305497724e-05, - "loss": 0.2718, + "epoch": 3.1030381662882474, + "grad_norm": 0.23708127439022064, + "learning_rate": 1.6630974594808192e-05, + "loss": 0.3597, "step": 86100 }, { - "epoch": 3.03, - "learning_rate": 1.7763075505278962e-05, - "loss": 0.2584, + "epoch": 3.103218365949472, + "grad_norm": 0.20175603032112122, + "learning_rate": 1.6628224885550697e-05, + "loss": 0.4123, "step": 86105 }, { - "epoch": 3.03, - "learning_rate": 1.7760348799026896e-05, - "loss": 0.267, + "epoch": 3.103398565610697, + "grad_norm": 0.2378152459859848, + "learning_rate": 1.6625475290357037e-05, + "loss": 0.3938, "step": 86110 }, { - "epoch": 3.03, - "learning_rate": 1.7757622186776922e-05, - "loss": 0.2841, + "epoch": 3.103578765271921, + "grad_norm": 0.19391091167926788, + "learning_rate": 1.662272580926465e-05, + "loss": 0.3861, "step": 86115 }, { - "epoch": 3.03, - "learning_rate": 1.7754895668564452e-05, - "loss": 0.2695, + "epoch": 3.103758964933146, + "grad_norm": 0.20529715716838837, + "learning_rate": 1.6619976442310987e-05, + "loss": 0.3877, "step": 86120 }, { - "epoch": 3.03, - "learning_rate": 1.7752169244424894e-05, - "loss": 0.2629, + "epoch": 3.1039391645943706, + "grad_norm": 0.21286916732788086, + "learning_rate": 1.661722718953354e-05, + "loss": 0.3867, "step": 86125 }, { - "epoch": 3.03, - "learning_rate": 1.7749442914393644e-05, - "loss": 0.2655, + "epoch": 3.1041193642555953, + "grad_norm": 0.2500961422920227, + "learning_rate": 1.6614478050969738e-05, + "loss": 0.3611, "step": 86130 }, { - "epoch": 3.03, - "learning_rate": 1.7746716678506094e-05, - "loss": 0.2585, + "epoch": 3.10429956391682, + "grad_norm": 0.2693064212799072, + "learning_rate": 1.661172902665706e-05, + "loss": 0.3954, "step": 86135 }, { - "epoch": 3.03, - "learning_rate": 1.774399053679764e-05, - "loss": 0.2798, + "epoch": 3.1044797635780443, + "grad_norm": 0.22724072635173798, + "learning_rate": 1.6608980116632945e-05, + "loss": 0.3488, "step": 86140 }, { - "epoch": 3.03, - "learning_rate": 1.7741264489303694e-05, - "loss": 0.2878, + "epoch": 3.104659963239269, + "grad_norm": 0.24723055958747864, + "learning_rate": 1.660623132093485e-05, + "loss": 0.3801, "step": 86145 }, { - "epoch": 3.03, - "learning_rate": 1.773853853605965e-05, - "loss": 0.2479, + "epoch": 3.104840162900494, + "grad_norm": 0.26212194561958313, + "learning_rate": 1.6603482639600232e-05, + "loss": 0.3625, "step": 86150 }, { - "epoch": 3.03, - "learning_rate": 1.7735812677100896e-05, - 
"loss": 0.2513, + "epoch": 3.1050203625617185, + "grad_norm": 0.19167543947696686, + "learning_rate": 1.6600734072666535e-05, + "loss": 0.3962, "step": 86155 }, { - "epoch": 3.03, - "learning_rate": 1.7733086912462817e-05, - "loss": 0.2343, + "epoch": 3.105200562222943, + "grad_norm": 0.2559322118759155, + "learning_rate": 1.6597985620171207e-05, + "loss": 0.3864, "step": 86160 }, { - "epoch": 3.03, - "learning_rate": 1.7730361242180827e-05, - "loss": 0.2601, + "epoch": 3.1053807618841676, + "grad_norm": 0.17590028047561646, + "learning_rate": 1.65952372821517e-05, + "loss": 0.368, "step": 86165 }, { - "epoch": 3.03, - "learning_rate": 1.7727635666290298e-05, - "loss": 0.2608, + "epoch": 3.1055609615453923, + "grad_norm": 0.23424407839775085, + "learning_rate": 1.6592489058645455e-05, + "loss": 0.3734, "step": 86170 }, { - "epoch": 3.03, - "learning_rate": 1.772491018482664e-05, - "loss": 0.2794, + "epoch": 3.105741161206617, + "grad_norm": 0.215243399143219, + "learning_rate": 1.6589740949689926e-05, + "loss": 0.3799, "step": 86175 }, { - "epoch": 3.03, - "learning_rate": 1.772218479782521e-05, - "loss": 0.2484, + "epoch": 3.1059213608678418, + "grad_norm": 0.22049564123153687, + "learning_rate": 1.6586992955322546e-05, + "loss": 0.3542, "step": 86180 }, { - "epoch": 3.03, - "learning_rate": 1.7719459505321425e-05, - "loss": 0.2363, + "epoch": 3.106101560529066, + "grad_norm": 0.24883852899074554, + "learning_rate": 1.6584245075580753e-05, + "loss": 0.3818, "step": 86185 }, { - "epoch": 3.03, - "learning_rate": 1.7716734307350664e-05, - "loss": 0.2372, + "epoch": 3.106281760190291, + "grad_norm": 0.25981536507606506, + "learning_rate": 1.6581497310502e-05, + "loss": 0.3753, "step": 86190 }, { - "epoch": 3.03, - "learning_rate": 1.77140092039483e-05, - "loss": 0.2774, + "epoch": 3.1064619598515155, + "grad_norm": 0.23729638755321503, + "learning_rate": 1.6578749660123715e-05, + "loss": 0.3649, "step": 86195 }, { - "epoch": 3.03, - "learning_rate": 1.771128419514972e-05, - "loss": 0.2725, + "epoch": 3.1066421595127403, + "grad_norm": 0.1951550394296646, + "learning_rate": 1.6576002124483324e-05, + "loss": 0.3514, "step": 86200 }, { - "epoch": 3.03, - "learning_rate": 1.7708559280990322e-05, - "loss": 0.2564, + "epoch": 3.1068223591739645, + "grad_norm": 0.23986147344112396, + "learning_rate": 1.657325470361829e-05, + "loss": 0.3732, "step": 86205 }, { - "epoch": 3.03, - "learning_rate": 1.7705834461505477e-05, - "loss": 0.2536, + "epoch": 3.1070025588351893, + "grad_norm": 0.27552303671836853, + "learning_rate": 1.657050739756601e-05, + "loss": 0.3641, "step": 86210 }, { - "epoch": 3.03, - "learning_rate": 1.7703109736730556e-05, - "loss": 0.2554, + "epoch": 3.107182758496414, + "grad_norm": 0.2763289511203766, + "learning_rate": 1.6567760206363953e-05, + "loss": 0.3584, "step": 86215 }, { - "epoch": 3.03, - "learning_rate": 1.770038510670095e-05, - "loss": 0.2723, + "epoch": 3.1073629581576387, + "grad_norm": 0.21052367985248566, + "learning_rate": 1.6565013130049526e-05, + "loss": 0.3756, "step": 86220 }, { - "epoch": 3.03, - "learning_rate": 1.7697660571452034e-05, - "loss": 0.2475, + "epoch": 3.1075431578188635, + "grad_norm": 0.2306980937719345, + "learning_rate": 1.6562266168660153e-05, + "loss": 0.3861, "step": 86225 }, { - "epoch": 3.03, - "learning_rate": 1.7694936131019186e-05, - "loss": 0.2406, + "epoch": 3.1077233574800878, + "grad_norm": 0.2257169485092163, + "learning_rate": 1.655951932223328e-05, + "loss": 0.4018, "step": 86230 }, { - "epoch": 3.03, - "learning_rate": 
1.7692211785437768e-05, - "loss": 0.259, + "epoch": 3.1079035571413125, + "grad_norm": 0.20288598537445068, + "learning_rate": 1.6556772590806318e-05, + "loss": 0.3691, "step": 86235 }, { - "epoch": 3.03, - "learning_rate": 1.7689487534743177e-05, - "loss": 0.2834, + "epoch": 3.1080837568025372, + "grad_norm": 0.23873363435268402, + "learning_rate": 1.6554025974416693e-05, + "loss": 0.3547, "step": 86240 }, { - "epoch": 3.03, - "learning_rate": 1.7686763378970767e-05, - "loss": 0.289, + "epoch": 3.108263956463762, + "grad_norm": 0.2184680998325348, + "learning_rate": 1.6551279473101834e-05, + "loss": 0.3309, "step": 86245 }, { - "epoch": 3.03, - "learning_rate": 1.7684039318155914e-05, - "loss": 0.2726, + "epoch": 3.1084441561249863, + "grad_norm": 0.26064011454582214, + "learning_rate": 1.654853308689915e-05, + "loss": 0.3793, "step": 86250 }, { - "epoch": 3.03, - "learning_rate": 1.7681315352333987e-05, - "loss": 0.2704, + "epoch": 3.108624355786211, + "grad_norm": 0.22625210881233215, + "learning_rate": 1.6545786815846067e-05, + "loss": 0.401, "step": 86255 }, { - "epoch": 3.03, - "learning_rate": 1.7678591481540357e-05, - "loss": 0.2619, + "epoch": 3.1088045554474357, + "grad_norm": 0.21573030948638916, + "learning_rate": 1.654304065998001e-05, + "loss": 0.3825, "step": 86260 }, { - "epoch": 3.04, - "learning_rate": 1.76758677058104e-05, - "loss": 0.2641, + "epoch": 3.1089847551086605, + "grad_norm": 0.21487176418304443, + "learning_rate": 1.6540294619338377e-05, + "loss": 0.3793, "step": 86265 }, { - "epoch": 3.04, - "learning_rate": 1.7673144025179473e-05, - "loss": 0.2632, + "epoch": 3.109164954769885, + "grad_norm": 0.19863709807395935, + "learning_rate": 1.6537548693958593e-05, + "loss": 0.3945, "step": 86270 }, { - "epoch": 3.04, - "learning_rate": 1.767042043968293e-05, - "loss": 0.26, + "epoch": 3.1093451544311095, + "grad_norm": 0.2572442889213562, + "learning_rate": 1.6534802883878083e-05, + "loss": 0.3819, "step": 86275 }, { - "epoch": 3.04, - "learning_rate": 1.7667696949356156e-05, - "loss": 0.2451, + "epoch": 3.109525354092334, + "grad_norm": 0.26214733719825745, + "learning_rate": 1.6532057189134227e-05, + "loss": 0.4062, "step": 86280 }, { - "epoch": 3.04, - "learning_rate": 1.7664973554234505e-05, - "loss": 0.2431, + "epoch": 3.109705553753559, + "grad_norm": 0.21623064577579498, + "learning_rate": 1.6529311609764458e-05, + "loss": 0.3626, "step": 86285 }, { - "epoch": 3.04, - "learning_rate": 1.7662250254353337e-05, - "loss": 0.2845, + "epoch": 3.1098857534147837, + "grad_norm": 0.24655792117118835, + "learning_rate": 1.6526566145806177e-05, + "loss": 0.4102, "step": 86290 }, { - "epoch": 3.04, - "learning_rate": 1.765952704974801e-05, - "loss": 0.2732, + "epoch": 3.1100659530760084, + "grad_norm": 0.1786569356918335, + "learning_rate": 1.6523820797296796e-05, + "loss": 0.3942, "step": 86295 }, { - "epoch": 3.04, - "learning_rate": 1.7656803940453893e-05, - "loss": 0.227, + "epoch": 3.1102461527372327, + "grad_norm": 0.23387044668197632, + "learning_rate": 1.652107556427372e-05, + "loss": 0.3798, "step": 86300 }, { - "epoch": 3.04, - "learning_rate": 1.7654080926506335e-05, - "loss": 0.2505, + "epoch": 3.1104263523984574, + "grad_norm": 0.21776267886161804, + "learning_rate": 1.6518330446774334e-05, + "loss": 0.3593, "step": 86305 }, { - "epoch": 3.04, - "learning_rate": 1.7651358007940694e-05, - "loss": 0.2697, + "epoch": 3.110606552059682, + "grad_norm": 0.21297864615917206, + "learning_rate": 1.651558544483607e-05, + "loss": 0.3782, "step": 86310 }, { - "epoch": 
3.04, - "learning_rate": 1.764863518479232e-05, - "loss": 0.2797, + "epoch": 3.110786751720907, + "grad_norm": 0.2200326770544052, + "learning_rate": 1.6512840558496295e-05, + "loss": 0.3814, "step": 86315 }, { - "epoch": 3.04, - "learning_rate": 1.7645912457096586e-05, - "loss": 0.2775, + "epoch": 3.110966951382131, + "grad_norm": 0.2481064796447754, + "learning_rate": 1.651009578779244e-05, + "loss": 0.3499, "step": 86320 }, { - "epoch": 3.04, - "learning_rate": 1.7643189824888827e-05, - "loss": 0.2449, + "epoch": 3.111147151043356, + "grad_norm": 0.20087507367134094, + "learning_rate": 1.650735113276188e-05, + "loss": 0.3656, "step": 86325 }, { - "epoch": 3.04, - "learning_rate": 1.764046728820439e-05, - "loss": 0.2385, + "epoch": 3.1113273507045807, + "grad_norm": 0.225091814994812, + "learning_rate": 1.6504606593442014e-05, + "loss": 0.3703, "step": 86330 }, { - "epoch": 3.04, - "learning_rate": 1.7637744847078652e-05, - "loss": 0.2674, + "epoch": 3.1115075503658054, + "grad_norm": 0.20192007720470428, + "learning_rate": 1.650186216987024e-05, + "loss": 0.4075, "step": 86335 }, { - "epoch": 3.04, - "learning_rate": 1.7635022501546938e-05, - "loss": 0.2579, + "epoch": 3.11168775002703, + "grad_norm": 0.2589082717895508, + "learning_rate": 1.6499117862083953e-05, + "loss": 0.405, "step": 86340 }, { - "epoch": 3.04, - "learning_rate": 1.7632300251644606e-05, - "loss": 0.2655, + "epoch": 3.1118679496882544, + "grad_norm": 0.2047402262687683, + "learning_rate": 1.649637367012053e-05, + "loss": 0.3555, "step": 86345 }, { - "epoch": 3.04, - "learning_rate": 1.762957809740699e-05, - "loss": 0.2744, + "epoch": 3.112048149349479, + "grad_norm": 0.2570793032646179, + "learning_rate": 1.6493629594017376e-05, + "loss": 0.3869, "step": 86350 }, { - "epoch": 3.04, - "learning_rate": 1.7626856038869456e-05, - "loss": 0.247, + "epoch": 3.112228349010704, + "grad_norm": 0.23956820368766785, + "learning_rate": 1.6490885633811868e-05, + "loss": 0.3472, "step": 86355 }, { - "epoch": 3.04, - "learning_rate": 1.7624134076067336e-05, - "loss": 0.268, + "epoch": 3.1124085486719286, + "grad_norm": 0.24650585651397705, + "learning_rate": 1.64881417895414e-05, + "loss": 0.3631, "step": 86360 }, { - "epoch": 3.04, - "learning_rate": 1.762141220903597e-05, - "loss": 0.2387, + "epoch": 3.1125887483331534, + "grad_norm": 0.22965595126152039, + "learning_rate": 1.6485398061243353e-05, + "loss": 0.3844, "step": 86365 }, { - "epoch": 3.04, - "learning_rate": 1.7618690437810697e-05, - "loss": 0.2421, + "epoch": 3.1127689479943776, + "grad_norm": 0.22221916913986206, + "learning_rate": 1.6482654448955097e-05, + "loss": 0.3907, "step": 86370 }, { - "epoch": 3.04, - "learning_rate": 1.7615968762426876e-05, - "loss": 0.2817, + "epoch": 3.1129491476556024, + "grad_norm": 0.22202599048614502, + "learning_rate": 1.6479910952714038e-05, + "loss": 0.4044, "step": 86375 }, { - "epoch": 3.04, - "learning_rate": 1.7613247182919836e-05, - "loss": 0.2491, + "epoch": 3.113129347316827, + "grad_norm": 0.21764975786209106, + "learning_rate": 1.6477167572557547e-05, + "loss": 0.4054, "step": 86380 }, { - "epoch": 3.04, - "learning_rate": 1.7610525699324908e-05, - "loss": 0.2615, + "epoch": 3.113309546978052, + "grad_norm": 0.29039353132247925, + "learning_rate": 1.647442430852298e-05, + "loss": 0.3839, "step": 86385 }, { - "epoch": 3.04, - "learning_rate": 1.7607804311677427e-05, - "loss": 0.2524, + "epoch": 3.113489746639276, + "grad_norm": 0.1968279629945755, + "learning_rate": 1.6471681160647752e-05, + "loss": 0.3557, "step": 86390 }, 
{ - "epoch": 3.04, - "learning_rate": 1.7605083020012744e-05, - "loss": 0.2461, + "epoch": 3.113669946300501, + "grad_norm": 0.22088350355625153, + "learning_rate": 1.6468938128969194e-05, + "loss": 0.3762, "step": 86395 }, { - "epoch": 3.04, - "learning_rate": 1.7602361824366186e-05, - "loss": 0.2614, + "epoch": 3.1138501459617256, + "grad_norm": 0.30237895250320435, + "learning_rate": 1.6466195213524722e-05, + "loss": 0.4007, "step": 86400 }, { - "epoch": 3.04, - "learning_rate": 1.7599640724773085e-05, - "loss": 0.2538, + "epoch": 3.1140303456229503, + "grad_norm": 0.21387836337089539, + "learning_rate": 1.6463452414351683e-05, + "loss": 0.3915, "step": 86405 }, { - "epoch": 3.04, - "learning_rate": 1.759691972126876e-05, - "loss": 0.2505, + "epoch": 3.114210545284175, + "grad_norm": 0.22359785437583923, + "learning_rate": 1.646070973148744e-05, + "loss": 0.3664, "step": 86410 }, { - "epoch": 3.04, - "learning_rate": 1.7594198813888568e-05, - "loss": 0.2782, + "epoch": 3.1143907449453994, + "grad_norm": 0.2460165023803711, + "learning_rate": 1.645796716496939e-05, + "loss": 0.4142, "step": 86415 }, { - "epoch": 3.04, - "learning_rate": 1.7591478002667817e-05, - "loss": 0.2593, + "epoch": 3.114570944606624, + "grad_norm": 0.20692665874958038, + "learning_rate": 1.6455224714834876e-05, + "loss": 0.3838, "step": 86420 }, { - "epoch": 3.04, - "learning_rate": 1.7588757287641844e-05, - "loss": 0.2685, + "epoch": 3.114751144267849, + "grad_norm": 0.18075329065322876, + "learning_rate": 1.645248238112127e-05, + "loss": 0.3807, "step": 86425 }, { - "epoch": 3.04, - "learning_rate": 1.7586036668845966e-05, - "loss": 0.282, + "epoch": 3.1149313439290736, + "grad_norm": 0.21312923729419708, + "learning_rate": 1.644974016386594e-05, + "loss": 0.3996, "step": 86430 }, { - "epoch": 3.04, - "learning_rate": 1.7583316146315525e-05, - "loss": 0.2636, + "epoch": 3.115111543590298, + "grad_norm": 0.1856447458267212, + "learning_rate": 1.644699806310624e-05, + "loss": 0.3807, "step": 86435 }, { - "epoch": 3.04, - "learning_rate": 1.7580595720085835e-05, - "loss": 0.2661, + "epoch": 3.1152917432515226, + "grad_norm": 0.23847022652626038, + "learning_rate": 1.6444256078879537e-05, + "loss": 0.3593, "step": 86440 }, { - "epoch": 3.04, - "learning_rate": 1.757787539019221e-05, - "loss": 0.2676, + "epoch": 3.1154719429127473, + "grad_norm": 0.2461748719215393, + "learning_rate": 1.6441514211223197e-05, + "loss": 0.3845, "step": 86445 }, { - "epoch": 3.04, - "learning_rate": 1.7575155156669993e-05, - "loss": 0.257, + "epoch": 3.115652142573972, + "grad_norm": 0.17290343344211578, + "learning_rate": 1.6438772460174558e-05, + "loss": 0.385, "step": 86450 }, { - "epoch": 3.04, - "learning_rate": 1.757243501955449e-05, - "loss": 0.2919, + "epoch": 3.115832342235197, + "grad_norm": 0.2519152760505676, + "learning_rate": 1.6436030825770992e-05, + "loss": 0.3341, "step": 86455 }, { - "epoch": 3.04, - "learning_rate": 1.756971497888103e-05, - "loss": 0.2632, + "epoch": 3.116012541896421, + "grad_norm": 0.22315552830696106, + "learning_rate": 1.6433289308049847e-05, + "loss": 0.3806, "step": 86460 }, { - "epoch": 3.04, - "learning_rate": 1.7566995034684908e-05, - "loss": 0.2783, + "epoch": 3.116192741557646, + "grad_norm": 0.20392777025699615, + "learning_rate": 1.6430547907048474e-05, + "loss": 0.3915, "step": 86465 }, { - "epoch": 3.04, - "learning_rate": 1.7564275187001472e-05, - "loss": 0.2767, + "epoch": 3.1163729412188705, + "grad_norm": 0.2624046206474304, + "learning_rate": 1.6427806622804233e-05, + "loss": 
0.4327, "step": 86470 }, { - "epoch": 3.04, - "learning_rate": 1.7561555435866012e-05, - "loss": 0.2671, + "epoch": 3.1165531408800953, + "grad_norm": 0.1981981098651886, + "learning_rate": 1.6425065455354457e-05, + "loss": 0.3494, "step": 86475 }, { - "epoch": 3.04, - "learning_rate": 1.755883578131386e-05, - "loss": 0.2403, + "epoch": 3.1167333405413196, + "grad_norm": 0.2108498364686966, + "learning_rate": 1.6422324404736512e-05, + "loss": 0.3365, "step": 86480 }, { - "epoch": 3.04, - "learning_rate": 1.7556116223380315e-05, - "loss": 0.2492, + "epoch": 3.1169135402025443, + "grad_norm": 0.30326491594314575, + "learning_rate": 1.6419583470987742e-05, + "loss": 0.3961, "step": 86485 }, { - "epoch": 3.04, - "learning_rate": 1.7553396762100702e-05, - "loss": 0.257, + "epoch": 3.117093739863769, + "grad_norm": 0.22434242069721222, + "learning_rate": 1.641684265414547e-05, + "loss": 0.3698, "step": 86490 }, { - "epoch": 3.04, - "learning_rate": 1.7550677397510324e-05, - "loss": 0.2787, + "epoch": 3.1172739395249938, + "grad_norm": 0.2361210584640503, + "learning_rate": 1.641410195424707e-05, + "loss": 0.3758, "step": 86495 }, { - "epoch": 3.04, - "learning_rate": 1.754795812964449e-05, - "loss": 0.2483, + "epoch": 3.1174541391862185, + "grad_norm": 0.23360228538513184, + "learning_rate": 1.6411361371329852e-05, + "loss": 0.3541, "step": 86500 }, { - "epoch": 3.04, - "eval_loss": 0.2569468021392822, - "eval_runtime": 10.5367, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 3.1174541391862185, + "eval_loss": 0.43187737464904785, + "eval_runtime": 3.5321, + "eval_samples_per_second": 28.311, + "eval_steps_per_second": 7.078, "step": 86500 }, { - "epoch": 3.04, - "learning_rate": 1.7545238958538496e-05, - "loss": 0.2703, + "epoch": 3.117634338847443, + "grad_norm": 0.26229509711265564, + "learning_rate": 1.6408620905431192e-05, + "loss": 0.3866, "step": 86505 }, { - "epoch": 3.04, - "learning_rate": 1.7542519884227677e-05, - "loss": 0.2612, + "epoch": 3.1178145385086675, + "grad_norm": 0.22185193002223969, + "learning_rate": 1.64058805565884e-05, + "loss": 0.3794, "step": 86510 }, { - "epoch": 3.04, - "learning_rate": 1.753980090674732e-05, - "loss": 0.2541, + "epoch": 3.1179947381698923, + "grad_norm": 0.26995113492012024, + "learning_rate": 1.6403140324838817e-05, + "loss": 0.3945, "step": 86515 }, { - "epoch": 3.04, - "learning_rate": 1.7537082026132734e-05, - "loss": 0.2626, + "epoch": 3.118174937831117, + "grad_norm": 0.30093368887901306, + "learning_rate": 1.640040021021979e-05, + "loss": 0.3729, "step": 86520 }, { - "epoch": 3.04, - "learning_rate": 1.7534363242419208e-05, - "loss": 0.2525, + "epoch": 3.1183551374923417, + "grad_norm": 0.24983744323253632, + "learning_rate": 1.6397660212768643e-05, + "loss": 0.369, "step": 86525 }, { - "epoch": 3.04, - "learning_rate": 1.753164455564206e-05, - "loss": 0.2516, + "epoch": 3.118535337153566, + "grad_norm": 0.2591390609741211, + "learning_rate": 1.6394920332522705e-05, + "loss": 0.398, "step": 86530 }, { - "epoch": 3.04, - "learning_rate": 1.752892596583659e-05, - "loss": 0.2643, + "epoch": 3.1187155368147907, + "grad_norm": 0.23025575280189514, + "learning_rate": 1.6392180569519316e-05, + "loss": 0.38, "step": 86535 }, { - "epoch": 3.04, - "learning_rate": 1.7526207473038096e-05, - "loss": 0.2558, + "epoch": 3.1188957364760155, + "grad_norm": 0.1807858794927597, + "learning_rate": 1.6389440923795795e-05, + "loss": 0.3726, "step": 86540 }, { - "epoch": 3.04, - "learning_rate": 1.752348907728186e-05, - "loss": 
0.2617, + "epoch": 3.11907593613724, + "grad_norm": 0.24324113130569458, + "learning_rate": 1.6386701395389484e-05, + "loss": 0.3855, "step": 86545 }, { - "epoch": 3.05, - "learning_rate": 1.7520770778603203e-05, - "loss": 0.2511, + "epoch": 3.1192561357984645, + "grad_norm": 0.17595435678958893, + "learning_rate": 1.6383961984337694e-05, + "loss": 0.3635, "step": 86550 }, { - "epoch": 3.05, - "learning_rate": 1.7518052577037404e-05, - "loss": 0.271, + "epoch": 3.1194363354596892, + "grad_norm": 0.2983537018299103, + "learning_rate": 1.638122269067775e-05, + "loss": 0.393, "step": 86555 }, { - "epoch": 3.05, - "learning_rate": 1.751533447261975e-05, - "loss": 0.2645, + "epoch": 3.119616535120914, + "grad_norm": 0.25049325823783875, + "learning_rate": 1.6378483514446984e-05, + "loss": 0.3885, "step": 86560 }, { - "epoch": 3.05, - "learning_rate": 1.7512616465385562e-05, - "loss": 0.2534, + "epoch": 3.1197967347821387, + "grad_norm": 0.26442599296569824, + "learning_rate": 1.637574445568272e-05, + "loss": 0.3671, "step": 86565 }, { - "epoch": 3.05, - "learning_rate": 1.7509898555370113e-05, - "loss": 0.2519, + "epoch": 3.1199769344433634, + "grad_norm": 0.2337508350610733, + "learning_rate": 1.6373005514422246e-05, + "loss": 0.3758, "step": 86570 }, { - "epoch": 3.05, - "learning_rate": 1.7507180742608695e-05, - "loss": 0.2461, + "epoch": 3.1201571341045877, + "grad_norm": 0.2032548189163208, + "learning_rate": 1.6370266690702912e-05, + "loss": 0.3562, "step": 86575 }, { - "epoch": 3.05, - "learning_rate": 1.7504463027136585e-05, - "loss": 0.249, + "epoch": 3.1203373337658125, + "grad_norm": 0.24068522453308105, + "learning_rate": 1.6367527984562024e-05, + "loss": 0.36, "step": 86580 }, { - "epoch": 3.05, - "learning_rate": 1.750174540898909e-05, - "loss": 0.2673, + "epoch": 3.120517533427037, + "grad_norm": 0.26474300026893616, + "learning_rate": 1.6364789396036895e-05, + "loss": 0.4097, "step": 86585 }, { - "epoch": 3.05, - "learning_rate": 1.7499027888201496e-05, - "loss": 0.2567, + "epoch": 3.120697733088262, + "grad_norm": 0.17933666706085205, + "learning_rate": 1.6362050925164846e-05, + "loss": 0.3599, "step": 86590 }, { - "epoch": 3.05, - "learning_rate": 1.749631046480908e-05, - "loss": 0.2565, + "epoch": 3.120877932749486, + "grad_norm": 0.23188619315624237, + "learning_rate": 1.635931257198317e-05, + "loss": 0.3753, "step": 86595 }, { - "epoch": 3.05, - "learning_rate": 1.7493593138847113e-05, - "loss": 0.2932, + "epoch": 3.121058132410711, + "grad_norm": 0.19330739974975586, + "learning_rate": 1.6356574336529196e-05, + "loss": 0.3503, "step": 86600 }, { - "epoch": 3.05, - "learning_rate": 1.749087591035091e-05, - "loss": 0.2615, + "epoch": 3.1212383320719357, + "grad_norm": 0.21732577681541443, + "learning_rate": 1.6353836218840223e-05, + "loss": 0.4139, "step": 86605 }, { - "epoch": 3.05, - "learning_rate": 1.7488158779355724e-05, - "loss": 0.2538, + "epoch": 3.1214185317331604, + "grad_norm": 0.2698850929737091, + "learning_rate": 1.635109821895355e-05, + "loss": 0.3719, "step": 86610 }, { - "epoch": 3.05, - "learning_rate": 1.7485441745896853e-05, - "loss": 0.2511, + "epoch": 3.121598731394385, + "grad_norm": 0.21727710962295532, + "learning_rate": 1.6348360336906492e-05, + "loss": 0.367, "step": 86615 }, { - "epoch": 3.05, - "learning_rate": 1.748272481000956e-05, - "loss": 0.2427, + "epoch": 3.1217789310556094, + "grad_norm": 0.23365622758865356, + "learning_rate": 1.6345622572736345e-05, + "loss": 0.4036, "step": 86620 }, { - "epoch": 3.05, - "learning_rate": 
1.748000797172914e-05, - "loss": 0.2435, + "epoch": 3.121959130716834, + "grad_norm": 0.23919281363487244, + "learning_rate": 1.6342884926480425e-05, + "loss": 0.3967, "step": 86625 }, { - "epoch": 3.05, - "learning_rate": 1.7477291231090857e-05, - "loss": 0.242, + "epoch": 3.122139330378059, + "grad_norm": 0.1928279846906662, + "learning_rate": 1.634014739817602e-05, + "loss": 0.4055, "step": 86630 }, { - "epoch": 3.05, - "learning_rate": 1.7474574588129987e-05, - "loss": 0.2656, + "epoch": 3.1223195300392836, + "grad_norm": 0.21561673283576965, + "learning_rate": 1.6337409987860423e-05, + "loss": 0.3846, "step": 86635 }, { - "epoch": 3.05, - "learning_rate": 1.7471858042881805e-05, - "loss": 0.2405, + "epoch": 3.1224997297005084, + "grad_norm": 0.2624969780445099, + "learning_rate": 1.6334672695570942e-05, + "loss": 0.4019, "step": 86640 }, { - "epoch": 3.05, - "learning_rate": 1.7469141595381594e-05, - "loss": 0.2518, + "epoch": 3.1226799293617327, + "grad_norm": 0.18644605576992035, + "learning_rate": 1.633193552134487e-05, + "loss": 0.3881, "step": 86645 }, { - "epoch": 3.05, - "learning_rate": 1.7466425245664614e-05, - "loss": 0.2752, + "epoch": 3.1228601290229574, + "grad_norm": 0.20565631985664368, + "learning_rate": 1.632919846521949e-05, + "loss": 0.3493, "step": 86650 }, { - "epoch": 3.05, - "learning_rate": 1.7463708993766137e-05, - "loss": 0.2634, + "epoch": 3.123040328684182, + "grad_norm": 0.2321755588054657, + "learning_rate": 1.632646152723211e-05, + "loss": 0.3675, "step": 86655 }, { - "epoch": 3.05, - "learning_rate": 1.7460992839721422e-05, - "loss": 0.2472, + "epoch": 3.123220528345407, + "grad_norm": 0.20580174028873444, + "learning_rate": 1.6323724707420013e-05, + "loss": 0.3705, "step": 86660 }, { - "epoch": 3.05, - "learning_rate": 1.7458276783565754e-05, - "loss": 0.2352, + "epoch": 3.123400728006631, + "grad_norm": 0.19330060482025146, + "learning_rate": 1.6320988005820484e-05, + "loss": 0.3382, "step": 86665 }, { - "epoch": 3.05, - "learning_rate": 1.74555608253344e-05, - "loss": 0.2356, + "epoch": 3.123580927667856, + "grad_norm": 0.20522934198379517, + "learning_rate": 1.631825142247082e-05, + "loss": 0.3555, "step": 86670 }, { - "epoch": 3.05, - "learning_rate": 1.7452844965062598e-05, - "loss": 0.2326, + "epoch": 3.1237611273290806, + "grad_norm": 0.23507462441921234, + "learning_rate": 1.6315514957408284e-05, + "loss": 0.3862, "step": 86675 }, { - "epoch": 3.05, - "learning_rate": 1.7450129202785647e-05, - "loss": 0.2642, + "epoch": 3.1239413269903054, + "grad_norm": 0.2328176349401474, + "learning_rate": 1.6312778610670193e-05, + "loss": 0.3903, "step": 86680 }, { - "epoch": 3.05, - "learning_rate": 1.744741353853879e-05, - "loss": 0.2588, + "epoch": 3.12412152665153, + "grad_norm": 0.220947727560997, + "learning_rate": 1.6310042382293806e-05, + "loss": 0.3965, "step": 86685 }, { - "epoch": 3.05, - "learning_rate": 1.7444697972357287e-05, - "loss": 0.2801, + "epoch": 3.1243017263127544, + "grad_norm": 0.2658975124359131, + "learning_rate": 1.6307306272316402e-05, + "loss": 0.3763, "step": 86690 }, { - "epoch": 3.05, - "learning_rate": 1.74419825042764e-05, - "loss": 0.2571, + "epoch": 3.124481925973979, + "grad_norm": 0.2326037585735321, + "learning_rate": 1.6304570280775275e-05, + "loss": 0.4033, "step": 86695 }, { - "epoch": 3.05, - "learning_rate": 1.7439267134331393e-05, - "loss": 0.2463, + "epoch": 3.124662125635204, + "grad_norm": 0.22777698934078217, + "learning_rate": 1.6301834407707684e-05, + "loss": 0.4019, "step": 86700 }, { - "epoch": 3.05, - 
"learning_rate": 1.743655186255752e-05, - "loss": 0.2573, + "epoch": 3.1248423252964286, + "grad_norm": 0.2394534796476364, + "learning_rate": 1.6299098653150926e-05, + "loss": 0.3871, "step": 86705 }, { - "epoch": 3.05, - "learning_rate": 1.743383668899004e-05, - "loss": 0.2593, + "epoch": 3.125022524957653, + "grad_norm": 0.18719437718391418, + "learning_rate": 1.6296363017142264e-05, + "loss": 0.3503, "step": 86710 }, { - "epoch": 3.05, - "learning_rate": 1.7431121613664193e-05, - "loss": 0.257, + "epoch": 3.1252027246188776, + "grad_norm": 0.24379460513591766, + "learning_rate": 1.6293627499718962e-05, + "loss": 0.3867, "step": 86715 }, { - "epoch": 3.05, - "learning_rate": 1.7428406636615252e-05, - "loss": 0.2623, + "epoch": 3.1253829242801023, + "grad_norm": 0.20296038687229156, + "learning_rate": 1.629089210091831e-05, + "loss": 0.3609, "step": 86720 }, { - "epoch": 3.05, - "learning_rate": 1.742569175787846e-05, - "loss": 0.2597, + "epoch": 3.125563123941327, + "grad_norm": 0.21820303797721863, + "learning_rate": 1.6288156820777556e-05, + "loss": 0.3685, "step": 86725 }, { - "epoch": 3.05, - "learning_rate": 1.742297697748907e-05, - "loss": 0.2608, + "epoch": 3.125743323602552, + "grad_norm": 0.25657927989959717, + "learning_rate": 1.628542165933399e-05, + "loss": 0.4238, "step": 86730 }, { - "epoch": 3.05, - "learning_rate": 1.742026229548232e-05, - "loss": 0.2469, + "epoch": 3.125923523263776, + "grad_norm": 0.26692578196525574, + "learning_rate": 1.628268661662486e-05, + "loss": 0.3551, "step": 86735 }, { - "epoch": 3.05, - "learning_rate": 1.7417547711893485e-05, - "loss": 0.2675, + "epoch": 3.126103722925001, + "grad_norm": 0.23433983325958252, + "learning_rate": 1.6279951692687433e-05, + "loss": 0.3719, "step": 86740 }, { - "epoch": 3.05, - "learning_rate": 1.7414833226757786e-05, - "loss": 0.2315, + "epoch": 3.1262839225862256, + "grad_norm": 0.19793805480003357, + "learning_rate": 1.6277216887558982e-05, + "loss": 0.3766, "step": 86745 }, { - "epoch": 3.05, - "learning_rate": 1.7412118840110484e-05, - "loss": 0.2394, + "epoch": 3.1264641222474503, + "grad_norm": 0.2359396368265152, + "learning_rate": 1.6274482201276765e-05, + "loss": 0.3728, "step": 86750 }, { - "epoch": 3.05, - "learning_rate": 1.740940455198681e-05, - "loss": 0.267, + "epoch": 3.1266443219086746, + "grad_norm": 0.24847714602947235, + "learning_rate": 1.6271747633878032e-05, + "loss": 0.3969, "step": 86755 }, { - "epoch": 3.05, - "learning_rate": 1.7406690362422023e-05, - "loss": 0.2702, + "epoch": 3.1268245215698993, + "grad_norm": 0.26364535093307495, + "learning_rate": 1.6269013185400054e-05, + "loss": 0.3949, "step": 86760 }, { - "epoch": 3.05, - "learning_rate": 1.7403976271451358e-05, - "loss": 0.2547, + "epoch": 3.127004721231124, + "grad_norm": 0.22220592200756073, + "learning_rate": 1.6266278855880075e-05, + "loss": 0.3691, "step": 86765 }, { - "epoch": 3.05, - "learning_rate": 1.7401805069686268e-05, - "loss": 0.2618, + "epoch": 3.127184920892349, + "grad_norm": 0.2486381232738495, + "learning_rate": 1.626354464535536e-05, + "loss": 0.3708, "step": 86770 }, { - "epoch": 3.05, - "learning_rate": 1.7399091156273828e-05, - "loss": 0.2611, + "epoch": 3.1273651205535735, + "grad_norm": 0.23804841935634613, + "learning_rate": 1.626081055386316e-05, + "loss": 0.3858, "step": 86775 }, { - "epoch": 3.05, - "learning_rate": 1.7396377341554176e-05, - "loss": 0.2334, + "epoch": 3.127545320214798, + "grad_norm": 0.19268541038036346, + "learning_rate": 1.6258076581440712e-05, + "loss": 0.3594, "step": 86780 }, 
{ - "epoch": 3.05, - "learning_rate": 1.7393663625562544e-05, - "loss": 0.2733, + "epoch": 3.1277255198760225, + "grad_norm": 0.2013455331325531, + "learning_rate": 1.6255342728125295e-05, + "loss": 0.3667, "step": 86785 }, { - "epoch": 3.05, - "learning_rate": 1.7390950008334194e-05, - "loss": 0.2609, + "epoch": 3.1279057195372473, + "grad_norm": 0.26369357109069824, + "learning_rate": 1.6252608993954132e-05, + "loss": 0.4069, "step": 86790 }, { - "epoch": 3.05, - "learning_rate": 1.738823648990434e-05, - "loss": 0.2555, + "epoch": 3.128085919198472, + "grad_norm": 0.23781251907348633, + "learning_rate": 1.6249875378964475e-05, + "loss": 0.3672, "step": 86795 }, { - "epoch": 3.05, - "learning_rate": 1.7385523070308218e-05, - "loss": 0.2658, + "epoch": 3.1282661188596967, + "grad_norm": 0.1853938102722168, + "learning_rate": 1.6247141883193578e-05, + "loss": 0.3969, "step": 86800 }, { - "epoch": 3.05, - "learning_rate": 1.738280974958105e-05, - "loss": 0.2902, + "epoch": 3.128446318520921, + "grad_norm": 0.1940661072731018, + "learning_rate": 1.624440850667867e-05, + "loss": 0.3897, "step": 86805 }, { - "epoch": 3.05, - "learning_rate": 1.738009652775809e-05, - "loss": 0.2581, + "epoch": 3.1286265181821458, + "grad_norm": 0.21411295235157013, + "learning_rate": 1.624167524945701e-05, + "loss": 0.386, "step": 86810 }, { - "epoch": 3.05, - "learning_rate": 1.737738340487455e-05, - "loss": 0.2536, + "epoch": 3.1288067178433705, + "grad_norm": 0.19931474328041077, + "learning_rate": 1.6238942111565826e-05, + "loss": 0.4114, "step": 86815 }, { - "epoch": 3.05, - "learning_rate": 1.7374670380965668e-05, - "loss": 0.2576, + "epoch": 3.1289869175045952, + "grad_norm": 0.19459813833236694, + "learning_rate": 1.6236209093042355e-05, + "loss": 0.3829, "step": 86820 }, { - "epoch": 3.05, - "learning_rate": 1.7371957456066657e-05, - "loss": 0.2596, + "epoch": 3.1291671171658195, + "grad_norm": 0.18339698016643524, + "learning_rate": 1.623347619392384e-05, + "loss": 0.3981, "step": 86825 }, { - "epoch": 3.05, - "learning_rate": 1.7369244630212757e-05, - "loss": 0.2531, + "epoch": 3.1293473168270443, + "grad_norm": 0.21319806575775146, + "learning_rate": 1.6230743414247522e-05, + "loss": 0.3896, "step": 86830 }, { - "epoch": 3.06, - "learning_rate": 1.7366531903439184e-05, - "loss": 0.2439, + "epoch": 3.129527516488269, + "grad_norm": 0.2697208523750305, + "learning_rate": 1.6228010754050615e-05, + "loss": 0.4042, "step": 86835 }, { - "epoch": 3.06, - "learning_rate": 1.7363819275781168e-05, - "loss": 0.2591, + "epoch": 3.1297077161494937, + "grad_norm": 0.2193807065486908, + "learning_rate": 1.6225278213370373e-05, + "loss": 0.4035, "step": 86840 }, { - "epoch": 3.06, - "learning_rate": 1.7361106747273918e-05, - "loss": 0.2785, + "epoch": 3.1298879158107185, + "grad_norm": 0.195016011595726, + "learning_rate": 1.622254579224401e-05, + "loss": 0.3613, "step": 86845 }, { - "epoch": 3.06, - "learning_rate": 1.7358394317952668e-05, - "loss": 0.2839, + "epoch": 3.1300681154719427, + "grad_norm": 0.19086046516895294, + "learning_rate": 1.6219813490708767e-05, + "loss": 0.4081, "step": 86850 }, { - "epoch": 3.06, - "learning_rate": 1.735568198785264e-05, - "loss": 0.2684, + "epoch": 3.1302483151331675, + "grad_norm": 0.1884526163339615, + "learning_rate": 1.621708130880187e-05, + "loss": 0.3415, "step": 86855 }, { - "epoch": 3.06, - "learning_rate": 1.7352969757009025e-05, - "loss": 0.2733, + "epoch": 3.130428514794392, + "grad_norm": 0.2053939402103424, + "learning_rate": 1.6214349246560522e-05, + "loss": 
0.38, "step": 86860 }, { - "epoch": 3.06, - "learning_rate": 1.735025762545707e-05, - "loss": 0.2549, + "epoch": 3.130608714455617, + "grad_norm": 0.18308919668197632, + "learning_rate": 1.6211617304021977e-05, + "loss": 0.3578, "step": 86865 }, { - "epoch": 3.06, - "learning_rate": 1.734754559323198e-05, - "loss": 0.2776, + "epoch": 3.1307889141168417, + "grad_norm": 0.21582022309303284, + "learning_rate": 1.620888548122345e-05, + "loss": 0.3648, "step": 86870 }, { - "epoch": 3.06, - "learning_rate": 1.7344833660368967e-05, - "loss": 0.2678, + "epoch": 3.130969113778066, + "grad_norm": 0.2522687315940857, + "learning_rate": 1.6206153778202144e-05, + "loss": 0.3881, "step": 86875 }, { - "epoch": 3.06, - "learning_rate": 1.7342121826903235e-05, - "loss": 0.2734, + "epoch": 3.1311493134392907, + "grad_norm": 0.2382495403289795, + "learning_rate": 1.62034221949953e-05, + "loss": 0.3718, "step": 86880 }, { - "epoch": 3.06, - "learning_rate": 1.7339410092870013e-05, - "loss": 0.2364, + "epoch": 3.1313295131005154, + "grad_norm": 0.2659468650817871, + "learning_rate": 1.6200690731640112e-05, + "loss": 0.3654, "step": 86885 }, { - "epoch": 3.06, - "learning_rate": 1.7336698458304497e-05, - "loss": 0.2606, + "epoch": 3.13150971276174, + "grad_norm": 0.24328552186489105, + "learning_rate": 1.6197959388173825e-05, + "loss": 0.3785, "step": 86890 }, { - "epoch": 3.06, - "learning_rate": 1.7333986923241906e-05, - "loss": 0.2835, + "epoch": 3.1316899124229645, + "grad_norm": 0.22692778706550598, + "learning_rate": 1.6195228164633634e-05, + "loss": 0.363, "step": 86895 }, { - "epoch": 3.06, - "learning_rate": 1.7331275487717437e-05, - "loss": 0.239, + "epoch": 3.131870112084189, + "grad_norm": 0.24278061091899872, + "learning_rate": 1.6192497061056747e-05, + "loss": 0.3806, "step": 86900 }, { - "epoch": 3.06, - "learning_rate": 1.732856415176631e-05, - "loss": 0.2545, + "epoch": 3.132050311745414, + "grad_norm": 0.23407572507858276, + "learning_rate": 1.6189766077480386e-05, + "loss": 0.3659, "step": 86905 }, { - "epoch": 3.06, - "learning_rate": 1.732585291542372e-05, - "loss": 0.2363, + "epoch": 3.1322305114066387, + "grad_norm": 0.24145354330539703, + "learning_rate": 1.6187035213941754e-05, + "loss": 0.3704, "step": 86910 }, { - "epoch": 3.06, - "learning_rate": 1.732314177872487e-05, - "loss": 0.2535, + "epoch": 3.1324107110678634, + "grad_norm": 0.18080276250839233, + "learning_rate": 1.618430447047807e-05, + "loss": 0.3888, "step": 86915 }, { - "epoch": 3.06, - "learning_rate": 1.7320430741704962e-05, - "loss": 0.266, + "epoch": 3.1325909107290877, + "grad_norm": 0.2304733842611313, + "learning_rate": 1.6181573847126525e-05, + "loss": 0.3904, "step": 86920 }, { - "epoch": 3.06, - "learning_rate": 1.73177198043992e-05, - "loss": 0.2702, + "epoch": 3.1327711103903124, + "grad_norm": 0.261940598487854, + "learning_rate": 1.617884334392432e-05, + "loss": 0.3794, "step": 86925 }, { - "epoch": 3.06, - "learning_rate": 1.7315008966842788e-05, - "loss": 0.2557, + "epoch": 3.132951310051537, + "grad_norm": 0.21325023472309113, + "learning_rate": 1.617611296090868e-05, + "loss": 0.3466, "step": 86930 }, { - "epoch": 3.06, - "learning_rate": 1.731229822907092e-05, - "loss": 0.2587, + "epoch": 3.133131509712762, + "grad_norm": 0.21175888180732727, + "learning_rate": 1.617338269811679e-05, + "loss": 0.3665, "step": 86935 }, { - "epoch": 3.06, - "learning_rate": 1.7309587591118787e-05, - "loss": 0.2599, + "epoch": 3.133311709373986, + "grad_norm": 0.2089497148990631, + "learning_rate": 1.6170652555585847e-05, 
+ "loss": 0.3469, "step": 86940 }, { - "epoch": 3.06, - "learning_rate": 1.7306877053021602e-05, - "loss": 0.2537, + "epoch": 3.133491909035211, + "grad_norm": 0.2231149524450302, + "learning_rate": 1.6167922533353057e-05, + "loss": 0.3621, "step": 86945 }, { - "epoch": 3.06, - "learning_rate": 1.7304166614814538e-05, - "loss": 0.2791, + "epoch": 3.1336721086964356, + "grad_norm": 0.19638942182064056, + "learning_rate": 1.6165192631455605e-05, + "loss": 0.3358, "step": 86950 }, { - "epoch": 3.06, - "learning_rate": 1.7301456276532808e-05, - "loss": 0.2666, + "epoch": 3.1338523083576604, + "grad_norm": 0.22249744832515717, + "learning_rate": 1.6162462849930704e-05, + "loss": 0.3728, "step": 86955 }, { - "epoch": 3.06, - "learning_rate": 1.729874603821158e-05, - "loss": 0.2499, + "epoch": 3.134032508018885, + "grad_norm": 0.27574974298477173, + "learning_rate": 1.6159733188815537e-05, + "loss": 0.3883, "step": 86960 }, { - "epoch": 3.06, - "learning_rate": 1.729603589988607e-05, - "loss": 0.2402, + "epoch": 3.1342127076801094, + "grad_norm": 0.23349010944366455, + "learning_rate": 1.615700364814728e-05, + "loss": 0.3996, "step": 86965 }, { - "epoch": 3.06, - "learning_rate": 1.7293325861591456e-05, - "loss": 0.2629, + "epoch": 3.134392907341334, + "grad_norm": 0.2402789145708084, + "learning_rate": 1.615427422796315e-05, + "loss": 0.4117, "step": 86970 }, { - "epoch": 3.06, - "learning_rate": 1.729061592336293e-05, - "loss": 0.2554, + "epoch": 3.134573107002559, + "grad_norm": 0.2527726888656616, + "learning_rate": 1.6151544928300317e-05, + "loss": 0.3607, "step": 86975 }, { - "epoch": 3.06, - "learning_rate": 1.7287906085235662e-05, - "loss": 0.2523, + "epoch": 3.1347533066637836, + "grad_norm": 0.2573539614677429, + "learning_rate": 1.614881574919596e-05, + "loss": 0.3634, "step": 86980 }, { - "epoch": 3.06, - "learning_rate": 1.7285196347244868e-05, - "loss": 0.2578, + "epoch": 3.134933506325008, + "grad_norm": 0.2581014037132263, + "learning_rate": 1.6146086690687294e-05, + "loss": 0.3679, "step": 86985 }, { - "epoch": 3.06, - "learning_rate": 1.728248670942571e-05, - "loss": 0.2509, + "epoch": 3.1351137059862326, + "grad_norm": 0.2501007616519928, + "learning_rate": 1.6143357752811463e-05, + "loss": 0.3852, "step": 86990 }, { - "epoch": 3.06, - "learning_rate": 1.7279777171813366e-05, - "loss": 0.2498, + "epoch": 3.1352939056474574, + "grad_norm": 0.2757335603237152, + "learning_rate": 1.6140628935605684e-05, + "loss": 0.3801, "step": 86995 }, { - "epoch": 3.06, - "learning_rate": 1.7277067734443038e-05, - "loss": 0.2589, + "epoch": 3.135474105308682, + "grad_norm": 0.22444647550582886, + "learning_rate": 1.6137900239107118e-05, + "loss": 0.4285, "step": 87000 }, { - "epoch": 3.06, - "eval_loss": 0.25700682401657104, - "eval_runtime": 10.5666, - "eval_samples_per_second": 9.464, - "eval_steps_per_second": 9.464, + "epoch": 3.135474105308682, + "eval_loss": 0.43149107694625854, + "eval_runtime": 3.5463, + "eval_samples_per_second": 28.199, + "eval_steps_per_second": 7.05, "step": 87000 }, { - "epoch": 3.06, - "learning_rate": 1.7274358397349895e-05, - "loss": 0.2571, + "epoch": 3.135654304969907, + "grad_norm": 0.20283716917037964, + "learning_rate": 1.613517166335294e-05, + "loss": 0.3473, "step": 87005 }, { - "epoch": 3.06, - "learning_rate": 1.727164916056912e-05, - "loss": 0.2686, + "epoch": 3.135834504631131, + "grad_norm": 0.2266281247138977, + "learning_rate": 1.6132443208380333e-05, + "loss": 0.3585, "step": 87010 }, { - "epoch": 3.06, - "learning_rate": 1.726894002413587e-05, - 
"loss": 0.2568, + "epoch": 3.136014704292356, + "grad_norm": 0.21769681572914124, + "learning_rate": 1.6129714874226473e-05, + "loss": 0.3786, "step": 87015 }, { - "epoch": 3.06, - "learning_rate": 1.726623098808536e-05, - "loss": 0.2401, + "epoch": 3.1361949039535806, + "grad_norm": 0.19556161761283875, + "learning_rate": 1.6126986660928525e-05, + "loss": 0.3848, "step": 87020 }, { - "epoch": 3.06, - "learning_rate": 1.7263522052452734e-05, - "loss": 0.2597, + "epoch": 3.1363751036148053, + "grad_norm": 0.2081514149904251, + "learning_rate": 1.6124258568523672e-05, + "loss": 0.4028, "step": 87025 }, { - "epoch": 3.06, - "learning_rate": 1.7260813217273178e-05, - "loss": 0.2436, + "epoch": 3.13655530327603, + "grad_norm": 0.2071877270936966, + "learning_rate": 1.6121530597049072e-05, + "loss": 0.4077, "step": 87030 }, { - "epoch": 3.06, - "learning_rate": 1.7258104482581854e-05, - "loss": 0.2491, + "epoch": 3.1367355029372543, + "grad_norm": 0.1915891468524933, + "learning_rate": 1.61188027465419e-05, + "loss": 0.3885, "step": 87035 }, { - "epoch": 3.06, - "learning_rate": 1.7255395848413954e-05, - "loss": 0.25, + "epoch": 3.136915702598479, + "grad_norm": 0.2584543228149414, + "learning_rate": 1.611607501703933e-05, + "loss": 0.3638, "step": 87040 }, { - "epoch": 3.06, - "learning_rate": 1.725268731480463e-05, - "loss": 0.2528, + "epoch": 3.137095902259704, + "grad_norm": 0.20328520238399506, + "learning_rate": 1.6113347408578504e-05, + "loss": 0.3868, "step": 87045 }, { - "epoch": 3.06, - "learning_rate": 1.7249978881789053e-05, - "loss": 0.2766, + "epoch": 3.1372761019209285, + "grad_norm": 0.2620421350002289, + "learning_rate": 1.6110619921196607e-05, + "loss": 0.4069, "step": 87050 }, { - "epoch": 3.06, - "learning_rate": 1.7247270549402393e-05, - "loss": 0.2714, + "epoch": 3.137456301582153, + "grad_norm": 0.20489437878131866, + "learning_rate": 1.61078925549308e-05, + "loss": 0.352, "step": 87055 }, { - "epoch": 3.06, - "learning_rate": 1.7244562317679816e-05, - "loss": 0.2443, + "epoch": 3.1376365012433776, + "grad_norm": 0.2264631986618042, + "learning_rate": 1.610516530981822e-05, + "loss": 0.3975, "step": 87060 }, { - "epoch": 3.06, - "learning_rate": 1.7241854186656493e-05, - "loss": 0.2568, + "epoch": 3.1378167009046023, + "grad_norm": 0.21898336708545685, + "learning_rate": 1.6102438185896052e-05, + "loss": 0.3688, "step": 87065 }, { - "epoch": 3.06, - "learning_rate": 1.723914615636758e-05, - "loss": 0.2646, + "epoch": 3.137996900565827, + "grad_norm": 0.21995946764945984, + "learning_rate": 1.6099711183201427e-05, + "loss": 0.388, "step": 87070 }, { - "epoch": 3.06, - "learning_rate": 1.7236438226848224e-05, - "loss": 0.2628, + "epoch": 3.1381771002270518, + "grad_norm": 0.2543283998966217, + "learning_rate": 1.6096984301771535e-05, + "loss": 0.3847, "step": 87075 }, { - "epoch": 3.06, - "learning_rate": 1.7233730398133616e-05, - "loss": 0.2443, + "epoch": 3.138357299888276, + "grad_norm": 0.210366889834404, + "learning_rate": 1.6094257541643494e-05, + "loss": 0.4029, "step": 87080 }, { - "epoch": 3.06, - "learning_rate": 1.7231022670258894e-05, - "loss": 0.2564, + "epoch": 3.138537499549501, + "grad_norm": 0.22760994732379913, + "learning_rate": 1.609153090285447e-05, + "loss": 0.4536, "step": 87085 }, { - "epoch": 3.06, - "learning_rate": 1.722831504325923e-05, - "loss": 0.2679, + "epoch": 3.1387176992107255, + "grad_norm": 0.23996824026107788, + "learning_rate": 1.6088804385441616e-05, + "loss": 0.3483, "step": 87090 }, { - "epoch": 3.06, - "learning_rate": 
1.722560751716976e-05, - "loss": 0.2628, + "epoch": 3.1388978988719503, + "grad_norm": 0.2677284777164459, + "learning_rate": 1.608607798944207e-05, + "loss": 0.3906, "step": 87095 }, { - "epoch": 3.06, - "learning_rate": 1.7222900092025668e-05, - "loss": 0.2385, + "epoch": 3.139078098533175, + "grad_norm": 0.28425002098083496, + "learning_rate": 1.6083351714892997e-05, + "loss": 0.3969, "step": 87100 }, { - "epoch": 3.06, - "learning_rate": 1.7220192767862085e-05, - "loss": 0.2463, + "epoch": 3.1392582981943993, + "grad_norm": 0.2697744369506836, + "learning_rate": 1.6080625561831528e-05, + "loss": 0.3645, "step": 87105 }, { - "epoch": 3.06, - "learning_rate": 1.7217485544714167e-05, - "loss": 0.2827, + "epoch": 3.139438497855624, + "grad_norm": 0.1847570687532425, + "learning_rate": 1.60778995302948e-05, + "loss": 0.3889, "step": 87110 }, { - "epoch": 3.06, - "learning_rate": 1.7214778422617077e-05, - "loss": 0.2391, + "epoch": 3.1396186975168487, + "grad_norm": 0.2160649597644806, + "learning_rate": 1.6075173620319972e-05, + "loss": 0.3851, "step": 87115 }, { - "epoch": 3.07, - "learning_rate": 1.7212071401605962e-05, - "loss": 0.2367, + "epoch": 3.1397988971780735, + "grad_norm": 0.22959807515144348, + "learning_rate": 1.6072447831944177e-05, + "loss": 0.36, "step": 87120 }, { - "epoch": 3.07, - "learning_rate": 1.7209364481715968e-05, - "loss": 0.2658, + "epoch": 3.1399790968392978, + "grad_norm": 0.2232397347688675, + "learning_rate": 1.6069722165204544e-05, + "loss": 0.3793, "step": 87125 }, { - "epoch": 3.07, - "learning_rate": 1.7206657662982227e-05, - "loss": 0.245, + "epoch": 3.1401592965005225, + "grad_norm": 0.22282375395298004, + "learning_rate": 1.6066996620138224e-05, + "loss": 0.3569, "step": 87130 }, { - "epoch": 3.07, - "learning_rate": 1.7203950945439917e-05, - "loss": 0.261, + "epoch": 3.1403394961617472, + "grad_norm": 0.25350263714790344, + "learning_rate": 1.6064271196782337e-05, + "loss": 0.4028, "step": 87135 }, { - "epoch": 3.07, - "learning_rate": 1.720124432912416e-05, - "loss": 0.2542, + "epoch": 3.140519695822972, + "grad_norm": 0.19245287775993347, + "learning_rate": 1.6061545895174036e-05, + "loss": 0.3635, "step": 87140 }, { - "epoch": 3.07, - "learning_rate": 1.719853781407011e-05, - "loss": 0.2666, + "epoch": 3.1406998954841967, + "grad_norm": 0.18547998368740082, + "learning_rate": 1.6058820715350438e-05, + "loss": 0.3974, "step": 87145 }, { - "epoch": 3.07, - "learning_rate": 1.719583140031289e-05, - "loss": 0.2677, + "epoch": 3.140880095145421, + "grad_norm": 0.20061656832695007, + "learning_rate": 1.605609565734867e-05, + "loss": 0.3442, "step": 87150 }, { - "epoch": 3.07, - "learning_rate": 1.7193125087887673e-05, - "loss": 0.27, + "epoch": 3.1410602948066457, + "grad_norm": 0.2687702178955078, + "learning_rate": 1.6053370721205877e-05, + "loss": 0.3709, "step": 87155 }, { - "epoch": 3.07, - "learning_rate": 1.7190418876829577e-05, - "loss": 0.2322, + "epoch": 3.1412404944678705, + "grad_norm": 0.22766318917274475, + "learning_rate": 1.6050645906959178e-05, + "loss": 0.3863, "step": 87160 }, { - "epoch": 3.07, - "learning_rate": 1.7187712767173747e-05, - "loss": 0.2537, + "epoch": 3.141420694129095, + "grad_norm": 0.21267448365688324, + "learning_rate": 1.6047921214645677e-05, + "loss": 0.3719, "step": 87165 }, { - "epoch": 3.07, - "learning_rate": 1.718500675895531e-05, - "loss": 0.2604, + "epoch": 3.1416008937903195, + "grad_norm": 0.19360697269439697, + "learning_rate": 1.604519664430254e-05, + "loss": 0.39, "step": 87170 }, { - "epoch": 3.07, - 
"learning_rate": 1.7182300852209416e-05, - "loss": 0.2583, + "epoch": 3.141781093451544, + "grad_norm": 0.1883769929409027, + "learning_rate": 1.6042472195966843e-05, + "loss": 0.3667, "step": 87175 }, { - "epoch": 3.07, - "learning_rate": 1.7179595046971196e-05, - "loss": 0.2391, + "epoch": 3.141961293112769, + "grad_norm": 0.22920463979244232, + "learning_rate": 1.6039747869675747e-05, + "loss": 0.3861, "step": 87180 }, { - "epoch": 3.07, - "learning_rate": 1.717688934327578e-05, - "loss": 0.2704, + "epoch": 3.1421414927739937, + "grad_norm": 0.23282212018966675, + "learning_rate": 1.6037023665466345e-05, + "loss": 0.3846, "step": 87185 }, { - "epoch": 3.07, - "learning_rate": 1.717418374115829e-05, - "loss": 0.2694, + "epoch": 3.1423216924352184, + "grad_norm": 0.23070281744003296, + "learning_rate": 1.6034299583375752e-05, + "loss": 0.408, "step": 87190 }, { - "epoch": 3.07, - "learning_rate": 1.717147824065387e-05, - "loss": 0.2692, + "epoch": 3.1425018920964427, + "grad_norm": 0.22182480990886688, + "learning_rate": 1.60315756234411e-05, + "loss": 0.3623, "step": 87195 }, { - "epoch": 3.07, - "learning_rate": 1.7168772841797658e-05, - "loss": 0.2604, + "epoch": 3.1426820917576674, + "grad_norm": 0.23669365048408508, + "learning_rate": 1.6028851785699496e-05, + "loss": 0.3811, "step": 87200 }, { - "epoch": 3.07, - "learning_rate": 1.7166067544624765e-05, - "loss": 0.2707, + "epoch": 3.142862291418892, + "grad_norm": 0.2434239238500595, + "learning_rate": 1.602612807018804e-05, + "loss": 0.4078, "step": 87205 }, { - "epoch": 3.07, - "learning_rate": 1.716336234917031e-05, - "loss": 0.2606, + "epoch": 3.143042491080117, + "grad_norm": 0.190851092338562, + "learning_rate": 1.602340447694386e-05, + "loss": 0.363, "step": 87210 }, { - "epoch": 3.07, - "learning_rate": 1.7160657255469442e-05, - "loss": 0.236, + "epoch": 3.143222690741341, + "grad_norm": 0.26526564359664917, + "learning_rate": 1.6020681006004046e-05, + "loss": 0.4028, "step": 87215 }, { - "epoch": 3.07, - "learning_rate": 1.715795226355727e-05, - "loss": 0.2382, + "epoch": 3.143402890402566, + "grad_norm": 0.2453896403312683, + "learning_rate": 1.6017957657405724e-05, + "loss": 0.3361, "step": 87220 }, { - "epoch": 3.07, - "learning_rate": 1.7155247373468912e-05, - "loss": 0.2898, + "epoch": 3.1435830900637907, + "grad_norm": 0.28423580527305603, + "learning_rate": 1.601523443118599e-05, + "loss": 0.3749, "step": 87225 }, { - "epoch": 3.07, - "learning_rate": 1.715254258523951e-05, - "loss": 0.2667, + "epoch": 3.1437632897250154, + "grad_norm": 0.24882547557353973, + "learning_rate": 1.601251132738194e-05, + "loss": 0.4034, "step": 87230 }, { - "epoch": 3.07, - "learning_rate": 1.714983789890417e-05, - "loss": 0.2648, + "epoch": 3.14394348938624, + "grad_norm": 0.2349334955215454, + "learning_rate": 1.600978834603069e-05, + "loss": 0.3975, "step": 87235 }, { - "epoch": 3.07, - "learning_rate": 1.7147133314498008e-05, - "loss": 0.2587, + "epoch": 3.1441236890474644, + "grad_norm": 0.2091837227344513, + "learning_rate": 1.6007065487169338e-05, + "loss": 0.3942, "step": 87240 }, { - "epoch": 3.07, - "learning_rate": 1.714442883205614e-05, - "loss": 0.2615, + "epoch": 3.144303888708689, + "grad_norm": 0.19795401394367218, + "learning_rate": 1.6004342750834965e-05, + "loss": 0.3833, "step": 87245 }, { - "epoch": 3.07, - "learning_rate": 1.714172445161369e-05, - "loss": 0.2606, + "epoch": 3.144484088369914, + "grad_norm": 0.21962173283100128, + "learning_rate": 1.6001620137064694e-05, + "loss": 0.3709, "step": 87250 }, { - 
"epoch": 3.07, - "learning_rate": 1.713902017320577e-05, - "loss": 0.2385, + "epoch": 3.1446642880311386, + "grad_norm": 0.2462879866361618, + "learning_rate": 1.5998897645895587e-05, + "loss": 0.3942, "step": 87255 }, { - "epoch": 3.07, - "learning_rate": 1.7136315996867497e-05, - "loss": 0.243, + "epoch": 3.144844487692363, + "grad_norm": 0.21551023423671722, + "learning_rate": 1.5996175277364777e-05, + "loss": 0.3778, "step": 87260 }, { - "epoch": 3.07, - "learning_rate": 1.713361192263397e-05, - "loss": 0.2508, + "epoch": 3.1450246873535876, + "grad_norm": 0.23634564876556396, + "learning_rate": 1.599345303150933e-05, + "loss": 0.3598, "step": 87265 }, { - "epoch": 3.07, - "learning_rate": 1.713090795054031e-05, - "loss": 0.2614, + "epoch": 3.1452048870148124, + "grad_norm": 0.18265362083911896, + "learning_rate": 1.599073090836633e-05, + "loss": 0.3772, "step": 87270 }, { - "epoch": 3.07, - "learning_rate": 1.7128204080621627e-05, - "loss": 0.2454, + "epoch": 3.145385086676037, + "grad_norm": 0.1682208925485611, + "learning_rate": 1.5988008907972894e-05, + "loss": 0.336, "step": 87275 }, { - "epoch": 3.07, - "learning_rate": 1.712550031291303e-05, - "loss": 0.2599, + "epoch": 3.145565286337262, + "grad_norm": 0.28196775913238525, + "learning_rate": 1.5985287030366086e-05, + "loss": 0.4021, "step": 87280 }, { - "epoch": 3.07, - "learning_rate": 1.712279664744961e-05, - "loss": 0.2705, + "epoch": 3.145745485998486, + "grad_norm": 0.23696090281009674, + "learning_rate": 1.598256527558299e-05, + "loss": 0.4015, "step": 87285 }, { - "epoch": 3.07, - "learning_rate": 1.7120093084266493e-05, - "loss": 0.2621, + "epoch": 3.145925685659711, + "grad_norm": 0.24363209307193756, + "learning_rate": 1.5979843643660703e-05, + "loss": 0.3606, "step": 87290 }, { - "epoch": 3.07, - "learning_rate": 1.7117389623398776e-05, - "loss": 0.242, + "epoch": 3.1461058853209356, + "grad_norm": 0.24763990938663483, + "learning_rate": 1.5977122134636297e-05, + "loss": 0.3774, "step": 87295 }, { - "epoch": 3.07, - "learning_rate": 1.7114686264881557e-05, - "loss": 0.2857, + "epoch": 3.1462860849821603, + "grad_norm": 0.23061999678611755, + "learning_rate": 1.597440074854686e-05, + "loss": 0.3816, "step": 87300 }, { - "epoch": 3.07, - "learning_rate": 1.7111983008749937e-05, - "loss": 0.2446, + "epoch": 3.146466284643385, + "grad_norm": 0.22484789788722992, + "learning_rate": 1.5971679485429457e-05, + "loss": 0.3834, "step": 87305 }, { - "epoch": 3.07, - "learning_rate": 1.7109279855039023e-05, - "loss": 0.2541, + "epoch": 3.1466464843046094, + "grad_norm": 0.24081788957118988, + "learning_rate": 1.5968958345321178e-05, + "loss": 0.3798, "step": 87310 }, { - "epoch": 3.07, - "learning_rate": 1.7106576803783913e-05, - "loss": 0.2635, + "epoch": 3.146826683965834, + "grad_norm": 0.21690495312213898, + "learning_rate": 1.5966237328259092e-05, + "loss": 0.3613, "step": 87315 }, { - "epoch": 3.07, - "learning_rate": 1.71038738550197e-05, - "loss": 0.2522, + "epoch": 3.147006883627059, + "grad_norm": 0.1942356824874878, + "learning_rate": 1.5963516434280275e-05, + "loss": 0.3946, "step": 87320 }, { - "epoch": 3.07, - "learning_rate": 1.710117100878147e-05, - "loss": 0.2472, + "epoch": 3.1471870832882836, + "grad_norm": 0.23814404010772705, + "learning_rate": 1.5960795663421796e-05, + "loss": 0.4, "step": 87325 }, { - "epoch": 3.07, - "learning_rate": 1.7098468265104336e-05, - "loss": 0.2591, + "epoch": 3.147367282949508, + "grad_norm": 0.20741520822048187, + "learning_rate": 1.595807501572073e-05, + "loss": 0.3594, 
"step": 87330 }, { - "epoch": 3.07, - "learning_rate": 1.709576562402339e-05, - "loss": 0.2769, + "epoch": 3.1475474826107326, + "grad_norm": 0.24436409771442413, + "learning_rate": 1.595535449121413e-05, + "loss": 0.383, "step": 87335 }, { - "epoch": 3.07, - "learning_rate": 1.709306308557371e-05, - "loss": 0.285, + "epoch": 3.1477276822719573, + "grad_norm": 0.24962151050567627, + "learning_rate": 1.5952634089939083e-05, + "loss": 0.4004, "step": 87340 }, { - "epoch": 3.07, - "learning_rate": 1.7090360649790398e-05, - "loss": 0.2444, + "epoch": 3.147907881933182, + "grad_norm": 0.2126917541027069, + "learning_rate": 1.5949913811932654e-05, + "loss": 0.4005, "step": 87345 }, { - "epoch": 3.07, - "learning_rate": 1.708765831670855e-05, - "loss": 0.2723, + "epoch": 3.148088081594407, + "grad_norm": 0.19167962670326233, + "learning_rate": 1.5947193657231878e-05, + "loss": 0.3429, "step": 87350 }, { - "epoch": 3.07, - "learning_rate": 1.7084956086363228e-05, - "loss": 0.2508, + "epoch": 3.148268281255631, + "grad_norm": 0.22857992351055145, + "learning_rate": 1.594447362587385e-05, + "loss": 0.368, "step": 87355 }, { - "epoch": 3.07, - "learning_rate": 1.7082253958789545e-05, - "loss": 0.2719, + "epoch": 3.148448480916856, + "grad_norm": 0.25584468245506287, + "learning_rate": 1.5941753717895604e-05, + "loss": 0.4065, "step": 87360 }, { - "epoch": 3.07, - "learning_rate": 1.707955193402257e-05, - "loss": 0.2668, + "epoch": 3.1486286805780805, + "grad_norm": 0.25212666392326355, + "learning_rate": 1.5939033933334222e-05, + "loss": 0.3659, "step": 87365 }, { - "epoch": 3.07, - "learning_rate": 1.7076850012097405e-05, - "loss": 0.2523, + "epoch": 3.1488088802393053, + "grad_norm": 0.21378327906131744, + "learning_rate": 1.5936314272226743e-05, + "loss": 0.3726, "step": 87370 }, { - "epoch": 3.07, - "learning_rate": 1.7074148193049116e-05, - "loss": 0.2612, + "epoch": 3.14898907990053, + "grad_norm": 0.2281913012266159, + "learning_rate": 1.5933594734610225e-05, + "loss": 0.3816, "step": 87375 }, { - "epoch": 3.07, - "learning_rate": 1.7071446476912776e-05, - "loss": 0.2518, + "epoch": 3.1491692795617543, + "grad_norm": 0.21938398480415344, + "learning_rate": 1.5930875320521728e-05, + "loss": 0.3704, "step": 87380 }, { - "epoch": 3.07, - "learning_rate": 1.7068744863723496e-05, - "loss": 0.2548, + "epoch": 3.149349479222979, + "grad_norm": 0.23328323662281036, + "learning_rate": 1.5928156029998296e-05, + "loss": 0.3732, "step": 87385 }, { - "epoch": 3.07, - "learning_rate": 1.706604335351633e-05, - "loss": 0.2838, + "epoch": 3.1495296788842038, + "grad_norm": 0.26327353715896606, + "learning_rate": 1.5925436863076975e-05, + "loss": 0.3539, "step": 87390 }, { - "epoch": 3.07, - "learning_rate": 1.7063341946326366e-05, - "loss": 0.2316, + "epoch": 3.1497098785454285, + "grad_norm": 0.2839113175868988, + "learning_rate": 1.592271781979483e-05, + "loss": 0.3575, "step": 87395 }, { - "epoch": 3.07, - "learning_rate": 1.7060640642188664e-05, - "loss": 0.2461, + "epoch": 3.149890078206653, + "grad_norm": 0.23041868209838867, + "learning_rate": 1.5919998900188894e-05, + "loss": 0.3847, "step": 87400 }, { - "epoch": 3.08, - "learning_rate": 1.7057939441138328e-05, - "loss": 0.2472, + "epoch": 3.1500702778678775, + "grad_norm": 0.20966331660747528, + "learning_rate": 1.5917280104296213e-05, + "loss": 0.3631, "step": 87405 }, { - "epoch": 3.08, - "learning_rate": 1.70552383432104e-05, - "loss": 0.2563, + "epoch": 3.1502504775291023, + "grad_norm": 0.25130027532577515, + "learning_rate": 
1.5914561432153836e-05, + "loss": 0.4388, "step": 87410 }, { - "epoch": 3.08, - "learning_rate": 1.7052537348439977e-05, - "loss": 0.2647, + "epoch": 3.150430677190327, + "grad_norm": 0.20757076144218445, + "learning_rate": 1.5911842883798796e-05, + "loss": 0.3988, "step": 87415 }, { - "epoch": 3.08, - "learning_rate": 1.704983645686211e-05, - "loss": 0.2284, + "epoch": 3.1506108768515517, + "grad_norm": 0.19644039869308472, + "learning_rate": 1.590912445926814e-05, + "loss": 0.386, "step": 87420 }, { - "epoch": 3.08, - "learning_rate": 1.704713566851188e-05, - "loss": 0.2526, + "epoch": 3.150791076512776, + "grad_norm": 0.23030658066272736, + "learning_rate": 1.590640615859891e-05, + "loss": 0.3882, "step": 87425 }, { - "epoch": 3.08, - "learning_rate": 1.7044434983424363e-05, - "loss": 0.2665, + "epoch": 3.1509712761740007, + "grad_norm": 0.24640274047851562, + "learning_rate": 1.590368798182812e-05, + "loss": 0.3869, "step": 87430 }, { - "epoch": 3.08, - "learning_rate": 1.70417344016346e-05, - "loss": 0.2559, + "epoch": 3.1511514758352255, + "grad_norm": 0.24621576070785522, + "learning_rate": 1.5900969928992832e-05, + "loss": 0.3922, "step": 87435 }, { - "epoch": 3.08, - "learning_rate": 1.7039033923177676e-05, - "loss": 0.255, + "epoch": 3.15133167549645, + "grad_norm": 0.2239450216293335, + "learning_rate": 1.589825200013006e-05, + "loss": 0.3877, "step": 87440 }, { - "epoch": 3.08, - "learning_rate": 1.7036333548088652e-05, - "loss": 0.2554, + "epoch": 3.1515118751576745, + "grad_norm": 0.22715716063976288, + "learning_rate": 1.5895534195276846e-05, + "loss": 0.3739, "step": 87445 }, { - "epoch": 3.08, - "learning_rate": 1.703363327640259e-05, - "loss": 0.2613, + "epoch": 3.1516920748188992, + "grad_norm": 0.22491981089115143, + "learning_rate": 1.5892816514470223e-05, + "loss": 0.363, "step": 87450 }, { - "epoch": 3.08, - "learning_rate": 1.7030933108154544e-05, - "loss": 0.2755, + "epoch": 3.151872274480124, + "grad_norm": 0.23676250874996185, + "learning_rate": 1.5890098957747197e-05, + "loss": 0.3657, "step": 87455 }, { - "epoch": 3.08, - "learning_rate": 1.7028233043379584e-05, - "loss": 0.2605, + "epoch": 3.1520524741413487, + "grad_norm": 0.22871702909469604, + "learning_rate": 1.5887381525144823e-05, + "loss": 0.4015, "step": 87460 }, { - "epoch": 3.08, - "learning_rate": 1.7025533082112768e-05, - "loss": 0.2295, + "epoch": 3.1522326738025734, + "grad_norm": 0.2012801617383957, + "learning_rate": 1.5884664216700107e-05, + "loss": 0.3338, "step": 87465 }, { - "epoch": 3.08, - "learning_rate": 1.7022833224389144e-05, - "loss": 0.2576, + "epoch": 3.1524128734637977, + "grad_norm": 0.2810083031654358, + "learning_rate": 1.588194703245007e-05, + "loss": 0.3743, "step": 87470 }, { - "epoch": 3.08, - "learning_rate": 1.7020133470243768e-05, - "loss": 0.27, + "epoch": 3.1525930731250225, + "grad_norm": 0.21618230640888214, + "learning_rate": 1.5879229972431748e-05, + "loss": 0.3838, "step": 87475 }, { - "epoch": 3.08, - "learning_rate": 1.7017433819711707e-05, - "loss": 0.2392, + "epoch": 3.152773272786247, + "grad_norm": 0.23298971354961395, + "learning_rate": 1.5876513036682143e-05, + "loss": 0.359, "step": 87480 }, { - "epoch": 3.08, - "learning_rate": 1.7014734272828016e-05, - "loss": 0.2587, + "epoch": 3.152953472447472, + "grad_norm": 0.23836317658424377, + "learning_rate": 1.5873796225238284e-05, + "loss": 0.3654, "step": 87485 }, { - "epoch": 3.08, - "learning_rate": 1.7012034829627727e-05, - "loss": 0.252, + "epoch": 3.153133672108696, + "grad_norm": 0.20612043142318726, + 
"learning_rate": 1.587107953813719e-05, + "loss": 0.3643, "step": 87490 }, { - "epoch": 3.08, - "learning_rate": 1.70093354901459e-05, - "loss": 0.2711, + "epoch": 3.153313871769921, + "grad_norm": 0.20500215888023376, + "learning_rate": 1.5868362975415862e-05, + "loss": 0.3322, "step": 87495 }, { - "epoch": 3.08, - "learning_rate": 1.7006636254417583e-05, - "loss": 0.275, + "epoch": 3.1534940714311457, + "grad_norm": 0.22878599166870117, + "learning_rate": 1.586564653711133e-05, + "loss": 0.347, "step": 87500 }, { - "epoch": 3.08, - "eval_loss": 0.25675198435783386, - "eval_runtime": 10.5938, - "eval_samples_per_second": 9.439, - "eval_steps_per_second": 9.439, + "epoch": 3.1534940714311457, + "eval_loss": 0.4312251806259155, + "eval_runtime": 3.5283, + "eval_samples_per_second": 28.342, + "eval_steps_per_second": 7.086, "step": 87500 }, { - "epoch": 3.08, - "learning_rate": 1.700393712247784e-05, - "loss": 0.2602, + "epoch": 3.1536742710923704, + "grad_norm": 0.23596695065498352, + "learning_rate": 1.5862930223260587e-05, + "loss": 0.4027, "step": 87505 }, { - "epoch": 3.08, - "learning_rate": 1.70012380943617e-05, - "loss": 0.2764, + "epoch": 3.153854470753595, + "grad_norm": 0.2374979853630066, + "learning_rate": 1.5860214033900657e-05, + "loss": 0.3555, "step": 87510 }, { - "epoch": 3.08, - "learning_rate": 1.69985391701042e-05, - "loss": 0.2733, + "epoch": 3.1540346704148194, + "grad_norm": 0.2398587167263031, + "learning_rate": 1.5857497969068535e-05, + "loss": 0.3814, "step": 87515 }, { - "epoch": 3.08, - "learning_rate": 1.699584034974041e-05, - "loss": 0.2469, + "epoch": 3.154214870076044, + "grad_norm": 0.1979122906923294, + "learning_rate": 1.5854782028801235e-05, + "loss": 0.3692, "step": 87520 }, { - "epoch": 3.08, - "learning_rate": 1.699314163330535e-05, - "loss": 0.2683, + "epoch": 3.154395069737269, + "grad_norm": 0.23171058297157288, + "learning_rate": 1.5852066213135764e-05, + "loss": 0.3734, "step": 87525 }, { - "epoch": 3.08, - "learning_rate": 1.6990443020834073e-05, - "loss": 0.2443, + "epoch": 3.1545752693984936, + "grad_norm": 0.221579447388649, + "learning_rate": 1.5849350522109124e-05, + "loss": 0.3978, "step": 87530 }, { - "epoch": 3.08, - "learning_rate": 1.6987744512361607e-05, - "loss": 0.2555, + "epoch": 3.1547554690597184, + "grad_norm": 0.23226810991764069, + "learning_rate": 1.5846634955758294e-05, + "loss": 0.3808, "step": 87535 }, { - "epoch": 3.08, - "learning_rate": 1.698504610792301e-05, - "loss": 0.2549, + "epoch": 3.1549356687209427, + "grad_norm": 0.25539645552635193, + "learning_rate": 1.584391951412031e-05, + "loss": 0.3677, "step": 87540 }, { - "epoch": 3.08, - "learning_rate": 1.69823478075533e-05, - "loss": 0.268, + "epoch": 3.1551158683821674, + "grad_norm": 0.23339438438415527, + "learning_rate": 1.5841204197232128e-05, + "loss": 0.4242, "step": 87545 }, { - "epoch": 3.08, - "learning_rate": 1.697964961128752e-05, - "loss": 0.2675, + "epoch": 3.155296068043392, + "grad_norm": 0.20781894028186798, + "learning_rate": 1.5838489005130784e-05, + "loss": 0.3916, "step": 87550 }, { - "epoch": 3.08, - "learning_rate": 1.69769515191607e-05, - "loss": 0.2432, + "epoch": 3.155476267704617, + "grad_norm": 0.2721303701400757, + "learning_rate": 1.5835773937853254e-05, + "loss": 0.3906, "step": 87555 }, { - "epoch": 3.08, - "learning_rate": 1.6974253531207886e-05, - "loss": 0.2515, + "epoch": 3.155656467365841, + "grad_norm": 0.20113010704517365, + "learning_rate": 1.5833058995436512e-05, + "loss": 0.3924, "step": 87560 }, { - "epoch": 3.08, - 
"learning_rate": 1.6971555647464103e-05, - "loss": 0.2636, + "epoch": 3.155836667027066, + "grad_norm": 0.22510038316249847, + "learning_rate": 1.5830344177917587e-05, + "loss": 0.4007, "step": 87565 }, { - "epoch": 3.08, - "learning_rate": 1.6968857867964367e-05, - "loss": 0.2557, + "epoch": 3.1560168666882906, + "grad_norm": 0.2468447983264923, + "learning_rate": 1.582762948533344e-05, + "loss": 0.3602, "step": 87570 }, { - "epoch": 3.08, - "learning_rate": 1.6966160192743727e-05, - "loss": 0.2648, + "epoch": 3.1561970663495154, + "grad_norm": 0.22670796513557434, + "learning_rate": 1.582491491772106e-05, + "loss": 0.3703, "step": 87575 }, { - "epoch": 3.08, - "learning_rate": 1.69634626218372e-05, - "loss": 0.2296, + "epoch": 3.15637726601074, + "grad_norm": 0.2570918798446655, + "learning_rate": 1.5822200475117442e-05, + "loss": 0.3967, "step": 87580 }, { - "epoch": 3.08, - "learning_rate": 1.6960765155279818e-05, - "loss": 0.2627, + "epoch": 3.1565574656719644, + "grad_norm": 0.23322150111198425, + "learning_rate": 1.581948615755956e-05, + "loss": 0.416, "step": 87585 }, { - "epoch": 3.08, - "learning_rate": 1.695806779310659e-05, - "loss": 0.2599, + "epoch": 3.156737665333189, + "grad_norm": 0.20641043782234192, + "learning_rate": 1.581677196508441e-05, + "loss": 0.3588, "step": 87590 }, { - "epoch": 3.08, - "learning_rate": 1.6955370535352566e-05, - "loss": 0.2587, + "epoch": 3.156917864994414, + "grad_norm": 0.21482810378074646, + "learning_rate": 1.5814057897728964e-05, + "loss": 0.3548, "step": 87595 }, { - "epoch": 3.08, - "learning_rate": 1.695267338205275e-05, - "loss": 0.2623, + "epoch": 3.1570980646556386, + "grad_norm": 0.2732873260974884, + "learning_rate": 1.5811343955530194e-05, + "loss": 0.3961, "step": 87600 }, { - "epoch": 3.08, - "learning_rate": 1.694997633324217e-05, - "loss": 0.2587, + "epoch": 3.1572782643168633, + "grad_norm": 0.23953673243522644, + "learning_rate": 1.5808630138525088e-05, + "loss": 0.4481, "step": 87605 }, { - "epoch": 3.08, - "learning_rate": 1.694727938895583e-05, - "loss": 0.2652, + "epoch": 3.1574584639780876, + "grad_norm": 0.23516350984573364, + "learning_rate": 1.580591644675062e-05, + "loss": 0.3804, "step": 87610 }, { - "epoch": 3.08, - "learning_rate": 1.694458254922877e-05, - "loss": 0.2439, + "epoch": 3.1576386636393123, + "grad_norm": 0.21418847143650055, + "learning_rate": 1.5803202880243754e-05, + "loss": 0.3417, "step": 87615 }, { - "epoch": 3.08, - "learning_rate": 1.6941885814096004e-05, - "loss": 0.245, + "epoch": 3.157818863300537, + "grad_norm": 0.21651922166347504, + "learning_rate": 1.580048943904147e-05, + "loss": 0.3643, "step": 87620 }, { - "epoch": 3.08, - "learning_rate": 1.6939189183592537e-05, - "loss": 0.2677, + "epoch": 3.157999062961762, + "grad_norm": 0.309662789106369, + "learning_rate": 1.5797776123180736e-05, + "loss": 0.4107, "step": 87625 }, { - "epoch": 3.08, - "learning_rate": 1.6936492657753373e-05, - "loss": 0.266, + "epoch": 3.158179262622986, + "grad_norm": 0.2053297758102417, + "learning_rate": 1.5795062932698522e-05, + "loss": 0.3574, "step": 87630 }, { - "epoch": 3.08, - "learning_rate": 1.693379623661354e-05, - "loss": 0.253, + "epoch": 3.158359462284211, + "grad_norm": 0.2058108001947403, + "learning_rate": 1.57923498676318e-05, + "loss": 0.3742, "step": 87635 }, { - "epoch": 3.08, - "learning_rate": 1.6931099920208058e-05, - "loss": 0.2574, + "epoch": 3.1585396619454356, + "grad_norm": 0.22333082556724548, + "learning_rate": 1.5789636928017514e-05, + "loss": 0.3823, "step": 87640 }, { - 
"epoch": 3.08, - "learning_rate": 1.6928403708571924e-05, - "loss": 0.2643, + "epoch": 3.1587198616066603, + "grad_norm": 0.2727280259132385, + "learning_rate": 1.5786924113892655e-05, + "loss": 0.3995, "step": 87645 }, { - "epoch": 3.08, - "learning_rate": 1.6925707601740136e-05, - "loss": 0.2571, + "epoch": 3.158900061267885, + "grad_norm": 0.2285049855709076, + "learning_rate": 1.578421142529417e-05, + "loss": 0.3648, "step": 87650 }, { - "epoch": 3.08, - "learning_rate": 1.692301159974773e-05, - "loss": 0.2617, + "epoch": 3.1590802609291093, + "grad_norm": 0.24408283829689026, + "learning_rate": 1.5781498862259007e-05, + "loss": 0.4071, "step": 87655 }, { - "epoch": 3.08, - "learning_rate": 1.6920315702629685e-05, - "loss": 0.2567, + "epoch": 3.159260460590334, + "grad_norm": 0.23655414581298828, + "learning_rate": 1.577878642482415e-05, + "loss": 0.3995, "step": 87660 }, { - "epoch": 3.08, - "learning_rate": 1.6917619910421025e-05, - "loss": 0.2675, + "epoch": 3.159440660251559, + "grad_norm": 0.21964088082313538, + "learning_rate": 1.5776074113026533e-05, + "loss": 0.3779, "step": 87665 }, { - "epoch": 3.08, - "learning_rate": 1.6914924223156727e-05, - "loss": 0.2791, + "epoch": 3.1596208599127835, + "grad_norm": 0.18904650211334229, + "learning_rate": 1.577336192690313e-05, + "loss": 0.404, "step": 87670 }, { - "epoch": 3.08, - "learning_rate": 1.6912228640871823e-05, - "loss": 0.26, + "epoch": 3.159801059574008, + "grad_norm": 0.2197270691394806, + "learning_rate": 1.577064986649088e-05, + "loss": 0.3763, "step": 87675 }, { - "epoch": 3.08, - "learning_rate": 1.6909533163601305e-05, - "loss": 0.2729, + "epoch": 3.1599812592352325, + "grad_norm": 0.2542881667613983, + "learning_rate": 1.576793793182674e-05, + "loss": 0.4108, "step": 87680 }, { - "epoch": 3.09, - "learning_rate": 1.6906837791380152e-05, - "loss": 0.2604, + "epoch": 3.1601614588964573, + "grad_norm": 0.2189742773771286, + "learning_rate": 1.5765768474658893e-05, + "loss": 0.4039, "step": 87685 }, { - "epoch": 3.09, - "learning_rate": 1.6904142524243383e-05, - "loss": 0.2677, + "epoch": 3.160341658557682, + "grad_norm": 0.2676211893558502, + "learning_rate": 1.576305676643446e-05, + "loss": 0.3957, "step": 87690 }, { - "epoch": 3.09, - "learning_rate": 1.6901447362225992e-05, - "loss": 0.2759, + "epoch": 3.1605218582189067, + "grad_norm": 0.2037624716758728, + "learning_rate": 1.5760345184061586e-05, + "loss": 0.3458, "step": 87695 }, { - "epoch": 3.09, - "learning_rate": 1.689875230536297e-05, - "loss": 0.2578, + "epoch": 3.160702057880131, + "grad_norm": 0.21000508964061737, + "learning_rate": 1.575763372757723e-05, + "loss": 0.3576, "step": 87700 }, { - "epoch": 3.09, - "learning_rate": 1.6896057353689305e-05, - "loss": 0.2391, + "epoch": 3.1608822575413558, + "grad_norm": 0.27110737562179565, + "learning_rate": 1.5754922397018323e-05, + "loss": 0.35, "step": 87705 }, { - "epoch": 3.09, - "learning_rate": 1.6893362507239998e-05, - "loss": 0.252, + "epoch": 3.1610624572025805, + "grad_norm": 0.24699506163597107, + "learning_rate": 1.5752211192421794e-05, + "loss": 0.3493, "step": 87710 }, { - "epoch": 3.09, - "learning_rate": 1.6890667766050032e-05, - "loss": 0.2659, + "epoch": 3.1612426568638052, + "grad_norm": 0.25574126839637756, + "learning_rate": 1.574950011382462e-05, + "loss": 0.3795, "step": 87715 }, { - "epoch": 3.09, - "learning_rate": 1.688797313015441e-05, - "loss": 0.2584, + "epoch": 3.1614228565250295, + "grad_norm": 0.24138563871383667, + "learning_rate": 1.5746789161263702e-05, + "loss": 0.3382, 
"step": 87720 }, { - "epoch": 3.09, - "learning_rate": 1.68852785995881e-05, - "loss": 0.2694, + "epoch": 3.1616030561862543, + "grad_norm": 0.2855709493160248, + "learning_rate": 1.574407833477599e-05, + "loss": 0.3956, "step": 87725 }, { - "epoch": 3.09, - "learning_rate": 1.6882584174386106e-05, - "loss": 0.2415, + "epoch": 3.161783255847479, + "grad_norm": 0.1987280547618866, + "learning_rate": 1.574136763439842e-05, + "loss": 0.3667, "step": 87730 }, { - "epoch": 3.09, - "learning_rate": 1.687988985458341e-05, - "loss": 0.2396, + "epoch": 3.1619634555087037, + "grad_norm": 0.19212450087070465, + "learning_rate": 1.5738657060167916e-05, + "loss": 0.3855, "step": 87735 }, { - "epoch": 3.09, - "learning_rate": 1.6877195640214987e-05, - "loss": 0.2591, + "epoch": 3.1621436551699285, + "grad_norm": 0.20062510669231415, + "learning_rate": 1.573594661212143e-05, + "loss": 0.3914, "step": 87740 }, { - "epoch": 3.09, - "learning_rate": 1.6874501531315828e-05, - "loss": 0.2512, + "epoch": 3.1623238548311527, + "grad_norm": 0.24137015640735626, + "learning_rate": 1.5733236290295873e-05, + "loss": 0.3458, "step": 87745 }, { - "epoch": 3.09, - "learning_rate": 1.6871807527920907e-05, - "loss": 0.2601, + "epoch": 3.1625040544923775, + "grad_norm": 0.1870691478252411, + "learning_rate": 1.5730526094728173e-05, + "loss": 0.3664, "step": 87750 }, { - "epoch": 3.09, - "learning_rate": 1.6869113630065217e-05, - "loss": 0.2553, + "epoch": 3.162684254153602, + "grad_norm": 0.21274974942207336, + "learning_rate": 1.5727816025455267e-05, + "loss": 0.3506, "step": 87755 }, { - "epoch": 3.09, - "learning_rate": 1.6866419837783727e-05, - "loss": 0.2651, + "epoch": 3.162864453814827, + "grad_norm": 0.21418865025043488, + "learning_rate": 1.5725106082514066e-05, + "loss": 0.4101, "step": 87760 }, { - "epoch": 3.09, - "learning_rate": 1.6863726151111403e-05, - "loss": 0.244, + "epoch": 3.1630446534760512, + "grad_norm": 0.2074471414089203, + "learning_rate": 1.5722396265941504e-05, + "loss": 0.3705, "step": 87765 }, { - "epoch": 3.09, - "learning_rate": 1.686103257008324e-05, - "loss": 0.2777, + "epoch": 3.163224853137276, + "grad_norm": 0.26662349700927734, + "learning_rate": 1.5719686575774494e-05, + "loss": 0.3715, "step": 87770 }, { - "epoch": 3.09, - "learning_rate": 1.6858339094734212e-05, - "loss": 0.2576, + "epoch": 3.1634050527985007, + "grad_norm": 0.22255268692970276, + "learning_rate": 1.5716977012049958e-05, + "loss": 0.3973, "step": 87775 }, { - "epoch": 3.09, - "learning_rate": 1.6855645725099283e-05, - "loss": 0.2737, + "epoch": 3.1635852524597254, + "grad_norm": 0.2301638126373291, + "learning_rate": 1.5714267574804814e-05, + "loss": 0.3646, "step": 87780 }, { - "epoch": 3.09, - "learning_rate": 1.6852952461213416e-05, - "loss": 0.2445, + "epoch": 3.16376545212095, + "grad_norm": 0.2992319166660309, + "learning_rate": 1.5711558264075978e-05, + "loss": 0.3886, "step": 87785 }, { - "epoch": 3.09, - "learning_rate": 1.6850259303111606e-05, - "loss": 0.2697, + "epoch": 3.1639456517821745, + "grad_norm": 0.2510296106338501, + "learning_rate": 1.5708849079900355e-05, + "loss": 0.3778, "step": 87790 }, { - "epoch": 3.09, - "learning_rate": 1.68475662508288e-05, - "loss": 0.247, + "epoch": 3.164125851443399, + "grad_norm": 0.23558221757411957, + "learning_rate": 1.5706140022314874e-05, + "loss": 0.3634, "step": 87795 }, { - "epoch": 3.09, - "learning_rate": 1.6844873304399977e-05, - "loss": 0.2542, + "epoch": 3.164306051104624, + "grad_norm": 0.1713656485080719, + "learning_rate": 1.570343109135643e-05, 
+ "loss": 0.3438, "step": 87800 }, { - "epoch": 3.09, - "learning_rate": 1.6842180463860095e-05, - "loss": 0.2503, + "epoch": 3.1644862507658487, + "grad_norm": 0.26629698276519775, + "learning_rate": 1.570072228706194e-05, + "loss": 0.3836, "step": 87805 }, { - "epoch": 3.09, - "learning_rate": 1.683948772924413e-05, - "loss": 0.2418, + "epoch": 3.1646664504270734, + "grad_norm": 0.2552080750465393, + "learning_rate": 1.5698013609468314e-05, + "loss": 0.3914, "step": 87810 }, { - "epoch": 3.09, - "learning_rate": 1.683679510058704e-05, - "loss": 0.2551, + "epoch": 3.1648466500882977, + "grad_norm": 0.2017066776752472, + "learning_rate": 1.5695305058612436e-05, + "loss": 0.3464, "step": 87815 }, { - "epoch": 3.09, - "learning_rate": 1.6834102577923777e-05, - "loss": 0.275, + "epoch": 3.1650268497495224, + "grad_norm": 0.18393248319625854, + "learning_rate": 1.569259663453124e-05, + "loss": 0.4058, "step": 87820 }, { - "epoch": 3.09, - "learning_rate": 1.6831410161289317e-05, - "loss": 0.2568, + "epoch": 3.165207049410747, + "grad_norm": 0.2161942571401596, + "learning_rate": 1.568988833726161e-05, + "loss": 0.3795, "step": 87825 }, { - "epoch": 3.09, - "learning_rate": 1.682871785071861e-05, - "loss": 0.2469, + "epoch": 3.165387249071972, + "grad_norm": 0.21547988057136536, + "learning_rate": 1.5687180166840442e-05, + "loss": 0.3861, "step": 87830 }, { - "epoch": 3.09, - "learning_rate": 1.6826025646246623e-05, - "loss": 0.2618, + "epoch": 3.165567448733196, + "grad_norm": 0.27021363377571106, + "learning_rate": 1.5684472123304648e-05, + "loss": 0.3855, "step": 87835 }, { - "epoch": 3.09, - "learning_rate": 1.6823333547908292e-05, - "loss": 0.2836, + "epoch": 3.165747648394421, + "grad_norm": 0.2427399456501007, + "learning_rate": 1.5681764206691114e-05, + "loss": 0.3669, "step": 87840 }, { - "epoch": 3.09, - "learning_rate": 1.68206415557386e-05, - "loss": 0.2508, + "epoch": 3.1659278480556456, + "grad_norm": 0.2702856957912445, + "learning_rate": 1.5679056417036743e-05, + "loss": 0.403, "step": 87845 }, { - "epoch": 3.09, - "learning_rate": 1.6817949669772483e-05, - "loss": 0.246, + "epoch": 3.1661080477168704, + "grad_norm": 0.24403263628482819, + "learning_rate": 1.567634875437842e-05, + "loss": 0.3985, "step": 87850 }, { - "epoch": 3.09, - "learning_rate": 1.6815257890044898e-05, - "loss": 0.2478, + "epoch": 3.166288247378095, + "grad_norm": 0.24820050597190857, + "learning_rate": 1.567364121875304e-05, + "loss": 0.3987, "step": 87855 }, { - "epoch": 3.09, - "learning_rate": 1.6812566216590786e-05, - "loss": 0.259, + "epoch": 3.1664684470393194, + "grad_norm": 0.20138351619243622, + "learning_rate": 1.567093381019749e-05, + "loss": 0.3929, "step": 87860 }, { - "epoch": 3.09, - "learning_rate": 1.6809874649445116e-05, - "loss": 0.2453, + "epoch": 3.166648646700544, + "grad_norm": 0.22523026168346405, + "learning_rate": 1.566822652874867e-05, + "loss": 0.3857, "step": 87865 }, { - "epoch": 3.09, - "learning_rate": 1.6807183188642827e-05, - "loss": 0.249, + "epoch": 3.166828846361769, + "grad_norm": 0.20283663272857666, + "learning_rate": 1.5665519374443447e-05, + "loss": 0.3864, "step": 87870 }, { - "epoch": 3.09, - "learning_rate": 1.6804491834218864e-05, - "loss": 0.2558, + "epoch": 3.1670090460229936, + "grad_norm": 0.22071103751659393, + "learning_rate": 1.566281234731872e-05, + "loss": 0.3571, "step": 87875 }, { - "epoch": 3.09, - "learning_rate": 1.6801800586208168e-05, - "loss": 0.2504, + "epoch": 3.1671892456842183, + "grad_norm": 0.262087345123291, + "learning_rate": 
1.5660105447411364e-05, + "loss": 0.365, "step": 87880 }, { - "epoch": 3.09, - "learning_rate": 1.679910944464569e-05, - "loss": 0.2629, + "epoch": 3.1673694453454426, + "grad_norm": 0.26408714056015015, + "learning_rate": 1.565739867475827e-05, + "loss": 0.4295, "step": 87885 }, { - "epoch": 3.09, - "learning_rate": 1.679641840956638e-05, - "loss": 0.2806, + "epoch": 3.1675496450066674, + "grad_norm": 0.30089375376701355, + "learning_rate": 1.5654692029396308e-05, + "loss": 0.4188, "step": 87890 }, { - "epoch": 3.09, - "learning_rate": 1.6793727481005168e-05, - "loss": 0.2629, + "epoch": 3.167729844667892, + "grad_norm": 0.24746596813201904, + "learning_rate": 1.565198551136235e-05, + "loss": 0.3927, "step": 87895 }, { - "epoch": 3.09, - "learning_rate": 1.6791036658996984e-05, - "loss": 0.2511, + "epoch": 3.167910044329117, + "grad_norm": 0.2371826320886612, + "learning_rate": 1.5649279120693283e-05, + "loss": 0.3807, "step": 87900 }, { - "epoch": 3.09, - "learning_rate": 1.678834594357679e-05, - "loss": 0.2563, + "epoch": 3.168090243990341, + "grad_norm": 0.2455732524394989, + "learning_rate": 1.564657285742599e-05, + "loss": 0.3812, "step": 87905 }, { - "epoch": 3.09, - "learning_rate": 1.678565533477951e-05, - "loss": 0.2638, + "epoch": 3.168270443651566, + "grad_norm": 0.22298307716846466, + "learning_rate": 1.564386672159731e-05, + "loss": 0.4142, "step": 87910 }, { - "epoch": 3.09, - "learning_rate": 1.6782964832640075e-05, - "loss": 0.2509, + "epoch": 3.1684506433127906, + "grad_norm": 0.24019892513751984, + "learning_rate": 1.5641160713244153e-05, + "loss": 0.3813, "step": 87915 }, { - "epoch": 3.09, - "learning_rate": 1.6780274437193435e-05, - "loss": 0.2488, + "epoch": 3.1686308429740153, + "grad_norm": 0.22240878641605377, + "learning_rate": 1.563845483240335e-05, + "loss": 0.4095, "step": 87920 }, { - "epoch": 3.09, - "learning_rate": 1.6777584148474518e-05, - "loss": 0.2512, + "epoch": 3.16881104263524, + "grad_norm": 0.22564803063869476, + "learning_rate": 1.5635749079111807e-05, + "loss": 0.3787, "step": 87925 }, { - "epoch": 3.09, - "learning_rate": 1.6774893966518242e-05, - "loss": 0.2355, + "epoch": 3.1689912422964643, + "grad_norm": 0.23460455238819122, + "learning_rate": 1.5633043453406364e-05, + "loss": 0.3668, "step": 87930 }, { - "epoch": 3.09, - "learning_rate": 1.677220389135955e-05, - "loss": 0.2639, + "epoch": 3.169171441957689, + "grad_norm": 0.2320113182067871, + "learning_rate": 1.5630337955323883e-05, + "loss": 0.401, "step": 87935 }, { - "epoch": 3.09, - "learning_rate": 1.676951392303337e-05, - "loss": 0.2628, + "epoch": 3.169351641618914, + "grad_norm": 0.21181990206241608, + "learning_rate": 1.562763258490124e-05, + "loss": 0.379, "step": 87940 }, { - "epoch": 3.09, - "learning_rate": 1.6766824061574632e-05, - "loss": 0.2853, + "epoch": 3.1695318412801385, + "grad_norm": 0.23171097040176392, + "learning_rate": 1.5624927342175287e-05, + "loss": 0.3618, "step": 87945 }, { - "epoch": 3.09, - "learning_rate": 1.676413430701826e-05, - "loss": 0.2306, + "epoch": 3.169712040941363, + "grad_norm": 0.2735525965690613, + "learning_rate": 1.5622222227182884e-05, + "loss": 0.4178, "step": 87950 }, { - "epoch": 3.09, - "learning_rate": 1.6761444659399163e-05, - "loss": 0.2469, + "epoch": 3.1698922406025876, + "grad_norm": 0.23134300112724304, + "learning_rate": 1.561951723996089e-05, + "loss": 0.3827, "step": 87955 }, { - "epoch": 3.09, - "learning_rate": 1.6758755118752284e-05, - "loss": 0.2654, + "epoch": 3.1700724402638123, + "grad_norm": 0.21240036189556122, + 
"learning_rate": 1.561681238054615e-05, + "loss": 0.382, "step": 87960 }, { - "epoch": 3.09, - "learning_rate": 1.6756065685112538e-05, - "loss": 0.2666, + "epoch": 3.170252639925037, + "grad_norm": 0.26621800661087036, + "learning_rate": 1.5614107648975532e-05, + "loss": 0.3772, "step": 87965 }, { - "epoch": 3.1, - "learning_rate": 1.6753376358514852e-05, - "loss": 0.2589, + "epoch": 3.1704328395862618, + "grad_norm": 0.22227172553539276, + "learning_rate": 1.561140304528588e-05, + "loss": 0.3594, "step": 87970 }, { - "epoch": 3.1, - "learning_rate": 1.6750687138994125e-05, - "loss": 0.2552, + "epoch": 3.170613039247486, + "grad_norm": 0.23364649713039398, + "learning_rate": 1.5608698569514037e-05, + "loss": 0.372, "step": 87975 }, { - "epoch": 3.1, - "learning_rate": 1.6747998026585306e-05, - "loss": 0.2464, + "epoch": 3.170793238908711, + "grad_norm": 0.23514457046985626, + "learning_rate": 1.5605994221696864e-05, + "loss": 0.4162, "step": 87980 }, { - "epoch": 3.1, - "learning_rate": 1.6745309021323287e-05, - "loss": 0.2492, + "epoch": 3.1709734385699355, + "grad_norm": 0.25838416814804077, + "learning_rate": 1.5603290001871197e-05, + "loss": 0.4076, "step": 87985 }, { - "epoch": 3.1, - "learning_rate": 1.674262012324299e-05, - "loss": 0.2546, + "epoch": 3.1711536382311603, + "grad_norm": 0.2554473280906677, + "learning_rate": 1.560058591007389e-05, + "loss": 0.422, "step": 87990 }, { - "epoch": 3.1, - "learning_rate": 1.673993133237932e-05, - "loss": 0.2565, + "epoch": 3.1713338378923845, + "grad_norm": 0.23790916800498962, + "learning_rate": 1.559788194634178e-05, + "loss": 0.3746, "step": 87995 }, { - "epoch": 3.1, - "learning_rate": 1.6737242648767215e-05, - "loss": 0.2637, + "epoch": 3.1715140375536093, + "grad_norm": 0.21130967140197754, + "learning_rate": 1.55951781107117e-05, + "loss": 0.3621, "step": 88000 }, { - "epoch": 3.1, - "eval_loss": 0.25597599148750305, - "eval_runtime": 10.5513, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 3.1715140375536093, + "eval_loss": 0.4313719570636749, + "eval_runtime": 3.532, + "eval_samples_per_second": 28.313, + "eval_steps_per_second": 7.078, "step": 88000 }, { - "epoch": 3.1, - "learning_rate": 1.6734554072441563e-05, - "loss": 0.2452, + "epoch": 3.171694237214834, + "grad_norm": 0.2613160312175751, + "learning_rate": 1.559247440322051e-05, + "loss": 0.3657, "step": 88005 }, { - "epoch": 3.1, - "learning_rate": 1.673186560343728e-05, - "loss": 0.2522, + "epoch": 3.1718744368760587, + "grad_norm": 0.17358632385730743, + "learning_rate": 1.5589770823905027e-05, + "loss": 0.36, "step": 88010 }, { - "epoch": 3.1, - "learning_rate": 1.672917724178926e-05, - "loss": 0.2383, + "epoch": 3.1720546365372835, + "grad_norm": 0.2220027893781662, + "learning_rate": 1.5587067372802092e-05, + "loss": 0.3705, "step": 88015 }, { - "epoch": 3.1, - "learning_rate": 1.6726488987532434e-05, - "loss": 0.2508, + "epoch": 3.1722348361985078, + "grad_norm": 0.20974379777908325, + "learning_rate": 1.5584364049948548e-05, + "loss": 0.3666, "step": 88020 }, { - "epoch": 3.1, - "learning_rate": 1.6723800840701702e-05, - "loss": 0.266, + "epoch": 3.1724150358597325, + "grad_norm": 0.20734605193138123, + "learning_rate": 1.5581660855381207e-05, + "loss": 0.3625, "step": 88025 }, { - "epoch": 3.1, - "learning_rate": 1.6721112801331946e-05, - "loss": 0.2572, + "epoch": 3.1725952355209572, + "grad_norm": 0.24206040799617767, + "learning_rate": 1.557895778913693e-05, + "loss": 0.3759, "step": 88030 }, { - "epoch": 3.1, - "learning_rate": 
1.67184248694581e-05, - "loss": 0.2678, + "epoch": 3.172775435182182, + "grad_norm": 0.20642173290252686, + "learning_rate": 1.5576254851252526e-05, + "loss": 0.3891, "step": 88035 }, { - "epoch": 3.1, - "learning_rate": 1.6715737045115043e-05, - "loss": 0.2583, + "epoch": 3.1729556348434063, + "grad_norm": 0.23246215283870697, + "learning_rate": 1.5573552041764818e-05, + "loss": 0.4263, "step": 88040 }, { - "epoch": 3.1, - "learning_rate": 1.6713049328337683e-05, - "loss": 0.2586, + "epoch": 3.173135834504631, + "grad_norm": 0.2090509980916977, + "learning_rate": 1.557084936071064e-05, + "loss": 0.3807, "step": 88045 }, { - "epoch": 3.1, - "learning_rate": 1.6710361719160905e-05, - "loss": 0.2745, + "epoch": 3.1733160341658557, + "grad_norm": 0.22444801032543182, + "learning_rate": 1.5568146808126823e-05, + "loss": 0.3697, "step": 88050 }, { - "epoch": 3.1, - "learning_rate": 1.670767421761963e-05, - "loss": 0.2549, + "epoch": 3.1734962338270805, + "grad_norm": 0.24101091921329498, + "learning_rate": 1.556544438405017e-05, + "loss": 0.3802, "step": 88055 }, { - "epoch": 3.1, - "learning_rate": 1.670498682374874e-05, - "loss": 0.2587, + "epoch": 3.173676433488305, + "grad_norm": 0.2750411927700043, + "learning_rate": 1.556274208851752e-05, + "loss": 0.391, "step": 88060 }, { - "epoch": 3.1, - "learning_rate": 1.6702299537583126e-05, - "loss": 0.2475, + "epoch": 3.1738566331495295, + "grad_norm": 0.21167200803756714, + "learning_rate": 1.5560039921565667e-05, + "loss": 0.3587, "step": 88065 }, { - "epoch": 3.1, - "learning_rate": 1.669961235915768e-05, - "loss": 0.2702, + "epoch": 3.174036832810754, + "grad_norm": 0.28249591588974, + "learning_rate": 1.5557337883231455e-05, + "loss": 0.3977, "step": 88070 }, { - "epoch": 3.1, - "learning_rate": 1.6696925288507297e-05, - "loss": 0.2693, + "epoch": 3.174217032471979, + "grad_norm": 0.23094874620437622, + "learning_rate": 1.5554635973551684e-05, + "loss": 0.3555, "step": 88075 }, { - "epoch": 3.1, - "learning_rate": 1.669423832566688e-05, - "loss": 0.2599, + "epoch": 3.1743972321332037, + "grad_norm": 0.24824340641498566, + "learning_rate": 1.5551934192563165e-05, + "loss": 0.3596, "step": 88080 }, { - "epoch": 3.1, - "learning_rate": 1.6691551470671296e-05, - "loss": 0.2832, + "epoch": 3.1745774317944284, + "grad_norm": 0.23412884771823883, + "learning_rate": 1.5549232540302718e-05, + "loss": 0.4005, "step": 88085 }, { - "epoch": 3.1, - "learning_rate": 1.6688864723555436e-05, - "loss": 0.255, + "epoch": 3.1747576314556527, + "grad_norm": 0.2383238971233368, + "learning_rate": 1.5546531016807152e-05, + "loss": 0.4006, "step": 88090 }, { - "epoch": 3.1, - "learning_rate": 1.6686178084354198e-05, - "loss": 0.2701, + "epoch": 3.1749378311168774, + "grad_norm": 0.2111070156097412, + "learning_rate": 1.5543829622113253e-05, + "loss": 0.3623, "step": 88095 }, { - "epoch": 3.1, - "learning_rate": 1.6683491553102454e-05, - "loss": 0.2571, + "epoch": 3.175118030778102, + "grad_norm": 0.2079268842935562, + "learning_rate": 1.5541128356257866e-05, + "loss": 0.3812, "step": 88100 }, { - "epoch": 3.1, - "learning_rate": 1.66808051298351e-05, - "loss": 0.2642, + "epoch": 3.175298230439327, + "grad_norm": 0.2111787348985672, + "learning_rate": 1.5538427219277756e-05, + "loss": 0.4066, "step": 88105 }, { - "epoch": 3.1, - "learning_rate": 1.667811881458699e-05, - "loss": 0.2762, + "epoch": 3.1754784301005516, + "grad_norm": 0.22174887359142303, + "learning_rate": 1.5535726211209758e-05, + "loss": 0.3579, "step": 88110 }, { - "epoch": 3.1, - "learning_rate": 
1.667543260739304e-05, - "loss": 0.2425, + "epoch": 3.175658629761776, + "grad_norm": 0.2652639150619507, + "learning_rate": 1.5533025332090655e-05, + "loss": 0.4119, "step": 88115 }, { - "epoch": 3.1, - "learning_rate": 1.667274650828811e-05, - "loss": 0.2539, + "epoch": 3.1758388294230007, + "grad_norm": 0.22351694107055664, + "learning_rate": 1.553032458195724e-05, + "loss": 0.3969, "step": 88120 }, { - "epoch": 3.1, - "learning_rate": 1.6670060517307072e-05, - "loss": 0.2692, + "epoch": 3.1760190290842254, + "grad_norm": 0.2620088756084442, + "learning_rate": 1.5527623960846328e-05, + "loss": 0.3849, "step": 88125 }, { - "epoch": 3.1, - "learning_rate": 1.6667374634484806e-05, - "loss": 0.2809, + "epoch": 3.17619922874545, + "grad_norm": 0.24307698011398315, + "learning_rate": 1.55249234687947e-05, + "loss": 0.38, "step": 88130 }, { - "epoch": 3.1, - "learning_rate": 1.6664688859856196e-05, - "loss": 0.2754, + "epoch": 3.1763794284066744, + "grad_norm": 0.2753578722476959, + "learning_rate": 1.5522223105839157e-05, + "loss": 0.369, "step": 88135 }, { - "epoch": 3.1, - "learning_rate": 1.666200319345611e-05, - "loss": 0.2544, + "epoch": 3.176559628067899, + "grad_norm": 0.21059057116508484, + "learning_rate": 1.551952287201649e-05, + "loss": 0.4148, "step": 88140 }, { - "epoch": 3.1, - "learning_rate": 1.665931763531942e-05, - "loss": 0.2284, + "epoch": 3.176739827729124, + "grad_norm": 0.21120493113994598, + "learning_rate": 1.551682276736349e-05, + "loss": 0.3609, "step": 88145 }, { - "epoch": 3.1, - "learning_rate": 1.665663218548098e-05, - "loss": 0.2623, + "epoch": 3.1769200273903486, + "grad_norm": 0.22285541892051697, + "learning_rate": 1.551412279191694e-05, + "loss": 0.3983, "step": 88150 }, { - "epoch": 3.1, - "learning_rate": 1.665394684397568e-05, - "loss": 0.246, + "epoch": 3.1771002270515734, + "grad_norm": 0.22482122480869293, + "learning_rate": 1.5511422945713633e-05, + "loss": 0.3582, "step": 88155 }, { - "epoch": 3.1, - "learning_rate": 1.6651261610838383e-05, - "loss": 0.2496, + "epoch": 3.1772804267127976, + "grad_norm": 0.21789222955703735, + "learning_rate": 1.550872322879035e-05, + "loss": 0.3925, "step": 88160 }, { - "epoch": 3.1, - "learning_rate": 1.664857648610394e-05, - "loss": 0.2721, + "epoch": 3.1774606263740224, + "grad_norm": 0.21532319486141205, + "learning_rate": 1.5506023641183876e-05, + "loss": 0.3784, "step": 88165 }, { - "epoch": 3.1, - "learning_rate": 1.6645891469807242e-05, - "loss": 0.2511, + "epoch": 3.177640826035247, + "grad_norm": 0.23733597993850708, + "learning_rate": 1.550332418293099e-05, + "loss": 0.3538, "step": 88170 }, { - "epoch": 3.1, - "learning_rate": 1.6643206561983138e-05, - "loss": 0.2559, + "epoch": 3.177821025696472, + "grad_norm": 0.18654675781726837, + "learning_rate": 1.550062485406848e-05, + "loss": 0.4173, "step": 88175 }, { - "epoch": 3.1, - "learning_rate": 1.6640521762666482e-05, - "loss": 0.2766, + "epoch": 3.178001225357696, + "grad_norm": 0.2497045248746872, + "learning_rate": 1.5497925654633118e-05, + "loss": 0.41, "step": 88180 }, { - "epoch": 3.1, - "learning_rate": 1.663783707189214e-05, - "loss": 0.2513, + "epoch": 3.178181425018921, + "grad_norm": 0.27806994318962097, + "learning_rate": 1.5495226584661664e-05, + "loss": 0.428, "step": 88185 }, { - "epoch": 3.1, - "learning_rate": 1.6635152489694976e-05, - "loss": 0.2542, + "epoch": 3.1783616246801456, + "grad_norm": 0.20720724761486053, + "learning_rate": 1.549252764419092e-05, + "loss": 0.3697, "step": 88190 }, { - "epoch": 3.1, - "learning_rate": 
1.6632468016109848e-05, - "loss": 0.26, + "epoch": 3.1785418243413703, + "grad_norm": 0.21904359757900238, + "learning_rate": 1.548982883325765e-05, + "loss": 0.3729, "step": 88195 }, { - "epoch": 3.1, - "learning_rate": 1.662978365117161e-05, - "loss": 0.2627, + "epoch": 3.178722024002595, + "grad_norm": 0.2742276191711426, + "learning_rate": 1.5487130151898603e-05, + "loss": 0.3845, "step": 88200 }, { - "epoch": 3.1, - "learning_rate": 1.66270993949151e-05, - "loss": 0.275, + "epoch": 3.1789022236638194, + "grad_norm": 0.2257348746061325, + "learning_rate": 1.5484431600150587e-05, + "loss": 0.3508, "step": 88205 }, { - "epoch": 3.1, - "learning_rate": 1.66244152473752e-05, - "loss": 0.2463, + "epoch": 3.179082423325044, + "grad_norm": 0.23579144477844238, + "learning_rate": 1.548173317805033e-05, + "loss": 0.3784, "step": 88210 }, { - "epoch": 3.1, - "learning_rate": 1.662173120858675e-05, - "loss": 0.2575, + "epoch": 3.179262622986269, + "grad_norm": 0.2549505829811096, + "learning_rate": 1.547903488563463e-05, + "loss": 0.3866, "step": 88215 }, { - "epoch": 3.1, - "learning_rate": 1.6619047278584596e-05, - "loss": 0.2504, + "epoch": 3.1794428226474936, + "grad_norm": 0.2171338051557541, + "learning_rate": 1.5476336722940233e-05, + "loss": 0.3705, "step": 88220 }, { - "epoch": 3.1, - "learning_rate": 1.661636345740358e-05, - "loss": 0.2562, + "epoch": 3.179623022308718, + "grad_norm": 0.21523796021938324, + "learning_rate": 1.54736386900039e-05, + "loss": 0.3732, "step": 88225 }, { - "epoch": 3.1, - "learning_rate": 1.661367974507857e-05, - "loss": 0.2457, + "epoch": 3.1798032219699426, + "grad_norm": 0.22669003903865814, + "learning_rate": 1.5470940786862397e-05, + "loss": 0.3728, "step": 88230 }, { - "epoch": 3.1, - "learning_rate": 1.6610996141644397e-05, - "loss": 0.2446, + "epoch": 3.1799834216311673, + "grad_norm": 0.20555952191352844, + "learning_rate": 1.5468243013552486e-05, + "loss": 0.3688, "step": 88235 }, { - "epoch": 3.1, - "learning_rate": 1.6608312647135917e-05, - "loss": 0.2849, + "epoch": 3.180163621292392, + "grad_norm": 0.24928292632102966, + "learning_rate": 1.546554537011091e-05, + "loss": 0.4122, "step": 88240 }, { - "epoch": 3.1, - "learning_rate": 1.6605629261587957e-05, - "loss": 0.2788, + "epoch": 3.180343820953617, + "grad_norm": 0.2341477870941162, + "learning_rate": 1.5462847856574437e-05, + "loss": 0.396, "step": 88245 }, { - "epoch": 3.1, - "learning_rate": 1.660294598503538e-05, - "loss": 0.2552, + "epoch": 3.180524020614841, + "grad_norm": 0.21584312617778778, + "learning_rate": 1.546015047297981e-05, + "loss": 0.3566, "step": 88250 }, { - "epoch": 3.11, - "learning_rate": 1.660026281751301e-05, - "loss": 0.2568, + "epoch": 3.180704220276066, + "grad_norm": 0.25343307852745056, + "learning_rate": 1.5457453219363794e-05, + "loss": 0.3825, "step": 88255 }, { - "epoch": 3.11, - "learning_rate": 1.6597579759055688e-05, - "loss": 0.2444, + "epoch": 3.1808844199372905, + "grad_norm": 0.26470595598220825, + "learning_rate": 1.545475609576313e-05, + "loss": 0.3954, "step": 88260 }, { - "epoch": 3.11, - "learning_rate": 1.659489680969826e-05, - "loss": 0.2542, + "epoch": 3.1810646195985153, + "grad_norm": 0.19923968613147736, + "learning_rate": 1.5452059102214558e-05, + "loss": 0.3574, "step": 88265 }, { - "epoch": 3.11, - "learning_rate": 1.6592213969475557e-05, - "loss": 0.2692, + "epoch": 3.1812448192597396, + "grad_norm": 0.20713098347187042, + "learning_rate": 1.5449362238754838e-05, + "loss": 0.3896, "step": 88270 }, { - "epoch": 3.11, - "learning_rate": 
1.6589531238422416e-05, - "loss": 0.2584, + "epoch": 3.1814250189209643, + "grad_norm": 0.24674662947654724, + "learning_rate": 1.5446665505420714e-05, + "loss": 0.3901, "step": 88275 }, { - "epoch": 3.11, - "learning_rate": 1.658684861657366e-05, - "loss": 0.2641, + "epoch": 3.181605218582189, + "grad_norm": 0.22390885651111603, + "learning_rate": 1.5443968902248913e-05, + "loss": 0.375, "step": 88280 }, { - "epoch": 3.11, - "learning_rate": 1.6584166103964143e-05, - "loss": 0.2619, + "epoch": 3.1817854182434138, + "grad_norm": 0.252699077129364, + "learning_rate": 1.5441272429276193e-05, + "loss": 0.3771, "step": 88285 }, { - "epoch": 3.11, - "learning_rate": 1.6581483700628676e-05, - "loss": 0.2526, + "epoch": 3.1819656179046385, + "grad_norm": 0.25395792722702026, + "learning_rate": 1.5438576086539275e-05, + "loss": 0.389, "step": 88290 }, { - "epoch": 3.11, - "learning_rate": 1.65788014066021e-05, - "loss": 0.2417, + "epoch": 3.182145817565863, + "grad_norm": 0.27715232968330383, + "learning_rate": 1.543587987407492e-05, + "loss": 0.3793, "step": 88295 }, { - "epoch": 3.11, - "learning_rate": 1.6576119221919228e-05, - "loss": 0.2669, + "epoch": 3.1823260172270875, + "grad_norm": 0.27703672647476196, + "learning_rate": 1.5433183791919844e-05, + "loss": 0.3654, "step": 88300 }, { - "epoch": 3.11, - "learning_rate": 1.6573437146614906e-05, - "loss": 0.2453, + "epoch": 3.1825062168883123, + "grad_norm": 0.21371644735336304, + "learning_rate": 1.543048784011078e-05, + "loss": 0.3736, "step": 88305 }, { - "epoch": 3.11, - "learning_rate": 1.657075518072395e-05, - "loss": 0.2621, + "epoch": 3.182686416549537, + "grad_norm": 0.18627633154392242, + "learning_rate": 1.542779201868448e-05, + "loss": 0.332, "step": 88310 }, { - "epoch": 3.11, - "learning_rate": 1.6568073324281178e-05, - "loss": 0.2774, + "epoch": 3.1828666162107617, + "grad_norm": 0.24603888392448425, + "learning_rate": 1.542509632767765e-05, + "loss": 0.3981, "step": 88315 }, { - "epoch": 3.11, - "learning_rate": 1.6565391577321415e-05, - "loss": 0.2645, + "epoch": 3.183046815871986, + "grad_norm": 0.26360347867012024, + "learning_rate": 1.5422400767127034e-05, + "loss": 0.3647, "step": 88320 }, { - "epoch": 3.11, - "learning_rate": 1.6562709939879486e-05, - "loss": 0.245, + "epoch": 3.1832270155332107, + "grad_norm": 0.24063238501548767, + "learning_rate": 1.5419705337069353e-05, + "loss": 0.3559, "step": 88325 }, { - "epoch": 3.11, - "learning_rate": 1.6560028411990213e-05, - "loss": 0.2613, + "epoch": 3.1834072151944355, + "grad_norm": 0.20714794099330902, + "learning_rate": 1.5417010037541322e-05, + "loss": 0.3922, "step": 88330 }, { - "epoch": 3.11, - "learning_rate": 1.6557346993688412e-05, - "loss": 0.2542, + "epoch": 3.18358741485566, + "grad_norm": 0.24574214220046997, + "learning_rate": 1.5414314868579687e-05, + "loss": 0.384, "step": 88335 }, { - "epoch": 3.11, - "learning_rate": 1.655466568500888e-05, - "loss": 0.2499, + "epoch": 3.1837676145168845, + "grad_norm": 0.2243025153875351, + "learning_rate": 1.541161983022115e-05, + "loss": 0.3988, "step": 88340 }, { - "epoch": 3.11, - "learning_rate": 1.655198448598646e-05, - "loss": 0.2386, + "epoch": 3.1839478141781092, + "grad_norm": 0.22772471606731415, + "learning_rate": 1.5408924922502437e-05, + "loss": 0.4002, "step": 88345 }, { - "epoch": 3.11, - "learning_rate": 1.6549303396655953e-05, - "loss": 0.2671, + "epoch": 3.184128013839334, + "grad_norm": 0.27433350682258606, + "learning_rate": 1.5406230145460264e-05, + "loss": 0.3967, "step": 88350 }, { - "epoch": 3.11, 
- "learning_rate": 1.6546622417052174e-05, - "loss": 0.2524, + "epoch": 3.1843082135005587, + "grad_norm": 0.24873776733875275, + "learning_rate": 1.540353549913135e-05, + "loss": 0.3847, "step": 88355 }, { - "epoch": 3.11, - "learning_rate": 1.6543941547209923e-05, - "loss": 0.2415, + "epoch": 3.1844884131617834, + "grad_norm": 0.21438409388065338, + "learning_rate": 1.5400840983552405e-05, + "loss": 0.3762, "step": 88360 }, { - "epoch": 3.11, - "learning_rate": 1.6541260787164026e-05, - "loss": 0.2601, + "epoch": 3.1846686128230077, + "grad_norm": 0.20840908586978912, + "learning_rate": 1.539814659876015e-05, + "loss": 0.4169, "step": 88365 }, { - "epoch": 3.11, - "learning_rate": 1.653858013694928e-05, - "loss": 0.2599, + "epoch": 3.1848488124842325, + "grad_norm": 0.30924826860427856, + "learning_rate": 1.5395452344791277e-05, + "loss": 0.3751, "step": 88370 }, { - "epoch": 3.11, - "learning_rate": 1.65358995966005e-05, - "loss": 0.2621, + "epoch": 3.185029012145457, + "grad_norm": 0.25707584619522095, + "learning_rate": 1.5392758221682516e-05, + "loss": 0.4209, "step": 88375 }, { - "epoch": 3.11, - "learning_rate": 1.6533219166152473e-05, - "loss": 0.2534, + "epoch": 3.185209211806682, + "grad_norm": 0.26053330302238464, + "learning_rate": 1.5390064229470568e-05, + "loss": 0.3743, "step": 88380 }, { - "epoch": 3.11, - "learning_rate": 1.653053884564003e-05, - "loss": 0.25, + "epoch": 3.1853894114679067, + "grad_norm": 0.24853472411632538, + "learning_rate": 1.5387370368192116e-05, + "loss": 0.3825, "step": 88385 }, { - "epoch": 3.11, - "learning_rate": 1.6527858635097955e-05, - "loss": 0.2317, + "epoch": 3.185569611129131, + "grad_norm": 0.20806756615638733, + "learning_rate": 1.5384676637883904e-05, + "loss": 0.3562, "step": 88390 }, { - "epoch": 3.11, - "learning_rate": 1.652517853456104e-05, - "loss": 0.2635, + "epoch": 3.1857498107903557, + "grad_norm": 0.2653723359107971, + "learning_rate": 1.5381983038582588e-05, + "loss": 0.374, "step": 88395 }, { - "epoch": 3.11, - "learning_rate": 1.6522498544064113e-05, - "loss": 0.2467, + "epoch": 3.1859300104515804, + "grad_norm": 0.25022828578948975, + "learning_rate": 1.5379289570324912e-05, + "loss": 0.3863, "step": 88400 }, { - "epoch": 3.11, - "learning_rate": 1.651981866364195e-05, - "loss": 0.2529, + "epoch": 3.186110210112805, + "grad_norm": 0.2051696926355362, + "learning_rate": 1.5376596233147543e-05, + "loss": 0.4092, "step": 88405 }, { - "epoch": 3.11, - "learning_rate": 1.6517138893329355e-05, - "loss": 0.2823, + "epoch": 3.1862904097740294, + "grad_norm": 0.1869521141052246, + "learning_rate": 1.5373903027087185e-05, + "loss": 0.3755, "step": 88410 }, { - "epoch": 3.11, - "learning_rate": 1.651445923316111e-05, - "loss": 0.2919, + "epoch": 3.186470609435254, + "grad_norm": 0.17560672760009766, + "learning_rate": 1.537120995218054e-05, + "loss": 0.3518, "step": 88415 }, { - "epoch": 3.11, - "learning_rate": 1.651177968317203e-05, - "loss": 0.2554, + "epoch": 3.186650809096479, + "grad_norm": 0.24432684481143951, + "learning_rate": 1.5368517008464295e-05, + "loss": 0.3625, "step": 88420 }, { - "epoch": 3.11, - "learning_rate": 1.65091002433969e-05, - "loss": 0.233, + "epoch": 3.1868310087577036, + "grad_norm": 0.2695041298866272, + "learning_rate": 1.5365824195975137e-05, + "loss": 0.3801, "step": 88425 }, { - "epoch": 3.11, - "learning_rate": 1.65064209138705e-05, - "loss": 0.2487, + "epoch": 3.1870112084189284, + "grad_norm": 0.2649804651737213, + "learning_rate": 1.5363131514749762e-05, + "loss": 0.3637, "step": 88430 }, { - 
"epoch": 3.11, - "learning_rate": 1.650374169462762e-05, - "loss": 0.2457, + "epoch": 3.1871914080801527, + "grad_norm": 0.2614179849624634, + "learning_rate": 1.5360438964824846e-05, + "loss": 0.3891, "step": 88435 }, { - "epoch": 3.11, - "learning_rate": 1.650106258570307e-05, - "loss": 0.254, + "epoch": 3.1873716077413774, + "grad_norm": 0.2579090893268585, + "learning_rate": 1.5357746546237093e-05, + "loss": 0.3654, "step": 88440 }, { - "epoch": 3.11, - "learning_rate": 1.649838358713162e-05, - "loss": 0.2363, + "epoch": 3.187551807402602, + "grad_norm": 0.2515721619129181, + "learning_rate": 1.5355054259023176e-05, + "loss": 0.414, "step": 88445 }, { - "epoch": 3.11, - "learning_rate": 1.6495704698948057e-05, - "loss": 0.2572, + "epoch": 3.187732007063827, + "grad_norm": 0.19676761329174042, + "learning_rate": 1.5352362103219772e-05, + "loss": 0.4121, "step": 88450 }, { - "epoch": 3.11, - "learning_rate": 1.649302592118715e-05, - "loss": 0.259, + "epoch": 3.187912206725051, + "grad_norm": 0.3330390751361847, + "learning_rate": 1.534967007886357e-05, + "loss": 0.4198, "step": 88455 }, { - "epoch": 3.11, - "learning_rate": 1.64903472538837e-05, - "loss": 0.2359, + "epoch": 3.188092406386276, + "grad_norm": 0.1731852889060974, + "learning_rate": 1.5346978185991254e-05, + "loss": 0.3721, "step": 88460 }, { - "epoch": 3.11, - "learning_rate": 1.6487668697072493e-05, - "loss": 0.2649, + "epoch": 3.1882726060475006, + "grad_norm": 0.21727561950683594, + "learning_rate": 1.5344286424639473e-05, + "loss": 0.3973, "step": 88465 }, { - "epoch": 3.11, - "learning_rate": 1.6484990250788293e-05, - "loss": 0.2425, + "epoch": 3.1884528057087254, + "grad_norm": 0.21226783096790314, + "learning_rate": 1.534159479484493e-05, + "loss": 0.3798, "step": 88470 }, { - "epoch": 3.11, - "learning_rate": 1.6482311915065874e-05, - "loss": 0.2795, + "epoch": 3.18863300536995, + "grad_norm": 0.21172350645065308, + "learning_rate": 1.5338903296644282e-05, + "loss": 0.3675, "step": 88475 }, { - "epoch": 3.11, - "learning_rate": 1.6479633689940032e-05, - "loss": 0.2461, + "epoch": 3.1888132050311744, + "grad_norm": 0.23403914272785187, + "learning_rate": 1.533621193007421e-05, + "loss": 0.3956, "step": 88480 }, { - "epoch": 3.11, - "learning_rate": 1.647695557544553e-05, - "loss": 0.2561, + "epoch": 3.188993404692399, + "grad_norm": 0.24958845973014832, + "learning_rate": 1.533352069517139e-05, + "loss": 0.3753, "step": 88485 }, { - "epoch": 3.11, - "learning_rate": 1.6474277571617145e-05, - "loss": 0.2542, + "epoch": 3.189173604353624, + "grad_norm": 0.29716435074806213, + "learning_rate": 1.5330829591972463e-05, + "loss": 0.3674, "step": 88490 }, { - "epoch": 3.11, - "learning_rate": 1.6471599678489634e-05, - "loss": 0.2431, + "epoch": 3.1893538040148486, + "grad_norm": 0.23734360933303833, + "learning_rate": 1.5328138620514125e-05, + "loss": 0.3635, "step": 88495 }, { - "epoch": 3.11, - "learning_rate": 1.6468921896097793e-05, - "loss": 0.2672, + "epoch": 3.189534003676073, + "grad_norm": 0.22716392576694489, + "learning_rate": 1.532544778083302e-05, + "loss": 0.3993, "step": 88500 }, { - "epoch": 3.11, - "eval_loss": 0.25621533393859863, - "eval_runtime": 10.5456, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 3.189534003676073, + "eval_loss": 0.4313284754753113, + "eval_runtime": 3.5346, + "eval_samples_per_second": 28.292, + "eval_steps_per_second": 7.073, "step": 88500 }, { - "epoch": 3.11, - "learning_rate": 1.646624422447638e-05, - "loss": 0.2621, + "epoch": 
3.1897142033372976, + "grad_norm": 0.24912869930267334, + "learning_rate": 1.532275707296581e-05, + "loss": 0.3939, "step": 88505 }, { - "epoch": 3.11, - "learning_rate": 1.6463566663660153e-05, - "loss": 0.2563, + "epoch": 3.1898944029985223, + "grad_norm": 0.20654475688934326, + "learning_rate": 1.5320066496949164e-05, + "loss": 0.3825, "step": 88510 }, { - "epoch": 3.11, - "learning_rate": 1.6460889213683892e-05, - "loss": 0.2306, + "epoch": 3.190074602659747, + "grad_norm": 0.23317840695381165, + "learning_rate": 1.531737605281974e-05, + "loss": 0.3779, "step": 88515 }, { - "epoch": 3.11, - "learning_rate": 1.645821187458236e-05, - "loss": 0.2811, + "epoch": 3.190254802320972, + "grad_norm": 0.2050599455833435, + "learning_rate": 1.531468574061419e-05, + "loss": 0.3616, "step": 88520 }, { - "epoch": 3.11, - "learning_rate": 1.645553464639032e-05, - "loss": 0.2564, + "epoch": 3.190435001982196, + "grad_norm": 0.24573275446891785, + "learning_rate": 1.5311995560369176e-05, + "loss": 0.3505, "step": 88525 }, { - "epoch": 3.11, - "learning_rate": 1.6452857529142518e-05, - "loss": 0.2592, + "epoch": 3.190615201643421, + "grad_norm": 0.22783836722373962, + "learning_rate": 1.530930551212134e-05, + "loss": 0.3509, "step": 88530 }, { - "epoch": 3.11, - "learning_rate": 1.645018052287374e-05, - "loss": 0.2565, + "epoch": 3.1907954013046456, + "grad_norm": 0.22740013897418976, + "learning_rate": 1.5306615595907348e-05, + "loss": 0.35, "step": 88535 }, { - "epoch": 3.12, - "learning_rate": 1.6447503627618728e-05, - "loss": 0.2429, + "epoch": 3.1909756009658703, + "grad_norm": 0.23683089017868042, + "learning_rate": 1.5303925811763833e-05, + "loss": 0.3592, "step": 88540 }, { - "epoch": 3.12, - "learning_rate": 1.6444826843412254e-05, - "loss": 0.2409, + "epoch": 3.1911558006270946, + "grad_norm": 0.2351250797510147, + "learning_rate": 1.530123615972746e-05, + "loss": 0.4032, "step": 88545 }, { - "epoch": 3.12, - "learning_rate": 1.644215017028905e-05, - "loss": 0.2629, + "epoch": 3.1913360002883193, + "grad_norm": 0.23128874599933624, + "learning_rate": 1.5298546639834867e-05, + "loss": 0.3866, "step": 88550 }, { - "epoch": 3.12, - "learning_rate": 1.6439473608283894e-05, - "loss": 0.2676, + "epoch": 3.191516199949544, + "grad_norm": 0.24116584658622742, + "learning_rate": 1.529585725212269e-05, + "loss": 0.3896, "step": 88555 }, { - "epoch": 3.12, - "learning_rate": 1.643679715743154e-05, - "loss": 0.2427, + "epoch": 3.191696399610769, + "grad_norm": 0.19982759654521942, + "learning_rate": 1.5293167996627583e-05, + "loss": 0.3896, "step": 88560 }, { - "epoch": 3.12, - "learning_rate": 1.6434120817766723e-05, - "loss": 0.2589, + "epoch": 3.1918765992719935, + "grad_norm": 0.19958491623401642, + "learning_rate": 1.529047887338619e-05, + "loss": 0.3574, "step": 88565 }, { - "epoch": 3.12, - "learning_rate": 1.6431444589324193e-05, - "loss": 0.2767, + "epoch": 3.192056798933218, + "grad_norm": 0.22124148905277252, + "learning_rate": 1.5287789882435126e-05, + "loss": 0.3592, "step": 88570 }, { - "epoch": 3.12, - "learning_rate": 1.6428768472138723e-05, - "loss": 0.2622, + "epoch": 3.1922369985944425, + "grad_norm": 0.19804465770721436, + "learning_rate": 1.528510102381106e-05, + "loss": 0.3993, "step": 88575 }, { - "epoch": 3.12, - "learning_rate": 1.6426092466245044e-05, - "loss": 0.2695, + "epoch": 3.1924171982556673, + "grad_norm": 0.18390753865242004, + "learning_rate": 1.5282412297550603e-05, + "loss": 0.4034, "step": 88580 }, { - "epoch": 3.12, - "learning_rate": 1.6423416571677904e-05, - 
"loss": 0.2686, + "epoch": 3.192597397916892, + "grad_norm": 0.2427562028169632, + "learning_rate": 1.5279723703690404e-05, + "loss": 0.3776, "step": 88585 }, { - "epoch": 3.12, - "learning_rate": 1.642074078847204e-05, - "loss": 0.2474, + "epoch": 3.1927775975781167, + "grad_norm": 0.2625272572040558, + "learning_rate": 1.5277035242267085e-05, + "loss": 0.4091, "step": 88590 }, { - "epoch": 3.12, - "learning_rate": 1.6418065116662202e-05, - "loss": 0.2619, + "epoch": 3.192957797239341, + "grad_norm": 0.20139677822589874, + "learning_rate": 1.527434691331727e-05, + "loss": 0.4056, "step": 88595 }, { - "epoch": 3.12, - "learning_rate": 1.6415389556283144e-05, - "loss": 0.2584, + "epoch": 3.1931379969005658, + "grad_norm": 0.22303380072116852, + "learning_rate": 1.5271658716877607e-05, + "loss": 0.3861, "step": 88600 }, { - "epoch": 3.12, - "learning_rate": 1.6412714107369593e-05, - "loss": 0.2847, + "epoch": 3.1933181965617905, + "grad_norm": 0.24220505356788635, + "learning_rate": 1.526897065298471e-05, + "loss": 0.3655, "step": 88605 }, { - "epoch": 3.12, - "learning_rate": 1.6410038769956277e-05, - "loss": 0.2579, + "epoch": 3.1934983962230152, + "grad_norm": 0.2237461358308792, + "learning_rate": 1.5266282721675196e-05, + "loss": 0.3396, "step": 88610 }, { - "epoch": 3.12, - "learning_rate": 1.640736354407796e-05, - "loss": 0.2679, + "epoch": 3.19367859588424, + "grad_norm": 0.22719275951385498, + "learning_rate": 1.5263594922985698e-05, + "loss": 0.389, "step": 88615 }, { - "epoch": 3.12, - "learning_rate": 1.6404688429769362e-05, - "loss": 0.2467, + "epoch": 3.1938587955454643, + "grad_norm": 0.20860248804092407, + "learning_rate": 1.5260907256952832e-05, + "loss": 0.376, "step": 88620 }, { - "epoch": 3.12, - "learning_rate": 1.6402013427065215e-05, - "loss": 0.2191, + "epoch": 3.194038995206689, + "grad_norm": 0.25672057271003723, + "learning_rate": 1.5258219723613226e-05, + "loss": 0.3926, "step": 88625 }, { - "epoch": 3.12, - "learning_rate": 1.6399338536000258e-05, - "loss": 0.2667, + "epoch": 3.1942191948679137, + "grad_norm": 0.24725201725959778, + "learning_rate": 1.5255532323003488e-05, + "loss": 0.3647, "step": 88630 }, { - "epoch": 3.12, - "learning_rate": 1.639666375660923e-05, - "loss": 0.2494, + "epoch": 3.1943993945291385, + "grad_norm": 0.21268433332443237, + "learning_rate": 1.5252845055160231e-05, + "loss": 0.4103, "step": 88635 }, { - "epoch": 3.12, - "learning_rate": 1.639398908892685e-05, - "loss": 0.2712, + "epoch": 3.1945795941903627, + "grad_norm": 0.2236996740102768, + "learning_rate": 1.5250157920120079e-05, + "loss": 0.3829, "step": 88640 }, { - "epoch": 3.12, - "learning_rate": 1.6391314532987844e-05, - "loss": 0.2481, + "epoch": 3.1947597938515875, + "grad_norm": 0.2377428263425827, + "learning_rate": 1.5247470917919634e-05, + "loss": 0.3897, "step": 88645 }, { - "epoch": 3.12, - "learning_rate": 1.6388640088826946e-05, - "loss": 0.2413, + "epoch": 3.194939993512812, + "grad_norm": 0.1965668499469757, + "learning_rate": 1.5244784048595506e-05, + "loss": 0.4063, "step": 88650 }, { - "epoch": 3.12, - "learning_rate": 1.6385965756478887e-05, - "loss": 0.2479, + "epoch": 3.195120193174037, + "grad_norm": 0.21718864142894745, + "learning_rate": 1.5242097312184314e-05, + "loss": 0.3925, "step": 88655 }, { - "epoch": 3.12, - "learning_rate": 1.638329153597839e-05, - "loss": 0.2568, + "epoch": 3.1953003928352617, + "grad_norm": 0.25005796551704407, + "learning_rate": 1.5239410708722651e-05, + "loss": 0.4104, "step": 88660 }, { - "epoch": 3.12, - "learning_rate": 
1.6380617427360164e-05, - "loss": 0.2788, + "epoch": 3.195480592496486, + "grad_norm": 0.18256092071533203, + "learning_rate": 1.5236724238247132e-05, + "loss": 0.3653, "step": 88665 }, { - "epoch": 3.12, - "learning_rate": 1.6377943430658942e-05, - "loss": 0.2436, + "epoch": 3.1956607921577107, + "grad_norm": 0.25699880719184875, + "learning_rate": 1.5234037900794362e-05, + "loss": 0.3736, "step": 88670 }, { - "epoch": 3.12, - "learning_rate": 1.6375269545909445e-05, - "loss": 0.2451, + "epoch": 3.1958409918189354, + "grad_norm": 0.20626196265220642, + "learning_rate": 1.5231351696400915e-05, + "loss": 0.3671, "step": 88675 }, { - "epoch": 3.12, - "learning_rate": 1.637259577314639e-05, - "loss": 0.2385, + "epoch": 3.19602119148016, + "grad_norm": 0.2111903876066208, + "learning_rate": 1.5228665625103428e-05, + "loss": 0.3878, "step": 88680 }, { - "epoch": 3.12, - "learning_rate": 1.6369922112404488e-05, - "loss": 0.2284, + "epoch": 3.1962013911413845, + "grad_norm": 0.22062833607196808, + "learning_rate": 1.5225979686938474e-05, + "loss": 0.3727, "step": 88685 }, { - "epoch": 3.12, - "learning_rate": 1.6367248563718468e-05, - "loss": 0.2568, + "epoch": 3.196381590802609, + "grad_norm": 0.2414456009864807, + "learning_rate": 1.5223293881942653e-05, + "loss": 0.3594, "step": 88690 }, { - "epoch": 3.12, - "learning_rate": 1.6364575127123034e-05, - "loss": 0.2702, + "epoch": 3.196561790463834, + "grad_norm": 0.21987886726856232, + "learning_rate": 1.5220608210152559e-05, + "loss": 0.3376, "step": 88695 }, { - "epoch": 3.12, - "learning_rate": 1.6361901802652896e-05, - "loss": 0.2529, + "epoch": 3.1967419901250587, + "grad_norm": 0.20280367136001587, + "learning_rate": 1.5217922671604784e-05, + "loss": 0.3523, "step": 88700 }, { - "epoch": 3.12, - "learning_rate": 1.6359228590342774e-05, - "loss": 0.2551, + "epoch": 3.1969221897862834, + "grad_norm": 0.259671151638031, + "learning_rate": 1.5215237266335925e-05, + "loss": 0.3875, "step": 88705 }, { - "epoch": 3.12, - "learning_rate": 1.635655549022737e-05, - "loss": 0.2418, + "epoch": 3.1971023894475077, + "grad_norm": 0.26262468099594116, + "learning_rate": 1.5212551994382562e-05, + "loss": 0.3624, "step": 88710 }, { - "epoch": 3.12, - "learning_rate": 1.6353882502341404e-05, - "loss": 0.2548, + "epoch": 3.1972825891087324, + "grad_norm": 0.2648344933986664, + "learning_rate": 1.5209866855781277e-05, + "loss": 0.3959, "step": 88715 }, { - "epoch": 3.12, - "learning_rate": 1.6351209626719578e-05, - "loss": 0.2475, + "epoch": 3.197462788769957, + "grad_norm": 0.18271714448928833, + "learning_rate": 1.5207181850568666e-05, + "loss": 0.3252, "step": 88720 }, { - "epoch": 3.12, - "learning_rate": 1.634853686339658e-05, - "loss": 0.2506, + "epoch": 3.197642988431182, + "grad_norm": 0.23713482916355133, + "learning_rate": 1.5204496978781301e-05, + "loss": 0.3915, "step": 88725 }, { - "epoch": 3.12, - "learning_rate": 1.6345864212407136e-05, - "loss": 0.2439, + "epoch": 3.197823188092406, + "grad_norm": 0.2027212232351303, + "learning_rate": 1.5201812240455776e-05, + "loss": 0.368, "step": 88730 }, { - "epoch": 3.12, - "learning_rate": 1.6343191673785953e-05, - "loss": 0.2548, + "epoch": 3.198003387753631, + "grad_norm": 0.26085880398750305, + "learning_rate": 1.519912763562866e-05, + "loss": 0.3783, "step": 88735 }, { - "epoch": 3.12, - "learning_rate": 1.6340519247567702e-05, - "loss": 0.2592, + "epoch": 3.1981835874148556, + "grad_norm": 0.23765386641025543, + "learning_rate": 1.5196443164336526e-05, + "loss": 0.3829, "step": 88740 }, { - "epoch": 
3.12, - "learning_rate": 1.6337846933787116e-05, - "loss": 0.2338, + "epoch": 3.1983637870760804, + "grad_norm": 0.2554900348186493, + "learning_rate": 1.5193758826615964e-05, + "loss": 0.3891, "step": 88745 }, { - "epoch": 3.12, - "learning_rate": 1.633517473247888e-05, - "loss": 0.2454, + "epoch": 3.198543986737305, + "grad_norm": 0.21230515837669373, + "learning_rate": 1.5191074622503542e-05, + "loss": 0.4042, "step": 88750 }, { - "epoch": 3.12, - "learning_rate": 1.6332502643677683e-05, - "loss": 0.2575, + "epoch": 3.1987241863985294, + "grad_norm": 0.31726622581481934, + "learning_rate": 1.5188390552035813e-05, + "loss": 0.3832, "step": 88755 }, { - "epoch": 3.12, - "learning_rate": 1.6329830667418228e-05, - "loss": 0.2732, + "epoch": 3.198904386059754, + "grad_norm": 0.21085330843925476, + "learning_rate": 1.5185706615249378e-05, + "loss": 0.375, "step": 88760 }, { - "epoch": 3.12, - "learning_rate": 1.632715880373521e-05, - "loss": 0.2445, + "epoch": 3.199084585720979, + "grad_norm": 0.27633991837501526, + "learning_rate": 1.5183022812180783e-05, + "loss": 0.4169, "step": 88765 }, { - "epoch": 3.12, - "learning_rate": 1.632448705266332e-05, - "loss": 0.2622, + "epoch": 3.1992647853822036, + "grad_norm": 0.21804660558700562, + "learning_rate": 1.5180339142866607e-05, + "loss": 0.3719, "step": 88770 }, { - "epoch": 3.12, - "learning_rate": 1.632181541423725e-05, - "loss": 0.2481, + "epoch": 3.199444985043428, + "grad_norm": 0.24333439767360687, + "learning_rate": 1.517765560734341e-05, + "loss": 0.381, "step": 88775 }, { - "epoch": 3.12, - "learning_rate": 1.6319143888491675e-05, - "loss": 0.2467, + "epoch": 3.1996251847046526, + "grad_norm": 0.23350374400615692, + "learning_rate": 1.5174972205647744e-05, + "loss": 0.3529, "step": 88780 }, { - "epoch": 3.12, - "learning_rate": 1.631647247546131e-05, - "loss": 0.2521, + "epoch": 3.1998053843658774, + "grad_norm": 0.23045288026332855, + "learning_rate": 1.5172288937816193e-05, + "loss": 0.3478, "step": 88785 }, { - "epoch": 3.12, - "learning_rate": 1.6313801175180822e-05, - "loss": 0.2684, + "epoch": 3.199985584027102, + "grad_norm": 0.19861705601215363, + "learning_rate": 1.5169605803885296e-05, + "loss": 0.3737, "step": 88790 }, { - "epoch": 3.12, - "learning_rate": 1.6311129987684902e-05, - "loss": 0.2602, + "epoch": 3.200165783688327, + "grad_norm": 0.3103479743003845, + "learning_rate": 1.5166922803891615e-05, + "loss": 0.3906, "step": 88795 }, { - "epoch": 3.12, - "learning_rate": 1.6308458913008228e-05, - "loss": 0.2543, + "epoch": 3.200345983349551, + "grad_norm": 0.2640889585018158, + "learning_rate": 1.5164239937871708e-05, + "loss": 0.3636, "step": 88800 }, { - "epoch": 3.12, - "learning_rate": 1.6305787951185494e-05, - "loss": 0.2528, + "epoch": 3.200526183010776, + "grad_norm": 0.22964945435523987, + "learning_rate": 1.5161557205862127e-05, + "loss": 0.3578, "step": 88805 }, { - "epoch": 3.12, - "learning_rate": 1.630311710225137e-05, - "loss": 0.2517, + "epoch": 3.2007063826720006, + "grad_norm": 0.20563103258609772, + "learning_rate": 1.5158874607899423e-05, + "loss": 0.3629, "step": 88810 }, { - "epoch": 3.12, - "learning_rate": 1.6300446366240543e-05, - "loss": 0.2709, + "epoch": 3.2008865823332253, + "grad_norm": 0.22984810173511505, + "learning_rate": 1.515619214402015e-05, + "loss": 0.4054, "step": 88815 }, { - "epoch": 3.12, - "learning_rate": 1.6297775743187677e-05, - "loss": 0.2588, + "epoch": 3.20106678199445, + "grad_norm": 0.23353759944438934, + "learning_rate": 1.5153509814260849e-05, + "loss": 0.3607, "step": 
88820 }, { - "epoch": 3.13, - "learning_rate": 1.6295105233127468e-05, - "loss": 0.2637, + "epoch": 3.2012469816556743, + "grad_norm": 0.2417394518852234, + "learning_rate": 1.5150827618658075e-05, + "loss": 0.3985, "step": 88825 }, { - "epoch": 3.13, - "learning_rate": 1.6292434836094584e-05, - "loss": 0.2487, + "epoch": 3.201427181316899, + "grad_norm": 0.2204267978668213, + "learning_rate": 1.514814555724837e-05, + "loss": 0.3738, "step": 88830 }, { - "epoch": 3.13, - "learning_rate": 1.6289764552123688e-05, - "loss": 0.2458, + "epoch": 3.201607380978124, + "grad_norm": 0.2334311157464981, + "learning_rate": 1.5145463630068268e-05, + "loss": 0.3758, "step": 88835 }, { - "epoch": 3.13, - "learning_rate": 1.6287094381249457e-05, - "loss": 0.2593, + "epoch": 3.2017875806393485, + "grad_norm": 0.21753154695034027, + "learning_rate": 1.5142781837154319e-05, + "loss": 0.3782, "step": 88840 }, { - "epoch": 3.13, - "learning_rate": 1.6284424323506564e-05, - "loss": 0.237, + "epoch": 3.201967780300573, + "grad_norm": 0.27690520882606506, + "learning_rate": 1.5140100178543057e-05, + "loss": 0.377, "step": 88845 }, { - "epoch": 3.13, - "learning_rate": 1.6281754378929684e-05, - "loss": 0.2576, + "epoch": 3.2021479799617976, + "grad_norm": 0.21353106200695038, + "learning_rate": 1.5137418654271025e-05, + "loss": 0.3773, "step": 88850 }, { - "epoch": 3.13, - "learning_rate": 1.6279084547553465e-05, - "loss": 0.2742, + "epoch": 3.2023281796230223, + "grad_norm": 0.19500304758548737, + "learning_rate": 1.513473726437476e-05, + "loss": 0.351, "step": 88855 }, { - "epoch": 3.13, - "learning_rate": 1.62764148294126e-05, - "loss": 0.2643, + "epoch": 3.202508379284247, + "grad_norm": 0.21235820651054382, + "learning_rate": 1.5132056008890771e-05, + "loss": 0.3599, "step": 88860 }, { - "epoch": 3.13, - "learning_rate": 1.6273745224541735e-05, - "loss": 0.2551, + "epoch": 3.2026885789454718, + "grad_norm": 0.2564550042152405, + "learning_rate": 1.5129374887855629e-05, + "loss": 0.3966, "step": 88865 }, { - "epoch": 3.13, - "learning_rate": 1.627107573297554e-05, - "loss": 0.2677, + "epoch": 3.202868778606696, + "grad_norm": 0.24082764983177185, + "learning_rate": 1.5126693901305836e-05, + "loss": 0.3829, "step": 88870 }, { - "epoch": 3.13, - "learning_rate": 1.6268406354748662e-05, - "loss": 0.2393, + "epoch": 3.203048978267921, + "grad_norm": 0.22911947965621948, + "learning_rate": 1.512401304927792e-05, + "loss": 0.3408, "step": 88875 }, { - "epoch": 3.13, - "learning_rate": 1.6265737089895783e-05, - "loss": 0.2798, + "epoch": 3.2032291779291455, + "grad_norm": 0.26279905438423157, + "learning_rate": 1.5121332331808421e-05, + "loss": 0.4124, "step": 88880 }, { - "epoch": 3.13, - "learning_rate": 1.6263067938451553e-05, - "loss": 0.2618, + "epoch": 3.2034093775903703, + "grad_norm": 0.2682557702064514, + "learning_rate": 1.511865174893385e-05, + "loss": 0.3681, "step": 88885 }, { - "epoch": 3.13, - "learning_rate": 1.626039890045063e-05, - "loss": 0.2606, + "epoch": 3.203589577251595, + "grad_norm": 0.25883546471595764, + "learning_rate": 1.5115971300690746e-05, + "loss": 0.4007, "step": 88890 }, { - "epoch": 3.13, - "learning_rate": 1.6257729975927656e-05, - "loss": 0.2465, + "epoch": 3.2037697769128193, + "grad_norm": 0.21053394675254822, + "learning_rate": 1.5113290987115614e-05, + "loss": 0.3837, "step": 88895 }, { - "epoch": 3.13, - "learning_rate": 1.62550611649173e-05, - "loss": 0.2683, + "epoch": 3.203949976574044, + "grad_norm": 0.22569601237773895, + "learning_rate": 1.5110610808244974e-05, + 
"loss": 0.3969, "step": 88900 }, { - "epoch": 3.13, - "learning_rate": 1.625239246745422e-05, - "loss": 0.2552, + "epoch": 3.2041301762352687, + "grad_norm": 0.24073798954486847, + "learning_rate": 1.5107930764115352e-05, + "loss": 0.3974, "step": 88905 }, { - "epoch": 3.13, - "learning_rate": 1.624972388357306e-05, - "loss": 0.2637, + "epoch": 3.2043103758964935, + "grad_norm": 0.24545975029468536, + "learning_rate": 1.5105250854763259e-05, + "loss": 0.3976, "step": 88910 }, { - "epoch": 3.13, - "learning_rate": 1.6247055413308455e-05, - "loss": 0.2644, + "epoch": 3.2044905755577178, + "grad_norm": 0.18453896045684814, + "learning_rate": 1.5102571080225203e-05, + "loss": 0.3786, "step": 88915 }, { - "epoch": 3.13, - "learning_rate": 1.624438705669508e-05, - "loss": 0.2455, + "epoch": 3.2046707752189425, + "grad_norm": 0.2695164084434509, + "learning_rate": 1.5099891440537705e-05, + "loss": 0.3783, "step": 88920 }, { - "epoch": 3.13, - "learning_rate": 1.624171881376756e-05, - "loss": 0.2699, + "epoch": 3.2048509748801672, + "grad_norm": 0.22112925350666046, + "learning_rate": 1.5097211935737263e-05, + "loss": 0.4013, "step": 88925 }, { - "epoch": 3.13, - "learning_rate": 1.6239050684560557e-05, - "loss": 0.2428, + "epoch": 3.205031174541392, + "grad_norm": 0.2127871960401535, + "learning_rate": 1.5094532565860398e-05, + "loss": 0.3885, "step": 88930 }, { - "epoch": 3.13, - "learning_rate": 1.6236382669108696e-05, - "loss": 0.2529, + "epoch": 3.2052113742026167, + "grad_norm": 0.22440172731876373, + "learning_rate": 1.5091853330943606e-05, + "loss": 0.3864, "step": 88935 }, { - "epoch": 3.13, - "learning_rate": 1.6233714767446645e-05, - "loss": 0.2753, + "epoch": 3.205391573863841, + "grad_norm": 0.2635382115840912, + "learning_rate": 1.508917423102339e-05, + "loss": 0.3542, "step": 88940 }, { - "epoch": 3.13, - "learning_rate": 1.6231046979609026e-05, - "loss": 0.2709, + "epoch": 3.2055717735250657, + "grad_norm": 0.20323844254016876, + "learning_rate": 1.5086495266136263e-05, + "loss": 0.3546, "step": 88945 }, { - "epoch": 3.13, - "learning_rate": 1.622837930563048e-05, - "loss": 0.2455, + "epoch": 3.2057519731862905, + "grad_norm": 0.21139764785766602, + "learning_rate": 1.5083816436318716e-05, + "loss": 0.3542, "step": 88950 }, { - "epoch": 3.13, - "learning_rate": 1.622571174554564e-05, - "loss": 0.2726, + "epoch": 3.205932172847515, + "grad_norm": 0.20650440454483032, + "learning_rate": 1.5081137741607249e-05, + "loss": 0.3755, "step": 88955 }, { - "epoch": 3.13, - "learning_rate": 1.6223044299389163e-05, - "loss": 0.2659, + "epoch": 3.2061123725087395, + "grad_norm": 0.22922226786613464, + "learning_rate": 1.5078459182038367e-05, + "loss": 0.3651, "step": 88960 }, { - "epoch": 3.13, - "learning_rate": 1.6220376967195673e-05, - "loss": 0.2429, + "epoch": 3.206292572169964, + "grad_norm": 0.2639060914516449, + "learning_rate": 1.5075780757648544e-05, + "loss": 0.4042, "step": 88965 }, { - "epoch": 3.13, - "learning_rate": 1.6217709748999787e-05, - "loss": 0.2912, + "epoch": 3.206472771831189, + "grad_norm": 0.20227624475955963, + "learning_rate": 1.5073102468474304e-05, + "loss": 0.3607, "step": 88970 }, { - "epoch": 3.13, - "learning_rate": 1.621504264483617e-05, - "loss": 0.2627, + "epoch": 3.2066529714924137, + "grad_norm": 0.23656994104385376, + "learning_rate": 1.5070424314552112e-05, + "loss": 0.4143, "step": 88975 }, { - "epoch": 3.13, - "learning_rate": 1.621237565473942e-05, - "loss": 0.2782, + "epoch": 3.2068331711536384, + "grad_norm": 0.21116869151592255, + 
"learning_rate": 1.5067746295918462e-05, + "loss": 0.3836, "step": 88980 }, { - "epoch": 3.13, - "learning_rate": 1.6210242144813453e-05, - "loss": 0.27, + "epoch": 3.2070133708148627, + "grad_norm": 0.20764394104480743, + "learning_rate": 1.506506841260985e-05, + "loss": 0.3969, "step": 88985 }, { - "epoch": 3.13, - "learning_rate": 1.620757536012436e-05, - "loss": 0.2398, + "epoch": 3.2071935704760874, + "grad_norm": 0.2260044366121292, + "learning_rate": 1.5062390664662757e-05, + "loss": 0.3741, "step": 88990 }, { - "epoch": 3.13, - "learning_rate": 1.6204908689599114e-05, - "loss": 0.2631, + "epoch": 3.207373770137312, + "grad_norm": 0.2571514844894409, + "learning_rate": 1.5059713052113666e-05, + "loss": 0.3532, "step": 88995 }, { - "epoch": 3.13, - "learning_rate": 1.6202242133272327e-05, - "loss": 0.2362, + "epoch": 3.207553969798537, + "grad_norm": 0.27601492404937744, + "learning_rate": 1.5057035574999067e-05, + "loss": 0.3968, "step": 89000 }, { - "epoch": 3.13, - "eval_loss": 0.25561562180519104, - "eval_runtime": 10.5484, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 3.207553969798537, + "eval_loss": 0.4308409094810486, + "eval_runtime": 3.5383, + "eval_samples_per_second": 28.262, + "eval_steps_per_second": 7.066, "step": 89000 }, { - "epoch": 3.13, - "learning_rate": 1.619957569117862e-05, - "loss": 0.2543, + "epoch": 3.207734169459761, + "grad_norm": 0.22598758339881897, + "learning_rate": 1.5054358233355423e-05, + "loss": 0.373, "step": 89005 }, { - "epoch": 3.13, - "learning_rate": 1.6196909363352637e-05, - "loss": 0.2474, + "epoch": 3.207914369120986, + "grad_norm": 0.23747682571411133, + "learning_rate": 1.5051681027219228e-05, + "loss": 0.366, "step": 89010 }, { - "epoch": 3.13, - "learning_rate": 1.6194243149828974e-05, - "loss": 0.2468, + "epoch": 3.2080945687822107, + "grad_norm": 0.1627287119626999, + "learning_rate": 1.5049003956626957e-05, + "loss": 0.3545, "step": 89015 }, { - "epoch": 3.13, - "learning_rate": 1.6191577050642267e-05, - "loss": 0.2666, + "epoch": 3.2082747684434354, + "grad_norm": 0.28767460584640503, + "learning_rate": 1.504632702161507e-05, + "loss": 0.3935, "step": 89020 }, { - "epoch": 3.13, - "learning_rate": 1.6188911065827118e-05, - "loss": 0.2764, + "epoch": 3.20845496810466, + "grad_norm": 0.28209447860717773, + "learning_rate": 1.5043650222220057e-05, + "loss": 0.3836, "step": 89025 }, { - "epoch": 3.13, - "learning_rate": 1.6186245195418162e-05, - "loss": 0.2577, + "epoch": 3.2086351677658844, + "grad_norm": 0.2737613022327423, + "learning_rate": 1.504097355847838e-05, + "loss": 0.3978, "step": 89030 }, { - "epoch": 3.13, - "learning_rate": 1.6183579439449998e-05, - "loss": 0.2554, + "epoch": 3.208815367427109, + "grad_norm": 0.2262285202741623, + "learning_rate": 1.5038297030426513e-05, + "loss": 0.3948, "step": 89035 }, { - "epoch": 3.13, - "learning_rate": 1.6180913797957246e-05, - "loss": 0.2567, + "epoch": 3.208995567088334, + "grad_norm": 0.24471984803676605, + "learning_rate": 1.5035620638100926e-05, + "loss": 0.4265, "step": 89040 }, { - "epoch": 3.13, - "learning_rate": 1.6178248270974515e-05, - "loss": 0.273, + "epoch": 3.2091757667495586, + "grad_norm": 0.2190433293581009, + "learning_rate": 1.5032944381538061e-05, + "loss": 0.3845, "step": 89045 }, { - "epoch": 3.13, - "learning_rate": 1.617558285853642e-05, - "loss": 0.2565, + "epoch": 3.209355966410783, + "grad_norm": 0.22950243949890137, + "learning_rate": 1.503026826077441e-05, + "loss": 0.4014, "step": 89050 }, { - "epoch": 3.13, - 
"learning_rate": 1.617291756067757e-05, - "loss": 0.2484, + "epoch": 3.2095361660720076, + "grad_norm": 0.23515675961971283, + "learning_rate": 1.502759227584643e-05, + "loss": 0.3942, "step": 89055 }, { - "epoch": 3.13, - "learning_rate": 1.6170252377432554e-05, - "loss": 0.2745, + "epoch": 3.2097163657332324, + "grad_norm": 0.24395707249641418, + "learning_rate": 1.502491642679056e-05, + "loss": 0.3746, "step": 89060 }, { - "epoch": 3.13, - "learning_rate": 1.6167587308836007e-05, - "loss": 0.2313, + "epoch": 3.209896565394457, + "grad_norm": 0.28980615735054016, + "learning_rate": 1.5022240713643287e-05, + "loss": 0.401, "step": 89065 }, { - "epoch": 3.13, - "learning_rate": 1.6164922354922507e-05, - "loss": 0.2444, + "epoch": 3.210076765055682, + "grad_norm": 0.1918734461069107, + "learning_rate": 1.5019565136441038e-05, + "loss": 0.3971, "step": 89070 }, { - "epoch": 3.13, - "learning_rate": 1.6162257515726678e-05, - "loss": 0.2472, + "epoch": 3.210256964716906, + "grad_norm": 0.22146931290626526, + "learning_rate": 1.50168896952203e-05, + "loss": 0.3756, "step": 89075 }, { - "epoch": 3.13, - "learning_rate": 1.6159592791283097e-05, - "loss": 0.2551, + "epoch": 3.210437164378131, + "grad_norm": 0.20011606812477112, + "learning_rate": 1.5014214390017496e-05, + "loss": 0.3584, "step": 89080 }, { - "epoch": 3.13, - "learning_rate": 1.615692818162639e-05, - "loss": 0.2704, + "epoch": 3.2106173640393556, + "grad_norm": 0.2056010365486145, + "learning_rate": 1.5011539220869084e-05, + "loss": 0.3509, "step": 89085 }, { - "epoch": 3.13, - "learning_rate": 1.6154263686791143e-05, - "loss": 0.266, + "epoch": 3.2107975637005803, + "grad_norm": 0.2631101608276367, + "learning_rate": 1.5008864187811522e-05, + "loss": 0.3855, "step": 89090 }, { - "epoch": 3.13, - "learning_rate": 1.6151599306811946e-05, - "loss": 0.2748, + "epoch": 3.210977763361805, + "grad_norm": 0.2047474980354309, + "learning_rate": 1.500618929088125e-05, + "loss": 0.3843, "step": 89095 }, { - "epoch": 3.13, - "learning_rate": 1.6148935041723396e-05, - "loss": 0.2489, + "epoch": 3.2111579630230294, + "grad_norm": 0.2543022632598877, + "learning_rate": 1.5003514530114712e-05, + "loss": 0.3901, "step": 89100 }, { - "epoch": 3.13, - "learning_rate": 1.61462708915601e-05, - "loss": 0.24, + "epoch": 3.211338162684254, + "grad_norm": 0.22199147939682007, + "learning_rate": 1.5000839905548359e-05, + "loss": 0.372, "step": 89105 }, { - "epoch": 3.14, - "learning_rate": 1.6143606856356644e-05, - "loss": 0.2735, + "epoch": 3.211518362345479, + "grad_norm": 0.19281704723834991, + "learning_rate": 1.4998165417218618e-05, + "loss": 0.341, "step": 89110 }, { - "epoch": 3.14, - "learning_rate": 1.6140942936147615e-05, - "loss": 0.2626, + "epoch": 3.2116985620067036, + "grad_norm": 0.2250460386276245, + "learning_rate": 1.499549106516194e-05, + "loss": 0.3559, "step": 89115 }, { - "epoch": 3.14, - "learning_rate": 1.6138279130967594e-05, - "loss": 0.2559, + "epoch": 3.2118787616679283, + "grad_norm": 0.2156555950641632, + "learning_rate": 1.4992816849414765e-05, + "loss": 0.3516, "step": 89120 }, { - "epoch": 3.14, - "learning_rate": 1.6135615440851185e-05, - "loss": 0.2752, + "epoch": 3.2120589613291526, + "grad_norm": 0.2305130809545517, + "learning_rate": 1.4990142770013512e-05, + "loss": 0.3911, "step": 89125 }, { - "epoch": 3.14, - "learning_rate": 1.6132951865832968e-05, - "loss": 0.2571, + "epoch": 3.2122391609903773, + "grad_norm": 0.23964077234268188, + "learning_rate": 1.4987468826994632e-05, + "loss": 0.4231, "step": 89130 }, { - 
"epoch": 3.14, - "learning_rate": 1.6130288405947534e-05, - "loss": 0.2491, + "epoch": 3.212419360651602, + "grad_norm": 0.2621869742870331, + "learning_rate": 1.4984795020394548e-05, + "loss": 0.378, "step": 89135 }, { - "epoch": 3.14, - "learning_rate": 1.6127625061229447e-05, - "loss": 0.2669, + "epoch": 3.212599560312827, + "grad_norm": 0.18443959951400757, + "learning_rate": 1.49821213502497e-05, + "loss": 0.3616, "step": 89140 }, { - "epoch": 3.14, - "learning_rate": 1.612496183171331e-05, - "loss": 0.2549, + "epoch": 3.212779759974051, + "grad_norm": 0.20639346539974213, + "learning_rate": 1.4979447816596508e-05, + "loss": 0.3615, "step": 89145 }, { - "epoch": 3.14, - "learning_rate": 1.612229871743369e-05, - "loss": 0.27, + "epoch": 3.212959959635276, + "grad_norm": 0.24619300663471222, + "learning_rate": 1.497677441947139e-05, + "loss": 0.4164, "step": 89150 }, { - "epoch": 3.14, - "learning_rate": 1.6119635718425163e-05, - "loss": 0.2503, + "epoch": 3.2131401592965005, + "grad_norm": 0.3176526427268982, + "learning_rate": 1.4974101158910791e-05, + "loss": 0.3682, "step": 89155 }, { - "epoch": 3.14, - "learning_rate": 1.6116972834722328e-05, - "loss": 0.2405, + "epoch": 3.2133203589577253, + "grad_norm": 0.2009301781654358, + "learning_rate": 1.497142803495112e-05, + "loss": 0.3765, "step": 89160 }, { - "epoch": 3.14, - "learning_rate": 1.6114310066359746e-05, - "loss": 0.2495, + "epoch": 3.21350055861895, + "grad_norm": 0.2197410613298416, + "learning_rate": 1.4968755047628796e-05, + "loss": 0.3795, "step": 89165 }, { - "epoch": 3.14, - "learning_rate": 1.6111647413371992e-05, - "loss": 0.2459, + "epoch": 3.2136807582801743, + "grad_norm": 0.23432065546512604, + "learning_rate": 1.4966082196980247e-05, + "loss": 0.3779, "step": 89170 }, { - "epoch": 3.14, - "learning_rate": 1.6108984875793626e-05, - "loss": 0.2613, + "epoch": 3.213860957941399, + "grad_norm": 0.29888421297073364, + "learning_rate": 1.4963409483041874e-05, + "loss": 0.3599, "step": 89175 }, { - "epoch": 3.14, - "learning_rate": 1.610632245365924e-05, - "loss": 0.2814, + "epoch": 3.2140411576026238, + "grad_norm": 0.19982536137104034, + "learning_rate": 1.496073690585012e-05, + "loss": 0.3989, "step": 89180 }, { - "epoch": 3.14, - "learning_rate": 1.6103660147003402e-05, - "loss": 0.2423, + "epoch": 3.2142213572638485, + "grad_norm": 0.22147499024868011, + "learning_rate": 1.4958064465441374e-05, + "loss": 0.4284, "step": 89185 }, { - "epoch": 3.14, - "learning_rate": 1.610099795586067e-05, - "loss": 0.2627, + "epoch": 3.214401556925073, + "grad_norm": 0.23238062858581543, + "learning_rate": 1.4955392161852056e-05, + "loss": 0.368, "step": 89190 }, { - "epoch": 3.14, - "learning_rate": 1.6098335880265608e-05, - "loss": 0.2506, + "epoch": 3.2145817565862975, + "grad_norm": 0.251499205827713, + "learning_rate": 1.4952719995118574e-05, + "loss": 0.4231, "step": 89195 }, { - "epoch": 3.14, - "learning_rate": 1.6095673920252795e-05, - "loss": 0.2539, + "epoch": 3.2147619562475223, + "grad_norm": 0.24169008433818817, + "learning_rate": 1.4950047965277342e-05, + "loss": 0.3682, "step": 89200 }, { - "epoch": 3.14, - "learning_rate": 1.6093012075856787e-05, - "loss": 0.2608, + "epoch": 3.214942155908747, + "grad_norm": 0.21621721982955933, + "learning_rate": 1.4947376072364752e-05, + "loss": 0.3801, "step": 89205 }, { - "epoch": 3.14, - "learning_rate": 1.6090350347112148e-05, - "loss": 0.267, + "epoch": 3.2151223555699717, + "grad_norm": 0.2918843626976013, + "learning_rate": 1.4944704316417223e-05, + "loss": 0.3745, 
"step": 89210 }, { - "epoch": 3.14, - "learning_rate": 1.608768873405343e-05, - "loss": 0.2623, + "epoch": 3.215302555231196, + "grad_norm": 0.1639867126941681, + "learning_rate": 1.4942032697471148e-05, + "loss": 0.3755, "step": 89215 }, { - "epoch": 3.14, - "learning_rate": 1.6085027236715212e-05, - "loss": 0.2672, + "epoch": 3.2154827548924207, + "grad_norm": 0.2433352768421173, + "learning_rate": 1.493936121556293e-05, + "loss": 0.3648, "step": 89220 }, { - "epoch": 3.14, - "learning_rate": 1.608236585513204e-05, - "loss": 0.2301, + "epoch": 3.2156629545536455, + "grad_norm": 0.20584319531917572, + "learning_rate": 1.4936689870728974e-05, + "loss": 0.3858, "step": 89225 }, { - "epoch": 3.14, - "learning_rate": 1.6079704589338463e-05, - "loss": 0.2536, + "epoch": 3.21584315421487, + "grad_norm": 0.2562110424041748, + "learning_rate": 1.4934018663005662e-05, + "loss": 0.3917, "step": 89230 }, { - "epoch": 3.14, - "learning_rate": 1.6077043439369037e-05, - "loss": 0.2361, + "epoch": 3.2160233538760945, + "grad_norm": 0.19980406761169434, + "learning_rate": 1.4931347592429401e-05, + "loss": 0.3751, "step": 89235 }, { - "epoch": 3.14, - "learning_rate": 1.6074382405258337e-05, - "loss": 0.273, + "epoch": 3.2162035535373192, + "grad_norm": 0.24331307411193848, + "learning_rate": 1.4928676659036586e-05, + "loss": 0.3779, "step": 89240 }, { - "epoch": 3.14, - "learning_rate": 1.6071721487040897e-05, - "loss": 0.2442, + "epoch": 3.216383753198544, + "grad_norm": 0.20756937563419342, + "learning_rate": 1.4926005862863584e-05, + "loss": 0.38, "step": 89245 }, { - "epoch": 3.14, - "learning_rate": 1.606906068475127e-05, - "loss": 0.2665, + "epoch": 3.2165639528597687, + "grad_norm": 0.21686789393424988, + "learning_rate": 1.492333520394682e-05, + "loss": 0.3911, "step": 89250 }, { - "epoch": 3.14, - "learning_rate": 1.606639999842399e-05, - "loss": 0.2583, + "epoch": 3.2167441525209934, + "grad_norm": 0.1949186772108078, + "learning_rate": 1.492066468232265e-05, + "loss": 0.3791, "step": 89255 }, { - "epoch": 3.14, - "learning_rate": 1.6063739428093626e-05, - "loss": 0.2467, + "epoch": 3.2169243521822177, + "grad_norm": 0.19429568946361542, + "learning_rate": 1.4917994298027482e-05, + "loss": 0.3407, "step": 89260 }, { - "epoch": 3.14, - "learning_rate": 1.6061078973794725e-05, - "loss": 0.2623, + "epoch": 3.2171045518434425, + "grad_norm": 0.25820302963256836, + "learning_rate": 1.4915324051097688e-05, + "loss": 0.3799, "step": 89265 }, { - "epoch": 3.14, - "learning_rate": 1.6058418635561805e-05, - "loss": 0.2491, + "epoch": 3.217284751504667, + "grad_norm": 0.2778339684009552, + "learning_rate": 1.4912653941569642e-05, + "loss": 0.3973, "step": 89270 }, { - "epoch": 3.14, - "learning_rate": 1.605575841342944e-05, - "loss": 0.2509, + "epoch": 3.217464951165892, + "grad_norm": 0.2582213878631592, + "learning_rate": 1.4909983969479747e-05, + "loss": 0.3764, "step": 89275 }, { - "epoch": 3.14, - "learning_rate": 1.6053098307432162e-05, - "loss": 0.2567, + "epoch": 3.217645150827116, + "grad_norm": 0.1809670329093933, + "learning_rate": 1.490731413486436e-05, + "loss": 0.3945, "step": 89280 }, { - "epoch": 3.14, - "learning_rate": 1.6050438317604496e-05, - "loss": 0.2441, + "epoch": 3.217825350488341, + "grad_norm": 0.2597140371799469, + "learning_rate": 1.490464443775986e-05, + "loss": 0.4122, "step": 89285 }, { - "epoch": 3.14, - "learning_rate": 1.6047778443980992e-05, - "loss": 0.2644, + "epoch": 3.2180055501495657, + "grad_norm": 0.23340429365634918, + "learning_rate": 1.490197487820263e-05, + 
"loss": 0.3487, "step": 89290 }, { - "epoch": 3.14, - "learning_rate": 1.6045118686596182e-05, - "loss": 0.2314, + "epoch": 3.2181857498107904, + "grad_norm": 0.2497396469116211, + "learning_rate": 1.4899305456229032e-05, + "loss": 0.3891, "step": 89295 }, { - "epoch": 3.14, - "learning_rate": 1.6042459045484614e-05, - "loss": 0.2554, + "epoch": 3.218365949472015, + "grad_norm": 0.28371942043304443, + "learning_rate": 1.4896636171875447e-05, + "loss": 0.4068, "step": 89300 }, { - "epoch": 3.14, - "learning_rate": 1.603979952068081e-05, - "loss": 0.2759, + "epoch": 3.2185461491332394, + "grad_norm": 0.21911130845546722, + "learning_rate": 1.489396702517824e-05, + "loss": 0.4053, "step": 89305 }, { - "epoch": 3.14, - "learning_rate": 1.6037140112219294e-05, - "loss": 0.2714, + "epoch": 3.218726348794464, + "grad_norm": 0.2320789396762848, + "learning_rate": 1.489129801617377e-05, + "loss": 0.3911, "step": 89310 }, { - "epoch": 3.14, - "learning_rate": 1.6034480820134615e-05, - "loss": 0.2457, + "epoch": 3.218906548455689, + "grad_norm": 0.21250326931476593, + "learning_rate": 1.4888629144898409e-05, + "loss": 0.3706, "step": 89315 }, { - "epoch": 3.14, - "learning_rate": 1.6031821644461296e-05, - "loss": 0.2447, + "epoch": 3.2190867481169136, + "grad_norm": 0.26723039150238037, + "learning_rate": 1.4885960411388517e-05, + "loss": 0.3946, "step": 89320 }, { - "epoch": 3.14, - "learning_rate": 1.6029162585233866e-05, - "loss": 0.2676, + "epoch": 3.2192669477781384, + "grad_norm": 0.20231136679649353, + "learning_rate": 1.4883291815680459e-05, + "loss": 0.3582, "step": 89325 }, { - "epoch": 3.14, - "learning_rate": 1.6026503642486835e-05, - "loss": 0.2476, + "epoch": 3.2194471474393627, + "grad_norm": 0.2787347435951233, + "learning_rate": 1.4880623357810597e-05, + "loss": 0.3847, "step": 89330 }, { - "epoch": 3.14, - "learning_rate": 1.602384481625475e-05, - "loss": 0.2467, + "epoch": 3.2196273471005874, + "grad_norm": 0.2298644334077835, + "learning_rate": 1.4877955037815267e-05, + "loss": 0.3643, "step": 89335 }, { - "epoch": 3.14, - "learning_rate": 1.602118610657212e-05, - "loss": 0.2636, + "epoch": 3.219807546761812, + "grad_norm": 0.23724813759326935, + "learning_rate": 1.4875286855730852e-05, + "loss": 0.4065, "step": 89340 }, { - "epoch": 3.14, - "learning_rate": 1.6018527513473476e-05, - "loss": 0.2591, + "epoch": 3.219987746423037, + "grad_norm": 0.1757110357284546, + "learning_rate": 1.4872618811593697e-05, + "loss": 0.3819, "step": 89345 }, { - "epoch": 3.14, - "learning_rate": 1.6015869036993326e-05, - "loss": 0.2523, + "epoch": 3.220167946084261, + "grad_norm": 0.20195148885250092, + "learning_rate": 1.4869950905440136e-05, + "loss": 0.3823, "step": 89350 }, { - "epoch": 3.14, - "learning_rate": 1.60132106771662e-05, - "loss": 0.2732, + "epoch": 3.220348145745486, + "grad_norm": 0.1882009357213974, + "learning_rate": 1.4867283137306547e-05, + "loss": 0.339, "step": 89355 }, { - "epoch": 3.14, - "learning_rate": 1.6010552434026615e-05, - "loss": 0.2625, + "epoch": 3.2205283454067106, + "grad_norm": 0.24396227300167084, + "learning_rate": 1.4864615507229246e-05, + "loss": 0.3963, "step": 89360 }, { - "epoch": 3.14, - "learning_rate": 1.6007894307609078e-05, - "loss": 0.2702, + "epoch": 3.2207085450679354, + "grad_norm": 0.2632978558540344, + "learning_rate": 1.4861948015244615e-05, + "loss": 0.4177, "step": 89365 }, { - "epoch": 3.14, - "learning_rate": 1.6005236297948107e-05, - "loss": 0.2705, + "epoch": 3.22088874472916, + "grad_norm": 0.20694774389266968, + "learning_rate": 
1.4859280661388974e-05, + "loss": 0.367, "step": 89370 }, { - "epoch": 3.14, - "learning_rate": 1.600257840507821e-05, - "loss": 0.2774, + "epoch": 3.2210689443903844, + "grad_norm": 0.21386392414569855, + "learning_rate": 1.4856613445698664e-05, + "loss": 0.359, "step": 89375 }, { - "epoch": 3.14, - "learning_rate": 1.5999920629033914e-05, - "loss": 0.2673, + "epoch": 3.221249144051609, + "grad_norm": 0.1855984479188919, + "learning_rate": 1.4853946368210036e-05, + "loss": 0.3242, "step": 89380 }, { - "epoch": 3.14, - "learning_rate": 1.5997262969849712e-05, - "loss": 0.25, + "epoch": 3.221429343712834, + "grad_norm": 0.23867137730121613, + "learning_rate": 1.485127942895943e-05, + "loss": 0.363, "step": 89385 }, { - "epoch": 3.14, - "learning_rate": 1.5994605427560112e-05, - "loss": 0.2511, + "epoch": 3.2216095433740586, + "grad_norm": 0.19533032178878784, + "learning_rate": 1.4848612627983166e-05, + "loss": 0.4011, "step": 89390 }, { - "epoch": 3.15, - "learning_rate": 1.599194800219963e-05, - "loss": 0.2466, + "epoch": 3.2217897430352833, + "grad_norm": 0.22574682533740997, + "learning_rate": 1.4845945965317598e-05, + "loss": 0.3869, "step": 89395 }, { - "epoch": 3.15, - "learning_rate": 1.598929069380277e-05, - "loss": 0.2701, + "epoch": 3.2219699426965076, + "grad_norm": 0.268163800239563, + "learning_rate": 1.4843279440999044e-05, + "loss": 0.3802, "step": 89400 }, { - "epoch": 3.15, - "learning_rate": 1.5986633502404024e-05, - "loss": 0.2589, + "epoch": 3.2221501423577323, + "grad_norm": 0.23362210392951965, + "learning_rate": 1.4840613055063845e-05, + "loss": 0.4094, "step": 89405 }, { - "epoch": 3.15, - "learning_rate": 1.598397642803791e-05, - "loss": 0.2441, + "epoch": 3.222330342018957, + "grad_norm": 0.2540440559387207, + "learning_rate": 1.483794680754833e-05, + "loss": 0.366, "step": 89410 }, { - "epoch": 3.15, - "learning_rate": 1.598131947073892e-05, - "loss": 0.2767, + "epoch": 3.222510541680182, + "grad_norm": 0.2499903291463852, + "learning_rate": 1.4835280698488812e-05, + "loss": 0.3656, "step": 89415 }, { - "epoch": 3.15, - "learning_rate": 1.597866263054155e-05, - "loss": 0.2721, + "epoch": 3.222690741341406, + "grad_norm": 0.21211077272891998, + "learning_rate": 1.4832614727921634e-05, + "loss": 0.3654, "step": 89420 }, { - "epoch": 3.15, - "learning_rate": 1.5976005907480303e-05, - "loss": 0.2415, + "epoch": 3.222870941002631, + "grad_norm": 0.20574845373630524, + "learning_rate": 1.482994889588312e-05, + "loss": 0.39, "step": 89425 }, { - "epoch": 3.15, - "learning_rate": 1.5973349301589668e-05, - "loss": 0.276, + "epoch": 3.2230511406638556, + "grad_norm": 0.21333856880664825, + "learning_rate": 1.4827283202409564e-05, + "loss": 0.4014, "step": 89430 }, { - "epoch": 3.15, - "learning_rate": 1.5970692812904148e-05, - "loss": 0.264, + "epoch": 3.2232313403250803, + "grad_norm": 0.1983533501625061, + "learning_rate": 1.4824617647537323e-05, + "loss": 0.3733, "step": 89435 }, { - "epoch": 3.15, - "learning_rate": 1.5968036441458233e-05, - "loss": 0.244, + "epoch": 3.223411539986305, + "grad_norm": 0.1866043210029602, + "learning_rate": 1.482195223130268e-05, + "loss": 0.3471, "step": 89440 }, { - "epoch": 3.15, - "learning_rate": 1.59653801872864e-05, - "loss": 0.2484, + "epoch": 3.2235917396475293, + "grad_norm": 0.22153297066688538, + "learning_rate": 1.481928695374198e-05, + "loss": 0.3847, "step": 89445 }, { - "epoch": 3.15, - "learning_rate": 1.5962724050423168e-05, - "loss": 0.2579, + "epoch": 3.223771939308754, + "grad_norm": 0.26498228311538696, + 
"learning_rate": 1.4816621814891524e-05, + "loss": 0.3581, "step": 89450 }, { - "epoch": 3.15, - "learning_rate": 1.5960068030902997e-05, - "loss": 0.2605, + "epoch": 3.223952138969979, + "grad_norm": 0.24101939797401428, + "learning_rate": 1.4813956814787616e-05, + "loss": 0.3513, "step": 89455 }, { - "epoch": 3.15, - "learning_rate": 1.595741212876039e-05, - "loss": 0.254, + "epoch": 3.2241323386312035, + "grad_norm": 0.23949559032917023, + "learning_rate": 1.481129195346658e-05, + "loss": 0.375, "step": 89460 }, { - "epoch": 3.15, - "learning_rate": 1.595475634402982e-05, - "loss": 0.2555, + "epoch": 3.224312538292428, + "grad_norm": 0.20637662708759308, + "learning_rate": 1.4808627230964717e-05, + "loss": 0.3764, "step": 89465 }, { - "epoch": 3.15, - "learning_rate": 1.5952100676745792e-05, - "loss": 0.2482, + "epoch": 3.2244927379536525, + "grad_norm": 0.25291964411735535, + "learning_rate": 1.4805962647318333e-05, + "loss": 0.3823, "step": 89470 }, { - "epoch": 3.15, - "learning_rate": 1.594944512694276e-05, - "loss": 0.2741, + "epoch": 3.2246729376148773, + "grad_norm": 0.2150401920080185, + "learning_rate": 1.4803298202563737e-05, + "loss": 0.3555, "step": 89475 }, { - "epoch": 3.15, - "learning_rate": 1.5946789694655228e-05, - "loss": 0.2761, + "epoch": 3.224853137276102, + "grad_norm": 0.25628653168678284, + "learning_rate": 1.4800633896737226e-05, + "loss": 0.3762, "step": 89480 }, { - "epoch": 3.15, - "learning_rate": 1.5944134379917657e-05, - "loss": 0.2633, + "epoch": 3.2250333369373267, + "grad_norm": 0.2144256830215454, + "learning_rate": 1.4797969729875106e-05, + "loss": 0.3503, "step": 89485 }, { - "epoch": 3.15, - "learning_rate": 1.5941479182764542e-05, - "loss": 0.236, + "epoch": 3.225213536598551, + "grad_norm": 0.2198164016008377, + "learning_rate": 1.4795305702013674e-05, + "loss": 0.3467, "step": 89490 }, { - "epoch": 3.15, - "learning_rate": 1.5938824103230344e-05, - "loss": 0.2626, + "epoch": 3.2253937362597758, + "grad_norm": 0.2541256844997406, + "learning_rate": 1.4792641813189222e-05, + "loss": 0.3787, "step": 89495 }, { - "epoch": 3.15, - "learning_rate": 1.5936169141349548e-05, - "loss": 0.2246, + "epoch": 3.2255739359210005, + "grad_norm": 0.21869847178459167, + "learning_rate": 1.4789978063438054e-05, + "loss": 0.3738, "step": 89500 }, { - "epoch": 3.15, - "eval_loss": 0.25540173053741455, - "eval_runtime": 10.5366, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 3.2255739359210005, + "eval_loss": 0.43070846796035767, + "eval_runtime": 3.5348, + "eval_samples_per_second": 28.29, + "eval_steps_per_second": 7.073, "step": 89500 }, { - "epoch": 3.15, - "learning_rate": 1.5933514297156614e-05, - "loss": 0.2681, + "epoch": 3.2257541355822252, + "grad_norm": 0.3209418058395386, + "learning_rate": 1.4787314452796458e-05, + "loss": 0.3811, "step": 89505 }, { - "epoch": 3.15, - "learning_rate": 1.593085957068603e-05, - "loss": 0.2602, + "epoch": 3.2259343352434495, + "grad_norm": 0.2772040069103241, + "learning_rate": 1.4784650981300718e-05, + "loss": 0.3696, "step": 89510 }, { - "epoch": 3.15, - "learning_rate": 1.5928204961972257e-05, - "loss": 0.2618, + "epoch": 3.2261145349046743, + "grad_norm": 0.2352096289396286, + "learning_rate": 1.4781987648987138e-05, + "loss": 0.3697, "step": 89515 }, { - "epoch": 3.15, - "learning_rate": 1.5925550471049756e-05, - "loss": 0.2692, + "epoch": 3.226294734565899, + "grad_norm": 0.19842156767845154, + "learning_rate": 1.4779324455891994e-05, + "loss": 0.3671, "step": 89520 }, { - "epoch": 
3.15, - "learning_rate": 1.592289609795301e-05, - "loss": 0.2481, + "epoch": 3.2264749342271237, + "grad_norm": 0.2521081864833832, + "learning_rate": 1.4776661402051578e-05, + "loss": 0.3927, "step": 89525 }, { - "epoch": 3.15, - "learning_rate": 1.5920241842716477e-05, - "loss": 0.2424, + "epoch": 3.2266551338883485, + "grad_norm": 0.23003843426704407, + "learning_rate": 1.4773998487502178e-05, + "loss": 0.3673, "step": 89530 }, { - "epoch": 3.15, - "learning_rate": 1.5917587705374614e-05, - "loss": 0.2458, + "epoch": 3.2268353335495727, + "grad_norm": 0.20491987466812134, + "learning_rate": 1.4771335712280054e-05, + "loss": 0.37, "step": 89535 }, { - "epoch": 3.15, - "learning_rate": 1.5914933685961886e-05, - "loss": 0.266, + "epoch": 3.2270155332107975, + "grad_norm": 0.21708033978939056, + "learning_rate": 1.4768673076421512e-05, + "loss": 0.3611, "step": 89540 }, { - "epoch": 3.15, - "learning_rate": 1.5912279784512767e-05, - "loss": 0.2782, + "epoch": 3.227195732872022, + "grad_norm": 0.21462254226207733, + "learning_rate": 1.4766010579962808e-05, + "loss": 0.392, "step": 89545 }, { - "epoch": 3.15, - "learning_rate": 1.5909626001061707e-05, - "loss": 0.2647, + "epoch": 3.227375932533247, + "grad_norm": 0.20276306569576263, + "learning_rate": 1.4763348222940243e-05, + "loss": 0.3416, "step": 89550 }, { - "epoch": 3.15, - "learning_rate": 1.590697233564316e-05, - "loss": 0.2916, + "epoch": 3.2275561321944712, + "grad_norm": 0.2577986419200897, + "learning_rate": 1.4760686005390068e-05, + "loss": 0.3846, "step": 89555 }, { - "epoch": 3.15, - "learning_rate": 1.5904318788291577e-05, - "loss": 0.2814, + "epoch": 3.227736331855696, + "grad_norm": 0.22566761076450348, + "learning_rate": 1.4758023927348563e-05, + "loss": 0.416, "step": 89560 }, { - "epoch": 3.15, - "learning_rate": 1.5901665359041425e-05, - "loss": 0.2549, + "epoch": 3.2279165315169207, + "grad_norm": 0.2527579665184021, + "learning_rate": 1.4755361988852002e-05, + "loss": 0.4009, "step": 89565 }, { - "epoch": 3.15, - "learning_rate": 1.5899012047927155e-05, - "loss": 0.2596, + "epoch": 3.2280967311781454, + "grad_norm": 0.23741693794727325, + "learning_rate": 1.4752700189936649e-05, + "loss": 0.3604, "step": 89570 }, { - "epoch": 3.15, - "learning_rate": 1.5896358854983216e-05, - "loss": 0.2437, + "epoch": 3.22827693083937, + "grad_norm": 0.2663268446922302, + "learning_rate": 1.4750038530638769e-05, + "loss": 0.3795, "step": 89575 }, { - "epoch": 3.15, - "learning_rate": 1.589370578024405e-05, - "loss": 0.2533, + "epoch": 3.2284571305005945, + "grad_norm": 0.2541162371635437, + "learning_rate": 1.4747377010994634e-05, + "loss": 0.367, "step": 89580 }, { - "epoch": 3.15, - "learning_rate": 1.589105282374412e-05, - "loss": 0.2313, + "epoch": 3.228637330161819, + "grad_norm": 0.21635161340236664, + "learning_rate": 1.4744715631040495e-05, + "loss": 0.3491, "step": 89585 }, { - "epoch": 3.15, - "learning_rate": 1.5888399985517863e-05, - "loss": 0.2523, + "epoch": 3.228817529823044, + "grad_norm": 0.2601666748523712, + "learning_rate": 1.4742054390812631e-05, + "loss": 0.4016, "step": 89590 }, { - "epoch": 3.15, - "learning_rate": 1.588574726559973e-05, - "loss": 0.2657, + "epoch": 3.2289977294842687, + "grad_norm": 0.2198321521282196, + "learning_rate": 1.4739393290347286e-05, + "loss": 0.3752, "step": 89595 }, { - "epoch": 3.15, - "learning_rate": 1.5883094664024155e-05, - "loss": 0.2559, + "epoch": 3.2291779291454934, + "grad_norm": 0.22527672350406647, + "learning_rate": 1.4736732329680714e-05, + "loss": 0.3589, "step": 
89600 }, { - "epoch": 3.15, - "learning_rate": 1.58804421808256e-05, - "loss": 0.2689, + "epoch": 3.2293581288067177, + "grad_norm": 0.28329557180404663, + "learning_rate": 1.4734071508849184e-05, + "loss": 0.3664, "step": 89605 }, { - "epoch": 3.15, - "learning_rate": 1.587778981603849e-05, - "loss": 0.25, + "epoch": 3.2295383284679424, + "grad_norm": 0.2408505380153656, + "learning_rate": 1.4731410827888947e-05, + "loss": 0.4055, "step": 89610 }, { - "epoch": 3.15, - "learning_rate": 1.587513756969726e-05, - "loss": 0.2579, + "epoch": 3.229718528129167, + "grad_norm": 0.2271721065044403, + "learning_rate": 1.4728750286836236e-05, + "loss": 0.3716, "step": 89615 }, { - "epoch": 3.15, - "learning_rate": 1.5872485441836353e-05, - "loss": 0.2435, + "epoch": 3.229898727790392, + "grad_norm": 0.20692966878414154, + "learning_rate": 1.472608988572733e-05, + "loss": 0.3708, "step": 89620 }, { - "epoch": 3.15, - "learning_rate": 1.586983343249022e-05, - "loss": 0.2647, + "epoch": 3.2300789274516166, + "grad_norm": 0.19603079557418823, + "learning_rate": 1.472342962459844e-05, + "loss": 0.3608, "step": 89625 }, { - "epoch": 3.15, - "learning_rate": 1.5867181541693278e-05, - "loss": 0.2701, + "epoch": 3.230259127112841, + "grad_norm": 0.21330444514751434, + "learning_rate": 1.4720769503485845e-05, + "loss": 0.3808, "step": 89630 }, { - "epoch": 3.15, - "learning_rate": 1.586452976947996e-05, - "loss": 0.2476, + "epoch": 3.2304393267740656, + "grad_norm": 0.22534000873565674, + "learning_rate": 1.471810952242578e-05, + "loss": 0.4035, "step": 89635 }, { - "epoch": 3.15, - "learning_rate": 1.586187811588471e-05, - "loss": 0.2859, + "epoch": 3.2306195264352904, + "grad_norm": 0.22428487241268158, + "learning_rate": 1.4715449681454468e-05, + "loss": 0.3776, "step": 89640 }, { - "epoch": 3.15, - "learning_rate": 1.5859226580941945e-05, - "loss": 0.2638, + "epoch": 3.230799726096515, + "grad_norm": 0.2968350052833557, + "learning_rate": 1.4712789980608177e-05, + "loss": 0.3924, "step": 89645 }, { - "epoch": 3.15, - "learning_rate": 1.5856575164686104e-05, - "loss": 0.2614, + "epoch": 3.2309799257577394, + "grad_norm": 0.2184687852859497, + "learning_rate": 1.4710130419923123e-05, + "loss": 0.3787, "step": 89650 }, { - "epoch": 3.15, - "learning_rate": 1.5853923867151596e-05, - "loss": 0.243, + "epoch": 3.231160125418964, + "grad_norm": 0.23696725070476532, + "learning_rate": 1.470747099943555e-05, + "loss": 0.3938, "step": 89655 }, { - "epoch": 3.15, - "learning_rate": 1.585127268837287e-05, - "loss": 0.2492, + "epoch": 3.231340325080189, + "grad_norm": 0.24137650430202484, + "learning_rate": 1.4704811719181693e-05, + "loss": 0.4011, "step": 89660 }, { - "epoch": 3.15, - "learning_rate": 1.584862162838434e-05, - "loss": 0.2499, + "epoch": 3.2315205247414136, + "grad_norm": 0.2342267483472824, + "learning_rate": 1.4702152579197782e-05, + "loss": 0.3619, "step": 89665 }, { - "epoch": 3.15, - "learning_rate": 1.5845970687220424e-05, - "loss": 0.2493, + "epoch": 3.2317007244026383, + "grad_norm": 0.2602199912071228, + "learning_rate": 1.4699493579520052e-05, + "loss": 0.4182, "step": 89670 }, { - "epoch": 3.16, - "learning_rate": 1.584331986491554e-05, - "loss": 0.2587, + "epoch": 3.2318809240638626, + "grad_norm": 0.2467493712902069, + "learning_rate": 1.4696834720184725e-05, + "loss": 0.3843, "step": 89675 }, { - "epoch": 3.16, - "learning_rate": 1.5840669161504125e-05, - "loss": 0.2524, + "epoch": 3.2320611237250874, + "grad_norm": 0.23336918652057648, + "learning_rate": 1.4694176001228027e-05, + "loss": 
0.3789, "step": 89680 }, { - "epoch": 3.16, - "learning_rate": 1.583801857702058e-05, - "loss": 0.2829, + "epoch": 3.232241323386312, + "grad_norm": 0.22967779636383057, + "learning_rate": 1.4691517422686191e-05, + "loss": 0.3782, "step": 89685 }, { - "epoch": 3.16, - "learning_rate": 1.5835368111499328e-05, - "loss": 0.2596, + "epoch": 3.232421523047537, + "grad_norm": 0.19825366139411926, + "learning_rate": 1.4688858984595432e-05, + "loss": 0.3776, "step": 89690 }, { - "epoch": 3.16, - "learning_rate": 1.5832717764974772e-05, - "loss": 0.2582, + "epoch": 3.232601722708761, + "grad_norm": 0.18190155923366547, + "learning_rate": 1.4686200686991972e-05, + "loss": 0.3398, "step": 89695 }, { - "epoch": 3.16, - "learning_rate": 1.5830067537481337e-05, - "loss": 0.2688, + "epoch": 3.232781922369986, + "grad_norm": 0.21826298534870148, + "learning_rate": 1.468354252991203e-05, + "loss": 0.371, "step": 89700 }, { - "epoch": 3.16, - "learning_rate": 1.582741742905344e-05, - "loss": 0.243, + "epoch": 3.2329621220312106, + "grad_norm": 0.2672412097454071, + "learning_rate": 1.4680884513391824e-05, + "loss": 0.3691, "step": 89705 }, { - "epoch": 3.16, - "learning_rate": 1.582476743972548e-05, - "loss": 0.2272, + "epoch": 3.2331423216924353, + "grad_norm": 0.25871455669403076, + "learning_rate": 1.4678226637467574e-05, + "loss": 0.3965, "step": 89710 }, { - "epoch": 3.16, - "learning_rate": 1.582211756953186e-05, - "loss": 0.2525, + "epoch": 3.23332252135366, + "grad_norm": 0.24801315367221832, + "learning_rate": 1.467556890217549e-05, + "loss": 0.4047, "step": 89715 }, { - "epoch": 3.16, - "learning_rate": 1.5819467818507005e-05, - "loss": 0.2644, + "epoch": 3.2335027210148843, + "grad_norm": 0.20735400915145874, + "learning_rate": 1.4672911307551768e-05, + "loss": 0.3561, "step": 89720 }, { - "epoch": 3.16, - "learning_rate": 1.5816818186685306e-05, - "loss": 0.2635, + "epoch": 3.233682920676109, + "grad_norm": 0.24072608351707458, + "learning_rate": 1.4670253853632642e-05, + "loss": 0.3899, "step": 89725 }, { - "epoch": 3.16, - "learning_rate": 1.5814168674101174e-05, - "loss": 0.2387, + "epoch": 3.233863120337334, + "grad_norm": 0.24866987764835358, + "learning_rate": 1.4667596540454296e-05, + "loss": 0.3894, "step": 89730 }, { - "epoch": 3.16, - "learning_rate": 1.5811519280789e-05, - "loss": 0.2879, + "epoch": 3.2340433199985585, + "grad_norm": 0.2773805856704712, + "learning_rate": 1.466493936805296e-05, + "loss": 0.4052, "step": 89735 }, { - "epoch": 3.16, - "learning_rate": 1.58088700067832e-05, - "loss": 0.2594, + "epoch": 3.234223519659783, + "grad_norm": 0.2559952735900879, + "learning_rate": 1.4662282336464817e-05, + "loss": 0.3995, "step": 89740 }, { - "epoch": 3.16, - "learning_rate": 1.5806220852118166e-05, - "loss": 0.2604, + "epoch": 3.2344037193210076, + "grad_norm": 0.20046544075012207, + "learning_rate": 1.4659625445726068e-05, + "loss": 0.388, "step": 89745 }, { - "epoch": 3.16, - "learning_rate": 1.5803571816828287e-05, - "loss": 0.2514, + "epoch": 3.2345839189822323, + "grad_norm": 0.24034267663955688, + "learning_rate": 1.4656968695872936e-05, + "loss": 0.3999, "step": 89750 }, { - "epoch": 3.16, - "learning_rate": 1.5800922900947973e-05, - "loss": 0.2494, + "epoch": 3.234764118643457, + "grad_norm": 0.20936627686023712, + "learning_rate": 1.4654312086941597e-05, + "loss": 0.3932, "step": 89755 }, { - "epoch": 3.16, - "learning_rate": 1.5798274104511614e-05, - "loss": 0.2555, + "epoch": 3.2349443183046818, + "grad_norm": 0.25857529044151306, + "learning_rate": 
1.4651655618968246e-05, + "loss": 0.3954, "step": 89760 }, { - "epoch": 3.16, - "learning_rate": 1.5795625427553606e-05, - "loss": 0.2639, + "epoch": 3.235124517965906, + "grad_norm": 0.20144201815128326, + "learning_rate": 1.4648999291989093e-05, + "loss": 0.3765, "step": 89765 }, { - "epoch": 3.16, - "learning_rate": 1.5792976870108324e-05, - "loss": 0.2454, + "epoch": 3.235304717627131, + "grad_norm": 0.25030654668807983, + "learning_rate": 1.4646343106040314e-05, + "loss": 0.3872, "step": 89770 }, { - "epoch": 3.16, - "learning_rate": 1.5790328432210176e-05, - "loss": 0.253, + "epoch": 3.2354849172883555, + "grad_norm": 0.18118837475776672, + "learning_rate": 1.464368706115811e-05, + "loss": 0.3797, "step": 89775 }, { - "epoch": 3.16, - "learning_rate": 1.5787680113893544e-05, - "loss": 0.2621, + "epoch": 3.2356651169495803, + "grad_norm": 0.2649378180503845, + "learning_rate": 1.4641031157378665e-05, + "loss": 0.3662, "step": 89780 }, { - "epoch": 3.16, - "learning_rate": 1.5785031915192818e-05, - "loss": 0.2478, + "epoch": 3.2358453166108045, + "grad_norm": 0.22662107646465302, + "learning_rate": 1.4638375394738157e-05, + "loss": 0.3622, "step": 89785 }, { - "epoch": 3.16, - "learning_rate": 1.578238383614237e-05, - "loss": 0.2364, + "epoch": 3.2360255162720293, + "grad_norm": 0.2865629494190216, + "learning_rate": 1.4635719773272788e-05, + "loss": 0.4174, "step": 89790 }, { - "epoch": 3.16, - "learning_rate": 1.5779735876776604e-05, - "loss": 0.2658, + "epoch": 3.236205715933254, + "grad_norm": 0.2678220868110657, + "learning_rate": 1.4633064293018728e-05, + "loss": 0.3557, "step": 89795 }, { - "epoch": 3.16, - "learning_rate": 1.5777088037129888e-05, - "loss": 0.2406, + "epoch": 3.2363859155944787, + "grad_norm": 0.2645963728427887, + "learning_rate": 1.4630408954012153e-05, + "loss": 0.3925, "step": 89800 }, { - "epoch": 3.16, - "learning_rate": 1.5774440317236606e-05, - "loss": 0.2387, + "epoch": 3.2365661152557035, + "grad_norm": 0.21265912055969238, + "learning_rate": 1.4627753756289258e-05, + "loss": 0.349, "step": 89805 }, { - "epoch": 3.16, - "learning_rate": 1.5771792717131133e-05, - "loss": 0.2783, + "epoch": 3.2367463149169278, + "grad_norm": 0.23955461382865906, + "learning_rate": 1.4625098699886204e-05, + "loss": 0.42, "step": 89810 }, { - "epoch": 3.16, - "learning_rate": 1.576914523684785e-05, - "loss": 0.2633, + "epoch": 3.2369265145781525, + "grad_norm": 0.28037208318710327, + "learning_rate": 1.4622443784839174e-05, + "loss": 0.3545, "step": 89815 }, { - "epoch": 3.16, - "learning_rate": 1.576649787642114e-05, - "loss": 0.2534, + "epoch": 3.2371067142393772, + "grad_norm": 0.20563095808029175, + "learning_rate": 1.4619789011184343e-05, + "loss": 0.3833, "step": 89820 }, { - "epoch": 3.16, - "learning_rate": 1.5763850635885367e-05, - "loss": 0.2456, + "epoch": 3.237286913900602, + "grad_norm": 0.2627568542957306, + "learning_rate": 1.4617134378957864e-05, + "loss": 0.3868, "step": 89825 }, { - "epoch": 3.16, - "learning_rate": 1.5761203515274898e-05, - "loss": 0.2555, + "epoch": 3.2374671135618267, + "grad_norm": 0.23573757708072662, + "learning_rate": 1.4614479888195931e-05, + "loss": 0.3716, "step": 89830 }, { - "epoch": 3.16, - "learning_rate": 1.5758556514624118e-05, - "loss": 0.2793, + "epoch": 3.237647313223051, + "grad_norm": 0.22677142918109894, + "learning_rate": 1.4611825538934699e-05, + "loss": 0.39, "step": 89835 }, { - "epoch": 3.16, - "learning_rate": 1.575590963396739e-05, - "loss": 0.259, + "epoch": 3.2378275128842757, + "grad_norm": 
0.23571011424064636, + "learning_rate": 1.4609171331210324e-05, + "loss": 0.383, "step": 89840 }, { - "epoch": 3.16, - "learning_rate": 1.5753262873339093e-05, - "loss": 0.2493, + "epoch": 3.2380077125455005, + "grad_norm": 0.2776787281036377, + "learning_rate": 1.4606517265058982e-05, + "loss": 0.3464, "step": 89845 }, { - "epoch": 3.16, - "learning_rate": 1.575061623277357e-05, - "loss": 0.2773, + "epoch": 3.238187912206725, + "grad_norm": 0.21779267489910126, + "learning_rate": 1.4603863340516826e-05, + "loss": 0.3814, "step": 89850 }, { - "epoch": 3.16, - "learning_rate": 1.574796971230521e-05, - "loss": 0.2326, + "epoch": 3.2383681118679495, + "grad_norm": 0.2640026807785034, + "learning_rate": 1.4601209557620027e-05, + "loss": 0.3906, "step": 89855 }, { - "epoch": 3.16, - "learning_rate": 1.574532331196836e-05, - "loss": 0.2391, + "epoch": 3.238548311529174, + "grad_norm": 0.2678093910217285, + "learning_rate": 1.4598555916404732e-05, + "loss": 0.3533, "step": 89860 }, { - "epoch": 3.16, - "learning_rate": 1.5742677031797388e-05, - "loss": 0.2515, + "epoch": 3.238728511190399, + "grad_norm": 0.24538059532642365, + "learning_rate": 1.4595902416907092e-05, + "loss": 0.3712, "step": 89865 }, { - "epoch": 3.16, - "learning_rate": 1.5740030871826656e-05, - "loss": 0.2605, + "epoch": 3.2389087108516237, + "grad_norm": 0.27062126994132996, + "learning_rate": 1.4593249059163283e-05, + "loss": 0.3648, "step": 89870 }, { - "epoch": 3.16, - "learning_rate": 1.5737384832090523e-05, - "loss": 0.2575, + "epoch": 3.2390889105128484, + "grad_norm": 0.24212132394313812, + "learning_rate": 1.4590595843209432e-05, + "loss": 0.3855, "step": 89875 }, { - "epoch": 3.16, - "learning_rate": 1.573473891262334e-05, - "loss": 0.2595, + "epoch": 3.2392691101740727, + "grad_norm": 0.2248118668794632, + "learning_rate": 1.4587942769081689e-05, + "loss": 0.347, "step": 89880 }, { - "epoch": 3.16, - "learning_rate": 1.5732093113459466e-05, - "loss": 0.242, + "epoch": 3.2394493098352974, + "grad_norm": 0.21070916950702667, + "learning_rate": 1.4585289836816218e-05, + "loss": 0.3758, "step": 89885 }, { - "epoch": 3.16, - "learning_rate": 1.572944743463326e-05, - "loss": 0.2755, + "epoch": 3.239629509496522, + "grad_norm": 0.19322839379310608, + "learning_rate": 1.4582637046449148e-05, + "loss": 0.3913, "step": 89890 }, { - "epoch": 3.16, - "learning_rate": 1.5726801876179066e-05, - "loss": 0.2453, + "epoch": 3.239809709157747, + "grad_norm": 0.2393057942390442, + "learning_rate": 1.4579984398016649e-05, + "loss": 0.3938, "step": 89895 }, { - "epoch": 3.16, - "learning_rate": 1.572415643813124e-05, - "loss": 0.2467, + "epoch": 3.2399899088189716, + "grad_norm": 0.24970926344394684, + "learning_rate": 1.4577331891554839e-05, + "loss": 0.3932, "step": 89900 }, { - "epoch": 3.16, - "learning_rate": 1.5721511120524117e-05, - "loss": 0.2782, + "epoch": 3.240170108480196, + "grad_norm": 0.23957009613513947, + "learning_rate": 1.4574679527099852e-05, + "loss": 0.3849, "step": 89905 }, { - "epoch": 3.16, - "learning_rate": 1.5718865923392073e-05, - "loss": 0.2259, + "epoch": 3.2403503081414207, + "grad_norm": 0.22815951704978943, + "learning_rate": 1.4572027304687847e-05, + "loss": 0.3937, "step": 89910 }, { - "epoch": 3.16, - "learning_rate": 1.5716220846769434e-05, - "loss": 0.2757, + "epoch": 3.2405305078026454, + "grad_norm": 0.20002481341362, + "learning_rate": 1.4569375224354953e-05, + "loss": 0.3943, "step": 89915 }, { - "epoch": 3.16, - "learning_rate": 1.5713575890690553e-05, - "loss": 0.2559, + "epoch": 
3.24071070746387, + "grad_norm": 0.2669081687927246, + "learning_rate": 1.4566723286137301e-05, + "loss": 0.4006, "step": 89920 }, { - "epoch": 3.16, - "learning_rate": 1.571093105518976e-05, - "loss": 0.2471, + "epoch": 3.2408909071250944, + "grad_norm": 0.2839129567146301, + "learning_rate": 1.4564071490071024e-05, + "loss": 0.3667, "step": 89925 }, { - "epoch": 3.16, - "learning_rate": 1.570828634030141e-05, - "loss": 0.2502, + "epoch": 3.241071106786319, + "grad_norm": 0.1999591588973999, + "learning_rate": 1.4561419836192241e-05, + "loss": 0.3618, "step": 89930 }, { - "epoch": 3.16, - "learning_rate": 1.5705641746059843e-05, - "loss": 0.2415, + "epoch": 3.241251306447544, + "grad_norm": 0.21648964285850525, + "learning_rate": 1.4558768324537105e-05, + "loss": 0.3854, "step": 89935 }, { - "epoch": 3.16, - "learning_rate": 1.5702997272499387e-05, - "loss": 0.2449, + "epoch": 3.2414315061087686, + "grad_norm": 0.17775964736938477, + "learning_rate": 1.4556116955141729e-05, + "loss": 0.3296, "step": 89940 }, { - "epoch": 3.16, - "learning_rate": 1.5700352919654386e-05, - "loss": 0.2644, + "epoch": 3.2416117057699934, + "grad_norm": 0.24124689400196075, + "learning_rate": 1.4553465728042236e-05, + "loss": 0.3892, "step": 89945 }, { - "epoch": 3.16, - "learning_rate": 1.569770868755917e-05, - "loss": 0.2435, + "epoch": 3.2417919054312176, + "grad_norm": 0.19329720735549927, + "learning_rate": 1.4550814643274752e-05, + "loss": 0.3676, "step": 89950 }, { - "epoch": 3.16, - "learning_rate": 1.5695064576248088e-05, - "loss": 0.2489, + "epoch": 3.2419721050924424, + "grad_norm": 0.1987384855747223, + "learning_rate": 1.4548163700875384e-05, + "loss": 0.3755, "step": 89955 }, { - "epoch": 3.17, - "learning_rate": 1.569242058575545e-05, - "loss": 0.2684, + "epoch": 3.242152304753667, + "grad_norm": 0.2366037666797638, + "learning_rate": 1.4545512900880271e-05, + "loss": 0.3561, "step": 89960 }, { - "epoch": 3.17, - "learning_rate": 1.5689776716115594e-05, - "loss": 0.2492, + "epoch": 3.242332504414892, + "grad_norm": 0.2112271934747696, + "learning_rate": 1.454286224332553e-05, + "loss": 0.381, "step": 89965 }, { - "epoch": 3.17, - "learning_rate": 1.568713296736286e-05, - "loss": 0.266, + "epoch": 3.242512704076116, + "grad_norm": 0.2448207587003708, + "learning_rate": 1.454021172824725e-05, + "loss": 0.4151, "step": 89970 }, { - "epoch": 3.17, - "learning_rate": 1.5684489339531563e-05, - "loss": 0.2666, + "epoch": 3.242692903737341, + "grad_norm": 0.24623465538024902, + "learning_rate": 1.4537561355681567e-05, + "loss": 0.369, "step": 89975 }, { - "epoch": 3.17, - "learning_rate": 1.568184583265603e-05, - "loss": 0.2632, + "epoch": 3.2428731033985656, + "grad_norm": 0.2527907192707062, + "learning_rate": 1.4534911125664585e-05, + "loss": 0.4135, "step": 89980 }, { - "epoch": 3.17, - "learning_rate": 1.567920244677059e-05, - "loss": 0.2733, + "epoch": 3.2430533030597903, + "grad_norm": 0.21829581260681152, + "learning_rate": 1.4532261038232414e-05, + "loss": 0.3546, "step": 89985 }, { - "epoch": 3.17, - "learning_rate": 1.567655918190957e-05, - "loss": 0.2579, + "epoch": 3.243233502721015, + "grad_norm": 0.196324422955513, + "learning_rate": 1.4529611093421158e-05, + "loss": 0.4074, "step": 89990 }, { - "epoch": 3.17, - "learning_rate": 1.5673916038107283e-05, - "loss": 0.2433, + "epoch": 3.2434137023822394, + "grad_norm": 0.22843465209007263, + "learning_rate": 1.4526961291266916e-05, + "loss": 0.3604, "step": 89995 }, { - "epoch": 3.17, - "learning_rate": 1.5671273015398042e-05, - "loss": 
0.2469, + "epoch": 3.243593902043464, + "grad_norm": 0.19749246537685394, + "learning_rate": 1.452431163180581e-05, + "loss": 0.3636, "step": 90000 }, { - "epoch": 3.17, - "eval_loss": 0.2552323043346405, - "eval_runtime": 10.5438, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 3.243593902043464, + "eval_loss": 0.4301738440990448, + "eval_runtime": 3.5334, + "eval_samples_per_second": 28.301, + "eval_steps_per_second": 7.075, "step": 90000 }, { - "epoch": 3.17, - "learning_rate": 1.5668630113816173e-05, - "loss": 0.2563, + "epoch": 3.243774101704689, + "grad_norm": 0.2458840310573578, + "learning_rate": 1.4521662115073925e-05, + "loss": 0.3655, "step": 90005 }, { - "epoch": 3.17, - "learning_rate": 1.5665987333396003e-05, - "loss": 0.256, + "epoch": 3.2439543013659136, + "grad_norm": 0.23517417907714844, + "learning_rate": 1.4519012741107368e-05, + "loss": 0.3811, "step": 90010 }, { - "epoch": 3.17, - "learning_rate": 1.5663344674171832e-05, - "loss": 0.2445, + "epoch": 3.244134501027138, + "grad_norm": 0.21552741527557373, + "learning_rate": 1.4516363509942233e-05, + "loss": 0.3699, "step": 90015 }, { - "epoch": 3.17, - "learning_rate": 1.5660702136177968e-05, - "loss": 0.253, + "epoch": 3.2443147006883626, + "grad_norm": 0.2056073546409607, + "learning_rate": 1.4513714421614617e-05, + "loss": 0.3778, "step": 90020 }, { - "epoch": 3.17, - "learning_rate": 1.5658059719448745e-05, - "loss": 0.2588, + "epoch": 3.2444949003495873, + "grad_norm": 0.219630628824234, + "learning_rate": 1.4511065476160598e-05, + "loss": 0.3692, "step": 90025 }, { - "epoch": 3.17, - "learning_rate": 1.5655417424018455e-05, - "loss": 0.2653, + "epoch": 3.244675100010812, + "grad_norm": 0.2458181381225586, + "learning_rate": 1.4508416673616292e-05, + "loss": 0.3989, "step": 90030 }, { - "epoch": 3.17, - "learning_rate": 1.5652775249921413e-05, - "loss": 0.2451, + "epoch": 3.244855299672037, + "grad_norm": 0.23041844367980957, + "learning_rate": 1.4505768014017782e-05, + "loss": 0.3763, "step": 90035 }, { - "epoch": 3.17, - "learning_rate": 1.5650133197191914e-05, - "loss": 0.2793, + "epoch": 3.245035499333261, + "grad_norm": 0.2688606083393097, + "learning_rate": 1.4503119497401147e-05, + "loss": 0.3814, "step": 90040 }, { - "epoch": 3.17, - "learning_rate": 1.5647491265864287e-05, - "loss": 0.2702, + "epoch": 3.245215698994486, + "grad_norm": 0.20905734598636627, + "learning_rate": 1.4500471123802478e-05, + "loss": 0.3603, "step": 90045 }, { - "epoch": 3.17, - "learning_rate": 1.564484945597282e-05, - "loss": 0.2383, + "epoch": 3.2453958986557105, + "grad_norm": 0.2165406346321106, + "learning_rate": 1.4497822893257845e-05, + "loss": 0.3778, "step": 90050 }, { - "epoch": 3.17, - "learning_rate": 1.5642207767551814e-05, - "loss": 0.2614, + "epoch": 3.2455760983169353, + "grad_norm": 0.28338685631752014, + "learning_rate": 1.4495174805803358e-05, + "loss": 0.3849, "step": 90055 }, { - "epoch": 3.17, - "learning_rate": 1.5639566200635565e-05, - "loss": 0.2682, + "epoch": 3.2457562979781596, + "grad_norm": 0.24347181618213654, + "learning_rate": 1.4492526861475084e-05, + "loss": 0.3794, "step": 90060 }, { - "epoch": 3.17, - "learning_rate": 1.5636924755258396e-05, - "loss": 0.2693, + "epoch": 3.2459364976393843, + "grad_norm": 0.17866498231887817, + "learning_rate": 1.448987906030908e-05, + "loss": 0.3434, "step": 90065 }, { - "epoch": 3.17, - "learning_rate": 1.5634283431454582e-05, - "loss": 0.2385, + "epoch": 3.246116697300609, + "grad_norm": 0.23330989480018616, + "learning_rate": 
1.4487231402341447e-05, + "loss": 0.377, "step": 90070 }, { - "epoch": 3.17, - "learning_rate": 1.563164222925843e-05, - "loss": 0.2838, + "epoch": 3.2462968969618338, + "grad_norm": 0.23260824382305145, + "learning_rate": 1.4484583887608243e-05, + "loss": 0.3774, "step": 90075 }, { - "epoch": 3.17, - "learning_rate": 1.5629001148704218e-05, - "loss": 0.263, + "epoch": 3.2464770966230585, + "grad_norm": 0.20770181715488434, + "learning_rate": 1.4481936516145567e-05, + "loss": 0.3724, "step": 90080 }, { - "epoch": 3.17, - "learning_rate": 1.5626360189826257e-05, - "loss": 0.2606, + "epoch": 3.246657296284283, + "grad_norm": 0.20958863198757172, + "learning_rate": 1.447928928798946e-05, + "loss": 0.3703, "step": 90085 }, { - "epoch": 3.17, - "learning_rate": 1.5623719352658833e-05, - "loss": 0.2521, + "epoch": 3.2468374959455075, + "grad_norm": 0.2624190151691437, + "learning_rate": 1.4476642203175988e-05, + "loss": 0.3906, "step": 90090 }, { - "epoch": 3.17, - "learning_rate": 1.5621078637236224e-05, - "loss": 0.239, + "epoch": 3.2470176956067323, + "grad_norm": 0.27643921971321106, + "learning_rate": 1.4473995261741238e-05, + "loss": 0.36, "step": 90095 }, { - "epoch": 3.17, - "learning_rate": 1.5618438043592744e-05, - "loss": 0.2759, + "epoch": 3.247197895267957, + "grad_norm": 0.24671484529972076, + "learning_rate": 1.4471348463721268e-05, + "loss": 0.3819, "step": 90100 }, { - "epoch": 3.17, - "learning_rate": 1.5615797571762658e-05, - "loss": 0.2607, + "epoch": 3.2473780949291817, + "grad_norm": 0.22529368102550507, + "learning_rate": 1.4468701809152135e-05, + "loss": 0.352, "step": 90105 }, { - "epoch": 3.17, - "learning_rate": 1.561315722178025e-05, - "loss": 0.2598, + "epoch": 3.247558294590406, + "grad_norm": 0.22821401059627533, + "learning_rate": 1.44660552980699e-05, + "loss": 0.3884, "step": 90110 }, { - "epoch": 3.17, - "learning_rate": 1.561051699367981e-05, - "loss": 0.2588, + "epoch": 3.2477384942516307, + "grad_norm": 0.22324234247207642, + "learning_rate": 1.4463408930510614e-05, + "loss": 0.3781, "step": 90115 }, { - "epoch": 3.17, - "learning_rate": 1.5607876887495628e-05, - "loss": 0.2403, + "epoch": 3.2479186939128555, + "grad_norm": 0.25382617115974426, + "learning_rate": 1.446076270651035e-05, + "loss": 0.3944, "step": 90120 }, { - "epoch": 3.17, - "learning_rate": 1.560523690326197e-05, - "loss": 0.232, + "epoch": 3.24809889357408, + "grad_norm": 0.2712723910808563, + "learning_rate": 1.4458116626105154e-05, + "loss": 0.3762, "step": 90125 }, { - "epoch": 3.17, - "learning_rate": 1.5602597041013122e-05, - "loss": 0.2458, + "epoch": 3.248279093235305, + "grad_norm": 0.2119145393371582, + "learning_rate": 1.4455470689331078e-05, + "loss": 0.3578, "step": 90130 }, { - "epoch": 3.17, - "learning_rate": 1.5599957300783346e-05, - "loss": 0.2349, + "epoch": 3.2484592928965292, + "grad_norm": 0.2648928165435791, + "learning_rate": 1.4452824896224171e-05, + "loss": 0.3913, "step": 90135 }, { - "epoch": 3.17, - "learning_rate": 1.559731768260694e-05, - "loss": 0.2434, + "epoch": 3.248639492557754, + "grad_norm": 0.21984988451004028, + "learning_rate": 1.4450179246820475e-05, + "loss": 0.3591, "step": 90140 }, { - "epoch": 3.17, - "learning_rate": 1.5594678186518168e-05, - "loss": 0.2512, + "epoch": 3.2488196922189787, + "grad_norm": 0.25489935278892517, + "learning_rate": 1.4447533741156055e-05, + "loss": 0.3913, "step": 90145 }, { - "epoch": 3.17, - "learning_rate": 1.5592038812551303e-05, - "loss": 0.2682, + "epoch": 3.2489998918802034, + "grad_norm": 0.19354936480522156, 
+ "learning_rate": 1.4444888379266952e-05, + "loss": 0.3815, "step": 90150 }, { - "epoch": 3.17, - "learning_rate": 1.5589399560740603e-05, - "loss": 0.2573, + "epoch": 3.2491800915414277, + "grad_norm": 0.2649206519126892, + "learning_rate": 1.4442243161189184e-05, + "loss": 0.3611, "step": 90155 }, { - "epoch": 3.17, - "learning_rate": 1.5586760431120358e-05, - "loss": 0.2615, + "epoch": 3.2493602912026525, + "grad_norm": 0.23419325053691864, + "learning_rate": 1.4439598086958816e-05, + "loss": 0.3869, "step": 90160 }, { - "epoch": 3.17, - "learning_rate": 1.558412142372482e-05, - "loss": 0.2723, + "epoch": 3.249540490863877, + "grad_norm": 0.2443428486585617, + "learning_rate": 1.4436953156611881e-05, + "loss": 0.4055, "step": 90165 }, { - "epoch": 3.17, - "learning_rate": 1.5581482538588265e-05, - "loss": 0.2662, + "epoch": 3.249720690525102, + "grad_norm": 0.18674413859844208, + "learning_rate": 1.4434308370184413e-05, + "loss": 0.3629, "step": 90170 }, { - "epoch": 3.17, - "learning_rate": 1.5578843775744945e-05, - "loss": 0.2276, + "epoch": 3.2499008901863267, + "grad_norm": 0.2267744094133377, + "learning_rate": 1.443166372771245e-05, + "loss": 0.3401, "step": 90175 }, { - "epoch": 3.17, - "learning_rate": 1.5576205135229137e-05, - "loss": 0.2404, + "epoch": 3.250081089847551, + "grad_norm": 0.1851803958415985, + "learning_rate": 1.442901922923201e-05, + "loss": 0.3667, "step": 90180 }, { - "epoch": 3.17, - "learning_rate": 1.5573566617075096e-05, - "loss": 0.2428, + "epoch": 3.2502612895087757, + "grad_norm": 0.2092495560646057, + "learning_rate": 1.4426374874779146e-05, + "loss": 0.4123, "step": 90185 }, { - "epoch": 3.17, - "learning_rate": 1.5570928221317075e-05, - "loss": 0.2637, + "epoch": 3.2504414891700004, + "grad_norm": 0.20582100749015808, + "learning_rate": 1.4423730664389883e-05, + "loss": 0.4061, "step": 90190 }, { - "epoch": 3.17, - "learning_rate": 1.556828994798933e-05, - "loss": 0.276, + "epoch": 3.250621688831225, + "grad_norm": 0.23701712489128113, + "learning_rate": 1.4421086598100237e-05, + "loss": 0.3766, "step": 90195 }, { - "epoch": 3.17, - "learning_rate": 1.5565651797126137e-05, - "loss": 0.2541, + "epoch": 3.2508018884924494, + "grad_norm": 0.1818612813949585, + "learning_rate": 1.441844267594624e-05, + "loss": 0.3682, "step": 90200 }, { - "epoch": 3.17, - "learning_rate": 1.5563013768761735e-05, - "loss": 0.2626, + "epoch": 3.250982088153674, + "grad_norm": 0.23264910280704498, + "learning_rate": 1.4415798897963911e-05, + "loss": 0.3912, "step": 90205 }, { - "epoch": 3.17, - "learning_rate": 1.5560375862930376e-05, - "loss": 0.2614, + "epoch": 3.251162287814899, + "grad_norm": 0.23024223744869232, + "learning_rate": 1.4413155264189266e-05, + "loss": 0.3769, "step": 90210 }, { - "epoch": 3.17, - "learning_rate": 1.555773807966632e-05, - "loss": 0.2798, + "epoch": 3.2513424874761236, + "grad_norm": 0.1898718625307083, + "learning_rate": 1.4410511774658336e-05, + "loss": 0.3937, "step": 90215 }, { - "epoch": 3.17, - "learning_rate": 1.555510041900381e-05, - "loss": 0.2506, + "epoch": 3.2515226871373484, + "grad_norm": 0.2725496292114258, + "learning_rate": 1.4407868429407138e-05, + "loss": 0.4097, "step": 90220 }, { - "epoch": 3.17, - "learning_rate": 1.5552462880977098e-05, - "loss": 0.2327, + "epoch": 3.2517028867985727, + "grad_norm": 0.2061624974012375, + "learning_rate": 1.4405225228471678e-05, + "loss": 0.3782, "step": 90225 }, { - "epoch": 3.17, - "learning_rate": 1.554982546562042e-05, - "loss": 0.2532, + "epoch": 3.2518830864597974, + "grad_norm": 
0.240001380443573, + "learning_rate": 1.4402582171887973e-05, + "loss": 0.3728, "step": 90230 }, { - "epoch": 3.17, - "learning_rate": 1.5547188172968048e-05, - "loss": 0.2477, + "epoch": 3.252063286121022, + "grad_norm": 0.22183412313461304, + "learning_rate": 1.4399939259692027e-05, + "loss": 0.3642, "step": 90235 }, { - "epoch": 3.17, - "learning_rate": 1.55445510030542e-05, - "loss": 0.2468, + "epoch": 3.252243485782247, + "grad_norm": 0.2705990672111511, + "learning_rate": 1.4397296491919865e-05, + "loss": 0.3807, "step": 90240 }, { - "epoch": 3.18, - "learning_rate": 1.5541913955913124e-05, - "loss": 0.2551, + "epoch": 3.252423685443471, + "grad_norm": 0.2311464250087738, + "learning_rate": 1.4394653868607497e-05, + "loss": 0.3711, "step": 90245 }, { - "epoch": 3.18, - "learning_rate": 1.5539277031579058e-05, - "loss": 0.263, + "epoch": 3.252603885104696, + "grad_norm": 0.226415753364563, + "learning_rate": 1.4392011389790893e-05, + "loss": 0.4024, "step": 90250 }, { - "epoch": 3.18, - "learning_rate": 1.553664023008625e-05, - "loss": 0.2437, + "epoch": 3.2527840847659206, + "grad_norm": 0.24060435593128204, + "learning_rate": 1.4389369055506094e-05, + "loss": 0.4227, "step": 90255 }, { - "epoch": 3.18, - "learning_rate": 1.5534003551468934e-05, - "loss": 0.2818, + "epoch": 3.2529642844271454, + "grad_norm": 0.2353263795375824, + "learning_rate": 1.4386726865789077e-05, + "loss": 0.3729, "step": 90260 }, { - "epoch": 3.18, - "learning_rate": 1.5531366995761347e-05, - "loss": 0.2727, + "epoch": 3.25314448408837, + "grad_norm": 0.23941341042518616, + "learning_rate": 1.438408482067587e-05, + "loss": 0.3689, "step": 90265 }, { - "epoch": 3.18, - "learning_rate": 1.5528730562997705e-05, - "loss": 0.2619, + "epoch": 3.2533246837495944, + "grad_norm": 0.24477094411849976, + "learning_rate": 1.4381442920202442e-05, + "loss": 0.4205, "step": 90270 }, { - "epoch": 3.18, - "learning_rate": 1.5526094253212264e-05, - "loss": 0.2655, + "epoch": 3.253504883410819, + "grad_norm": 0.21615129709243774, + "learning_rate": 1.437880116440479e-05, + "loss": 0.3556, "step": 90275 }, { - "epoch": 3.18, - "learning_rate": 1.5523458066439248e-05, - "loss": 0.2708, + "epoch": 3.253685083072044, + "grad_norm": 0.2210584431886673, + "learning_rate": 1.4376159553318924e-05, + "loss": 0.3909, "step": 90280 }, { - "epoch": 3.18, - "learning_rate": 1.5520822002712883e-05, - "loss": 0.2733, + "epoch": 3.2538652827332686, + "grad_norm": 0.26296374201774597, + "learning_rate": 1.4373518086980827e-05, + "loss": 0.3978, "step": 90285 }, { - "epoch": 3.18, - "learning_rate": 1.551818606206739e-05, - "loss": 0.2635, + "epoch": 3.254045482394493, + "grad_norm": 0.26196447014808655, + "learning_rate": 1.437087676542649e-05, + "loss": 0.3911, "step": 90290 }, { - "epoch": 3.18, - "learning_rate": 1.551555024453701e-05, - "loss": 0.2432, + "epoch": 3.2542256820557176, + "grad_norm": 0.2408955991268158, + "learning_rate": 1.43682355886919e-05, + "loss": 0.38, "step": 90295 }, { - "epoch": 3.18, - "learning_rate": 1.5512914550155956e-05, - "loss": 0.2506, + "epoch": 3.2544058817169423, + "grad_norm": 0.21061177551746368, + "learning_rate": 1.436559455681303e-05, + "loss": 0.3891, "step": 90300 }, { - "epoch": 3.18, - "learning_rate": 1.551027897895846e-05, - "loss": 0.2604, + "epoch": 3.254586081378167, + "grad_norm": 0.31329545378685, + "learning_rate": 1.4362953669825885e-05, + "loss": 0.3841, "step": 90305 }, { - "epoch": 3.18, - "learning_rate": 1.550764353097873e-05, - "loss": 0.2649, + "epoch": 3.254766281039392, + 
"grad_norm": 0.22759318351745605, + "learning_rate": 1.4360312927766439e-05, + "loss": 0.3843, "step": 90310 }, { - "epoch": 3.18, - "learning_rate": 1.5505008206250995e-05, - "loss": 0.2424, + "epoch": 3.254946480700616, + "grad_norm": 0.22500398755073547, + "learning_rate": 1.4357672330670668e-05, + "loss": 0.389, "step": 90315 }, { - "epoch": 3.18, - "learning_rate": 1.5502373004809475e-05, - "loss": 0.2583, + "epoch": 3.255126680361841, + "grad_norm": 0.2273656278848648, + "learning_rate": 1.435503187857455e-05, + "loss": 0.4046, "step": 90320 }, { - "epoch": 3.18, - "learning_rate": 1.5499737926688374e-05, - "loss": 0.2623, + "epoch": 3.2553068800230656, + "grad_norm": 0.22215020656585693, + "learning_rate": 1.4352391571514053e-05, + "loss": 0.4307, "step": 90325 }, { - "epoch": 3.18, - "learning_rate": 1.5497102971921924e-05, - "loss": 0.2689, + "epoch": 3.2554870796842903, + "grad_norm": 0.2710254490375519, + "learning_rate": 1.434975140952517e-05, + "loss": 0.3778, "step": 90330 }, { - "epoch": 3.18, - "learning_rate": 1.5494468140544323e-05, - "loss": 0.2514, + "epoch": 3.2556672793455146, + "grad_norm": 0.2155771702528, + "learning_rate": 1.4347111392643873e-05, + "loss": 0.3671, "step": 90335 }, { - "epoch": 3.18, - "learning_rate": 1.5491833432589796e-05, - "loss": 0.2469, + "epoch": 3.2558474790067393, + "grad_norm": 0.2274942249059677, + "learning_rate": 1.43444715209061e-05, + "loss": 0.4002, "step": 90340 }, { - "epoch": 3.18, - "learning_rate": 1.5489198848092533e-05, - "loss": 0.24, + "epoch": 3.256027678667964, + "grad_norm": 0.22359316051006317, + "learning_rate": 1.4341831794347849e-05, + "loss": 0.3652, "step": 90345 }, { - "epoch": 3.18, - "learning_rate": 1.5486564387086766e-05, - "loss": 0.2573, + "epoch": 3.256207878329189, + "grad_norm": 0.239881694316864, + "learning_rate": 1.4339192213005076e-05, + "loss": 0.3881, "step": 90350 }, { - "epoch": 3.18, - "learning_rate": 1.5483930049606686e-05, - "loss": 0.2465, + "epoch": 3.2563880779904135, + "grad_norm": 0.22023212909698486, + "learning_rate": 1.4336552776913733e-05, + "loss": 0.3707, "step": 90355 }, { - "epoch": 3.18, - "learning_rate": 1.548129583568651e-05, - "loss": 0.2557, + "epoch": 3.2565682776516383, + "grad_norm": 0.2799815535545349, + "learning_rate": 1.4333913486109817e-05, + "loss": 0.3659, "step": 90360 }, { - "epoch": 3.18, - "learning_rate": 1.5478661745360423e-05, - "loss": 0.258, + "epoch": 3.2567484773128625, + "grad_norm": 0.19893276691436768, + "learning_rate": 1.4331274340629245e-05, + "loss": 0.4441, "step": 90365 }, { - "epoch": 3.18, - "learning_rate": 1.547602777866265e-05, - "loss": 0.2579, + "epoch": 3.2569286769740873, + "grad_norm": 0.2362791895866394, + "learning_rate": 1.4328635340508001e-05, + "loss": 0.3203, "step": 90370 }, { - "epoch": 3.18, - "learning_rate": 1.5473393935627377e-05, - "loss": 0.2479, + "epoch": 3.257108876635312, + "grad_norm": 0.29334524273872375, + "learning_rate": 1.4325996485782039e-05, + "loss": 0.3978, "step": 90375 }, { - "epoch": 3.18, - "learning_rate": 1.5470760216288804e-05, - "loss": 0.2549, + "epoch": 3.2572890762965367, + "grad_norm": 0.20384852588176727, + "learning_rate": 1.4323357776487307e-05, + "loss": 0.3841, "step": 90380 }, { - "epoch": 3.18, - "learning_rate": 1.5468126620681125e-05, - "loss": 0.2813, + "epoch": 3.257469275957761, + "grad_norm": 0.21409177780151367, + "learning_rate": 1.4320719212659756e-05, + "loss": 0.3804, "step": 90385 }, { - "epoch": 3.18, - "learning_rate": 1.5465493148838546e-05, - "loss": 0.2543, + "epoch": 
3.2576494756189858, + "grad_norm": 0.26546648144721985, + "learning_rate": 1.431808079433534e-05, + "loss": 0.3746, "step": 90390 }, { - "epoch": 3.18, - "learning_rate": 1.5462859800795253e-05, - "loss": 0.2616, + "epoch": 3.2578296752802105, + "grad_norm": 0.24117311835289001, + "learning_rate": 1.4315442521549993e-05, + "loss": 0.3813, "step": 90395 }, { - "epoch": 3.18, - "learning_rate": 1.5460226576585447e-05, - "loss": 0.2329, + "epoch": 3.2580098749414352, + "grad_norm": 0.2505226731300354, + "learning_rate": 1.4312804394339686e-05, + "loss": 0.3831, "step": 90400 }, { - "epoch": 3.18, - "learning_rate": 1.5457593476243295e-05, - "loss": 0.245, + "epoch": 3.25819007460266, + "grad_norm": 0.24122489988803864, + "learning_rate": 1.431016641274035e-05, + "loss": 0.3852, "step": 90405 }, { - "epoch": 3.18, - "learning_rate": 1.5454960499803016e-05, - "loss": 0.2666, + "epoch": 3.2583702742638843, + "grad_norm": 0.2857030928134918, + "learning_rate": 1.4307528576787927e-05, + "loss": 0.3887, "step": 90410 }, { - "epoch": 3.18, - "learning_rate": 1.5452327647298782e-05, - "loss": 0.2771, + "epoch": 3.258550473925109, + "grad_norm": 0.22176389396190643, + "learning_rate": 1.4304890886518352e-05, + "loss": 0.3954, "step": 90415 }, { - "epoch": 3.18, - "learning_rate": 1.5449694918764785e-05, - "loss": 0.2595, + "epoch": 3.2587306735863337, + "grad_norm": 0.2822429835796356, + "learning_rate": 1.4302253341967564e-05, + "loss": 0.3836, "step": 90420 }, { - "epoch": 3.18, - "learning_rate": 1.5447062314235196e-05, - "loss": 0.2372, + "epoch": 3.2589108732475585, + "grad_norm": 0.23664531111717224, + "learning_rate": 1.4299615943171509e-05, + "loss": 0.3671, "step": 90425 }, { - "epoch": 3.18, - "learning_rate": 1.5444429833744213e-05, - "loss": 0.2799, + "epoch": 3.2590910729087827, + "grad_norm": 0.24214255809783936, + "learning_rate": 1.429697869016613e-05, + "loss": 0.404, "step": 90430 }, { - "epoch": 3.18, - "learning_rate": 1.5441797477326017e-05, - "loss": 0.2616, + "epoch": 3.2592712725700075, + "grad_norm": 0.21557815372943878, + "learning_rate": 1.429434158298732e-05, + "loss": 0.3875, "step": 90435 }, { - "epoch": 3.18, - "learning_rate": 1.543916524501477e-05, - "loss": 0.2696, + "epoch": 3.259451472231232, + "grad_norm": 0.2581924498081207, + "learning_rate": 1.4291704621671043e-05, + "loss": 0.3645, "step": 90440 }, { - "epoch": 3.18, - "learning_rate": 1.5436533136844665e-05, - "loss": 0.2675, + "epoch": 3.259631671892457, + "grad_norm": 0.23095859587192535, + "learning_rate": 1.4289067806253208e-05, + "loss": 0.3912, "step": 90445 }, { - "epoch": 3.18, - "learning_rate": 1.543390115284988e-05, - "loss": 0.264, + "epoch": 3.2598118715536817, + "grad_norm": 0.25443336367607117, + "learning_rate": 1.4286431136769774e-05, + "loss": 0.3588, "step": 90450 }, { - "epoch": 3.18, - "learning_rate": 1.5431269293064585e-05, - "loss": 0.2748, + "epoch": 3.259992071214906, + "grad_norm": 0.3058500289916992, + "learning_rate": 1.428379461325663e-05, + "loss": 0.4221, "step": 90455 }, { - "epoch": 3.18, - "learning_rate": 1.5428637557522942e-05, - "loss": 0.2724, + "epoch": 3.2601722708761307, + "grad_norm": 0.2576063573360443, + "learning_rate": 1.4281158235749698e-05, + "loss": 0.3781, "step": 90460 }, { - "epoch": 3.18, - "learning_rate": 1.542600594625914e-05, - "loss": 0.2642, + "epoch": 3.2603524705373554, + "grad_norm": 0.25434017181396484, + "learning_rate": 1.4278522004284919e-05, + "loss": 0.3894, "step": 90465 }, { - "epoch": 3.18, - "learning_rate": 1.542337445930734e-05, - 
"loss": 0.2555, + "epoch": 3.26053267019858, + "grad_norm": 0.2770051956176758, + "learning_rate": 1.4275885918898201e-05, + "loss": 0.4269, "step": 90470 }, { - "epoch": 3.18, - "learning_rate": 1.5420743096701717e-05, - "loss": 0.2335, + "epoch": 3.2607128698598045, + "grad_norm": 0.24559049308300018, + "learning_rate": 1.4273249979625464e-05, + "loss": 0.3713, "step": 90475 }, { - "epoch": 3.18, - "learning_rate": 1.541811185847642e-05, - "loss": 0.2708, + "epoch": 3.260893069521029, + "grad_norm": 0.24192962050437927, + "learning_rate": 1.427061418650262e-05, + "loss": 0.3708, "step": 90480 }, { - "epoch": 3.18, - "learning_rate": 1.5415480744665635e-05, - "loss": 0.2621, + "epoch": 3.261073269182254, + "grad_norm": 0.23208360373973846, + "learning_rate": 1.4268505657256397e-05, + "loss": 0.3847, "step": 90485 }, { - "epoch": 3.18, - "learning_rate": 1.5412849755303514e-05, - "loss": 0.2507, + "epoch": 3.2612534688434787, + "grad_norm": 0.26153233647346497, + "learning_rate": 1.4265870127293846e-05, + "loss": 0.425, "step": 90490 }, { - "epoch": 3.18, - "learning_rate": 1.5410218890424222e-05, - "loss": 0.2686, + "epoch": 3.2614336685047034, + "grad_norm": 0.22417205572128296, + "learning_rate": 1.4263234743581744e-05, + "loss": 0.396, "step": 90495 }, { - "epoch": 3.18, - "learning_rate": 1.5407588150061907e-05, - "loss": 0.2645, + "epoch": 3.2616138681659277, + "grad_norm": 0.3092048764228821, + "learning_rate": 1.4260599506156002e-05, + "loss": 0.392, "step": 90500 }, { - "epoch": 3.18, - "eval_loss": 0.2550298273563385, - "eval_runtime": 10.5455, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 3.2616138681659277, + "eval_loss": 0.4304320514202118, + "eval_runtime": 3.5275, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 7.087, "step": 90500 }, { - "epoch": 3.18, - "learning_rate": 1.5404957534250752e-05, - "loss": 0.2426, + "epoch": 3.2617940678271524, + "grad_norm": 0.19845503568649292, + "learning_rate": 1.4257964415052491e-05, + "loss": 0.3698, "step": 90505 }, { - "epoch": 3.18, - "learning_rate": 1.54023270430249e-05, - "loss": 0.2579, + "epoch": 3.261974267488377, + "grad_norm": 0.29068851470947266, + "learning_rate": 1.4255329470307146e-05, + "loss": 0.393, "step": 90510 }, { - "epoch": 3.18, - "learning_rate": 1.5399696676418506e-05, - "loss": 0.2536, + "epoch": 3.262154467149602, + "grad_norm": 0.22075437009334564, + "learning_rate": 1.4252694671955847e-05, + "loss": 0.3644, "step": 90515 }, { - "epoch": 3.18, - "learning_rate": 1.5397066434465712e-05, - "loss": 0.245, + "epoch": 3.262334666810826, + "grad_norm": 0.2166365683078766, + "learning_rate": 1.4250060020034511e-05, + "loss": 0.362, "step": 90520 }, { - "epoch": 3.18, - "learning_rate": 1.5394436317200687e-05, - "loss": 0.2526, + "epoch": 3.262514866472051, + "grad_norm": 0.23188582062721252, + "learning_rate": 1.4247425514579038e-05, + "loss": 0.3859, "step": 90525 }, { - "epoch": 3.19, - "learning_rate": 1.539180632465758e-05, - "loss": 0.2504, + "epoch": 3.2626950661332756, + "grad_norm": 0.1937790811061859, + "learning_rate": 1.4244791155625289e-05, + "loss": 0.3916, "step": 90530 }, { - "epoch": 3.19, - "learning_rate": 1.538917645687054e-05, - "loss": 0.266, + "epoch": 3.2628752657945004, + "grad_norm": 0.20566551387310028, + "learning_rate": 1.424215694320919e-05, + "loss": 0.3681, "step": 90535 }, { - "epoch": 3.19, - "learning_rate": 1.5386546713873695e-05, - "loss": 0.2471, + "epoch": 3.263055465455725, + "grad_norm": 0.21733631193637848, + 
"learning_rate": 1.4239522877366618e-05, + "loss": 0.3715, "step": 90540 }, { - "epoch": 3.19, - "learning_rate": 1.5383917095701218e-05, - "loss": 0.2568, + "epoch": 3.2632356651169494, + "grad_norm": 0.23890431225299835, + "learning_rate": 1.4236888958133464e-05, + "loss": 0.3793, "step": 90545 }, { - "epoch": 3.19, - "learning_rate": 1.5381287602387236e-05, - "loss": 0.2708, + "epoch": 3.263415864778174, + "grad_norm": 0.19045878946781158, + "learning_rate": 1.4234255185545614e-05, + "loss": 0.3516, "step": 90550 }, { - "epoch": 3.19, - "learning_rate": 1.5378658233965893e-05, - "loss": 0.2876, + "epoch": 3.263596064439399, + "grad_norm": 0.27482134103775024, + "learning_rate": 1.4231621559638941e-05, + "loss": 0.3766, "step": 90555 }, { - "epoch": 3.19, - "learning_rate": 1.5376028990471336e-05, - "loss": 0.2632, + "epoch": 3.2637762641006236, + "grad_norm": 0.20117837190628052, + "learning_rate": 1.422898808044935e-05, + "loss": 0.3557, "step": 90560 }, { - "epoch": 3.19, - "learning_rate": 1.5373399871937706e-05, - "loss": 0.2386, + "epoch": 3.263956463761848, + "grad_norm": 0.30366307497024536, + "learning_rate": 1.4226354748012716e-05, + "loss": 0.4117, "step": 90565 }, { - "epoch": 3.19, - "learning_rate": 1.537077087839913e-05, - "loss": 0.2629, + "epoch": 3.2641366634230726, + "grad_norm": 0.23265069723129272, + "learning_rate": 1.4223721562364908e-05, + "loss": 0.3705, "step": 90570 }, { - "epoch": 3.19, - "learning_rate": 1.5368142009889737e-05, - "loss": 0.2412, + "epoch": 3.2643168630842974, + "grad_norm": 0.2623595893383026, + "learning_rate": 1.4221088523541809e-05, + "loss": 0.3727, "step": 90575 }, { - "epoch": 3.19, - "learning_rate": 1.5365513266443683e-05, - "loss": 0.2609, + "epoch": 3.264497062745522, + "grad_norm": 0.22619140148162842, + "learning_rate": 1.421845563157928e-05, + "loss": 0.4174, "step": 90580 }, { - "epoch": 3.19, - "learning_rate": 1.536288464809509e-05, - "loss": 0.2865, + "epoch": 3.264677262406747, + "grad_norm": 0.25267648696899414, + "learning_rate": 1.421582288651322e-05, + "loss": 0.395, "step": 90585 }, { - "epoch": 3.19, - "learning_rate": 1.5360256154878093e-05, - "loss": 0.2689, + "epoch": 3.2648574620679716, + "grad_norm": 0.30874866247177124, + "learning_rate": 1.421319028837948e-05, + "loss": 0.4297, "step": 90590 }, { - "epoch": 3.19, - "learning_rate": 1.53576277868268e-05, - "loss": 0.2651, + "epoch": 3.265037661729196, + "grad_norm": 0.2350592315196991, + "learning_rate": 1.421055783721394e-05, + "loss": 0.3709, "step": 90595 }, { - "epoch": 3.19, - "learning_rate": 1.5354999543975372e-05, - "loss": 0.251, + "epoch": 3.2652178613904206, + "grad_norm": 0.22834807634353638, + "learning_rate": 1.4207925533052455e-05, + "loss": 0.3666, "step": 90600 }, { - "epoch": 3.19, - "learning_rate": 1.5352371426357913e-05, - "loss": 0.2549, + "epoch": 3.2653980610516453, + "grad_norm": 0.21915757656097412, + "learning_rate": 1.4205293375930898e-05, + "loss": 0.3658, "step": 90605 }, { - "epoch": 3.19, - "learning_rate": 1.5349743434008556e-05, - "loss": 0.245, + "epoch": 3.26557826071287, + "grad_norm": 0.2512879967689514, + "learning_rate": 1.4202661365885116e-05, + "loss": 0.3719, "step": 90610 }, { - "epoch": 3.19, - "learning_rate": 1.534711556696141e-05, - "loss": 0.2429, + "epoch": 3.2657584603740943, + "grad_norm": 0.19401970505714417, + "learning_rate": 1.4200029502951007e-05, + "loss": 0.3696, "step": 90615 }, { - "epoch": 3.19, - "learning_rate": 1.5344487825250624e-05, - "loss": 0.2646, + "epoch": 3.265938660035319, + "grad_norm": 
0.2430254966020584, + "learning_rate": 1.419739778716438e-05, + "loss": 0.3845, "step": 90620 }, { - "epoch": 3.19, - "learning_rate": 1.5341860208910292e-05, - "loss": 0.2681, + "epoch": 3.266118859696544, + "grad_norm": 0.24256938695907593, + "learning_rate": 1.4194766218561128e-05, + "loss": 0.3866, "step": 90625 }, { - "epoch": 3.19, - "learning_rate": 1.533923271797454e-05, - "loss": 0.2423, + "epoch": 3.2662990593577685, + "grad_norm": 0.2600161135196686, + "learning_rate": 1.4192134797177098e-05, + "loss": 0.3928, "step": 90630 }, { - "epoch": 3.19, - "learning_rate": 1.5336605352477476e-05, - "loss": 0.2569, + "epoch": 3.2664792590189933, + "grad_norm": 0.21937114000320435, + "learning_rate": 1.4189503523048137e-05, + "loss": 0.4063, "step": 90635 }, { - "epoch": 3.19, - "learning_rate": 1.5333978112453235e-05, - "loss": 0.2557, + "epoch": 3.2666594586802176, + "grad_norm": 0.21056115627288818, + "learning_rate": 1.4186872396210096e-05, + "loss": 0.3868, "step": 90640 }, { - "epoch": 3.19, - "learning_rate": 1.5331350997935916e-05, - "loss": 0.2587, + "epoch": 3.2668396583414423, + "grad_norm": 0.2408798485994339, + "learning_rate": 1.4184241416698827e-05, + "loss": 0.3905, "step": 90645 }, { - "epoch": 3.19, - "learning_rate": 1.5328724008959637e-05, - "loss": 0.2696, + "epoch": 3.267019858002667, + "grad_norm": 0.2301928699016571, + "learning_rate": 1.4181610584550165e-05, + "loss": 0.4008, "step": 90650 }, { - "epoch": 3.19, - "learning_rate": 1.532609714555849e-05, - "loss": 0.2512, + "epoch": 3.2672000576638918, + "grad_norm": 0.21639235317707062, + "learning_rate": 1.4178979899799971e-05, + "loss": 0.3618, "step": 90655 }, { - "epoch": 3.19, - "learning_rate": 1.5323470407766602e-05, - "loss": 0.2534, + "epoch": 3.267380257325116, + "grad_norm": 0.20622292160987854, + "learning_rate": 1.4176349362484085e-05, + "loss": 0.3987, "step": 90660 }, { - "epoch": 3.19, - "learning_rate": 1.532084379561808e-05, - "loss": 0.2649, + "epoch": 3.267560456986341, + "grad_norm": 0.24098491668701172, + "learning_rate": 1.4173718972638339e-05, + "loss": 0.4114, "step": 90665 }, { - "epoch": 3.19, - "learning_rate": 1.531821730914702e-05, - "loss": 0.26, + "epoch": 3.2677406566475655, + "grad_norm": 0.2708558142185211, + "learning_rate": 1.417108873029858e-05, + "loss": 0.388, "step": 90670 }, { - "epoch": 3.19, - "learning_rate": 1.5315590948387516e-05, - "loss": 0.2505, + "epoch": 3.2679208563087903, + "grad_norm": 0.24035510420799255, + "learning_rate": 1.4168458635500628e-05, + "loss": 0.3668, "step": 90675 }, { - "epoch": 3.19, - "learning_rate": 1.5312964713373695e-05, - "loss": 0.2635, + "epoch": 3.268101055970015, + "grad_norm": 0.22504648566246033, + "learning_rate": 1.416582868828034e-05, + "loss": 0.4099, "step": 90680 }, { - "epoch": 3.19, - "learning_rate": 1.5310338604139638e-05, - "loss": 0.2791, + "epoch": 3.2682812556312393, + "grad_norm": 0.21226103603839874, + "learning_rate": 1.416319888867354e-05, + "loss": 0.3926, "step": 90685 }, { - "epoch": 3.19, - "learning_rate": 1.5307712620719444e-05, - "loss": 0.2511, + "epoch": 3.268461455292464, + "grad_norm": 0.2621418535709381, + "learning_rate": 1.4160569236716059e-05, + "loss": 0.4155, "step": 90690 }, { - "epoch": 3.19, - "learning_rate": 1.530508676314722e-05, - "loss": 0.2455, + "epoch": 3.2686416549536887, + "grad_norm": 0.2505376636981964, + "learning_rate": 1.415793973244372e-05, + "loss": 0.3616, "step": 90695 }, { - "epoch": 3.19, - "learning_rate": 1.5302461031457057e-05, - "loss": 0.2518, + "epoch": 
3.2688218546149135, + "grad_norm": 0.24424095451831818, + "learning_rate": 1.4155310375892345e-05, + "loss": 0.39, "step": 90700 }, { - "epoch": 3.19, - "learning_rate": 1.529983542568305e-05, - "loss": 0.2622, + "epoch": 3.2690020542761378, + "grad_norm": 0.2086549997329712, + "learning_rate": 1.4152681167097775e-05, + "loss": 0.3737, "step": 90705 }, { - "epoch": 3.19, - "learning_rate": 1.5297209945859274e-05, - "loss": 0.2675, + "epoch": 3.2691822539373625, + "grad_norm": 0.25878942012786865, + "learning_rate": 1.4150052106095834e-05, + "loss": 0.4075, "step": 90710 }, { - "epoch": 3.19, - "learning_rate": 1.5294584592019835e-05, - "loss": 0.2519, + "epoch": 3.2693624535985872, + "grad_norm": 0.21511436998844147, + "learning_rate": 1.4147423192922314e-05, + "loss": 0.3487, "step": 90715 }, { - "epoch": 3.19, - "learning_rate": 1.529195936419883e-05, - "loss": 0.2756, + "epoch": 3.269542653259812, + "grad_norm": 0.27404505014419556, + "learning_rate": 1.4144794427613061e-05, + "loss": 0.3914, "step": 90720 }, { - "epoch": 3.19, - "learning_rate": 1.5289334262430332e-05, - "loss": 0.2521, + "epoch": 3.2697228529210367, + "grad_norm": 0.24359211325645447, + "learning_rate": 1.4142165810203881e-05, + "loss": 0.3774, "step": 90725 }, { - "epoch": 3.19, - "learning_rate": 1.5286709286748417e-05, - "loss": 0.2765, + "epoch": 3.269903052582261, + "grad_norm": 0.19888465106487274, + "learning_rate": 1.413953734073059e-05, + "loss": 0.3661, "step": 90730 }, { - "epoch": 3.19, - "learning_rate": 1.5284084437187194e-05, - "loss": 0.2716, + "epoch": 3.2700832522434857, + "grad_norm": 0.250573992729187, + "learning_rate": 1.4136909019229003e-05, + "loss": 0.3367, "step": 90735 }, { - "epoch": 3.19, - "learning_rate": 1.5281459713780726e-05, - "loss": 0.2615, + "epoch": 3.2702634519047105, + "grad_norm": 0.2471703588962555, + "learning_rate": 1.4134280845734915e-05, + "loss": 0.3536, "step": 90740 }, { - "epoch": 3.19, - "learning_rate": 1.52788351165631e-05, - "loss": 0.2552, + "epoch": 3.270443651565935, + "grad_norm": 0.2185695767402649, + "learning_rate": 1.4131652820284158e-05, + "loss": 0.39, "step": 90745 }, { - "epoch": 3.19, - "learning_rate": 1.527621064556839e-05, - "loss": 0.2574, + "epoch": 3.2706238512271595, + "grad_norm": 0.22518201172351837, + "learning_rate": 1.4129024942912525e-05, + "loss": 0.3999, "step": 90750 }, { - "epoch": 3.19, - "learning_rate": 1.5273586300830684e-05, - "loss": 0.2539, + "epoch": 3.270804050888384, + "grad_norm": 0.23426784574985504, + "learning_rate": 1.4126397213655824e-05, + "loss": 0.3719, "step": 90755 }, { - "epoch": 3.19, - "learning_rate": 1.5270962082384047e-05, - "loss": 0.2696, + "epoch": 3.270984250549609, + "grad_norm": 0.28812581300735474, + "learning_rate": 1.4123769632549855e-05, + "loss": 0.3956, "step": 90760 }, { - "epoch": 3.19, - "learning_rate": 1.5268337990262555e-05, - "loss": 0.2712, + "epoch": 3.2711644502108337, + "grad_norm": 0.2103385478258133, + "learning_rate": 1.412114219963041e-05, + "loss": 0.4033, "step": 90765 }, { - "epoch": 3.19, - "learning_rate": 1.5265714024500276e-05, - "loss": 0.279, + "epoch": 3.2713446498720584, + "grad_norm": 0.23068909347057343, + "learning_rate": 1.4118514914933306e-05, + "loss": 0.3719, "step": 90770 }, { - "epoch": 3.19, - "learning_rate": 1.5263090185131287e-05, - "loss": 0.2613, + "epoch": 3.2715248495332827, + "grad_norm": 0.219300776720047, + "learning_rate": 1.411588777849433e-05, + "loss": 0.3917, "step": 90775 }, { - "epoch": 3.19, - "learning_rate": 1.526046647218966e-05, - "loss": 
0.2419, + "epoch": 3.2717050491945074, + "grad_norm": 0.18937250971794128, + "learning_rate": 1.4113260790349275e-05, + "loss": 0.3667, "step": 90780 }, { - "epoch": 3.19, - "learning_rate": 1.525784288570946e-05, - "loss": 0.2895, + "epoch": 3.271885248855732, + "grad_norm": 0.24838852882385254, + "learning_rate": 1.4110633950533936e-05, + "loss": 0.3858, "step": 90785 }, { - "epoch": 3.19, - "learning_rate": 1.5255219425724737e-05, - "loss": 0.2659, + "epoch": 3.272065448516957, + "grad_norm": 0.23256389796733856, + "learning_rate": 1.4108007259084099e-05, + "loss": 0.3784, "step": 90790 }, { - "epoch": 3.19, - "learning_rate": 1.5252596092269575e-05, - "loss": 0.2426, + "epoch": 3.272245648178181, + "grad_norm": 0.2195148915052414, + "learning_rate": 1.4105380716035544e-05, + "loss": 0.3441, "step": 90795 }, { - "epoch": 3.19, - "learning_rate": 1.5249972885378034e-05, - "loss": 0.2553, + "epoch": 3.272425847839406, + "grad_norm": 0.23615360260009766, + "learning_rate": 1.4102754321424088e-05, + "loss": 0.4046, "step": 90800 }, { - "epoch": 3.19, - "learning_rate": 1.5247349805084155e-05, - "loss": 0.2577, + "epoch": 3.2726060475006307, + "grad_norm": 0.25502121448516846, + "learning_rate": 1.4100128075285473e-05, + "loss": 0.3555, "step": 90805 }, { - "epoch": 3.19, - "learning_rate": 1.5244726851422026e-05, - "loss": 0.2744, + "epoch": 3.2727862471618554, + "grad_norm": 0.23027649521827698, + "learning_rate": 1.4097501977655517e-05, + "loss": 0.3715, "step": 90810 }, { - "epoch": 3.2, - "learning_rate": 1.5242104024425691e-05, - "loss": 0.2499, + "epoch": 3.27296644682308, + "grad_norm": 0.25443360209465027, + "learning_rate": 1.4094876028569983e-05, + "loss": 0.3668, "step": 90815 }, { - "epoch": 3.2, - "learning_rate": 1.5239481324129195e-05, - "loss": 0.2641, + "epoch": 3.2731466464843044, + "grad_norm": 0.28707796335220337, + "learning_rate": 1.4092250228064643e-05, + "loss": 0.4048, "step": 90820 }, { - "epoch": 3.2, - "learning_rate": 1.5236858750566604e-05, - "loss": 0.2535, + "epoch": 3.273326846145529, + "grad_norm": 0.2626727521419525, + "learning_rate": 1.4089624576175301e-05, + "loss": 0.4063, "step": 90825 }, { - "epoch": 3.2, - "learning_rate": 1.5234236303771967e-05, - "loss": 0.2481, + "epoch": 3.273507045806754, + "grad_norm": 0.2237686812877655, + "learning_rate": 1.4086999072937702e-05, + "loss": 0.3797, "step": 90830 }, { - "epoch": 3.2, - "learning_rate": 1.5231613983779341e-05, - "loss": 0.2672, + "epoch": 3.2736872454679786, + "grad_norm": 0.20242911577224731, + "learning_rate": 1.4084373718387617e-05, + "loss": 0.3658, "step": 90835 }, { - "epoch": 3.2, - "learning_rate": 1.522899179062277e-05, - "loss": 0.2875, + "epoch": 3.273867445129203, + "grad_norm": 0.18086695671081543, + "learning_rate": 1.4081748512560838e-05, + "loss": 0.3622, "step": 90840 }, { - "epoch": 3.2, - "learning_rate": 1.5226369724336292e-05, - "loss": 0.2434, + "epoch": 3.2740476447904276, + "grad_norm": 0.269579142332077, + "learning_rate": 1.4079123455493121e-05, + "loss": 0.4093, "step": 90845 }, { - "epoch": 3.2, - "learning_rate": 1.5223747784953974e-05, - "loss": 0.2503, + "epoch": 3.2742278444516524, + "grad_norm": 0.20566615462303162, + "learning_rate": 1.4076498547220234e-05, + "loss": 0.3837, "step": 90850 }, { - "epoch": 3.2, - "learning_rate": 1.5221125972509839e-05, - "loss": 0.2584, + "epoch": 3.274408044112877, + "grad_norm": 0.22256791591644287, + "learning_rate": 1.4073873787777936e-05, + "loss": 0.3359, "step": 90855 }, { - "epoch": 3.2, - "learning_rate": 
1.5218504287037947e-05, - "loss": 0.2519, + "epoch": 3.274588243774102, + "grad_norm": 0.2186269313097, + "learning_rate": 1.4071249177201984e-05, + "loss": 0.372, "step": 90860 }, { - "epoch": 3.2, - "learning_rate": 1.5215882728572317e-05, - "loss": 0.2567, + "epoch": 3.2747684434353266, + "grad_norm": 0.21954451501369476, + "learning_rate": 1.4068624715528158e-05, + "loss": 0.3913, "step": 90865 }, { - "epoch": 3.2, - "learning_rate": 1.5213261297147014e-05, - "loss": 0.2547, + "epoch": 3.274948643096551, + "grad_norm": 0.2596687972545624, + "learning_rate": 1.40660004027922e-05, + "loss": 0.371, "step": 90870 }, { - "epoch": 3.2, - "learning_rate": 1.5210639992796063e-05, - "loss": 0.2447, + "epoch": 3.2751288427577756, + "grad_norm": 0.2262258678674698, + "learning_rate": 1.4063376239029875e-05, + "loss": 0.4175, "step": 90875 }, { - "epoch": 3.2, - "learning_rate": 1.5208018815553496e-05, - "loss": 0.2541, + "epoch": 3.2753090424190003, + "grad_norm": 0.26203519105911255, + "learning_rate": 1.406075222427693e-05, + "loss": 0.3837, "step": 90880 }, { - "epoch": 3.2, - "learning_rate": 1.5205397765453348e-05, - "loss": 0.2599, + "epoch": 3.275489242080225, + "grad_norm": 0.20488256216049194, + "learning_rate": 1.4058128358569106e-05, + "loss": 0.3964, "step": 90885 }, { - "epoch": 3.2, - "learning_rate": 1.5202776842529662e-05, - "loss": 0.2661, + "epoch": 3.2756694417414494, + "grad_norm": 0.23428840935230255, + "learning_rate": 1.4055504641942175e-05, + "loss": 0.3849, "step": 90890 }, { - "epoch": 3.2, - "learning_rate": 1.5200156046816463e-05, - "loss": 0.2734, + "epoch": 3.275849641402674, + "grad_norm": 0.23843036592006683, + "learning_rate": 1.4052881074431884e-05, + "loss": 0.3872, "step": 90895 }, { - "epoch": 3.2, - "learning_rate": 1.5197535378347775e-05, - "loss": 0.2833, + "epoch": 3.276029841063899, + "grad_norm": 0.22145730257034302, + "learning_rate": 1.4050257656073947e-05, + "loss": 0.4031, "step": 90900 }, { - "epoch": 3.2, - "learning_rate": 1.5194914837157631e-05, - "loss": 0.2635, + "epoch": 3.2762100407251236, + "grad_norm": 0.2753254175186157, + "learning_rate": 1.4047634386904142e-05, + "loss": 0.3998, "step": 90905 }, { - "epoch": 3.2, - "learning_rate": 1.5192294423280052e-05, - "loss": 0.2675, + "epoch": 3.2763902403863483, + "grad_norm": 0.18128567934036255, + "learning_rate": 1.4045011266958197e-05, + "loss": 0.4172, "step": 90910 }, { - "epoch": 3.2, - "learning_rate": 1.5189674136749077e-05, - "loss": 0.2532, + "epoch": 3.2765704400475726, + "grad_norm": 0.2414032369852066, + "learning_rate": 1.4042388296271856e-05, + "loss": 0.3816, "step": 90915 }, { - "epoch": 3.2, - "learning_rate": 1.5187053977598702e-05, - "loss": 0.2469, + "epoch": 3.2767506397087973, + "grad_norm": 0.21249370276927948, + "learning_rate": 1.4039765474880851e-05, + "loss": 0.3528, "step": 90920 }, { - "epoch": 3.2, - "learning_rate": 1.518443394586298e-05, - "loss": 0.2738, + "epoch": 3.276930839370022, + "grad_norm": 0.238897442817688, + "learning_rate": 1.4037142802820907e-05, + "loss": 0.4205, "step": 90925 }, { - "epoch": 3.2, - "learning_rate": 1.5181814041575908e-05, - "loss": 0.2736, + "epoch": 3.277111039031247, + "grad_norm": 0.2670360803604126, + "learning_rate": 1.4034520280127782e-05, + "loss": 0.3664, "step": 90930 }, { - "epoch": 3.2, - "learning_rate": 1.5179194264771512e-05, - "loss": 0.2674, + "epoch": 3.277291238692471, + "grad_norm": 0.20420727133750916, + "learning_rate": 1.4031897906837194e-05, + "loss": 0.3831, "step": 90935 }, { - "epoch": 3.2, - 
"learning_rate": 1.5176574615483796e-05, - "loss": 0.2715, + "epoch": 3.277471438353696, + "grad_norm": 0.2537064850330353, + "learning_rate": 1.4029275682984878e-05, + "loss": 0.3979, "step": 90940 }, { - "epoch": 3.2, - "learning_rate": 1.51739550937468e-05, - "loss": 0.2848, + "epoch": 3.2776516380149205, + "grad_norm": 0.3141496181488037, + "learning_rate": 1.402665360860655e-05, + "loss": 0.3572, "step": 90945 }, { - "epoch": 3.2, - "learning_rate": 1.5171335699594519e-05, - "loss": 0.2449, + "epoch": 3.2778318376761453, + "grad_norm": 0.18175874650478363, + "learning_rate": 1.4024031683737937e-05, + "loss": 0.387, "step": 90950 }, { - "epoch": 3.2, - "learning_rate": 1.5168716433060964e-05, - "loss": 0.2451, + "epoch": 3.27801203733737, + "grad_norm": 0.2895739674568176, + "learning_rate": 1.4021409908414773e-05, + "loss": 0.367, "step": 90955 }, { - "epoch": 3.2, - "learning_rate": 1.516609729418014e-05, - "loss": 0.2459, + "epoch": 3.2781922369985943, + "grad_norm": 0.21518366038799286, + "learning_rate": 1.401878828267278e-05, + "loss": 0.3816, "step": 90960 }, { - "epoch": 3.2, - "learning_rate": 1.5163478282986065e-05, - "loss": 0.2462, + "epoch": 3.278372436659819, + "grad_norm": 0.25440242886543274, + "learning_rate": 1.4016166806547664e-05, + "loss": 0.3703, "step": 90965 }, { - "epoch": 3.2, - "learning_rate": 1.516085939951275e-05, - "loss": 0.2721, + "epoch": 3.2785526363210438, + "grad_norm": 0.271392822265625, + "learning_rate": 1.4013545480075153e-05, + "loss": 0.4075, "step": 90970 }, { - "epoch": 3.2, - "learning_rate": 1.515824064379419e-05, - "loss": 0.275, + "epoch": 3.2787328359822685, + "grad_norm": 0.22662685811519623, + "learning_rate": 1.4010924303290957e-05, + "loss": 0.3931, "step": 90975 }, { - "epoch": 3.2, - "learning_rate": 1.515562201586438e-05, - "loss": 0.2739, + "epoch": 3.278913035643493, + "grad_norm": 0.24230170249938965, + "learning_rate": 1.4008303276230777e-05, + "loss": 0.378, "step": 90980 }, { - "epoch": 3.2, - "learning_rate": 1.5153003515757335e-05, - "loss": 0.2541, + "epoch": 3.2790932353047175, + "grad_norm": 0.2077885866165161, + "learning_rate": 1.4005682398930347e-05, + "loss": 0.3544, "step": 90985 }, { - "epoch": 3.2, - "learning_rate": 1.5150385143507052e-05, - "loss": 0.2435, + "epoch": 3.2792734349659423, + "grad_norm": 0.22285114228725433, + "learning_rate": 1.4003061671425368e-05, + "loss": 0.3823, "step": 90990 }, { - "epoch": 3.2, - "learning_rate": 1.5147766899147531e-05, - "loss": 0.2621, + "epoch": 3.279453634627167, + "grad_norm": 0.22684328258037567, + "learning_rate": 1.4000441093751546e-05, + "loss": 0.3657, "step": 90995 }, { - "epoch": 3.2, - "learning_rate": 1.5145148782712748e-05, - "loss": 0.2644, + "epoch": 3.2796338342883917, + "grad_norm": 0.2049197107553482, + "learning_rate": 1.3997820665944584e-05, + "loss": 0.4181, "step": 91000 }, { - "epoch": 3.2, - "eval_loss": 0.25480425357818604, - "eval_runtime": 10.5461, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 3.2796338342883917, + "eval_loss": 0.4301852881908417, + "eval_runtime": 3.5299, + "eval_samples_per_second": 28.329, + "eval_steps_per_second": 7.082, "step": 91000 }, { - "epoch": 3.2, - "learning_rate": 1.5142530794236731e-05, - "loss": 0.2754, + "epoch": 3.279814033949616, + "grad_norm": 0.22128911316394806, + "learning_rate": 1.3995200388040172e-05, + "loss": 0.3903, "step": 91005 }, { - "epoch": 3.2, - "learning_rate": 1.5139912933753451e-05, - "loss": 0.2911, + "epoch": 3.2799942336108407, + "grad_norm": 
0.3121122717857361, + "learning_rate": 1.3992580260074046e-05, + "loss": 0.4014, "step": 91010 }, { - "epoch": 3.2, - "learning_rate": 1.5137295201296897e-05, - "loss": 0.2636, + "epoch": 3.2801744332720655, + "grad_norm": 0.25039294362068176, + "learning_rate": 1.3989960282081874e-05, + "loss": 0.3716, "step": 91015 }, { - "epoch": 3.2, - "learning_rate": 1.5134677596901065e-05, - "loss": 0.2511, + "epoch": 3.28035463293329, + "grad_norm": 0.20780634880065918, + "learning_rate": 1.398734045409935e-05, + "loss": 0.3885, "step": 91020 }, { - "epoch": 3.2, - "learning_rate": 1.513206012059995e-05, - "loss": 0.281, + "epoch": 3.2805348325945145, + "grad_norm": 0.204863503575325, + "learning_rate": 1.3984720776162192e-05, + "loss": 0.3911, "step": 91025 }, { - "epoch": 3.2, - "learning_rate": 1.5129442772427531e-05, - "loss": 0.2354, + "epoch": 3.2807150322557392, + "grad_norm": 0.24138136208057404, + "learning_rate": 1.398210124830608e-05, + "loss": 0.3948, "step": 91030 }, { - "epoch": 3.2, - "learning_rate": 1.5126825552417784e-05, - "loss": 0.2607, + "epoch": 3.280895231916964, + "grad_norm": 0.21834784746170044, + "learning_rate": 1.3979481870566703e-05, + "loss": 0.3692, "step": 91035 }, { - "epoch": 3.2, - "learning_rate": 1.5124208460604708e-05, - "loss": 0.2657, + "epoch": 3.2810754315781887, + "grad_norm": 0.20662029087543488, + "learning_rate": 1.3976862642979755e-05, + "loss": 0.4021, "step": 91040 }, { - "epoch": 3.2, - "learning_rate": 1.5121591497022275e-05, - "loss": 0.2594, + "epoch": 3.2812556312394134, + "grad_norm": 0.2747882604598999, + "learning_rate": 1.3974243565580907e-05, + "loss": 0.355, "step": 91045 }, { - "epoch": 3.2, - "learning_rate": 1.5118974661704465e-05, - "loss": 0.2513, + "epoch": 3.2814358309006377, + "grad_norm": 0.2811215817928314, + "learning_rate": 1.3971624638405867e-05, + "loss": 0.3675, "step": 91050 }, { - "epoch": 3.2, - "learning_rate": 1.5116357954685257e-05, - "loss": 0.2565, + "epoch": 3.2816160305618625, + "grad_norm": 0.3407902121543884, + "learning_rate": 1.3969005861490305e-05, + "loss": 0.429, "step": 91055 }, { - "epoch": 3.2, - "learning_rate": 1.511374137599863e-05, - "loss": 0.2841, + "epoch": 3.281796230223087, + "grad_norm": 0.1697690635919571, + "learning_rate": 1.3966387234869904e-05, + "loss": 0.3446, "step": 91060 }, { - "epoch": 3.2, - "learning_rate": 1.5111124925678558e-05, - "loss": 0.2329, + "epoch": 3.281976429884312, + "grad_norm": 0.20448936522006989, + "learning_rate": 1.3963768758580342e-05, + "loss": 0.3826, "step": 91065 }, { - "epoch": 3.2, - "learning_rate": 1.5108508603759008e-05, - "loss": 0.2446, + "epoch": 3.282156629545536, + "grad_norm": 0.2799994647502899, + "learning_rate": 1.3961150432657283e-05, + "loss": 0.3621, "step": 91070 }, { - "epoch": 3.2, - "learning_rate": 1.5105892410273947e-05, - "loss": 0.2464, + "epoch": 3.282336829206761, + "grad_norm": 0.21096967160701752, + "learning_rate": 1.395853225713642e-05, + "loss": 0.3655, "step": 91075 }, { - "epoch": 3.2, - "learning_rate": 1.5103276345257368e-05, - "loss": 0.2742, + "epoch": 3.2825170288679857, + "grad_norm": 0.23864832520484924, + "learning_rate": 1.395591423205343e-05, + "loss": 0.3979, "step": 91080 }, { - "epoch": 3.2, - "learning_rate": 1.5100660408743218e-05, - "loss": 0.2279, + "epoch": 3.2826972285292104, + "grad_norm": 0.2114400416612625, + "learning_rate": 1.3953296357443946e-05, + "loss": 0.4144, "step": 91085 }, { - "epoch": 3.2, - "learning_rate": 1.509804460076547e-05, - "loss": 0.252, + "epoch": 3.282877428190435, + 
"grad_norm": 0.2655703127384186, + "learning_rate": 1.3950678633343675e-05, + "loss": 0.3678, "step": 91090 }, { - "epoch": 3.2, - "learning_rate": 1.5095428921358079e-05, - "loss": 0.2564, + "epoch": 3.28305762785166, + "grad_norm": 0.24133309721946716, + "learning_rate": 1.3948061059788267e-05, + "loss": 0.3951, "step": 91095 }, { - "epoch": 3.21, - "learning_rate": 1.509281337055502e-05, - "loss": 0.2621, + "epoch": 3.283237827512884, + "grad_norm": 0.22033952176570892, + "learning_rate": 1.3945443636813376e-05, + "loss": 0.3699, "step": 91100 }, { - "epoch": 3.21, - "learning_rate": 1.5090197948390255e-05, - "loss": 0.2564, + "epoch": 3.283418027174109, + "grad_norm": 0.23784691095352173, + "learning_rate": 1.3942826364454697e-05, + "loss": 0.4171, "step": 91105 }, { - "epoch": 3.21, - "learning_rate": 1.5087582654897738e-05, - "loss": 0.2645, + "epoch": 3.2835982268353336, + "grad_norm": 0.17935135960578918, + "learning_rate": 1.3940209242747847e-05, + "loss": 0.3836, "step": 91110 }, { - "epoch": 3.21, - "learning_rate": 1.5084967490111419e-05, - "loss": 0.265, + "epoch": 3.2837784264965584, + "grad_norm": 0.2037777453660965, + "learning_rate": 1.3937592271728517e-05, + "loss": 0.3883, "step": 91115 }, { - "epoch": 3.21, - "learning_rate": 1.5082352454065274e-05, - "loss": 0.2581, + "epoch": 3.2839586261577827, + "grad_norm": 0.24445310235023499, + "learning_rate": 1.3934975451432346e-05, + "loss": 0.3805, "step": 91120 }, { - "epoch": 3.21, - "learning_rate": 1.5079737546793237e-05, - "loss": 0.2452, + "epoch": 3.2841388258190074, + "grad_norm": 0.24390850961208344, + "learning_rate": 1.3932358781894996e-05, + "loss": 0.3939, "step": 91125 }, { - "epoch": 3.21, - "learning_rate": 1.5077122768329282e-05, - "loss": 0.2548, + "epoch": 3.284319025480232, + "grad_norm": 0.19269582629203796, + "learning_rate": 1.3929742263152115e-05, + "loss": 0.3994, "step": 91130 }, { - "epoch": 3.21, - "learning_rate": 1.5074508118707334e-05, - "loss": 0.2566, + "epoch": 3.284499225141457, + "grad_norm": 0.2648945748806, + "learning_rate": 1.3927125895239352e-05, + "loss": 0.3578, "step": 91135 }, { - "epoch": 3.21, - "learning_rate": 1.5071893597961367e-05, - "loss": 0.2787, + "epoch": 3.2846794248026816, + "grad_norm": 0.18639974296092987, + "learning_rate": 1.3924509678192343e-05, + "loss": 0.3366, "step": 91140 }, { - "epoch": 3.21, - "learning_rate": 1.506927920612532e-05, - "loss": 0.2527, + "epoch": 3.284859624463906, + "grad_norm": 0.1827782690525055, + "learning_rate": 1.3921893612046757e-05, + "loss": 0.3588, "step": 91145 }, { - "epoch": 3.21, - "learning_rate": 1.5066664943233128e-05, - "loss": 0.2547, + "epoch": 3.2850398241251306, + "grad_norm": 0.20156405866146088, + "learning_rate": 1.3919277696838227e-05, + "loss": 0.3694, "step": 91150 }, { - "epoch": 3.21, - "learning_rate": 1.506405080931875e-05, - "loss": 0.236, + "epoch": 3.2852200237863554, + "grad_norm": 0.1734452098608017, + "learning_rate": 1.391666193260239e-05, + "loss": 0.3947, "step": 91155 }, { - "epoch": 3.21, - "learning_rate": 1.5061436804416129e-05, - "loss": 0.2815, + "epoch": 3.28540022344758, + "grad_norm": 0.2353818714618683, + "learning_rate": 1.3914046319374891e-05, + "loss": 0.3996, "step": 91160 }, { - "epoch": 3.21, - "learning_rate": 1.5058822928559202e-05, - "loss": 0.2583, + "epoch": 3.2855804231088044, + "grad_norm": 0.23739945888519287, + "learning_rate": 1.3911430857191351e-05, + "loss": 0.3798, "step": 91165 }, { - "epoch": 3.21, - "learning_rate": 1.5056209181781896e-05, - "loss": 0.2591, + "epoch": 
3.285760622770029, + "grad_norm": 0.20489168167114258, + "learning_rate": 1.3908815546087434e-05, + "loss": 0.3993, "step": 91170 }, { - "epoch": 3.21, - "learning_rate": 1.5053595564118172e-05, - "loss": 0.2473, + "epoch": 3.285940822431254, + "grad_norm": 0.22435033321380615, + "learning_rate": 1.3906200386098753e-05, + "loss": 0.3806, "step": 91175 }, { - "epoch": 3.21, - "learning_rate": 1.5050982075601949e-05, - "loss": 0.2496, + "epoch": 3.2861210220924786, + "grad_norm": 0.227759450674057, + "learning_rate": 1.3903585377260947e-05, + "loss": 0.3604, "step": 91180 }, { - "epoch": 3.21, - "learning_rate": 1.5048368716267175e-05, - "loss": 0.2478, + "epoch": 3.2863012217537033, + "grad_norm": 0.23122897744178772, + "learning_rate": 1.390097051960964e-05, + "loss": 0.3973, "step": 91185 }, { - "epoch": 3.21, - "learning_rate": 1.5045755486147761e-05, - "loss": 0.2934, + "epoch": 3.2864814214149276, + "grad_norm": 0.2522181570529938, + "learning_rate": 1.3898355813180453e-05, + "loss": 0.3383, "step": 91190 }, { - "epoch": 3.21, - "learning_rate": 1.5043142385277664e-05, - "loss": 0.2542, + "epoch": 3.2866616210761523, + "grad_norm": 0.3048706650733948, + "learning_rate": 1.3895741258009038e-05, + "loss": 0.3729, "step": 91195 }, { - "epoch": 3.21, - "learning_rate": 1.5040529413690802e-05, - "loss": 0.2574, + "epoch": 3.286841820737377, + "grad_norm": 0.20635949075222015, + "learning_rate": 1.3893126854130985e-05, + "loss": 0.3899, "step": 91200 }, { - "epoch": 3.21, - "learning_rate": 1.5037916571421094e-05, - "loss": 0.2496, + "epoch": 3.287022020398602, + "grad_norm": 0.2326965183019638, + "learning_rate": 1.3890512601581923e-05, + "loss": 0.3528, "step": 91205 }, { - "epoch": 3.21, - "learning_rate": 1.5035303858502474e-05, - "loss": 0.2459, + "epoch": 3.287202220059826, + "grad_norm": 0.22335182130336761, + "learning_rate": 1.3887898500397484e-05, + "loss": 0.3865, "step": 91210 }, { - "epoch": 3.21, - "learning_rate": 1.5032691274968868e-05, - "loss": 0.2913, + "epoch": 3.287382419721051, + "grad_norm": 0.2084750384092331, + "learning_rate": 1.3885284550613264e-05, + "loss": 0.3724, "step": 91215 }, { - "epoch": 3.21, - "learning_rate": 1.5030078820854205e-05, - "loss": 0.28, + "epoch": 3.2875626193822756, + "grad_norm": 0.18253043293952942, + "learning_rate": 1.3882670752264915e-05, + "loss": 0.3941, "step": 91220 }, { - "epoch": 3.21, - "learning_rate": 1.5027466496192393e-05, - "loss": 0.2536, + "epoch": 3.2877428190435003, + "grad_norm": 0.20223382115364075, + "learning_rate": 1.388005710538801e-05, + "loss": 0.3569, "step": 91225 }, { - "epoch": 3.21, - "learning_rate": 1.5024854301017349e-05, - "loss": 0.2663, + "epoch": 3.287923018704725, + "grad_norm": 0.20182040333747864, + "learning_rate": 1.3877443610018168e-05, + "loss": 0.3787, "step": 91230 }, { - "epoch": 3.21, - "learning_rate": 1.5022242235363004e-05, - "loss": 0.2726, + "epoch": 3.2881032183659493, + "grad_norm": 0.2429923266172409, + "learning_rate": 1.3874830266191014e-05, + "loss": 0.3608, "step": 91235 }, { - "epoch": 3.21, - "learning_rate": 1.501963029926327e-05, - "loss": 0.2587, + "epoch": 3.288283418027174, + "grad_norm": 0.23385964334011078, + "learning_rate": 1.387221707394214e-05, + "loss": 0.3892, "step": 91240 }, { - "epoch": 3.21, - "learning_rate": 1.501701849275206e-05, - "loss": 0.2533, + "epoch": 3.288463617688399, + "grad_norm": 0.20496410131454468, + "learning_rate": 1.3869604033307154e-05, + "loss": 0.3716, "step": 91245 }, { - "epoch": 3.21, - "learning_rate": 1.501440681586328e-05, - 
"loss": 0.2719, + "epoch": 3.2886438173496235, + "grad_norm": 0.20968787372112274, + "learning_rate": 1.3866991144321661e-05, + "loss": 0.3497, "step": 91250 }, { - "epoch": 3.21, - "learning_rate": 1.5011795268630852e-05, - "loss": 0.2614, + "epoch": 3.288824017010848, + "grad_norm": 0.2523500323295593, + "learning_rate": 1.3864378407021244e-05, + "loss": 0.3817, "step": 91255 }, { - "epoch": 3.21, - "learning_rate": 1.5009183851088677e-05, - "loss": 0.2336, + "epoch": 3.2890042166720725, + "grad_norm": 0.1904435008764267, + "learning_rate": 1.386176582144153e-05, + "loss": 0.3555, "step": 91260 }, { - "epoch": 3.21, - "learning_rate": 1.5006572563270665e-05, - "loss": 0.26, + "epoch": 3.2891844163332973, + "grad_norm": 0.2404657006263733, + "learning_rate": 1.3859153387618098e-05, + "loss": 0.3649, "step": 91265 }, { - "epoch": 3.21, - "learning_rate": 1.5003961405210723e-05, - "loss": 0.2684, + "epoch": 3.289364615994522, + "grad_norm": 0.23486708104610443, + "learning_rate": 1.3856541105586545e-05, + "loss": 0.3715, "step": 91270 }, { - "epoch": 3.21, - "learning_rate": 1.5001350376942758e-05, - "loss": 0.2504, + "epoch": 3.2895448156557467, + "grad_norm": 0.20871885120868683, + "learning_rate": 1.3853928975382464e-05, + "loss": 0.3904, "step": 91275 }, { - "epoch": 3.21, - "learning_rate": 1.499873947850067e-05, - "loss": 0.2491, + "epoch": 3.289725015316971, + "grad_norm": 0.23163028061389923, + "learning_rate": 1.3851316997041438e-05, + "loss": 0.3893, "step": 91280 }, { - "epoch": 3.21, - "learning_rate": 1.499612870991835e-05, - "loss": 0.2461, + "epoch": 3.2899052149781958, + "grad_norm": 0.32008373737335205, + "learning_rate": 1.3848705170599053e-05, + "loss": 0.3878, "step": 91285 }, { - "epoch": 3.21, - "learning_rate": 1.4993518071229712e-05, - "loss": 0.2483, + "epoch": 3.2900854146394205, + "grad_norm": 0.26068592071533203, + "learning_rate": 1.3846093496090918e-05, + "loss": 0.3768, "step": 91290 }, { - "epoch": 3.21, - "learning_rate": 1.4990907562468647e-05, - "loss": 0.2515, + "epoch": 3.2902656143006452, + "grad_norm": 0.2309846431016922, + "learning_rate": 1.384348197355258e-05, + "loss": 0.4094, "step": 91295 }, { - "epoch": 3.21, - "learning_rate": 1.4988297183669054e-05, - "loss": 0.2458, + "epoch": 3.2904458139618695, + "grad_norm": 0.250918447971344, + "learning_rate": 1.3840870603019655e-05, + "loss": 0.3681, "step": 91300 }, { - "epoch": 3.21, - "learning_rate": 1.4985686934864812e-05, - "loss": 0.2697, + "epoch": 3.2906260136230943, + "grad_norm": 0.22328318655490875, + "learning_rate": 1.3838259384527702e-05, + "loss": 0.3665, "step": 91305 }, { - "epoch": 3.21, - "learning_rate": 1.498307681608984e-05, - "loss": 0.2606, + "epoch": 3.290806213284319, + "grad_norm": 0.25075024366378784, + "learning_rate": 1.3835648318112307e-05, + "loss": 0.3973, "step": 91310 }, { - "epoch": 3.21, - "learning_rate": 1.4980466827378009e-05, - "loss": 0.2456, + "epoch": 3.2909864129455437, + "grad_norm": 0.20350109040737152, + "learning_rate": 1.383303740380904e-05, + "loss": 0.3423, "step": 91315 }, { - "epoch": 3.21, - "learning_rate": 1.4977856968763212e-05, - "loss": 0.2617, + "epoch": 3.2911666126067685, + "grad_norm": 0.2606266438961029, + "learning_rate": 1.383042664165348e-05, + "loss": 0.3894, "step": 91320 }, { - "epoch": 3.21, - "learning_rate": 1.4975247240279332e-05, - "loss": 0.2436, + "epoch": 3.2913468122679927, + "grad_norm": 0.20949092507362366, + "learning_rate": 1.3827816031681182e-05, + "loss": 0.4185, "step": 91325 }, { - "epoch": 3.21, - "learning_rate": 
1.497263764196027e-05, - "loss": 0.2471, + "epoch": 3.2915270119292175, + "grad_norm": 0.23206953704357147, + "learning_rate": 1.3825205573927736e-05, + "loss": 0.3802, "step": 91330 }, { - "epoch": 3.21, - "learning_rate": 1.4970028173839901e-05, - "loss": 0.2406, + "epoch": 3.291707211590442, + "grad_norm": 0.21940544247627258, + "learning_rate": 1.3822595268428703e-05, + "loss": 0.3655, "step": 91335 }, { - "epoch": 3.21, - "learning_rate": 1.4967418835952095e-05, - "loss": 0.2429, + "epoch": 3.291887411251667, + "grad_norm": 0.1804596483707428, + "learning_rate": 1.3819985115219644e-05, + "loss": 0.3701, "step": 91340 }, { - "epoch": 3.21, - "learning_rate": 1.496480962833075e-05, - "loss": 0.2761, + "epoch": 3.2920676109128912, + "grad_norm": 0.22188700735569, + "learning_rate": 1.3817375114336123e-05, + "loss": 0.3908, "step": 91345 }, { - "epoch": 3.21, - "learning_rate": 1.4962200551009735e-05, - "loss": 0.2409, + "epoch": 3.292247810574116, + "grad_norm": 0.2676689326763153, + "learning_rate": 1.3814765265813695e-05, + "loss": 0.4065, "step": 91350 }, { - "epoch": 3.21, - "learning_rate": 1.4959591604022937e-05, - "loss": 0.2541, + "epoch": 3.2924280102353407, + "grad_norm": 0.20868746936321259, + "learning_rate": 1.381215556968793e-05, + "loss": 0.3471, "step": 91355 }, { - "epoch": 3.21, - "learning_rate": 1.4956982787404228e-05, - "loss": 0.2431, + "epoch": 3.2926082098965654, + "grad_norm": 0.21724528074264526, + "learning_rate": 1.380954602599438e-05, + "loss": 0.382, "step": 91360 }, { - "epoch": 3.21, - "learning_rate": 1.4954374101187465e-05, - "loss": 0.241, + "epoch": 3.29278840955779, + "grad_norm": 0.21302174031734467, + "learning_rate": 1.3806936634768603e-05, + "loss": 0.37, "step": 91365 }, { - "epoch": 3.21, - "learning_rate": 1.4951765545406547e-05, - "loss": 0.2682, + "epoch": 3.292968609219015, + "grad_norm": 0.21098196506500244, + "learning_rate": 1.3804327396046143e-05, + "loss": 0.3938, "step": 91370 }, { - "epoch": 3.21, - "learning_rate": 1.4949157120095324e-05, - "loss": 0.2601, + "epoch": 3.293148808880239, + "grad_norm": 0.20903551578521729, + "learning_rate": 1.3801718309862546e-05, + "loss": 0.3812, "step": 91375 }, { - "epoch": 3.22, - "learning_rate": 1.4946548825287668e-05, - "loss": 0.2532, + "epoch": 3.293329008541464, + "grad_norm": 0.2137085646390915, + "learning_rate": 1.3799109376253378e-05, + "loss": 0.3976, "step": 91380 }, { - "epoch": 3.22, - "learning_rate": 1.4943940661017458e-05, - "loss": 0.2727, + "epoch": 3.2935092082026887, + "grad_norm": 0.2167220115661621, + "learning_rate": 1.3796500595254187e-05, + "loss": 0.3906, "step": 91385 }, { - "epoch": 3.22, - "learning_rate": 1.4941332627318555e-05, - "loss": 0.2476, + "epoch": 3.2936894078639134, + "grad_norm": 0.19560924172401428, + "learning_rate": 1.3793891966900488e-05, + "loss": 0.3904, "step": 91390 }, { - "epoch": 3.22, - "learning_rate": 1.4938724724224817e-05, - "loss": 0.256, + "epoch": 3.2938696075251377, + "grad_norm": 0.24281176924705505, + "learning_rate": 1.379128349122785e-05, + "loss": 0.3647, "step": 91395 }, { - "epoch": 3.22, - "learning_rate": 1.4936116951770095e-05, - "loss": 0.2726, + "epoch": 3.2940498071863624, + "grad_norm": 0.2524743676185608, + "learning_rate": 1.3788675168271791e-05, + "loss": 0.3615, "step": 91400 }, { - "epoch": 3.22, - "learning_rate": 1.4933509309988271e-05, - "loss": 0.2599, + "epoch": 3.294230006847587, + "grad_norm": 0.2531737983226776, + "learning_rate": 1.3786066998067887e-05, + "loss": 0.3995, "step": 91405 }, { - "epoch": 3.22, - 
"learning_rate": 1.49309017989132e-05, - "loss": 0.2461, + "epoch": 3.294410206508812, + "grad_norm": 0.2063448578119278, + "learning_rate": 1.3783458980651637e-05, + "loss": 0.3758, "step": 91410 }, { - "epoch": 3.22, - "learning_rate": 1.4928294418578732e-05, - "loss": 0.2535, + "epoch": 3.2945904061700366, + "grad_norm": 0.23044107854366302, + "learning_rate": 1.3780851116058579e-05, + "loss": 0.3673, "step": 91415 }, { - "epoch": 3.22, - "learning_rate": 1.4925687169018712e-05, - "loss": 0.2698, + "epoch": 3.294770605831261, + "grad_norm": 0.23507073521614075, + "learning_rate": 1.377824340432426e-05, + "loss": 0.3559, "step": 91420 }, { - "epoch": 3.22, - "learning_rate": 1.492308005026702e-05, - "loss": 0.2688, + "epoch": 3.2949508054924856, + "grad_norm": 0.22942350804805756, + "learning_rate": 1.3775635845484203e-05, + "loss": 0.3899, "step": 91425 }, { - "epoch": 3.22, - "learning_rate": 1.4920473062357481e-05, - "loss": 0.2572, + "epoch": 3.2951310051537104, + "grad_norm": 0.282632976770401, + "learning_rate": 1.3773028439573935e-05, + "loss": 0.3621, "step": 91430 }, { - "epoch": 3.22, - "learning_rate": 1.4917866205323966e-05, - "loss": 0.233, + "epoch": 3.295311204814935, + "grad_norm": 0.2871980369091034, + "learning_rate": 1.3770421186628979e-05, + "loss": 0.3672, "step": 91435 }, { - "epoch": 3.22, - "learning_rate": 1.49152594792003e-05, - "loss": 0.2626, + "epoch": 3.2954914044761594, + "grad_norm": 0.27828335762023926, + "learning_rate": 1.3767814086684853e-05, + "loss": 0.4216, "step": 91440 }, { - "epoch": 3.22, - "learning_rate": 1.4912652884020356e-05, - "loss": 0.2667, + "epoch": 3.295671604137384, + "grad_norm": 0.268162339925766, + "learning_rate": 1.3765207139777091e-05, + "loss": 0.4116, "step": 91445 }, { - "epoch": 3.22, - "learning_rate": 1.4910046419817966e-05, - "loss": 0.2822, + "epoch": 3.295851803798609, + "grad_norm": 0.20781943202018738, + "learning_rate": 1.3762600345941213e-05, + "loss": 0.3331, "step": 91450 }, { - "epoch": 3.22, - "learning_rate": 1.490744008662697e-05, - "loss": 0.2621, + "epoch": 3.2960320034598336, + "grad_norm": 0.3062985837459564, + "learning_rate": 1.3759993705212726e-05, + "loss": 0.392, "step": 91455 }, { - "epoch": 3.22, - "learning_rate": 1.4904833884481206e-05, - "loss": 0.2532, + "epoch": 3.2962122031210583, + "grad_norm": 0.1981145143508911, + "learning_rate": 1.3757387217627146e-05, + "loss": 0.3496, "step": 91460 }, { - "epoch": 3.22, - "learning_rate": 1.4902227813414531e-05, - "loss": 0.2704, + "epoch": 3.2963924027822826, + "grad_norm": 0.2891653776168823, + "learning_rate": 1.3754780883219992e-05, + "loss": 0.4288, "step": 91465 }, { - "epoch": 3.22, - "learning_rate": 1.4899621873460778e-05, - "loss": 0.2505, + "epoch": 3.2965726024435074, + "grad_norm": 0.23202994465827942, + "learning_rate": 1.375217470202676e-05, + "loss": 0.4005, "step": 91470 }, { - "epoch": 3.22, - "learning_rate": 1.4897016064653774e-05, - "loss": 0.2427, + "epoch": 3.296752802104732, + "grad_norm": 0.26071614027023315, + "learning_rate": 1.374956867408299e-05, + "loss": 0.4132, "step": 91475 }, { - "epoch": 3.22, - "learning_rate": 1.489441038702735e-05, - "loss": 0.2428, + "epoch": 3.296933001765957, + "grad_norm": 0.22965455055236816, + "learning_rate": 1.3746962799424151e-05, + "loss": 0.3659, "step": 91480 }, { - "epoch": 3.22, - "learning_rate": 1.4891804840615353e-05, - "loss": 0.2644, + "epoch": 3.297113201427181, + "grad_norm": 0.19163905084133148, + "learning_rate": 1.3744357078085773e-05, + "loss": 0.3467, "step": 91485 }, { - 
"epoch": 3.22, - "learning_rate": 1.488919942545161e-05, - "loss": 0.2517, + "epoch": 3.297293401088406, + "grad_norm": 0.21488019824028015, + "learning_rate": 1.3741751510103352e-05, + "loss": 0.3741, "step": 91490 }, { - "epoch": 3.22, - "learning_rate": 1.4886594141569944e-05, - "loss": 0.2629, + "epoch": 3.2974736007496306, + "grad_norm": 0.19963303208351135, + "learning_rate": 1.3739146095512383e-05, + "loss": 0.3876, "step": 91495 }, { - "epoch": 3.22, - "learning_rate": 1.4883988989004194e-05, - "loss": 0.2442, + "epoch": 3.2976538004108553, + "grad_norm": 0.18200761079788208, + "learning_rate": 1.3736540834348372e-05, + "loss": 0.3782, "step": 91500 }, { - "epoch": 3.22, - "eval_loss": 0.25463610887527466, - "eval_runtime": 10.5511, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 3.2976538004108553, + "eval_loss": 0.430542528629303, + "eval_runtime": 3.5303, + "eval_samples_per_second": 28.326, + "eval_steps_per_second": 7.081, "step": 91500 }, { - "epoch": 3.22, - "learning_rate": 1.4881383967788181e-05, - "loss": 0.2399, + "epoch": 3.29783400007208, + "grad_norm": 0.2648855149745941, + "learning_rate": 1.3733935726646812e-05, + "loss": 0.3551, "step": 91505 }, { - "epoch": 3.22, - "learning_rate": 1.4878779077955723e-05, - "loss": 0.2549, + "epoch": 3.2980141997333043, + "grad_norm": 0.26277956366539, + "learning_rate": 1.3731330772443186e-05, + "loss": 0.4238, "step": 91510 }, { - "epoch": 3.22, - "learning_rate": 1.4876174319540647e-05, - "loss": 0.2428, + "epoch": 3.298194399394529, + "grad_norm": 0.2899339199066162, + "learning_rate": 1.3728725971773004e-05, + "loss": 0.3986, "step": 91515 }, { - "epoch": 3.22, - "learning_rate": 1.4873569692576784e-05, - "loss": 0.2622, + "epoch": 3.298374599055754, + "grad_norm": 0.23275549709796906, + "learning_rate": 1.372612132467175e-05, + "loss": 0.3944, "step": 91520 }, { - "epoch": 3.22, - "learning_rate": 1.4870965197097944e-05, - "loss": 0.2583, + "epoch": 3.2985547987169785, + "grad_norm": 0.3062422573566437, + "learning_rate": 1.3723516831174912e-05, + "loss": 0.4204, "step": 91525 }, { - "epoch": 3.22, - "learning_rate": 1.4868360833137945e-05, - "loss": 0.2568, + "epoch": 3.298734998378203, + "grad_norm": 0.17250539362430573, + "learning_rate": 1.3720912491317972e-05, + "loss": 0.3552, "step": 91530 }, { - "epoch": 3.22, - "learning_rate": 1.4865756600730597e-05, - "loss": 0.2574, + "epoch": 3.2989151980394276, + "grad_norm": 0.17668567597866058, + "learning_rate": 1.3718308305136407e-05, + "loss": 0.3811, "step": 91535 }, { - "epoch": 3.22, - "learning_rate": 1.4863152499909722e-05, - "loss": 0.2483, + "epoch": 3.2990953977006523, + "grad_norm": 0.267254501581192, + "learning_rate": 1.3715704272665717e-05, + "loss": 0.3756, "step": 91540 }, { - "epoch": 3.22, - "learning_rate": 1.4861069314018015e-05, - "loss": 0.2347, + "epoch": 3.299275597361877, + "grad_norm": 0.23662297427654266, + "learning_rate": 1.3713100393941369e-05, + "loss": 0.366, "step": 91545 }, { - "epoch": 3.22, - "learning_rate": 1.4858465450137993e-05, - "loss": 0.2609, + "epoch": 3.2994557970231018, + "grad_norm": 0.21058867871761322, + "learning_rate": 1.3710496668998845e-05, + "loss": 0.3798, "step": 91550 }, { - "epoch": 3.22, - "learning_rate": 1.485586171793912e-05, - "loss": 0.2563, + "epoch": 3.299635996684326, + "grad_norm": 0.22281815111637115, + "learning_rate": 1.3707893097873619e-05, + "loss": 0.3874, "step": 91555 }, { - "epoch": 3.22, - "learning_rate": 1.4853258117455202e-05, - "loss": 0.2392, + "epoch": 
3.299816196345551, + "grad_norm": 0.29249441623687744, + "learning_rate": 1.3705289680601152e-05, + "loss": 0.3828, "step": 91560 }, { - "epoch": 3.22, - "learning_rate": 1.4850654648720031e-05, - "loss": 0.2914, + "epoch": 3.2999963960067755, + "grad_norm": 0.2529570460319519, + "learning_rate": 1.3702686417216937e-05, + "loss": 0.3894, "step": 91565 }, { - "epoch": 3.22, - "learning_rate": 1.4848051311767424e-05, - "loss": 0.2669, + "epoch": 3.3001765956680003, + "grad_norm": 0.23126383125782013, + "learning_rate": 1.3700083307756443e-05, + "loss": 0.3985, "step": 91570 }, { - "epoch": 3.22, - "learning_rate": 1.4845448106631182e-05, - "loss": 0.2672, + "epoch": 3.3003567953292245, + "grad_norm": 0.21352465450763702, + "learning_rate": 1.3697480352255105e-05, + "loss": 0.3434, "step": 91575 }, { - "epoch": 3.22, - "learning_rate": 1.4842845033345106e-05, - "loss": 0.2848, + "epoch": 3.3005369949904493, + "grad_norm": 0.2859989404678345, + "learning_rate": 1.3694877550748419e-05, + "loss": 0.3767, "step": 91580 }, { - "epoch": 3.22, - "learning_rate": 1.4840242091942986e-05, - "loss": 0.2635, + "epoch": 3.300717194651674, + "grad_norm": 0.2089170217514038, + "learning_rate": 1.3692274903271828e-05, + "loss": 0.3966, "step": 91585 }, { - "epoch": 3.22, - "learning_rate": 1.4837639282458635e-05, - "loss": 0.2705, + "epoch": 3.3008973943128987, + "grad_norm": 0.24411603808403015, + "learning_rate": 1.368967240986082e-05, + "loss": 0.3803, "step": 91590 }, { - "epoch": 3.22, - "learning_rate": 1.483503660492584e-05, - "loss": 0.2306, + "epoch": 3.3010775939741235, + "grad_norm": 0.2391398698091507, + "learning_rate": 1.3687070070550829e-05, + "loss": 0.361, "step": 91595 }, { - "epoch": 3.22, - "learning_rate": 1.4832434059378392e-05, - "loss": 0.2859, + "epoch": 3.301257793635348, + "grad_norm": 0.23561078310012817, + "learning_rate": 1.3684467885377306e-05, + "loss": 0.373, "step": 91600 }, { - "epoch": 3.22, - "learning_rate": 1.4829831645850084e-05, - "loss": 0.2609, + "epoch": 3.3014379932965725, + "grad_norm": 0.20145799219608307, + "learning_rate": 1.3681865854375728e-05, + "loss": 0.3838, "step": 91605 }, { - "epoch": 3.22, - "learning_rate": 1.4827229364374718e-05, - "loss": 0.2488, + "epoch": 3.3016181929577972, + "grad_norm": 0.26969730854034424, + "learning_rate": 1.3679263977581536e-05, + "loss": 0.388, "step": 91610 }, { - "epoch": 3.22, - "learning_rate": 1.4824627214986072e-05, - "loss": 0.2563, + "epoch": 3.301798392619022, + "grad_norm": 0.28870394825935364, + "learning_rate": 1.3676662255030182e-05, + "loss": 0.4321, "step": 91615 }, { - "epoch": 3.22, - "learning_rate": 1.4822025197717937e-05, - "loss": 0.2649, + "epoch": 3.3019785922802467, + "grad_norm": 0.2489212453365326, + "learning_rate": 1.3674060686757113e-05, + "loss": 0.3873, "step": 91620 }, { - "epoch": 3.22, - "learning_rate": 1.4819423312604091e-05, - "loss": 0.2611, + "epoch": 3.302158791941471, + "grad_norm": 0.2643071115016937, + "learning_rate": 1.367145927279776e-05, + "loss": 0.3973, "step": 91625 }, { - "epoch": 3.22, - "learning_rate": 1.4816821559678324e-05, - "loss": 0.2774, + "epoch": 3.3023389916026957, + "grad_norm": 0.25098204612731934, + "learning_rate": 1.3668858013187597e-05, + "loss": 0.3781, "step": 91630 }, { - "epoch": 3.22, - "learning_rate": 1.4814219938974428e-05, - "loss": 0.258, + "epoch": 3.3025191912639205, + "grad_norm": 0.20655451714992523, + "learning_rate": 1.3666256907962044e-05, + "loss": 0.3913, "step": 91635 }, { - "epoch": 3.22, - "learning_rate": 1.481161845052617e-05, - 
"loss": 0.2518, + "epoch": 3.302699390925145, + "grad_norm": 0.20496463775634766, + "learning_rate": 1.366365595715655e-05, + "loss": 0.3778, "step": 91640 }, { - "epoch": 3.22, - "learning_rate": 1.4809017094367322e-05, - "loss": 0.2379, + "epoch": 3.30287959058637, + "grad_norm": 0.2476882040500641, + "learning_rate": 1.3661055160806546e-05, + "loss": 0.398, "step": 91645 }, { - "epoch": 3.22, - "learning_rate": 1.4806415870531681e-05, - "loss": 0.2669, + "epoch": 3.303059790247594, + "grad_norm": 0.20680633187294006, + "learning_rate": 1.3658454518947475e-05, + "loss": 0.353, "step": 91650 }, { - "epoch": 3.22, - "learning_rate": 1.480381477905301e-05, - "loss": 0.2635, + "epoch": 3.303239989908819, + "grad_norm": 0.21282508969306946, + "learning_rate": 1.3655854031614751e-05, + "loss": 0.3972, "step": 91655 }, { - "epoch": 3.22, - "learning_rate": 1.4801213819965087e-05, - "loss": 0.2717, + "epoch": 3.3034201895700437, + "grad_norm": 0.2705020010471344, + "learning_rate": 1.3653253698843844e-05, + "loss": 0.3935, "step": 91660 }, { - "epoch": 3.23, - "learning_rate": 1.479861299330167e-05, - "loss": 0.2847, + "epoch": 3.3036003892312684, + "grad_norm": 0.32260480523109436, + "learning_rate": 1.3650653520670134e-05, + "loss": 0.3799, "step": 91665 }, { - "epoch": 3.23, - "learning_rate": 1.4796012299096551e-05, - "loss": 0.2471, + "epoch": 3.3037805888924927, + "grad_norm": 0.2574411928653717, + "learning_rate": 1.3648053497129082e-05, + "loss": 0.4028, "step": 91670 }, { - "epoch": 3.23, - "learning_rate": 1.4793411737383488e-05, - "loss": 0.2323, + "epoch": 3.3039607885537174, + "grad_norm": 0.22409643232822418, + "learning_rate": 1.3645453628256105e-05, + "loss": 0.3615, "step": 91675 }, { - "epoch": 3.23, - "learning_rate": 1.479081130819624e-05, - "loss": 0.2785, + "epoch": 3.304140988214942, + "grad_norm": 0.2090173065662384, + "learning_rate": 1.3642853914086617e-05, + "loss": 0.373, "step": 91680 }, { - "epoch": 3.23, - "learning_rate": 1.4788211011568581e-05, - "loss": 0.2688, + "epoch": 3.304321187876167, + "grad_norm": 0.1917799711227417, + "learning_rate": 1.3640254354656062e-05, + "loss": 0.3696, "step": 91685 }, { - "epoch": 3.23, - "learning_rate": 1.478561084753428e-05, - "loss": 0.2751, + "epoch": 3.3045013875373916, + "grad_norm": 0.2533906400203705, + "learning_rate": 1.3637654949999833e-05, + "loss": 0.4081, "step": 91690 }, { - "epoch": 3.23, - "learning_rate": 1.4783010816127088e-05, - "loss": 0.2484, + "epoch": 3.304681587198616, + "grad_norm": 0.19297879934310913, + "learning_rate": 1.3635055700153346e-05, + "loss": 0.3785, "step": 91695 }, { - "epoch": 3.23, - "learning_rate": 1.4780410917380757e-05, - "loss": 0.2553, + "epoch": 3.3048617868598407, + "grad_norm": 0.20561909675598145, + "learning_rate": 1.3632456605152033e-05, + "loss": 0.3966, "step": 91700 }, { - "epoch": 3.23, - "learning_rate": 1.4777811151329068e-05, - "loss": 0.2591, + "epoch": 3.3050419865210654, + "grad_norm": 0.22225363552570343, + "learning_rate": 1.36298576650313e-05, + "loss": 0.4088, "step": 91705 }, { - "epoch": 3.23, - "learning_rate": 1.4775211518005756e-05, - "loss": 0.2395, + "epoch": 3.30522218618229, + "grad_norm": 0.23833619058132172, + "learning_rate": 1.3627258879826554e-05, + "loss": 0.3973, "step": 91710 }, { - "epoch": 3.23, - "learning_rate": 1.4772612017444592e-05, - "loss": 0.2454, + "epoch": 3.3054023858435144, + "grad_norm": 0.23555156588554382, + "learning_rate": 1.3624660249573207e-05, + "loss": 0.3813, "step": 91715 }, { - "epoch": 3.23, - "learning_rate": 
1.477001264967931e-05, - "loss": 0.2699, + "epoch": 3.305582585504739, + "grad_norm": 0.21567942202091217, + "learning_rate": 1.3622061774306647e-05, + "loss": 0.381, "step": 91720 }, { - "epoch": 3.23, - "learning_rate": 1.476741341474368e-05, - "loss": 0.269, + "epoch": 3.305762785165964, + "grad_norm": 0.2811535894870758, + "learning_rate": 1.3619463454062304e-05, + "loss": 0.3864, "step": 91725 }, { - "epoch": 3.23, - "learning_rate": 1.4764814312671446e-05, - "loss": 0.2733, + "epoch": 3.3059429848271886, + "grad_norm": 0.23214657604694366, + "learning_rate": 1.3616865288875568e-05, + "loss": 0.3726, "step": 91730 }, { - "epoch": 3.23, - "learning_rate": 1.4762215343496346e-05, - "loss": 0.2514, + "epoch": 3.3061231844884134, + "grad_norm": 0.2762070596218109, + "learning_rate": 1.3614267278781839e-05, + "loss": 0.3761, "step": 91735 }, { - "epoch": 3.23, - "learning_rate": 1.4759616507252127e-05, - "loss": 0.2639, + "epoch": 3.3063033841496376, + "grad_norm": 0.21005326509475708, + "learning_rate": 1.3611669423816514e-05, + "loss": 0.3974, "step": 91740 }, { - "epoch": 3.23, - "learning_rate": 1.4757017803972551e-05, - "loss": 0.2544, + "epoch": 3.3064835838108624, + "grad_norm": 0.2921294569969177, + "learning_rate": 1.360907172401498e-05, + "loss": 0.3958, "step": 91745 }, { - "epoch": 3.23, - "learning_rate": 1.475441923369135e-05, - "loss": 0.2346, + "epoch": 3.306663783472087, + "grad_norm": 0.24011696875095367, + "learning_rate": 1.3606474179412645e-05, + "loss": 0.3846, "step": 91750 }, { - "epoch": 3.23, - "learning_rate": 1.4751820796442261e-05, - "loss": 0.2813, + "epoch": 3.306843983133312, + "grad_norm": 0.2656587064266205, + "learning_rate": 1.3603876790044906e-05, + "loss": 0.3611, "step": 91755 }, { - "epoch": 3.23, - "learning_rate": 1.4749222492259013e-05, - "loss": 0.2688, + "epoch": 3.307024182794536, + "grad_norm": 0.2405286282300949, + "learning_rate": 1.360127955594712e-05, + "loss": 0.3816, "step": 91760 }, { - "epoch": 3.23, - "learning_rate": 1.4746624321175365e-05, - "loss": 0.2435, + "epoch": 3.307204382455761, + "grad_norm": 0.23016515374183655, + "learning_rate": 1.3598682477154701e-05, + "loss": 0.3776, "step": 91765 }, { - "epoch": 3.23, - "learning_rate": 1.4744026283225045e-05, - "loss": 0.265, + "epoch": 3.3073845821169856, + "grad_norm": 0.24398526549339294, + "learning_rate": 1.3596085553703014e-05, + "loss": 0.3715, "step": 91770 }, { - "epoch": 3.23, - "learning_rate": 1.4741428378441786e-05, - "loss": 0.2716, + "epoch": 3.3075647817782103, + "grad_norm": 0.25272274017333984, + "learning_rate": 1.3593488785627478e-05, + "loss": 0.3561, "step": 91775 }, { - "epoch": 3.23, - "learning_rate": 1.4738830606859309e-05, - "loss": 0.2371, + "epoch": 3.307744981439435, + "grad_norm": 0.22344271838665009, + "learning_rate": 1.3590892172963437e-05, + "loss": 0.3864, "step": 91780 }, { - "epoch": 3.23, - "learning_rate": 1.4736232968511362e-05, - "loss": 0.2622, + "epoch": 3.3079251811006594, + "grad_norm": 0.2128763496875763, + "learning_rate": 1.3588295715746272e-05, + "loss": 0.3617, "step": 91785 }, { - "epoch": 3.23, - "learning_rate": 1.4733635463431661e-05, - "loss": 0.2683, + "epoch": 3.308105380761884, + "grad_norm": 0.24568325281143188, + "learning_rate": 1.3585699414011376e-05, + "loss": 0.3945, "step": 91790 }, { - "epoch": 3.23, - "learning_rate": 1.4731038091653943e-05, - "loss": 0.249, + "epoch": 3.308285580423109, + "grad_norm": 0.230985626578331, + "learning_rate": 1.3583103267794117e-05, + "loss": 0.3645, "step": 91795 }, { - "epoch": 3.23, 
- "learning_rate": 1.4728440853211923e-05, - "loss": 0.2776, + "epoch": 3.3084657800843336, + "grad_norm": 0.25532665848731995, + "learning_rate": 1.3580507277129867e-05, + "loss": 0.3828, "step": 91800 }, { - "epoch": 3.23, - "learning_rate": 1.4725843748139335e-05, - "loss": 0.2587, + "epoch": 3.308645979745558, + "grad_norm": 0.26051023602485657, + "learning_rate": 1.3577911442053992e-05, + "loss": 0.377, "step": 91805 }, { - "epoch": 3.23, - "learning_rate": 1.4723246776469896e-05, - "loss": 0.2436, + "epoch": 3.3088261794067826, + "grad_norm": 0.22641240060329437, + "learning_rate": 1.3575315762601853e-05, + "loss": 0.3672, "step": 91810 }, { - "epoch": 3.23, - "learning_rate": 1.4720649938237313e-05, - "loss": 0.267, + "epoch": 3.3090063790680073, + "grad_norm": 0.29088813066482544, + "learning_rate": 1.357272023880884e-05, + "loss": 0.3662, "step": 91815 }, { - "epoch": 3.23, - "learning_rate": 1.471805323347532e-05, - "loss": 0.2601, + "epoch": 3.309186578729232, + "grad_norm": 0.24642913043498993, + "learning_rate": 1.3570124870710293e-05, + "loss": 0.3872, "step": 91820 }, { - "epoch": 3.23, - "learning_rate": 1.4715456662217639e-05, - "loss": 0.2803, + "epoch": 3.309366778390457, + "grad_norm": 0.20131155848503113, + "learning_rate": 1.3567529658341587e-05, + "loss": 0.3784, "step": 91825 }, { - "epoch": 3.23, - "learning_rate": 1.4712860224497973e-05, - "loss": 0.2677, + "epoch": 3.309546978051681, + "grad_norm": 0.2340930700302124, + "learning_rate": 1.3564934601738075e-05, + "loss": 0.409, "step": 91830 }, { - "epoch": 3.23, - "learning_rate": 1.4710263920350028e-05, - "loss": 0.2623, + "epoch": 3.309727177712906, + "grad_norm": 0.22841165959835052, + "learning_rate": 1.3562339700935114e-05, + "loss": 0.3586, "step": 91835 }, { - "epoch": 3.23, - "learning_rate": 1.4707667749807535e-05, - "loss": 0.2359, + "epoch": 3.3099073773741305, + "grad_norm": 0.2462332546710968, + "learning_rate": 1.355974495596805e-05, + "loss": 0.4212, "step": 91840 }, { - "epoch": 3.23, - "learning_rate": 1.4705071712904189e-05, - "loss": 0.2593, + "epoch": 3.3100875770353553, + "grad_norm": 0.2860056459903717, + "learning_rate": 1.355715036687226e-05, + "loss": 0.3739, "step": 91845 }, { - "epoch": 3.23, - "learning_rate": 1.4702475809673707e-05, - "loss": 0.2581, + "epoch": 3.3102677766965796, + "grad_norm": 0.24574606120586395, + "learning_rate": 1.3554555933683077e-05, + "loss": 0.3489, "step": 91850 }, { - "epoch": 3.23, - "learning_rate": 1.469988004014978e-05, - "loss": 0.2673, + "epoch": 3.3104479763578043, + "grad_norm": 0.20963186025619507, + "learning_rate": 1.3551961656435852e-05, + "loss": 0.3463, "step": 91855 }, { - "epoch": 3.23, - "learning_rate": 1.4697284404366133e-05, - "loss": 0.2592, + "epoch": 3.310628176019029, + "grad_norm": 0.2524944841861725, + "learning_rate": 1.3549367535165935e-05, + "loss": 0.3816, "step": 91860 }, { - "epoch": 3.23, - "learning_rate": 1.4694688902356458e-05, - "loss": 0.2567, + "epoch": 3.3108083756802538, + "grad_norm": 0.22844867408275604, + "learning_rate": 1.3546773569908661e-05, + "loss": 0.3623, "step": 91865 }, { - "epoch": 3.23, - "learning_rate": 1.469209353415445e-05, - "loss": 0.2353, + "epoch": 3.3109885753414785, + "grad_norm": 0.20473162829875946, + "learning_rate": 1.35441797606994e-05, + "loss": 0.4076, "step": 91870 }, { - "epoch": 3.23, - "learning_rate": 1.4689498299793813e-05, - "loss": 0.258, + "epoch": 3.3111687750027032, + "grad_norm": 0.2806645631790161, + "learning_rate": 1.3541586107573456e-05, + "loss": 0.3582, "step": 91875 
}, { - "epoch": 3.23, - "learning_rate": 1.468690319930825e-05, - "loss": 0.2777, + "epoch": 3.3113489746639275, + "grad_norm": 0.2380189746618271, + "learning_rate": 1.3538992610566175e-05, + "loss": 0.3897, "step": 91880 }, { - "epoch": 3.23, - "learning_rate": 1.4684308232731457e-05, - "loss": 0.2432, + "epoch": 3.3115291743251523, + "grad_norm": 0.20420801639556885, + "learning_rate": 1.3536399269712912e-05, + "loss": 0.3863, "step": 91885 }, { - "epoch": 3.23, - "learning_rate": 1.468171340009712e-05, - "loss": 0.2557, + "epoch": 3.311709373986377, + "grad_norm": 0.27330854535102844, + "learning_rate": 1.3533806085048991e-05, + "loss": 0.3967, "step": 91890 }, { - "epoch": 3.23, - "learning_rate": 1.4679118701438926e-05, - "loss": 0.2605, + "epoch": 3.3118895736476017, + "grad_norm": 0.26601719856262207, + "learning_rate": 1.3531213056609744e-05, + "loss": 0.3883, "step": 91895 }, { - "epoch": 3.23, - "learning_rate": 1.4676524136790584e-05, - "loss": 0.251, + "epoch": 3.312069773308826, + "grad_norm": 0.22995233535766602, + "learning_rate": 1.352862018443049e-05, + "loss": 0.3689, "step": 91900 }, { - "epoch": 3.23, - "learning_rate": 1.4673929706185769e-05, - "loss": 0.2479, + "epoch": 3.3122499729700507, + "grad_norm": 0.26952049136161804, + "learning_rate": 1.3526027468546562e-05, + "loss": 0.3816, "step": 91905 }, { - "epoch": 3.23, - "learning_rate": 1.4671335409658175e-05, - "loss": 0.2499, + "epoch": 3.3124301726312755, + "grad_norm": 0.23863959312438965, + "learning_rate": 1.3523434908993299e-05, + "loss": 0.3797, "step": 91910 }, { - "epoch": 3.23, - "learning_rate": 1.4668741247241475e-05, - "loss": 0.2418, + "epoch": 3.3126103722925, + "grad_norm": 0.2401474565267563, + "learning_rate": 1.3520842505806008e-05, + "loss": 0.3767, "step": 91915 }, { - "epoch": 3.23, - "learning_rate": 1.4666147218969367e-05, - "loss": 0.2717, + "epoch": 3.312790571953725, + "grad_norm": 0.24980275332927704, + "learning_rate": 1.3518250259020021e-05, + "loss": 0.3933, "step": 91920 }, { - "epoch": 3.23, - "learning_rate": 1.4663553324875529e-05, - "loss": 0.2779, + "epoch": 3.3129707716149492, + "grad_norm": 0.2250710129737854, + "learning_rate": 1.3515658168670647e-05, + "loss": 0.4035, "step": 91925 }, { - "epoch": 3.23, - "learning_rate": 1.466095956499363e-05, - "loss": 0.2812, + "epoch": 3.313150971276174, + "grad_norm": 0.23497453331947327, + "learning_rate": 1.3513066234793198e-05, + "loss": 0.3815, "step": 91930 }, { - "epoch": 3.23, - "learning_rate": 1.4658365939357366e-05, - "loss": 0.2558, + "epoch": 3.3133311709373987, + "grad_norm": Infinity, + "learning_rate": 1.3510992800374772e-05, + "loss": 0.3933, "step": 91935 }, { - "epoch": 3.23, - "learning_rate": 1.4655772448000408e-05, - "loss": 0.2468, + "epoch": 3.3135113705986234, + "grad_norm": 0.20314912497997284, + "learning_rate": 1.3508401148235816e-05, + "loss": 0.4087, "step": 91940 }, { - "epoch": 3.23, - "learning_rate": 1.4653179090956425e-05, - "loss": 0.2882, + "epoch": 3.3136915702598477, + "grad_norm": 0.25075605511665344, + "learning_rate": 1.3505809652667658e-05, + "loss": 0.3749, "step": 91945 }, { - "epoch": 3.24, - "learning_rate": 1.4650585868259087e-05, - "loss": 0.2731, + "epoch": 3.3138717699210725, + "grad_norm": 0.2547110617160797, + "learning_rate": 1.3503218313705612e-05, + "loss": 0.4021, "step": 91950 }, { - "epoch": 3.24, - "learning_rate": 1.4647992779942077e-05, - "loss": 0.2524, + "epoch": 3.314051969582297, + "grad_norm": 0.28053396940231323, + "learning_rate": 1.3500627131384996e-05, + "loss": 
0.399, "step": 91955 }, { - "epoch": 3.24, - "learning_rate": 1.4645399826039052e-05, - "loss": 0.264, + "epoch": 3.314232169243522, + "grad_norm": 0.2546713054180145, + "learning_rate": 1.3498036105741113e-05, + "loss": 0.394, "step": 91960 }, { - "epoch": 3.24, - "learning_rate": 1.4642807006583698e-05, - "loss": 0.2514, + "epoch": 3.3144123689047467, + "grad_norm": 0.24178683757781982, + "learning_rate": 1.3495445236809263e-05, + "loss": 0.3621, "step": 91965 }, { - "epoch": 3.24, - "learning_rate": 1.4640214321609653e-05, - "loss": 0.2804, + "epoch": 3.314592568565971, + "grad_norm": 0.2593505084514618, + "learning_rate": 1.3492854524624737e-05, + "loss": 0.3819, "step": 91970 }, { - "epoch": 3.24, - "learning_rate": 1.463762177115061e-05, - "loss": 0.2562, + "epoch": 3.3147727682271957, + "grad_norm": 0.25846415758132935, + "learning_rate": 1.3490263969222838e-05, + "loss": 0.3856, "step": 91975 }, { - "epoch": 3.24, - "learning_rate": 1.4635029355240221e-05, - "loss": 0.2612, + "epoch": 3.3149529678884204, + "grad_norm": 0.20164726674556732, + "learning_rate": 1.348767357063887e-05, + "loss": 0.3679, "step": 91980 }, { - "epoch": 3.24, - "learning_rate": 1.4632437073912136e-05, - "loss": 0.2569, + "epoch": 3.315133167549645, + "grad_norm": 0.22583597898483276, + "learning_rate": 1.348508332890812e-05, + "loss": 0.3902, "step": 91985 }, { - "epoch": 3.24, - "learning_rate": 1.462984492720002e-05, - "loss": 0.2519, + "epoch": 3.3153133672108694, + "grad_norm": 0.223631352186203, + "learning_rate": 1.348249324406588e-05, + "loss": 0.3685, "step": 91990 }, { - "epoch": 3.24, - "learning_rate": 1.4627252915137546e-05, - "loss": 0.2588, + "epoch": 3.315493566872094, + "grad_norm": 0.2747786343097687, + "learning_rate": 1.347990331614744e-05, + "loss": 0.4107, "step": 91995 }, { - "epoch": 3.24, - "learning_rate": 1.4624661037758356e-05, - "loss": 0.2677, + "epoch": 3.315673766533319, + "grad_norm": 0.22893761098384857, + "learning_rate": 1.347731354518808e-05, + "loss": 0.3957, "step": 92000 }, { - "epoch": 3.24, - "eval_loss": 0.2548165023326874, - "eval_runtime": 10.5617, - "eval_samples_per_second": 9.468, - "eval_steps_per_second": 9.468, + "epoch": 3.315673766533319, + "eval_loss": 0.4305948317050934, + "eval_runtime": 3.5291, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 7.084, "step": 92000 }, { - "epoch": 3.24, - "learning_rate": 1.4622069295096096e-05, - "loss": 0.2453, + "epoch": 3.3158539661945436, + "grad_norm": 0.269552081823349, + "learning_rate": 1.3474723931223102e-05, + "loss": 0.3697, "step": 92005 }, { - "epoch": 3.24, - "learning_rate": 1.461947768718443e-05, - "loss": 0.2439, + "epoch": 3.3160341658557684, + "grad_norm": 0.21320883929729462, + "learning_rate": 1.3472134474287777e-05, + "loss": 0.3593, "step": 92010 }, { - "epoch": 3.24, - "learning_rate": 1.4616886214057007e-05, - "loss": 0.2609, + "epoch": 3.3162143655169927, + "grad_norm": 0.25191807746887207, + "learning_rate": 1.3469545174417386e-05, + "loss": 0.4161, "step": 92015 }, { - "epoch": 3.24, - "learning_rate": 1.4614294875747475e-05, - "loss": 0.2477, + "epoch": 3.3163945651782174, + "grad_norm": 0.3065241575241089, + "learning_rate": 1.3466956031647212e-05, + "loss": 0.3889, "step": 92020 }, { - "epoch": 3.24, - "learning_rate": 1.4611703672289484e-05, - "loss": 0.2486, + "epoch": 3.316574764839442, + "grad_norm": 0.22870443761348724, + "learning_rate": 1.3464367046012521e-05, + "loss": 0.3896, "step": 92025 }, { - "epoch": 3.24, - "learning_rate": 1.460911260371666e-05, - "loss": 
0.2642, + "epoch": 3.316754964500667, + "grad_norm": 0.2070244699716568, + "learning_rate": 1.3461778217548603e-05, + "loss": 0.4069, "step": 92030 }, { - "epoch": 3.24, - "learning_rate": 1.4606521670062673e-05, - "loss": 0.2674, + "epoch": 3.316935164161891, + "grad_norm": 0.20733942091464996, + "learning_rate": 1.3459189546290735e-05, + "loss": 0.3833, "step": 92035 }, { - "epoch": 3.24, - "learning_rate": 1.460393087136115e-05, - "loss": 0.2648, + "epoch": 3.317115363823116, + "grad_norm": 0.22954271733760834, + "learning_rate": 1.3456601032274153e-05, + "loss": 0.4083, "step": 92040 }, { - "epoch": 3.24, - "learning_rate": 1.4601340207645726e-05, - "loss": 0.2583, + "epoch": 3.3172955634843406, + "grad_norm": 0.23598207533359528, + "learning_rate": 1.3454012675534156e-05, + "loss": 0.3831, "step": 92045 }, { - "epoch": 3.24, - "learning_rate": 1.4598749678950052e-05, - "loss": 0.2811, + "epoch": 3.3174757631455654, + "grad_norm": 0.2571483850479126, + "learning_rate": 1.3451424476106004e-05, + "loss": 0.3722, "step": 92050 }, { - "epoch": 3.24, - "learning_rate": 1.4596159285307758e-05, - "loss": 0.2561, + "epoch": 3.31765596280679, + "grad_norm": 0.22039219737052917, + "learning_rate": 1.3448836434024955e-05, + "loss": 0.3507, "step": 92055 }, { - "epoch": 3.24, - "learning_rate": 1.4593569026752484e-05, - "loss": 0.2414, + "epoch": 3.3178361624680144, + "grad_norm": 0.2569846510887146, + "learning_rate": 1.3446248549326274e-05, + "loss": 0.3714, "step": 92060 }, { - "epoch": 3.24, - "learning_rate": 1.459097890331785e-05, - "loss": 0.2629, + "epoch": 3.318016362129239, + "grad_norm": 0.24034719169139862, + "learning_rate": 1.3443660822045211e-05, + "loss": 0.3962, "step": 92065 }, { - "epoch": 3.24, - "learning_rate": 1.4588388915037504e-05, - "loss": 0.2519, + "epoch": 3.318196561790464, + "grad_norm": 0.27031049132347107, + "learning_rate": 1.344107325221704e-05, + "loss": 0.3683, "step": 92070 }, { - "epoch": 3.24, - "learning_rate": 1.4585799061945065e-05, - "loss": 0.2487, + "epoch": 3.3183767614516886, + "grad_norm": 0.31037384271621704, + "learning_rate": 1.343848583987701e-05, + "loss": 0.4435, "step": 92075 }, { - "epoch": 3.24, - "learning_rate": 1.4583209344074164e-05, - "loss": 0.2793, + "epoch": 3.318556961112913, + "grad_norm": 0.21680322289466858, + "learning_rate": 1.3435898585060372e-05, + "loss": 0.3767, "step": 92080 }, { - "epoch": 3.24, - "learning_rate": 1.4580619761458414e-05, - "loss": 0.2651, + "epoch": 3.3187371607741376, + "grad_norm": 0.22905753552913666, + "learning_rate": 1.3433311487802375e-05, + "loss": 0.3935, "step": 92085 }, { - "epoch": 3.24, - "learning_rate": 1.4578030314131457e-05, - "loss": 0.2645, + "epoch": 3.3189173604353623, + "grad_norm": 0.27699750661849976, + "learning_rate": 1.343072454813827e-05, + "loss": 0.4165, "step": 92090 }, { - "epoch": 3.24, - "learning_rate": 1.4575441002126913e-05, - "loss": 0.2459, + "epoch": 3.319097560096587, + "grad_norm": 0.19221347570419312, + "learning_rate": 1.3428137766103293e-05, + "loss": 0.3773, "step": 92095 }, { - "epoch": 3.24, - "learning_rate": 1.4572851825478395e-05, - "loss": 0.2574, + "epoch": 3.319277759757812, + "grad_norm": 0.2585064172744751, + "learning_rate": 1.3425551141732712e-05, + "loss": 0.3909, "step": 92100 }, { - "epoch": 3.24, - "learning_rate": 1.4570262784219513e-05, - "loss": 0.2627, + "epoch": 3.3194579594190365, + "grad_norm": 0.21457339823246002, + "learning_rate": 1.3422964675061752e-05, + "loss": 0.3761, "step": 92105 }, { - "epoch": 3.24, - "learning_rate": 
1.4567673878383897e-05, - "loss": 0.2483, + "epoch": 3.319638159080261, + "grad_norm": 0.22903072834014893, + "learning_rate": 1.3420378366125657e-05, + "loss": 0.3808, "step": 92110 }, { - "epoch": 3.24, - "learning_rate": 1.4565085108005172e-05, - "loss": 0.2788, + "epoch": 3.3198183587414856, + "grad_norm": 0.24865801632404327, + "learning_rate": 1.3417792214959668e-05, + "loss": 0.3829, "step": 92115 }, { - "epoch": 3.24, - "learning_rate": 1.4562496473116938e-05, - "loss": 0.2663, + "epoch": 3.3199985584027103, + "grad_norm": 0.22578608989715576, + "learning_rate": 1.3415206221599006e-05, + "loss": 0.3397, "step": 92120 }, { - "epoch": 3.24, - "learning_rate": 1.4559907973752798e-05, - "loss": 0.2607, + "epoch": 3.320178758063935, + "grad_norm": 0.21695365011692047, + "learning_rate": 1.3412620386078933e-05, + "loss": 0.3552, "step": 92125 }, { - "epoch": 3.24, - "learning_rate": 1.4557319609946385e-05, - "loss": 0.254, + "epoch": 3.3203589577251593, + "grad_norm": 0.19830626249313354, + "learning_rate": 1.341003470843466e-05, + "loss": 0.3794, "step": 92130 }, { - "epoch": 3.24, - "learning_rate": 1.4554731381731291e-05, - "loss": 0.2415, + "epoch": 3.320539157386384, + "grad_norm": 0.25142696499824524, + "learning_rate": 1.3407449188701408e-05, + "loss": 0.387, "step": 92135 }, { - "epoch": 3.24, - "learning_rate": 1.4552143289141129e-05, - "loss": 0.2497, + "epoch": 3.320719357047609, + "grad_norm": 0.25763440132141113, + "learning_rate": 1.3404863826914424e-05, + "loss": 0.4225, "step": 92140 }, { - "epoch": 3.24, - "learning_rate": 1.454955533220949e-05, - "loss": 0.2357, + "epoch": 3.3208995567088335, + "grad_norm": 0.2544826567173004, + "learning_rate": 1.3402278623108915e-05, + "loss": 0.3771, "step": 92145 }, { - "epoch": 3.24, - "learning_rate": 1.454696751097e-05, - "loss": 0.2616, + "epoch": 3.3210797563700583, + "grad_norm": 0.24022381007671356, + "learning_rate": 1.3399693577320137e-05, + "loss": 0.3724, "step": 92150 }, { - "epoch": 3.24, - "learning_rate": 1.4544379825456247e-05, - "loss": 0.2607, + "epoch": 3.3212599560312825, + "grad_norm": 0.24866530299186707, + "learning_rate": 1.3397108689583276e-05, + "loss": 0.3865, "step": 92155 }, { - "epoch": 3.24, - "learning_rate": 1.4541792275701821e-05, - "loss": 0.2507, + "epoch": 3.3214401556925073, + "grad_norm": 0.2202133685350418, + "learning_rate": 1.3394523959933559e-05, + "loss": 0.3777, "step": 92160 }, { - "epoch": 3.24, - "learning_rate": 1.453920486174033e-05, - "loss": 0.2505, + "epoch": 3.321620355353732, + "grad_norm": 0.1919008046388626, + "learning_rate": 1.339193938840621e-05, + "loss": 0.3427, "step": 92165 }, { - "epoch": 3.24, - "learning_rate": 1.4536617583605381e-05, - "loss": 0.2799, + "epoch": 3.3218005550149567, + "grad_norm": 0.21456719934940338, + "learning_rate": 1.338935497503644e-05, + "loss": 0.406, "step": 92170 }, { - "epoch": 3.24, - "learning_rate": 1.4534030441330556e-05, - "loss": 0.2376, + "epoch": 3.321980754676181, + "grad_norm": 0.17022041976451874, + "learning_rate": 1.338677071985946e-05, + "loss": 0.3656, "step": 92175 }, { - "epoch": 3.24, - "learning_rate": 1.4531443434949439e-05, - "loss": 0.2609, + "epoch": 3.3221609543374058, + "grad_norm": 0.2558014392852783, + "learning_rate": 1.3384186622910482e-05, + "loss": 0.3812, "step": 92180 }, { - "epoch": 3.24, - "learning_rate": 1.4528856564495642e-05, - "loss": 0.2734, + "epoch": 3.3223411539986305, + "grad_norm": 0.24474850296974182, + "learning_rate": 1.3381602684224703e-05, + "loss": 0.372, "step": 92185 }, { - "epoch": 
3.24, - "learning_rate": 1.4526269830002737e-05, - "loss": 0.2529, + "epoch": 3.3225213536598552, + "grad_norm": 0.2648925483226776, + "learning_rate": 1.3379018903837348e-05, + "loss": 0.3879, "step": 92190 }, { - "epoch": 3.24, - "learning_rate": 1.4523683231504318e-05, - "loss": 0.263, + "epoch": 3.32270155332108, + "grad_norm": 0.2510066330432892, + "learning_rate": 1.3376435281783608e-05, + "loss": 0.3624, "step": 92195 }, { - "epoch": 3.24, - "learning_rate": 1.4521096769033959e-05, - "loss": 0.2631, + "epoch": 3.3228817529823043, + "grad_norm": 0.20649553835391998, + "learning_rate": 1.337385181809869e-05, + "loss": 0.3811, "step": 92200 }, { - "epoch": 3.24, - "learning_rate": 1.451851044262526e-05, - "loss": 0.2638, + "epoch": 3.323061952643529, + "grad_norm": 0.24206553399562836, + "learning_rate": 1.337126851281779e-05, + "loss": 0.3888, "step": 92205 }, { - "epoch": 3.24, - "learning_rate": 1.4515924252311796e-05, - "loss": 0.2381, + "epoch": 3.3232421523047537, + "grad_norm": 0.21972547471523285, + "learning_rate": 1.3368685365976091e-05, + "loss": 0.3822, "step": 92210 }, { - "epoch": 3.24, - "learning_rate": 1.4513338198127142e-05, - "loss": 0.2614, + "epoch": 3.3234223519659785, + "grad_norm": 0.22937068343162537, + "learning_rate": 1.3366102377608817e-05, + "loss": 0.3684, "step": 92215 }, { - "epoch": 3.24, - "learning_rate": 1.4510752280104872e-05, - "loss": 0.2711, + "epoch": 3.3236025516272028, + "grad_norm": 0.23047223687171936, + "learning_rate": 1.3363519547751152e-05, + "loss": 0.3663, "step": 92220 }, { - "epoch": 3.24, - "learning_rate": 1.4508166498278586e-05, - "loss": 0.2606, + "epoch": 3.3237827512884275, + "grad_norm": 0.19826874136924744, + "learning_rate": 1.3360936876438263e-05, + "loss": 0.3945, "step": 92225 }, { - "epoch": 3.24, - "learning_rate": 1.4505580852681843e-05, - "loss": 0.2653, + "epoch": 3.323962950949652, + "grad_norm": 0.2385808378458023, + "learning_rate": 1.3358354363705365e-05, + "loss": 0.4018, "step": 92230 }, { - "epoch": 3.25, - "learning_rate": 1.4502995343348218e-05, - "loss": 0.2511, + "epoch": 3.324143150610877, + "grad_norm": 0.24572807550430298, + "learning_rate": 1.3355772009587634e-05, + "loss": 0.405, "step": 92235 }, { - "epoch": 3.25, - "learning_rate": 1.4500409970311268e-05, - "loss": 0.2755, + "epoch": 3.3243233502721017, + "grad_norm": 0.237590491771698, + "learning_rate": 1.3353189814120253e-05, + "loss": 0.3893, "step": 92240 }, { - "epoch": 3.25, - "learning_rate": 1.4497824733604588e-05, - "loss": 0.2681, + "epoch": 3.324503549933326, + "grad_norm": 0.20121124386787415, + "learning_rate": 1.3350607777338409e-05, + "loss": 0.3707, "step": 92245 }, { - "epoch": 3.25, - "learning_rate": 1.4495239633261728e-05, - "loss": 0.2792, + "epoch": 3.3246837495945507, + "grad_norm": 0.260689377784729, + "learning_rate": 1.334802589927727e-05, + "loss": 0.3887, "step": 92250 }, { - "epoch": 3.25, - "learning_rate": 1.4492654669316263e-05, - "loss": 0.2421, + "epoch": 3.3248639492557754, + "grad_norm": 0.31178975105285645, + "learning_rate": 1.3345444179972027e-05, + "loss": 0.4013, "step": 92255 }, { - "epoch": 3.25, - "learning_rate": 1.449006984180174e-05, - "loss": 0.2473, + "epoch": 3.325044148917, + "grad_norm": 0.24525390565395355, + "learning_rate": 1.3342862619457858e-05, + "loss": 0.3908, "step": 92260 }, { - "epoch": 3.25, - "learning_rate": 1.4487485150751745e-05, - "loss": 0.2751, + "epoch": 3.3252243485782245, + "grad_norm": 0.20837341248989105, + "learning_rate": 1.3340281217769922e-05, + "loss": 0.3367, "step": 
92265 }, { - "epoch": 3.25, - "learning_rate": 1.4484900596199816e-05, - "loss": 0.2476, + "epoch": 3.325404548239449, + "grad_norm": 0.24146051704883575, + "learning_rate": 1.3337699974943401e-05, + "loss": 0.3905, "step": 92270 }, { - "epoch": 3.25, - "learning_rate": 1.4482316178179534e-05, - "loss": 0.261, + "epoch": 3.325584747900674, + "grad_norm": 0.2158283293247223, + "learning_rate": 1.3335118891013459e-05, + "loss": 0.3847, "step": 92275 }, { - "epoch": 3.25, - "learning_rate": 1.4479731896724436e-05, - "loss": 0.2639, + "epoch": 3.3257649475618987, + "grad_norm": 0.2629423141479492, + "learning_rate": 1.3332537966015251e-05, + "loss": 0.4141, "step": 92280 }, { - "epoch": 3.25, - "learning_rate": 1.4477147751868092e-05, - "loss": 0.2642, + "epoch": 3.3259451472231234, + "grad_norm": 0.22925181686878204, + "learning_rate": 1.3329957199983967e-05, + "loss": 0.3676, "step": 92285 }, { - "epoch": 3.25, - "learning_rate": 1.4474563743644054e-05, - "loss": 0.2661, + "epoch": 3.3261253468843477, + "grad_norm": 0.2732671797275543, + "learning_rate": 1.3327376592954757e-05, + "loss": 0.3693, "step": 92290 }, { - "epoch": 3.25, - "learning_rate": 1.4471979872085856e-05, - "loss": 0.2732, + "epoch": 3.3263055465455724, + "grad_norm": 0.21771977841854095, + "learning_rate": 1.3324796144962781e-05, + "loss": 0.3854, "step": 92295 }, { - "epoch": 3.25, - "learning_rate": 1.4469396137227076e-05, - "loss": 0.2934, + "epoch": 3.326485746206797, + "grad_norm": 0.19893218576908112, + "learning_rate": 1.3322215856043196e-05, + "loss": 0.3719, "step": 92300 }, { - "epoch": 3.25, - "learning_rate": 1.4466812539101242e-05, - "loss": 0.2657, + "epoch": 3.326665945868022, + "grad_norm": 0.253020703792572, + "learning_rate": 1.331963572623115e-05, + "loss": 0.3934, "step": 92305 }, { - "epoch": 3.25, - "learning_rate": 1.446422907774191e-05, - "loss": 0.2484, + "epoch": 3.326846145529246, + "grad_norm": 0.272172749042511, + "learning_rate": 1.3317055755561817e-05, + "loss": 0.4027, "step": 92310 }, { - "epoch": 3.25, - "learning_rate": 1.4461645753182607e-05, - "loss": 0.2537, + "epoch": 3.327026345190471, + "grad_norm": 0.22048524022102356, + "learning_rate": 1.3314475944070345e-05, + "loss": 0.383, "step": 92315 }, { - "epoch": 3.25, - "learning_rate": 1.44590625654569e-05, - "loss": 0.2774, + "epoch": 3.3272065448516956, + "grad_norm": 0.188111811876297, + "learning_rate": 1.3311896291791859e-05, + "loss": 0.3955, "step": 92320 }, { - "epoch": 3.25, - "learning_rate": 1.445647951459831e-05, - "loss": 0.2674, + "epoch": 3.3273867445129204, + "grad_norm": 0.21891506016254425, + "learning_rate": 1.3309316798761529e-05, + "loss": 0.3913, "step": 92325 }, { - "epoch": 3.25, - "learning_rate": 1.445389660064039e-05, - "loss": 0.2582, + "epoch": 3.327566944174145, + "grad_norm": 0.24252718687057495, + "learning_rate": 1.3306737465014485e-05, + "loss": 0.3619, "step": 92330 }, { - "epoch": 3.25, - "learning_rate": 1.445131382361667e-05, - "loss": 0.2502, + "epoch": 3.3277471438353694, + "grad_norm": 0.23667490482330322, + "learning_rate": 1.33041582905859e-05, + "loss": 0.3844, "step": 92335 }, { - "epoch": 3.25, - "learning_rate": 1.4448731183560694e-05, - "loss": 0.2593, + "epoch": 3.327927343496594, + "grad_norm": 0.21754367649555206, + "learning_rate": 1.3301579275510887e-05, + "loss": 0.3438, "step": 92340 }, { - "epoch": 3.25, - "learning_rate": 1.4446148680505989e-05, - "loss": 0.2414, + "epoch": 3.328107543157819, + "grad_norm": 0.2473175972700119, + "learning_rate": 1.3299000419824578e-05, + "loss": 
0.4039, "step": 92345 }, { - "epoch": 3.25, - "learning_rate": 1.444356631448609e-05, - "loss": 0.2283, + "epoch": 3.3282877428190436, + "grad_norm": 0.2296813279390335, + "learning_rate": 1.3296421723562133e-05, + "loss": 0.3446, "step": 92350 }, { - "epoch": 3.25, - "learning_rate": 1.4440984085534514e-05, - "loss": 0.272, + "epoch": 3.328467942480268, + "grad_norm": 0.24328921735286713, + "learning_rate": 1.3293843186758675e-05, + "loss": 0.3843, "step": 92355 }, { - "epoch": 3.25, - "learning_rate": 1.4438401993684811e-05, - "loss": 0.2576, + "epoch": 3.3286481421414926, + "grad_norm": 0.23065999150276184, + "learning_rate": 1.329126480944934e-05, + "loss": 0.3811, "step": 92360 }, { - "epoch": 3.25, - "learning_rate": 1.4435820038970498e-05, - "loss": 0.2794, + "epoch": 3.3288283418027174, + "grad_norm": 0.2626056373119354, + "learning_rate": 1.3288686591669255e-05, + "loss": 0.3911, "step": 92365 }, { - "epoch": 3.25, - "learning_rate": 1.44332382214251e-05, - "loss": 0.2481, + "epoch": 3.329008541463942, + "grad_norm": 0.2469651997089386, + "learning_rate": 1.3286108533453537e-05, + "loss": 0.3647, "step": 92370 }, { - "epoch": 3.25, - "learning_rate": 1.4430656541082133e-05, - "loss": 0.2631, + "epoch": 3.329188741125167, + "grad_norm": 0.23114638030529022, + "learning_rate": 1.3283530634837333e-05, + "loss": 0.374, "step": 92375 }, { - "epoch": 3.25, - "learning_rate": 1.4428074997975132e-05, - "loss": 0.2682, + "epoch": 3.3293689407863916, + "grad_norm": 0.209586039185524, + "learning_rate": 1.3280952895855753e-05, + "loss": 0.3747, "step": 92380 }, { - "epoch": 3.25, - "learning_rate": 1.4425493592137601e-05, - "loss": 0.2555, + "epoch": 3.329549140447616, + "grad_norm": 0.25766298174858093, + "learning_rate": 1.3278375316543924e-05, + "loss": 0.3928, "step": 92385 }, { - "epoch": 3.25, - "learning_rate": 1.4422912323603066e-05, - "loss": 0.2567, + "epoch": 3.3297293401088406, + "grad_norm": 0.20684154331684113, + "learning_rate": 1.3275797896936958e-05, + "loss": 0.3196, "step": 92390 }, { - "epoch": 3.25, - "learning_rate": 1.4420331192405053e-05, - "loss": 0.2441, + "epoch": 3.3299095397700653, + "grad_norm": 0.27292490005493164, + "learning_rate": 1.3273220637069971e-05, + "loss": 0.3889, "step": 92395 }, { - "epoch": 3.25, - "learning_rate": 1.4417750198577069e-05, - "loss": 0.2532, + "epoch": 3.33008973943129, + "grad_norm": 0.2662750780582428, + "learning_rate": 1.327064353697809e-05, + "loss": 0.385, "step": 92400 }, { - "epoch": 3.25, - "learning_rate": 1.4415169342152624e-05, - "loss": 0.2683, + "epoch": 3.3302699390925143, + "grad_norm": 0.23216348886489868, + "learning_rate": 1.326806659669643e-05, + "loss": 0.3605, "step": 92405 }, { - "epoch": 3.25, - "learning_rate": 1.4412588623165218e-05, - "loss": 0.2695, + "epoch": 3.330450138753739, + "grad_norm": 0.2415631115436554, + "learning_rate": 1.3265489816260071e-05, + "loss": 0.4063, "step": 92410 }, { - "epoch": 3.25, - "learning_rate": 1.4410008041648388e-05, - "loss": 0.257, + "epoch": 3.330630338414964, + "grad_norm": 0.2386670559644699, + "learning_rate": 1.3262913195704152e-05, + "loss": 0.405, "step": 92415 }, { - "epoch": 3.25, - "learning_rate": 1.4407427597635616e-05, - "loss": 0.2384, + "epoch": 3.3308105380761885, + "grad_norm": 0.29892489314079285, + "learning_rate": 1.3260336735063766e-05, + "loss": 0.3817, "step": 92420 }, { - "epoch": 3.25, - "learning_rate": 1.4404847291160423e-05, - "loss": 0.2613, + "epoch": 3.3309907377374133, + "grad_norm": 0.2595389485359192, + "learning_rate": 
1.325776043437401e-05, + "loss": 0.4289, "step": 92425 }, { - "epoch": 3.25, - "learning_rate": 1.4402267122256291e-05, - "loss": 0.2498, + "epoch": 3.3311709373986376, + "grad_norm": 0.23359209299087524, + "learning_rate": 1.325518429367002e-05, + "loss": 0.3704, "step": 92430 }, { - "epoch": 3.25, - "learning_rate": 1.4399687090956746e-05, - "loss": 0.2649, + "epoch": 3.3313511370598623, + "grad_norm": 0.22734783589839935, + "learning_rate": 1.3252608312986844e-05, + "loss": 0.3439, "step": 92435 }, { - "epoch": 3.25, - "learning_rate": 1.4397107197295268e-05, - "loss": 0.2418, + "epoch": 3.331531336721087, + "grad_norm": 0.21022412180900574, + "learning_rate": 1.3250032492359618e-05, + "loss": 0.3662, "step": 92440 }, { - "epoch": 3.25, - "learning_rate": 1.4394527441305378e-05, - "loss": 0.2674, + "epoch": 3.3317115363823118, + "grad_norm": 0.2911469042301178, + "learning_rate": 1.3247456831823423e-05, + "loss": 0.3742, "step": 92445 }, { - "epoch": 3.25, - "learning_rate": 1.439194782302055e-05, - "loss": 0.2477, + "epoch": 3.331891736043536, + "grad_norm": 0.24090684950351715, + "learning_rate": 1.3244881331413356e-05, + "loss": 0.371, "step": 92450 }, { - "epoch": 3.25, - "learning_rate": 1.4389368342474296e-05, - "loss": 0.2359, + "epoch": 3.332071935704761, + "grad_norm": 0.27764245867729187, + "learning_rate": 1.32423059911645e-05, + "loss": 0.3933, "step": 92455 }, { - "epoch": 3.25, - "learning_rate": 1.4386788999700102e-05, - "loss": 0.2487, + "epoch": 3.3322521353659855, + "grad_norm": 0.23873978853225708, + "learning_rate": 1.3239730811111952e-05, + "loss": 0.3861, "step": 92460 }, { - "epoch": 3.25, - "learning_rate": 1.4384209794731463e-05, - "loss": 0.2506, + "epoch": 3.3324323350272103, + "grad_norm": 0.19886447489261627, + "learning_rate": 1.3237155791290784e-05, + "loss": 0.3883, "step": 92465 }, { - "epoch": 3.25, - "learning_rate": 1.4381630727601847e-05, - "loss": 0.255, + "epoch": 3.332612534688435, + "grad_norm": 0.24735817313194275, + "learning_rate": 1.3234580931736102e-05, + "loss": 0.366, "step": 92470 }, { - "epoch": 3.25, - "learning_rate": 1.4379051798344772e-05, - "loss": 0.2906, + "epoch": 3.3327927343496593, + "grad_norm": 0.21200679242610931, + "learning_rate": 1.3232006232482978e-05, + "loss": 0.3777, "step": 92475 }, { - "epoch": 3.25, - "learning_rate": 1.437647300699371e-05, - "loss": 0.2423, + "epoch": 3.332972934010884, + "grad_norm": 0.22038812935352325, + "learning_rate": 1.3229431693566488e-05, + "loss": 0.3966, "step": 92480 }, { - "epoch": 3.25, - "learning_rate": 1.4373894353582135e-05, - "loss": 0.2642, + "epoch": 3.3331531336721087, + "grad_norm": 0.21706204116344452, + "learning_rate": 1.3226857315021712e-05, + "loss": 0.3784, "step": 92485 }, { - "epoch": 3.25, - "learning_rate": 1.4371315838143545e-05, - "loss": 0.2657, + "epoch": 3.3333333333333335, + "grad_norm": 0.26988691091537476, + "learning_rate": 1.3224283096883717e-05, + "loss": 0.3762, "step": 92490 }, { - "epoch": 3.25, - "learning_rate": 1.4368737460711406e-05, - "loss": 0.2582, + "epoch": 3.3335135329945578, + "grad_norm": 0.2401093691587448, + "learning_rate": 1.3221709039187597e-05, + "loss": 0.4154, "step": 92495 }, { - "epoch": 3.25, - "learning_rate": 1.436615922131922e-05, - "loss": 0.2564, + "epoch": 3.3336937326557825, + "grad_norm": 0.18479788303375244, + "learning_rate": 1.321913514196842e-05, + "loss": 0.3788, "step": 92500 }, { - "epoch": 3.25, - "eval_loss": 0.25434496998786926, - "eval_runtime": 10.5432, - "eval_samples_per_second": 9.485, - 
"eval_steps_per_second": 9.485, + "epoch": 3.3336937326557825, + "eval_loss": 0.42996159195899963, + "eval_runtime": 3.5289, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 7.084, "step": 92500 }, { - "epoch": 3.25, - "learning_rate": 1.4363581120000432e-05, - "loss": 0.2536, + "epoch": 3.3338739323170072, + "grad_norm": 0.28573232889175415, + "learning_rate": 1.3216561405261224e-05, + "loss": 0.4017, "step": 92505 }, { - "epoch": 3.25, - "learning_rate": 1.436100315678855e-05, - "loss": 0.2728, + "epoch": 3.334054131978232, + "grad_norm": 0.2534043490886688, + "learning_rate": 1.3213987829101108e-05, + "loss": 0.3497, "step": 92510 }, { - "epoch": 3.25, - "learning_rate": 1.4358425331717023e-05, - "loss": 0.2573, + "epoch": 3.3342343316394567, + "grad_norm": 0.218218594789505, + "learning_rate": 1.3211414413523115e-05, + "loss": 0.4098, "step": 92515 }, { - "epoch": 3.26, - "learning_rate": 1.4355847644819336e-05, - "loss": 0.2449, + "epoch": 3.334414531300681, + "grad_norm": 0.21875832974910736, + "learning_rate": 1.320884115856234e-05, + "loss": 0.3693, "step": 92520 }, { - "epoch": 3.26, - "learning_rate": 1.435327009612894e-05, - "loss": 0.245, + "epoch": 3.3345947309619057, + "grad_norm": 0.2261316180229187, + "learning_rate": 1.320626806425381e-05, + "loss": 0.3835, "step": 92525 }, { - "epoch": 3.26, - "learning_rate": 1.4350692685679323e-05, - "loss": 0.2713, + "epoch": 3.3347749306231305, + "grad_norm": 0.21817557513713837, + "learning_rate": 1.3203695130632587e-05, + "loss": 0.3939, "step": 92530 }, { - "epoch": 3.26, - "learning_rate": 1.4348115413503948e-05, - "loss": 0.2275, + "epoch": 3.334955130284355, + "grad_norm": 0.21988247334957123, + "learning_rate": 1.3201122357733742e-05, + "loss": 0.4075, "step": 92535 }, { - "epoch": 3.26, - "learning_rate": 1.4345538279636273e-05, - "loss": 0.2387, + "epoch": 3.3351353299455795, + "grad_norm": 0.26080963015556335, + "learning_rate": 1.319854974559232e-05, + "loss": 0.4094, "step": 92540 }, { - "epoch": 3.26, - "learning_rate": 1.4342961284109751e-05, - "loss": 0.2323, + "epoch": 3.335315529606804, + "grad_norm": 0.2803710103034973, + "learning_rate": 1.3195977294243372e-05, + "loss": 0.431, "step": 92545 }, { - "epoch": 3.26, - "learning_rate": 1.4340384426957854e-05, - "loss": 0.254, + "epoch": 3.335495729268029, + "grad_norm": 0.21274006366729736, + "learning_rate": 1.3193405003721951e-05, + "loss": 0.3808, "step": 92550 }, { - "epoch": 3.26, - "learning_rate": 1.4337807708214052e-05, - "loss": 0.2489, + "epoch": 3.3356759289292537, + "grad_norm": 0.23850922286510468, + "learning_rate": 1.3190832874063092e-05, + "loss": 0.406, "step": 92555 }, { - "epoch": 3.26, - "learning_rate": 1.4335231127911786e-05, - "loss": 0.2568, + "epoch": 3.3358561285904784, + "grad_norm": 0.22884266078472137, + "learning_rate": 1.3188260905301852e-05, + "loss": 0.373, "step": 92560 }, { - "epoch": 3.26, - "learning_rate": 1.4332654686084507e-05, - "loss": 0.2389, + "epoch": 3.3360363282517027, + "grad_norm": 0.20494328439235687, + "learning_rate": 1.3185689097473278e-05, + "loss": 0.3745, "step": 92565 }, { - "epoch": 3.26, - "learning_rate": 1.4330078382765688e-05, - "loss": 0.2694, + "epoch": 3.3362165279129274, + "grad_norm": 0.24264514446258545, + "learning_rate": 1.3183117450612398e-05, + "loss": 0.3718, "step": 92570 }, { - "epoch": 3.26, - "learning_rate": 1.4327502217988769e-05, - "loss": 0.2535, + "epoch": 3.336396727574152, + "grad_norm": 0.1675427258014679, + "learning_rate": 1.3180545964754257e-05, + "loss": 0.3765, 
"step": 92575 }, { - "epoch": 3.26, - "learning_rate": 1.4324926191787202e-05, - "loss": 0.236, + "epoch": 3.336576927235377, + "grad_norm": 0.25368401408195496, + "learning_rate": 1.3177974639933877e-05, + "loss": 0.3799, "step": 92580 }, { - "epoch": 3.26, - "learning_rate": 1.432235030419442e-05, - "loss": 0.2577, + "epoch": 3.336757126896601, + "grad_norm": 0.30531957745552063, + "learning_rate": 1.3175403476186315e-05, + "loss": 0.4008, "step": 92585 }, { - "epoch": 3.26, - "learning_rate": 1.4319774555243895e-05, - "loss": 0.2554, + "epoch": 3.336937326557826, + "grad_norm": 0.2088593989610672, + "learning_rate": 1.3172832473546593e-05, + "loss": 0.3525, "step": 92590 }, { - "epoch": 3.26, - "learning_rate": 1.4317198944969057e-05, - "loss": 0.2546, + "epoch": 3.3371175262190507, + "grad_norm": 0.28222382068634033, + "learning_rate": 1.3170261632049736e-05, + "loss": 0.3578, "step": 92595 }, { - "epoch": 3.26, - "learning_rate": 1.4314623473403343e-05, - "loss": 0.2744, + "epoch": 3.3372977258802754, + "grad_norm": 0.18309058248996735, + "learning_rate": 1.3167690951730777e-05, + "loss": 0.3696, "step": 92600 }, { - "epoch": 3.26, - "learning_rate": 1.43120481405802e-05, - "loss": 0.2617, + "epoch": 3.3374779255415, + "grad_norm": 0.22721010446548462, + "learning_rate": 1.3165120432624734e-05, + "loss": 0.3963, "step": 92605 }, { - "epoch": 3.26, - "learning_rate": 1.4309472946533075e-05, - "loss": 0.2554, + "epoch": 3.337658125202725, + "grad_norm": 0.30004847049713135, + "learning_rate": 1.3162550074766627e-05, + "loss": 0.4081, "step": 92610 }, { - "epoch": 3.26, - "learning_rate": 1.4306897891295403e-05, - "loss": 0.2573, + "epoch": 3.337838324863949, + "grad_norm": 0.29648730158805847, + "learning_rate": 1.3159979878191503e-05, + "loss": 0.3961, "step": 92615 }, { - "epoch": 3.26, - "learning_rate": 1.4304322974900605e-05, - "loss": 0.2565, + "epoch": 3.338018524525174, + "grad_norm": 0.19478410482406616, + "learning_rate": 1.3157409842934338e-05, + "loss": 0.3629, "step": 92620 }, { - "epoch": 3.26, - "learning_rate": 1.4301748197382131e-05, - "loss": 0.2631, + "epoch": 3.3381987241863986, + "grad_norm": 0.2053857296705246, + "learning_rate": 1.3154839969030186e-05, + "loss": 0.3789, "step": 92625 }, { - "epoch": 3.26, - "learning_rate": 1.429917355877341e-05, - "loss": 0.269, + "epoch": 3.3383789238476234, + "grad_norm": 0.19927579164505005, + "learning_rate": 1.3152270256514044e-05, + "loss": 0.3949, "step": 92630 }, { - "epoch": 3.26, - "learning_rate": 1.4296599059107865e-05, - "loss": 0.2527, + "epoch": 3.3385591235088476, + "grad_norm": 0.2706872224807739, + "learning_rate": 1.3149700705420923e-05, + "loss": 0.3705, "step": 92635 }, { - "epoch": 3.26, - "learning_rate": 1.4294024698418917e-05, - "loss": 0.2742, + "epoch": 3.3387393231700724, + "grad_norm": 0.2197032868862152, + "learning_rate": 1.3147131315785837e-05, + "loss": 0.3917, "step": 92640 }, { - "epoch": 3.26, - "learning_rate": 1.4291450476740018e-05, - "loss": 0.2426, + "epoch": 3.338919522831297, + "grad_norm": 0.2595154643058777, + "learning_rate": 1.3144562087643794e-05, + "loss": 0.3981, "step": 92645 }, { - "epoch": 3.26, - "learning_rate": 1.4288876394104572e-05, - "loss": 0.2568, + "epoch": 3.339099722492522, + "grad_norm": 0.20744380354881287, + "learning_rate": 1.3141993021029786e-05, + "loss": 0.3941, "step": 92650 }, { - "epoch": 3.26, - "learning_rate": 1.4286302450546003e-05, - "loss": 0.2631, + "epoch": 3.3392799221537466, + "grad_norm": 0.2614065408706665, + "learning_rate": 
1.3139424115978838e-05, + "loss": 0.3667, "step": 92655 }, { - "epoch": 3.26, - "learning_rate": 1.4283728646097727e-05, - "loss": 0.2552, + "epoch": 3.339460121814971, + "grad_norm": 0.26152557134628296, + "learning_rate": 1.3136855372525939e-05, + "loss": 0.3953, "step": 92660 }, { - "epoch": 3.26, - "learning_rate": 1.4281154980793187e-05, - "loss": 0.2428, + "epoch": 3.3396403214761956, + "grad_norm": 0.2180803269147873, + "learning_rate": 1.313428679070609e-05, + "loss": 0.3865, "step": 92665 }, { - "epoch": 3.26, - "learning_rate": 1.4278581454665787e-05, - "loss": 0.2668, + "epoch": 3.3398205211374203, + "grad_norm": 0.21196460723876953, + "learning_rate": 1.3131718370554286e-05, + "loss": 0.3598, "step": 92670 }, { - "epoch": 3.26, - "learning_rate": 1.4276008067748941e-05, - "loss": 0.2527, + "epoch": 3.340000720798645, + "grad_norm": 0.23230072855949402, + "learning_rate": 1.312915011210551e-05, + "loss": 0.3827, "step": 92675 }, { - "epoch": 3.26, - "learning_rate": 1.4273434820076053e-05, - "loss": 0.2457, + "epoch": 3.3401809204598694, + "grad_norm": 0.24028299748897552, + "learning_rate": 1.3126582015394778e-05, + "loss": 0.41, "step": 92680 }, { - "epoch": 3.26, - "learning_rate": 1.4270861711680553e-05, - "loss": 0.252, + "epoch": 3.340361120121094, + "grad_norm": 0.29149261116981506, + "learning_rate": 1.3124014080457076e-05, + "loss": 0.3888, "step": 92685 }, { - "epoch": 3.26, - "learning_rate": 1.4268288742595842e-05, - "loss": 0.2754, + "epoch": 3.340541319782319, + "grad_norm": 0.2105678766965866, + "learning_rate": 1.3121446307327362e-05, + "loss": 0.3682, "step": 92690 }, { - "epoch": 3.26, - "learning_rate": 1.4265715912855331e-05, - "loss": 0.2431, + "epoch": 3.3407215194435436, + "grad_norm": 0.29433146119117737, + "learning_rate": 1.3118878696040655e-05, + "loss": 0.3464, "step": 92695 }, { - "epoch": 3.26, - "learning_rate": 1.4263143222492414e-05, - "loss": 0.2656, + "epoch": 3.3409017191047683, + "grad_norm": 0.19459545612335205, + "learning_rate": 1.311631124663191e-05, + "loss": 0.372, "step": 92700 }, { - "epoch": 3.26, - "learning_rate": 1.426057067154052e-05, - "loss": 0.2387, + "epoch": 3.3410819187659926, + "grad_norm": 0.22271791100502014, + "learning_rate": 1.3113743959136137e-05, + "loss": 0.3698, "step": 92705 }, { - "epoch": 3.26, - "learning_rate": 1.4257998260033024e-05, - "loss": 0.2546, + "epoch": 3.3412621184272173, + "grad_norm": 0.24294120073318481, + "learning_rate": 1.3111176833588312e-05, + "loss": 0.3947, "step": 92710 }, { - "epoch": 3.26, - "learning_rate": 1.425542598800335e-05, - "loss": 0.2652, + "epoch": 3.341442318088442, + "grad_norm": 0.2713625729084015, + "learning_rate": 1.3108609870023381e-05, + "loss": 0.3898, "step": 92715 }, { - "epoch": 3.26, - "learning_rate": 1.4252853855484883e-05, - "loss": 0.2448, + "epoch": 3.341622517749667, + "grad_norm": 0.24030660092830658, + "learning_rate": 1.3106043068476348e-05, + "loss": 0.3998, "step": 92720 }, { - "epoch": 3.26, - "learning_rate": 1.4250281862511034e-05, - "loss": 0.2666, + "epoch": 3.341802717410891, + "grad_norm": 0.23403751850128174, + "learning_rate": 1.3103476428982172e-05, + "loss": 0.4276, "step": 92725 }, { - "epoch": 3.26, - "learning_rate": 1.4247710009115192e-05, - "loss": 0.2428, + "epoch": 3.341982917072116, + "grad_norm": 0.23179000616073608, + "learning_rate": 1.3100909951575829e-05, + "loss": 0.3936, "step": 92730 }, { - "epoch": 3.26, - "learning_rate": 1.424513829533074e-05, - "loss": 0.2671, + "epoch": 3.3421631167333405, + "grad_norm": 
0.2009132206439972, + "learning_rate": 1.3098343636292287e-05, + "loss": 0.3873, "step": 92735 }, { - "epoch": 3.26, - "learning_rate": 1.4242566721191087e-05, - "loss": 0.2536, + "epoch": 3.3423433163945653, + "grad_norm": 0.22999610006809235, + "learning_rate": 1.3095777483166493e-05, + "loss": 0.3685, "step": 92740 }, { - "epoch": 3.26, - "learning_rate": 1.4239995286729615e-05, - "loss": 0.2517, + "epoch": 3.34252351605579, + "grad_norm": 0.19379504024982452, + "learning_rate": 1.3093211492233443e-05, + "loss": 0.3747, "step": 92745 }, { - "epoch": 3.26, - "learning_rate": 1.4237423991979715e-05, - "loss": 0.2407, + "epoch": 3.3427037157170143, + "grad_norm": 0.21033187210559845, + "learning_rate": 1.3090645663528075e-05, + "loss": 0.3403, "step": 92750 }, { - "epoch": 3.26, - "learning_rate": 1.423485283697476e-05, - "loss": 0.244, + "epoch": 3.342883915378239, + "grad_norm": 0.19492049515247345, + "learning_rate": 1.3088079997085356e-05, + "loss": 0.3598, "step": 92755 }, { - "epoch": 3.26, - "learning_rate": 1.423228182174816e-05, - "loss": 0.2608, + "epoch": 3.3430641150394638, + "grad_norm": 0.21218262612819672, + "learning_rate": 1.3085514492940243e-05, + "loss": 0.3883, "step": 92760 }, { - "epoch": 3.26, - "learning_rate": 1.4229710946333272e-05, - "loss": 0.2484, + "epoch": 3.3432443147006885, + "grad_norm": 0.21746402978897095, + "learning_rate": 1.3082949151127688e-05, + "loss": 0.3835, "step": 92765 }, { - "epoch": 3.26, - "learning_rate": 1.42271402107635e-05, - "loss": 0.2848, + "epoch": 3.343424514361913, + "grad_norm": 0.22117747366428375, + "learning_rate": 1.308038397168263e-05, + "loss": 0.3615, "step": 92770 }, { - "epoch": 3.26, - "learning_rate": 1.4224569615072203e-05, - "loss": 0.2537, + "epoch": 3.3436047140231375, + "grad_norm": 0.19114363193511963, + "learning_rate": 1.3077818954640048e-05, + "loss": 0.3707, "step": 92775 }, { - "epoch": 3.26, - "learning_rate": 1.4221999159292776e-05, - "loss": 0.2552, + "epoch": 3.3437849136843623, + "grad_norm": 0.19133096933364868, + "learning_rate": 1.307525410003487e-05, + "loss": 0.3634, "step": 92780 }, { - "epoch": 3.26, - "learning_rate": 1.4219428843458588e-05, - "loss": 0.2513, + "epoch": 3.343965113345587, + "grad_norm": 0.21959801018238068, + "learning_rate": 1.3072689407902047e-05, + "loss": 0.3866, "step": 92785 }, { - "epoch": 3.26, - "learning_rate": 1.4216858667603015e-05, - "loss": 0.2473, + "epoch": 3.3441453130068117, + "grad_norm": 0.26673710346221924, + "learning_rate": 1.3070124878276524e-05, + "loss": 0.3999, "step": 92790 }, { - "epoch": 3.26, - "learning_rate": 1.4214288631759413e-05, - "loss": 0.2535, + "epoch": 3.344325512668036, + "grad_norm": 0.27120256423950195, + "learning_rate": 1.3067560511193227e-05, + "loss": 0.3931, "step": 92795 }, { - "epoch": 3.26, - "learning_rate": 1.4211718735961174e-05, - "loss": 0.2803, + "epoch": 3.3445057123292607, + "grad_norm": 0.22337371110916138, + "learning_rate": 1.3064996306687127e-05, + "loss": 0.3907, "step": 92800 }, { - "epoch": 3.27, - "learning_rate": 1.420914898024166e-05, - "loss": 0.2466, + "epoch": 3.3446859119904855, + "grad_norm": 0.22518573701381683, + "learning_rate": 1.3062432264793122e-05, + "loss": 0.3561, "step": 92805 }, { - "epoch": 3.27, - "learning_rate": 1.4206579364634231e-05, - "loss": 0.2617, + "epoch": 3.34486611165171, + "grad_norm": 0.23381398618221283, + "learning_rate": 1.3059868385546181e-05, + "loss": 0.3455, "step": 92810 }, { - "epoch": 3.27, - "learning_rate": 1.4204009889172248e-05, - "loss": 0.2561, + "epoch": 
3.3450463113129345, + "grad_norm": 0.20269441604614258, + "learning_rate": 1.3057304668981218e-05, + "loss": 0.3855, "step": 92815 }, { - "epoch": 3.27, - "learning_rate": 1.4201440553889095e-05, - "loss": 0.2463, + "epoch": 3.3452265109741592, + "grad_norm": 0.20035311579704285, + "learning_rate": 1.3054741115133168e-05, + "loss": 0.3773, "step": 92820 }, { - "epoch": 3.27, - "learning_rate": 1.4198871358818106e-05, - "loss": 0.2531, + "epoch": 3.345406710635384, + "grad_norm": 0.1864159256219864, + "learning_rate": 1.3052177724036957e-05, + "loss": 0.3679, "step": 92825 }, { - "epoch": 3.27, - "learning_rate": 1.4196302303992668e-05, - "loss": 0.2655, + "epoch": 3.3455869102966087, + "grad_norm": 0.22926171123981476, + "learning_rate": 1.3049614495727507e-05, + "loss": 0.348, "step": 92830 }, { - "epoch": 3.27, - "learning_rate": 1.4193733389446113e-05, - "loss": 0.2617, + "epoch": 3.3457671099578334, + "grad_norm": 0.27731531858444214, + "learning_rate": 1.3047051430239744e-05, + "loss": 0.3804, "step": 92835 }, { - "epoch": 3.27, - "learning_rate": 1.4191164615211816e-05, - "loss": 0.2699, + "epoch": 3.3459473096190577, + "grad_norm": 0.2885668873786926, + "learning_rate": 1.3044488527608595e-05, + "loss": 0.397, "step": 92840 }, { - "epoch": 3.27, - "learning_rate": 1.4188595981323124e-05, - "loss": 0.2499, + "epoch": 3.3461275092802825, + "grad_norm": 0.23300670087337494, + "learning_rate": 1.3041925787868978e-05, + "loss": 0.3717, "step": 92845 }, { - "epoch": 3.27, - "learning_rate": 1.418602748781338e-05, - "loss": 0.2351, + "epoch": 3.346307708941507, + "grad_norm": 0.2299395352602005, + "learning_rate": 1.3039363211055806e-05, + "loss": 0.3944, "step": 92850 }, { - "epoch": 3.27, - "learning_rate": 1.4183459134715952e-05, - "loss": 0.2457, + "epoch": 3.346487908602732, + "grad_norm": 0.23562583327293396, + "learning_rate": 1.3036800797203997e-05, + "loss": 0.3838, "step": 92855 }, { - "epoch": 3.27, - "learning_rate": 1.4180890922064177e-05, - "loss": 0.2566, + "epoch": 3.346668108263956, + "grad_norm": 0.18811137974262238, + "learning_rate": 1.3034238546348449e-05, + "loss": 0.4058, "step": 92860 }, { - "epoch": 3.27, - "learning_rate": 1.4178322849891404e-05, - "loss": 0.2558, + "epoch": 3.346848307925181, + "grad_norm": 0.3136909306049347, + "learning_rate": 1.3031676458524092e-05, + "loss": 0.3802, "step": 92865 }, { - "epoch": 3.27, - "learning_rate": 1.4175754918230966e-05, - "loss": 0.256, + "epoch": 3.3470285075864057, + "grad_norm": 0.2185976505279541, + "learning_rate": 1.302911453376583e-05, + "loss": 0.3782, "step": 92870 }, { - "epoch": 3.27, - "learning_rate": 1.417318712711623e-05, - "loss": 0.2528, + "epoch": 3.3472087072476304, + "grad_norm": 0.2267122119665146, + "learning_rate": 1.302655277210856e-05, + "loss": 0.3729, "step": 92875 }, { - "epoch": 3.27, - "learning_rate": 1.4170619476580513e-05, - "loss": 0.2599, + "epoch": 3.347388906908855, + "grad_norm": 0.20721164345741272, + "learning_rate": 1.3023991173587193e-05, + "loss": 0.3868, "step": 92880 }, { - "epoch": 3.27, - "learning_rate": 1.4168051966657176e-05, - "loss": 0.2291, + "epoch": 3.34756910657008, + "grad_norm": 0.21538670361042023, + "learning_rate": 1.3021429738236618e-05, + "loss": 0.3675, "step": 92885 }, { - "epoch": 3.27, - "learning_rate": 1.4165484597379534e-05, - "loss": 0.2616, + "epoch": 3.347749306231304, + "grad_norm": 0.24019305408000946, + "learning_rate": 1.3018868466091755e-05, + "loss": 0.3555, "step": 92890 }, { - "epoch": 3.27, - "learning_rate": 1.4162917368780943e-05, - 
"loss": 0.2558, + "epoch": 3.347929505892529, + "grad_norm": 0.25520575046539307, + "learning_rate": 1.3016307357187496e-05, + "loss": 0.4072, "step": 92895 }, { - "epoch": 3.27, - "learning_rate": 1.416035028089473e-05, - "loss": 0.2478, + "epoch": 3.3481097055537536, + "grad_norm": 0.21573813259601593, + "learning_rate": 1.3013746411558714e-05, + "loss": 0.3914, "step": 92900 }, { - "epoch": 3.27, - "learning_rate": 1.4157783333754226e-05, - "loss": 0.2743, + "epoch": 3.3482899052149784, + "grad_norm": 0.22230984270572662, + "learning_rate": 1.3011185629240321e-05, + "loss": 0.3971, "step": 92905 }, { - "epoch": 3.27, - "learning_rate": 1.4155216527392747e-05, - "loss": 0.2864, + "epoch": 3.3484701048762027, + "grad_norm": 0.21699829399585724, + "learning_rate": 1.3008625010267206e-05, + "loss": 0.373, "step": 92910 }, { - "epoch": 3.27, - "learning_rate": 1.4152649861843648e-05, - "loss": 0.2724, + "epoch": 3.3486503045374274, + "grad_norm": 0.27921581268310547, + "learning_rate": 1.3006064554674252e-05, + "loss": 0.3984, "step": 92915 }, { - "epoch": 3.27, - "learning_rate": 1.415008333714024e-05, - "loss": 0.2614, + "epoch": 3.348830504198652, + "grad_norm": 0.20376195013523102, + "learning_rate": 1.3003504262496347e-05, + "loss": 0.3565, "step": 92920 }, { - "epoch": 3.27, - "learning_rate": 1.4147516953315836e-05, - "loss": 0.2682, + "epoch": 3.349010703859877, + "grad_norm": 0.26665177941322327, + "learning_rate": 1.3000944133768368e-05, + "loss": 0.3716, "step": 92925 }, { - "epoch": 3.27, - "learning_rate": 1.4144950710403788e-05, - "loss": 0.2617, + "epoch": 3.3491909035211016, + "grad_norm": 0.22333024442195892, + "learning_rate": 1.2998384168525208e-05, + "loss": 0.3586, "step": 92930 }, { - "epoch": 3.27, - "learning_rate": 1.414238460843739e-05, - "loss": 0.2711, + "epoch": 3.349371103182326, + "grad_norm": 0.26054447889328003, + "learning_rate": 1.299582436680174e-05, + "loss": 0.4145, "step": 92935 }, { - "epoch": 3.27, - "learning_rate": 1.4139818647449981e-05, - "loss": 0.2508, + "epoch": 3.3495513028435506, + "grad_norm": 0.25818753242492676, + "learning_rate": 1.299326472863284e-05, + "loss": 0.3409, "step": 92940 }, { - "epoch": 3.27, - "learning_rate": 1.413725282747487e-05, - "loss": 0.2638, + "epoch": 3.3497315025047754, + "grad_norm": 0.17389363050460815, + "learning_rate": 1.2990705254053385e-05, + "loss": 0.3626, "step": 92945 }, { - "epoch": 3.27, - "learning_rate": 1.4134687148545362e-05, - "loss": 0.2476, + "epoch": 3.349911702166, + "grad_norm": 0.24069735407829285, + "learning_rate": 1.2988145943098245e-05, + "loss": 0.3704, "step": 92950 }, { - "epoch": 3.27, - "learning_rate": 1.4132121610694792e-05, - "loss": 0.2468, + "epoch": 3.3500919018272244, + "grad_norm": 0.2535139322280884, + "learning_rate": 1.2985586795802279e-05, + "loss": 0.3725, "step": 92955 }, { - "epoch": 3.27, - "learning_rate": 1.412955621395646e-05, - "loss": 0.2461, + "epoch": 3.350272101488449, + "grad_norm": 0.2423035055398941, + "learning_rate": 1.2983027812200382e-05, + "loss": 0.3808, "step": 92960 }, { - "epoch": 3.27, - "learning_rate": 1.4126990958363668e-05, - "loss": 0.2656, + "epoch": 3.350452301149674, + "grad_norm": 0.2165379524230957, + "learning_rate": 1.2980468992327402e-05, + "loss": 0.3942, "step": 92965 }, { - "epoch": 3.27, - "learning_rate": 1.412442584394974e-05, - "loss": 0.2755, + "epoch": 3.3506325008108986, + "grad_norm": 0.27460017800331116, + "learning_rate": 1.2977910336218204e-05, + "loss": 0.3583, "step": 92970 }, { - "epoch": 3.27, - "learning_rate": 
1.4121860870747978e-05, - "loss": 0.2617, + "epoch": 3.3508127004721233, + "grad_norm": 0.2811183035373688, + "learning_rate": 1.2975351843907652e-05, + "loss": 0.369, "step": 92975 }, { - "epoch": 3.27, - "learning_rate": 1.4119296038791683e-05, - "loss": 0.2502, + "epoch": 3.3509929001333476, + "grad_norm": 0.22959589958190918, + "learning_rate": 1.2972793515430592e-05, + "loss": 0.3604, "step": 92980 }, { - "epoch": 3.27, - "learning_rate": 1.4116731348114148e-05, - "loss": 0.2519, + "epoch": 3.3511730997945723, + "grad_norm": 0.2398064136505127, + "learning_rate": 1.2970235350821911e-05, + "loss": 0.3713, "step": 92985 }, { - "epoch": 3.27, - "learning_rate": 1.4114166798748684e-05, - "loss": 0.2423, + "epoch": 3.351353299455797, + "grad_norm": 0.23232856392860413, + "learning_rate": 1.2967677350116425e-05, + "loss": 0.3658, "step": 92990 }, { - "epoch": 3.27, - "learning_rate": 1.4111602390728602e-05, - "loss": 0.2438, + "epoch": 3.351533499117022, + "grad_norm": 0.2607458829879761, + "learning_rate": 1.2965119513349016e-05, + "loss": 0.3745, "step": 92995 }, { - "epoch": 3.27, - "learning_rate": 1.4109038124087181e-05, - "loss": 0.2675, + "epoch": 3.351713698778246, + "grad_norm": 0.17985029518604279, + "learning_rate": 1.2962561840554524e-05, + "loss": 0.3852, "step": 93000 }, { - "epoch": 3.27, - "eval_loss": 0.2542540431022644, - "eval_runtime": 10.5543, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, + "epoch": 3.351713698778246, + "eval_loss": 0.4302482306957245, + "eval_runtime": 3.5321, + "eval_samples_per_second": 28.312, + "eval_steps_per_second": 7.078, "step": 93000 }, { - "epoch": 3.27, - "learning_rate": 1.4106473998857717e-05, - "loss": 0.2569, + "epoch": 3.351893898439471, + "grad_norm": 0.22021250426769257, + "learning_rate": 1.2960004331767786e-05, + "loss": 0.36, "step": 93005 }, { - "epoch": 3.27, - "learning_rate": 1.4103910015073518e-05, - "loss": 0.2748, + "epoch": 3.3520740981006956, + "grad_norm": 0.24476410448551178, + "learning_rate": 1.2957446987023678e-05, + "loss": 0.4055, "step": 93010 }, { - "epoch": 3.27, - "learning_rate": 1.4101346172767871e-05, - "loss": 0.2652, + "epoch": 3.3522542977619203, + "grad_norm": 0.20762693881988525, + "learning_rate": 1.2954889806357012e-05, + "loss": 0.3946, "step": 93015 }, { - "epoch": 3.27, - "learning_rate": 1.4098782471974057e-05, - "loss": 0.2259, + "epoch": 3.352434497423145, + "grad_norm": 0.30094394087791443, + "learning_rate": 1.2952332789802631e-05, + "loss": 0.3695, "step": 93020 }, { - "epoch": 3.27, - "learning_rate": 1.4096218912725356e-05, - "loss": 0.2508, + "epoch": 3.3526146970843693, + "grad_norm": 0.27970486879348755, + "learning_rate": 1.2949775937395392e-05, + "loss": 0.3885, "step": 93025 }, { - "epoch": 3.27, - "learning_rate": 1.4093655495055081e-05, - "loss": 0.2728, + "epoch": 3.352794896745594, + "grad_norm": 0.2970288395881653, + "learning_rate": 1.2947219249170128e-05, + "loss": 0.3899, "step": 93030 }, { - "epoch": 3.27, - "learning_rate": 1.4091092218996502e-05, - "loss": 0.2435, + "epoch": 3.352975096406819, + "grad_norm": 0.18664143979549408, + "learning_rate": 1.2944662725161663e-05, + "loss": 0.3715, "step": 93035 }, { - "epoch": 3.27, - "learning_rate": 1.408852908458289e-05, - "loss": 0.2542, + "epoch": 3.3531552960680435, + "grad_norm": 0.2144254446029663, + "learning_rate": 1.2942106365404839e-05, + "loss": 0.3757, "step": 93040 }, { - "epoch": 3.27, - "learning_rate": 1.4085966091847535e-05, - "loss": 0.2551, + "epoch": 3.353335495729268, + "grad_norm": 
0.24365659058094025, + "learning_rate": 1.2939550169934466e-05, + "loss": 0.3741, "step": 93045 }, { - "epoch": 3.27, - "learning_rate": 1.4083403240823732e-05, - "loss": 0.2636, + "epoch": 3.3535156953904925, + "grad_norm": 0.2775290012359619, + "learning_rate": 1.29369941387854e-05, + "loss": 0.3854, "step": 93050 }, { - "epoch": 3.27, - "learning_rate": 1.408084053154474e-05, - "loss": 0.2521, + "epoch": 3.3536958950517173, + "grad_norm": 0.20349471271038055, + "learning_rate": 1.2934438271992452e-05, + "loss": 0.3809, "step": 93055 }, { - "epoch": 3.27, - "learning_rate": 1.407827796404384e-05, - "loss": 0.2496, + "epoch": 3.353876094712942, + "grad_norm": 0.27783820033073425, + "learning_rate": 1.2931882569590448e-05, + "loss": 0.353, "step": 93060 }, { - "epoch": 3.27, - "learning_rate": 1.407571553835429e-05, - "loss": 0.2704, + "epoch": 3.3540562943741667, + "grad_norm": 0.26392999291419983, + "learning_rate": 1.2929327031614204e-05, + "loss": 0.4006, "step": 93065 }, { - "epoch": 3.27, - "learning_rate": 1.4073153254509386e-05, - "loss": 0.2535, + "epoch": 3.354236494035391, + "grad_norm": 0.2420482635498047, + "learning_rate": 1.2926771658098535e-05, + "loss": 0.3974, "step": 93070 }, { - "epoch": 3.27, - "learning_rate": 1.4070591112542387e-05, - "loss": 0.2552, + "epoch": 3.3544166936966158, + "grad_norm": 0.2563861012458801, + "learning_rate": 1.2924216449078274e-05, + "loss": 0.3876, "step": 93075 }, { - "epoch": 3.27, - "learning_rate": 1.4068029112486547e-05, - "loss": 0.2795, + "epoch": 3.3545968933578405, + "grad_norm": 0.23027673363685608, + "learning_rate": 1.292166140458824e-05, + "loss": 0.3777, "step": 93080 }, { - "epoch": 3.27, - "learning_rate": 1.4065467254375159e-05, - "loss": 0.2735, + "epoch": 3.3547770930190652, + "grad_norm": 0.2890183925628662, + "learning_rate": 1.2919106524663208e-05, + "loss": 0.3627, "step": 93085 }, { - "epoch": 3.28, - "learning_rate": 1.406290553824147e-05, - "loss": 0.2424, + "epoch": 3.3549572926802895, + "grad_norm": 0.2011183500289917, + "learning_rate": 1.2916551809338018e-05, + "loss": 0.3793, "step": 93090 }, { - "epoch": 3.28, - "learning_rate": 1.4060343964118733e-05, - "loss": 0.273, + "epoch": 3.3551374923415143, + "grad_norm": 0.23442140221595764, + "learning_rate": 1.2913997258647476e-05, + "loss": 0.3532, "step": 93095 }, { - "epoch": 3.28, - "learning_rate": 1.405778253204022e-05, - "loss": 0.2706, + "epoch": 3.355317692002739, + "grad_norm": 0.18820983171463013, + "learning_rate": 1.2911442872626376e-05, + "loss": 0.3741, "step": 93100 }, { - "epoch": 3.28, - "learning_rate": 1.4055221242039202e-05, - "loss": 0.2269, + "epoch": 3.3554978916639637, + "grad_norm": 0.17393651604652405, + "learning_rate": 1.2908888651309528e-05, + "loss": 0.3554, "step": 93105 }, { - "epoch": 3.28, - "learning_rate": 1.4052660094148922e-05, - "loss": 0.248, + "epoch": 3.3556780913251885, + "grad_norm": 0.2209923267364502, + "learning_rate": 1.2906334594731722e-05, + "loss": 0.4142, "step": 93110 }, { - "epoch": 3.28, - "learning_rate": 1.405009908840264e-05, - "loss": 0.254, + "epoch": 3.355858290986413, + "grad_norm": 0.2804467976093292, + "learning_rate": 1.2903780702927776e-05, + "loss": 0.3901, "step": 93115 }, { - "epoch": 3.28, - "learning_rate": 1.404753822483359e-05, - "loss": 0.2408, + "epoch": 3.3560384906476375, + "grad_norm": 0.2660714089870453, + "learning_rate": 1.2901226975932474e-05, + "loss": 0.3976, "step": 93120 }, { - "epoch": 3.28, - "learning_rate": 1.4044977503475051e-05, - "loss": 0.266, + "epoch": 
3.356218690308862, + "grad_norm": 0.24791589379310608, + "learning_rate": 1.2898673413780616e-05, + "loss": 0.4206, "step": 93125 }, { - "epoch": 3.28, - "learning_rate": 1.4042416924360257e-05, - "loss": 0.2522, + "epoch": 3.356398889970087, + "grad_norm": 0.25095242261886597, + "learning_rate": 1.2896120016506985e-05, + "loss": 0.373, "step": 93130 }, { - "epoch": 3.28, - "learning_rate": 1.4039856487522463e-05, - "loss": 0.2745, + "epoch": 3.3565790896313117, + "grad_norm": 0.31840965151786804, + "learning_rate": 1.2893566784146377e-05, + "loss": 0.3655, "step": 93135 }, { - "epoch": 3.28, - "learning_rate": 1.4037296192994893e-05, - "loss": 0.2434, + "epoch": 3.356759289292536, + "grad_norm": 0.20555394887924194, + "learning_rate": 1.2891013716733569e-05, + "loss": 0.3744, "step": 93140 }, { - "epoch": 3.28, - "learning_rate": 1.403473604081082e-05, - "loss": 0.2449, + "epoch": 3.3569394889537607, + "grad_norm": 0.24305355548858643, + "learning_rate": 1.2888460814303363e-05, + "loss": 0.3667, "step": 93145 }, { - "epoch": 3.28, - "learning_rate": 1.4032176031003463e-05, - "loss": 0.2645, + "epoch": 3.3571196886149854, + "grad_norm": 0.22731393575668335, + "learning_rate": 1.2885908076890536e-05, + "loss": 0.3577, "step": 93150 }, { - "epoch": 3.28, - "learning_rate": 1.4029616163606085e-05, - "loss": 0.2864, + "epoch": 3.35729988827621, + "grad_norm": 0.22640764713287354, + "learning_rate": 1.2883355504529865e-05, + "loss": 0.3424, "step": 93155 }, { - "epoch": 3.28, - "learning_rate": 1.4027056438651898e-05, - "loss": 0.2473, + "epoch": 3.357480087937435, + "grad_norm": 0.21849682927131653, + "learning_rate": 1.2880803097256128e-05, + "loss": 0.3742, "step": 93160 }, { - "epoch": 3.28, - "learning_rate": 1.4024496856174162e-05, - "loss": 0.2521, + "epoch": 3.357660287598659, + "grad_norm": 0.20635724067687988, + "learning_rate": 1.2878250855104094e-05, + "loss": 0.4043, "step": 93165 }, { - "epoch": 3.28, - "learning_rate": 1.4021937416206101e-05, - "loss": 0.2574, + "epoch": 3.357840487259884, + "grad_norm": 0.17877210676670074, + "learning_rate": 1.2875698778108552e-05, + "loss": 0.3761, "step": 93170 }, { - "epoch": 3.28, - "learning_rate": 1.4019378118780951e-05, - "loss": 0.2324, + "epoch": 3.3580206869211087, + "grad_norm": 0.19038045406341553, + "learning_rate": 1.287314686630427e-05, + "loss": 0.3983, "step": 93175 }, { - "epoch": 3.28, - "learning_rate": 1.4016818963931927e-05, - "loss": 0.2426, + "epoch": 3.3582008865823334, + "grad_norm": 0.25942298769950867, + "learning_rate": 1.2870595119726015e-05, + "loss": 0.42, "step": 93180 }, { - "epoch": 3.28, - "learning_rate": 1.4014259951692279e-05, - "loss": 0.244, + "epoch": 3.3583810862435577, + "grad_norm": 0.2063370645046234, + "learning_rate": 1.2868043538408553e-05, + "loss": 0.4054, "step": 93185 }, { - "epoch": 3.28, - "learning_rate": 1.401170108209523e-05, - "loss": 0.2634, + "epoch": 3.3585612859047824, + "grad_norm": 0.22962912917137146, + "learning_rate": 1.2865492122386636e-05, + "loss": 0.3537, "step": 93190 }, { - "epoch": 3.28, - "learning_rate": 1.4009142355173996e-05, - "loss": 0.2655, + "epoch": 3.358741485566007, + "grad_norm": 0.21510834991931915, + "learning_rate": 1.2862940871695062e-05, + "loss": 0.3888, "step": 93195 }, { - "epoch": 3.28, - "learning_rate": 1.4006583770961798e-05, - "loss": 0.249, + "epoch": 3.358921685227232, + "grad_norm": 0.22031091153621674, + "learning_rate": 1.2860389786368559e-05, + "loss": 0.3752, "step": 93200 }, { - "epoch": 3.28, - "learning_rate": 1.4004025329491866e-05, - 
"loss": 0.2634, + "epoch": 3.3591018848884566, + "grad_norm": 0.2270020842552185, + "learning_rate": 1.2857838866441885e-05, + "loss": 0.3664, "step": 93205 }, { - "epoch": 3.28, - "learning_rate": 1.4001467030797428e-05, - "loss": 0.2556, + "epoch": 3.359282084549681, + "grad_norm": 0.22233691811561584, + "learning_rate": 1.2855288111949811e-05, + "loss": 0.3757, "step": 93210 }, { - "epoch": 3.28, - "learning_rate": 1.3998908874911679e-05, - "loss": 0.2586, + "epoch": 3.3594622842109056, + "grad_norm": 0.25379857420921326, + "learning_rate": 1.2852737522927089e-05, + "loss": 0.3823, "step": 93215 }, { - "epoch": 3.28, - "learning_rate": 1.399635086186786e-05, - "loss": 0.2485, + "epoch": 3.3596424838721304, + "grad_norm": 0.19441834092140198, + "learning_rate": 1.2850187099408467e-05, + "loss": 0.3814, "step": 93220 }, { - "epoch": 3.28, - "learning_rate": 1.3993792991699173e-05, - "loss": 0.2416, + "epoch": 3.359822683533355, + "grad_norm": 0.21130891144275665, + "learning_rate": 1.2847636841428695e-05, + "loss": 0.3634, "step": 93225 }, { - "epoch": 3.28, - "learning_rate": 1.3991235264438833e-05, - "loss": 0.2645, + "epoch": 3.3600028831945794, + "grad_norm": 0.22752270102500916, + "learning_rate": 1.2845086749022506e-05, + "loss": 0.3821, "step": 93230 }, { - "epoch": 3.28, - "learning_rate": 1.3988677680120036e-05, - "loss": 0.2557, + "epoch": 3.360183082855804, + "grad_norm": 0.2163986712694168, + "learning_rate": 1.2842536822224666e-05, + "loss": 0.3591, "step": 93235 }, { - "epoch": 3.28, - "learning_rate": 1.3986120238776012e-05, - "loss": 0.2617, + "epoch": 3.360363282517029, + "grad_norm": 0.2803928852081299, + "learning_rate": 1.2839987061069908e-05, + "loss": 0.356, "step": 93240 }, { - "epoch": 3.28, - "learning_rate": 1.398356294043996e-05, - "loss": 0.2646, + "epoch": 3.3605434821782536, + "grad_norm": 0.26921606063842773, + "learning_rate": 1.2837437465592972e-05, + "loss": 0.4204, "step": 93245 }, { - "epoch": 3.28, - "learning_rate": 1.3981005785145085e-05, - "loss": 0.2721, + "epoch": 3.3607236818394783, + "grad_norm": 0.2192206233739853, + "learning_rate": 1.2834888035828596e-05, + "loss": 0.3889, "step": 93250 }, { - "epoch": 3.28, - "learning_rate": 1.3978448772924574e-05, - "loss": 0.2445, + "epoch": 3.3609038815007026, + "grad_norm": 0.26289498805999756, + "learning_rate": 1.2832338771811508e-05, + "loss": 0.3843, "step": 93255 }, { - "epoch": 3.28, - "learning_rate": 1.3975891903811653e-05, - "loss": 0.2669, + "epoch": 3.3610840811619274, + "grad_norm": 0.2620001435279846, + "learning_rate": 1.2829789673576456e-05, + "loss": 0.3524, "step": 93260 }, { - "epoch": 3.28, - "learning_rate": 1.3973335177839503e-05, - "loss": 0.256, + "epoch": 3.361264280823152, + "grad_norm": 0.2221224457025528, + "learning_rate": 1.2827240741158175e-05, + "loss": 0.3894, "step": 93265 }, { - "epoch": 3.28, - "learning_rate": 1.3970778595041333e-05, - "loss": 0.2373, + "epoch": 3.361444480484377, + "grad_norm": 0.21585452556610107, + "learning_rate": 1.282469197459136e-05, + "loss": 0.3812, "step": 93270 }, { - "epoch": 3.28, - "learning_rate": 1.396822215545033e-05, - "loss": 0.2583, + "epoch": 3.361624680145601, + "grad_norm": 0.21924617886543274, + "learning_rate": 1.2822143373910771e-05, + "loss": 0.3903, "step": 93275 }, { - "epoch": 3.28, - "learning_rate": 1.3965665859099702e-05, - "loss": 0.2802, + "epoch": 3.361804879806826, + "grad_norm": 0.1916201114654541, + "learning_rate": 1.281959493915112e-05, + "loss": 0.386, "step": 93280 }, { - "epoch": 3.28, - "learning_rate": 
1.396310970602263e-05, - "loss": 0.2618, + "epoch": 3.3619850794680506, + "grad_norm": 0.27522167563438416, + "learning_rate": 1.2817046670347121e-05, + "loss": 0.3835, "step": 93285 }, { - "epoch": 3.28, - "learning_rate": 1.3960553696252304e-05, - "loss": 0.2519, + "epoch": 3.3621652791292753, + "grad_norm": 0.25006136298179626, + "learning_rate": 1.2814498567533523e-05, + "loss": 0.3698, "step": 93290 }, { - "epoch": 3.28, - "learning_rate": 1.3957997829821903e-05, - "loss": 0.2962, + "epoch": 3.3623454787905, + "grad_norm": 0.25098854303359985, + "learning_rate": 1.2811950630745e-05, + "loss": 0.3341, "step": 93295 }, { - "epoch": 3.28, - "learning_rate": 1.3955442106764632e-05, - "loss": 0.2457, + "epoch": 3.3625256784517243, + "grad_norm": 0.25163203477859497, + "learning_rate": 1.28094028600163e-05, + "loss": 0.4086, "step": 93300 }, { - "epoch": 3.28, - "learning_rate": 1.395288652711367e-05, - "loss": 0.2763, + "epoch": 3.362705878112949, + "grad_norm": 0.321817547082901, + "learning_rate": 1.2806855255382127e-05, + "loss": 0.3856, "step": 93305 }, { - "epoch": 3.28, - "learning_rate": 1.3950331090902186e-05, - "loss": 0.2768, + "epoch": 3.362886077774174, + "grad_norm": 0.22713923454284668, + "learning_rate": 1.2804307816877193e-05, + "loss": 0.3978, "step": 93310 }, { - "epoch": 3.28, - "learning_rate": 1.3947775798163377e-05, - "loss": 0.291, + "epoch": 3.3630662774353985, + "grad_norm": 0.24719440937042236, + "learning_rate": 1.2801760544536202e-05, + "loss": 0.3796, "step": 93315 }, { - "epoch": 3.28, - "learning_rate": 1.3945220648930408e-05, - "loss": 0.2642, + "epoch": 3.363246477096623, + "grad_norm": 0.24784217774868011, + "learning_rate": 1.2799213438393858e-05, + "loss": 0.3683, "step": 93320 }, { - "epoch": 3.28, - "learning_rate": 1.3942665643236475e-05, - "loss": 0.2497, + "epoch": 3.3634266767578476, + "grad_norm": 0.24816350638866425, + "learning_rate": 1.2796666498484865e-05, + "loss": 0.3849, "step": 93325 }, { - "epoch": 3.28, - "learning_rate": 1.3940110781114729e-05, - "loss": 0.2789, + "epoch": 3.3636068764190723, + "grad_norm": 0.2373788207769394, + "learning_rate": 1.2794119724843934e-05, + "loss": 0.388, "step": 93330 }, { - "epoch": 3.28, - "learning_rate": 1.3937556062598367e-05, - "loss": 0.2427, + "epoch": 3.363787076080297, + "grad_norm": 0.2272450029850006, + "learning_rate": 1.2791573117505761e-05, + "loss": 0.3771, "step": 93335 }, { - "epoch": 3.28, - "learning_rate": 1.3935001487720547e-05, - "loss": 0.262, + "epoch": 3.3639672757415218, + "grad_norm": 0.2307075560092926, + "learning_rate": 1.2789026676505039e-05, + "loss": 0.362, "step": 93340 }, { - "epoch": 3.28, - "learning_rate": 1.3932447056514442e-05, - "loss": 0.2798, + "epoch": 3.364147475402746, + "grad_norm": 0.22494341433048248, + "learning_rate": 1.2786480401876466e-05, + "loss": 0.3934, "step": 93345 }, { - "epoch": 3.28, - "learning_rate": 1.3929892769013208e-05, - "loss": 0.2792, + "epoch": 3.364327675063971, + "grad_norm": 0.2284454107284546, + "learning_rate": 1.2783934293654726e-05, + "loss": 0.4008, "step": 93350 }, { - "epoch": 3.28, - "learning_rate": 1.3927338625250033e-05, - "loss": 0.241, + "epoch": 3.3645078747251955, + "grad_norm": 0.2578500807285309, + "learning_rate": 1.2781388351874518e-05, + "loss": 0.3791, "step": 93355 }, { - "epoch": 3.28, - "learning_rate": 1.3924784625258069e-05, - "loss": 0.2577, + "epoch": 3.3646880743864203, + "grad_norm": 0.23964065313339233, + "learning_rate": 1.2778842576570543e-05, + "loss": 0.3465, "step": 93360 }, { - "epoch": 3.28, - 
"learning_rate": 1.3922230769070465e-05, - "loss": 0.2489, + "epoch": 3.3648682740476445, + "grad_norm": 0.23482295870780945, + "learning_rate": 1.277629696777745e-05, + "loss": 0.3947, "step": 93365 }, { - "epoch": 3.29, - "learning_rate": 1.3919677056720405e-05, - "loss": 0.2431, + "epoch": 3.3650484737088693, + "grad_norm": 0.27758488059043884, + "learning_rate": 1.277375152552996e-05, + "loss": 0.3551, "step": 93370 }, { - "epoch": 3.29, - "learning_rate": 1.3917123488241025e-05, - "loss": 0.2847, + "epoch": 3.365228673370094, + "grad_norm": 0.26252129673957825, + "learning_rate": 1.2771206249862722e-05, + "loss": 0.3843, "step": 93375 }, { - "epoch": 3.29, - "learning_rate": 1.3914570063665505e-05, - "loss": 0.2718, + "epoch": 3.3654088730313187, + "grad_norm": 0.23185403645038605, + "learning_rate": 1.2768661140810454e-05, + "loss": 0.4336, "step": 93380 }, { - "epoch": 3.29, - "learning_rate": 1.3912016783026984e-05, - "loss": 0.2637, + "epoch": 3.3655890726925435, + "grad_norm": 0.32735198736190796, + "learning_rate": 1.2766116198407798e-05, + "loss": 0.3843, "step": 93385 }, { - "epoch": 3.29, - "learning_rate": 1.3909463646358611e-05, - "loss": 0.2626, + "epoch": 3.365769272353768, + "grad_norm": 0.2612536549568176, + "learning_rate": 1.2763571422689432e-05, + "loss": 0.3721, "step": 93390 }, { - "epoch": 3.29, - "learning_rate": 1.390691065369355e-05, - "loss": 0.2238, + "epoch": 3.3659494720149925, + "grad_norm": 0.22687096893787384, + "learning_rate": 1.2761026813690047e-05, + "loss": 0.3867, "step": 93395 }, { - "epoch": 3.29, - "learning_rate": 1.390435780506495e-05, - "loss": 0.2608, + "epoch": 3.3661296716762172, + "grad_norm": 0.2720807194709778, + "learning_rate": 1.2758482371444299e-05, + "loss": 0.3975, "step": 93400 }, { - "epoch": 3.29, - "learning_rate": 1.3901805100505943e-05, - "loss": 0.2597, + "epoch": 3.366309871337442, + "grad_norm": 0.2100588083267212, + "learning_rate": 1.2755938095986861e-05, + "loss": 0.3865, "step": 93405 }, { - "epoch": 3.29, - "learning_rate": 1.3899252540049679e-05, - "loss": 0.2819, + "epoch": 3.3664900709986667, + "grad_norm": 0.23490869998931885, + "learning_rate": 1.2753393987352396e-05, + "loss": 0.4074, "step": 93410 }, { - "epoch": 3.29, - "learning_rate": 1.3896700123729315e-05, - "loss": 0.2807, + "epoch": 3.366670270659891, + "grad_norm": 0.25652775168418884, + "learning_rate": 1.2750850045575558e-05, + "loss": 0.3773, "step": 93415 }, { - "epoch": 3.29, - "learning_rate": 1.3894147851577979e-05, - "loss": 0.2479, + "epoch": 3.3668504703211157, + "grad_norm": 0.25951749086380005, + "learning_rate": 1.2748306270691024e-05, + "loss": 0.3564, "step": 93420 }, { - "epoch": 3.29, - "learning_rate": 1.389159572362881e-05, - "loss": 0.2543, + "epoch": 3.3670306699823405, + "grad_norm": 0.2929254174232483, + "learning_rate": 1.2745762662733445e-05, + "loss": 0.3865, "step": 93425 }, { - "epoch": 3.29, - "learning_rate": 1.3889043739914943e-05, - "loss": 0.2714, + "epoch": 3.367210869643565, + "grad_norm": 0.24950219690799713, + "learning_rate": 1.274321922173748e-05, + "loss": 0.3821, "step": 93430 }, { - "epoch": 3.29, - "learning_rate": 1.3886491900469534e-05, - "loss": 0.258, + "epoch": 3.36739106930479, + "grad_norm": 0.2351795881986618, + "learning_rate": 1.2740675947737777e-05, + "loss": 0.4021, "step": 93435 }, { - "epoch": 3.29, - "learning_rate": 1.3883940205325701e-05, - "loss": 0.2697, + "epoch": 3.367571268966014, + "grad_norm": 0.267691969871521, + "learning_rate": 1.273813284076898e-05, + "loss": 0.3983, "step": 93440 
}, { - "epoch": 3.29, - "learning_rate": 1.3881388654516573e-05, - "loss": 0.2575, + "epoch": 3.367751468627239, + "grad_norm": 0.18753398954868317, + "learning_rate": 1.2735589900865762e-05, + "loss": 0.3566, "step": 93445 }, { - "epoch": 3.29, - "learning_rate": 1.3878837248075294e-05, - "loss": 0.2603, + "epoch": 3.3679316682884637, + "grad_norm": 0.243241548538208, + "learning_rate": 1.2733047128062756e-05, + "loss": 0.3818, "step": 93450 }, { - "epoch": 3.29, - "learning_rate": 1.3876285986034981e-05, - "loss": 0.2569, + "epoch": 3.3681118679496884, + "grad_norm": 0.28631412982940674, + "learning_rate": 1.273050452239461e-05, + "loss": 0.3818, "step": 93455 }, { - "epoch": 3.29, - "learning_rate": 1.3873734868428768e-05, - "loss": 0.232, + "epoch": 3.3682920676109127, + "grad_norm": 0.22822198271751404, + "learning_rate": 1.272796208389596e-05, + "loss": 0.3824, "step": 93460 }, { - "epoch": 3.29, - "learning_rate": 1.3871183895289763e-05, - "loss": 0.2485, + "epoch": 3.3684722672721374, + "grad_norm": 0.262082040309906, + "learning_rate": 1.2725419812601453e-05, + "loss": 0.377, "step": 93465 }, { - "epoch": 3.29, - "learning_rate": 1.3868633066651113e-05, - "loss": 0.2465, + "epoch": 3.368652466933362, + "grad_norm": 0.21219515800476074, + "learning_rate": 1.2722877708545717e-05, + "loss": 0.3848, "step": 93470 }, { - "epoch": 3.29, - "learning_rate": 1.3866082382545925e-05, - "loss": 0.2844, + "epoch": 3.368832666594587, + "grad_norm": 0.2659531533718109, + "learning_rate": 1.272033577176341e-05, + "loss": 0.3382, "step": 93475 }, { - "epoch": 3.29, - "learning_rate": 1.386353184300731e-05, - "loss": 0.2281, + "epoch": 3.3690128662558116, + "grad_norm": 0.2504463791847229, + "learning_rate": 1.2717794002289134e-05, + "loss": 0.3823, "step": 93480 }, { - "epoch": 3.29, - "learning_rate": 1.3860981448068394e-05, - "loss": 0.2563, + "epoch": 3.369193065917036, + "grad_norm": 0.27462002635002136, + "learning_rate": 1.2715252400157548e-05, + "loss": 0.4201, "step": 93485 }, { - "epoch": 3.29, - "learning_rate": 1.3858431197762306e-05, - "loss": 0.2635, + "epoch": 3.3693732655782607, + "grad_norm": 0.2492084950208664, + "learning_rate": 1.2712710965403269e-05, + "loss": 0.3777, "step": 93490 }, { - "epoch": 3.29, - "learning_rate": 1.3855881092122142e-05, - "loss": 0.2785, + "epoch": 3.3695534652394854, + "grad_norm": 0.2124946117401123, + "learning_rate": 1.2710169698060922e-05, + "loss": 0.3578, "step": 93495 }, { - "epoch": 3.29, - "learning_rate": 1.3853331131181022e-05, - "loss": 0.2459, + "epoch": 3.36973366490071, + "grad_norm": 0.24669291079044342, + "learning_rate": 1.2707628598165137e-05, + "loss": 0.4265, "step": 93500 }, { - "epoch": 3.29, - "eval_loss": 0.2541661560535431, - "eval_runtime": 10.5324, - "eval_samples_per_second": 9.495, - "eval_steps_per_second": 9.495, + "epoch": 3.36973366490071, + "eval_loss": 0.4303239583969116, + "eval_runtime": 3.5298, + "eval_samples_per_second": 28.331, + "eval_steps_per_second": 7.083, "step": 93500 }, { - "epoch": 3.29, - "learning_rate": 1.3850781314972038e-05, - "loss": 0.251, + "epoch": 3.3699138645619344, + "grad_norm": 0.22059577703475952, + "learning_rate": 1.2705087665750531e-05, + "loss": 0.3647, "step": 93505 }, { - "epoch": 3.29, - "learning_rate": 1.3848231643528325e-05, - "loss": 0.2656, + "epoch": 3.370094064223159, + "grad_norm": 0.20390857756137848, + "learning_rate": 1.2702546900851715e-05, + "loss": 0.3714, "step": 93510 }, { - "epoch": 3.29, - "learning_rate": 1.3845682116882972e-05, - "loss": 0.2656, + "epoch": 
3.370274263884384, + "grad_norm": 0.22258242964744568, + "learning_rate": 1.2700006303503325e-05, + "loss": 0.3766, "step": 93515 }, { - "epoch": 3.29, - "learning_rate": 1.3843132735069087e-05, - "loss": 0.2524, + "epoch": 3.3704544635456086, + "grad_norm": 0.26116088032722473, + "learning_rate": 1.2697465873739966e-05, + "loss": 0.403, "step": 93520 }, { - "epoch": 3.29, - "learning_rate": 1.3840583498119759e-05, - "loss": 0.2857, + "epoch": 3.3706346632068334, + "grad_norm": 0.2494215965270996, + "learning_rate": 1.269492561159626e-05, + "loss": 0.3681, "step": 93525 }, { - "epoch": 3.29, - "learning_rate": 1.383803440606811e-05, - "loss": 0.2584, + "epoch": 3.3708148628680576, + "grad_norm": 0.24651680886745453, + "learning_rate": 1.2692385517106802e-05, + "loss": 0.3774, "step": 93530 }, { - "epoch": 3.29, - "learning_rate": 1.3835485458947217e-05, - "loss": 0.2466, + "epoch": 3.3709950625292824, + "grad_norm": 0.22229456901550293, + "learning_rate": 1.2689845590306204e-05, + "loss": 0.3997, "step": 93535 }, { - "epoch": 3.29, - "learning_rate": 1.3832936656790202e-05, - "loss": 0.2683, + "epoch": 3.371175262190507, + "grad_norm": 0.2545780539512634, + "learning_rate": 1.2687305831229084e-05, + "loss": 0.3661, "step": 93540 }, { - "epoch": 3.29, - "learning_rate": 1.3830387999630132e-05, - "loss": 0.2416, + "epoch": 3.371355461851732, + "grad_norm": 0.19883494079113007, + "learning_rate": 1.268476623991005e-05, + "loss": 0.3843, "step": 93545 }, { - "epoch": 3.29, - "learning_rate": 1.3827839487500121e-05, - "loss": 0.2635, + "epoch": 3.371535661512956, + "grad_norm": 0.19383443892002106, + "learning_rate": 1.2682226816383668e-05, + "loss": 0.3768, "step": 93550 }, { - "epoch": 3.29, - "learning_rate": 1.3825291120433257e-05, - "loss": 0.2477, + "epoch": 3.371715861174181, + "grad_norm": 0.2644541263580322, + "learning_rate": 1.2679687560684578e-05, + "loss": 0.3745, "step": 93555 }, { - "epoch": 3.29, - "learning_rate": 1.382274289846261e-05, - "loss": 0.2627, + "epoch": 3.3718960608354056, + "grad_norm": 0.24296367168426514, + "learning_rate": 1.2677148472847344e-05, + "loss": 0.4342, "step": 93560 }, { - "epoch": 3.29, - "learning_rate": 1.382019482162129e-05, - "loss": 0.2665, + "epoch": 3.3720762604966303, + "grad_norm": 0.23606744408607483, + "learning_rate": 1.2674609552906586e-05, + "loss": 0.3628, "step": 93565 }, { - "epoch": 3.29, - "learning_rate": 1.3817646889942373e-05, - "loss": 0.2546, + "epoch": 3.372256460157855, + "grad_norm": 0.20120225846767426, + "learning_rate": 1.2672070800896899e-05, + "loss": 0.3939, "step": 93570 }, { - "epoch": 3.29, - "learning_rate": 1.3815099103458942e-05, - "loss": 0.2686, + "epoch": 3.3724366598190794, + "grad_norm": 0.20473100244998932, + "learning_rate": 1.2669532216852842e-05, + "loss": 0.3502, "step": 93575 }, { - "epoch": 3.29, - "learning_rate": 1.3812551462204071e-05, - "loss": 0.2588, + "epoch": 3.372616859480304, + "grad_norm": 0.2535666823387146, + "learning_rate": 1.2666993800809026e-05, + "loss": 0.3759, "step": 93580 }, { - "epoch": 3.29, - "learning_rate": 1.3810003966210849e-05, - "loss": 0.2995, + "epoch": 3.372797059141529, + "grad_norm": 0.2333008199930191, + "learning_rate": 1.2664455552800024e-05, + "loss": 0.3961, "step": 93585 }, { - "epoch": 3.29, - "learning_rate": 1.3807456615512348e-05, - "loss": 0.2539, + "epoch": 3.3729772588027536, + "grad_norm": 0.1824401170015335, + "learning_rate": 1.2661917472860449e-05, + "loss": 0.3831, "step": 93590 }, { - "epoch": 3.29, - "learning_rate": 1.3804909410141652e-05, - 
"loss": 0.2459, + "epoch": 3.373157458463978, + "grad_norm": 0.26639214158058167, + "learning_rate": 1.2659379561024848e-05, + "loss": 0.3991, "step": 93595 }, { - "epoch": 3.29, - "learning_rate": 1.3802362350131826e-05, - "loss": 0.2339, + "epoch": 3.3733376581252026, + "grad_norm": 0.2088332623243332, + "learning_rate": 1.2656841817327803e-05, + "loss": 0.3711, "step": 93600 }, { - "epoch": 3.29, - "learning_rate": 1.379981543551595e-05, - "loss": 0.2507, + "epoch": 3.3735178577864273, + "grad_norm": 0.24687723815441132, + "learning_rate": 1.2654304241803904e-05, + "loss": 0.4086, "step": 93605 }, { - "epoch": 3.29, - "learning_rate": 1.379726866632709e-05, - "loss": 0.2696, + "epoch": 3.373698057447652, + "grad_norm": 0.2668485641479492, + "learning_rate": 1.2651766834487724e-05, + "loss": 0.3932, "step": 93610 }, { - "epoch": 3.29, - "learning_rate": 1.3794722042598312e-05, - "loss": 0.248, + "epoch": 3.373878257108877, + "grad_norm": 0.26824715733528137, + "learning_rate": 1.2649229595413831e-05, + "loss": 0.4194, "step": 93615 }, { - "epoch": 3.29, - "learning_rate": 1.3792175564362676e-05, - "loss": 0.2534, + "epoch": 3.3740584567701015, + "grad_norm": 0.27278998494148254, + "learning_rate": 1.2646692524616788e-05, + "loss": 0.3771, "step": 93620 }, { - "epoch": 3.29, - "learning_rate": 1.3789629231653264e-05, - "loss": 0.2547, + "epoch": 3.374238656431326, + "grad_norm": 0.2048804759979248, + "learning_rate": 1.2644155622131163e-05, + "loss": 0.3519, "step": 93625 }, { - "epoch": 3.29, - "learning_rate": 1.378708304450313e-05, - "loss": 0.2439, + "epoch": 3.3744188560925505, + "grad_norm": 0.2662324011325836, + "learning_rate": 1.2641618887991532e-05, + "loss": 0.3949, "step": 93630 }, { - "epoch": 3.29, - "learning_rate": 1.378453700294533e-05, - "loss": 0.2218, + "epoch": 3.3745990557537753, + "grad_norm": 0.24462181329727173, + "learning_rate": 1.2639082322232455e-05, + "loss": 0.3855, "step": 93635 }, { - "epoch": 3.29, - "learning_rate": 1.3781991107012914e-05, - "loss": 0.2338, + "epoch": 3.374779255415, + "grad_norm": 0.2650309205055237, + "learning_rate": 1.2636545924888485e-05, + "loss": 0.3813, "step": 93640 }, { - "epoch": 3.29, - "learning_rate": 1.3779445356738956e-05, - "loss": 0.2689, + "epoch": 3.3749594550762243, + "grad_norm": 0.26425033807754517, + "learning_rate": 1.2634009695994182e-05, + "loss": 0.3943, "step": 93645 }, { - "epoch": 3.29, - "learning_rate": 1.3776899752156513e-05, - "loss": 0.2705, + "epoch": 3.375139654737449, + "grad_norm": 0.21526920795440674, + "learning_rate": 1.2631473635584107e-05, + "loss": 0.389, "step": 93650 }, { - "epoch": 3.3, - "learning_rate": 1.377435429329863e-05, - "loss": 0.2428, + "epoch": 3.3753198543986738, + "grad_norm": 0.2526695430278778, + "learning_rate": 1.2628937743692795e-05, + "loss": 0.383, "step": 93655 }, { - "epoch": 3.3, - "learning_rate": 1.3771808980198347e-05, - "loss": 0.2571, + "epoch": 3.3755000540598985, + "grad_norm": 0.26634764671325684, + "learning_rate": 1.2626402020354832e-05, + "loss": 0.3667, "step": 93660 }, { - "epoch": 3.3, - "learning_rate": 1.3769263812888738e-05, - "loss": 0.2609, + "epoch": 3.3756802537211232, + "grad_norm": 0.21848170459270477, + "learning_rate": 1.2623866465604727e-05, + "loss": 0.3813, "step": 93665 }, { - "epoch": 3.3, - "learning_rate": 1.3766718791402836e-05, - "loss": 0.2526, + "epoch": 3.3758604533823475, + "grad_norm": 0.25666651129722595, + "learning_rate": 1.2621331079477056e-05, + "loss": 0.3716, "step": 93670 }, { - "epoch": 3.3, - "learning_rate": 
1.3764173915773676e-05, - "loss": 0.2505, + "epoch": 3.3760406530435723, + "grad_norm": 0.2850119471549988, + "learning_rate": 1.261879586200635e-05, + "loss": 0.3825, "step": 93675 }, { - "epoch": 3.3, - "learning_rate": 1.3761629186034325e-05, - "loss": 0.2853, + "epoch": 3.376220852704797, + "grad_norm": 0.22654448449611664, + "learning_rate": 1.2616260813227155e-05, + "loss": 0.3777, "step": 93680 }, { - "epoch": 3.3, - "learning_rate": 1.3759084602217814e-05, - "loss": 0.2664, + "epoch": 3.3764010523660217, + "grad_norm": 0.2590138912200928, + "learning_rate": 1.2613725933174009e-05, + "loss": 0.3641, "step": 93685 }, { - "epoch": 3.3, - "learning_rate": 1.375654016435718e-05, - "loss": 0.2423, + "epoch": 3.376581252027246, + "grad_norm": 0.20931139588356018, + "learning_rate": 1.2611191221881449e-05, + "loss": 0.3663, "step": 93690 }, { - "epoch": 3.3, - "learning_rate": 1.3753995872485448e-05, - "loss": 0.2628, + "epoch": 3.3767614516884707, + "grad_norm": 0.2435794174671173, + "learning_rate": 1.2608656679384002e-05, + "loss": 0.3863, "step": 93695 }, { - "epoch": 3.3, - "learning_rate": 1.3751451726635684e-05, - "loss": 0.2537, + "epoch": 3.3769416513496955, + "grad_norm": 0.26772207021713257, + "learning_rate": 1.2606122305716215e-05, + "loss": 0.3818, "step": 93700 }, { - "epoch": 3.3, - "learning_rate": 1.3748907726840892e-05, - "loss": 0.2458, + "epoch": 3.37712185101092, + "grad_norm": 0.21198633313179016, + "learning_rate": 1.2603588100912611e-05, + "loss": 0.3792, "step": 93705 }, { - "epoch": 3.3, - "learning_rate": 1.3746363873134132e-05, - "loss": 0.2434, + "epoch": 3.377302050672145, + "grad_norm": 0.18529768288135529, + "learning_rate": 1.2601054065007722e-05, + "loss": 0.3644, "step": 93710 }, { - "epoch": 3.3, - "learning_rate": 1.374382016554841e-05, - "loss": 0.2754, + "epoch": 3.3774822503333692, + "grad_norm": 0.24963034689426422, + "learning_rate": 1.2598520198036074e-05, + "loss": 0.3925, "step": 93715 }, { - "epoch": 3.3, - "learning_rate": 1.374127660411677e-05, - "loss": 0.273, + "epoch": 3.377662449994594, + "grad_norm": 0.27459418773651123, + "learning_rate": 1.259598650003217e-05, + "loss": 0.3957, "step": 93720 }, { - "epoch": 3.3, - "learning_rate": 1.3738733188872238e-05, - "loss": 0.2431, + "epoch": 3.3778426496558187, + "grad_norm": 0.23690108954906464, + "learning_rate": 1.2593452971030564e-05, + "loss": 0.3768, "step": 93725 }, { - "epoch": 3.3, - "learning_rate": 1.3736189919847835e-05, - "loss": 0.2741, + "epoch": 3.3780228493170434, + "grad_norm": 0.25995904207229614, + "learning_rate": 1.2590919611065757e-05, + "loss": 0.3882, "step": 93730 }, { - "epoch": 3.3, - "learning_rate": 1.3733646797076571e-05, - "loss": 0.2543, + "epoch": 3.3782030489782677, + "grad_norm": 0.2160549908876419, + "learning_rate": 1.2588386420172269e-05, + "loss": 0.3617, "step": 93735 }, { - "epoch": 3.3, - "learning_rate": 1.373110382059149e-05, - "loss": 0.2623, + "epoch": 3.3783832486394925, + "grad_norm": 0.20487286150455475, + "learning_rate": 1.2585853398384612e-05, + "loss": 0.3748, "step": 93740 }, { - "epoch": 3.3, - "learning_rate": 1.3728560990425598e-05, - "loss": 0.2502, + "epoch": 3.378563448300717, + "grad_norm": 0.23326444625854492, + "learning_rate": 1.258332054573729e-05, + "loss": 0.3864, "step": 93745 }, { - "epoch": 3.3, - "learning_rate": 1.3726018306611905e-05, - "loss": 0.2691, + "epoch": 3.378743647961942, + "grad_norm": 0.23716379702091217, + "learning_rate": 1.2580787862264832e-05, + "loss": 0.3846, "step": 93750 }, { - "epoch": 3.3, - 
"learning_rate": 1.3723475769183442e-05, - "loss": 0.2653, + "epoch": 3.3789238476231667, + "grad_norm": 0.23601892590522766, + "learning_rate": 1.2578255348001741e-05, + "loss": 0.371, "step": 93755 }, { - "epoch": 3.3, - "learning_rate": 1.3720933378173207e-05, - "loss": 0.275, + "epoch": 3.379104047284391, + "grad_norm": 0.2128582000732422, + "learning_rate": 1.2575723002982498e-05, + "loss": 0.3677, "step": 93760 }, { - "epoch": 3.3, - "learning_rate": 1.3718391133614227e-05, - "loss": 0.2911, + "epoch": 3.3792842469456157, + "grad_norm": 0.24130862951278687, + "learning_rate": 1.2573190827241637e-05, + "loss": 0.3728, "step": 93765 }, { - "epoch": 3.3, - "learning_rate": 1.3715849035539503e-05, - "loss": 0.252, + "epoch": 3.3794644466068404, + "grad_norm": 0.24159899353981018, + "learning_rate": 1.2570658820813633e-05, + "loss": 0.3939, "step": 93770 }, { - "epoch": 3.3, - "learning_rate": 1.3713307083982035e-05, - "loss": 0.2553, + "epoch": 3.379644646268065, + "grad_norm": 0.2206299901008606, + "learning_rate": 1.2568126983733015e-05, + "loss": 0.3902, "step": 93775 }, { - "epoch": 3.3, - "learning_rate": 1.3710765278974847e-05, - "loss": 0.2501, + "epoch": 3.3798248459292894, + "grad_norm": 0.20361188054084778, + "learning_rate": 1.256559531603425e-05, + "loss": 0.3666, "step": 93780 }, { - "epoch": 3.3, - "learning_rate": 1.3708223620550928e-05, - "loss": 0.2694, + "epoch": 3.380005045590514, + "grad_norm": 0.22459423542022705, + "learning_rate": 1.2563063817751834e-05, + "loss": 0.3898, "step": 93785 }, { - "epoch": 3.3, - "learning_rate": 1.3705682108743276e-05, - "loss": 0.26, + "epoch": 3.380185245251739, + "grad_norm": 0.3262191414833069, + "learning_rate": 1.2560532488920274e-05, + "loss": 0.3945, "step": 93790 }, { - "epoch": 3.3, - "learning_rate": 1.3703140743584909e-05, - "loss": 0.2596, + "epoch": 3.3803654449129636, + "grad_norm": 0.23222355544567108, + "learning_rate": 1.2558001329574049e-05, + "loss": 0.3799, "step": 93795 }, { - "epoch": 3.3, - "learning_rate": 1.3700599525108812e-05, - "loss": 0.2495, + "epoch": 3.3805456445741884, + "grad_norm": 0.24489018321037292, + "learning_rate": 1.255547033974765e-05, + "loss": 0.4148, "step": 93800 }, { - "epoch": 3.3, - "learning_rate": 1.3698058453347975e-05, - "loss": 0.2526, + "epoch": 3.3807258442354127, + "grad_norm": 0.246272012591362, + "learning_rate": 1.2552939519475554e-05, + "loss": 0.3642, "step": 93805 }, { - "epoch": 3.3, - "learning_rate": 1.3695517528335411e-05, - "loss": 0.2751, + "epoch": 3.3809060438966374, + "grad_norm": 0.2430204451084137, + "learning_rate": 1.255040886879224e-05, + "loss": 0.3893, "step": 93810 }, { - "epoch": 3.3, - "learning_rate": 1.3692976750104092e-05, - "loss": 0.2642, + "epoch": 3.381086243557862, + "grad_norm": 0.28567835688591003, + "learning_rate": 1.2547878387732203e-05, + "loss": 0.4011, "step": 93815 }, { - "epoch": 3.3, - "learning_rate": 1.3690436118687031e-05, - "loss": 0.2701, + "epoch": 3.381266443219087, + "grad_norm": 0.28451603651046753, + "learning_rate": 1.2545348076329916e-05, + "loss": 0.3964, "step": 93820 }, { - "epoch": 3.3, - "learning_rate": 1.3687895634117199e-05, - "loss": 0.2818, + "epoch": 3.381446642880311, + "grad_norm": 0.20606903731822968, + "learning_rate": 1.254281793461985e-05, + "loss": 0.3684, "step": 93825 }, { - "epoch": 3.3, - "learning_rate": 1.3685355296427582e-05, - "loss": 0.2594, + "epoch": 3.381626842541536, + "grad_norm": 0.2716697156429291, + "learning_rate": 1.2540287962636473e-05, + "loss": 0.3818, "step": 93830 }, { - "epoch": 
3.3, - "learning_rate": 1.3682815105651175e-05, - "loss": 0.2336, + "epoch": 3.3818070422027606, + "grad_norm": 0.2406664490699768, + "learning_rate": 1.2537758160414265e-05, + "loss": 0.3719, "step": 93835 }, { - "epoch": 3.3, - "learning_rate": 1.3680275061820957e-05, - "loss": 0.2715, + "epoch": 3.3819872418639854, + "grad_norm": 0.23287048935890198, + "learning_rate": 1.2535228527987674e-05, + "loss": 0.373, "step": 93840 }, { - "epoch": 3.3, - "learning_rate": 1.3677735164969909e-05, - "loss": 0.264, + "epoch": 3.38216744152521, + "grad_norm": 0.20795047283172607, + "learning_rate": 1.2532699065391206e-05, + "loss": 0.3667, "step": 93845 }, { - "epoch": 3.3, - "learning_rate": 1.3675195415130999e-05, - "loss": 0.2618, + "epoch": 3.3823476411864344, + "grad_norm": 0.23873348534107208, + "learning_rate": 1.2530169772659278e-05, + "loss": 0.3937, "step": 93850 }, { - "epoch": 3.3, - "learning_rate": 1.3672655812337221e-05, - "loss": 0.2503, + "epoch": 3.382527840847659, + "grad_norm": 0.3395892083644867, + "learning_rate": 1.2527640649826384e-05, + "loss": 0.408, "step": 93855 }, { - "epoch": 3.3, - "learning_rate": 1.3670116356621544e-05, - "loss": 0.2506, + "epoch": 3.382708040508884, + "grad_norm": 0.2514219582080841, + "learning_rate": 1.2525111696926967e-05, + "loss": 0.401, "step": 93860 }, { - "epoch": 3.3, - "learning_rate": 1.3667577048016927e-05, - "loss": 0.2564, + "epoch": 3.3828882401701086, + "grad_norm": 0.23013897240161896, + "learning_rate": 1.2522582913995484e-05, + "loss": 0.3651, "step": 93865 }, { - "epoch": 3.3, - "learning_rate": 1.3665037886556353e-05, - "loss": 0.2567, + "epoch": 3.383068439831333, + "grad_norm": 0.2501782476902008, + "learning_rate": 1.2520054301066409e-05, + "loss": 0.3569, "step": 93870 }, { - "epoch": 3.3, - "learning_rate": 1.3662498872272806e-05, - "loss": 0.244, + "epoch": 3.3832486394925576, + "grad_norm": 0.2740953266620636, + "learning_rate": 1.2517525858174166e-05, + "loss": 0.3837, "step": 93875 }, { - "epoch": 3.3, - "learning_rate": 1.3659960005199238e-05, - "loss": 0.2331, + "epoch": 3.3834288391537823, + "grad_norm": 0.2235153615474701, + "learning_rate": 1.251499758535321e-05, + "loss": 0.344, "step": 93880 }, { - "epoch": 3.3, - "learning_rate": 1.3657421285368615e-05, - "loss": 0.2539, + "epoch": 3.383609038815007, + "grad_norm": 0.21870172023773193, + "learning_rate": 1.2512469482638006e-05, + "loss": 0.3887, "step": 93885 }, { - "epoch": 3.3, - "learning_rate": 1.3654882712813893e-05, - "loss": 0.2625, + "epoch": 3.383789238476232, + "grad_norm": 0.21588364243507385, + "learning_rate": 1.2509941550062987e-05, + "loss": 0.3651, "step": 93890 }, { - "epoch": 3.3, - "learning_rate": 1.365234428756805e-05, - "loss": 0.2449, + "epoch": 3.3839694381374565, + "grad_norm": 0.24432948231697083, + "learning_rate": 1.2507413787662592e-05, + "loss": 0.3971, "step": 93895 }, { - "epoch": 3.3, - "learning_rate": 1.364980600966404e-05, - "loss": 0.2693, + "epoch": 3.384149637798681, + "grad_norm": 0.23872052133083344, + "learning_rate": 1.2504886195471272e-05, + "loss": 0.3919, "step": 93900 }, { - "epoch": 3.3, - "learning_rate": 1.3647267879134807e-05, - "loss": 0.2634, + "epoch": 3.3843298374599056, + "grad_norm": 0.25494420528411865, + "learning_rate": 1.2502358773523443e-05, + "loss": 0.3789, "step": 93905 }, { - "epoch": 3.3, - "learning_rate": 1.3644729896013331e-05, - "loss": 0.2605, + "epoch": 3.3845100371211303, + "grad_norm": 0.2569286525249481, + "learning_rate": 1.2499831521853567e-05, + "loss": 0.3544, "step": 93910 }, { - 
"epoch": 3.3, - "learning_rate": 1.3642192060332554e-05, - "loss": 0.2513, + "epoch": 3.384690236782355, + "grad_norm": 0.2678029537200928, + "learning_rate": 1.2497304440496068e-05, + "loss": 0.4016, "step": 93915 }, { - "epoch": 3.3, - "learning_rate": 1.3639654372125416e-05, - "loss": 0.2845, + "epoch": 3.3848704364435793, + "grad_norm": 0.21244113147258759, + "learning_rate": 1.2494777529485374e-05, + "loss": 0.3636, "step": 93920 }, { - "epoch": 3.3, - "learning_rate": 1.3637116831424879e-05, - "loss": 0.2483, + "epoch": 3.385050636104804, + "grad_norm": 0.21646173298358917, + "learning_rate": 1.2492250788855916e-05, + "loss": 0.3687, "step": 93925 }, { - "epoch": 3.3, - "learning_rate": 1.3634579438263902e-05, - "loss": 0.2557, + "epoch": 3.385230835766029, + "grad_norm": 0.214036226272583, + "learning_rate": 1.248972421864211e-05, + "loss": 0.3638, "step": 93930 }, { - "epoch": 3.3, - "learning_rate": 1.3632042192675424e-05, - "loss": 0.2434, + "epoch": 3.3854110354272535, + "grad_norm": 0.19108587503433228, + "learning_rate": 1.2487197818878399e-05, + "loss": 0.3568, "step": 93935 }, { - "epoch": 3.31, - "learning_rate": 1.3629505094692382e-05, - "loss": 0.2606, + "epoch": 3.3855912350884783, + "grad_norm": 0.2635771632194519, + "learning_rate": 1.2484671589599204e-05, + "loss": 0.3701, "step": 93940 }, { - "epoch": 3.31, - "learning_rate": 1.3626968144347715e-05, - "loss": 0.2532, + "epoch": 3.3857714347497025, + "grad_norm": 0.27943238615989685, + "learning_rate": 1.2482145530838918e-05, + "loss": 0.3563, "step": 93945 }, { - "epoch": 3.31, - "learning_rate": 1.3624431341674383e-05, - "loss": 0.2282, + "epoch": 3.3859516344109273, + "grad_norm": 0.2727324366569519, + "learning_rate": 1.2479619642631985e-05, + "loss": 0.3783, "step": 93950 }, { - "epoch": 3.31, - "learning_rate": 1.362189468670531e-05, - "loss": 0.2614, + "epoch": 3.386131834072152, + "grad_norm": 0.28211259841918945, + "learning_rate": 1.2477093925012808e-05, + "loss": 0.4027, "step": 93955 }, { - "epoch": 3.31, - "learning_rate": 1.3619358179473438e-05, - "loss": 0.2579, + "epoch": 3.3863120337333767, + "grad_norm": 0.24526701867580414, + "learning_rate": 1.2474568378015802e-05, + "loss": 0.3788, "step": 93960 }, { - "epoch": 3.31, - "learning_rate": 1.3616821820011689e-05, - "loss": 0.2629, + "epoch": 3.386492233394601, + "grad_norm": 0.2575979232788086, + "learning_rate": 1.247204300167538e-05, + "loss": 0.358, "step": 93965 }, { - "epoch": 3.31, - "learning_rate": 1.3614285608353017e-05, - "loss": 0.2686, + "epoch": 3.3866724330558258, + "grad_norm": 0.261115700006485, + "learning_rate": 1.2469517796025934e-05, + "loss": 0.3946, "step": 93970 }, { - "epoch": 3.31, - "learning_rate": 1.3611749544530334e-05, - "loss": 0.2524, + "epoch": 3.3868526327170505, + "grad_norm": 0.2626388370990753, + "learning_rate": 1.2466992761101893e-05, + "loss": 0.3957, "step": 93975 }, { - "epoch": 3.31, - "learning_rate": 1.360921362857659e-05, - "loss": 0.2663, + "epoch": 3.3870328323782752, + "grad_norm": 0.2778167724609375, + "learning_rate": 1.2464467896937649e-05, + "loss": 0.3642, "step": 93980 }, { - "epoch": 3.31, - "learning_rate": 1.3606677860524686e-05, - "loss": 0.2573, + "epoch": 3.3872130320395, + "grad_norm": 0.2699746787548065, + "learning_rate": 1.2461943203567602e-05, + "loss": 0.3913, "step": 93985 }, { - "epoch": 3.31, - "learning_rate": 1.3604142240407575e-05, - "loss": 0.2445, + "epoch": 3.3873932317007243, + "grad_norm": 0.2563049793243408, + "learning_rate": 1.2459418681026152e-05, + "loss": 0.3911, 
"step": 93990 }, { - "epoch": 3.31, - "learning_rate": 1.3601606768258163e-05, - "loss": 0.2402, + "epoch": 3.387573431361949, + "grad_norm": 0.27618205547332764, + "learning_rate": 1.2456894329347685e-05, + "loss": 0.4132, "step": 93995 }, { - "epoch": 3.31, - "learning_rate": 1.3599071444109378e-05, - "loss": 0.2714, + "epoch": 3.3877536310231737, + "grad_norm": 0.19964580237865448, + "learning_rate": 1.2454370148566613e-05, + "loss": 0.3732, "step": 94000 }, { - "epoch": 3.31, - "eval_loss": 0.2543759047985077, - "eval_runtime": 10.548, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 3.3877536310231737, + "eval_loss": 0.43040722608566284, + "eval_runtime": 3.5294, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 7.083, "step": 94000 }, { - "epoch": 3.31, - "learning_rate": 1.3596536267994126e-05, - "loss": 0.2596, + "epoch": 3.3879338306843985, + "grad_norm": 0.2523888647556305, + "learning_rate": 1.2451846138717321e-05, + "loss": 0.3964, "step": 94005 }, { - "epoch": 3.31, - "learning_rate": 1.3594001239945345e-05, - "loss": 0.2738, + "epoch": 3.3881140303456228, + "grad_norm": 0.2638121247291565, + "learning_rate": 1.2449322299834196e-05, + "loss": 0.4108, "step": 94010 }, { - "epoch": 3.31, - "learning_rate": 1.3591466359995942e-05, - "loss": 0.2628, + "epoch": 3.3882942300068475, + "grad_norm": 0.2701154053211212, + "learning_rate": 1.2446798631951623e-05, + "loss": 0.3477, "step": 94015 }, { - "epoch": 3.31, - "learning_rate": 1.358893162817882e-05, - "loss": 0.2545, + "epoch": 3.388474429668072, + "grad_norm": 0.2638227641582489, + "learning_rate": 1.2444275135103988e-05, + "loss": 0.4001, "step": 94020 }, { - "epoch": 3.31, - "learning_rate": 1.358639704452691e-05, - "loss": 0.2568, + "epoch": 3.388654629329297, + "grad_norm": 0.20842356979846954, + "learning_rate": 1.2441751809325666e-05, + "loss": 0.3629, "step": 94025 }, { - "epoch": 3.31, - "learning_rate": 1.35838626090731e-05, - "loss": 0.2592, + "epoch": 3.3888348289905217, + "grad_norm": 0.24552541971206665, + "learning_rate": 1.2439228654651053e-05, + "loss": 0.381, "step": 94030 }, { - "epoch": 3.31, - "learning_rate": 1.3581328321850322e-05, - "loss": 0.2703, + "epoch": 3.389015028651746, + "grad_norm": 0.22910676896572113, + "learning_rate": 1.2436705671114515e-05, + "loss": 0.3913, "step": 94035 }, { - "epoch": 3.31, - "learning_rate": 1.3578794182891464e-05, - "loss": 0.2574, + "epoch": 3.3891952283129707, + "grad_norm": 0.1990845948457718, + "learning_rate": 1.2434182858750431e-05, + "loss": 0.4065, "step": 94040 }, { - "epoch": 3.31, - "learning_rate": 1.3576260192229445e-05, - "loss": 0.2624, + "epoch": 3.3893754279741954, + "grad_norm": 0.24163967370986938, + "learning_rate": 1.2431660217593175e-05, + "loss": 0.4221, "step": 94045 }, { - "epoch": 3.31, - "learning_rate": 1.3573726349897159e-05, - "loss": 0.2587, + "epoch": 3.38955562763542, + "grad_norm": 0.20850235223770142, + "learning_rate": 1.2429137747677105e-05, + "loss": 0.3812, "step": 94050 }, { - "epoch": 3.31, - "learning_rate": 1.3571192655927503e-05, - "loss": 0.2631, + "epoch": 3.3897358272966445, + "grad_norm": 0.22851799428462982, + "learning_rate": 1.2426615449036619e-05, + "loss": 0.4011, "step": 94055 }, { - "epoch": 3.31, - "learning_rate": 1.356865911035337e-05, - "loss": 0.2539, + "epoch": 3.389916026957869, + "grad_norm": 0.24421176314353943, + "learning_rate": 1.2424093321706052e-05, + "loss": 0.432, "step": 94060 }, { - "epoch": 3.31, - "learning_rate": 1.3566125713207678e-05, - "loss": 0.25, 
+ "epoch": 3.390096226619094, + "grad_norm": 0.20470160245895386, + "learning_rate": 1.242157136571977e-05, + "loss": 0.4326, "step": 94065 }, { - "epoch": 3.31, - "learning_rate": 1.3563592464523305e-05, - "loss": 0.2691, + "epoch": 3.3902764262803187, + "grad_norm": 0.2451028823852539, + "learning_rate": 1.2419049581112152e-05, + "loss": 0.329, "step": 94070 }, { - "epoch": 3.31, - "learning_rate": 1.3561059364333147e-05, - "loss": 0.2674, + "epoch": 3.3904566259415434, + "grad_norm": 0.20993046462535858, + "learning_rate": 1.2416527967917548e-05, + "loss": 0.3511, "step": 94075 }, { - "epoch": 3.31, - "learning_rate": 1.3558526412670086e-05, - "loss": 0.2457, + "epoch": 3.3906368256027677, + "grad_norm": 0.2813032567501068, + "learning_rate": 1.2414006526170312e-05, + "loss": 0.3764, "step": 94080 }, { - "epoch": 3.31, - "learning_rate": 1.355599360956702e-05, - "loss": 0.262, + "epoch": 3.3908170252639924, + "grad_norm": 0.22048485279083252, + "learning_rate": 1.2411485255904807e-05, + "loss": 0.3629, "step": 94085 }, { - "epoch": 3.31, - "learning_rate": 1.3553460955056845e-05, - "loss": 0.2832, + "epoch": 3.390997224925217, + "grad_norm": 0.2146688997745514, + "learning_rate": 1.2408964157155362e-05, + "loss": 0.3739, "step": 94090 }, { - "epoch": 3.31, - "learning_rate": 1.3550928449172433e-05, - "loss": 0.2359, + "epoch": 3.391177424586442, + "grad_norm": 0.26328352093696594, + "learning_rate": 1.2406443229956355e-05, + "loss": 0.4042, "step": 94095 }, { - "epoch": 3.31, - "learning_rate": 1.3548396091946663e-05, - "loss": 0.2592, + "epoch": 3.391357624247666, + "grad_norm": 0.22812364995479584, + "learning_rate": 1.240392247434212e-05, + "loss": 0.3294, "step": 94100 }, { - "epoch": 3.31, - "learning_rate": 1.3545863883412432e-05, - "loss": 0.2594, + "epoch": 3.391537823908891, + "grad_norm": 0.19867856800556183, + "learning_rate": 1.2401401890347003e-05, + "loss": 0.4121, "step": 94105 }, { - "epoch": 3.31, - "learning_rate": 1.3543331823602613e-05, - "loss": 0.2325, + "epoch": 3.3917180235701156, + "grad_norm": 0.24459275603294373, + "learning_rate": 1.2398881478005345e-05, + "loss": 0.3386, "step": 94110 }, { - "epoch": 3.31, - "learning_rate": 1.3540799912550073e-05, - "loss": 0.2471, + "epoch": 3.3918982232313404, + "grad_norm": 0.2276408076286316, + "learning_rate": 1.2396361237351478e-05, + "loss": 0.3889, "step": 94115 }, { - "epoch": 3.31, - "learning_rate": 1.3538268150287688e-05, - "loss": 0.2797, + "epoch": 3.392078422892565, + "grad_norm": 0.1976245641708374, + "learning_rate": 1.2393841168419759e-05, + "loss": 0.4016, "step": 94120 }, { - "epoch": 3.31, - "learning_rate": 1.3535736536848349e-05, - "loss": 0.2508, + "epoch": 3.39225862255379, + "grad_norm": 0.2941203713417053, + "learning_rate": 1.2391321271244524e-05, + "loss": 0.3932, "step": 94125 }, { - "epoch": 3.31, - "learning_rate": 1.3533205072264916e-05, - "loss": 0.2521, + "epoch": 3.392438822215014, + "grad_norm": 0.3133787214756012, + "learning_rate": 1.2388801545860076e-05, + "loss": 0.3873, "step": 94130 }, { - "epoch": 3.31, - "learning_rate": 1.3530673756570245e-05, - "loss": 0.2397, + "epoch": 3.392619021876239, + "grad_norm": 0.2651798725128174, + "learning_rate": 1.2386281992300775e-05, + "loss": 0.4417, "step": 94135 }, { - "epoch": 3.31, - "learning_rate": 1.3528142589797227e-05, - "loss": 0.2531, + "epoch": 3.3927992215374636, + "grad_norm": 0.23838862776756287, + "learning_rate": 1.2383762610600938e-05, + "loss": 0.3781, "step": 94140 }, { - "epoch": 3.31, - "learning_rate": 
1.3525611571978708e-05, - "loss": 0.2361, + "epoch": 3.3929794211986883, + "grad_norm": 0.2366458624601364, + "learning_rate": 1.2381243400794885e-05, + "loss": 0.3972, "step": 94145 }, { - "epoch": 3.31, - "learning_rate": 1.3523080703147572e-05, - "loss": 0.2691, + "epoch": 3.3931596208599126, + "grad_norm": 0.2220826894044876, + "learning_rate": 1.2378724362916966e-05, + "loss": 0.4127, "step": 94150 }, { - "epoch": 3.31, - "learning_rate": 1.3520549983336658e-05, - "loss": 0.2774, + "epoch": 3.3933398205211374, + "grad_norm": 0.20894181728363037, + "learning_rate": 1.2376205497001465e-05, + "loss": 0.3961, "step": 94155 }, { - "epoch": 3.31, - "learning_rate": 1.3518019412578848e-05, - "loss": 0.2514, + "epoch": 3.393520020182362, + "grad_norm": 0.24846115708351135, + "learning_rate": 1.2373686803082728e-05, + "loss": 0.3631, "step": 94160 }, { - "epoch": 3.31, - "learning_rate": 1.3515488990906989e-05, - "loss": 0.266, + "epoch": 3.393700219843587, + "grad_norm": 0.20306502282619476, + "learning_rate": 1.2371168281195067e-05, + "loss": 0.3865, "step": 94165 }, { - "epoch": 3.31, - "learning_rate": 1.3512958718353937e-05, - "loss": 0.2292, + "epoch": 3.3938804195048116, + "grad_norm": 0.222940593957901, + "learning_rate": 1.2368649931372791e-05, + "loss": 0.3733, "step": 94170 }, { - "epoch": 3.31, - "learning_rate": 1.3510428594952537e-05, - "loss": 0.2441, + "epoch": 3.394060619166036, + "grad_norm": 0.2514864206314087, + "learning_rate": 1.2366131753650214e-05, + "loss": 0.4155, "step": 94175 }, { - "epoch": 3.31, - "learning_rate": 1.3507898620735659e-05, - "loss": 0.2651, + "epoch": 3.3942408188272606, + "grad_norm": 0.2028576135635376, + "learning_rate": 1.2363613748061639e-05, + "loss": 0.3471, "step": 94180 }, { - "epoch": 3.31, - "learning_rate": 1.3505368795736143e-05, - "loss": 0.2452, + "epoch": 3.3944210184884853, + "grad_norm": 0.21311317384243011, + "learning_rate": 1.2361095914641388e-05, + "loss": 0.3696, "step": 94185 }, { - "epoch": 3.31, - "learning_rate": 1.3502839119986827e-05, - "loss": 0.2532, + "epoch": 3.39460121814971, + "grad_norm": 0.21538734436035156, + "learning_rate": 1.2358578253423757e-05, + "loss": 0.3949, "step": 94190 }, { - "epoch": 3.31, - "learning_rate": 1.350030959352058e-05, - "loss": 0.2476, + "epoch": 3.3947814178109343, + "grad_norm": 0.25031110644340515, + "learning_rate": 1.235606076444305e-05, + "loss": 0.3706, "step": 94195 }, { - "epoch": 3.31, - "learning_rate": 1.3497780216370226e-05, - "loss": 0.2817, + "epoch": 3.394961617472159, + "grad_norm": 0.1601521223783493, + "learning_rate": 1.235354344773357e-05, + "loss": 0.3664, "step": 94200 }, { - "epoch": 3.31, - "learning_rate": 1.349525098856862e-05, - "loss": 0.2676, + "epoch": 3.395141817133384, + "grad_norm": 0.24917049705982208, + "learning_rate": 1.235102630332961e-05, + "loss": 0.37, "step": 94205 }, { - "epoch": 3.31, - "learning_rate": 1.3492721910148604e-05, - "loss": 0.246, + "epoch": 3.3953220167946085, + "grad_norm": 0.24421679973602295, + "learning_rate": 1.234850933126546e-05, + "loss": 0.3709, "step": 94210 }, { - "epoch": 3.31, - "learning_rate": 1.3490192981142996e-05, - "loss": 0.2373, + "epoch": 3.3955022164558333, + "grad_norm": 0.20778076350688934, + "learning_rate": 1.2345992531575426e-05, + "loss": 0.4043, "step": 94215 }, { - "epoch": 3.31, - "learning_rate": 1.3487664201584659e-05, - "loss": 0.2551, + "epoch": 3.3956824161170576, + "grad_norm": 0.22638878226280212, + "learning_rate": 1.2343475904293797e-05, + "loss": 0.3479, "step": 94220 }, { - "epoch": 
3.32, - "learning_rate": 1.3485135571506416e-05, - "loss": 0.2407, + "epoch": 3.3958626157782823, + "grad_norm": 0.2236730009317398, + "learning_rate": 1.2340959449454859e-05, + "loss": 0.3658, "step": 94225 }, { - "epoch": 3.32, - "learning_rate": 1.3482607090941096e-05, - "loss": 0.2685, + "epoch": 3.396042815439507, + "grad_norm": 0.22104784846305847, + "learning_rate": 1.2338443167092898e-05, + "loss": 0.3816, "step": 94230 }, { - "epoch": 3.32, - "learning_rate": 1.3480078759921527e-05, - "loss": 0.2463, + "epoch": 3.3962230151007318, + "grad_norm": 0.26684364676475525, + "learning_rate": 1.2335927057242185e-05, + "loss": 0.3682, "step": 94235 }, { - "epoch": 3.32, - "learning_rate": 1.3477550578480553e-05, - "loss": 0.2558, + "epoch": 3.396403214761956, + "grad_norm": 0.22945067286491394, + "learning_rate": 1.2333411119937036e-05, + "loss": 0.3779, "step": 94240 }, { - "epoch": 3.32, - "learning_rate": 1.3475022546650979e-05, - "loss": 0.2484, + "epoch": 3.396583414423181, + "grad_norm": 0.18376512825489044, + "learning_rate": 1.2330895355211697e-05, + "loss": 0.3449, "step": 94245 }, { - "epoch": 3.32, - "learning_rate": 1.3472494664465652e-05, - "loss": 0.2492, + "epoch": 3.3967636140844055, + "grad_norm": 0.24991095066070557, + "learning_rate": 1.2328379763100445e-05, + "loss": 0.4067, "step": 94250 }, { - "epoch": 3.32, - "learning_rate": 1.3469966931957374e-05, - "loss": 0.2552, + "epoch": 3.3969438137456303, + "grad_norm": 0.2116982489824295, + "learning_rate": 1.2325864343637577e-05, + "loss": 0.3399, "step": 94255 }, { - "epoch": 3.32, - "learning_rate": 1.346743934915899e-05, - "loss": 0.2591, + "epoch": 3.397124013406855, + "grad_norm": 0.23185938596725464, + "learning_rate": 1.2323349096857354e-05, + "loss": 0.378, "step": 94260 }, { - "epoch": 3.32, - "learning_rate": 1.34649119161033e-05, - "loss": 0.2093, + "epoch": 3.3973042130680793, + "grad_norm": 0.21932083368301392, + "learning_rate": 1.2320834022794045e-05, + "loss": 0.3541, "step": 94265 }, { - "epoch": 3.32, - "learning_rate": 1.3462384632823122e-05, - "loss": 0.2402, + "epoch": 3.397484412729304, + "grad_norm": 0.20772342383861542, + "learning_rate": 1.2318319121481917e-05, + "loss": 0.3874, "step": 94270 }, { - "epoch": 3.32, - "learning_rate": 1.3459857499351286e-05, - "loss": 0.2607, + "epoch": 3.3976646123905287, + "grad_norm": 0.2540047764778137, + "learning_rate": 1.2315804392955228e-05, + "loss": 0.4273, "step": 94275 }, { - "epoch": 3.32, - "learning_rate": 1.3457330515720592e-05, - "loss": 0.2511, + "epoch": 3.3978448120517535, + "grad_norm": 0.25412896275520325, + "learning_rate": 1.2313289837248254e-05, + "loss": 0.3906, "step": 94280 }, { - "epoch": 3.32, - "learning_rate": 1.3454803681963856e-05, - "loss": 0.253, + "epoch": 3.3980250117129778, + "grad_norm": 0.2159910500049591, + "learning_rate": 1.2310775454395252e-05, + "loss": 0.3793, "step": 94285 }, { - "epoch": 3.32, - "learning_rate": 1.3452276998113878e-05, - "loss": 0.2711, + "epoch": 3.3982052113742025, + "grad_norm": 0.23486000299453735, + "learning_rate": 1.2308261244430477e-05, + "loss": 0.3846, "step": 94290 }, { - "epoch": 3.32, - "learning_rate": 1.344975046420348e-05, - "loss": 0.2385, + "epoch": 3.3983854110354272, + "grad_norm": 0.2207733690738678, + "learning_rate": 1.2305747207388187e-05, + "loss": 0.3535, "step": 94295 }, { - "epoch": 3.32, - "learning_rate": 1.3447224080265462e-05, - "loss": 0.2699, + "epoch": 3.398565610696652, + "grad_norm": 0.22273029386997223, + "learning_rate": 1.2303233343302623e-05, + "loss": 0.3769, 
"step": 94300 }, { - "epoch": 3.32, - "learning_rate": 1.3444697846332612e-05, - "loss": 0.2332, + "epoch": 3.3987458103578767, + "grad_norm": 0.2068636119365692, + "learning_rate": 1.2300719652208057e-05, + "loss": 0.355, "step": 94305 }, { - "epoch": 3.32, - "learning_rate": 1.344217176243775e-05, - "loss": 0.2502, + "epoch": 3.398926010019101, + "grad_norm": 0.2813783884048462, + "learning_rate": 1.2298206134138726e-05, + "loss": 0.3902, "step": 94310 }, { - "epoch": 3.32, - "learning_rate": 1.3439645828613678e-05, - "loss": 0.2498, + "epoch": 3.3991062096803257, + "grad_norm": 0.2261374592781067, + "learning_rate": 1.2295692789128877e-05, + "loss": 0.3624, "step": 94315 }, { - "epoch": 3.32, - "learning_rate": 1.3437120044893187e-05, - "loss": 0.2674, + "epoch": 3.3992864093415505, + "grad_norm": 0.21216978132724762, + "learning_rate": 1.2293179617212755e-05, + "loss": 0.3872, "step": 94320 }, { - "epoch": 3.32, - "learning_rate": 1.343459441130907e-05, - "loss": 0.2475, + "epoch": 3.399466609002775, + "grad_norm": 0.22973038256168365, + "learning_rate": 1.22906666184246e-05, + "loss": 0.386, "step": 94325 }, { - "epoch": 3.32, - "learning_rate": 1.3432068927894118e-05, - "loss": 0.2423, + "epoch": 3.3996468086639995, + "grad_norm": 0.18079350888729095, + "learning_rate": 1.2288153792798642e-05, + "loss": 0.3672, "step": 94330 }, { - "epoch": 3.32, - "learning_rate": 1.3429543594681133e-05, - "loss": 0.267, + "epoch": 3.399827008325224, + "grad_norm": 0.2188836932182312, + "learning_rate": 1.2285641140369147e-05, + "loss": 0.4015, "step": 94335 }, { - "epoch": 3.32, - "learning_rate": 1.3427018411702901e-05, - "loss": 0.2778, + "epoch": 3.400007207986449, + "grad_norm": 0.2155511975288391, + "learning_rate": 1.2283128661170313e-05, + "loss": 0.358, "step": 94340 }, { - "epoch": 3.32, - "learning_rate": 1.342449337899221e-05, - "loss": 0.2546, + "epoch": 3.4001874076476737, + "grad_norm": 0.2768744230270386, + "learning_rate": 1.2280616355236397e-05, + "loss": 0.4151, "step": 94345 }, { - "epoch": 3.32, - "learning_rate": 1.3421968496581827e-05, - "loss": 0.2568, + "epoch": 3.4003676073088984, + "grad_norm": 0.20749439299106598, + "learning_rate": 1.2278104222601618e-05, + "loss": 0.3595, "step": 94350 }, { - "epoch": 3.32, - "learning_rate": 1.3419443764504569e-05, - "loss": 0.2568, + "epoch": 3.4005478069701227, + "grad_norm": 0.28811633586883545, + "learning_rate": 1.227559226330021e-05, + "loss": 0.4218, "step": 94355 }, { - "epoch": 3.32, - "learning_rate": 1.3416919182793192e-05, - "loss": 0.2476, + "epoch": 3.4007280066313474, + "grad_norm": 0.24326983094215393, + "learning_rate": 1.2273080477366391e-05, + "loss": 0.3668, "step": 94360 }, { - "epoch": 3.32, - "learning_rate": 1.3414394751480492e-05, - "loss": 0.2513, + "epoch": 3.400908206292572, + "grad_norm": 0.20822405815124512, + "learning_rate": 1.2270568864834378e-05, + "loss": 0.3486, "step": 94365 }, { - "epoch": 3.32, - "learning_rate": 1.3411870470599233e-05, - "loss": 0.2527, + "epoch": 3.401088405953797, + "grad_norm": 0.21231502294540405, + "learning_rate": 1.2268057425738408e-05, + "loss": 0.3724, "step": 94370 }, { - "epoch": 3.32, - "learning_rate": 1.340934634018221e-05, - "loss": 0.2936, + "epoch": 3.401268605615021, + "grad_norm": 0.21015721559524536, + "learning_rate": 1.2265546160112692e-05, + "loss": 0.3906, "step": 94375 }, { - "epoch": 3.32, - "learning_rate": 1.3406822360262183e-05, - "loss": 0.2555, + "epoch": 3.401448805276246, + "grad_norm": 0.20890061557292938, + "learning_rate": 
1.2263035067991443e-05, + "loss": 0.3918, "step": 94380 }, { - "epoch": 3.32, - "learning_rate": 1.3404298530871922e-05, - "loss": 0.2488, + "epoch": 3.4016290049374707, + "grad_norm": 0.23352204263210297, + "learning_rate": 1.2260524149408875e-05, + "loss": 0.3573, "step": 94385 }, { - "epoch": 3.32, - "learning_rate": 1.3401774852044211e-05, - "loss": 0.2521, + "epoch": 3.4018092045986954, + "grad_norm": 0.24678070843219757, + "learning_rate": 1.2258013404399202e-05, + "loss": 0.3975, "step": 94390 }, { - "epoch": 3.32, - "learning_rate": 1.3399251323811807e-05, - "loss": 0.2486, + "epoch": 3.40198940425992, + "grad_norm": 0.27147865295410156, + "learning_rate": 1.2255502832996619e-05, + "loss": 0.3848, "step": 94395 }, { - "epoch": 3.32, - "learning_rate": 1.3396727946207482e-05, - "loss": 0.2644, + "epoch": 3.402169603921145, + "grad_norm": 0.2716798484325409, + "learning_rate": 1.225299243523535e-05, + "loss": 0.3492, "step": 94400 }, { - "epoch": 3.32, - "learning_rate": 1.3394204719263989e-05, - "loss": 0.2582, + "epoch": 3.402349803582369, + "grad_norm": 0.2307744324207306, + "learning_rate": 1.2250482211149591e-05, + "loss": 0.3821, "step": 94405 }, { - "epoch": 3.32, - "learning_rate": 1.3391681643014109e-05, - "loss": 0.2385, + "epoch": 3.402530003243594, + "grad_norm": 0.2626960575580597, + "learning_rate": 1.2247972160773544e-05, + "loss": 0.3784, "step": 94410 }, { - "epoch": 3.32, - "learning_rate": 1.338915871749058e-05, - "loss": 0.2645, + "epoch": 3.4027102029048186, + "grad_norm": 0.29123780131340027, + "learning_rate": 1.224546228414141e-05, + "loss": 0.3983, "step": 94415 }, { - "epoch": 3.32, - "learning_rate": 1.3386635942726183e-05, - "loss": 0.2564, + "epoch": 3.4028904025660434, + "grad_norm": 0.23781827092170715, + "learning_rate": 1.224295258128737e-05, + "loss": 0.4109, "step": 94420 }, { - "epoch": 3.32, - "learning_rate": 1.338411331875366e-05, - "loss": 0.255, + "epoch": 3.4030706022272676, + "grad_norm": 0.2665836811065674, + "learning_rate": 1.2240443052245651e-05, + "loss": 0.399, "step": 94425 }, { - "epoch": 3.32, - "learning_rate": 1.3381590845605776e-05, - "loss": 0.2679, + "epoch": 3.4032508018884924, + "grad_norm": 0.27731844782829285, + "learning_rate": 1.2237933697050416e-05, + "loss": 0.4108, "step": 94430 }, { - "epoch": 3.32, - "learning_rate": 1.337906852331528e-05, - "loss": 0.2662, + "epoch": 3.403431001549717, + "grad_norm": 0.20404113829135895, + "learning_rate": 1.2235424515735855e-05, + "loss": 0.3812, "step": 94435 }, { - "epoch": 3.32, - "learning_rate": 1.337654635191492e-05, - "loss": 0.2493, + "epoch": 3.403611201210942, + "grad_norm": 0.19817714393138885, + "learning_rate": 1.2232915508336173e-05, + "loss": 0.3763, "step": 94440 }, { - "epoch": 3.32, - "learning_rate": 1.3374024331437435e-05, - "loss": 0.2521, + "epoch": 3.4037914008721666, + "grad_norm": 0.2466011792421341, + "learning_rate": 1.2230406674885536e-05, + "loss": 0.3551, "step": 94445 }, { - "epoch": 3.32, - "learning_rate": 1.3371502461915592e-05, - "loss": 0.2722, + "epoch": 3.403971600533391, + "grad_norm": 0.2783677577972412, + "learning_rate": 1.2227898015418154e-05, + "loss": 0.3643, "step": 94450 }, { - "epoch": 3.32, - "learning_rate": 1.3368980743382126e-05, - "loss": 0.2694, + "epoch": 3.4041518001946156, + "grad_norm": 0.2562623620033264, + "learning_rate": 1.222538952996818e-05, + "loss": 0.4023, "step": 94455 }, { - "epoch": 3.32, - "learning_rate": 1.336645917586978e-05, - "loss": 0.2432, + "epoch": 3.4043319998558403, + "grad_norm": 0.2551863193511963, 
+ "learning_rate": 1.222288121856979e-05, + "loss": 0.3695, "step": 94460 }, { - "epoch": 3.32, - "learning_rate": 1.3363937759411287e-05, - "loss": 0.2721, + "epoch": 3.404512199517065, + "grad_norm": 0.27139002084732056, + "learning_rate": 1.222037308125718e-05, + "loss": 0.3763, "step": 94465 }, { - "epoch": 3.32, - "learning_rate": 1.336141649403939e-05, - "loss": 0.2625, + "epoch": 3.4046923991782894, + "grad_norm": 0.21469560265541077, + "learning_rate": 1.2217865118064512e-05, + "loss": 0.3879, "step": 94470 }, { - "epoch": 3.32, - "learning_rate": 1.3358895379786842e-05, - "loss": 0.2666, + "epoch": 3.404872598839514, + "grad_norm": 0.20434612035751343, + "learning_rate": 1.2215357329025956e-05, + "loss": 0.3651, "step": 94475 }, { - "epoch": 3.32, - "learning_rate": 1.3356374416686368e-05, - "loss": 0.2839, + "epoch": 3.405052798500739, + "grad_norm": 0.26074668765068054, + "learning_rate": 1.2212849714175684e-05, + "loss": 0.3782, "step": 94480 }, { - "epoch": 3.32, - "learning_rate": 1.3353853604770683e-05, - "loss": 0.2386, + "epoch": 3.4052329981619636, + "grad_norm": 0.22299236059188843, + "learning_rate": 1.2210342273547848e-05, + "loss": 0.3674, "step": 94485 }, { - "epoch": 3.32, - "learning_rate": 1.3351332944072548e-05, - "loss": 0.2651, + "epoch": 3.4054131978231883, + "grad_norm": 0.2286439836025238, + "learning_rate": 1.2207835007176632e-05, + "loss": 0.3969, "step": 94490 }, { - "epoch": 3.32, - "learning_rate": 1.3348812434624677e-05, - "loss": 0.2861, + "epoch": 3.4055933974844126, + "grad_norm": 0.21935893595218658, + "learning_rate": 1.2205327915096187e-05, + "loss": 0.3621, "step": 94495 }, { - "epoch": 3.32, - "learning_rate": 1.3346292076459788e-05, - "loss": 0.261, + "epoch": 3.4057735971456373, + "grad_norm": 0.2769947946071625, + "learning_rate": 1.2202820997340673e-05, + "loss": 0.4064, "step": 94500 }, { - "epoch": 3.32, - "eval_loss": 0.25450006127357483, - "eval_runtime": 10.5436, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 3.4057735971456373, + "eval_loss": 0.4305441975593567, + "eval_runtime": 3.5328, + "eval_samples_per_second": 28.307, + "eval_steps_per_second": 7.077, "step": 94500 }, { - "epoch": 3.32, - "learning_rate": 1.3343771869610627e-05, - "loss": 0.2545, + "epoch": 3.405953796806862, + "grad_norm": 0.22937120497226715, + "learning_rate": 1.2200314253944243e-05, + "loss": 0.4212, "step": 94505 }, { - "epoch": 3.33, - "learning_rate": 1.3341251814109906e-05, - "loss": 0.2607, + "epoch": 3.406133996468087, + "grad_norm": 0.2228795886039734, + "learning_rate": 1.2197807684941054e-05, + "loss": 0.3915, "step": 94510 }, { - "epoch": 3.33, - "learning_rate": 1.3338731909990348e-05, - "loss": 0.2558, + "epoch": 3.406314196129311, + "grad_norm": 0.2143542468547821, + "learning_rate": 1.2195301290365248e-05, + "loss": 0.38, "step": 94515 }, { - "epoch": 3.33, - "learning_rate": 1.3336212157284658e-05, - "loss": 0.2538, + "epoch": 3.406494395790536, + "grad_norm": 0.22707048058509827, + "learning_rate": 1.2192795070251001e-05, + "loss": 0.3598, "step": 94520 }, { - "epoch": 3.33, - "learning_rate": 1.3333692556025568e-05, - "loss": 0.2324, + "epoch": 3.4066745954517605, + "grad_norm": 0.24929481744766235, + "learning_rate": 1.2190289024632424e-05, + "loss": 0.389, "step": 94525 }, { - "epoch": 3.33, - "learning_rate": 1.3331173106245804e-05, - "loss": 0.2737, + "epoch": 3.4068547951129853, + "grad_norm": 0.2553023397922516, + "learning_rate": 1.2187783153543689e-05, + "loss": 0.3685, "step": 94530 }, { - "epoch": 
3.33, - "learning_rate": 1.3328653807978064e-05, - "loss": 0.2455, + "epoch": 3.40703499477421, + "grad_norm": 0.2211211770772934, + "learning_rate": 1.2185277457018926e-05, + "loss": 0.3369, "step": 94535 }, { - "epoch": 3.33, - "learning_rate": 1.3326134661255055e-05, - "loss": 0.2488, + "epoch": 3.4072151944354343, + "grad_norm": 0.27645251154899597, + "learning_rate": 1.2182771935092277e-05, + "loss": 0.3792, "step": 94540 }, { - "epoch": 3.33, - "learning_rate": 1.3323615666109502e-05, - "loss": 0.2556, + "epoch": 3.407395394096659, + "grad_norm": 0.23426538705825806, + "learning_rate": 1.2180266587797881e-05, + "loss": 0.3623, "step": 94545 }, { - "epoch": 3.33, - "learning_rate": 1.3321096822574106e-05, - "loss": 0.2651, + "epoch": 3.4075755937578838, + "grad_norm": 0.204464390873909, + "learning_rate": 1.2177761415169869e-05, + "loss": 0.3999, "step": 94550 }, { - "epoch": 3.33, - "learning_rate": 1.331857813068157e-05, - "loss": 0.2757, + "epoch": 3.4077557934191085, + "grad_norm": 0.22453001141548157, + "learning_rate": 1.2175256417242365e-05, + "loss": 0.3882, "step": 94555 }, { - "epoch": 3.33, - "learning_rate": 1.3316059590464591e-05, - "loss": 0.2661, + "epoch": 3.407935993080333, + "grad_norm": 0.22461944818496704, + "learning_rate": 1.217275159404952e-05, + "loss": 0.4007, "step": 94560 }, { - "epoch": 3.33, - "learning_rate": 1.3313541201955885e-05, - "loss": 0.2849, + "epoch": 3.4081161927415575, + "grad_norm": 0.2279561460018158, + "learning_rate": 1.2170246945625451e-05, + "loss": 0.3697, "step": 94565 }, { - "epoch": 3.33, - "learning_rate": 1.3311022965188147e-05, - "loss": 0.2361, + "epoch": 3.4082963924027823, + "grad_norm": 0.20913352072238922, + "learning_rate": 1.2167742472004284e-05, + "loss": 0.3608, "step": 94570 }, { - "epoch": 3.33, - "learning_rate": 1.330850488019406e-05, - "loss": 0.2496, + "epoch": 3.408476592064007, + "grad_norm": 0.2152101993560791, + "learning_rate": 1.2165238173220139e-05, + "loss": 0.3632, "step": 94575 }, { - "epoch": 3.33, - "learning_rate": 1.3305986947006343e-05, - "loss": 0.2545, + "epoch": 3.4086567917252317, + "grad_norm": 0.21769098937511444, + "learning_rate": 1.216273404930713e-05, + "loss": 0.3927, "step": 94580 }, { - "epoch": 3.33, - "learning_rate": 1.3303469165657673e-05, - "loss": 0.2494, + "epoch": 3.408836991386456, + "grad_norm": 0.22281882166862488, + "learning_rate": 1.2160230100299397e-05, + "loss": 0.3571, "step": 94585 }, { - "epoch": 3.33, - "learning_rate": 1.330095153618075e-05, - "loss": 0.2668, + "epoch": 3.4090171910476808, + "grad_norm": 0.20765550434589386, + "learning_rate": 1.2157726326231041e-05, + "loss": 0.3766, "step": 94590 }, { - "epoch": 3.33, - "learning_rate": 1.3298434058608267e-05, - "loss": 0.2508, + "epoch": 3.4091973907089055, + "grad_norm": 0.23559367656707764, + "learning_rate": 1.215522272713618e-05, + "loss": 0.3658, "step": 94595 }, { - "epoch": 3.33, - "learning_rate": 1.3295916732972896e-05, - "loss": 0.2632, + "epoch": 3.40937759037013, + "grad_norm": 0.23205064237117767, + "learning_rate": 1.2152719303048919e-05, + "loss": 0.4183, "step": 94600 }, { - "epoch": 3.33, - "learning_rate": 1.329339955930734e-05, - "loss": 0.2781, + "epoch": 3.4095577900313545, + "grad_norm": 0.2513408958911896, + "learning_rate": 1.2150216054003361e-05, + "loss": 0.3678, "step": 94605 }, { - "epoch": 3.33, - "learning_rate": 1.3290882537644278e-05, - "loss": 0.2595, + "epoch": 3.4097379896925792, + "grad_norm": 0.24810358881950378, + "learning_rate": 1.2147712980033629e-05, + "loss": 0.384, 
"step": 94610 }, { - "epoch": 3.33, - "learning_rate": 1.3288365668016378e-05, - "loss": 0.264, + "epoch": 3.409918189353804, + "grad_norm": Infinity, + "learning_rate": 1.2145710646935363e-05, + "loss": 0.3946, "step": 94615 }, { - "epoch": 3.33, - "learning_rate": 1.3285848950456341e-05, - "loss": 0.2602, + "epoch": 3.4100983890150287, + "grad_norm": 0.2287181168794632, + "learning_rate": 1.2143207888188035e-05, + "loss": 0.3305, "step": 94620 }, { - "epoch": 3.33, - "learning_rate": 1.3283332384996838e-05, - "loss": 0.2481, + "epoch": 3.4102785886762534, + "grad_norm": 0.28771016001701355, + "learning_rate": 1.2140705304612024e-05, + "loss": 0.4085, "step": 94625 }, { - "epoch": 3.33, - "learning_rate": 1.3280815971670535e-05, - "loss": 0.2513, + "epoch": 3.4104587883374777, + "grad_norm": 0.23076020181179047, + "learning_rate": 1.2138202896241413e-05, + "loss": 0.3834, "step": 94630 }, { - "epoch": 3.33, - "learning_rate": 1.3278299710510117e-05, - "loss": 0.2562, + "epoch": 3.4106389879987025, + "grad_norm": 0.25198495388031006, + "learning_rate": 1.2135700663110295e-05, + "loss": 0.3626, "step": 94635 }, { - "epoch": 3.33, - "learning_rate": 1.3275783601548248e-05, - "loss": 0.2786, + "epoch": 3.410819187659927, + "grad_norm": 0.19510357081890106, + "learning_rate": 1.2133198605252767e-05, + "loss": 0.4004, "step": 94640 }, { - "epoch": 3.33, - "learning_rate": 1.3273267644817607e-05, - "loss": 0.2617, + "epoch": 3.410999387321152, + "grad_norm": 0.21469490230083466, + "learning_rate": 1.2130696722702917e-05, + "loss": 0.3654, "step": 94645 }, { - "epoch": 3.33, - "learning_rate": 1.3270751840350861e-05, - "loss": 0.2527, + "epoch": 3.4111795869823767, + "grad_norm": 0.22288212180137634, + "learning_rate": 1.212819501549482e-05, + "loss": 0.3701, "step": 94650 }, { - "epoch": 3.33, - "learning_rate": 1.3268236188180661e-05, - "loss": 0.2464, + "epoch": 3.411359786643601, + "grad_norm": 0.25711116194725037, + "learning_rate": 1.2125693483662586e-05, + "loss": 0.3539, "step": 94655 }, { - "epoch": 3.33, - "learning_rate": 1.3265720688339695e-05, - "loss": 0.2613, + "epoch": 3.4115399863048257, + "grad_norm": 0.30416256189346313, + "learning_rate": 1.2123192127240286e-05, + "loss": 0.3982, "step": 94660 }, { - "epoch": 3.33, - "learning_rate": 1.326320534086061e-05, - "loss": 0.2414, + "epoch": 3.4117201859660504, + "grad_norm": 0.23342035710811615, + "learning_rate": 1.2120690946262e-05, + "loss": 0.3734, "step": 94665 }, { - "epoch": 3.33, - "learning_rate": 1.326069014577607e-05, - "loss": 0.2516, + "epoch": 3.411900385627275, + "grad_norm": 0.3036738932132721, + "learning_rate": 1.2118189940761807e-05, + "loss": 0.3875, "step": 94670 }, { - "epoch": 3.33, - "learning_rate": 1.3258175103118719e-05, - "loss": 0.2528, + "epoch": 3.4120805852885, + "grad_norm": 0.20019620656967163, + "learning_rate": 1.2115689110773771e-05, + "loss": 0.3689, "step": 94675 }, { - "epoch": 3.33, - "learning_rate": 1.325566021292124e-05, - "loss": 0.2653, + "epoch": 3.412260784949724, + "grad_norm": 0.23406051099300385, + "learning_rate": 1.2113188456331987e-05, + "loss": 0.3826, "step": 94680 }, { - "epoch": 3.33, - "learning_rate": 1.3253145475216261e-05, - "loss": 0.2526, + "epoch": 3.412440984610949, + "grad_norm": 0.1972576528787613, + "learning_rate": 1.2110687977470522e-05, + "loss": 0.3868, "step": 94685 }, { - "epoch": 3.33, - "learning_rate": 1.3250630890036458e-05, - "loss": 0.2568, + "epoch": 3.4126211842721736, + "grad_norm": 0.21116188168525696, + "learning_rate": 1.210818767422342e-05, + 
"loss": 0.3706, "step": 94690 }, { - "epoch": 3.33, - "learning_rate": 1.3248116457414461e-05, - "loss": 0.289, + "epoch": 3.4128013839333984, + "grad_norm": 0.23035746812820435, + "learning_rate": 1.2105687546624777e-05, + "loss": 0.3828, "step": 94695 }, { - "epoch": 3.33, - "learning_rate": 1.3245602177382936e-05, - "loss": 0.2483, + "epoch": 3.4129815835946227, + "grad_norm": 0.25209763646125793, + "learning_rate": 1.2103187594708635e-05, + "loss": 0.4037, "step": 94700 }, { - "epoch": 3.33, - "learning_rate": 1.3243088049974516e-05, - "loss": 0.2605, + "epoch": 3.4131617832558474, + "grad_norm": 0.23511983454227448, + "learning_rate": 1.2100687818509083e-05, + "loss": 0.3718, "step": 94705 }, { - "epoch": 3.33, - "learning_rate": 1.3240574075221856e-05, - "loss": 0.2474, + "epoch": 3.413341982917072, + "grad_norm": 0.2819865047931671, + "learning_rate": 1.2098188218060153e-05, + "loss": 0.4107, "step": 94710 }, { - "epoch": 3.33, - "learning_rate": 1.323806025315758e-05, - "loss": 0.2783, + "epoch": 3.413522182578297, + "grad_norm": 0.21473956108093262, + "learning_rate": 1.2095688793395898e-05, + "loss": 0.4104, "step": 94715 }, { - "epoch": 3.33, - "learning_rate": 1.3235546583814348e-05, - "loss": 0.2654, + "epoch": 3.4137023822395216, + "grad_norm": 0.24196425080299377, + "learning_rate": 1.2093189544550398e-05, + "loss": 0.4131, "step": 94720 }, { - "epoch": 3.33, - "learning_rate": 1.3233033067224792e-05, - "loss": 0.2342, + "epoch": 3.413882581900746, + "grad_norm": 0.2057386040687561, + "learning_rate": 1.2090690471557689e-05, + "loss": 0.3766, "step": 94725 }, { - "epoch": 3.33, - "learning_rate": 1.3230519703421537e-05, - "loss": 0.2719, + "epoch": 3.4140627815619706, + "grad_norm": 0.268764466047287, + "learning_rate": 1.2088191574451827e-05, + "loss": 0.375, "step": 94730 }, { - "epoch": 3.33, - "learning_rate": 1.3228006492437236e-05, - "loss": 0.2502, + "epoch": 3.4142429812231954, + "grad_norm": 0.27153804898262024, + "learning_rate": 1.2085692853266852e-05, + "loss": 0.4177, "step": 94735 }, { - "epoch": 3.33, - "learning_rate": 1.3225493434304514e-05, - "loss": 0.2529, + "epoch": 3.41442318088442, + "grad_norm": 0.23466871678829193, + "learning_rate": 1.2083194308036803e-05, + "loss": 0.4096, "step": 94740 }, { - "epoch": 3.33, - "learning_rate": 1.322298052905599e-05, - "loss": 0.2508, + "epoch": 3.4146033805456444, + "grad_norm": 0.22995726764202118, + "learning_rate": 1.2080695938795739e-05, + "loss": 0.359, "step": 94745 }, { - "epoch": 3.33, - "learning_rate": 1.3220467776724301e-05, - "loss": 0.252, + "epoch": 3.414783580206869, + "grad_norm": 0.2112264633178711, + "learning_rate": 1.2078197745577693e-05, + "loss": 0.3687, "step": 94750 }, { - "epoch": 3.33, - "learning_rate": 1.3217955177342087e-05, - "loss": 0.2732, + "epoch": 3.414963779868094, + "grad_norm": 0.19448639452457428, + "learning_rate": 1.20756997284167e-05, + "loss": 0.3947, "step": 94755 }, { - "epoch": 3.33, - "learning_rate": 1.321544273094196e-05, - "loss": 0.264, + "epoch": 3.4151439795293186, + "grad_norm": 0.20815762877464294, + "learning_rate": 1.2073201887346797e-05, + "loss": 0.4007, "step": 94760 }, { - "epoch": 3.33, - "learning_rate": 1.3212930437556542e-05, - "loss": 0.2485, + "epoch": 3.4153241791905433, + "grad_norm": 0.21343150734901428, + "learning_rate": 1.2070704222402016e-05, + "loss": 0.3743, "step": 94765 }, { - "epoch": 3.33, - "learning_rate": 1.3210418297218446e-05, - "loss": 0.2582, + "epoch": 3.4155043788517676, + "grad_norm": 0.2795332372188568, + "learning_rate": 
1.2068206733616375e-05, + "loss": 0.3962, "step": 94770 }, { - "epoch": 3.33, - "learning_rate": 1.3207906309960305e-05, - "loss": 0.2761, + "epoch": 3.4156845785129923, + "grad_norm": 0.26403021812438965, + "learning_rate": 1.2065709421023923e-05, + "loss": 0.3737, "step": 94775 }, { - "epoch": 3.33, - "learning_rate": 1.3205394475814732e-05, - "loss": 0.2472, + "epoch": 3.415864778174217, + "grad_norm": 0.21119317412376404, + "learning_rate": 1.2063212284658679e-05, + "loss": 0.363, "step": 94780 }, { - "epoch": 3.33, - "learning_rate": 1.320288279481434e-05, - "loss": 0.2437, + "epoch": 3.416044977835442, + "grad_norm": 0.23738868534564972, + "learning_rate": 1.206071532455466e-05, + "loss": 0.3517, "step": 94785 }, { - "epoch": 3.33, - "learning_rate": 1.320037126699173e-05, - "loss": 0.2663, + "epoch": 3.416225177496666, + "grad_norm": 0.22581474483013153, + "learning_rate": 1.2058218540745891e-05, + "loss": 0.3721, "step": 94790 }, { - "epoch": 3.34, - "learning_rate": 1.319785989237953e-05, - "loss": 0.2613, + "epoch": 3.416405377157891, + "grad_norm": 0.20880122482776642, + "learning_rate": 1.2055721933266381e-05, + "loss": 0.3756, "step": 94795 }, { - "epoch": 3.34, - "learning_rate": 1.3195348671010333e-05, - "loss": 0.2638, + "epoch": 3.4165855768191156, + "grad_norm": 0.19523420929908752, + "learning_rate": 1.205322550215017e-05, + "loss": 0.3655, "step": 94800 }, { - "epoch": 3.34, - "learning_rate": 1.3192837602916764e-05, - "loss": 0.2416, + "epoch": 3.4167657764803403, + "grad_norm": 0.26016107201576233, + "learning_rate": 1.2050729247431238e-05, + "loss": 0.3844, "step": 94805 }, { - "epoch": 3.34, - "learning_rate": 1.3190326688131407e-05, - "loss": 0.2669, + "epoch": 3.416945976141565, + "grad_norm": 0.21247242391109467, + "learning_rate": 1.2048233169143623e-05, + "loss": 0.409, "step": 94810 }, { - "epoch": 3.34, - "learning_rate": 1.3187815926686886e-05, - "loss": 0.2434, + "epoch": 3.4171261758027893, + "grad_norm": 0.27394580841064453, + "learning_rate": 1.204573726732132e-05, + "loss": 0.438, "step": 94815 }, { - "epoch": 3.34, - "learning_rate": 1.318530531861579e-05, - "loss": 0.2341, + "epoch": 3.417306375464014, + "grad_norm": 0.22982816398143768, + "learning_rate": 1.204324154199834e-05, + "loss": 0.3659, "step": 94820 }, { - "epoch": 3.34, - "learning_rate": 1.318279486395072e-05, - "loss": 0.2696, + "epoch": 3.417486575125239, + "grad_norm": 0.2429938167333603, + "learning_rate": 1.2040745993208685e-05, + "loss": 0.374, "step": 94825 }, { - "epoch": 3.34, - "learning_rate": 1.318028456272426e-05, - "loss": 0.2584, + "epoch": 3.4176667747864635, + "grad_norm": 0.2131299376487732, + "learning_rate": 1.2038250620986358e-05, + "loss": 0.4089, "step": 94830 }, { - "epoch": 3.34, - "learning_rate": 1.3177774414969023e-05, - "loss": 0.2407, + "epoch": 3.417846974447688, + "grad_norm": 0.2698366641998291, + "learning_rate": 1.2035755425365348e-05, + "loss": 0.3917, "step": 94835 }, { - "epoch": 3.34, - "learning_rate": 1.3175264420717594e-05, - "loss": 0.2605, + "epoch": 3.4180271741089125, + "grad_norm": 0.22712182998657227, + "learning_rate": 1.2033260406379668e-05, + "loss": 0.3908, "step": 94840 }, { - "epoch": 3.34, - "learning_rate": 1.3172754580002556e-05, - "loss": 0.2672, + "epoch": 3.4182073737701373, + "grad_norm": 0.25237366557121277, + "learning_rate": 1.2030765564063306e-05, + "loss": 0.4028, "step": 94845 }, { - "epoch": 3.34, - "learning_rate": 1.3170746817998638e-05, - "loss": 0.2632, + "epoch": 3.418387573431362, + "grad_norm": 0.22257663309574127, 
+ "learning_rate": 1.2028270898450254e-05, + "loss": 0.3829, "step": 94850 }, { - "epoch": 3.34, - "learning_rate": 1.316823725373125e-05, - "loss": 0.2434, + "epoch": 3.4185677730925867, + "grad_norm": 0.22527682781219482, + "learning_rate": 1.2025776409574496e-05, + "loss": 0.3725, "step": 94855 }, { - "epoch": 3.34, - "learning_rate": 1.3165727843091493e-05, - "loss": 0.2421, + "epoch": 3.418747972753811, + "grad_norm": 0.24640600383281708, + "learning_rate": 1.2023282097470016e-05, + "loss": 0.4155, "step": 94860 }, { - "epoch": 3.34, - "learning_rate": 1.3163218586111972e-05, - "loss": 0.2645, + "epoch": 3.4189281724150358, + "grad_norm": 0.2200070023536682, + "learning_rate": 1.202078796217081e-05, + "loss": 0.3464, "step": 94865 }, { - "epoch": 3.34, - "learning_rate": 1.3160709482825257e-05, - "loss": 0.2445, + "epoch": 3.4191083720762605, + "grad_norm": 0.19475413858890533, + "learning_rate": 1.2018294003710867e-05, + "loss": 0.3841, "step": 94870 }, { - "epoch": 3.34, - "learning_rate": 1.3158200533263926e-05, - "loss": 0.2647, + "epoch": 3.4192885717374852, + "grad_norm": 0.2146916389465332, + "learning_rate": 1.2015800222124137e-05, + "loss": 0.3938, "step": 94875 }, { - "epoch": 3.34, - "learning_rate": 1.3155691737460545e-05, - "loss": 0.2784, + "epoch": 3.4194687713987095, + "grad_norm": 0.2292756289243698, + "learning_rate": 1.2013306617444625e-05, + "loss": 0.3847, "step": 94880 }, { - "epoch": 3.34, - "learning_rate": 1.3153183095447713e-05, - "loss": 0.25, + "epoch": 3.4196489710599343, + "grad_norm": 0.26337769627571106, + "learning_rate": 1.2010813189706285e-05, + "loss": 0.4117, "step": 94885 }, { - "epoch": 3.34, - "learning_rate": 1.3150674607257983e-05, - "loss": 0.2598, + "epoch": 3.419829170721159, + "grad_norm": 0.2760777175426483, + "learning_rate": 1.200831993894311e-05, + "loss": 0.3819, "step": 94890 }, { - "epoch": 3.34, - "learning_rate": 1.314816627292394e-05, - "loss": 0.2393, + "epoch": 3.4200093703823837, + "grad_norm": 0.22346773743629456, + "learning_rate": 1.200582686518907e-05, + "loss": 0.3848, "step": 94895 }, { - "epoch": 3.34, - "learning_rate": 1.3145658092478141e-05, - "loss": 0.2418, + "epoch": 3.4201895700436085, + "grad_norm": 0.2624737322330475, + "learning_rate": 1.2003333968478106e-05, + "loss": 0.3789, "step": 94900 }, { - "epoch": 3.34, - "learning_rate": 1.314315006595317e-05, - "loss": 0.2527, + "epoch": 3.420369769704833, + "grad_norm": 0.33500173687934875, + "learning_rate": 1.200084124884421e-05, + "loss": 0.3858, "step": 94905 }, { - "epoch": 3.34, - "learning_rate": 1.3140642193381578e-05, - "loss": 0.2348, + "epoch": 3.4205499693660575, + "grad_norm": 0.19514542818069458, + "learning_rate": 1.1998348706321333e-05, + "loss": 0.3719, "step": 94910 }, { - "epoch": 3.34, - "learning_rate": 1.3138134474795921e-05, - "loss": 0.2663, + "epoch": 3.420730169027282, + "grad_norm": 0.25635966658592224, + "learning_rate": 1.1995856340943439e-05, + "loss": 0.3643, "step": 94915 }, { - "epoch": 3.34, - "learning_rate": 1.3135626910228784e-05, - "loss": 0.2632, + "epoch": 3.420910368688507, + "grad_norm": 0.1672072410583496, + "learning_rate": 1.1993364152744485e-05, + "loss": 0.3837, "step": 94920 }, { - "epoch": 3.34, - "learning_rate": 1.3133119499712712e-05, - "loss": 0.2536, + "epoch": 3.4210905683497317, + "grad_norm": 0.23457878828048706, + "learning_rate": 1.1990872141758417e-05, + "loss": 0.3667, "step": 94925 }, { - "epoch": 3.34, - "learning_rate": 1.313061224328026e-05, - "loss": 0.2499, + "epoch": 3.421270768010956, + 
"grad_norm": 0.23055920004844666, + "learning_rate": 1.1988380308019207e-05, + "loss": 0.3776, "step": 94930 }, { - "epoch": 3.34, - "learning_rate": 1.3128105140963981e-05, - "loss": 0.2533, + "epoch": 3.4214509676721807, + "grad_norm": 0.24517036974430084, + "learning_rate": 1.1985888651560795e-05, + "loss": 0.3673, "step": 94935 }, { - "epoch": 3.34, - "learning_rate": 1.312559819279644e-05, - "loss": 0.2737, + "epoch": 3.4216311673334054, + "grad_norm": 0.22914618253707886, + "learning_rate": 1.1983397172417129e-05, + "loss": 0.3627, "step": 94940 }, { - "epoch": 3.34, - "learning_rate": 1.312309139881017e-05, - "loss": 0.2586, + "epoch": 3.42181136699463, + "grad_norm": 0.2351149469614029, + "learning_rate": 1.1980905870622159e-05, + "loss": 0.3745, "step": 94945 }, { - "epoch": 3.34, - "learning_rate": 1.3120584759037741e-05, - "loss": 0.2556, + "epoch": 3.421991566655855, + "grad_norm": 0.20726028084754944, + "learning_rate": 1.1978414746209826e-05, + "loss": 0.3554, "step": 94950 }, { - "epoch": 3.34, - "learning_rate": 1.311807827351168e-05, - "loss": 0.2735, + "epoch": 3.422171766317079, + "grad_norm": 0.2245931476354599, + "learning_rate": 1.1975923799214061e-05, + "loss": 0.4048, "step": 94955 }, { - "epoch": 3.34, - "learning_rate": 1.3115571942264554e-05, - "loss": 0.2577, + "epoch": 3.422351965978304, + "grad_norm": 0.2029707431793213, + "learning_rate": 1.1973433029668821e-05, + "loss": 0.3594, "step": 94960 }, { - "epoch": 3.34, - "learning_rate": 1.3113065765328892e-05, - "loss": 0.2492, + "epoch": 3.4225321656395287, + "grad_norm": 0.22745849192142487, + "learning_rate": 1.1970942437608035e-05, + "loss": 0.4059, "step": 94965 }, { - "epoch": 3.34, - "learning_rate": 1.3110559742737239e-05, - "loss": 0.2564, + "epoch": 3.4227123653007534, + "grad_norm": 0.283438116312027, + "learning_rate": 1.1968452023065635e-05, + "loss": 0.4164, "step": 94970 }, { - "epoch": 3.34, - "learning_rate": 1.310805387452212e-05, - "loss": 0.2664, + "epoch": 3.4228925649619777, + "grad_norm": 0.181619331240654, + "learning_rate": 1.1965961786075556e-05, + "loss": 0.3374, "step": 94975 }, { - "epoch": 3.34, - "learning_rate": 1.3105548160716097e-05, - "loss": 0.2677, + "epoch": 3.4230727646232024, + "grad_norm": 0.1780719757080078, + "learning_rate": 1.196347172667171e-05, + "loss": 0.3894, "step": 94980 }, { - "epoch": 3.34, - "learning_rate": 1.310304260135169e-05, - "loss": 0.2575, + "epoch": 3.423252964284427, + "grad_norm": 0.24398981034755707, + "learning_rate": 1.196098184488806e-05, + "loss": 0.3389, "step": 94985 }, { - "epoch": 3.34, - "learning_rate": 1.3100537196461427e-05, - "loss": 0.2438, + "epoch": 3.423433163945652, + "grad_norm": 0.21179978549480438, + "learning_rate": 1.1958492140758497e-05, + "loss": 0.3897, "step": 94990 }, { - "epoch": 3.34, - "learning_rate": 1.3098031946077854e-05, - "loss": 0.2433, + "epoch": 3.4236133636068766, + "grad_norm": 0.2665993273258209, + "learning_rate": 1.1956002614316946e-05, + "loss": 0.3834, "step": 94995 }, { - "epoch": 3.34, - "learning_rate": 1.3095526850233486e-05, - "loss": 0.2506, + "epoch": 3.423793563268101, + "grad_norm": 0.24159680306911469, + "learning_rate": 1.1953513265597344e-05, + "loss": 0.3963, "step": 95000 }, { - "epoch": 3.34, - "eval_loss": 0.25423306226730347, - "eval_runtime": 10.5407, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 3.423793563268101, + "eval_loss": 0.4299665093421936, + "eval_runtime": 3.5329, + "eval_samples_per_second": 28.305, + "eval_steps_per_second": 7.076, 
"step": 95000 }, { - "epoch": 3.34, - "learning_rate": 1.3093021908960862e-05, - "loss": 0.2546, + "epoch": 3.4239737629293256, + "grad_norm": 0.2492501586675644, + "learning_rate": 1.1951024094633594e-05, + "loss": 0.3966, "step": 95005 }, { - "epoch": 3.34, - "learning_rate": 1.3090517122292505e-05, - "loss": 0.234, + "epoch": 3.4241539625905504, + "grad_norm": 0.2158864438533783, + "learning_rate": 1.1948535101459619e-05, + "loss": 0.3756, "step": 95010 }, { - "epoch": 3.34, - "learning_rate": 1.3088012490260924e-05, - "loss": 0.2825, + "epoch": 3.424334162251775, + "grad_norm": 0.20892734825611115, + "learning_rate": 1.1946046286109321e-05, + "loss": 0.3704, "step": 95015 }, { - "epoch": 3.34, - "learning_rate": 1.3085508012898659e-05, - "loss": 0.2709, + "epoch": 3.4245143619129994, + "grad_norm": 0.2723451852798462, + "learning_rate": 1.194355764861661e-05, + "loss": 0.3775, "step": 95020 }, { - "epoch": 3.34, - "learning_rate": 1.308300369023822e-05, - "loss": 0.2723, + "epoch": 3.424694561574224, + "grad_norm": 0.24041932821273804, + "learning_rate": 1.1941069189015406e-05, + "loss": 0.3849, "step": 95025 }, { - "epoch": 3.34, - "learning_rate": 1.3080499522312118e-05, - "loss": 0.2856, + "epoch": 3.424874761235449, + "grad_norm": 0.23057153820991516, + "learning_rate": 1.1938580907339606e-05, + "loss": 0.3625, "step": 95030 }, { - "epoch": 3.34, - "learning_rate": 1.3077995509152882e-05, - "loss": 0.2701, + "epoch": 3.4250549608966736, + "grad_norm": 0.23812946677207947, + "learning_rate": 1.1936092803623114e-05, + "loss": 0.3765, "step": 95035 }, { - "epoch": 3.34, - "learning_rate": 1.3075491650793017e-05, - "loss": 0.2622, + "epoch": 3.4252351605578983, + "grad_norm": 0.18412302434444427, + "learning_rate": 1.1933604877899825e-05, + "loss": 0.3616, "step": 95040 }, { - "epoch": 3.34, - "learning_rate": 1.3072987947265037e-05, - "loss": 0.2584, + "epoch": 3.4254153602191226, + "grad_norm": 0.20551741123199463, + "learning_rate": 1.1931117130203634e-05, + "loss": 0.3636, "step": 95045 }, { - "epoch": 3.34, - "learning_rate": 1.3070484398601434e-05, - "loss": 0.2455, + "epoch": 3.4255955598803474, + "grad_norm": 0.2384248822927475, + "learning_rate": 1.192862956056845e-05, + "loss": 0.399, "step": 95050 }, { - "epoch": 3.34, - "learning_rate": 1.3067981004834734e-05, - "loss": 0.2491, + "epoch": 3.425775759541572, + "grad_norm": 0.27354708313941956, + "learning_rate": 1.1926142169028156e-05, + "loss": 0.3748, "step": 95055 }, { - "epoch": 3.34, - "learning_rate": 1.3065477765997442e-05, - "loss": 0.2469, + "epoch": 3.425955959202797, + "grad_norm": 0.23982170224189758, + "learning_rate": 1.1923654955616645e-05, + "loss": 0.3492, "step": 95060 }, { - "epoch": 3.34, - "learning_rate": 1.3062974682122059e-05, - "loss": 0.2695, + "epoch": 3.426136158864021, + "grad_norm": 0.20187516510486603, + "learning_rate": 1.1921167920367805e-05, + "loss": 0.3737, "step": 95065 }, { - "epoch": 3.34, - "learning_rate": 1.3060471753241074e-05, - "loss": 0.2539, + "epoch": 3.426316358525246, + "grad_norm": 0.2167653739452362, + "learning_rate": 1.1918681063315507e-05, + "loss": 0.3887, "step": 95070 }, { - "epoch": 3.35, - "learning_rate": 1.3057968979387004e-05, - "loss": 0.2544, + "epoch": 3.4264965581864706, + "grad_norm": 0.2231149524450302, + "learning_rate": 1.1916194384493659e-05, + "loss": 0.3895, "step": 95075 }, { - "epoch": 3.35, - "learning_rate": 1.3055466360592337e-05, - "loss": 0.2343, + "epoch": 3.4266767578476953, + "grad_norm": 0.2766813337802887, + "learning_rate": 
1.1913707883936136e-05, + "loss": 0.3737, "step": 95080 }, { - "epoch": 3.35, - "learning_rate": 1.3052963896889565e-05, - "loss": 0.2827, + "epoch": 3.42685695750892, + "grad_norm": 0.2061295360326767, + "learning_rate": 1.1911221561676795e-05, + "loss": 0.3716, "step": 95085 }, { - "epoch": 3.35, - "learning_rate": 1.3050461588311175e-05, - "loss": 0.2629, + "epoch": 3.4270371571701443, + "grad_norm": 0.21205422282218933, + "learning_rate": 1.1908735417749534e-05, + "loss": 0.3941, "step": 95090 }, { - "epoch": 3.35, - "learning_rate": 1.3047959434889679e-05, - "loss": 0.2746, + "epoch": 3.427217356831369, + "grad_norm": 0.2193949669599533, + "learning_rate": 1.1906249452188217e-05, + "loss": 0.3787, "step": 95095 }, { - "epoch": 3.35, - "learning_rate": 1.304545743665755e-05, - "loss": 0.2742, + "epoch": 3.427397556492594, + "grad_norm": 0.1897018849849701, + "learning_rate": 1.1903763665026719e-05, + "loss": 0.3418, "step": 95100 }, { - "epoch": 3.35, - "learning_rate": 1.3042955593647267e-05, - "loss": 0.2586, + "epoch": 3.4275777561538185, + "grad_norm": 0.2177032083272934, + "learning_rate": 1.1901278056298901e-05, + "loss": 0.3671, "step": 95105 }, { - "epoch": 3.35, - "learning_rate": 1.3040453905891328e-05, - "loss": 0.2966, + "epoch": 3.427757955815043, + "grad_norm": 0.29652655124664307, + "learning_rate": 1.1898792626038626e-05, + "loss": 0.366, "step": 95110 }, { - "epoch": 3.35, - "learning_rate": 1.3037952373422225e-05, - "loss": 0.2524, + "epoch": 3.4279381554762676, + "grad_norm": 0.20908688008785248, + "learning_rate": 1.1896307374279777e-05, + "loss": 0.3806, "step": 95115 }, { - "epoch": 3.35, - "learning_rate": 1.3035450996272422e-05, - "loss": 0.2349, + "epoch": 3.4281183551374923, + "grad_norm": 0.2735453248023987, + "learning_rate": 1.18938223010562e-05, + "loss": 0.3945, "step": 95120 }, { - "epoch": 3.35, - "learning_rate": 1.3032949774474407e-05, - "loss": 0.2827, + "epoch": 3.428298554798717, + "grad_norm": 0.23692834377288818, + "learning_rate": 1.1891337406401758e-05, + "loss": 0.3743, "step": 95125 }, { - "epoch": 3.35, - "learning_rate": 1.3030448708060639e-05, - "loss": 0.2449, + "epoch": 3.4284787544599418, + "grad_norm": 0.23678283393383026, + "learning_rate": 1.1888852690350305e-05, + "loss": 0.3876, "step": 95130 }, { - "epoch": 3.35, - "learning_rate": 1.3027947797063619e-05, - "loss": 0.2624, + "epoch": 3.428658954121166, + "grad_norm": 0.30739691853523254, + "learning_rate": 1.1886368152935698e-05, + "loss": 0.3843, "step": 95135 }, { - "epoch": 3.35, - "learning_rate": 1.3025447041515806e-05, - "loss": 0.2681, + "epoch": 3.428839153782391, + "grad_norm": 0.23699170351028442, + "learning_rate": 1.1883883794191774e-05, + "loss": 0.363, "step": 95140 }, { - "epoch": 3.35, - "learning_rate": 1.3022946441449663e-05, - "loss": 0.2653, + "epoch": 3.4290193534436155, + "grad_norm": 0.2635442912578583, + "learning_rate": 1.1881399614152406e-05, + "loss": 0.3764, "step": 95145 }, { - "epoch": 3.35, - "learning_rate": 1.3020445996897673e-05, - "loss": 0.2469, + "epoch": 3.4291995531048403, + "grad_norm": 0.23307177424430847, + "learning_rate": 1.1878915612851428e-05, + "loss": 0.3758, "step": 95150 }, { - "epoch": 3.35, - "learning_rate": 1.3017945707892305e-05, - "loss": 0.2505, + "epoch": 3.429379752766065, + "grad_norm": 0.208553284406662, + "learning_rate": 1.1876431790322682e-05, + "loss": 0.3888, "step": 95155 }, { - "epoch": 3.35, - "learning_rate": 1.3015445574465999e-05, - "loss": 0.249, + "epoch": 3.4295599524272893, + "grad_norm": 
0.30267196893692017, + "learning_rate": 1.1873948146600012e-05, + "loss": 0.4137, "step": 95160 }, { - "epoch": 3.35, - "learning_rate": 1.3012945596651243e-05, - "loss": 0.2543, + "epoch": 3.429740152088514, + "grad_norm": 0.21039345860481262, + "learning_rate": 1.1871464681717249e-05, + "loss": 0.382, "step": 95165 }, { - "epoch": 3.35, - "learning_rate": 1.3010445774480484e-05, - "loss": 0.2588, + "epoch": 3.4299203517497387, + "grad_norm": 0.21296541392803192, + "learning_rate": 1.1868981395708256e-05, + "loss": 0.3729, "step": 95170 }, { - "epoch": 3.35, - "learning_rate": 1.3007946107986191e-05, - "loss": 0.2457, + "epoch": 3.4301005514109635, + "grad_norm": 0.2397422194480896, + "learning_rate": 1.1866498288606843e-05, + "loss": 0.3815, "step": 95175 }, { - "epoch": 3.35, - "learning_rate": 1.3005446597200815e-05, - "loss": 0.2692, + "epoch": 3.430280751072188, + "grad_norm": 0.259847491979599, + "learning_rate": 1.1864015360446834e-05, + "loss": 0.3923, "step": 95180 }, { - "epoch": 3.35, - "learning_rate": 1.3002947242156805e-05, - "loss": 0.2647, + "epoch": 3.4304609507334125, + "grad_norm": 0.23978331685066223, + "learning_rate": 1.1861532611262086e-05, + "loss": 0.3823, "step": 95185 }, { - "epoch": 3.35, - "learning_rate": 1.3000448042886626e-05, - "loss": 0.2713, + "epoch": 3.4306411503946372, + "grad_norm": 0.20742113888263702, + "learning_rate": 1.18590500410864e-05, + "loss": 0.3816, "step": 95190 }, { - "epoch": 3.35, - "learning_rate": 1.2997948999422719e-05, - "loss": 0.2444, + "epoch": 3.430821350055862, + "grad_norm": 0.24058598279953003, + "learning_rate": 1.1856567649953635e-05, + "loss": 0.3628, "step": 95195 }, { - "epoch": 3.35, - "learning_rate": 1.2995450111797535e-05, - "loss": 0.2554, + "epoch": 3.4310015497170867, + "grad_norm": 0.1972092241048813, + "learning_rate": 1.1854085437897578e-05, + "loss": 0.3614, "step": 95200 }, { - "epoch": 3.35, - "learning_rate": 1.299295138004351e-05, - "loss": 0.2439, + "epoch": 3.431181749378311, + "grad_norm": 0.22613808512687683, + "learning_rate": 1.1851603404952056e-05, + "loss": 0.3998, "step": 95205 }, { - "epoch": 3.35, - "learning_rate": 1.299045280419311e-05, - "loss": 0.2501, + "epoch": 3.4313619490395357, + "grad_norm": 0.24530638754367828, + "learning_rate": 1.1849121551150899e-05, + "loss": 0.3455, "step": 95210 }, { - "epoch": 3.35, - "learning_rate": 1.2987954384278756e-05, - "loss": 0.2727, + "epoch": 3.4315421487007605, + "grad_norm": 0.22673511505126953, + "learning_rate": 1.1846639876527915e-05, + "loss": 0.3772, "step": 95215 }, { - "epoch": 3.35, - "learning_rate": 1.2985456120332906e-05, - "loss": 0.2381, + "epoch": 3.431722348361985, + "grad_norm": 0.24466091394424438, + "learning_rate": 1.1844158381116916e-05, + "loss": 0.3992, "step": 95220 }, { - "epoch": 3.35, - "learning_rate": 1.2982958012387985e-05, - "loss": 0.2882, + "epoch": 3.43190254802321, + "grad_norm": 0.24469353258609772, + "learning_rate": 1.1841677064951714e-05, + "loss": 0.3863, "step": 95225 }, { - "epoch": 3.35, - "learning_rate": 1.298046006047644e-05, - "loss": 0.247, + "epoch": 3.432082747684434, + "grad_norm": 0.23059558868408203, + "learning_rate": 1.1839195928066102e-05, + "loss": 0.4255, "step": 95230 }, { - "epoch": 3.35, - "learning_rate": 1.29779622646307e-05, - "loss": 0.2704, + "epoch": 3.432262947345659, + "grad_norm": 0.18520836532115936, + "learning_rate": 1.183671497049391e-05, + "loss": 0.3682, "step": 95235 }, { - "epoch": 3.35, - "learning_rate": 1.29754646248832e-05, - "loss": 0.2382, + "epoch": 
3.4324431470068837, + "grad_norm": 0.2409636229276657, + "learning_rate": 1.1834234192268926e-05, + "loss": 0.3583, "step": 95240 }, { - "epoch": 3.35, - "learning_rate": 1.2972967141266357e-05, - "loss": 0.2845, + "epoch": 3.4326233466681084, + "grad_norm": 0.1873437464237213, + "learning_rate": 1.1831753593424951e-05, + "loss": 0.3832, "step": 95245 }, { - "epoch": 3.35, - "learning_rate": 1.2970469813812619e-05, - "loss": 0.2614, + "epoch": 3.4328035463293327, + "grad_norm": 0.27376478910446167, + "learning_rate": 1.1829273173995787e-05, + "loss": 0.3519, "step": 95250 }, { - "epoch": 3.35, - "learning_rate": 1.29679726425544e-05, - "loss": 0.2622, + "epoch": 3.4329837459905574, + "grad_norm": 0.24432578682899475, + "learning_rate": 1.1826792934015216e-05, + "loss": 0.4051, "step": 95255 }, { - "epoch": 3.35, - "learning_rate": 1.2965475627524122e-05, - "loss": 0.2555, + "epoch": 3.433163945651782, + "grad_norm": 0.3113139867782593, + "learning_rate": 1.182431287351705e-05, + "loss": 0.41, "step": 95260 }, { - "epoch": 3.35, - "learning_rate": 1.2962978768754217e-05, - "loss": 0.2572, + "epoch": 3.433344145313007, + "grad_norm": 0.24303504824638367, + "learning_rate": 1.1821832992535078e-05, + "loss": 0.3869, "step": 95265 }, { - "epoch": 3.35, - "learning_rate": 1.296048206627709e-05, - "loss": 0.2654, + "epoch": 3.4335243449742316, + "grad_norm": 0.19319789111614227, + "learning_rate": 1.1819353291103064e-05, + "loss": 0.3631, "step": 95270 }, { - "epoch": 3.35, - "learning_rate": 1.2957985520125182e-05, - "loss": 0.2568, + "epoch": 3.433704544635456, + "grad_norm": 0.24024613201618195, + "learning_rate": 1.1816873769254816e-05, + "loss": 0.3829, "step": 95275 }, { - "epoch": 3.35, - "learning_rate": 1.2955489130330884e-05, - "loss": 0.2593, + "epoch": 3.4338847442966807, + "grad_norm": 0.23290759325027466, + "learning_rate": 1.1814394427024112e-05, + "loss": 0.3731, "step": 95280 }, { - "epoch": 3.35, - "learning_rate": 1.2952992896926636e-05, - "loss": 0.2639, + "epoch": 3.4340649439579054, + "grad_norm": 0.21962322294712067, + "learning_rate": 1.1811915264444732e-05, + "loss": 0.3682, "step": 95285 }, { - "epoch": 3.35, - "learning_rate": 1.295049681994483e-05, - "loss": 0.2456, + "epoch": 3.43424514361913, + "grad_norm": 0.2735930383205414, + "learning_rate": 1.1809436281550451e-05, + "loss": 0.4133, "step": 95290 }, { - "epoch": 3.35, - "learning_rate": 1.2948000899417887e-05, - "loss": 0.2499, + "epoch": 3.4344253432803544, + "grad_norm": 0.2372862547636032, + "learning_rate": 1.1806957478375038e-05, + "loss": 0.3902, "step": 95295 }, { - "epoch": 3.35, - "learning_rate": 1.2945505135378196e-05, - "loss": 0.2455, + "epoch": 3.434605542941579, + "grad_norm": 0.24612626433372498, + "learning_rate": 1.1804478854952287e-05, + "loss": 0.4095, "step": 95300 }, { - "epoch": 3.35, - "learning_rate": 1.2943009527858187e-05, - "loss": 0.2623, + "epoch": 3.434785742602804, + "grad_norm": 0.21510684490203857, + "learning_rate": 1.1802000411315953e-05, + "loss": 0.3714, "step": 95305 }, { - "epoch": 3.35, - "learning_rate": 1.2940514076890257e-05, - "loss": 0.2515, + "epoch": 3.4349659422640286, + "grad_norm": 0.23222926259040833, + "learning_rate": 1.1799522147499812e-05, + "loss": 0.3823, "step": 95310 }, { - "epoch": 3.35, - "learning_rate": 1.29380187825068e-05, - "loss": 0.2423, + "epoch": 3.4351461419252534, + "grad_norm": 0.303889662027359, + "learning_rate": 1.1797044063537625e-05, + "loss": 0.374, "step": 95315 }, { - "epoch": 3.35, - "learning_rate": 1.2935523644740213e-05, - 
"loss": 0.256, + "epoch": 3.4353263415864776, + "grad_norm": 0.22583414614200592, + "learning_rate": 1.1794566159463155e-05, + "loss": 0.3554, "step": 95320 }, { - "epoch": 3.35, - "learning_rate": 1.2933028663622909e-05, - "loss": 0.2667, + "epoch": 3.4355065412477024, + "grad_norm": 0.21528246998786926, + "learning_rate": 1.1792088435310156e-05, + "loss": 0.4096, "step": 95325 }, { - "epoch": 3.35, - "learning_rate": 1.2930533839187265e-05, - "loss": 0.271, + "epoch": 3.435686740908927, + "grad_norm": 0.237309068441391, + "learning_rate": 1.1789610891112402e-05, + "loss": 0.3965, "step": 95330 }, { - "epoch": 3.35, - "learning_rate": 1.2928039171465697e-05, - "loss": 0.2523, + "epoch": 3.435866940570152, + "grad_norm": 0.22394055128097534, + "learning_rate": 1.1787133526903646e-05, + "loss": 0.3985, "step": 95335 }, { - "epoch": 3.35, - "learning_rate": 1.2925544660490573e-05, - "loss": 0.2573, + "epoch": 3.436047140231376, + "grad_norm": 0.23511168360710144, + "learning_rate": 1.1784656342717634e-05, + "loss": 0.4072, "step": 95340 }, { - "epoch": 3.35, - "learning_rate": 1.2923050306294305e-05, - "loss": 0.262, + "epoch": 3.436227339892601, + "grad_norm": 0.2214852273464203, + "learning_rate": 1.1782179338588124e-05, + "loss": 0.3582, "step": 95345 }, { - "epoch": 3.35, - "learning_rate": 1.2920556108909271e-05, - "loss": 0.2591, + "epoch": 3.4364075395538256, + "grad_norm": 0.2489766925573349, + "learning_rate": 1.1779702514548848e-05, + "loss": 0.361, "step": 95350 }, { - "epoch": 3.35, - "learning_rate": 1.2918062068367853e-05, - "loss": 0.2597, + "epoch": 3.4365877392150503, + "grad_norm": 0.23350292444229126, + "learning_rate": 1.1777225870633579e-05, + "loss": 0.3675, "step": 95355 }, { - "epoch": 3.36, - "learning_rate": 1.2915568184702425e-05, - "loss": 0.2418, + "epoch": 3.436767938876275, + "grad_norm": 0.23286855220794678, + "learning_rate": 1.1774749406876057e-05, + "loss": 0.4119, "step": 95360 }, { - "epoch": 3.36, - "learning_rate": 1.2913074457945396e-05, - "loss": 0.2486, + "epoch": 3.4369481385374994, + "grad_norm": 0.2144162505865097, + "learning_rate": 1.1772273123309993e-05, + "loss": 0.3873, "step": 95365 }, { - "epoch": 3.36, - "learning_rate": 1.2910580888129126e-05, - "loss": 0.2508, + "epoch": 3.437128338198724, + "grad_norm": 0.24179521203041077, + "learning_rate": 1.1769797019969156e-05, + "loss": 0.3731, "step": 95370 }, { - "epoch": 3.36, - "learning_rate": 1.2908087475285984e-05, - "loss": 0.2783, + "epoch": 3.437308537859949, + "grad_norm": 0.22237004339694977, + "learning_rate": 1.1767321096887265e-05, + "loss": 0.3584, "step": 95375 }, { - "epoch": 3.36, - "learning_rate": 1.2905594219448368e-05, - "loss": 0.2565, + "epoch": 3.4374887375211736, + "grad_norm": 0.23531267046928406, + "learning_rate": 1.1764845354098081e-05, + "loss": 0.4297, "step": 95380 }, { - "epoch": 3.36, - "learning_rate": 1.2903101120648637e-05, - "loss": 0.2734, + "epoch": 3.437668937182398, + "grad_norm": 0.2168639749288559, + "learning_rate": 1.1762369791635305e-05, + "loss": 0.4128, "step": 95385 }, { - "epoch": 3.36, - "learning_rate": 1.290060817891917e-05, - "loss": 0.2653, + "epoch": 3.4378491368436226, + "grad_norm": 0.24747677147388458, + "learning_rate": 1.175989440953267e-05, + "loss": 0.3953, "step": 95390 }, { - "epoch": 3.36, - "learning_rate": 1.2898115394292321e-05, - "loss": 0.2656, + "epoch": 3.4380293365048473, + "grad_norm": 0.2616941034793854, + "learning_rate": 1.1757419207823918e-05, + "loss": 0.3724, "step": 95395 }, { - "epoch": 3.36, - "learning_rate": 
1.289562276680048e-05, - "loss": 0.2528, + "epoch": 3.438209536166072, + "grad_norm": 0.21548521518707275, + "learning_rate": 1.1754944186542768e-05, + "loss": 0.3945, "step": 95400 }, { - "epoch": 3.36, - "learning_rate": 1.2893130296476e-05, - "loss": 0.2752, + "epoch": 3.438389735827297, + "grad_norm": 0.20728285610675812, + "learning_rate": 1.1752469345722938e-05, + "loss": 0.4152, "step": 95405 }, { - "epoch": 3.36, - "learning_rate": 1.2890637983351244e-05, - "loss": 0.2442, + "epoch": 3.4385699354885215, + "grad_norm": 0.24086813628673553, + "learning_rate": 1.1749994685398149e-05, + "loss": 0.3667, "step": 95410 }, { - "epoch": 3.36, - "learning_rate": 1.2888145827458564e-05, - "loss": 0.2665, + "epoch": 3.438750135149746, + "grad_norm": 0.21392937004566193, + "learning_rate": 1.1747520205602108e-05, + "loss": 0.3966, "step": 95415 }, { - "epoch": 3.36, - "learning_rate": 1.288565382883034e-05, - "loss": 0.2613, + "epoch": 3.4389303348109705, + "grad_norm": 0.21268320083618164, + "learning_rate": 1.1745045906368549e-05, + "loss": 0.3754, "step": 95420 }, { - "epoch": 3.36, - "learning_rate": 1.2883161987498915e-05, - "loss": 0.2647, + "epoch": 3.4391105344721953, + "grad_norm": 0.2621963918209076, + "learning_rate": 1.1742571787731172e-05, + "loss": 0.4324, "step": 95425 }, { - "epoch": 3.36, - "learning_rate": 1.288067030349664e-05, - "loss": 0.2593, + "epoch": 3.43929073413342, + "grad_norm": 0.28907299041748047, + "learning_rate": 1.1740097849723686e-05, + "loss": 0.3717, "step": 95430 }, { - "epoch": 3.36, - "learning_rate": 1.287817877685588e-05, - "loss": 0.2676, + "epoch": 3.4394709337946443, + "grad_norm": 0.24965962767601013, + "learning_rate": 1.17376240923798e-05, + "loss": 0.3629, "step": 95435 }, { - "epoch": 3.36, - "learning_rate": 1.2875687407608972e-05, - "loss": 0.2488, + "epoch": 3.439651133455869, + "grad_norm": 0.27768054604530334, + "learning_rate": 1.1735150515733209e-05, + "loss": 0.3604, "step": 95440 }, { - "epoch": 3.36, - "learning_rate": 1.2873196195788279e-05, - "loss": 0.263, + "epoch": 3.4398313331170938, + "grad_norm": 0.25251927971839905, + "learning_rate": 1.1732677119817634e-05, + "loss": 0.3643, "step": 95445 }, { - "epoch": 3.36, - "learning_rate": 1.2870705141426146e-05, - "loss": 0.2592, + "epoch": 3.4400115327783185, + "grad_norm": 0.2550899386405945, + "learning_rate": 1.1730203904666773e-05, + "loss": 0.3879, "step": 95450 }, { - "epoch": 3.36, - "learning_rate": 1.2868214244554902e-05, - "loss": 0.2432, + "epoch": 3.4401917324395432, + "grad_norm": 0.2622126638889313, + "learning_rate": 1.1727730870314298e-05, + "loss": 0.3498, "step": 95455 }, { - "epoch": 3.36, - "learning_rate": 1.2865723505206912e-05, - "loss": 0.2637, + "epoch": 3.4403719321007675, + "grad_norm": 0.2463812530040741, + "learning_rate": 1.1725258016793927e-05, + "loss": 0.413, "step": 95460 }, { - "epoch": 3.36, - "learning_rate": 1.2863232923414504e-05, - "loss": 0.2573, + "epoch": 3.4405521317619923, + "grad_norm": 0.2597421109676361, + "learning_rate": 1.1722785344139347e-05, + "loss": 0.3833, "step": 95465 }, { - "epoch": 3.36, - "learning_rate": 1.2860742499210016e-05, - "loss": 0.2492, + "epoch": 3.440732331423217, + "grad_norm": 0.20518064498901367, + "learning_rate": 1.1720312852384239e-05, + "loss": 0.3924, "step": 95470 }, { - "epoch": 3.36, - "learning_rate": 1.2858252232625778e-05, - "loss": 0.2399, + "epoch": 3.4409125310844417, + "grad_norm": 0.20788519084453583, + "learning_rate": 1.1717840541562312e-05, + "loss": 0.3612, "step": 95475 }, { - "epoch": 
3.36, - "learning_rate": 1.2855762123694145e-05, - "loss": 0.2297, + "epoch": 3.441092730745666, + "grad_norm": 0.21892179548740387, + "learning_rate": 1.171536841170722e-05, + "loss": 0.3725, "step": 95480 }, { - "epoch": 3.36, - "learning_rate": 1.2853272172447437e-05, - "loss": 0.2794, + "epoch": 3.4412729304068908, + "grad_norm": 0.23169714212417603, + "learning_rate": 1.1712896462852668e-05, + "loss": 0.3949, "step": 95485 }, { - "epoch": 3.36, - "learning_rate": 1.2850782378917975e-05, - "loss": 0.2753, + "epoch": 3.4414531300681155, + "grad_norm": 0.24189196527004242, + "learning_rate": 1.1710424695032332e-05, + "loss": 0.368, "step": 95490 }, { - "epoch": 3.36, - "learning_rate": 1.2848292743138097e-05, - "loss": 0.2583, + "epoch": 3.44163332972934, + "grad_norm": 0.21008506417274475, + "learning_rate": 1.1707953108279888e-05, + "loss": 0.4096, "step": 95495 }, { - "epoch": 3.36, - "learning_rate": 1.2845803265140138e-05, - "loss": 0.2693, + "epoch": 3.441813529390565, + "grad_norm": 0.23108026385307312, + "learning_rate": 1.1705481702629007e-05, + "loss": 0.3929, "step": 95500 }, { - "epoch": 3.36, - "eval_loss": 0.2542358934879303, - "eval_runtime": 10.5386, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 3.441813529390565, + "eval_loss": 0.4303448796272278, + "eval_runtime": 3.5356, + "eval_samples_per_second": 28.284, + "eval_steps_per_second": 7.071, "step": 95500 }, { - "epoch": 3.36, - "learning_rate": 1.2843313944956414e-05, - "loss": 0.2593, + "epoch": 3.4419937290517892, + "grad_norm": 0.1974146068096161, + "learning_rate": 1.1703010478113363e-05, + "loss": 0.3409, "step": 95505 }, { - "epoch": 3.36, - "learning_rate": 1.2840824782619237e-05, - "loss": 0.2635, + "epoch": 3.442173928713014, + "grad_norm": 0.2331085056066513, + "learning_rate": 1.1700539434766619e-05, + "loss": 0.3589, "step": 95510 }, { - "epoch": 3.36, - "learning_rate": 1.2838335778160948e-05, - "loss": 0.2435, + "epoch": 3.4423541283742387, + "grad_norm": 0.23634552955627441, + "learning_rate": 1.1698068572622458e-05, + "loss": 0.3905, "step": 95515 }, { - "epoch": 3.36, - "learning_rate": 1.2835846931613853e-05, - "loss": 0.2347, + "epoch": 3.4425343280354634, + "grad_norm": 0.3103722929954529, + "learning_rate": 1.1695597891714539e-05, + "loss": 0.3924, "step": 95520 }, { - "epoch": 3.36, - "learning_rate": 1.2833358243010268e-05, - "loss": 0.2579, + "epoch": 3.4427145276966877, + "grad_norm": 0.2085963785648346, + "learning_rate": 1.1693127392076522e-05, + "loss": 0.3817, "step": 95525 }, { - "epoch": 3.36, - "learning_rate": 1.2830869712382504e-05, - "loss": 0.2742, + "epoch": 3.4428947273579125, + "grad_norm": 0.2746676504611969, + "learning_rate": 1.169065707374207e-05, + "loss": 0.4077, "step": 95530 }, { - "epoch": 3.36, - "learning_rate": 1.282838133976288e-05, - "loss": 0.261, + "epoch": 3.443074927019137, + "grad_norm": 0.2430097460746765, + "learning_rate": 1.1688186936744827e-05, + "loss": 0.3698, "step": 95535 }, { - "epoch": 3.36, - "learning_rate": 1.2825893125183709e-05, - "loss": 0.2383, + "epoch": 3.443255126680362, + "grad_norm": 0.2820988595485687, + "learning_rate": 1.1685716981118469e-05, + "loss": 0.3811, "step": 95540 }, { - "epoch": 3.36, - "learning_rate": 1.2823405068677277e-05, - "loss": 0.2771, + "epoch": 3.4434353263415867, + "grad_norm": 0.2863827347755432, + "learning_rate": 1.168324720689665e-05, + "loss": 0.4197, "step": 95545 }, { - "epoch": 3.36, - "learning_rate": 1.2820917170275909e-05, - "loss": 0.2735, + "epoch": 
3.443615526002811, + "grad_norm": 0.2102956622838974, + "learning_rate": 1.1680777614112992e-05, + "loss": 0.3898, "step": 95550 }, { - "epoch": 3.36, - "learning_rate": 1.2818429430011914e-05, - "loss": 0.2574, + "epoch": 3.4437957256640357, + "grad_norm": 0.20017428696155548, + "learning_rate": 1.1678308202801166e-05, + "loss": 0.3685, "step": 95555 }, { - "epoch": 3.36, - "learning_rate": 1.2815941847917584e-05, - "loss": 0.2508, + "epoch": 3.4439759253252604, + "grad_norm": 0.2171812206506729, + "learning_rate": 1.1675838972994805e-05, + "loss": 0.4052, "step": 95560 }, { - "epoch": 3.36, - "learning_rate": 1.281345442402522e-05, - "loss": 0.2654, + "epoch": 3.444156124986485, + "grad_norm": 0.20312188565731049, + "learning_rate": 1.1673369924727576e-05, + "loss": 0.3949, "step": 95565 }, { - "epoch": 3.36, - "learning_rate": 1.281096715836711e-05, - "loss": 0.254, + "epoch": 3.4443363246477094, + "grad_norm": 0.2849917709827423, + "learning_rate": 1.1670901058033093e-05, + "loss": 0.3912, "step": 95570 }, { - "epoch": 3.36, - "learning_rate": 1.2808480050975569e-05, - "loss": 0.262, + "epoch": 3.444516524308934, + "grad_norm": 0.2215384542942047, + "learning_rate": 1.1668432372944992e-05, + "loss": 0.3403, "step": 95575 }, { - "epoch": 3.36, - "learning_rate": 1.280599310188288e-05, - "loss": 0.2479, + "epoch": 3.444696723970159, + "grad_norm": 0.26748472452163696, + "learning_rate": 1.1665963869496932e-05, + "loss": 0.3769, "step": 95580 }, { - "epoch": 3.36, - "learning_rate": 1.2803506311121332e-05, - "loss": 0.2974, + "epoch": 3.4448769236313836, + "grad_norm": 0.20163197815418243, + "learning_rate": 1.1663495547722528e-05, + "loss": 0.3837, "step": 95585 }, { - "epoch": 3.36, - "learning_rate": 1.2801019678723208e-05, - "loss": 0.2636, + "epoch": 3.4450571232926084, + "grad_norm": 0.23869015276432037, + "learning_rate": 1.1661027407655417e-05, + "loss": 0.3995, "step": 95590 }, { - "epoch": 3.36, - "learning_rate": 1.2798533204720814e-05, - "loss": 0.2263, + "epoch": 3.4452373229538327, + "grad_norm": 0.25916892290115356, + "learning_rate": 1.1658559449329227e-05, + "loss": 0.3736, "step": 95595 }, { - "epoch": 3.36, - "learning_rate": 1.2796046889146417e-05, - "loss": 0.2639, + "epoch": 3.4454175226150574, + "grad_norm": 0.2509896755218506, + "learning_rate": 1.1656091672777569e-05, + "loss": 0.3497, "step": 95600 }, { - "epoch": 3.36, - "learning_rate": 1.2793560732032317e-05, - "loss": 0.254, + "epoch": 3.445597722276282, + "grad_norm": 0.2731640636920929, + "learning_rate": 1.165362407803409e-05, + "loss": 0.3667, "step": 95605 }, { - "epoch": 3.36, - "learning_rate": 1.2791074733410776e-05, - "loss": 0.2232, + "epoch": 3.445777921937507, + "grad_norm": 0.23170186579227448, + "learning_rate": 1.1651156665132396e-05, + "loss": 0.3817, "step": 95610 }, { - "epoch": 3.36, - "learning_rate": 1.2788588893314096e-05, - "loss": 0.2799, + "epoch": 3.445958121598731, + "grad_norm": 0.21757963299751282, + "learning_rate": 1.164868943410611e-05, + "loss": 0.3697, "step": 95615 }, { - "epoch": 3.36, - "learning_rate": 1.2786103211774538e-05, - "loss": 0.2552, + "epoch": 3.446138321259956, + "grad_norm": 0.2630380690097809, + "learning_rate": 1.1646222384988847e-05, + "loss": 0.3495, "step": 95620 }, { - "epoch": 3.36, - "learning_rate": 1.2783617688824373e-05, - "loss": 0.2556, + "epoch": 3.4463185209211806, + "grad_norm": 0.23066528141498566, + "learning_rate": 1.164375551781421e-05, + "loss": 0.3468, "step": 95625 }, { - "epoch": 3.36, - "learning_rate": 1.2781132324495892e-05, - 
"loss": 0.2572, + "epoch": 3.4464987205824054, + "grad_norm": 0.27254176139831543, + "learning_rate": 1.1641288832615823e-05, + "loss": 0.4197, "step": 95630 }, { - "epoch": 3.36, - "learning_rate": 1.2778647118821352e-05, - "loss": 0.2541, + "epoch": 3.44667892024363, + "grad_norm": 0.2096617966890335, + "learning_rate": 1.1638822329427293e-05, + "loss": 0.3646, "step": 95635 }, { - "epoch": 3.36, - "learning_rate": 1.2776162071833028e-05, - "loss": 0.2767, + "epoch": 3.4468591199048544, + "grad_norm": 0.20088768005371094, + "learning_rate": 1.1636356008282221e-05, + "loss": 0.3663, "step": 95640 }, { - "epoch": 3.37, - "learning_rate": 1.2773677183563171e-05, - "loss": 0.2583, + "epoch": 3.447039319566079, + "grad_norm": 0.2403293251991272, + "learning_rate": 1.163388986921421e-05, + "loss": 0.3651, "step": 95645 }, { - "epoch": 3.37, - "learning_rate": 1.2771192454044073e-05, - "loss": 0.2749, + "epoch": 3.447219519227304, + "grad_norm": 0.23152384161949158, + "learning_rate": 1.163142391225686e-05, + "loss": 0.3991, "step": 95650 }, { - "epoch": 3.37, - "learning_rate": 1.2768707883307968e-05, - "loss": 0.26, + "epoch": 3.4473997188885286, + "grad_norm": 0.29287877678871155, + "learning_rate": 1.1628958137443766e-05, + "loss": 0.4082, "step": 95655 }, { - "epoch": 3.37, - "learning_rate": 1.276622347138714e-05, - "loss": 0.2622, + "epoch": 3.4475799185497533, + "grad_norm": 0.2720036208629608, + "learning_rate": 1.1626492544808545e-05, + "loss": 0.3914, "step": 95660 }, { - "epoch": 3.37, - "learning_rate": 1.276373921831383e-05, - "loss": 0.2501, + "epoch": 3.4477601182109776, + "grad_norm": 0.24344083666801453, + "learning_rate": 1.1624027134384757e-05, + "loss": 0.3794, "step": 95665 }, { - "epoch": 3.37, - "learning_rate": 1.2761255124120313e-05, - "loss": 0.2671, + "epoch": 3.4479403178722023, + "grad_norm": 0.22803980112075806, + "learning_rate": 1.1621561906206019e-05, + "loss": 0.3875, "step": 95670 }, { - "epoch": 3.37, - "learning_rate": 1.2758771188838831e-05, - "loss": 0.2439, + "epoch": 3.448120517533427, + "grad_norm": 0.23738615214824677, + "learning_rate": 1.1619096860305905e-05, + "loss": 0.4092, "step": 95675 }, { - "epoch": 3.37, - "learning_rate": 1.2756287412501638e-05, - "loss": 0.2651, + "epoch": 3.448300717194652, + "grad_norm": 0.20066243410110474, + "learning_rate": 1.161663199671801e-05, + "loss": 0.3773, "step": 95680 }, { - "epoch": 3.37, - "learning_rate": 1.2753803795140973e-05, - "loss": 0.2558, + "epoch": 3.4484809168558765, + "grad_norm": 0.3553462326526642, + "learning_rate": 1.1614167315475913e-05, + "loss": 0.3927, "step": 95685 }, { - "epoch": 3.37, - "learning_rate": 1.2751320336789108e-05, - "loss": 0.2531, + "epoch": 3.448661116517101, + "grad_norm": 0.23177509009838104, + "learning_rate": 1.1611702816613195e-05, + "loss": 0.4163, "step": 95690 }, { - "epoch": 3.37, - "learning_rate": 1.2748837037478272e-05, - "loss": 0.2478, + "epoch": 3.4488413161783256, + "grad_norm": 0.21542638540267944, + "learning_rate": 1.1609238500163424e-05, + "loss": 0.4101, "step": 95695 }, { - "epoch": 3.37, - "learning_rate": 1.2746353897240715e-05, - "loss": 0.2585, + "epoch": 3.4490215158395503, + "grad_norm": 0.2028144747018814, + "learning_rate": 1.1606774366160194e-05, + "loss": 0.3794, "step": 95700 }, { - "epoch": 3.37, - "learning_rate": 1.2743870916108668e-05, - "loss": 0.2654, + "epoch": 3.449201715500775, + "grad_norm": 0.23155230283737183, + "learning_rate": 1.160431041463707e-05, + "loss": 0.3431, "step": 95705 }, { - "epoch": 3.37, - "learning_rate": 
1.2741388094114382e-05, - "loss": 0.2564, + "epoch": 3.4493819151619993, + "grad_norm": 0.2762317359447479, + "learning_rate": 1.1601846645627625e-05, + "loss": 0.3731, "step": 95710 }, { - "epoch": 3.37, - "learning_rate": 1.27389054312901e-05, - "loss": 0.2517, + "epoch": 3.449562114823224, + "grad_norm": 0.2540960907936096, + "learning_rate": 1.1599383059165425e-05, + "loss": 0.3865, "step": 95715 }, { - "epoch": 3.37, - "learning_rate": 1.2736422927668052e-05, - "loss": 0.2956, + "epoch": 3.449742314484449, + "grad_norm": 0.19944801926612854, + "learning_rate": 1.1596919655284027e-05, + "loss": 0.3779, "step": 95720 }, { - "epoch": 3.37, - "learning_rate": 1.2733940583280457e-05, - "loss": 0.2528, + "epoch": 3.4499225141456735, + "grad_norm": 0.23415467143058777, + "learning_rate": 1.1594456434017013e-05, + "loss": 0.3739, "step": 95725 }, { - "epoch": 3.37, - "learning_rate": 1.2731458398159573e-05, - "loss": 0.2563, + "epoch": 3.4501027138068983, + "grad_norm": 0.21086935698986053, + "learning_rate": 1.1591993395397944e-05, + "loss": 0.3912, "step": 95730 }, { - "epoch": 3.37, - "learning_rate": 1.2728976372337617e-05, - "loss": 0.2618, + "epoch": 3.4502829134681225, + "grad_norm": 0.23152147233486176, + "learning_rate": 1.158953053946035e-05, + "loss": 0.4073, "step": 95735 }, { - "epoch": 3.37, - "learning_rate": 1.2726494505846803e-05, - "loss": 0.2516, + "epoch": 3.4504631131293473, + "grad_norm": 0.29169178009033203, + "learning_rate": 1.1587067866237816e-05, + "loss": 0.41, "step": 95740 }, { - "epoch": 3.37, - "learning_rate": 1.272401279871938e-05, - "loss": 0.2465, + "epoch": 3.450643312790572, + "grad_norm": 0.24295637011528015, + "learning_rate": 1.1584605375763876e-05, + "loss": 0.3491, "step": 95745 }, { - "epoch": 3.37, - "learning_rate": 1.2721531250987565e-05, - "loss": 0.2483, + "epoch": 3.4508235124517967, + "grad_norm": 0.19027990102767944, + "learning_rate": 1.158214306807211e-05, + "loss": 0.385, "step": 95750 }, { - "epoch": 3.37, - "learning_rate": 1.2719049862683572e-05, - "loss": 0.2505, + "epoch": 3.451003712113021, + "grad_norm": 0.21917863190174103, + "learning_rate": 1.1579680943196036e-05, + "loss": 0.3643, "step": 95755 }, { - "epoch": 3.37, - "learning_rate": 1.2716568633839612e-05, - "loss": 0.2483, + "epoch": 3.4511839117742458, + "grad_norm": 0.22133587300777435, + "learning_rate": 1.1577219001169204e-05, + "loss": 0.3636, "step": 95760 }, { - "epoch": 3.37, - "learning_rate": 1.2714087564487925e-05, - "loss": 0.2572, + "epoch": 3.4513641114354705, + "grad_norm": 0.22902527451515198, + "learning_rate": 1.1574757242025169e-05, + "loss": 0.3693, "step": 95765 }, { - "epoch": 3.37, - "learning_rate": 1.2711606654660701e-05, - "loss": 0.2597, + "epoch": 3.4515443110966952, + "grad_norm": 0.21824954450130463, + "learning_rate": 1.1572295665797472e-05, + "loss": 0.3672, "step": 95770 }, { - "epoch": 3.37, - "learning_rate": 1.270912590439018e-05, - "loss": 0.2515, + "epoch": 3.45172451075792, + "grad_norm": 0.29460370540618896, + "learning_rate": 1.1569834272519644e-05, + "loss": 0.413, "step": 95775 }, { - "epoch": 3.37, - "learning_rate": 1.2706645313708548e-05, - "loss": 0.2443, + "epoch": 3.4519047104191443, + "grad_norm": 0.259569376707077, + "learning_rate": 1.1567373062225226e-05, + "loss": 0.3763, "step": 95780 }, { - "epoch": 3.37, - "learning_rate": 1.2704164882648034e-05, - "loss": 0.2494, + "epoch": 3.452084910080369, + "grad_norm": 0.28914976119995117, + "learning_rate": 1.1564912034947739e-05, + "loss": 0.3791, "step": 95785 }, { - "epoch": 
3.37, - "learning_rate": 1.2701684611240835e-05, - "loss": 0.2548, + "epoch": 3.4522651097415937, + "grad_norm": 0.2676786482334137, + "learning_rate": 1.1562451190720733e-05, + "loss": 0.3611, "step": 95790 }, { - "epoch": 3.37, - "learning_rate": 1.2699204499519157e-05, - "loss": 0.2632, + "epoch": 3.4524453094028185, + "grad_norm": 0.2162621021270752, + "learning_rate": 1.155999052957773e-05, + "loss": 0.3395, "step": 95795 }, { - "epoch": 3.37, - "learning_rate": 1.2696724547515193e-05, - "loss": 0.2684, + "epoch": 3.4526255090640428, + "grad_norm": 0.26020073890686035, + "learning_rate": 1.155753005155225e-05, + "loss": 0.3507, "step": 95800 }, { - "epoch": 3.37, - "learning_rate": 1.2694244755261164e-05, - "loss": 0.2437, + "epoch": 3.4528057087252675, + "grad_norm": 0.2733500003814697, + "learning_rate": 1.1555069756677824e-05, + "loss": 0.377, "step": 95805 }, { - "epoch": 3.37, - "learning_rate": 1.2691765122789256e-05, - "loss": 0.2587, + "epoch": 3.452985908386492, + "grad_norm": 0.25482839345932007, + "learning_rate": 1.155260964498796e-05, + "loss": 0.3767, "step": 95810 }, { - "epoch": 3.37, - "learning_rate": 1.2689285650131654e-05, - "loss": 0.2623, + "epoch": 3.453166108047717, + "grad_norm": 0.2589060068130493, + "learning_rate": 1.1550149716516196e-05, + "loss": 0.3995, "step": 95815 }, { - "epoch": 3.37, - "learning_rate": 1.2686806337320578e-05, - "loss": 0.2859, + "epoch": 3.4533463077089417, + "grad_norm": 0.26133349537849426, + "learning_rate": 1.1547689971296038e-05, + "loss": 0.3989, "step": 95820 }, { - "epoch": 3.37, - "learning_rate": 1.2684327184388195e-05, - "loss": 0.2551, + "epoch": 3.453526507370166, + "grad_norm": 0.2330733835697174, + "learning_rate": 1.1545230409360996e-05, + "loss": 0.3813, "step": 95825 }, { - "epoch": 3.37, - "learning_rate": 1.2681848191366719e-05, - "loss": 0.2557, + "epoch": 3.4537067070313907, + "grad_norm": 0.2710002660751343, + "learning_rate": 1.154277103074459e-05, + "loss": 0.3821, "step": 95830 }, { - "epoch": 3.37, - "learning_rate": 1.2679369358288323e-05, - "loss": 0.2455, + "epoch": 3.4538869066926154, + "grad_norm": 0.24312447011470795, + "learning_rate": 1.154031183548032e-05, + "loss": 0.3863, "step": 95835 }, { - "epoch": 3.37, - "learning_rate": 1.2676890685185188e-05, - "loss": 0.2614, + "epoch": 3.45406710635384, + "grad_norm": 0.21377715468406677, + "learning_rate": 1.1537852823601685e-05, + "loss": 0.3886, "step": 95840 }, { - "epoch": 3.37, - "learning_rate": 1.2674412172089517e-05, - "loss": 0.2673, + "epoch": 3.4542473060150645, + "grad_norm": 0.2441663295030594, + "learning_rate": 1.1535393995142222e-05, + "loss": 0.3983, "step": 95845 }, { - "epoch": 3.37, - "learning_rate": 1.2671933819033482e-05, - "loss": 0.255, + "epoch": 3.454427505676289, + "grad_norm": 0.21451100707054138, + "learning_rate": 1.1532935350135387e-05, + "loss": 0.3596, "step": 95850 }, { - "epoch": 3.37, - "learning_rate": 1.2669455626049251e-05, - "loss": 0.2618, + "epoch": 3.454607705337514, + "grad_norm": 0.22826121747493744, + "learning_rate": 1.1530476888614711e-05, + "loss": 0.3795, "step": 95855 }, { - "epoch": 3.37, - "learning_rate": 1.266697759316902e-05, - "loss": 0.247, + "epoch": 3.4547879049987387, + "grad_norm": 0.2341785728931427, + "learning_rate": 1.1528018610613678e-05, + "loss": 0.3178, "step": 95860 }, { - "epoch": 3.37, - "learning_rate": 1.266449972042496e-05, - "loss": 0.2691, + "epoch": 3.4549681046599634, + "grad_norm": 0.25270435214042664, + "learning_rate": 1.1525560516165784e-05, + "loss": 0.3635, "step": 
95865 }, { - "epoch": 3.37, - "learning_rate": 1.2662022007849234e-05, - "loss": 0.2587, + "epoch": 3.4551483043211877, + "grad_norm": 0.2552870512008667, + "learning_rate": 1.152310260530452e-05, + "loss": 0.3629, "step": 95870 }, { - "epoch": 3.37, - "learning_rate": 1.2659544455474026e-05, - "loss": 0.2492, + "epoch": 3.4553285039824124, + "grad_norm": 0.24541820585727692, + "learning_rate": 1.1520644878063374e-05, + "loss": 0.3886, "step": 95875 }, { - "epoch": 3.37, - "learning_rate": 1.2657067063331492e-05, - "loss": 0.2823, + "epoch": 3.455508703643637, + "grad_norm": 0.22713278234004974, + "learning_rate": 1.1518187334475822e-05, + "loss": 0.3915, "step": 95880 }, { - "epoch": 3.37, - "learning_rate": 1.265458983145382e-05, - "loss": 0.2437, + "epoch": 3.455688903304862, + "grad_norm": 0.2381686270236969, + "learning_rate": 1.1515729974575369e-05, + "loss": 0.4036, "step": 95885 }, { - "epoch": 3.37, - "learning_rate": 1.265211275987316e-05, - "loss": 0.2672, + "epoch": 3.455869102966086, + "grad_norm": 0.2205180674791336, + "learning_rate": 1.151327279839548e-05, + "loss": 0.4164, "step": 95890 }, { - "epoch": 3.37, - "learning_rate": 1.2649635848621672e-05, - "loss": 0.2517, + "epoch": 3.456049302627311, + "grad_norm": 0.2498241364955902, + "learning_rate": 1.1510815805969641e-05, + "loss": 0.3929, "step": 95895 }, { - "epoch": 3.37, - "learning_rate": 1.2647159097731532e-05, - "loss": 0.2606, + "epoch": 3.4562295022885356, + "grad_norm": 0.20146778225898743, + "learning_rate": 1.1508358997331322e-05, + "loss": 0.378, "step": 95900 }, { - "epoch": 3.37, - "learning_rate": 1.264468250723489e-05, - "loss": 0.2482, + "epoch": 3.4564097019497604, + "grad_norm": 0.22101639211177826, + "learning_rate": 1.1505902372513994e-05, + "loss": 0.3808, "step": 95905 }, { - "epoch": 3.37, - "learning_rate": 1.26422060771639e-05, - "loss": 0.2635, + "epoch": 3.456589901610985, + "grad_norm": 0.19309233129024506, + "learning_rate": 1.150344593155114e-05, + "loss": 0.4071, "step": 95910 }, { - "epoch": 3.37, - "learning_rate": 1.2639729807550715e-05, - "loss": 0.2477, + "epoch": 3.45677010127221, + "grad_norm": 0.2546674907207489, + "learning_rate": 1.1500989674476234e-05, + "loss": 0.3693, "step": 95915 }, { - "epoch": 3.37, - "learning_rate": 1.2637253698427498e-05, - "loss": 0.2564, + "epoch": 3.456950300933434, + "grad_norm": 0.2687970697879791, + "learning_rate": 1.1498533601322711e-05, + "loss": 0.3548, "step": 95920 }, { - "epoch": 3.37, - "learning_rate": 1.26347777498264e-05, - "loss": 0.2578, + "epoch": 3.457130500594659, + "grad_norm": 0.22413934767246246, + "learning_rate": 1.1496077712124062e-05, + "loss": 0.3728, "step": 95925 }, { - "epoch": 3.38, - "learning_rate": 1.2632301961779547e-05, - "loss": 0.2635, + "epoch": 3.4573107002558836, + "grad_norm": 0.20262411236763, + "learning_rate": 1.1493622006913728e-05, + "loss": 0.3563, "step": 95930 }, { - "epoch": 3.38, - "learning_rate": 1.2629826334319108e-05, - "loss": 0.2843, + "epoch": 3.4574908999171083, + "grad_norm": 0.18897296488285065, + "learning_rate": 1.1491166485725194e-05, + "loss": 0.3703, "step": 95935 }, { - "epoch": 3.38, - "learning_rate": 1.2627350867477231e-05, - "loss": 0.2645, + "epoch": 3.4576710995783326, + "grad_norm": 0.2627841830253601, + "learning_rate": 1.1488711148591907e-05, + "loss": 0.4015, "step": 95940 }, { - "epoch": 3.38, - "learning_rate": 1.2624875561286049e-05, - "loss": 0.262, + "epoch": 3.4578512992395574, + "grad_norm": 0.19592086970806122, + "learning_rate": 1.1486255995547299e-05, + "loss": 
0.3791, "step": 95945 }, { - "epoch": 3.38, - "learning_rate": 1.2622400415777699e-05, - "loss": 0.2535, + "epoch": 3.458031498900782, + "grad_norm": 0.27027082443237305, + "learning_rate": 1.1483801026624844e-05, + "loss": 0.3186, "step": 95950 }, { - "epoch": 3.38, - "learning_rate": 1.2619925430984313e-05, - "loss": 0.2517, + "epoch": 3.458211698562007, + "grad_norm": 0.2537202537059784, + "learning_rate": 1.1481346241857982e-05, + "loss": 0.3907, "step": 95955 }, { - "epoch": 3.38, - "learning_rate": 1.2617450606938052e-05, - "loss": 0.2612, + "epoch": 3.4583918982232316, + "grad_norm": 0.2511982023715973, + "learning_rate": 1.1478891641280162e-05, + "loss": 0.3686, "step": 95960 }, { - "epoch": 3.38, - "learning_rate": 1.2614975943671031e-05, - "loss": 0.2554, + "epoch": 3.458572097884456, + "grad_norm": 0.2458512932062149, + "learning_rate": 1.1476437224924824e-05, + "loss": 0.3945, "step": 95965 }, { - "epoch": 3.38, - "learning_rate": 1.2612501441215374e-05, - "loss": 0.2715, + "epoch": 3.4587522975456806, + "grad_norm": 0.2354070544242859, + "learning_rate": 1.1473982992825403e-05, + "loss": 0.3661, "step": 95970 }, { - "epoch": 3.38, - "learning_rate": 1.2610027099603233e-05, - "loss": 0.2533, + "epoch": 3.4589324972069053, + "grad_norm": 0.22366875410079956, + "learning_rate": 1.1471528945015352e-05, + "loss": 0.4073, "step": 95975 }, { - "epoch": 3.38, - "learning_rate": 1.2607552918866725e-05, - "loss": 0.2504, + "epoch": 3.45911269686813, + "grad_norm": 0.3030519485473633, + "learning_rate": 1.1469075081528102e-05, + "loss": 0.387, "step": 95980 }, { - "epoch": 3.38, - "learning_rate": 1.2605078899037964e-05, - "loss": 0.2624, + "epoch": 3.4592928965293543, + "grad_norm": 0.1697065532207489, + "learning_rate": 1.1466621402397085e-05, + "loss": 0.383, "step": 95985 }, { - "epoch": 3.38, - "learning_rate": 1.260260504014909e-05, - "loss": 0.2599, + "epoch": 3.459473096190579, + "grad_norm": 0.2466737926006317, + "learning_rate": 1.1464167907655731e-05, + "loss": 0.3841, "step": 95990 }, { - "epoch": 3.38, - "learning_rate": 1.2600131342232224e-05, - "loss": 0.2644, + "epoch": 3.459653295851804, + "grad_norm": 0.2090192586183548, + "learning_rate": 1.1461714597337459e-05, + "loss": 0.3883, "step": 95995 }, { - "epoch": 3.38, - "learning_rate": 1.2597657805319485e-05, - "loss": 0.2574, + "epoch": 3.4598334955130285, + "grad_norm": 0.20262013375759125, + "learning_rate": 1.1459261471475713e-05, + "loss": 0.3893, "step": 96000 }, { - "epoch": 3.38, - "eval_loss": 0.2541818916797638, - "eval_runtime": 10.5477, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 3.4598334955130285, + "eval_loss": 0.4300161600112915, + "eval_runtime": 3.5329, + "eval_samples_per_second": 28.305, + "eval_steps_per_second": 7.076, "step": 96000 }, { - "epoch": 3.38, - "learning_rate": 1.259518442944298e-05, - "loss": 0.2428, + "epoch": 3.4600136951742533, + "grad_norm": 0.23101657629013062, + "learning_rate": 1.1456808530103907e-05, + "loss": 0.3993, "step": 96005 }, { - "epoch": 3.38, - "learning_rate": 1.2592711214634826e-05, - "loss": 0.251, + "epoch": 3.4601938948354776, + "grad_norm": 0.2159048616886139, + "learning_rate": 1.1454355773255465e-05, + "loss": 0.3827, "step": 96010 }, { - "epoch": 3.38, - "learning_rate": 1.2590238160927148e-05, - "loss": 0.2491, + "epoch": 3.4603740944967023, + "grad_norm": 0.26098620891571045, + "learning_rate": 1.14519032009638e-05, + "loss": 0.3952, "step": 96015 }, { - "epoch": 3.38, - "learning_rate": 1.2587765268352048e-05, - "loss": 
0.2583, + "epoch": 3.460554294157927, + "grad_norm": 0.22338443994522095, + "learning_rate": 1.1449450813262336e-05, + "loss": 0.366, "step": 96020 }, { - "epoch": 3.38, - "learning_rate": 1.2585292536941637e-05, - "loss": 0.2616, + "epoch": 3.4607344938191518, + "grad_norm": 0.218563973903656, + "learning_rate": 1.1446998610184467e-05, + "loss": 0.3993, "step": 96025 }, { - "epoch": 3.38, - "learning_rate": 1.258281996672801e-05, - "loss": 0.2655, + "epoch": 3.460914693480376, + "grad_norm": 0.2379787415266037, + "learning_rate": 1.1444546591763639e-05, + "loss": 0.3897, "step": 96030 }, { - "epoch": 3.38, - "learning_rate": 1.2580347557743294e-05, - "loss": 0.2491, + "epoch": 3.461094893141601, + "grad_norm": 0.2325468510389328, + "learning_rate": 1.144209475803322e-05, + "loss": 0.3757, "step": 96035 }, { - "epoch": 3.38, - "learning_rate": 1.2577875310019571e-05, - "loss": 0.2575, + "epoch": 3.4612750928028255, + "grad_norm": 0.237319678068161, + "learning_rate": 1.1439643109026643e-05, + "loss": 0.3858, "step": 96040 }, { - "epoch": 3.38, - "learning_rate": 1.2575403223588956e-05, - "loss": 0.2572, + "epoch": 3.4614552924640503, + "grad_norm": 0.19926400482654572, + "learning_rate": 1.1437191644777307e-05, + "loss": 0.3948, "step": 96045 }, { - "epoch": 3.38, - "learning_rate": 1.2572931298483537e-05, - "loss": 0.2645, + "epoch": 3.461635492125275, + "grad_norm": 0.20922797918319702, + "learning_rate": 1.1434740365318597e-05, + "loss": 0.352, "step": 96050 }, { - "epoch": 3.38, - "learning_rate": 1.2570459534735424e-05, - "loss": 0.2598, + "epoch": 3.4618156917864993, + "grad_norm": 0.2545210123062134, + "learning_rate": 1.1432289270683941e-05, + "loss": 0.4189, "step": 96055 }, { - "epoch": 3.38, - "learning_rate": 1.25679879323767e-05, - "loss": 0.2502, + "epoch": 3.461995891447724, + "grad_norm": 0.20487937331199646, + "learning_rate": 1.1429838360906708e-05, + "loss": 0.398, "step": 96060 }, { - "epoch": 3.38, - "learning_rate": 1.256551649143946e-05, - "loss": 0.2473, + "epoch": 3.4621760911089488, + "grad_norm": 0.27374470233917236, + "learning_rate": 1.1427387636020292e-05, + "loss": 0.3865, "step": 96065 }, { - "epoch": 3.38, - "learning_rate": 1.2563045211955787e-05, - "loss": 0.2557, + "epoch": 3.4623562907701735, + "grad_norm": 0.24212025105953217, + "learning_rate": 1.14249370960581e-05, + "loss": 0.3911, "step": 96070 }, { - "epoch": 3.38, - "learning_rate": 1.2560574093957784e-05, - "loss": 0.2662, + "epoch": 3.4625364904313978, + "grad_norm": 0.24381519854068756, + "learning_rate": 1.1422486741053513e-05, + "loss": 0.3624, "step": 96075 }, { - "epoch": 3.38, - "learning_rate": 1.2558103137477527e-05, - "loss": 0.2786, + "epoch": 3.4627166900926225, + "grad_norm": 0.23481976985931396, + "learning_rate": 1.1420036571039915e-05, + "loss": 0.3895, "step": 96080 }, { - "epoch": 3.38, - "learning_rate": 1.2555632342547096e-05, - "loss": 0.2703, + "epoch": 3.4628968897538472, + "grad_norm": 0.27169761061668396, + "learning_rate": 1.141758658605069e-05, + "loss": 0.39, "step": 96085 }, { - "epoch": 3.38, - "learning_rate": 1.2553161709198583e-05, - "loss": 0.2625, + "epoch": 3.463077089415072, + "grad_norm": 0.24660931527614594, + "learning_rate": 1.1415136786119207e-05, + "loss": 0.3867, "step": 96090 }, { - "epoch": 3.38, - "learning_rate": 1.2550691237464054e-05, - "loss": 0.2605, + "epoch": 3.4632572890762967, + "grad_norm": 0.23132185637950897, + "learning_rate": 1.1412687171278865e-05, + "loss": 0.4086, "step": 96095 }, { - "epoch": 3.38, - "learning_rate": 
1.2548220927375604e-05, - "loss": 0.228, + "epoch": 3.463437488737521, + "grad_norm": 0.266564279794693, + "learning_rate": 1.1410237741563031e-05, + "loss": 0.3929, "step": 96100 }, { - "epoch": 3.38, - "learning_rate": 1.2545750778965292e-05, - "loss": 0.2524, + "epoch": 3.4636176883987457, + "grad_norm": 0.19676515460014343, + "learning_rate": 1.1407788497005076e-05, + "loss": 0.3475, "step": 96105 }, { - "epoch": 3.38, - "learning_rate": 1.2543280792265206e-05, - "loss": 0.2701, + "epoch": 3.4637978880599705, + "grad_norm": 0.2622312605381012, + "learning_rate": 1.140533943763837e-05, + "loss": 0.3491, "step": 96110 }, { - "epoch": 3.38, - "learning_rate": 1.2540810967307409e-05, - "loss": 0.2554, + "epoch": 3.463978087721195, + "grad_norm": 0.27703362703323364, + "learning_rate": 1.1402890563496276e-05, + "loss": 0.3801, "step": 96115 }, { - "epoch": 3.38, - "learning_rate": 1.2538341304123973e-05, - "loss": 0.2725, + "epoch": 3.4641582873824195, + "grad_norm": 0.24706853926181793, + "learning_rate": 1.1400441874612173e-05, + "loss": 0.3824, "step": 96120 }, { - "epoch": 3.38, - "learning_rate": 1.2535871802746951e-05, - "loss": 0.2588, + "epoch": 3.464338487043644, + "grad_norm": 0.2970876693725586, + "learning_rate": 1.1397993371019427e-05, + "loss": 0.3695, "step": 96125 }, { - "epoch": 3.38, - "learning_rate": 1.2533402463208432e-05, - "loss": 0.2784, + "epoch": 3.464518686704869, + "grad_norm": 0.24940580129623413, + "learning_rate": 1.1395545052751366e-05, + "loss": 0.3726, "step": 96130 }, { - "epoch": 3.38, - "learning_rate": 1.2530933285540464e-05, - "loss": 0.2701, + "epoch": 3.4646988863660937, + "grad_norm": 0.21381749212741852, + "learning_rate": 1.1393096919841384e-05, + "loss": 0.3976, "step": 96135 }, { - "epoch": 3.38, - "learning_rate": 1.2528464269775108e-05, - "loss": 0.2697, + "epoch": 3.4648790860273184, + "grad_norm": 0.2594471275806427, + "learning_rate": 1.1390648972322817e-05, + "loss": 0.3572, "step": 96140 }, { - "epoch": 3.38, - "learning_rate": 1.2525995415944419e-05, - "loss": 0.2534, + "epoch": 3.4650592856885427, + "grad_norm": 0.22999241948127747, + "learning_rate": 1.1388201210229027e-05, + "loss": 0.3397, "step": 96145 }, { - "epoch": 3.38, - "learning_rate": 1.2523526724080456e-05, - "loss": 0.272, + "epoch": 3.4652394853497674, + "grad_norm": 0.28377705812454224, + "learning_rate": 1.1385753633593357e-05, + "loss": 0.3886, "step": 96150 }, { - "epoch": 3.38, - "learning_rate": 1.2521058194215284e-05, - "loss": 0.2502, + "epoch": 3.465419685010992, + "grad_norm": 0.21517440676689148, + "learning_rate": 1.1383306242449152e-05, + "loss": 0.4021, "step": 96155 }, { - "epoch": 3.38, - "learning_rate": 1.251858982638095e-05, - "loss": 0.2615, + "epoch": 3.465599884672217, + "grad_norm": 0.26591652631759644, + "learning_rate": 1.138085903682977e-05, + "loss": 0.3855, "step": 96160 }, { - "epoch": 3.38, - "learning_rate": 1.2516121620609495e-05, - "loss": 0.2699, + "epoch": 3.4657800843334416, + "grad_norm": 0.2830233871936798, + "learning_rate": 1.1378412016768547e-05, + "loss": 0.3945, "step": 96165 }, { - "epoch": 3.38, - "learning_rate": 1.2513653576932977e-05, - "loss": 0.2711, + "epoch": 3.465960283994666, + "grad_norm": 0.2300427407026291, + "learning_rate": 1.1375965182298822e-05, + "loss": 0.3664, "step": 96170 }, { - "epoch": 3.38, - "learning_rate": 1.2511185695383443e-05, - "loss": 0.2523, + "epoch": 3.4661404836558907, + "grad_norm": 0.23842735588550568, + "learning_rate": 1.1373518533453937e-05, + "loss": 0.3729, "step": 96175 }, { - 
"epoch": 3.38, - "learning_rate": 1.2508717975992932e-05, - "loss": 0.2591, + "epoch": 3.4663206833171154, + "grad_norm": 0.2205519825220108, + "learning_rate": 1.1371072070267221e-05, + "loss": 0.3791, "step": 96180 }, { - "epoch": 3.38, - "learning_rate": 1.2506250418793475e-05, - "loss": 0.2689, + "epoch": 3.46650088297834, + "grad_norm": 0.22283774614334106, + "learning_rate": 1.1368625792772e-05, + "loss": 0.3938, "step": 96185 }, { - "epoch": 3.38, - "learning_rate": 1.250378302381713e-05, - "loss": 0.2366, + "epoch": 3.466681082639565, + "grad_norm": 0.23954735696315765, + "learning_rate": 1.1366179701001625e-05, + "loss": 0.4172, "step": 96190 }, { - "epoch": 3.38, - "learning_rate": 1.250131579109593e-05, - "loss": 0.2754, + "epoch": 3.466861282300789, + "grad_norm": 0.20571085810661316, + "learning_rate": 1.136373379498941e-05, + "loss": 0.3765, "step": 96195 }, { - "epoch": 3.38, - "learning_rate": 1.2498848720661899e-05, - "loss": 0.2485, + "epoch": 3.467041481962014, + "grad_norm": 0.28632012009620667, + "learning_rate": 1.1361288074768684e-05, + "loss": 0.3678, "step": 96200 }, { - "epoch": 3.38, - "learning_rate": 1.2496381812547086e-05, - "loss": 0.2825, + "epoch": 3.4672216816232386, + "grad_norm": 0.2133149951696396, + "learning_rate": 1.1358842540372767e-05, + "loss": 0.4013, "step": 96205 }, { - "epoch": 3.38, - "learning_rate": 1.2493915066783506e-05, - "loss": 0.2758, + "epoch": 3.4674018812844634, + "grad_norm": 0.17300593852996826, + "learning_rate": 1.1356397191834967e-05, + "loss": 0.3512, "step": 96210 }, { - "epoch": 3.39, - "learning_rate": 1.2491448483403206e-05, - "loss": 0.2512, + "epoch": 3.4675820809456877, + "grad_norm": 0.23370462656021118, + "learning_rate": 1.1353952029188625e-05, + "loss": 0.4099, "step": 96215 }, { - "epoch": 3.39, - "learning_rate": 1.2488982062438196e-05, - "loss": 0.2478, + "epoch": 3.4677622806069124, + "grad_norm": 0.2420395463705063, + "learning_rate": 1.1351507052467044e-05, + "loss": 0.3867, "step": 96220 }, { - "epoch": 3.39, - "learning_rate": 1.2486515803920518e-05, - "loss": 0.2593, + "epoch": 3.467942480268137, + "grad_norm": 0.2655438780784607, + "learning_rate": 1.1349062261703538e-05, + "loss": 0.3969, "step": 96225 }, { - "epoch": 3.39, - "learning_rate": 1.2484049707882187e-05, - "loss": 0.2603, + "epoch": 3.468122679929362, + "grad_norm": 0.24291475117206573, + "learning_rate": 1.1346617656931416e-05, + "loss": 0.4008, "step": 96230 }, { - "epoch": 3.39, - "learning_rate": 1.2481583774355218e-05, - "loss": 0.2691, + "epoch": 3.4683028795905866, + "grad_norm": 0.23957978188991547, + "learning_rate": 1.1344173238183974e-05, + "loss": 0.3572, "step": 96235 }, { - "epoch": 3.39, - "learning_rate": 1.2479118003371628e-05, - "loss": 0.2614, + "epoch": 3.468483079251811, + "grad_norm": 0.23568493127822876, + "learning_rate": 1.134172900549455e-05, + "loss": 0.3883, "step": 96240 }, { - "epoch": 3.39, - "learning_rate": 1.2476652394963448e-05, - "loss": 0.2582, + "epoch": 3.4686632789130356, + "grad_norm": 0.2132604718208313, + "learning_rate": 1.1339284958896412e-05, + "loss": 0.3945, "step": 96245 }, { - "epoch": 3.39, - "learning_rate": 1.2474186949162681e-05, - "loss": 0.2583, + "epoch": 3.4688434785742603, + "grad_norm": 0.25484153628349304, + "learning_rate": 1.1336841098422862e-05, + "loss": 0.3929, "step": 96250 }, { - "epoch": 3.39, - "learning_rate": 1.2471721666001334e-05, - "loss": 0.2465, + "epoch": 3.469023678235485, + "grad_norm": 0.3113678991794586, + "learning_rate": 1.1334397424107218e-05, + "loss": 
0.4022, "step": 96255 }, { - "epoch": 3.39, - "learning_rate": 1.2469256545511435e-05, - "loss": 0.2863, + "epoch": 3.4692038778967094, + "grad_norm": 0.1955181211233139, + "learning_rate": 1.1331953935982761e-05, + "loss": 0.3452, "step": 96260 }, { - "epoch": 3.39, - "learning_rate": 1.2466791587724972e-05, - "loss": 0.2601, + "epoch": 3.469384077557934, + "grad_norm": 0.2540067732334137, + "learning_rate": 1.1329510634082787e-05, + "loss": 0.4174, "step": 96265 }, { - "epoch": 3.39, - "learning_rate": 1.2464326792673969e-05, - "loss": 0.2444, + "epoch": 3.469564277219159, + "grad_norm": 0.2300526648759842, + "learning_rate": 1.1327067518440585e-05, + "loss": 0.4029, "step": 96270 }, { - "epoch": 3.39, - "learning_rate": 1.2461862160390423e-05, - "loss": 0.2784, + "epoch": 3.4697444768803836, + "grad_norm": 0.2536962330341339, + "learning_rate": 1.132462458908943e-05, + "loss": 0.3517, "step": 96275 }, { - "epoch": 3.39, - "learning_rate": 1.2459397690906321e-05, - "loss": 0.247, + "epoch": 3.4699246765416083, + "grad_norm": 0.24605917930603027, + "learning_rate": 1.1322181846062626e-05, + "loss": 0.3896, "step": 96280 }, { - "epoch": 3.39, - "learning_rate": 1.2456933384253689e-05, - "loss": 0.2483, + "epoch": 3.4701048762028326, + "grad_norm": 0.21459710597991943, + "learning_rate": 1.1319739289393448e-05, + "loss": 0.4341, "step": 96285 }, { - "epoch": 3.39, - "learning_rate": 1.2454469240464506e-05, - "loss": 0.255, + "epoch": 3.4702850758640573, + "grad_norm": 0.2235480546951294, + "learning_rate": 1.1317296919115171e-05, + "loss": 0.3664, "step": 96290 }, { - "epoch": 3.39, - "learning_rate": 1.2452005259570776e-05, - "loss": 0.2449, + "epoch": 3.470465275525282, + "grad_norm": 0.23965181410312653, + "learning_rate": 1.1314854735261076e-05, + "loss": 0.3913, "step": 96295 }, { - "epoch": 3.39, - "learning_rate": 1.2449541441604478e-05, - "loss": 0.2525, + "epoch": 3.470645475186507, + "grad_norm": 0.29223349690437317, + "learning_rate": 1.1312412737864429e-05, + "loss": 0.4089, "step": 96300 }, { - "epoch": 3.39, - "learning_rate": 1.2447077786597621e-05, - "loss": 0.264, + "epoch": 3.470825674847731, + "grad_norm": 0.2220560908317566, + "learning_rate": 1.1309970926958513e-05, + "loss": 0.3817, "step": 96305 }, { - "epoch": 3.39, - "learning_rate": 1.244461429458218e-05, - "loss": 0.2626, + "epoch": 3.471005874508956, + "grad_norm": 0.26034578680992126, + "learning_rate": 1.1307529302576605e-05, + "loss": 0.3736, "step": 96310 }, { - "epoch": 3.39, - "learning_rate": 1.2442150965590154e-05, - "loss": 0.2405, + "epoch": 3.4711860741701805, + "grad_norm": 0.23067830502986908, + "learning_rate": 1.130508786475194e-05, + "loss": 0.3885, "step": 96315 }, { - "epoch": 3.39, - "learning_rate": 1.2439687799653516e-05, - "loss": 0.2791, + "epoch": 3.4713662738314053, + "grad_norm": 0.2320612072944641, + "learning_rate": 1.1302646613517812e-05, + "loss": 0.3833, "step": 96320 }, { - "epoch": 3.39, - "learning_rate": 1.2437224796804262e-05, - "loss": 0.2476, + "epoch": 3.47154647349263, + "grad_norm": 0.22921355068683624, + "learning_rate": 1.130020554890747e-05, + "loss": 0.3706, "step": 96325 }, { - "epoch": 3.39, - "learning_rate": 1.2434761957074365e-05, - "loss": 0.2756, + "epoch": 3.4717266731538543, + "grad_norm": 0.24659670889377594, + "learning_rate": 1.1297764670954166e-05, + "loss": 0.3904, "step": 96330 }, { - "epoch": 3.39, - "learning_rate": 1.2432299280495794e-05, - "loss": 0.2364, + "epoch": 3.471906872815079, + "grad_norm": 0.2158777266740799, + "learning_rate": 
1.1295323979691183e-05, + "loss": 0.389, "step": 96335 }, { - "epoch": 3.39, - "learning_rate": 1.2429836767100541e-05, - "loss": 0.2747, + "epoch": 3.4720870724763038, + "grad_norm": 0.21258202195167542, + "learning_rate": 1.1292883475151741e-05, + "loss": 0.3774, "step": 96340 }, { - "epoch": 3.39, - "learning_rate": 1.2427374416920575e-05, - "loss": 0.2762, + "epoch": 3.4722672721375285, + "grad_norm": 0.20848548412322998, + "learning_rate": 1.1290443157369115e-05, + "loss": 0.373, "step": 96345 }, { - "epoch": 3.39, - "learning_rate": 1.242491222998787e-05, - "loss": 0.2775, + "epoch": 3.472447471798753, + "grad_norm": 0.268587201833725, + "learning_rate": 1.1288003026376551e-05, + "loss": 0.3654, "step": 96350 }, { - "epoch": 3.39, - "learning_rate": 1.242245020633438e-05, - "loss": 0.241, + "epoch": 3.4726276714599775, + "grad_norm": 0.25624382495880127, + "learning_rate": 1.1285563082207286e-05, + "loss": 0.3601, "step": 96355 }, { - "epoch": 3.39, - "learning_rate": 1.2419988345992095e-05, - "loss": 0.2515, + "epoch": 3.4728078711212023, + "grad_norm": 0.1996915340423584, + "learning_rate": 1.1283123324894573e-05, + "loss": 0.3577, "step": 96360 }, { - "epoch": 3.39, - "learning_rate": 1.2417526648992972e-05, - "loss": 0.2556, + "epoch": 3.472988070782427, + "grad_norm": 0.32410600781440735, + "learning_rate": 1.1280683754471647e-05, + "loss": 0.3696, "step": 96365 }, { - "epoch": 3.39, - "learning_rate": 1.2415065115368962e-05, - "loss": 0.2698, + "epoch": 3.4731682704436517, + "grad_norm": 0.20257310569286346, + "learning_rate": 1.1278244370971739e-05, + "loss": 0.3503, "step": 96370 }, { - "epoch": 3.39, - "learning_rate": 1.2412603745152038e-05, - "loss": 0.2618, + "epoch": 3.473348470104876, + "grad_norm": 0.2658899426460266, + "learning_rate": 1.1275805174428102e-05, + "loss": 0.4159, "step": 96375 }, { - "epoch": 3.39, - "learning_rate": 1.2410142538374167e-05, - "loss": 0.2633, + "epoch": 3.4735286697661008, + "grad_norm": 0.22585046291351318, + "learning_rate": 1.1273366164873967e-05, + "loss": 0.3779, "step": 96380 }, { - "epoch": 3.39, - "learning_rate": 1.2407681495067303e-05, - "loss": 0.2549, + "epoch": 3.4737088694273255, + "grad_norm": 0.24556881189346313, + "learning_rate": 1.1270927342342558e-05, + "loss": 0.4055, "step": 96385 }, { - "epoch": 3.39, - "learning_rate": 1.240522061526339e-05, - "loss": 0.2463, + "epoch": 3.47388906908855, + "grad_norm": 0.20033571124076843, + "learning_rate": 1.1268488706867105e-05, + "loss": 0.3803, "step": 96390 }, { - "epoch": 3.39, - "learning_rate": 1.2402759898994382e-05, - "loss": 0.2453, + "epoch": 3.4740692687497745, + "grad_norm": 0.2153259962797165, + "learning_rate": 1.1266050258480829e-05, + "loss": 0.4001, "step": 96395 }, { - "epoch": 3.39, - "learning_rate": 1.2400299346292244e-05, - "loss": 0.2395, + "epoch": 3.4742494684109992, + "grad_norm": 0.24744583666324615, + "learning_rate": 1.1263611997216966e-05, + "loss": 0.3978, "step": 96400 }, { - "epoch": 3.39, - "learning_rate": 1.2397838957188912e-05, - "loss": 0.2588, + "epoch": 3.474429668072224, + "grad_norm": 0.2664938271045685, + "learning_rate": 1.1261173923108731e-05, + "loss": 0.3655, "step": 96405 }, { - "epoch": 3.39, - "learning_rate": 1.239537873171634e-05, - "loss": 0.2691, + "epoch": 3.4746098677334487, + "grad_norm": 0.23613092303276062, + "learning_rate": 1.125873603618934e-05, + "loss": 0.407, "step": 96410 }, { - "epoch": 3.39, - "learning_rate": 1.2392918669906456e-05, - "loss": 0.2514, + "epoch": 3.4747900673946734, + "grad_norm": 
0.2801758348941803, + "learning_rate": 1.1256298336492013e-05, + "loss": 0.3698, "step": 96415 }, { - "epoch": 3.39, - "learning_rate": 1.2390458771791227e-05, - "loss": 0.2651, + "epoch": 3.474970267055898, + "grad_norm": 0.24601031839847565, + "learning_rate": 1.1253860824049948e-05, + "loss": 0.4104, "step": 96420 }, { - "epoch": 3.39, - "learning_rate": 1.238799903740257e-05, - "loss": 0.2565, + "epoch": 3.4751504667171225, + "grad_norm": 0.2248278707265854, + "learning_rate": 1.1251423498896389e-05, + "loss": 0.3917, "step": 96425 }, { - "epoch": 3.39, - "learning_rate": 1.2385539466772434e-05, - "loss": 0.2823, + "epoch": 3.475330666378347, + "grad_norm": 0.2194819152355194, + "learning_rate": 1.1248986361064511e-05, + "loss": 0.3757, "step": 96430 }, { - "epoch": 3.39, - "learning_rate": 1.2383080059932764e-05, - "loss": 0.2646, + "epoch": 3.475510866039572, + "grad_norm": 0.22775894403457642, + "learning_rate": 1.1246549410587522e-05, + "loss": 0.3658, "step": 96435 }, { - "epoch": 3.39, - "learning_rate": 1.2380620816915484e-05, - "loss": 0.2515, + "epoch": 3.4756910657007967, + "grad_norm": 0.23458243906497955, + "learning_rate": 1.1244112647498647e-05, + "loss": 0.3502, "step": 96440 }, { - "epoch": 3.39, - "learning_rate": 1.2378161737752525e-05, - "loss": 0.2457, + "epoch": 3.475871265362021, + "grad_norm": 0.24316877126693726, + "learning_rate": 1.1241676071831073e-05, + "loss": 0.4056, "step": 96445 }, { - "epoch": 3.39, - "learning_rate": 1.237570282247581e-05, - "loss": 0.2659, + "epoch": 3.4760514650232457, + "grad_norm": 0.25940200686454773, + "learning_rate": 1.1239239683617997e-05, + "loss": 0.3978, "step": 96450 }, { - "epoch": 3.39, - "learning_rate": 1.2373244071117285e-05, - "loss": 0.2197, + "epoch": 3.4762316646844704, + "grad_norm": 0.24130047857761383, + "learning_rate": 1.123680348289262e-05, + "loss": 0.3834, "step": 96455 }, { - "epoch": 3.39, - "learning_rate": 1.2370785483708863e-05, - "loss": 0.2739, + "epoch": 3.476411864345695, + "grad_norm": 0.1958744078874588, + "learning_rate": 1.1234367469688118e-05, + "loss": 0.3965, "step": 96460 }, { - "epoch": 3.39, - "learning_rate": 1.2368327060282466e-05, - "loss": 0.2702, + "epoch": 3.47659206400692, + "grad_norm": 0.29350563883781433, + "learning_rate": 1.1231931644037701e-05, + "loss": 0.3916, "step": 96465 }, { - "epoch": 3.39, - "learning_rate": 1.2365868800870011e-05, - "loss": 0.2533, + "epoch": 3.476772263668144, + "grad_norm": 0.21596458554267883, + "learning_rate": 1.1229496005974554e-05, + "loss": 0.372, "step": 96470 }, { - "epoch": 3.39, - "learning_rate": 1.2363410705503433e-05, - "loss": 0.2643, + "epoch": 3.476952463329369, + "grad_norm": 0.24967950582504272, + "learning_rate": 1.1227060555531858e-05, + "loss": 0.3712, "step": 96475 }, { - "epoch": 3.39, - "learning_rate": 1.2360952774214629e-05, - "loss": 0.2736, + "epoch": 3.4771326629905936, + "grad_norm": 0.20245712995529175, + "learning_rate": 1.1224625292742794e-05, + "loss": 0.344, "step": 96480 }, { - "epoch": 3.39, - "learning_rate": 1.2358495007035534e-05, - "loss": 0.2449, + "epoch": 3.4773128626518184, + "grad_norm": 0.26654964685440063, + "learning_rate": 1.1222190217640535e-05, + "loss": 0.4232, "step": 96485 }, { - "epoch": 3.39, - "learning_rate": 1.2356037403998042e-05, - "loss": 0.264, + "epoch": 3.4774930623130427, + "grad_norm": 0.2164486050605774, + "learning_rate": 1.1219755330258274e-05, + "loss": 0.3632, "step": 96490 }, { - "epoch": 3.39, - "learning_rate": 1.235357996513408e-05, - "loss": 0.246, + "epoch": 
3.4776732619742674, + "grad_norm": 0.2758568525314331, + "learning_rate": 1.1217320630629184e-05, + "loss": 0.3687, "step": 96495 }, { - "epoch": 3.4, - "learning_rate": 1.2351122690475544e-05, - "loss": 0.2577, + "epoch": 3.477853461635492, + "grad_norm": 0.24314354360103607, + "learning_rate": 1.1214886118786425e-05, + "loss": 0.3426, "step": 96500 }, { - "epoch": 3.4, - "eval_loss": 0.25399091839790344, - "eval_runtime": 10.5362, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 3.477853461635492, + "eval_loss": 0.42988404631614685, + "eval_runtime": 3.5336, + "eval_samples_per_second": 28.3, + "eval_steps_per_second": 7.075, "step": 96500 }, { - "epoch": 3.4, - "learning_rate": 1.2348665580054347e-05, - "loss": 0.2472, + "epoch": 3.478033661296717, + "grad_norm": 0.19991134107112885, + "learning_rate": 1.1212451794763178e-05, + "loss": 0.3617, "step": 96505 }, { - "epoch": 3.4, - "learning_rate": 1.234620863390238e-05, - "loss": 0.2501, + "epoch": 3.4782138609579416, + "grad_norm": 0.23575247824192047, + "learning_rate": 1.1210017658592605e-05, + "loss": 0.4334, "step": 96510 }, { - "epoch": 3.4, - "learning_rate": 1.2343751852051566e-05, - "loss": 0.2679, + "epoch": 3.478394060619166, + "grad_norm": 0.23354025185108185, + "learning_rate": 1.1207583710307861e-05, + "loss": 0.3685, "step": 96515 }, { - "epoch": 3.4, - "learning_rate": 1.2341295234533792e-05, - "loss": 0.2335, + "epoch": 3.4785742602803906, + "grad_norm": 0.27086594700813293, + "learning_rate": 1.1205149949942139e-05, + "loss": 0.418, "step": 96520 }, { - "epoch": 3.4, - "learning_rate": 1.2338838781380957e-05, - "loss": 0.2598, + "epoch": 3.4787544599416154, + "grad_norm": 0.2325955182313919, + "learning_rate": 1.120271637752856e-05, + "loss": 0.3802, "step": 96525 }, { - "epoch": 3.4, - "learning_rate": 1.2336382492624948e-05, - "loss": 0.2633, + "epoch": 3.47893465960284, + "grad_norm": 0.25425323843955994, + "learning_rate": 1.1200282993100305e-05, + "loss": 0.3639, "step": 96530 }, { - "epoch": 3.4, - "learning_rate": 1.2333926368297666e-05, - "loss": 0.2346, + "epoch": 3.4791148592640644, + "grad_norm": 0.21064163744449615, + "learning_rate": 1.1197849796690527e-05, + "loss": 0.3845, "step": 96535 }, { - "epoch": 3.4, - "learning_rate": 1.2331470408431012e-05, - "loss": 0.2565, + "epoch": 3.479295058925289, + "grad_norm": 0.20380394160747528, + "learning_rate": 1.1195416788332371e-05, + "loss": 0.3789, "step": 96540 }, { - "epoch": 3.4, - "learning_rate": 1.2329014613056855e-05, - "loss": 0.277, + "epoch": 3.479475258586514, + "grad_norm": 0.24418340623378754, + "learning_rate": 1.119298396805899e-05, + "loss": 0.3595, "step": 96545 }, { - "epoch": 3.4, - "learning_rate": 1.2326558982207103e-05, - "loss": 0.2606, + "epoch": 3.4796554582477386, + "grad_norm": 0.19111719727516174, + "learning_rate": 1.1190551335903526e-05, + "loss": 0.3371, "step": 96550 }, { - "epoch": 3.4, - "learning_rate": 1.2324103515913632e-05, - "loss": 0.2622, + "epoch": 3.4798356579089633, + "grad_norm": 0.1976500004529953, + "learning_rate": 1.1188118891899122e-05, + "loss": 0.3705, "step": 96555 }, { - "epoch": 3.4, - "learning_rate": 1.2321648214208322e-05, - "loss": 0.2664, + "epoch": 3.4800158575701876, + "grad_norm": 0.21880120038986206, + "learning_rate": 1.118568663607893e-05, + "loss": 0.3656, "step": 96560 }, { - "epoch": 3.4, - "learning_rate": 1.2319193077123048e-05, - "loss": 0.2556, + "epoch": 3.4801960572314123, + "grad_norm": 0.23799078166484833, + "learning_rate": 1.1183254568476084e-05, + 
"loss": 0.4213, "step": 96565 }, { - "epoch": 3.4, - "learning_rate": 1.2316738104689702e-05, - "loss": 0.2709, + "epoch": 3.480376256892637, + "grad_norm": 0.2790437638759613, + "learning_rate": 1.1180822689123719e-05, + "loss": 0.3503, "step": 96570 }, { - "epoch": 3.4, - "learning_rate": 1.2314283296940151e-05, - "loss": 0.2433, + "epoch": 3.480556456553862, + "grad_norm": 0.2113523781299591, + "learning_rate": 1.1178390998054968e-05, + "loss": 0.3869, "step": 96575 }, { - "epoch": 3.4, - "learning_rate": 1.2311828653906274e-05, - "loss": 0.2474, + "epoch": 3.480736656215086, + "grad_norm": 0.20490826666355133, + "learning_rate": 1.1175959495302957e-05, + "loss": 0.3975, "step": 96580 }, { - "epoch": 3.4, - "learning_rate": 1.2309374175619932e-05, - "loss": 0.2536, + "epoch": 3.480916855876311, + "grad_norm": 0.19894984364509583, + "learning_rate": 1.117352818090083e-05, + "loss": 0.364, "step": 96585 }, { - "epoch": 3.4, - "learning_rate": 1.2306919862112998e-05, - "loss": 0.2532, + "epoch": 3.4810970555375356, + "grad_norm": 0.2029849737882614, + "learning_rate": 1.1171097054881705e-05, + "loss": 0.3436, "step": 96590 }, { - "epoch": 3.4, - "learning_rate": 1.2304465713417357e-05, - "loss": 0.2661, + "epoch": 3.4812772551987603, + "grad_norm": 0.21581901609897614, + "learning_rate": 1.1168666117278704e-05, + "loss": 0.3586, "step": 96595 }, { - "epoch": 3.4, - "learning_rate": 1.2302011729564861e-05, - "loss": 0.2787, + "epoch": 3.481457454859985, + "grad_norm": 0.27598705887794495, + "learning_rate": 1.116623536812495e-05, + "loss": 0.3926, "step": 96600 }, { - "epoch": 3.4, - "learning_rate": 1.2299557910587366e-05, - "loss": 0.2458, + "epoch": 3.4816376545212093, + "grad_norm": 0.24678505957126617, + "learning_rate": 1.1163804807453551e-05, + "loss": 0.3838, "step": 96605 }, { - "epoch": 3.4, - "learning_rate": 1.229710425651675e-05, - "loss": 0.2451, + "epoch": 3.481817854182434, + "grad_norm": 0.1914645880460739, + "learning_rate": 1.1161374435297653e-05, + "loss": 0.3738, "step": 96610 }, { - "epoch": 3.4, - "learning_rate": 1.2294650767384866e-05, - "loss": 0.2549, + "epoch": 3.481998053843659, + "grad_norm": 0.22241747379302979, + "learning_rate": 1.1158944251690337e-05, + "loss": 0.3985, "step": 96615 }, { - "epoch": 3.4, - "learning_rate": 1.2292197443223567e-05, - "loss": 0.2621, + "epoch": 3.4821782535048835, + "grad_norm": 0.2281738668680191, + "learning_rate": 1.1156514256664719e-05, + "loss": 0.3981, "step": 96620 }, { - "epoch": 3.4, - "learning_rate": 1.22897442840647e-05, - "loss": 0.2517, + "epoch": 3.482358453166108, + "grad_norm": 0.20726355910301208, + "learning_rate": 1.1154084450253924e-05, + "loss": 0.3826, "step": 96625 }, { - "epoch": 3.4, - "learning_rate": 1.2287291289940137e-05, - "loss": 0.2579, + "epoch": 3.4825386528273325, + "grad_norm": 0.2736766040325165, + "learning_rate": 1.1151654832491034e-05, + "loss": 0.3828, "step": 96630 }, { - "epoch": 3.4, - "learning_rate": 1.2284838460881718e-05, - "loss": 0.2619, + "epoch": 3.4827188524885573, + "grad_norm": 0.15977123379707336, + "learning_rate": 1.1149225403409189e-05, + "loss": 0.3602, "step": 96635 }, { - "epoch": 3.4, - "learning_rate": 1.2282385796921284e-05, - "loss": 0.2806, + "epoch": 3.482899052149782, + "grad_norm": 0.25133785605430603, + "learning_rate": 1.1146796163041456e-05, + "loss": 0.3763, "step": 96640 }, { - "epoch": 3.4, - "learning_rate": 1.2279933298090695e-05, - "loss": 0.2447, + "epoch": 3.4830792518110067, + "grad_norm": 0.2637987434864044, + "learning_rate": 
1.1144367111420934e-05, + "loss": 0.3561, "step": 96645 }, { - "epoch": 3.4, - "learning_rate": 1.2277480964421784e-05, - "loss": 0.2726, + "epoch": 3.483259451472231, + "grad_norm": 0.262717604637146, + "learning_rate": 1.1141938248580736e-05, + "loss": 0.3806, "step": 96650 }, { - "epoch": 3.4, - "learning_rate": 1.2275028795946405e-05, - "loss": 0.264, + "epoch": 3.4834396511334558, + "grad_norm": 0.21309638023376465, + "learning_rate": 1.1139509574553944e-05, + "loss": 0.3634, "step": 96655 }, { - "epoch": 3.4, - "learning_rate": 1.2272576792696383e-05, - "loss": 0.2581, + "epoch": 3.4836198507946805, + "grad_norm": 0.22101840376853943, + "learning_rate": 1.1137081089373655e-05, + "loss": 0.3906, "step": 96660 }, { - "epoch": 3.4, - "learning_rate": 1.2270124954703575e-05, - "loss": 0.247, + "epoch": 3.4838000504559052, + "grad_norm": 0.25005871057510376, + "learning_rate": 1.113465279307295e-05, + "loss": 0.3673, "step": 96665 }, { - "epoch": 3.4, - "learning_rate": 1.22676732819998e-05, - "loss": 0.2573, + "epoch": 3.48398025011713, + "grad_norm": 0.2766312062740326, + "learning_rate": 1.1132224685684906e-05, + "loss": 0.3975, "step": 96670 }, { - "epoch": 3.4, - "learning_rate": 1.22652217746169e-05, - "loss": 0.2721, + "epoch": 3.4841604497783543, + "grad_norm": 0.23252266645431519, + "learning_rate": 1.1129796767242625e-05, + "loss": 0.3683, "step": 96675 }, { - "epoch": 3.4, - "learning_rate": 1.2262770432586692e-05, - "loss": 0.2413, + "epoch": 3.484340649439579, + "grad_norm": 0.3283499777317047, + "learning_rate": 1.1127369037779182e-05, + "loss": 0.3911, "step": 96680 }, { - "epoch": 3.4, - "learning_rate": 1.2260319255941028e-05, - "loss": 0.2581, + "epoch": 3.4845208491008037, + "grad_norm": 0.31998345255851746, + "learning_rate": 1.1124941497327646e-05, + "loss": 0.3783, "step": 96685 }, { - "epoch": 3.4, - "learning_rate": 1.2257868244711717e-05, - "loss": 0.2703, + "epoch": 3.4847010487620285, + "grad_norm": 0.21715426445007324, + "learning_rate": 1.1122514145921097e-05, + "loss": 0.3533, "step": 96690 }, { - "epoch": 3.4, - "learning_rate": 1.2255417398930586e-05, - "loss": 0.2916, + "epoch": 3.484881248423253, + "grad_norm": 0.23155249655246735, + "learning_rate": 1.1120086983592606e-05, + "loss": 0.3644, "step": 96695 }, { - "epoch": 3.4, - "learning_rate": 1.225296671862947e-05, - "loss": 0.2592, + "epoch": 3.4850614480844775, + "grad_norm": 0.2212996780872345, + "learning_rate": 1.1117660010375233e-05, + "loss": 0.3898, "step": 96700 }, { - "epoch": 3.4, - "learning_rate": 1.2250516203840167e-05, - "loss": 0.2426, + "epoch": 3.485241647745702, + "grad_norm": 0.18644365668296814, + "learning_rate": 1.1115233226302074e-05, + "loss": 0.3834, "step": 96705 }, { - "epoch": 3.4, - "learning_rate": 1.224806585459452e-05, - "loss": 0.2486, + "epoch": 3.485421847406927, + "grad_norm": 0.21037942171096802, + "learning_rate": 1.1112806631406153e-05, + "loss": 0.3391, "step": 96710 }, { - "epoch": 3.4, - "learning_rate": 1.2245615670924334e-05, - "loss": 0.2327, + "epoch": 3.4856020470681517, + "grad_norm": 0.25046810507774353, + "learning_rate": 1.1110380225720565e-05, + "loss": 0.3941, "step": 96715 }, { - "epoch": 3.4, - "learning_rate": 1.2243165652861416e-05, - "loss": 0.2592, + "epoch": 3.485782246729376, + "grad_norm": 0.23813064396381378, + "learning_rate": 1.1107954009278357e-05, + "loss": 0.386, "step": 96720 }, { - "epoch": 3.4, - "learning_rate": 1.2240715800437591e-05, - "loss": 0.2651, + "epoch": 3.4859624463906007, + "grad_norm": 0.2435433268547058, + 
"learning_rate": 1.1105527982112584e-05, + "loss": 0.3704, "step": 96725 }, { - "epoch": 3.4, - "learning_rate": 1.2238266113684666e-05, - "loss": 0.2399, + "epoch": 3.4861426460518254, + "grad_norm": 0.219720259308815, + "learning_rate": 1.1103102144256305e-05, + "loss": 0.3791, "step": 96730 }, { - "epoch": 3.4, - "learning_rate": 1.2235816592634441e-05, - "loss": 0.2465, + "epoch": 3.48632284571305, + "grad_norm": 0.24152056872844696, + "learning_rate": 1.1100676495742568e-05, + "loss": 0.3654, "step": 96735 }, { - "epoch": 3.4, - "learning_rate": 1.2233367237318718e-05, - "loss": 0.251, + "epoch": 3.486503045374275, + "grad_norm": 0.22711649537086487, + "learning_rate": 1.1098251036604413e-05, + "loss": 0.397, "step": 96740 }, { - "epoch": 3.4, - "learning_rate": 1.2230918047769319e-05, - "loss": 0.2387, + "epoch": 3.486683245035499, + "grad_norm": 0.22654442489147186, + "learning_rate": 1.1095825766874904e-05, + "loss": 0.3741, "step": 96745 }, { - "epoch": 3.4, - "learning_rate": 1.2228469024018024e-05, - "loss": 0.24, + "epoch": 3.486863444696724, + "grad_norm": 0.20491252839565277, + "learning_rate": 1.109340068658708e-05, + "loss": 0.3717, "step": 96750 }, { - "epoch": 3.4, - "learning_rate": 1.222602016609665e-05, - "loss": 0.2775, + "epoch": 3.4870436443579487, + "grad_norm": 0.200746089220047, + "learning_rate": 1.1090975795773979e-05, + "loss": 0.3603, "step": 96755 }, { - "epoch": 3.4, - "learning_rate": 1.2223571474036976e-05, - "loss": 0.2399, + "epoch": 3.4872238440191734, + "grad_norm": 0.2560223639011383, + "learning_rate": 1.1088551094468636e-05, + "loss": 0.3774, "step": 96760 }, { - "epoch": 3.4, - "learning_rate": 1.222112294787082e-05, - "loss": 0.2629, + "epoch": 3.4874040436803977, + "grad_norm": 0.22208933532238007, + "learning_rate": 1.1086126582704085e-05, + "loss": 0.3932, "step": 96765 }, { - "epoch": 3.4, - "learning_rate": 1.2218674587629959e-05, - "loss": 0.2504, + "epoch": 3.4875842433416224, + "grad_norm": 0.25349971652030945, + "learning_rate": 1.1083702260513373e-05, + "loss": 0.3906, "step": 96770 }, { - "epoch": 3.4, - "learning_rate": 1.2216226393346173e-05, - "loss": 0.2398, + "epoch": 3.487764443002847, + "grad_norm": 0.20808152854442596, + "learning_rate": 1.1081278127929534e-05, + "loss": 0.3931, "step": 96775 }, { - "epoch": 3.4, - "learning_rate": 1.2213778365051273e-05, - "loss": 0.2523, + "epoch": 3.487944642664072, + "grad_norm": 0.18402156233787537, + "learning_rate": 1.1078854184985567e-05, + "loss": 0.3678, "step": 96780 }, { - "epoch": 3.41, - "learning_rate": 1.2211330502777036e-05, - "loss": 0.2766, + "epoch": 3.4881248423252966, + "grad_norm": 0.22728796303272247, + "learning_rate": 1.1076430431714526e-05, + "loss": 0.3698, "step": 96785 }, { - "epoch": 3.41, - "learning_rate": 1.2208882806555243e-05, - "loss": 0.2539, + "epoch": 3.488305041986521, + "grad_norm": 0.2671549618244171, + "learning_rate": 1.1074006868149413e-05, + "loss": 0.3913, "step": 96790 }, { - "epoch": 3.41, - "learning_rate": 1.2206435276417669e-05, - "loss": 0.2499, + "epoch": 3.4884852416477456, + "grad_norm": 0.2308701127767563, + "learning_rate": 1.1071583494323274e-05, + "loss": 0.3614, "step": 96795 }, { - "epoch": 3.41, - "learning_rate": 1.2203987912396108e-05, - "loss": 0.2583, + "epoch": 3.4886654413089704, + "grad_norm": 0.22043147683143616, + "learning_rate": 1.106916031026912e-05, + "loss": 0.378, "step": 96800 }, { - "epoch": 3.41, - "learning_rate": 1.2201540714522336e-05, - "loss": 0.2588, + "epoch": 3.488845640970195, + "grad_norm": 
0.28679531812667847, + "learning_rate": 1.106673731601994e-05, + "loss": 0.3863, "step": 96805 }, { - "epoch": 3.41, - "learning_rate": 1.2199093682828111e-05, - "loss": 0.2623, + "epoch": 3.4890258406314194, + "grad_norm": 0.25224077701568604, + "learning_rate": 1.1064314511608778e-05, + "loss": 0.381, "step": 96810 }, { - "epoch": 3.41, - "learning_rate": 1.2196646817345217e-05, - "loss": 0.2326, + "epoch": 3.489206040292644, + "grad_norm": 0.252714604139328, + "learning_rate": 1.1061891897068624e-05, + "loss": 0.37, "step": 96815 }, { - "epoch": 3.41, - "learning_rate": 1.219420011810544e-05, - "loss": 0.2569, + "epoch": 3.489386239953869, + "grad_norm": 0.2400377094745636, + "learning_rate": 1.105946947243251e-05, + "loss": 0.3898, "step": 96820 }, { - "epoch": 3.41, - "learning_rate": 1.219175358514053e-05, - "loss": 0.2758, + "epoch": 3.4895664396150936, + "grad_norm": 0.26662567257881165, + "learning_rate": 1.1057047237733417e-05, + "loss": 0.3624, "step": 96825 }, { - "epoch": 3.41, - "learning_rate": 1.2189307218482263e-05, - "loss": 0.2601, + "epoch": 3.4897466392763183, + "grad_norm": 0.219301238656044, + "learning_rate": 1.1054625193004347e-05, + "loss": 0.3836, "step": 96830 }, { - "epoch": 3.41, - "learning_rate": 1.2186861018162387e-05, - "loss": 0.2643, + "epoch": 3.4899268389375426, + "grad_norm": 0.2580451965332031, + "learning_rate": 1.1052203338278319e-05, + "loss": 0.3923, "step": 96835 }, { - "epoch": 3.41, - "learning_rate": 1.2184414984212689e-05, - "loss": 0.2625, + "epoch": 3.4901070385987674, + "grad_norm": 0.23948605358600616, + "learning_rate": 1.104978167358832e-05, + "loss": 0.3659, "step": 96840 }, { - "epoch": 3.41, - "learning_rate": 1.2181969116664912e-05, - "loss": 0.2553, + "epoch": 3.490287238259992, + "grad_norm": 0.20990577340126038, + "learning_rate": 1.1047360198967344e-05, + "loss": 0.389, "step": 96845 }, { - "epoch": 3.41, - "learning_rate": 1.2179523415550822e-05, - "loss": 0.2407, + "epoch": 3.490467437921217, + "grad_norm": 0.2310859113931656, + "learning_rate": 1.1044938914448385e-05, + "loss": 0.3828, "step": 96850 }, { - "epoch": 3.41, - "learning_rate": 1.2177077880902162e-05, - "loss": 0.2398, + "epoch": 3.490647637582441, + "grad_norm": 0.19339106976985931, + "learning_rate": 1.104251782006442e-05, + "loss": 0.3703, "step": 96855 }, { - "epoch": 3.41, - "learning_rate": 1.2174632512750706e-05, - "loss": 0.2668, + "epoch": 3.490827837243666, + "grad_norm": 0.23496823012828827, + "learning_rate": 1.104009691584846e-05, + "loss": 0.3635, "step": 96860 }, { - "epoch": 3.41, - "learning_rate": 1.2172187311128181e-05, - "loss": 0.2671, + "epoch": 3.4910080369048906, + "grad_norm": 0.28526195883750916, + "learning_rate": 1.1037676201833474e-05, + "loss": 0.4448, "step": 96865 }, { - "epoch": 3.41, - "learning_rate": 1.2169742276066362e-05, - "loss": 0.2285, + "epoch": 3.4911882365661153, + "grad_norm": 0.23180098831653595, + "learning_rate": 1.103525567805245e-05, + "loss": 0.3416, "step": 96870 }, { - "epoch": 3.41, - "learning_rate": 1.2167297407596972e-05, - "loss": 0.2851, + "epoch": 3.49136843622734, + "grad_norm": 0.20511779189109802, + "learning_rate": 1.1032835344538362e-05, + "loss": 0.3618, "step": 96875 }, { - "epoch": 3.41, - "learning_rate": 1.2164852705751782e-05, - "loss": 0.2614, + "epoch": 3.4915486358885643, + "grad_norm": 0.22699345648288727, + "learning_rate": 1.1030415201324188e-05, + "loss": 0.3726, "step": 96880 }, { - "epoch": 3.41, - "learning_rate": 1.2162408170562518e-05, - "loss": 0.257, + "epoch": 
3.491728835549789, + "grad_norm": 0.24510350823402405, + "learning_rate": 1.1027995248442891e-05, + "loss": 0.3828, "step": 96885 }, { - "epoch": 3.41, - "learning_rate": 1.2159963802060925e-05, - "loss": 0.2346, + "epoch": 3.491909035211014, + "grad_norm": 0.2351260483264923, + "learning_rate": 1.1025575485927476e-05, + "loss": 0.3986, "step": 96890 }, { - "epoch": 3.41, - "learning_rate": 1.215751960027873e-05, - "loss": 0.249, + "epoch": 3.4920892348722385, + "grad_norm": 0.24812495708465576, + "learning_rate": 1.1023155913810868e-05, + "loss": 0.3722, "step": 96895 }, { - "epoch": 3.41, - "learning_rate": 1.2155075565247686e-05, - "loss": 0.2442, + "epoch": 3.492269434533463, + "grad_norm": 0.26317039132118225, + "learning_rate": 1.1020736532126063e-05, + "loss": 0.3707, "step": 96900 }, { - "epoch": 3.41, - "learning_rate": 1.2152631696999525e-05, - "loss": 0.2477, + "epoch": 3.4924496341946876, + "grad_norm": 0.2346991002559662, + "learning_rate": 1.101831734090602e-05, + "loss": 0.3692, "step": 96905 }, { - "epoch": 3.41, - "learning_rate": 1.2150187995565965e-05, - "loss": 0.262, + "epoch": 3.4926298338559123, + "grad_norm": 0.2354336977005005, + "learning_rate": 1.1015898340183684e-05, + "loss": 0.3982, "step": 96910 }, { - "epoch": 3.41, - "learning_rate": 1.2147744460978753e-05, - "loss": 0.2629, + "epoch": 3.492810033517137, + "grad_norm": 0.2742219567298889, + "learning_rate": 1.1013479529992047e-05, + "loss": 0.3985, "step": 96915 }, { - "epoch": 3.41, - "learning_rate": 1.2145301093269598e-05, - "loss": 0.2606, + "epoch": 3.4929902331783618, + "grad_norm": 0.24369125068187714, + "learning_rate": 1.1011060910364033e-05, + "loss": 0.4218, "step": 96920 }, { - "epoch": 3.41, - "learning_rate": 1.214285789247025e-05, - "loss": 0.2496, + "epoch": 3.4931704328395865, + "grad_norm": 0.2888088822364807, + "learning_rate": 1.1008642481332596e-05, + "loss": 0.3919, "step": 96925 }, { - "epoch": 3.41, - "learning_rate": 1.214041485861241e-05, - "loss": 0.2718, + "epoch": 3.493350632500811, + "grad_norm": 0.2732216715812683, + "learning_rate": 1.1006224242930705e-05, + "loss": 0.3953, "step": 96930 }, { - "epoch": 3.41, - "learning_rate": 1.2137971991727815e-05, - "loss": 0.245, + "epoch": 3.4935308321620355, + "grad_norm": 0.2236909568309784, + "learning_rate": 1.1003806195191298e-05, + "loss": 0.3726, "step": 96935 }, { - "epoch": 3.41, - "learning_rate": 1.2135529291848177e-05, - "loss": 0.2643, + "epoch": 3.4937110318232603, + "grad_norm": 0.21694554388523102, + "learning_rate": 1.1001388338147326e-05, + "loss": 0.3782, "step": 96940 }, { - "epoch": 3.41, - "learning_rate": 1.2133086759005217e-05, - "loss": 0.2774, + "epoch": 3.493891231484485, + "grad_norm": 0.2243148237466812, + "learning_rate": 1.0998970671831726e-05, + "loss": 0.3572, "step": 96945 }, { - "epoch": 3.41, - "learning_rate": 1.213064439323063e-05, - "loss": 0.2857, + "epoch": 3.4940714311457093, + "grad_norm": 0.22870691120624542, + "learning_rate": 1.099655319627743e-05, + "loss": 0.3504, "step": 96950 }, { - "epoch": 3.41, - "learning_rate": 1.212820219455616e-05, - "loss": 0.2753, + "epoch": 3.494251630806934, + "grad_norm": 0.2252253144979477, + "learning_rate": 1.0994135911517396e-05, + "loss": 0.3817, "step": 96955 }, { - "epoch": 3.41, - "learning_rate": 1.2125760163013497e-05, - "loss": 0.241, + "epoch": 3.4944318304681588, + "grad_norm": 0.21160556375980377, + "learning_rate": 1.0991718817584549e-05, + "loss": 0.3751, "step": 96960 }, { - "epoch": 3.41, - "learning_rate": 1.2123318298634356e-05, - "loss": 
0.2469, + "epoch": 3.4946120301293835, + "grad_norm": 0.229017436504364, + "learning_rate": 1.0989301914511818e-05, + "loss": 0.3454, "step": 96965 }, { - "epoch": 3.41, - "learning_rate": 1.2120876601450432e-05, - "loss": 0.2252, + "epoch": 3.494792229790608, + "grad_norm": 0.20936749875545502, + "learning_rate": 1.0986885202332136e-05, + "loss": 0.3819, "step": 96970 }, { - "epoch": 3.41, - "learning_rate": 1.211843507149344e-05, - "loss": 0.2413, + "epoch": 3.4949724294518325, + "grad_norm": 0.21880267560482025, + "learning_rate": 1.0984468681078424e-05, + "loss": 0.3691, "step": 96975 }, { - "epoch": 3.41, - "learning_rate": 1.2115993708795085e-05, - "loss": 0.2777, + "epoch": 3.4951526291130572, + "grad_norm": 0.22391989827156067, + "learning_rate": 1.098205235078362e-05, + "loss": 0.4176, "step": 96980 }, { - "epoch": 3.41, - "learning_rate": 1.2113552513387061e-05, - "loss": 0.2447, + "epoch": 3.495332828774282, + "grad_norm": 0.28297480940818787, + "learning_rate": 1.0979636211480648e-05, + "loss": 0.3618, "step": 96985 }, { - "epoch": 3.41, - "learning_rate": 1.2111111485301054e-05, - "loss": 0.259, + "epoch": 3.4955130284355067, + "grad_norm": 0.22317856550216675, + "learning_rate": 1.09772202632024e-05, + "loss": 0.3755, "step": 96990 }, { - "epoch": 3.41, - "learning_rate": 1.2108670624568786e-05, - "loss": 0.2624, + "epoch": 3.495693228096731, + "grad_norm": 0.2054082453250885, + "learning_rate": 1.0974804505981822e-05, + "loss": 0.3781, "step": 96995 }, { - "epoch": 3.41, - "learning_rate": 1.2106229931221929e-05, - "loss": 0.2697, + "epoch": 3.4958734277579557, + "grad_norm": 0.2986920177936554, + "learning_rate": 1.0972388939851804e-05, + "loss": 0.3982, "step": 97000 }, { - "epoch": 3.41, - "eval_loss": 0.2534578740596771, - "eval_runtime": 10.5485, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 3.4958734277579557, + "eval_loss": 0.4296031594276428, + "eval_runtime": 3.5247, + "eval_samples_per_second": 28.371, + "eval_steps_per_second": 7.093, "step": 97000 }, { - "epoch": 3.41, - "learning_rate": 1.210378940529218e-05, - "loss": 0.2617, + "epoch": 3.4960536274191805, + "grad_norm": 0.25335046648979187, + "learning_rate": 1.0969973564845293e-05, + "loss": 0.4044, "step": 97005 }, { - "epoch": 3.41, - "learning_rate": 1.2101349046811216e-05, - "loss": 0.2551, + "epoch": 3.496233827080405, + "grad_norm": 0.23634032905101776, + "learning_rate": 1.0967558380995165e-05, + "loss": 0.4013, "step": 97010 }, { - "epoch": 3.41, - "learning_rate": 1.2098908855810745e-05, - "loss": 0.2594, + "epoch": 3.49641402674163, + "grad_norm": 0.2781918942928314, + "learning_rate": 1.0965143388334329e-05, + "loss": 0.3848, "step": 97015 }, { - "epoch": 3.41, - "learning_rate": 1.209646883232244e-05, - "loss": 0.2491, + "epoch": 3.496594226402854, + "grad_norm": 0.266989141702652, + "learning_rate": 1.0962728586895706e-05, + "loss": 0.3611, "step": 97020 }, { - "epoch": 3.41, - "learning_rate": 1.2094028976377978e-05, - "loss": 0.2569, + "epoch": 3.496774426064079, + "grad_norm": 0.2467830330133438, + "learning_rate": 1.0960313976712188e-05, + "loss": 0.3863, "step": 97025 }, { - "epoch": 3.41, - "learning_rate": 1.209158928800904e-05, - "loss": 0.2469, + "epoch": 3.4969546257253037, + "grad_norm": 0.2517973482608795, + "learning_rate": 1.0957899557816676e-05, + "loss": 0.3733, "step": 97030 }, { - "epoch": 3.41, - "learning_rate": 1.2089149767247317e-05, - "loss": 0.2686, + "epoch": 3.4971348253865284, + "grad_norm": 0.2016320675611496, + "learning_rate": 
1.095548533024206e-05, + "loss": 0.4017, "step": 97035 }, { - "epoch": 3.41, - "learning_rate": 1.2086710414124477e-05, - "loss": 0.2714, + "epoch": 3.4973150250477527, + "grad_norm": 0.2199011892080307, + "learning_rate": 1.095307129402123e-05, + "loss": 0.3199, "step": 97040 }, { - "epoch": 3.41, - "learning_rate": 1.2084271228672183e-05, - "loss": 0.261, + "epoch": 3.4974952247089774, + "grad_norm": 0.23464830219745636, + "learning_rate": 1.0950657449187094e-05, + "loss": 0.3899, "step": 97045 }, { - "epoch": 3.41, - "learning_rate": 1.2081832210922125e-05, - "loss": 0.2628, + "epoch": 3.497675424370202, + "grad_norm": 0.2523633539676666, + "learning_rate": 1.0948243795772528e-05, + "loss": 0.3891, "step": 97050 }, { - "epoch": 3.41, - "learning_rate": 1.207939336090596e-05, - "loss": 0.2624, + "epoch": 3.497855624031427, + "grad_norm": 0.26079466938972473, + "learning_rate": 1.0945830333810423e-05, + "loss": 0.3812, "step": 97055 }, { - "epoch": 3.41, - "learning_rate": 1.2076954678655361e-05, - "loss": 0.2594, + "epoch": 3.4980358236926516, + "grad_norm": 0.2218736708164215, + "learning_rate": 1.0943417063333655e-05, + "loss": 0.3631, "step": 97060 }, { - "epoch": 3.42, - "learning_rate": 1.2074516164201977e-05, - "loss": 0.2724, + "epoch": 3.498216023353876, + "grad_norm": 0.19591349363327026, + "learning_rate": 1.0941003984375112e-05, + "loss": 0.3689, "step": 97065 }, { - "epoch": 3.42, - "learning_rate": 1.2072077817577493e-05, - "loss": 0.2579, + "epoch": 3.4983962230151007, + "grad_norm": 0.2568768858909607, + "learning_rate": 1.0938591096967657e-05, + "loss": 0.4084, "step": 97070 }, { - "epoch": 3.42, - "learning_rate": 1.2069639638813557e-05, - "loss": 0.23, + "epoch": 3.4985764226763254, + "grad_norm": 0.21716725826263428, + "learning_rate": 1.0936178401144184e-05, + "loss": 0.387, "step": 97075 }, { - "epoch": 3.42, - "learning_rate": 1.206720162794182e-05, - "loss": 0.2357, + "epoch": 3.49875662233755, + "grad_norm": 0.24818527698516846, + "learning_rate": 1.0933765896937556e-05, + "loss": 0.402, "step": 97080 }, { - "epoch": 3.42, - "learning_rate": 1.2064763784993958e-05, - "loss": 0.248, + "epoch": 3.4989368219987744, + "grad_norm": 0.23442403972148895, + "learning_rate": 1.093135358438065e-05, + "loss": 0.373, "step": 97085 }, { - "epoch": 3.42, - "learning_rate": 1.2062326110001603e-05, - "loss": 0.2556, + "epoch": 3.499117021659999, + "grad_norm": 0.2103729248046875, + "learning_rate": 1.0928941463506322e-05, + "loss": 0.3536, "step": 97090 }, { - "epoch": 3.42, - "learning_rate": 1.2059888602996428e-05, - "loss": 0.2812, + "epoch": 3.499297221321224, + "grad_norm": 0.21677030622959137, + "learning_rate": 1.0926529534347435e-05, + "loss": 0.3647, "step": 97095 }, { - "epoch": 3.42, - "learning_rate": 1.205745126401007e-05, - "loss": 0.2516, + "epoch": 3.4994774209824486, + "grad_norm": 0.24412192404270172, + "learning_rate": 1.0924117796936878e-05, + "loss": 0.4133, "step": 97100 }, { - "epoch": 3.42, - "learning_rate": 1.2055014093074168e-05, - "loss": 0.2561, + "epoch": 3.4996576206436734, + "grad_norm": 0.247679203748703, + "learning_rate": 1.0921706251307481e-05, + "loss": 0.4191, "step": 97105 }, { - "epoch": 3.42, - "learning_rate": 1.2052577090220387e-05, - "loss": 0.2539, + "epoch": 3.4998378203048977, + "grad_norm": 0.23608912527561188, + "learning_rate": 1.09192948974921e-05, + "loss": 0.3869, "step": 97110 }, { - "epoch": 3.42, - "learning_rate": 1.2050140255480361e-05, - "loss": 0.2494, + "epoch": 3.5000180199661224, + "grad_norm": 0.23635424673557281, + 
"learning_rate": 1.0916883735523612e-05, + "loss": 0.3906, "step": 97115 }, { - "epoch": 3.42, - "learning_rate": 1.2047703588885728e-05, - "loss": 0.2374, + "epoch": 3.500198219627347, + "grad_norm": 0.3293451964855194, + "learning_rate": 1.0914472765434852e-05, + "loss": 0.3959, "step": 97120 }, { - "epoch": 3.42, - "learning_rate": 1.2045267090468117e-05, - "loss": 0.245, + "epoch": 3.500378419288572, + "grad_norm": 0.21557281911373138, + "learning_rate": 1.091206198725868e-05, + "loss": 0.36, "step": 97125 }, { - "epoch": 3.42, - "learning_rate": 1.2042830760259189e-05, - "loss": 0.2559, + "epoch": 3.500558618949796, + "grad_norm": 0.2755977511405945, + "learning_rate": 1.090965140102793e-05, + "loss": 0.3945, "step": 97130 }, { - "epoch": 3.42, - "learning_rate": 1.2040394598290553e-05, - "loss": 0.2556, + "epoch": 3.500738818611021, + "grad_norm": 0.18358829617500305, + "learning_rate": 1.0907241006775445e-05, + "loss": 0.3519, "step": 97135 }, { - "epoch": 3.42, - "learning_rate": 1.203795860459386e-05, - "loss": 0.2499, + "epoch": 3.5009190182722456, + "grad_norm": 0.2956106960773468, + "learning_rate": 1.090483080453408e-05, + "loss": 0.3928, "step": 97140 }, { - "epoch": 3.42, - "learning_rate": 1.2035522779200725e-05, - "loss": 0.2616, + "epoch": 3.5010992179334703, + "grad_norm": 0.23157207667827606, + "learning_rate": 1.090242079433667e-05, + "loss": 0.4038, "step": 97145 }, { - "epoch": 3.42, - "learning_rate": 1.203308712214279e-05, - "loss": 0.2649, + "epoch": 3.501279417594695, + "grad_norm": 0.1804695725440979, + "learning_rate": 1.0900010976216046e-05, + "loss": 0.3982, "step": 97150 }, { - "epoch": 3.42, - "learning_rate": 1.2030651633451676e-05, - "loss": 0.2547, + "epoch": 3.50145961725592, + "grad_norm": 0.25014370679855347, + "learning_rate": 1.0897601350205042e-05, + "loss": 0.3616, "step": 97155 }, { - "epoch": 3.42, - "learning_rate": 1.2028216313158994e-05, - "loss": 0.2481, + "epoch": 3.501639816917144, + "grad_norm": 0.29266318678855896, + "learning_rate": 1.0895191916336477e-05, + "loss": 0.3894, "step": 97160 }, { - "epoch": 3.42, - "learning_rate": 1.2025781161296382e-05, - "loss": 0.2559, + "epoch": 3.501820016578369, + "grad_norm": 0.2770882248878479, + "learning_rate": 1.0892782674643207e-05, + "loss": 0.3823, "step": 97165 }, { - "epoch": 3.42, - "learning_rate": 1.2023346177895453e-05, - "loss": 0.2568, + "epoch": 3.5020002162395936, + "grad_norm": 0.2866399586200714, + "learning_rate": 1.0890373625158046e-05, + "loss": 0.41, "step": 97170 }, { - "epoch": 3.42, - "learning_rate": 1.2020911362987822e-05, - "loss": 0.2789, + "epoch": 3.502180415900818, + "grad_norm": 0.2779446542263031, + "learning_rate": 1.0887964767913796e-05, + "loss": 0.3801, "step": 97175 }, { - "epoch": 3.42, - "learning_rate": 1.2018476716605092e-05, - "loss": 0.2556, + "epoch": 3.5023606155620426, + "grad_norm": 0.2382117062807083, + "learning_rate": 1.0885556102943303e-05, + "loss": 0.3715, "step": 97180 }, { - "epoch": 3.42, - "learning_rate": 1.2016042238778896e-05, - "loss": 0.2463, + "epoch": 3.5025408152232673, + "grad_norm": 0.23480451107025146, + "learning_rate": 1.0883147630279367e-05, + "loss": 0.3713, "step": 97185 }, { - "epoch": 3.42, - "learning_rate": 1.2013607929540826e-05, - "loss": 0.2639, + "epoch": 3.502721014884492, + "grad_norm": 0.2746983468532562, + "learning_rate": 1.0880739349954829e-05, + "loss": 0.3993, "step": 97190 }, { - "epoch": 3.42, - "learning_rate": 1.2011173788922508e-05, - "loss": 0.259, + "epoch": 3.502901214545717, + "grad_norm": 
0.27288299798965454, + "learning_rate": 1.0878331262002475e-05, + "loss": 0.3596, "step": 97195 }, { - "epoch": 3.42, - "learning_rate": 1.200873981695553e-05, - "loss": 0.2368, + "epoch": 3.5030814142069415, + "grad_norm": 0.25127673149108887, + "learning_rate": 1.0875923366455113e-05, + "loss": 0.3874, "step": 97200 }, { - "epoch": 3.42, - "learning_rate": 1.2006306013671515e-05, - "loss": 0.2639, + "epoch": 3.503261613868166, + "grad_norm": 0.29258519411087036, + "learning_rate": 1.0873515663345572e-05, + "loss": 0.411, "step": 97205 }, { - "epoch": 3.42, - "learning_rate": 1.2003872379102049e-05, - "loss": 0.2753, + "epoch": 3.5034418135293905, + "grad_norm": 0.2521027624607086, + "learning_rate": 1.0871108152706644e-05, + "loss": 0.3625, "step": 97210 }, { - "epoch": 3.42, - "learning_rate": 1.2001438913278739e-05, - "loss": 0.2755, + "epoch": 3.5036220131906153, + "grad_norm": 0.24310770630836487, + "learning_rate": 1.086870083457113e-05, + "loss": 0.3946, "step": 97215 }, { - "epoch": 3.42, - "learning_rate": 1.1999005616233166e-05, - "loss": 0.2661, + "epoch": 3.5038022128518396, + "grad_norm": 0.23873953521251678, + "learning_rate": 1.0866293708971834e-05, + "loss": 0.3985, "step": 97220 }, { - "epoch": 3.42, - "learning_rate": 1.199657248799695e-05, - "loss": 0.2821, + "epoch": 3.5039824125130643, + "grad_norm": 0.25413691997528076, + "learning_rate": 1.086388677594154e-05, + "loss": 0.3693, "step": 97225 }, { - "epoch": 3.42, - "learning_rate": 1.1994139528601667e-05, - "loss": 0.2348, + "epoch": 3.504162612174289, + "grad_norm": 0.24815772473812103, + "learning_rate": 1.086148003551306e-05, + "loss": 0.3974, "step": 97230 }, { - "epoch": 3.42, - "learning_rate": 1.1991706738078912e-05, - "loss": 0.2759, + "epoch": 3.5043428118355138, + "grad_norm": 0.2222224920988083, + "learning_rate": 1.085907348771918e-05, + "loss": 0.3328, "step": 97235 }, { - "epoch": 3.42, - "learning_rate": 1.1989274116460264e-05, - "loss": 0.2274, + "epoch": 3.5045230114967385, + "grad_norm": 0.18813829123973846, + "learning_rate": 1.0856667132592683e-05, + "loss": 0.3828, "step": 97240 }, { - "epoch": 3.42, - "learning_rate": 1.1986841663777326e-05, - "loss": 0.2486, + "epoch": 3.5047032111579632, + "grad_norm": 0.2500969171524048, + "learning_rate": 1.0854260970166358e-05, + "loss": 0.3769, "step": 97245 }, { - "epoch": 3.42, - "learning_rate": 1.1984409380061665e-05, - "loss": 0.2528, + "epoch": 3.5048834108191875, + "grad_norm": 0.3164437711238861, + "learning_rate": 1.085185500047299e-05, + "loss": 0.3717, "step": 97250 }, { - "epoch": 3.42, - "learning_rate": 1.1981977265344868e-05, - "loss": 0.2428, + "epoch": 3.5050636104804123, + "grad_norm": 0.23495697975158691, + "learning_rate": 1.084944922354535e-05, + "loss": 0.3808, "step": 97255 }, { - "epoch": 3.42, - "learning_rate": 1.1979545319658527e-05, - "loss": 0.2572, + "epoch": 3.505243810141637, + "grad_norm": 0.2900897264480591, + "learning_rate": 1.0847043639416233e-05, + "loss": 0.3814, "step": 97260 }, { - "epoch": 3.42, - "learning_rate": 1.197711354303421e-05, - "loss": 0.2478, + "epoch": 3.5054240098028617, + "grad_norm": 0.2515815794467926, + "learning_rate": 1.0844638248118405e-05, + "loss": 0.3928, "step": 97265 }, { - "epoch": 3.42, - "learning_rate": 1.1974681935503493e-05, - "loss": 0.2638, + "epoch": 3.505604209464086, + "grad_norm": 0.23748047649860382, + "learning_rate": 1.0842233049684642e-05, + "loss": 0.4071, "step": 97270 }, { - "epoch": 3.42, - "learning_rate": 1.1972250497097933e-05, - "loss": 0.2489, + "epoch": 
3.5057844091253108, + "grad_norm": 0.2293088734149933, + "learning_rate": 1.0839828044147712e-05, + "loss": 0.4278, "step": 97275 }, { - "epoch": 3.42, - "learning_rate": 1.196981922784913e-05, - "loss": 0.2545, + "epoch": 3.5059646087865355, + "grad_norm": 0.2869160771369934, + "learning_rate": 1.0837423231540375e-05, + "loss": 0.3777, "step": 97280 }, { - "epoch": 3.42, - "learning_rate": 1.1967388127788634e-05, - "loss": 0.2686, + "epoch": 3.50614480844776, + "grad_norm": 0.2737312912940979, + "learning_rate": 1.083501861189542e-05, + "loss": 0.3707, "step": 97285 }, { - "epoch": 3.42, - "learning_rate": 1.1964957196948012e-05, - "loss": 0.2514, + "epoch": 3.506325008108985, + "grad_norm": 0.2278176099061966, + "learning_rate": 1.0832614185245587e-05, + "loss": 0.362, "step": 97290 }, { - "epoch": 3.42, - "learning_rate": 1.1962526435358825e-05, - "loss": 0.2623, + "epoch": 3.5065052077702092, + "grad_norm": 0.26734185218811035, + "learning_rate": 1.0830209951623635e-05, + "loss": 0.3986, "step": 97295 }, { - "epoch": 3.42, - "learning_rate": 1.1960095843052647e-05, - "loss": 0.2366, + "epoch": 3.506685407431434, + "grad_norm": 0.2176479548215866, + "learning_rate": 1.0827805911062336e-05, + "loss": 0.3585, "step": 97300 }, { - "epoch": 3.42, - "learning_rate": 1.1957665420061023e-05, - "loss": 0.2658, + "epoch": 3.5068656070926587, + "grad_norm": 0.22162985801696777, + "learning_rate": 1.082540206359444e-05, + "loss": 0.3828, "step": 97305 }, { - "epoch": 3.42, - "learning_rate": 1.1955235166415526e-05, - "loss": 0.2777, + "epoch": 3.5070458067538834, + "grad_norm": 0.21119628846645355, + "learning_rate": 1.0822998409252694e-05, + "loss": 0.3715, "step": 97310 }, { - "epoch": 3.42, - "learning_rate": 1.1952805082147697e-05, - "loss": 0.2421, + "epoch": 3.5072260064151077, + "grad_norm": 0.25533100962638855, + "learning_rate": 1.082059494806985e-05, + "loss": 0.4005, "step": 97315 }, { - "epoch": 3.42, - "learning_rate": 1.1950375167289102e-05, - "loss": 0.2495, + "epoch": 3.5074062060763325, + "grad_norm": 0.3065352439880371, + "learning_rate": 1.0818191680078648e-05, + "loss": 0.3915, "step": 97320 }, { - "epoch": 3.42, - "learning_rate": 1.1947945421871288e-05, - "loss": 0.237, + "epoch": 3.507586405737557, + "grad_norm": 0.23930329084396362, + "learning_rate": 1.0815788605311846e-05, + "loss": 0.3843, "step": 97325 }, { - "epoch": 3.42, - "learning_rate": 1.1945515845925798e-05, - "loss": 0.2598, + "epoch": 3.507766605398782, + "grad_norm": 0.2522366940975189, + "learning_rate": 1.0813385723802177e-05, + "loss": 0.3506, "step": 97330 }, { - "epoch": 3.42, - "learning_rate": 1.1943086439484174e-05, - "loss": 0.2453, + "epoch": 3.5079468050600067, + "grad_norm": 0.22099296748638153, + "learning_rate": 1.0810983035582384e-05, + "loss": 0.3578, "step": 97335 }, { - "epoch": 3.42, - "learning_rate": 1.1940657202577977e-05, - "loss": 0.271, + "epoch": 3.5081270047212314, + "grad_norm": 0.2783578038215637, + "learning_rate": 1.0808580540685198e-05, + "loss": 0.409, "step": 97340 }, { - "epoch": 3.42, - "learning_rate": 1.1938228135238741e-05, - "loss": 0.2707, + "epoch": 3.5083072043824557, + "grad_norm": 0.282536119222641, + "learning_rate": 1.0806178239143347e-05, + "loss": 0.392, "step": 97345 }, { - "epoch": 3.43, - "learning_rate": 1.1935799237498002e-05, - "loss": 0.2652, + "epoch": 3.5084874040436804, + "grad_norm": 0.2658419609069824, + "learning_rate": 1.0803776130989577e-05, + "loss": 0.4034, "step": 97350 }, { - "epoch": 3.43, - "learning_rate": 1.1933370509387293e-05, - "loss": 
0.2505, + "epoch": 3.508667603704905, + "grad_norm": 0.23582491278648376, + "learning_rate": 1.0801374216256619e-05, + "loss": 0.3993, "step": 97355 }, { - "epoch": 3.43, - "learning_rate": 1.193142764905355e-05, - "loss": 0.2565, + "epoch": 3.5088478033661294, + "grad_norm": 0.22847378253936768, + "learning_rate": 1.0798972494977167e-05, + "loss": 0.3643, "step": 97360 }, { - "epoch": 3.43, - "learning_rate": 1.1928999226356375e-05, - "loss": 0.2557, + "epoch": 3.509028003027354, + "grad_norm": 0.24764804542064667, + "learning_rate": 1.0796570967183978e-05, + "loss": 0.3575, "step": 97365 }, { - "epoch": 3.43, - "learning_rate": 1.192657097337754e-05, - "loss": 0.2775, + "epoch": 3.509208202688579, + "grad_norm": 0.2016579508781433, + "learning_rate": 1.079416963290976e-05, + "loss": 0.3777, "step": 97370 }, { - "epoch": 3.43, - "learning_rate": 1.1924142890148566e-05, - "loss": 0.2629, + "epoch": 3.5093884023498036, + "grad_norm": 0.2673712968826294, + "learning_rate": 1.0791768492187218e-05, + "loss": 0.3758, "step": 97375 }, { - "epoch": 3.43, - "learning_rate": 1.1921714976700979e-05, - "loss": 0.2723, + "epoch": 3.5095686020110284, + "grad_norm": 0.23791760206222534, + "learning_rate": 1.07893675450491e-05, + "loss": 0.3433, "step": 97380 }, { - "epoch": 3.43, - "learning_rate": 1.1919287233066295e-05, - "loss": 0.2582, + "epoch": 3.509748801672253, + "grad_norm": 0.208527609705925, + "learning_rate": 1.0786966791528078e-05, + "loss": 0.3444, "step": 97385 }, { - "epoch": 3.43, - "learning_rate": 1.1916859659276055e-05, - "loss": 0.2901, + "epoch": 3.5099290013334774, + "grad_norm": 0.2786830961704254, + "learning_rate": 1.0784566231656893e-05, + "loss": 0.3838, "step": 97390 }, { - "epoch": 3.43, - "learning_rate": 1.191443225536176e-05, - "loss": 0.2493, + "epoch": 3.510109200994702, + "grad_norm": 0.2595866024494171, + "learning_rate": 1.078216586546824e-05, + "loss": 0.3882, "step": 97395 }, { - "epoch": 3.43, - "learning_rate": 1.191200502135495e-05, - "loss": 0.24, + "epoch": 3.510289400655927, + "grad_norm": 0.2380652129650116, + "learning_rate": 1.0779765692994826e-05, + "loss": 0.3906, "step": 97400 }, { - "epoch": 3.43, - "learning_rate": 1.1909577957287119e-05, - "loss": 0.2765, + "epoch": 3.510469600317151, + "grad_norm": 0.2838803231716156, + "learning_rate": 1.0777365714269347e-05, + "loss": 0.3842, "step": 97405 }, { - "epoch": 3.43, - "learning_rate": 1.1907151063189797e-05, - "loss": 0.2789, + "epoch": 3.510649799978376, + "grad_norm": 0.21511715650558472, + "learning_rate": 1.0774965929324502e-05, + "loss": 0.3744, "step": 97410 }, { - "epoch": 3.43, - "learning_rate": 1.1904724339094494e-05, - "loss": 0.2526, + "epoch": 3.5108299996396006, + "grad_norm": 0.22063112258911133, + "learning_rate": 1.0772566338193e-05, + "loss": 0.3887, "step": 97415 }, { - "epoch": 3.43, - "learning_rate": 1.1902297785032715e-05, - "loss": 0.2481, + "epoch": 3.5110101993008254, + "grad_norm": 0.21338684856891632, + "learning_rate": 1.0770166940907524e-05, + "loss": 0.3964, "step": 97420 }, { - "epoch": 3.43, - "learning_rate": 1.1899871401035958e-05, - "loss": 0.2529, + "epoch": 3.51119039896205, + "grad_norm": 0.21853932738304138, + "learning_rate": 1.0767767737500772e-05, + "loss": 0.3588, "step": 97425 }, { - "epoch": 3.43, - "learning_rate": 1.1897445187135747e-05, - "loss": 0.2799, + "epoch": 3.511370598623275, + "grad_norm": 0.20588691532611847, + "learning_rate": 1.0765368728005429e-05, + "loss": 0.4019, "step": 97430 }, { - "epoch": 3.43, - "learning_rate": 
1.1895019143363576e-05, - "loss": 0.2723, + "epoch": 3.511550798284499, + "grad_norm": 0.27712467312812805, + "learning_rate": 1.076296991245418e-05, + "loss": 0.4012, "step": 97435 }, { - "epoch": 3.43, - "learning_rate": 1.1892593269750935e-05, - "loss": 0.2673, + "epoch": 3.511730997945724, + "grad_norm": 0.2563231289386749, + "learning_rate": 1.07605712908797e-05, + "loss": 0.3952, "step": 97440 }, { - "epoch": 3.43, - "learning_rate": 1.189016756632934e-05, - "loss": 0.2484, + "epoch": 3.5119111976069486, + "grad_norm": 0.1871892213821411, + "learning_rate": 1.0758172863314689e-05, + "loss": 0.3994, "step": 97445 }, { - "epoch": 3.43, - "learning_rate": 1.188774203313027e-05, - "loss": 0.2577, + "epoch": 3.512091397268173, + "grad_norm": 0.24314607679843903, + "learning_rate": 1.0755774629791815e-05, + "loss": 0.384, "step": 97450 }, { - "epoch": 3.43, - "learning_rate": 1.188531667018524e-05, - "loss": 0.2783, + "epoch": 3.5122715969293976, + "grad_norm": 0.23912304639816284, + "learning_rate": 1.0753376590343755e-05, + "loss": 0.3468, "step": 97455 }, { - "epoch": 3.43, - "learning_rate": 1.188289147752572e-05, - "loss": 0.2529, + "epoch": 3.5124517965906223, + "grad_norm": 0.25849324464797974, + "learning_rate": 1.0750978745003181e-05, + "loss": 0.395, "step": 97460 }, { - "epoch": 3.43, - "learning_rate": 1.188046645518322e-05, - "loss": 0.2641, + "epoch": 3.512631996251847, + "grad_norm": 0.21953187882900238, + "learning_rate": 1.0748581093802753e-05, + "loss": 0.3912, "step": 97465 }, { - "epoch": 3.43, - "learning_rate": 1.1878041603189213e-05, - "loss": 0.2413, + "epoch": 3.512812195913072, + "grad_norm": 0.231736421585083, + "learning_rate": 1.0746183636775167e-05, + "loss": 0.3748, "step": 97470 }, { - "epoch": 3.43, - "learning_rate": 1.1875616921575191e-05, - "loss": 0.2463, + "epoch": 3.5129923955742965, + "grad_norm": 0.27500176429748535, + "learning_rate": 1.0743786373953061e-05, + "loss": 0.3845, "step": 97475 }, { - "epoch": 3.43, - "learning_rate": 1.1873192410372628e-05, - "loss": 0.2614, + "epoch": 3.513172595235521, + "grad_norm": 0.2197420597076416, + "learning_rate": 1.0741389305369093e-05, + "loss": 0.3836, "step": 97480 }, { - "epoch": 3.43, - "learning_rate": 1.1870768069613014e-05, - "loss": 0.2712, + "epoch": 3.5133527948967456, + "grad_norm": 0.22646686434745789, + "learning_rate": 1.0738992431055948e-05, + "loss": 0.347, "step": 97485 }, { - "epoch": 3.43, - "learning_rate": 1.1868343899327827e-05, - "loss": 0.2498, + "epoch": 3.5135329945579703, + "grad_norm": 0.22990435361862183, + "learning_rate": 1.073659575104626e-05, + "loss": 0.4269, "step": 97490 }, { - "epoch": 3.43, - "learning_rate": 1.1865919899548542e-05, - "loss": 0.2691, + "epoch": 3.5137131942191946, + "grad_norm": 0.29172033071517944, + "learning_rate": 1.0734678546958963e-05, + "loss": 0.3727, "step": 97495 }, { - "epoch": 3.43, - "learning_rate": 1.1863496070306618e-05, - "loss": 0.2492, + "epoch": 3.5138933938804193, + "grad_norm": 0.24528679251670837, + "learning_rate": 1.0732282216777811e-05, + "loss": 0.3703, "step": 97500 }, { - "epoch": 3.43, - "eval_loss": 0.25291621685028076, - "eval_runtime": 10.5567, - "eval_samples_per_second": 9.473, - "eval_steps_per_second": 9.473, + "epoch": 3.5138933938804193, + "eval_loss": 0.4301924407482147, + "eval_runtime": 3.5317, + "eval_samples_per_second": 28.315, + "eval_steps_per_second": 7.079, "step": 97500 }, { - "epoch": 3.43, - "learning_rate": 1.1861072411633542e-05, - "loss": 0.2492, + "epoch": 3.514073593541644, + "grad_norm": 
0.20823730528354645, + "learning_rate": 1.0729886080991553e-05, + "loss": 0.3724, "step": 97505 }, { - "epoch": 3.43, - "learning_rate": 1.1858648923560792e-05, - "loss": 0.2829, + "epoch": 3.514253793202869, + "grad_norm": 0.23355242609977722, + "learning_rate": 1.0727490139632824e-05, + "loss": 0.4024, "step": 97510 }, { - "epoch": 3.43, - "learning_rate": 1.1856225606119825e-05, - "loss": 0.2566, + "epoch": 3.5144339928640935, + "grad_norm": 0.2617993950843811, + "learning_rate": 1.0725094392734289e-05, + "loss": 0.3831, "step": 97515 }, { - "epoch": 3.43, - "learning_rate": 1.1853802459342098e-05, - "loss": 0.2599, + "epoch": 3.5146141925253183, + "grad_norm": 0.2531090974807739, + "learning_rate": 1.0722698840328576e-05, + "loss": 0.3876, "step": 97520 }, { - "epoch": 3.43, - "learning_rate": 1.185137948325909e-05, - "loss": 0.259, + "epoch": 3.5147943921865425, + "grad_norm": 0.24823562800884247, + "learning_rate": 1.0720303482448333e-05, + "loss": 0.3846, "step": 97525 }, { - "epoch": 3.43, - "learning_rate": 1.1848956677902254e-05, - "loss": 0.2482, + "epoch": 3.5149745918477673, + "grad_norm": 0.2218950390815735, + "learning_rate": 1.0717908319126185e-05, + "loss": 0.3883, "step": 97530 }, { - "epoch": 3.43, - "learning_rate": 1.184653404330305e-05, - "loss": 0.2563, + "epoch": 3.515154791508992, + "grad_norm": 0.23106147348880768, + "learning_rate": 1.0715513350394762e-05, + "loss": 0.4091, "step": 97535 }, { - "epoch": 3.43, - "learning_rate": 1.1844111579492923e-05, - "loss": 0.2484, + "epoch": 3.5153349911702167, + "grad_norm": 0.31263554096221924, + "learning_rate": 1.0713118576286716e-05, + "loss": 0.3521, "step": 97540 }, { - "epoch": 3.43, - "learning_rate": 1.1841689286503347e-05, - "loss": 0.2535, + "epoch": 3.515515190831441, + "grad_norm": 0.25022903084754944, + "learning_rate": 1.0710723996834671e-05, + "loss": 0.3958, "step": 97545 }, { - "epoch": 3.43, - "learning_rate": 1.1839267164365764e-05, - "loss": 0.2615, + "epoch": 3.5156953904926658, + "grad_norm": 0.25562384724617004, + "learning_rate": 1.0708329612071227e-05, + "loss": 0.3729, "step": 97550 }, { - "epoch": 3.43, - "learning_rate": 1.1836845213111614e-05, - "loss": 0.26, + "epoch": 3.5158755901538905, + "grad_norm": 0.25449755787849426, + "learning_rate": 1.0705935422029034e-05, + "loss": 0.3987, "step": 97555 }, { - "epoch": 3.43, - "learning_rate": 1.1834423432772354e-05, - "loss": 0.2566, + "epoch": 3.5160557898151152, + "grad_norm": 0.20006752014160156, + "learning_rate": 1.0703541426740697e-05, + "loss": 0.3625, "step": 97560 }, { - "epoch": 3.43, - "learning_rate": 1.1832001823379438e-05, - "loss": 0.2528, + "epoch": 3.51623598947634, + "grad_norm": 0.20985792577266693, + "learning_rate": 1.0701147626238856e-05, + "loss": 0.3947, "step": 97565 }, { - "epoch": 3.43, - "learning_rate": 1.18295803849643e-05, - "loss": 0.264, + "epoch": 3.5164161891375643, + "grad_norm": 0.24245929718017578, + "learning_rate": 1.0698754020556101e-05, + "loss": 0.3838, "step": 97570 }, { - "epoch": 3.43, - "learning_rate": 1.1827159117558373e-05, - "loss": 0.2678, + "epoch": 3.516596388798789, + "grad_norm": 0.23297759890556335, + "learning_rate": 1.0696360609725044e-05, + "loss": 0.3709, "step": 97575 }, { - "epoch": 3.43, - "learning_rate": 1.1824738021193113e-05, - "loss": 0.2706, + "epoch": 3.5167765884600137, + "grad_norm": 0.27019697427749634, + "learning_rate": 1.0693967393778315e-05, + "loss": 0.3778, "step": 97580 }, { - "epoch": 3.43, - "learning_rate": 1.1822317095899946e-05, - "loss": 0.2569, + "epoch": 
3.5169567881212385, + "grad_norm": 0.26086610555648804, + "learning_rate": 1.0691574372748509e-05, + "loss": 0.393, "step": 97585 }, { - "epoch": 3.43, - "learning_rate": 1.1819896341710307e-05, - "loss": 0.2325, + "epoch": 3.5171369877824628, + "grad_norm": 0.20790447294712067, + "learning_rate": 1.0689181546668234e-05, + "loss": 0.342, "step": 97590 }, { - "epoch": 3.43, - "learning_rate": 1.1817475758655618e-05, - "loss": 0.261, + "epoch": 3.5173171874436875, + "grad_norm": 0.24701309204101562, + "learning_rate": 1.0686788915570088e-05, + "loss": 0.3964, "step": 97595 }, { - "epoch": 3.43, - "learning_rate": 1.1815055346767332e-05, - "loss": 0.2611, + "epoch": 3.517497387104912, + "grad_norm": 0.2746676802635193, + "learning_rate": 1.0684396479486664e-05, + "loss": 0.3973, "step": 97600 }, { - "epoch": 3.43, - "learning_rate": 1.181263510607686e-05, - "loss": 0.2512, + "epoch": 3.517677586766137, + "grad_norm": 0.3433469831943512, + "learning_rate": 1.0682004238450574e-05, + "loss": 0.3645, "step": 97605 }, { - "epoch": 3.43, - "learning_rate": 1.1810215036615622e-05, - "loss": 0.2693, + "epoch": 3.5178577864273617, + "grad_norm": 0.22473633289337158, + "learning_rate": 1.0679612192494403e-05, + "loss": 0.3702, "step": 97610 }, { - "epoch": 3.43, - "learning_rate": 1.1807795138415048e-05, - "loss": 0.2683, + "epoch": 3.5180379860885864, + "grad_norm": 0.26055383682250977, + "learning_rate": 1.0677220341650747e-05, + "loss": 0.3831, "step": 97615 }, { - "epoch": 3.43, - "learning_rate": 1.180537541150657e-05, - "loss": 0.2503, + "epoch": 3.5182181857498107, + "grad_norm": 0.22649826109409332, + "learning_rate": 1.067482868595219e-05, + "loss": 0.3515, "step": 97620 }, { - "epoch": 3.43, - "learning_rate": 1.1802955855921596e-05, - "loss": 0.2489, + "epoch": 3.5183983854110354, + "grad_norm": 0.24779903888702393, + "learning_rate": 1.067243722543131e-05, + "loss": 0.3627, "step": 97625 }, { - "epoch": 3.43, - "learning_rate": 1.1800536471691545e-05, - "loss": 0.2223, + "epoch": 3.51857858507226, + "grad_norm": 0.2685101330280304, + "learning_rate": 1.0670045960120707e-05, + "loss": 0.3862, "step": 97630 }, { - "epoch": 3.44, - "learning_rate": 1.1798117258847818e-05, - "loss": 0.2399, + "epoch": 3.5187587847334845, + "grad_norm": 0.2545015513896942, + "learning_rate": 1.0667654890052962e-05, + "loss": 0.3848, "step": 97635 }, { - "epoch": 3.44, - "learning_rate": 1.1795698217421849e-05, - "loss": 0.2515, + "epoch": 3.518938984394709, + "grad_norm": 0.2047766149044037, + "learning_rate": 1.0665264015260626e-05, + "loss": 0.3676, "step": 97640 }, { - "epoch": 3.44, - "learning_rate": 1.1793279347445036e-05, - "loss": 0.2619, + "epoch": 3.519119184055934, + "grad_norm": 0.24359673261642456, + "learning_rate": 1.0662873335776302e-05, + "loss": 0.3863, "step": 97645 }, { - "epoch": 3.44, - "learning_rate": 1.1790860648948787e-05, - "loss": 0.269, + "epoch": 3.5192993837171587, + "grad_norm": 0.2065386027097702, + "learning_rate": 1.0660482851632553e-05, + "loss": 0.402, "step": 97650 }, { - "epoch": 3.44, - "learning_rate": 1.1788442121964496e-05, - "loss": 0.2618, + "epoch": 3.5194795833783834, + "grad_norm": 0.3513331711292267, + "learning_rate": 1.065809256286194e-05, + "loss": 0.3652, "step": 97655 }, { - "epoch": 3.44, - "learning_rate": 1.178602376652359e-05, - "loss": 0.2519, + "epoch": 3.519659783039608, + "grad_norm": 0.23848022520542145, + "learning_rate": 1.0655702469497057e-05, + "loss": 0.3883, "step": 97660 }, { - "epoch": 3.44, - "learning_rate": 1.1783605582657444e-05, - 
"loss": 0.2638, + "epoch": 3.5198399827008324, + "grad_norm": 0.1824946254491806, + "learning_rate": 1.0653312571570434e-05, + "loss": 0.384, "step": 97665 }, { - "epoch": 3.44, - "learning_rate": 1.1781187570397483e-05, - "loss": 0.2562, + "epoch": 3.520020182362057, + "grad_norm": 0.28067702054977417, + "learning_rate": 1.0650922869114658e-05, + "loss": 0.3883, "step": 97670 }, { - "epoch": 3.44, - "learning_rate": 1.177876972977508e-05, - "loss": 0.2606, + "epoch": 3.520200382023282, + "grad_norm": 0.23458154499530792, + "learning_rate": 1.0648533362162277e-05, + "loss": 0.4112, "step": 97675 }, { - "epoch": 3.44, - "learning_rate": 1.1776352060821646e-05, - "loss": 0.254, + "epoch": 3.520380581684506, + "grad_norm": 0.2893427908420563, + "learning_rate": 1.0646144050745854e-05, + "loss": 0.3922, "step": 97680 }, { - "epoch": 3.44, - "learning_rate": 1.1773934563568568e-05, - "loss": 0.2534, + "epoch": 3.520560781345731, + "grad_norm": 0.17397251725196838, + "learning_rate": 1.0643754934897937e-05, + "loss": 0.3335, "step": 97685 }, { - "epoch": 3.44, - "learning_rate": 1.1771517238047223e-05, - "loss": 0.2525, + "epoch": 3.5207409810069556, + "grad_norm": 0.25946879386901855, + "learning_rate": 1.064136601465108e-05, + "loss": 0.3578, "step": 97690 }, { - "epoch": 3.44, - "learning_rate": 1.1769100084289017e-05, - "loss": 0.2712, + "epoch": 3.5209211806681804, + "grad_norm": 0.28153377771377563, + "learning_rate": 1.0638977290037825e-05, + "loss": 0.3993, "step": 97695 }, { - "epoch": 3.44, - "learning_rate": 1.1766683102325326e-05, - "loss": 0.2546, + "epoch": 3.521101380329405, + "grad_norm": 0.2764267325401306, + "learning_rate": 1.063658876109073e-05, + "loss": 0.3849, "step": 97700 }, { - "epoch": 3.44, - "learning_rate": 1.1764266292187538e-05, - "loss": 0.2382, + "epoch": 3.52128157999063, + "grad_norm": 0.26901406049728394, + "learning_rate": 1.0634200427842334e-05, + "loss": 0.3975, "step": 97705 }, { - "epoch": 3.44, - "learning_rate": 1.1761849653907017e-05, - "loss": 0.2577, + "epoch": 3.521461779651854, + "grad_norm": 0.22816415131092072, + "learning_rate": 1.0631812290325174e-05, + "loss": 0.3826, "step": 97710 }, { - "epoch": 3.44, - "learning_rate": 1.1759433187515163e-05, - "loss": 0.2438, + "epoch": 3.521641979313079, + "grad_norm": 0.22323520481586456, + "learning_rate": 1.062942434857179e-05, + "loss": 0.3701, "step": 97715 }, { - "epoch": 3.44, - "learning_rate": 1.1757016893043335e-05, - "loss": 0.2657, + "epoch": 3.5218221789743036, + "grad_norm": 0.18023869395256042, + "learning_rate": 1.062703660261471e-05, + "loss": 0.3809, "step": 97720 }, { - "epoch": 3.44, - "learning_rate": 1.1754600770522925e-05, - "loss": 0.2439, + "epoch": 3.522002378635528, + "grad_norm": 0.2793877124786377, + "learning_rate": 1.062464905248648e-05, + "loss": 0.3815, "step": 97725 }, { - "epoch": 3.44, - "learning_rate": 1.1752184819985285e-05, - "loss": 0.2766, + "epoch": 3.5221825782967526, + "grad_norm": 0.19938796758651733, + "learning_rate": 1.0622261698219634e-05, + "loss": 0.3712, "step": 97730 }, { - "epoch": 3.44, - "learning_rate": 1.1749769041461803e-05, - "loss": 0.2521, + "epoch": 3.5223627779579774, + "grad_norm": 0.21415963768959045, + "learning_rate": 1.0619874539846673e-05, + "loss": 0.3845, "step": 97735 }, { - "epoch": 3.44, - "learning_rate": 1.1747353434983841e-05, - "loss": 0.2627, + "epoch": 3.522542977619202, + "grad_norm": 0.2542286515235901, + "learning_rate": 1.0617487577400143e-05, + "loss": 0.3956, "step": 97740 }, { - "epoch": 3.44, - "learning_rate": 
1.1744938000582756e-05, - "loss": 0.252, + "epoch": 3.522723177280427, + "grad_norm": 0.25228896737098694, + "learning_rate": 1.0615100810912551e-05, + "loss": 0.365, "step": 97745 }, { - "epoch": 3.44, - "learning_rate": 1.174252273828991e-05, - "loss": 0.2612, + "epoch": 3.5229033769416516, + "grad_norm": 0.24575848877429962, + "learning_rate": 1.0612714240416444e-05, + "loss": 0.3517, "step": 97750 }, { - "epoch": 3.44, - "learning_rate": 1.1740107648136677e-05, - "loss": 0.2575, + "epoch": 3.523083576602876, + "grad_norm": 0.2416459023952484, + "learning_rate": 1.0610327865944311e-05, + "loss": 0.3597, "step": 97755 }, { - "epoch": 3.44, - "learning_rate": 1.1737692730154409e-05, - "loss": 0.2804, + "epoch": 3.5232637762641006, + "grad_norm": 0.2579323351383209, + "learning_rate": 1.0607941687528669e-05, + "loss": 0.4051, "step": 97760 }, { - "epoch": 3.44, - "learning_rate": 1.173527798437446e-05, - "loss": 0.2669, + "epoch": 3.5234439759253253, + "grad_norm": 0.3103504478931427, + "learning_rate": 1.0605555705202041e-05, + "loss": 0.3566, "step": 97765 }, { - "epoch": 3.44, - "learning_rate": 1.1732863410828174e-05, - "loss": 0.2583, + "epoch": 3.52362417558655, + "grad_norm": 0.22384612262248993, + "learning_rate": 1.060316991899693e-05, + "loss": 0.3673, "step": 97770 }, { - "epoch": 3.44, - "learning_rate": 1.1730449009546915e-05, - "loss": 0.2429, + "epoch": 3.5238043752477743, + "grad_norm": 0.22075200080871582, + "learning_rate": 1.0600784328945843e-05, + "loss": 0.3654, "step": 97775 }, { - "epoch": 3.44, - "learning_rate": 1.1728034780562038e-05, - "loss": 0.2754, + "epoch": 3.523984574908999, + "grad_norm": 0.24411165714263916, + "learning_rate": 1.059839893508128e-05, + "loss": 0.3677, "step": 97780 }, { - "epoch": 3.44, - "learning_rate": 1.1725620723904877e-05, - "loss": 0.2643, + "epoch": 3.524164774570224, + "grad_norm": 0.2727641761302948, + "learning_rate": 1.0596013737435734e-05, + "loss": 0.3663, "step": 97785 }, { - "epoch": 3.44, - "learning_rate": 1.1723206839606787e-05, - "loss": 0.2565, + "epoch": 3.5243449742314485, + "grad_norm": 0.21201664209365845, + "learning_rate": 1.0593628736041722e-05, + "loss": 0.3997, "step": 97790 }, { - "epoch": 3.44, - "learning_rate": 1.172079312769911e-05, - "loss": 0.2546, + "epoch": 3.5245251738926733, + "grad_norm": 0.2514842450618744, + "learning_rate": 1.0591243930931729e-05, + "loss": 0.3711, "step": 97795 }, { - "epoch": 3.44, - "learning_rate": 1.1718379588213182e-05, - "loss": 0.2491, + "epoch": 3.5247053735538976, + "grad_norm": 0.21594151854515076, + "learning_rate": 1.0588859322138247e-05, + "loss": 0.3714, "step": 97800 }, { - "epoch": 3.44, - "learning_rate": 1.1715966221180333e-05, - "loss": 0.2714, + "epoch": 3.5248855732151223, + "grad_norm": 0.21471446752548218, + "learning_rate": 1.058647490969377e-05, + "loss": 0.3749, "step": 97805 }, { - "epoch": 3.44, - "learning_rate": 1.1713553026631918e-05, - "loss": 0.2475, + "epoch": 3.525065772876347, + "grad_norm": 0.2319725900888443, + "learning_rate": 1.0584090693630778e-05, + "loss": 0.3581, "step": 97810 }, { - "epoch": 3.44, - "learning_rate": 1.1711140004599262e-05, - "loss": 0.2586, + "epoch": 3.5252459725375718, + "grad_norm": 0.18940207362174988, + "learning_rate": 1.0581706673981753e-05, + "loss": 0.3634, "step": 97815 }, { - "epoch": 3.44, - "learning_rate": 1.1708727155113692e-05, - "loss": 0.2523, + "epoch": 3.525426172198796, + "grad_norm": 0.22552742063999176, + "learning_rate": 1.0579322850779188e-05, + "loss": 0.3836, "step": 97820 }, { - "epoch": 
3.44, - "learning_rate": 1.1706314478206531e-05, - "loss": 0.2639, + "epoch": 3.525606371860021, + "grad_norm": 0.22318586707115173, + "learning_rate": 1.0576939224055563e-05, + "loss": 0.3915, "step": 97825 }, { - "epoch": 3.44, - "learning_rate": 1.1703901973909129e-05, - "loss": 0.2606, + "epoch": 3.5257865715212455, + "grad_norm": 0.22033105790615082, + "learning_rate": 1.0574555793843345e-05, + "loss": 0.3772, "step": 97830 }, { - "epoch": 3.44, - "learning_rate": 1.1701489642252786e-05, - "loss": 0.2492, + "epoch": 3.5259667711824703, + "grad_norm": 0.19937936961650848, + "learning_rate": 1.0572172560175011e-05, + "loss": 0.379, "step": 97835 }, { - "epoch": 3.44, - "learning_rate": 1.1699077483268844e-05, - "loss": 0.2723, + "epoch": 3.526146970843695, + "grad_norm": 0.19125044345855713, + "learning_rate": 1.0569789523083026e-05, + "loss": 0.4136, "step": 97840 }, { - "epoch": 3.44, - "learning_rate": 1.1696665496988606e-05, - "loss": 0.2448, + "epoch": 3.5263271705049197, + "grad_norm": 0.2663952708244324, + "learning_rate": 1.056740668259988e-05, + "loss": 0.3793, "step": 97845 }, { - "epoch": 3.44, - "learning_rate": 1.1694253683443413e-05, - "loss": 0.2654, + "epoch": 3.526507370166144, + "grad_norm": 0.21519611775875092, + "learning_rate": 1.0565024038758009e-05, + "loss": 0.3712, "step": 97850 }, { - "epoch": 3.44, - "learning_rate": 1.169184204266456e-05, - "loss": 0.2614, + "epoch": 3.5266875698273688, + "grad_norm": 0.2259238213300705, + "learning_rate": 1.0562641591589898e-05, + "loss": 0.3741, "step": 97855 }, { - "epoch": 3.44, - "learning_rate": 1.1689430574683375e-05, - "loss": 0.2569, + "epoch": 3.5268677694885935, + "grad_norm": 0.22167803347110748, + "learning_rate": 1.0560259341128e-05, + "loss": 0.3672, "step": 97860 }, { - "epoch": 3.44, - "learning_rate": 1.1687019279531147e-05, - "loss": 0.2485, + "epoch": 3.5270479691498178, + "grad_norm": 0.1899726837873459, + "learning_rate": 1.0557877287404774e-05, + "loss": 0.375, "step": 97865 }, { - "epoch": 3.44, - "learning_rate": 1.1684608157239214e-05, - "loss": 0.2755, + "epoch": 3.5272281688110425, + "grad_norm": 0.23604774475097656, + "learning_rate": 1.0555495430452673e-05, + "loss": 0.355, "step": 97870 }, { - "epoch": 3.44, - "learning_rate": 1.1682197207838866e-05, - "loss": 0.2449, + "epoch": 3.5274083684722672, + "grad_norm": 0.22828394174575806, + "learning_rate": 1.0553113770304152e-05, + "loss": 0.3715, "step": 97875 }, { - "epoch": 3.44, - "learning_rate": 1.1679786431361401e-05, - "loss": 0.2429, + "epoch": 3.527588568133492, + "grad_norm": 0.1918339878320694, + "learning_rate": 1.0550732306991648e-05, + "loss": 0.3772, "step": 97880 }, { - "epoch": 3.44, - "learning_rate": 1.1677375827838142e-05, - "loss": 0.2594, + "epoch": 3.5277687677947167, + "grad_norm": 0.21536006033420563, + "learning_rate": 1.0548351040547628e-05, + "loss": 0.348, "step": 97885 }, { - "epoch": 3.44, - "learning_rate": 1.1674965397300367e-05, - "loss": 0.2398, + "epoch": 3.5279489674559414, + "grad_norm": 0.2499334067106247, + "learning_rate": 1.0545969971004527e-05, + "loss": 0.3779, "step": 97890 }, { - "epoch": 3.44, - "learning_rate": 1.1672555139779392e-05, - "loss": 0.2532, + "epoch": 3.5281291671171657, + "grad_norm": 0.22823038697242737, + "learning_rate": 1.0543589098394784e-05, + "loss": 0.3632, "step": 97895 }, { - "epoch": 3.44, - "learning_rate": 1.16701450553065e-05, - "loss": 0.2591, + "epoch": 3.5283093667783905, + "grad_norm": 0.20469775795936584, + "learning_rate": 1.0541208422750846e-05, + "loss": 0.3779, 
"step": 97900 }, { - "epoch": 3.44, - "learning_rate": 1.1667735143912998e-05, - "loss": 0.2373, + "epoch": 3.528489566439615, + "grad_norm": 0.2718513309955597, + "learning_rate": 1.053882794410513e-05, + "loss": 0.3781, "step": 97905 }, { - "epoch": 3.44, - "learning_rate": 1.1665325405630167e-05, - "loss": 0.2712, + "epoch": 3.5286697661008395, + "grad_norm": 0.1913396418094635, + "learning_rate": 1.0536447662490097e-05, + "loss": 0.3742, "step": 97910 }, { - "epoch": 3.44, - "learning_rate": 1.16629158404893e-05, - "loss": 0.2522, + "epoch": 3.5288499657620642, + "grad_norm": 0.24252444505691528, + "learning_rate": 1.0534067577938172e-05, + "loss": 0.3849, "step": 97915 }, { - "epoch": 3.45, - "learning_rate": 1.1660506448521672e-05, - "loss": 0.2578, + "epoch": 3.529030165423289, + "grad_norm": 0.26426470279693604, + "learning_rate": 1.0531687690481757e-05, + "loss": 0.3961, "step": 97920 }, { - "epoch": 3.45, - "learning_rate": 1.1658097229758586e-05, - "loss": 0.2519, + "epoch": 3.5292103650845137, + "grad_norm": 0.1625565141439438, + "learning_rate": 1.0529308000153304e-05, + "loss": 0.3535, "step": 97925 }, { - "epoch": 3.45, - "learning_rate": 1.1655688184231314e-05, - "loss": 0.2566, + "epoch": 3.5293905647457384, + "grad_norm": 0.2652257978916168, + "learning_rate": 1.052692850698522e-05, + "loss": 0.3936, "step": 97930 }, { - "epoch": 3.45, - "learning_rate": 1.1653279311971135e-05, - "loss": 0.2403, + "epoch": 3.529570764406963, + "grad_norm": 0.2854976952075958, + "learning_rate": 1.052454921100995e-05, + "loss": 0.382, "step": 97935 }, { - "epoch": 3.45, - "learning_rate": 1.1650870613009321e-05, - "loss": 0.2754, + "epoch": 3.5297509640681874, + "grad_norm": 0.24219295382499695, + "learning_rate": 1.0522170112259887e-05, + "loss": 0.3635, "step": 97940 }, { - "epoch": 3.45, - "learning_rate": 1.1648462087377154e-05, - "loss": 0.2559, + "epoch": 3.529931163729412, + "grad_norm": 0.19204115867614746, + "learning_rate": 1.0519791210767446e-05, + "loss": 0.3773, "step": 97945 }, { - "epoch": 3.45, - "learning_rate": 1.1646053735105919e-05, - "loss": 0.2337, + "epoch": 3.530111363390637, + "grad_norm": 0.30104586482048035, + "learning_rate": 1.0517412506565052e-05, + "loss": 0.4092, "step": 97950 }, { - "epoch": 3.45, - "learning_rate": 1.1643645556226876e-05, - "loss": 0.2396, + "epoch": 3.530291563051861, + "grad_norm": 0.26225852966308594, + "learning_rate": 1.0515033999685109e-05, + "loss": 0.3847, "step": 97955 }, { - "epoch": 3.45, - "learning_rate": 1.164123755077128e-05, - "loss": 0.2341, + "epoch": 3.530471762713086, + "grad_norm": 0.2835884690284729, + "learning_rate": 1.0512655690160025e-05, + "loss": 0.4016, "step": 97960 }, { - "epoch": 3.45, - "learning_rate": 1.1638829718770421e-05, - "loss": 0.241, + "epoch": 3.5306519623743107, + "grad_norm": 0.21307377517223358, + "learning_rate": 1.05102775780222e-05, + "loss": 0.3708, "step": 97965 }, { - "epoch": 3.45, - "learning_rate": 1.1636422060255555e-05, - "loss": 0.2725, + "epoch": 3.5308321620355354, + "grad_norm": 0.22064463794231415, + "learning_rate": 1.050789966330403e-05, + "loss": 0.4074, "step": 97970 }, { - "epoch": 3.45, - "learning_rate": 1.1634014575257943e-05, - "loss": 0.2507, + "epoch": 3.53101236169676, + "grad_norm": 0.22840745747089386, + "learning_rate": 1.050552194603793e-05, + "loss": 0.4148, "step": 97975 }, { - "epoch": 3.45, - "learning_rate": 1.163160726380883e-05, - "loss": 0.2498, + "epoch": 3.531192561357985, + "grad_norm": 0.22727341949939728, + "learning_rate": 1.050314442625629e-05, + 
"loss": 0.3656, "step": 97980 }, { - "epoch": 3.45, - "learning_rate": 1.1629200125939498e-05, - "loss": 0.2688, + "epoch": 3.531372761019209, + "grad_norm": 0.22287560999393463, + "learning_rate": 1.0500767103991496e-05, + "loss": 0.4111, "step": 97985 }, { - "epoch": 3.45, - "learning_rate": 1.1626793161681191e-05, - "loss": 0.2578, + "epoch": 3.531552960680434, + "grad_norm": 0.22651442885398865, + "learning_rate": 1.0498389979275947e-05, + "loss": 0.3876, "step": 97990 }, { - "epoch": 3.45, - "learning_rate": 1.162438637106515e-05, - "loss": 0.2772, + "epoch": 3.5317331603416586, + "grad_norm": 0.21187719702720642, + "learning_rate": 1.0496013052142027e-05, + "loss": 0.3932, "step": 97995 }, { - "epoch": 3.45, - "learning_rate": 1.162197975412264e-05, - "loss": 0.2662, + "epoch": 3.531913360002883, + "grad_norm": 0.23431113362312317, + "learning_rate": 1.0493636322622108e-05, + "loss": 0.3845, "step": 98000 }, { - "epoch": 3.45, - "eval_loss": 0.25310197472572327, - "eval_runtime": 10.5476, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 3.531913360002883, + "eval_loss": 0.42959845066070557, + "eval_runtime": 3.5275, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 7.087, "step": 98000 }, { - "epoch": 3.45, - "learning_rate": 1.1619573310884913e-05, - "loss": 0.2549, + "epoch": 3.5320935596641077, + "grad_norm": 0.28880631923675537, + "learning_rate": 1.0491259790748597e-05, + "loss": 0.3756, "step": 98005 }, { - "epoch": 3.45, - "learning_rate": 1.1617167041383208e-05, - "loss": 0.2417, + "epoch": 3.5322737593253324, + "grad_norm": 0.2546299397945404, + "learning_rate": 1.0488883456553861e-05, + "loss": 0.3563, "step": 98010 }, { - "epoch": 3.45, - "learning_rate": 1.1614760945648765e-05, - "loss": 0.2693, + "epoch": 3.532453958986557, + "grad_norm": 0.2502884566783905, + "learning_rate": 1.0486507320070279e-05, + "loss": 0.362, "step": 98015 }, { - "epoch": 3.45, - "learning_rate": 1.1612355023712834e-05, - "loss": 0.2729, + "epoch": 3.532634158647782, + "grad_norm": 0.23691552877426147, + "learning_rate": 1.0484131381330228e-05, + "loss": 0.3751, "step": 98020 }, { - "epoch": 3.45, - "learning_rate": 1.1609949275606651e-05, - "loss": 0.2596, + "epoch": 3.5328143583090066, + "grad_norm": 0.2482050359249115, + "learning_rate": 1.0481755640366065e-05, + "loss": 0.3846, "step": 98025 }, { - "epoch": 3.45, - "learning_rate": 1.1607543701361455e-05, - "loss": 0.2607, + "epoch": 3.532994557970231, + "grad_norm": 0.24961546063423157, + "learning_rate": 1.0479380097210187e-05, + "loss": 0.3872, "step": 98030 }, { - "epoch": 3.45, - "learning_rate": 1.1605138301008466e-05, - "loss": 0.2725, + "epoch": 3.5331747576314556, + "grad_norm": 0.17989207804203033, + "learning_rate": 1.0477004751894929e-05, + "loss": 0.3843, "step": 98035 }, { - "epoch": 3.45, - "learning_rate": 1.1602733074578939e-05, - "loss": 0.263, + "epoch": 3.5333549572926803, + "grad_norm": 0.2397972047328949, + "learning_rate": 1.0474629604452676e-05, + "loss": 0.3869, "step": 98040 }, { - "epoch": 3.45, - "learning_rate": 1.1600328022104095e-05, - "loss": 0.2431, + "epoch": 3.533535156953905, + "grad_norm": 0.272879958152771, + "learning_rate": 1.0472254654915784e-05, + "loss": 0.407, "step": 98045 }, { - "epoch": 3.45, - "learning_rate": 1.1597923143615152e-05, - "loss": 0.2503, + "epoch": 3.5337153566151294, + "grad_norm": 0.25433728098869324, + "learning_rate": 1.046987990331661e-05, + "loss": 0.3856, "step": 98050 }, { - "epoch": 3.45, - "learning_rate": 
1.1595518439143344e-05, - "loss": 0.2678, + "epoch": 3.533895556276354, + "grad_norm": 0.2679862082004547, + "learning_rate": 1.046750534968751e-05, + "loss": 0.4406, "step": 98055 }, { - "epoch": 3.45, - "learning_rate": 1.1593113908719902e-05, - "loss": 0.2826, + "epoch": 3.534075755937579, + "grad_norm": 0.18556173145771027, + "learning_rate": 1.0465130994060831e-05, + "loss": 0.3705, "step": 98060 }, { - "epoch": 3.45, - "learning_rate": 1.1590709552376042e-05, - "loss": 0.232, + "epoch": 3.5342559555988036, + "grad_norm": 0.23661454021930695, + "learning_rate": 1.0462756836468924e-05, + "loss": 0.3722, "step": 98065 }, { - "epoch": 3.45, - "learning_rate": 1.1588305370142982e-05, - "loss": 0.2463, + "epoch": 3.5344361552600283, + "grad_norm": 0.21671940386295319, + "learning_rate": 1.046038287694415e-05, + "loss": 0.3898, "step": 98070 }, { - "epoch": 3.45, - "learning_rate": 1.1585901362051929e-05, - "loss": 0.3027, + "epoch": 3.5346163549212526, + "grad_norm": 0.21144139766693115, + "learning_rate": 1.0458009115518841e-05, + "loss": 0.364, "step": 98075 }, { - "epoch": 3.45, - "learning_rate": 1.1583497528134116e-05, - "loss": 0.2829, + "epoch": 3.5347965545824773, + "grad_norm": 0.2512389123439789, + "learning_rate": 1.0455635552225343e-05, + "loss": 0.3769, "step": 98080 }, { - "epoch": 3.45, - "learning_rate": 1.1581093868420745e-05, - "loss": 0.2723, + "epoch": 3.534976754243702, + "grad_norm": 0.2403516322374344, + "learning_rate": 1.0453262187095996e-05, + "loss": 0.3693, "step": 98085 }, { - "epoch": 3.45, - "learning_rate": 1.1578690382943027e-05, - "loss": 0.2583, + "epoch": 3.535156953904927, + "grad_norm": 0.2084726095199585, + "learning_rate": 1.0450889020163126e-05, + "loss": 0.3775, "step": 98090 }, { - "epoch": 3.45, - "learning_rate": 1.1576287071732158e-05, - "loss": 0.2432, + "epoch": 3.535337153566151, + "grad_norm": 0.2216814011335373, + "learning_rate": 1.0448516051459087e-05, + "loss": 0.4085, "step": 98095 }, { - "epoch": 3.45, - "learning_rate": 1.1573883934819369e-05, - "loss": 0.2546, + "epoch": 3.535517353227376, + "grad_norm": 0.2486451119184494, + "learning_rate": 1.0446143281016205e-05, + "loss": 0.3845, "step": 98100 }, { - "epoch": 3.45, - "learning_rate": 1.1571480972235834e-05, - "loss": 0.2613, + "epoch": 3.5356975528886005, + "grad_norm": 0.22463186085224152, + "learning_rate": 1.0443770708866787e-05, + "loss": 0.3779, "step": 98105 }, { - "epoch": 3.45, - "learning_rate": 1.1569078184012782e-05, - "loss": 0.2659, + "epoch": 3.5358777525498253, + "grad_norm": 0.232293039560318, + "learning_rate": 1.0441398335043184e-05, + "loss": 0.3514, "step": 98110 }, { - "epoch": 3.45, - "learning_rate": 1.156667557018139e-05, - "loss": 0.2676, + "epoch": 3.53605795221105, + "grad_norm": 0.29086434841156006, + "learning_rate": 1.0439026159577703e-05, + "loss": 0.3531, "step": 98115 }, { - "epoch": 3.45, - "learning_rate": 1.1564273130772868e-05, - "loss": 0.2521, + "epoch": 3.5362381518722747, + "grad_norm": 0.2319757342338562, + "learning_rate": 1.0436654182502678e-05, + "loss": 0.4037, "step": 98120 }, { - "epoch": 3.45, - "learning_rate": 1.1561870865818408e-05, - "loss": 0.2378, + "epoch": 3.536418351533499, + "grad_norm": 0.23496223986148834, + "learning_rate": 1.0434282403850429e-05, + "loss": 0.3831, "step": 98125 }, { - "epoch": 3.45, - "learning_rate": 1.1559468775349197e-05, - "loss": 0.2688, + "epoch": 3.5365985511947238, + "grad_norm": 0.21282298862934113, + "learning_rate": 1.0431910823653244e-05, + "loss": 0.3574, "step": 98130 }, { - "epoch": 
3.45, - "learning_rate": 1.155706685939642e-05, - "loss": 0.2599, + "epoch": 3.5367787508559485, + "grad_norm": 0.24302206933498383, + "learning_rate": 1.0429539441943464e-05, + "loss": 0.3586, "step": 98135 }, { - "epoch": 3.45, - "learning_rate": 1.1554665117991275e-05, - "loss": 0.2635, + "epoch": 3.536958950517173, + "grad_norm": 0.2705324590206146, + "learning_rate": 1.0427168258753386e-05, + "loss": 0.3779, "step": 98140 }, { - "epoch": 3.45, - "learning_rate": 1.1552263551164947e-05, - "loss": 0.245, + "epoch": 3.5371391501783975, + "grad_norm": 0.23419471085071564, + "learning_rate": 1.0424797274115322e-05, + "loss": 0.3901, "step": 98145 }, { - "epoch": 3.45, - "learning_rate": 1.1549862158948604e-05, - "loss": 0.2681, + "epoch": 3.5373193498396223, + "grad_norm": 0.20766755938529968, + "learning_rate": 1.0422426488061572e-05, + "loss": 0.386, "step": 98150 }, { - "epoch": 3.45, - "learning_rate": 1.1547460941373445e-05, - "loss": 0.2921, + "epoch": 3.537499549500847, + "grad_norm": 0.23583157360553741, + "learning_rate": 1.042005590062443e-05, + "loss": 0.3403, "step": 98155 }, { - "epoch": 3.45, - "learning_rate": 1.1545059898470634e-05, - "loss": 0.2582, + "epoch": 3.5376797491620717, + "grad_norm": 0.2042224109172821, + "learning_rate": 1.0417685511836212e-05, + "loss": 0.3912, "step": 98160 }, { - "epoch": 3.45, - "learning_rate": 1.1542659030271358e-05, - "loss": 0.2489, + "epoch": 3.5378599488232965, + "grad_norm": 0.2094784826040268, + "learning_rate": 1.0415315321729205e-05, + "loss": 0.3683, "step": 98165 }, { - "epoch": 3.45, - "learning_rate": 1.1540258336806778e-05, - "loss": 0.2556, + "epoch": 3.5380401484845208, + "grad_norm": 0.24115018546581268, + "learning_rate": 1.0412945330335705e-05, + "loss": 0.3984, "step": 98170 }, { - "epoch": 3.45, - "learning_rate": 1.1537857818108082e-05, - "loss": 0.2798, + "epoch": 3.5382203481457455, + "grad_norm": 0.216557577252388, + "learning_rate": 1.0410575537688e-05, + "loss": 0.3695, "step": 98175 }, { - "epoch": 3.45, - "learning_rate": 1.1535457474206433e-05, - "loss": 0.2701, + "epoch": 3.53840054780697, + "grad_norm": 0.22672514617443085, + "learning_rate": 1.0408205943818378e-05, + "loss": 0.3619, "step": 98180 }, { - "epoch": 3.45, - "learning_rate": 1.1533057305132995e-05, - "loss": 0.2527, + "epoch": 3.5385807474681945, + "grad_norm": 0.23298558592796326, + "learning_rate": 1.0405836548759117e-05, + "loss": 0.414, "step": 98185 }, { - "epoch": 3.45, - "learning_rate": 1.1530657310918922e-05, - "loss": 0.2337, + "epoch": 3.5387609471294192, + "grad_norm": 0.20260320603847504, + "learning_rate": 1.0403467352542515e-05, + "loss": 0.3736, "step": 98190 }, { - "epoch": 3.45, - "learning_rate": 1.1528257491595395e-05, - "loss": 0.24, + "epoch": 3.538941146790644, + "grad_norm": 0.21936598420143127, + "learning_rate": 1.0401098355200848e-05, + "loss": 0.3585, "step": 98195 }, { - "epoch": 3.45, - "learning_rate": 1.1525857847193568e-05, - "loss": 0.2533, + "epoch": 3.5391213464518687, + "grad_norm": 0.2278914600610733, + "learning_rate": 1.0398729556766385e-05, + "loss": 0.3446, "step": 98200 }, { - "epoch": 3.46, - "learning_rate": 1.1523458377744598e-05, - "loss": 0.2501, + "epoch": 3.5393015461130934, + "grad_norm": 0.24078984558582306, + "learning_rate": 1.0396360957271405e-05, + "loss": 0.4154, "step": 98205 }, { - "epoch": 3.46, - "learning_rate": 1.1521059083279629e-05, - "loss": 0.2454, + "epoch": 3.539481745774318, + "grad_norm": 0.24573035538196564, + "learning_rate": 1.0393992556748172e-05, + "loss": 0.3924, 
"step": 98210 }, { - "epoch": 3.46, - "learning_rate": 1.1518659963829826e-05, - "loss": 0.2358, + "epoch": 3.5396619454355425, + "grad_norm": 0.25402477383613586, + "learning_rate": 1.0391624355228982e-05, + "loss": 0.3632, "step": 98215 }, { - "epoch": 3.46, - "learning_rate": 1.1516261019426345e-05, - "loss": 0.2607, + "epoch": 3.539842145096767, + "grad_norm": 0.19696156680583954, + "learning_rate": 1.0389256352746063e-05, + "loss": 0.3623, "step": 98220 }, { - "epoch": 3.46, - "learning_rate": 1.1513862250100333e-05, - "loss": 0.2938, + "epoch": 3.540022344757992, + "grad_norm": 0.24171549081802368, + "learning_rate": 1.0386888549331706e-05, + "loss": 0.4052, "step": 98225 }, { - "epoch": 3.46, - "learning_rate": 1.1511463655882917e-05, - "loss": 0.2692, + "epoch": 3.5402025444192162, + "grad_norm": 0.25626784563064575, + "learning_rate": 1.0384520945018164e-05, + "loss": 0.3945, "step": 98230 }, { - "epoch": 3.46, - "learning_rate": 1.150906523680527e-05, - "loss": 0.2624, + "epoch": 3.540382744080441, + "grad_norm": 0.2716904282569885, + "learning_rate": 1.0382153539837686e-05, + "loss": 0.3824, "step": 98235 }, { - "epoch": 3.46, - "learning_rate": 1.1506666992898519e-05, - "loss": 0.2545, + "epoch": 3.5405629437416657, + "grad_norm": 0.20084236562252045, + "learning_rate": 1.0379786333822552e-05, + "loss": 0.3576, "step": 98240 }, { - "epoch": 3.46, - "learning_rate": 1.1504268924193803e-05, - "loss": 0.2491, + "epoch": 3.5407431434028904, + "grad_norm": 0.21598944067955017, + "learning_rate": 1.037741932700499e-05, + "loss": 0.3622, "step": 98245 }, { - "epoch": 3.46, - "learning_rate": 1.1501871030722256e-05, - "loss": 0.2624, + "epoch": 3.540923343064115, + "grad_norm": 0.2069658786058426, + "learning_rate": 1.037505251941725e-05, + "loss": 0.375, "step": 98250 }, { - "epoch": 3.46, - "learning_rate": 1.1499473312515025e-05, - "loss": 0.2537, + "epoch": 3.54110354272534, + "grad_norm": 0.2307698279619217, + "learning_rate": 1.0372685911091598e-05, + "loss": 0.3512, "step": 98255 }, { - "epoch": 3.46, - "learning_rate": 1.1497075769603233e-05, - "loss": 0.2647, + "epoch": 3.541283742386564, + "grad_norm": 0.23482072353363037, + "learning_rate": 1.0370319502060267e-05, + "loss": 0.3773, "step": 98260 }, { - "epoch": 3.46, - "learning_rate": 1.1494678402018008e-05, - "loss": 0.2603, + "epoch": 3.541463942047789, + "grad_norm": 0.23283249139785767, + "learning_rate": 1.03679532923555e-05, + "loss": 0.4013, "step": 98265 }, { - "epoch": 3.46, - "learning_rate": 1.149228120979049e-05, - "loss": 0.2296, + "epoch": 3.5416441417090136, + "grad_norm": 0.23621907830238342, + "learning_rate": 1.0365587282009539e-05, + "loss": 0.404, "step": 98270 }, { - "epoch": 3.46, - "learning_rate": 1.148988419295179e-05, - "loss": 0.274, + "epoch": 3.5418243413702384, + "grad_norm": 0.2579342722892761, + "learning_rate": 1.0363221471054607e-05, + "loss": 0.4041, "step": 98275 }, { - "epoch": 3.46, - "learning_rate": 1.1487487351533051e-05, - "loss": 0.2443, + "epoch": 3.5420045410314627, + "grad_norm": 0.2534416615962982, + "learning_rate": 1.0360855859522958e-05, + "loss": 0.3824, "step": 98280 }, { - "epoch": 3.46, - "learning_rate": 1.1485090685565375e-05, - "loss": 0.2331, + "epoch": 3.5421847406926874, + "grad_norm": 0.20750831067562103, + "learning_rate": 1.0358490447446815e-05, + "loss": 0.3827, "step": 98285 }, { - "epoch": 3.46, - "learning_rate": 1.1482694195079897e-05, - "loss": 0.2856, + "epoch": 3.542364940353912, + "grad_norm": 0.22172388434410095, + "learning_rate": 
1.0356125234858405e-05, + "loss": 0.3838, "step": 98290 }, { - "epoch": 3.46, - "learning_rate": 1.1480297880107726e-05, - "loss": 0.2525, + "epoch": 3.542545140015137, + "grad_norm": 0.2661237120628357, + "learning_rate": 1.0353760221789951e-05, + "loss": 0.3559, "step": 98295 }, { - "epoch": 3.46, - "learning_rate": 1.147790174067998e-05, - "loss": 0.2415, + "epoch": 3.5427253396763616, + "grad_norm": 0.24932733178138733, + "learning_rate": 1.035139540827367e-05, + "loss": 0.3511, "step": 98300 }, { - "epoch": 3.46, - "learning_rate": 1.1475505776827757e-05, - "loss": 0.2257, + "epoch": 3.542905539337586, + "grad_norm": 0.21312791109085083, + "learning_rate": 1.0349030794341802e-05, + "loss": 0.3607, "step": 98305 }, { - "epoch": 3.46, - "learning_rate": 1.147310998858219e-05, - "loss": 0.2586, + "epoch": 3.5430857389988106, + "grad_norm": 0.2608811855316162, + "learning_rate": 1.0346666380026559e-05, + "loss": 0.3928, "step": 98310 }, { - "epoch": 3.46, - "learning_rate": 1.1470714375974373e-05, - "loss": 0.2403, + "epoch": 3.5432659386600354, + "grad_norm": 0.21913045644760132, + "learning_rate": 1.0344302165360134e-05, + "loss": 0.4061, "step": 98315 }, { - "epoch": 3.46, - "learning_rate": 1.1468318939035405e-05, - "loss": 0.2549, + "epoch": 3.54344613832126, + "grad_norm": 0.23208434879779816, + "learning_rate": 1.0341938150374761e-05, + "loss": 0.3448, "step": 98320 }, { - "epoch": 3.46, - "learning_rate": 1.146592367779641e-05, - "loss": 0.2518, + "epoch": 3.5436263379824844, + "grad_norm": 0.22456184029579163, + "learning_rate": 1.0339574335102645e-05, + "loss": 0.3587, "step": 98325 }, { - "epoch": 3.46, - "learning_rate": 1.1463528592288464e-05, - "loss": 0.2237, + "epoch": 3.543806537643709, + "grad_norm": 0.23729225993156433, + "learning_rate": 1.033721071957599e-05, + "loss": 0.3741, "step": 98330 }, { - "epoch": 3.46, - "learning_rate": 1.1461133682542687e-05, - "loss": 0.2486, + "epoch": 3.543986737304934, + "grad_norm": 0.26063045859336853, + "learning_rate": 1.0334847303827e-05, + "loss": 0.3996, "step": 98335 }, { - "epoch": 3.46, - "learning_rate": 1.1458738948590166e-05, - "loss": 0.2736, + "epoch": 3.5441669369661586, + "grad_norm": 0.24216891825199127, + "learning_rate": 1.0332484087887867e-05, + "loss": 0.4231, "step": 98340 }, { - "epoch": 3.46, - "learning_rate": 1.145634439046199e-05, - "loss": 0.2724, + "epoch": 3.5443471366273833, + "grad_norm": 0.25515010952949524, + "learning_rate": 1.0330121071790808e-05, + "loss": 0.3934, "step": 98345 }, { - "epoch": 3.46, - "learning_rate": 1.1453950008189266e-05, - "loss": 0.2465, + "epoch": 3.544527336288608, + "grad_norm": 0.18739314377307892, + "learning_rate": 1.0327758255568007e-05, + "loss": 0.3672, "step": 98350 }, { - "epoch": 3.46, - "learning_rate": 1.1451555801803071e-05, - "loss": 0.2525, + "epoch": 3.5447075359498323, + "grad_norm": 0.2008611410856247, + "learning_rate": 1.0325395639251661e-05, + "loss": 0.3601, "step": 98355 }, { - "epoch": 3.46, - "learning_rate": 1.1449161771334496e-05, - "loss": 0.225, + "epoch": 3.544887735611057, + "grad_norm": 0.20512117445468903, + "learning_rate": 1.0323033222873956e-05, + "loss": 0.3739, "step": 98360 }, { - "epoch": 3.46, - "learning_rate": 1.1446767916814618e-05, - "loss": 0.2672, + "epoch": 3.545067935272282, + "grad_norm": 0.25009825825691223, + "learning_rate": 1.0320671006467086e-05, + "loss": 0.3642, "step": 98365 }, { - "epoch": 3.46, - "learning_rate": 1.1444374238274533e-05, - "loss": 0.2658, + "epoch": 3.545248134933506, + "grad_norm": 
0.27992385625839233, + "learning_rate": 1.031830899006322e-05, + "loss": 0.4017, "step": 98370 }, { - "epoch": 3.46, - "learning_rate": 1.1441980735745317e-05, - "loss": 0.2542, + "epoch": 3.545428334594731, + "grad_norm": 0.2249823361635208, + "learning_rate": 1.0315947173694562e-05, + "loss": 0.3877, "step": 98375 }, { - "epoch": 3.46, - "learning_rate": 1.1439587409258035e-05, - "loss": 0.2588, + "epoch": 3.5456085342559556, + "grad_norm": 0.26109376549720764, + "learning_rate": 1.031358555739328e-05, + "loss": 0.3812, "step": 98380 }, { - "epoch": 3.46, - "learning_rate": 1.1437194258843775e-05, - "loss": 0.28, + "epoch": 3.5457887339171803, + "grad_norm": 0.23642925918102264, + "learning_rate": 1.031122414119155e-05, + "loss": 0.3841, "step": 98385 }, { - "epoch": 3.46, - "learning_rate": 1.1434801284533619e-05, - "loss": 0.2688, + "epoch": 3.545968933578405, + "grad_norm": 0.2479097694158554, + "learning_rate": 1.0308862925121548e-05, + "loss": 0.3603, "step": 98390 }, { - "epoch": 3.46, - "learning_rate": 1.1432408486358628e-05, - "loss": 0.2388, + "epoch": 3.5461491332396298, + "grad_norm": 0.30400943756103516, + "learning_rate": 1.0306501909215438e-05, + "loss": 0.3651, "step": 98395 }, { - "epoch": 3.46, - "learning_rate": 1.1430015864349863e-05, - "loss": 0.2906, + "epoch": 3.546329332900854, + "grad_norm": 0.1932227909564972, + "learning_rate": 1.0304141093505401e-05, + "loss": 0.4042, "step": 98400 }, { - "epoch": 3.46, - "learning_rate": 1.1427623418538409e-05, - "loss": 0.2705, + "epoch": 3.546509532562079, + "grad_norm": 0.25808045268058777, + "learning_rate": 1.0301780478023607e-05, + "loss": 0.3855, "step": 98405 }, { - "epoch": 3.46, - "learning_rate": 1.1425231148955318e-05, - "loss": 0.255, + "epoch": 3.5466897322233035, + "grad_norm": 0.17466764152050018, + "learning_rate": 1.0299420062802187e-05, + "loss": 0.3894, "step": 98410 }, { - "epoch": 3.46, - "learning_rate": 1.1422839055631654e-05, - "loss": 0.2498, + "epoch": 3.546869931884528, + "grad_norm": 0.2541663944721222, + "learning_rate": 1.0297059847873334e-05, + "loss": 0.3537, "step": 98415 }, { - "epoch": 3.46, - "learning_rate": 1.142044713859847e-05, - "loss": 0.2692, + "epoch": 3.5470501315457525, + "grad_norm": 0.277024507522583, + "learning_rate": 1.0294699833269186e-05, + "loss": 0.3547, "step": 98420 }, { - "epoch": 3.46, - "learning_rate": 1.1418055397886837e-05, - "loss": 0.239, + "epoch": 3.5472303312069773, + "grad_norm": 0.2566564381122589, + "learning_rate": 1.029234001902192e-05, + "loss": 0.3823, "step": 98425 }, { - "epoch": 3.46, - "learning_rate": 1.1415663833527807e-05, - "loss": 0.2592, + "epoch": 3.547410530868202, + "grad_norm": 0.1786002814769745, + "learning_rate": 1.0289980405163668e-05, + "loss": 0.366, "step": 98430 }, { - "epoch": 3.46, - "learning_rate": 1.141327244555242e-05, - "loss": 0.2535, + "epoch": 3.5475907305294268, + "grad_norm": 0.24739274382591248, + "learning_rate": 1.0287620991726577e-05, + "loss": 0.3887, "step": 98435 }, { - "epoch": 3.46, - "learning_rate": 1.1410881233991736e-05, - "loss": 0.2289, + "epoch": 3.5477709301906515, + "grad_norm": 0.2730603516101837, + "learning_rate": 1.0285261778742808e-05, + "loss": 0.3606, "step": 98440 }, { - "epoch": 3.46, - "learning_rate": 1.1408490198876812e-05, - "loss": 0.2741, + "epoch": 3.5479511298518758, + "grad_norm": 0.27662402391433716, + "learning_rate": 1.0282902766244498e-05, + "loss": 0.4084, "step": 98445 }, { - "epoch": 3.46, - "learning_rate": 1.1406099340238686e-05, - "loss": 0.2558, + "epoch": 
3.5481313295131005, + "grad_norm": 0.18735605478286743, + "learning_rate": 1.0280543954263792e-05, + "loss": 0.3771, "step": 98450 }, { - "epoch": 3.46, - "learning_rate": 1.1403708658108404e-05, - "loss": 0.2714, + "epoch": 3.5483115291743252, + "grad_norm": 0.2412840873003006, + "learning_rate": 1.0278185342832821e-05, + "loss": 0.4203, "step": 98455 }, { - "epoch": 3.46, - "learning_rate": 1.1401318152516991e-05, - "loss": 0.2539, + "epoch": 3.5484917288355495, + "grad_norm": 0.2625180184841156, + "learning_rate": 1.0275826931983718e-05, + "loss": 0.4129, "step": 98460 }, { - "epoch": 3.46, - "learning_rate": 1.1398927823495512e-05, - "loss": 0.2633, + "epoch": 3.5486719284967743, + "grad_norm": 0.23540963232517242, + "learning_rate": 1.0273468721748631e-05, + "loss": 0.362, "step": 98465 }, { - "epoch": 3.46, - "learning_rate": 1.139653767107499e-05, - "loss": 0.2585, + "epoch": 3.548852128157999, + "grad_norm": 0.23965346813201904, + "learning_rate": 1.0271110712159679e-05, + "loss": 0.3762, "step": 98470 }, { - "epoch": 3.46, - "learning_rate": 1.1394147695286461e-05, - "loss": 0.2629, + "epoch": 3.5490323278192237, + "grad_norm": 0.23907659947872162, + "learning_rate": 1.0268752903248995e-05, + "loss": 0.3859, "step": 98475 }, { - "epoch": 3.46, - "learning_rate": 1.139175789616095e-05, - "loss": 0.2507, + "epoch": 3.5492125274804485, + "grad_norm": 0.2294924557209015, + "learning_rate": 1.0266395295048701e-05, + "loss": 0.3549, "step": 98480 }, { - "epoch": 3.46, - "learning_rate": 1.1389368273729504e-05, - "loss": 0.265, + "epoch": 3.549392727141673, + "grad_norm": 0.25225427746772766, + "learning_rate": 1.0264037887590907e-05, + "loss": 0.3969, "step": 98485 }, { - "epoch": 3.47, - "learning_rate": 1.1386978828023132e-05, - "loss": 0.249, + "epoch": 3.5495729268028975, + "grad_norm": 0.2280511111021042, + "learning_rate": 1.0261680680907754e-05, + "loss": 0.3913, "step": 98490 }, { - "epoch": 3.47, - "learning_rate": 1.1384589559072866e-05, - "loss": 0.2654, + "epoch": 3.549753126464122, + "grad_norm": 0.2504872977733612, + "learning_rate": 1.0259323675031357e-05, + "loss": 0.3917, "step": 98495 }, { - "epoch": 3.47, - "learning_rate": 1.1382200466909745e-05, - "loss": 0.2584, + "epoch": 3.549933326125347, + "grad_norm": 0.21838301420211792, + "learning_rate": 1.0256966869993804e-05, + "loss": 0.3925, "step": 98500 }, { - "epoch": 3.47, - "eval_loss": 0.2523847818374634, - "eval_runtime": 10.5582, - "eval_samples_per_second": 9.471, - "eval_steps_per_second": 9.471, + "epoch": 3.549933326125347, + "eval_loss": 0.42997604608535767, + "eval_runtime": 3.5334, + "eval_samples_per_second": 28.301, + "eval_steps_per_second": 7.075, "step": 98500 }, { - "epoch": 3.47, - "learning_rate": 1.1379811551564773e-05, - "loss": 0.2517, + "epoch": 3.5501135257865712, + "grad_norm": 0.226282998919487, + "learning_rate": 1.025461026582723e-05, + "loss": 0.3838, "step": 98505 }, { - "epoch": 3.47, - "learning_rate": 1.1377422813068975e-05, - "loss": 0.2734, + "epoch": 3.550293725447796, + "grad_norm": 0.2352711260318756, + "learning_rate": 1.0252253862563738e-05, + "loss": 0.3911, "step": 98510 }, { - "epoch": 3.47, - "learning_rate": 1.1375034251453353e-05, - "loss": 0.2441, + "epoch": 3.5504739251090207, + "grad_norm": 0.20618538558483124, + "learning_rate": 1.024989766023543e-05, + "loss": 0.3656, "step": 98515 }, { - "epoch": 3.47, - "learning_rate": 1.137264586674894e-05, - "loss": 0.2562, + "epoch": 3.5506541247702454, + "grad_norm": 0.2552780210971832, + "learning_rate": 
1.0247541658874412e-05, + "loss": 0.3744, "step": 98520 }, { - "epoch": 3.47, - "learning_rate": 1.1370257658986743e-05, - "loss": 0.2511, + "epoch": 3.55083432443147, + "grad_norm": 0.21368472278118134, + "learning_rate": 1.0245185858512777e-05, + "loss": 0.3951, "step": 98525 }, { - "epoch": 3.47, - "learning_rate": 1.1367869628197766e-05, - "loss": 0.2513, + "epoch": 3.551014524092695, + "grad_norm": 0.257311075925827, + "learning_rate": 1.024283025918263e-05, + "loss": 0.3753, "step": 98530 }, { - "epoch": 3.47, - "learning_rate": 1.1365481774413009e-05, - "loss": 0.2531, + "epoch": 3.551194723753919, + "grad_norm": 0.1774069368839264, + "learning_rate": 1.0240474860916068e-05, + "loss": 0.3849, "step": 98535 }, { - "epoch": 3.47, - "learning_rate": 1.1363094097663495e-05, - "loss": 0.2581, + "epoch": 3.551374923415144, + "grad_norm": 0.20663419365882874, + "learning_rate": 1.023811966374518e-05, + "loss": 0.3717, "step": 98540 }, { - "epoch": 3.47, - "learning_rate": 1.1360706597980208e-05, - "loss": 0.2849, + "epoch": 3.5515551230763687, + "grad_norm": 0.23598462343215942, + "learning_rate": 1.0235764667702053e-05, + "loss": 0.4101, "step": 98545 }, { - "epoch": 3.47, - "learning_rate": 1.1358319275394167e-05, - "loss": 0.2515, + "epoch": 3.5517353227375934, + "grad_norm": 0.25389420986175537, + "learning_rate": 1.0233409872818772e-05, + "loss": 0.3636, "step": 98550 }, { - "epoch": 3.47, - "learning_rate": 1.1355932129936348e-05, - "loss": 0.2516, + "epoch": 3.5519155223988177, + "grad_norm": 0.18197034299373627, + "learning_rate": 1.0231055279127414e-05, + "loss": 0.3946, "step": 98555 }, { - "epoch": 3.47, - "learning_rate": 1.1353545161637771e-05, - "loss": 0.2606, + "epoch": 3.5520957220600424, + "grad_norm": 0.2080104500055313, + "learning_rate": 1.0228700886660078e-05, + "loss": 0.361, "step": 98560 }, { - "epoch": 3.47, - "learning_rate": 1.1351158370529414e-05, - "loss": 0.2511, + "epoch": 3.552275921721267, + "grad_norm": 0.2673806846141815, + "learning_rate": 1.0226346695448832e-05, + "loss": 0.3957, "step": 98565 }, { - "epoch": 3.47, - "learning_rate": 1.1348771756642273e-05, - "loss": 0.2594, + "epoch": 3.552456121382492, + "grad_norm": 0.25703608989715576, + "learning_rate": 1.0223992705525753e-05, + "loss": 0.3573, "step": 98570 }, { - "epoch": 3.47, - "learning_rate": 1.1346385320007324e-05, - "loss": 0.2801, + "epoch": 3.5526363210437166, + "grad_norm": 0.24846360087394714, + "learning_rate": 1.0221638916922909e-05, + "loss": 0.3862, "step": 98575 }, { - "epoch": 3.47, - "learning_rate": 1.1343999060655571e-05, - "loss": 0.2821, + "epoch": 3.552816520704941, + "grad_norm": 0.25549542903900146, + "learning_rate": 1.021928532967237e-05, + "loss": 0.3834, "step": 98580 }, { - "epoch": 3.47, - "learning_rate": 1.1341612978617994e-05, - "loss": 0.2317, + "epoch": 3.5529967203661657, + "grad_norm": 0.2898486256599426, + "learning_rate": 1.0216931943806213e-05, + "loss": 0.3971, "step": 98585 }, { - "epoch": 3.47, - "learning_rate": 1.133922707392557e-05, - "loss": 0.2723, + "epoch": 3.5531769200273904, + "grad_norm": 0.25332796573638916, + "learning_rate": 1.0214578759356504e-05, + "loss": 0.3667, "step": 98590 }, { - "epoch": 3.47, - "learning_rate": 1.133684134660927e-05, - "loss": 0.2516, + "epoch": 3.553357119688615, + "grad_norm": 0.225131094455719, + "learning_rate": 1.021222577635528e-05, + "loss": 0.3701, "step": 98595 }, { - "epoch": 3.47, - "learning_rate": 1.1334455796700081e-05, - "loss": 0.2523, + "epoch": 3.5535373193498394, + "grad_norm": 
0.18378715217113495, + "learning_rate": 1.0209872994834627e-05, + "loss": 0.3875, "step": 98600 }, { - "epoch": 3.47, - "learning_rate": 1.1332070424228985e-05, - "loss": 0.2486, + "epoch": 3.553717519011064, + "grad_norm": 0.21205966174602509, + "learning_rate": 1.0207520414826583e-05, + "loss": 0.3717, "step": 98605 }, { - "epoch": 3.47, - "learning_rate": 1.1329685229226941e-05, - "loss": 0.2606, + "epoch": 3.553897718672289, + "grad_norm": 0.21734385192394257, + "learning_rate": 1.0205168036363225e-05, + "loss": 0.3397, "step": 98610 }, { - "epoch": 3.47, - "learning_rate": 1.132730021172493e-05, - "loss": 0.256, + "epoch": 3.5540779183335136, + "grad_norm": 0.22054389119148254, + "learning_rate": 1.0202815859476577e-05, + "loss": 0.4035, "step": 98615 }, { - "epoch": 3.47, - "learning_rate": 1.132491537175392e-05, - "loss": 0.2728, + "epoch": 3.5542581179947383, + "grad_norm": 0.24117785692214966, + "learning_rate": 1.0200463884198693e-05, + "loss": 0.3737, "step": 98620 }, { - "epoch": 3.47, - "learning_rate": 1.132253070934487e-05, - "loss": 0.2528, + "epoch": 3.554438317655963, + "grad_norm": 0.20071789622306824, + "learning_rate": 1.0198112110561631e-05, + "loss": 0.3685, "step": 98625 }, { - "epoch": 3.47, - "learning_rate": 1.1320146224528735e-05, - "loss": 0.2654, + "epoch": 3.5546185173171874, + "grad_norm": 0.29259100556373596, + "learning_rate": 1.0195760538597426e-05, + "loss": 0.3778, "step": 98630 }, { - "epoch": 3.47, - "learning_rate": 1.1317761917336494e-05, - "loss": 0.2539, + "epoch": 3.554798716978412, + "grad_norm": 0.24713879823684692, + "learning_rate": 1.0193409168338116e-05, + "loss": 0.3813, "step": 98635 }, { - "epoch": 3.47, - "learning_rate": 1.13153777877991e-05, - "loss": 0.2663, + "epoch": 3.554978916639637, + "grad_norm": 0.22376933693885803, + "learning_rate": 1.0191057999815743e-05, + "loss": 0.3789, "step": 98640 }, { - "epoch": 3.47, - "learning_rate": 1.1312993835947506e-05, - "loss": 0.2685, + "epoch": 3.555159116300861, + "grad_norm": 0.2563563287258148, + "learning_rate": 1.0188707033062325e-05, + "loss": 0.3876, "step": 98645 }, { - "epoch": 3.47, - "learning_rate": 1.1310610061812657e-05, - "loss": 0.2615, + "epoch": 3.555339315962086, + "grad_norm": 0.2513370215892792, + "learning_rate": 1.0186356268109917e-05, + "loss": 0.3808, "step": 98650 }, { - "epoch": 3.47, - "learning_rate": 1.1308226465425516e-05, - "loss": 0.2569, + "epoch": 3.5555195156233106, + "grad_norm": 0.23794013261795044, + "learning_rate": 1.0184005704990538e-05, + "loss": 0.3962, "step": 98655 }, { - "epoch": 3.47, - "learning_rate": 1.1305843046817041e-05, - "loss": 0.2481, + "epoch": 3.5556997152845353, + "grad_norm": 0.23575954139232635, + "learning_rate": 1.018165534373621e-05, + "loss": 0.3906, "step": 98660 }, { - "epoch": 3.47, - "learning_rate": 1.130345980601817e-05, - "loss": 0.2643, + "epoch": 3.55587991494576, + "grad_norm": 0.20875148475170135, + "learning_rate": 1.0179305184378959e-05, + "loss": 0.3759, "step": 98665 }, { - "epoch": 3.47, - "learning_rate": 1.1301076743059833e-05, - "loss": 0.2595, + "epoch": 3.556060114606985, + "grad_norm": 0.2343914955854416, + "learning_rate": 1.0176955226950799e-05, + "loss": 0.3565, "step": 98670 }, { - "epoch": 3.47, - "learning_rate": 1.1298693857973e-05, - "loss": 0.2522, + "epoch": 3.556240314268209, + "grad_norm": 0.24285919964313507, + "learning_rate": 1.0174605471483761e-05, + "loss": 0.3683, "step": 98675 }, { - "epoch": 3.47, - "learning_rate": 1.1296311150788596e-05, - "loss": 0.2526, + "epoch": 
3.556420513929434, + "grad_norm": 0.24529211223125458, + "learning_rate": 1.0172255918009862e-05, + "loss": 0.3582, "step": 98680 }, { - "epoch": 3.47, - "learning_rate": 1.1293928621537561e-05, - "loss": 0.2947, + "epoch": 3.5566007135906585, + "grad_norm": 0.23087753355503082, + "learning_rate": 1.0169906566561087e-05, + "loss": 0.3616, "step": 98685 }, { - "epoch": 3.47, - "learning_rate": 1.129154627025082e-05, - "loss": 0.2512, + "epoch": 3.556780913251883, + "grad_norm": 0.2296365201473236, + "learning_rate": 1.0167557417169476e-05, + "loss": 0.3671, "step": 98690 }, { - "epoch": 3.47, - "learning_rate": 1.1289164096959326e-05, - "loss": 0.2723, + "epoch": 3.5569611129131076, + "grad_norm": 0.27826911211013794, + "learning_rate": 1.0165208469867022e-05, + "loss": 0.3988, "step": 98695 }, { - "epoch": 3.47, - "learning_rate": 1.1286782101694002e-05, - "loss": 0.2568, + "epoch": 3.5571413125743323, + "grad_norm": 0.21545450389385223, + "learning_rate": 1.0162859724685723e-05, + "loss": 0.3612, "step": 98700 }, { - "epoch": 3.47, - "learning_rate": 1.1284400284485766e-05, - "loss": 0.2901, + "epoch": 3.557321512235557, + "grad_norm": 0.23605626821517944, + "learning_rate": 1.0160511181657604e-05, + "loss": 0.358, "step": 98705 }, { - "epoch": 3.47, - "learning_rate": 1.128201864536556e-05, - "loss": 0.2345, + "epoch": 3.5575017118967818, + "grad_norm": 0.2647882103919983, + "learning_rate": 1.0158162840814627e-05, + "loss": 0.4287, "step": 98710 }, { - "epoch": 3.47, - "learning_rate": 1.1279637184364294e-05, - "loss": 0.2344, + "epoch": 3.5576819115580065, + "grad_norm": 0.2228686660528183, + "learning_rate": 1.0155814702188818e-05, + "loss": 0.414, "step": 98715 }, { - "epoch": 3.47, - "learning_rate": 1.1277255901512907e-05, - "loss": 0.2376, + "epoch": 3.557862111219231, + "grad_norm": 0.23151619732379913, + "learning_rate": 1.0153466765812161e-05, + "loss": 0.3869, "step": 98720 }, { - "epoch": 3.47, - "learning_rate": 1.1274874796842296e-05, - "loss": 0.2682, + "epoch": 3.5580423108804555, + "grad_norm": 0.24438922107219696, + "learning_rate": 1.0151119031716646e-05, + "loss": 0.3947, "step": 98725 }, { - "epoch": 3.47, - "learning_rate": 1.1272493870383404e-05, - "loss": 0.2536, + "epoch": 3.5582225105416803, + "grad_norm": 0.2741881012916565, + "learning_rate": 1.0148771499934257e-05, + "loss": 0.3601, "step": 98730 }, { - "epoch": 3.47, - "learning_rate": 1.1270113122167131e-05, - "loss": 0.2728, + "epoch": 3.5584027102029046, + "grad_norm": 0.3307461440563202, + "learning_rate": 1.0146424170496982e-05, + "loss": 0.3836, "step": 98735 }, { - "epoch": 3.47, - "learning_rate": 1.1267732552224388e-05, - "loss": 0.2463, + "epoch": 3.5585829098641293, + "grad_norm": 0.2043096274137497, + "learning_rate": 1.0144077043436792e-05, + "loss": 0.3482, "step": 98740 }, { - "epoch": 3.47, - "learning_rate": 1.1265352160586082e-05, - "loss": 0.2672, + "epoch": 3.558763109525354, + "grad_norm": 0.24508656561374664, + "learning_rate": 1.0141730118785687e-05, + "loss": 0.3842, "step": 98745 }, { - "epoch": 3.47, - "learning_rate": 1.1262971947283135e-05, - "loss": 0.247, + "epoch": 3.5589433091865788, + "grad_norm": 0.20151925086975098, + "learning_rate": 1.013938339657563e-05, + "loss": 0.3545, "step": 98750 }, { - "epoch": 3.47, - "learning_rate": 1.1260591912346444e-05, - "loss": 0.2392, + "epoch": 3.5591235088478035, + "grad_norm": 0.22490449249744415, + "learning_rate": 1.0137036876838598e-05, + "loss": 0.3762, "step": 98755 }, { - "epoch": 3.47, - "learning_rate": 1.1258212055806904e-05, 
- "loss": 0.2306, + "epoch": 3.559303708509028, + "grad_norm": 0.21991267800331116, + "learning_rate": 1.0134690559606563e-05, + "loss": 0.3916, "step": 98760 }, { - "epoch": 3.47, - "learning_rate": 1.1255832377695433e-05, - "loss": 0.2488, + "epoch": 3.5594839081702525, + "grad_norm": 0.3030828833580017, + "learning_rate": 1.0132344444911482e-05, + "loss": 0.3619, "step": 98765 }, { - "epoch": 3.48, - "learning_rate": 1.1253452878042912e-05, - "loss": 0.2726, + "epoch": 3.5596641078314772, + "grad_norm": 0.22489933669567108, + "learning_rate": 1.0129998532785337e-05, + "loss": 0.3944, "step": 98770 }, { - "epoch": 3.48, - "learning_rate": 1.1251073556880254e-05, - "loss": 0.2537, + "epoch": 3.559844307492702, + "grad_norm": 0.21111252903938293, + "learning_rate": 1.0127652823260092e-05, + "loss": 0.3725, "step": 98775 }, { - "epoch": 3.48, - "learning_rate": 1.1248694414238346e-05, - "loss": 0.2447, + "epoch": 3.5600245071539267, + "grad_norm": 0.23496191203594208, + "learning_rate": 1.012530731636768e-05, + "loss": 0.3468, "step": 98780 }, { - "epoch": 3.48, - "learning_rate": 1.1246315450148067e-05, - "loss": 0.2559, + "epoch": 3.560204706815151, + "grad_norm": 0.22039222717285156, + "learning_rate": 1.0122962012140083e-05, + "loss": 0.3789, "step": 98785 }, { - "epoch": 3.48, - "learning_rate": 1.124393666464033e-05, - "loss": 0.2508, + "epoch": 3.5603849064763757, + "grad_norm": 0.2323601096868515, + "learning_rate": 1.0120616910609243e-05, + "loss": 0.3234, "step": 98790 }, { - "epoch": 3.48, - "learning_rate": 1.124155805774601e-05, - "loss": 0.2415, + "epoch": 3.5605651061376005, + "grad_norm": 0.21386387944221497, + "learning_rate": 1.0118272011807134e-05, + "loss": 0.405, "step": 98795 }, { - "epoch": 3.48, - "learning_rate": 1.123917962949599e-05, - "loss": 0.2632, + "epoch": 3.560745305798825, + "grad_norm": 0.22486340999603271, + "learning_rate": 1.0115927315765678e-05, + "loss": 0.3728, "step": 98800 }, { - "epoch": 3.48, - "learning_rate": 1.1236801379921145e-05, - "loss": 0.2436, + "epoch": 3.56092550546005, + "grad_norm": 0.22867098450660706, + "learning_rate": 1.011358282251682e-05, + "loss": 0.4076, "step": 98805 }, { - "epoch": 3.48, - "learning_rate": 1.1234423309052371e-05, - "loss": 0.2469, + "epoch": 3.5611057051212742, + "grad_norm": 0.2371281385421753, + "learning_rate": 1.0111238532092524e-05, + "loss": 0.3776, "step": 98810 }, { - "epoch": 3.48, - "learning_rate": 1.1232045416920542e-05, - "loss": 0.2319, + "epoch": 3.561285904782499, + "grad_norm": 0.29785680770874023, + "learning_rate": 1.0108894444524713e-05, + "loss": 0.4038, "step": 98815 }, { - "epoch": 3.48, - "learning_rate": 1.1229667703556521e-05, - "loss": 0.2872, + "epoch": 3.5614661044437237, + "grad_norm": 0.24299155175685883, + "learning_rate": 1.0106550559845348e-05, + "loss": 0.3717, "step": 98820 }, { - "epoch": 3.48, - "learning_rate": 1.1227290168991187e-05, - "loss": 0.2447, + "epoch": 3.5616463041049484, + "grad_norm": 0.19322209060192108, + "learning_rate": 1.0104206878086343e-05, + "loss": 0.3855, "step": 98825 }, { - "epoch": 3.48, - "learning_rate": 1.1224912813255422e-05, - "loss": 0.2335, + "epoch": 3.5618265037661727, + "grad_norm": 0.20190706849098206, + "learning_rate": 1.0101863399279621e-05, + "loss": 0.3884, "step": 98830 }, { - "epoch": 3.48, - "learning_rate": 1.1222535636380089e-05, - "loss": 0.2465, + "epoch": 3.5620067034273974, + "grad_norm": 0.2435564547777176, + "learning_rate": 1.0099520123457138e-05, + "loss": 0.4033, "step": 98835 }, { - "epoch": 3.48, - 
"learning_rate": 1.1220158638396042e-05, - "loss": 0.2574, + "epoch": 3.562186903088622, + "grad_norm": 0.2607412040233612, + "learning_rate": 1.0097177050650808e-05, + "loss": 0.3681, "step": 98840 }, { - "epoch": 3.48, - "learning_rate": 1.1217781819334161e-05, - "loss": 0.2807, + "epoch": 3.562367102749847, + "grad_norm": 0.20941239595413208, + "learning_rate": 1.0094834180892554e-05, + "loss": 0.3891, "step": 98845 }, { - "epoch": 3.48, - "learning_rate": 1.12154051792253e-05, - "loss": 0.2698, + "epoch": 3.5625473024110716, + "grad_norm": 0.2150138020515442, + "learning_rate": 1.0092491514214301e-05, + "loss": 0.3664, "step": 98850 }, { - "epoch": 3.48, - "learning_rate": 1.1213028718100319e-05, - "loss": 0.2538, + "epoch": 3.562727502072296, + "grad_norm": 0.2666082978248596, + "learning_rate": 1.0090149050647955e-05, + "loss": 0.3944, "step": 98855 }, { - "epoch": 3.48, - "learning_rate": 1.1210652435990065e-05, - "loss": 0.2862, + "epoch": 3.5629077017335207, + "grad_norm": 0.27906569838523865, + "learning_rate": 1.0087806790225451e-05, + "loss": 0.3951, "step": 98860 }, { - "epoch": 3.48, - "learning_rate": 1.120827633292541e-05, - "loss": 0.2738, + "epoch": 3.5630879013947454, + "grad_norm": 0.2231929451227188, + "learning_rate": 1.0085464732978692e-05, + "loss": 0.3646, "step": 98865 }, { - "epoch": 3.48, - "learning_rate": 1.12059004089372e-05, - "loss": 0.2577, + "epoch": 3.56326810105597, + "grad_norm": 0.22479088604450226, + "learning_rate": 1.0083122878939588e-05, + "loss": 0.4146, "step": 98870 }, { - "epoch": 3.48, - "learning_rate": 1.1203524664056273e-05, - "loss": 0.2518, + "epoch": 3.5634483007171944, + "grad_norm": 0.2482202798128128, + "learning_rate": 1.0080781228140045e-05, + "loss": 0.3679, "step": 98875 }, { - "epoch": 3.48, - "learning_rate": 1.1201149098313485e-05, - "loss": 0.2623, + "epoch": 3.563628500378419, + "grad_norm": 0.3211915194988251, + "learning_rate": 1.0078439780611973e-05, + "loss": 0.3653, "step": 98880 }, { - "epoch": 3.48, - "learning_rate": 1.1198773711739691e-05, - "loss": 0.2556, + "epoch": 3.563808700039644, + "grad_norm": 0.2703056037425995, + "learning_rate": 1.007609853638726e-05, + "loss": 0.3871, "step": 98885 }, { - "epoch": 3.48, - "learning_rate": 1.1196398504365727e-05, - "loss": 0.2792, + "epoch": 3.5639888997008686, + "grad_norm": 0.21698278188705444, + "learning_rate": 1.0073757495497832e-05, + "loss": 0.3286, "step": 98890 }, { - "epoch": 3.48, - "learning_rate": 1.1194023476222429e-05, - "loss": 0.2372, + "epoch": 3.5641690993620934, + "grad_norm": 0.2787526547908783, + "learning_rate": 1.007141665797555e-05, + "loss": 0.3737, "step": 98895 }, { - "epoch": 3.48, - "learning_rate": 1.1191648627340628e-05, - "loss": 0.2521, + "epoch": 3.564349299023318, + "grad_norm": 0.23529410362243652, + "learning_rate": 1.0069076023852337e-05, + "loss": 0.4027, "step": 98900 }, { - "epoch": 3.48, - "learning_rate": 1.1189273957751181e-05, - "loss": 0.2586, + "epoch": 3.5645294986845424, + "grad_norm": 0.22988703846931458, + "learning_rate": 1.006673559316007e-05, + "loss": 0.3583, "step": 98905 }, { - "epoch": 3.48, - "learning_rate": 1.118689946748491e-05, - "loss": 0.2539, + "epoch": 3.564709698345767, + "grad_norm": 0.25721538066864014, + "learning_rate": 1.006439536593064e-05, + "loss": 0.3557, "step": 98910 }, { - "epoch": 3.48, - "learning_rate": 1.1184525156572646e-05, - "loss": 0.2706, + "epoch": 3.564889898006992, + "grad_norm": 0.3110162317752838, + "learning_rate": 1.006205534219593e-05, + "loss": 0.3782, "step": 98915 }, { - 
"epoch": 3.48, - "learning_rate": 1.1182151025045206e-05, - "loss": 0.2535, + "epoch": 3.565070097668216, + "grad_norm": 0.2260642945766449, + "learning_rate": 1.0059715521987829e-05, + "loss": 0.3644, "step": 98920 }, { - "epoch": 3.48, - "learning_rate": 1.1179777072933443e-05, - "loss": 0.2416, + "epoch": 3.565250297329441, + "grad_norm": 0.2566840350627899, + "learning_rate": 1.0057375905338199e-05, + "loss": 0.3579, "step": 98925 }, { - "epoch": 3.48, - "learning_rate": 1.1177403300268152e-05, - "loss": 0.2593, + "epoch": 3.5654304969906656, + "grad_norm": 0.19224494695663452, + "learning_rate": 1.0055036492278938e-05, + "loss": 0.3589, "step": 98930 }, { - "epoch": 3.48, - "learning_rate": 1.1175029707080184e-05, - "loss": 0.259, + "epoch": 3.5656106966518903, + "grad_norm": 0.24910667538642883, + "learning_rate": 1.005269728284191e-05, + "loss": 0.3914, "step": 98935 }, { - "epoch": 3.48, - "learning_rate": 1.1172656293400333e-05, - "loss": 0.2734, + "epoch": 3.565790896313115, + "grad_norm": 0.23995643854141235, + "learning_rate": 1.0050358277058991e-05, + "loss": 0.3903, "step": 98940 }, { - "epoch": 3.48, - "learning_rate": 1.1170283059259434e-05, - "loss": 0.2653, + "epoch": 3.56597109597434, + "grad_norm": 0.29442286491394043, + "learning_rate": 1.0048019474962044e-05, + "loss": 0.3919, "step": 98945 }, { - "epoch": 3.48, - "learning_rate": 1.1167910004688298e-05, - "loss": 0.2646, + "epoch": 3.566151295635564, + "grad_norm": 0.22082147002220154, + "learning_rate": 1.004568087658293e-05, + "loss": 0.346, "step": 98950 }, { - "epoch": 3.48, - "learning_rate": 1.1165537129717727e-05, - "loss": 0.2462, + "epoch": 3.566331495296789, + "grad_norm": 0.24365797638893127, + "learning_rate": 1.0043342481953525e-05, + "loss": 0.3524, "step": 98955 }, { - "epoch": 3.48, - "learning_rate": 1.1163164434378545e-05, - "loss": 0.2366, + "epoch": 3.5665116949580136, + "grad_norm": 0.21891754865646362, + "learning_rate": 1.0041004291105693e-05, + "loss": 0.3475, "step": 98960 }, { - "epoch": 3.48, - "learning_rate": 1.1160791918701557e-05, - "loss": 0.2477, + "epoch": 3.566691894619238, + "grad_norm": 0.22469094395637512, + "learning_rate": 1.0038666304071265e-05, + "loss": 0.4066, "step": 98965 }, { - "epoch": 3.48, - "learning_rate": 1.1158419582717563e-05, - "loss": 0.2542, + "epoch": 3.5668720942804626, + "grad_norm": 0.2404364049434662, + "learning_rate": 1.0036328520882119e-05, + "loss": 0.4111, "step": 98970 }, { - "epoch": 3.48, - "learning_rate": 1.1156047426457363e-05, - "loss": 0.2579, + "epoch": 3.5670522939416873, + "grad_norm": 0.22921812534332275, + "learning_rate": 1.0033990941570093e-05, + "loss": 0.3701, "step": 98975 }, { - "epoch": 3.48, - "learning_rate": 1.115367544995177e-05, - "loss": 0.2721, + "epoch": 3.567232493602912, + "grad_norm": 0.20173293352127075, + "learning_rate": 1.0031653566167048e-05, + "loss": 0.373, "step": 98980 }, { - "epoch": 3.48, - "learning_rate": 1.1151303653231568e-05, - "loss": 0.2704, + "epoch": 3.567412693264137, + "grad_norm": 0.23841802775859833, + "learning_rate": 1.0029316394704839e-05, + "loss": 0.3717, "step": 98985 }, { - "epoch": 3.48, - "learning_rate": 1.1148932036327573e-05, - "loss": 0.2792, + "epoch": 3.5675928929253615, + "grad_norm": 0.25082719326019287, + "learning_rate": 1.0026979427215275e-05, + "loss": 0.4165, "step": 98990 }, { - "epoch": 3.48, - "learning_rate": 1.1146560599270553e-05, - "loss": 0.252, + "epoch": 3.567773092586586, + "grad_norm": 0.23590582609176636, + "learning_rate": 1.0024642663730227e-05, + "loss": 
0.3838, "step": 98995 }, { - "epoch": 3.48, - "learning_rate": 1.1144189342091328e-05, - "loss": 0.2534, + "epoch": 3.5679532922478105, + "grad_norm": 0.25584080815315247, + "learning_rate": 1.0022306104281523e-05, + "loss": 0.3765, "step": 99000 }, { - "epoch": 3.48, - "eval_loss": 0.25207310914993286, - "eval_runtime": 10.551, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 3.5679532922478105, + "eval_loss": 0.43000587821006775, + "eval_runtime": 3.5299, + "eval_samples_per_second": 28.33, + "eval_steps_per_second": 7.082, "step": 99000 }, { - "epoch": 3.48, - "learning_rate": 1.114181826482067e-05, - "loss": 0.2452, + "epoch": 3.5681334919090353, + "grad_norm": 0.22017063200473785, + "learning_rate": 1.0019969748900998e-05, + "loss": 0.3714, "step": 99005 }, { - "epoch": 3.48, - "learning_rate": 1.113944736748937e-05, - "loss": 0.2669, + "epoch": 3.5683136915702596, + "grad_norm": 0.25801563262939453, + "learning_rate": 1.0017633597620485e-05, + "loss": 0.3674, "step": 99010 }, { - "epoch": 3.48, - "learning_rate": 1.11370766501282e-05, - "loss": 0.2615, + "epoch": 3.5684938912314843, + "grad_norm": 0.2582071125507355, + "learning_rate": 1.0015297650471805e-05, + "loss": 0.3786, "step": 99015 }, { - "epoch": 3.48, - "learning_rate": 1.1134706112767965e-05, - "loss": 0.2689, + "epoch": 3.568674090892709, + "grad_norm": 0.2171230912208557, + "learning_rate": 1.0012961907486804e-05, + "loss": 0.3997, "step": 99020 }, { - "epoch": 3.48, - "learning_rate": 1.1132335755439435e-05, - "loss": 0.2541, + "epoch": 3.5688542905539338, + "grad_norm": 0.20874454081058502, + "learning_rate": 1.0010626368697292e-05, + "loss": 0.4006, "step": 99025 }, { - "epoch": 3.48, - "learning_rate": 1.1129965578173385e-05, - "loss": 0.27, + "epoch": 3.5690344902151585, + "grad_norm": 0.21824629604816437, + "learning_rate": 1.0008291034135098e-05, + "loss": 0.3733, "step": 99030 }, { - "epoch": 3.48, - "learning_rate": 1.112759558100058e-05, - "loss": 0.2776, + "epoch": 3.5692146898763832, + "grad_norm": 0.3257184326648712, + "learning_rate": 1.0005955903832031e-05, + "loss": 0.3853, "step": 99035 }, { - "epoch": 3.48, - "learning_rate": 1.1125225763951805e-05, - "loss": 0.2625, + "epoch": 3.5693948895376075, + "grad_norm": 0.21955911815166473, + "learning_rate": 1.0003620977819908e-05, + "loss": 0.3398, "step": 99040 }, { - "epoch": 3.48, - "learning_rate": 1.112285612705784e-05, - "loss": 0.2843, + "epoch": 3.5695750891988323, + "grad_norm": 0.2739407420158386, + "learning_rate": 1.0001286256130551e-05, + "loss": 0.3881, "step": 99045 }, { - "epoch": 3.48, - "learning_rate": 1.1120486670349442e-05, - "loss": 0.2572, + "epoch": 3.569755288860057, + "grad_norm": 0.2548768222332001, + "learning_rate": 9.998951738795767e-06, + "loss": 0.3858, "step": 99050 }, { - "epoch": 3.49, - "learning_rate": 1.1118117393857369e-05, - "loss": 0.2688, + "epoch": 3.5699354885212817, + "grad_norm": 0.22001412510871887, + "learning_rate": 9.996617425847363e-06, + "loss": 0.3921, "step": 99055 }, { - "epoch": 3.49, - "learning_rate": 1.1115748297612403e-05, - "loss": 0.2671, + "epoch": 3.570115688182506, + "grad_norm": 0.2573753595352173, + "learning_rate": 9.994283317317138e-06, + "loss": 0.3646, "step": 99060 }, { - "epoch": 3.49, - "learning_rate": 1.1113379381645294e-05, - "loss": 0.2572, + "epoch": 3.5702958878437308, + "grad_norm": 0.30579379200935364, + "learning_rate": 9.991949413236898e-06, + "loss": 0.4174, "step": 99065 }, { - "epoch": 3.49, - "learning_rate": 1.1111010645986794e-05, - 
"loss": 0.2505, + "epoch": 3.5704760875049555, + "grad_norm": 0.22438612580299377, + "learning_rate": 9.989615713638434e-06, + "loss": 0.3636, "step": 99070 }, { - "epoch": 3.49, - "learning_rate": 1.1108642090667676e-05, - "loss": 0.2631, + "epoch": 3.57065628716618, + "grad_norm": 0.25755026936531067, + "learning_rate": 9.987282218553568e-06, + "loss": 0.3919, "step": 99075 }, { - "epoch": 3.49, - "learning_rate": 1.1106273715718685e-05, - "loss": 0.2634, + "epoch": 3.570836486827405, + "grad_norm": 0.20849241316318512, + "learning_rate": 9.984948928014057e-06, + "loss": 0.3864, "step": 99080 }, { - "epoch": 3.49, - "learning_rate": 1.1103905521170574e-05, - "loss": 0.2694, + "epoch": 3.5710166864886292, + "grad_norm": 0.2606984078884125, + "learning_rate": 9.982615842051719e-06, + "loss": 0.3855, "step": 99085 }, { - "epoch": 3.49, - "learning_rate": 1.1101537507054083e-05, - "loss": 0.2526, + "epoch": 3.571196886149854, + "grad_norm": 0.224889874458313, + "learning_rate": 9.98028296069833e-06, + "loss": 0.3739, "step": 99090 }, { - "epoch": 3.49, - "learning_rate": 1.1099169673399967e-05, - "loss": 0.2336, + "epoch": 3.5713770858110787, + "grad_norm": 0.27115222811698914, + "learning_rate": 9.977950283985673e-06, + "loss": 0.3709, "step": 99095 }, { - "epoch": 3.49, - "learning_rate": 1.1096802020238984e-05, - "loss": 0.2396, + "epoch": 3.5715572854723034, + "grad_norm": 0.22326913475990295, + "learning_rate": 9.97561781194555e-06, + "loss": 0.3937, "step": 99100 }, { - "epoch": 3.49, - "learning_rate": 1.1094434547601862e-05, - "loss": 0.2503, + "epoch": 3.5717374851335277, + "grad_norm": 0.22230251133441925, + "learning_rate": 9.97328554460972e-06, + "loss": 0.362, "step": 99105 }, { - "epoch": 3.49, - "learning_rate": 1.1092067255519333e-05, - "loss": 0.2572, + "epoch": 3.5719176847947525, + "grad_norm": 0.252175509929657, + "learning_rate": 9.970953482009953e-06, + "loss": 0.3838, "step": 99110 }, { - "epoch": 3.49, - "learning_rate": 1.1089700144022155e-05, - "loss": 0.2532, + "epoch": 3.572097884455977, + "grad_norm": 0.24773818254470825, + "learning_rate": 9.968621624178046e-06, + "loss": 0.3791, "step": 99115 }, { - "epoch": 3.49, - "learning_rate": 1.1087333213141055e-05, - "loss": 0.2654, + "epoch": 3.572278084117202, + "grad_norm": 0.2345571517944336, + "learning_rate": 9.966289971145756e-06, + "loss": 0.3671, "step": 99120 }, { - "epoch": 3.49, - "learning_rate": 1.1084966462906765e-05, - "loss": 0.2429, + "epoch": 3.5724582837784267, + "grad_norm": 0.2684936821460724, + "learning_rate": 9.963958522944858e-06, + "loss": 0.3666, "step": 99125 }, { - "epoch": 3.49, - "learning_rate": 1.1082599893350005e-05, - "loss": 0.248, + "epoch": 3.5726384834396514, + "grad_norm": 0.2345341295003891, + "learning_rate": 9.961627279607111e-06, + "loss": 0.3676, "step": 99130 }, { - "epoch": 3.49, - "learning_rate": 1.1080233504501519e-05, - "loss": 0.2411, + "epoch": 3.5728186831008757, + "grad_norm": 0.22453191876411438, + "learning_rate": 9.959296241164273e-06, + "loss": 0.3522, "step": 99135 }, { - "epoch": 3.49, - "learning_rate": 1.107786729639203e-05, - "loss": 0.2428, + "epoch": 3.5729988827621004, + "grad_norm": 0.26160648465156555, + "learning_rate": 9.956965407648122e-06, + "loss": 0.3757, "step": 99140 }, { - "epoch": 3.49, - "learning_rate": 1.1075501269052252e-05, - "loss": 0.2668, + "epoch": 3.573179082423325, + "grad_norm": 0.24852387607097626, + "learning_rate": 9.954634779090404e-06, + "loss": 0.4034, "step": 99145 }, { - "epoch": 3.49, - "learning_rate": 
1.107313542251292e-05, - "loss": 0.2729, + "epoch": 3.5733592820845494, + "grad_norm": 0.22410407662391663, + "learning_rate": 9.952304355522876e-06, + "loss": 0.4013, "step": 99150 }, { - "epoch": 3.49, - "learning_rate": 1.1070769756804739e-05, - "loss": 0.2708, + "epoch": 3.573539481745774, + "grad_norm": 0.2551601529121399, + "learning_rate": 9.949974136977286e-06, + "loss": 0.3943, "step": 99155 }, { - "epoch": 3.49, - "learning_rate": 1.106840427195844e-05, - "loss": 0.2568, + "epoch": 3.573719681406999, + "grad_norm": 0.20935653150081635, + "learning_rate": 9.947644123485376e-06, + "loss": 0.3551, "step": 99160 }, { - "epoch": 3.49, - "learning_rate": 1.1066038968004734e-05, - "loss": 0.2346, + "epoch": 3.5738998810682236, + "grad_norm": 0.24979546666145325, + "learning_rate": 9.945314315078907e-06, + "loss": 0.3857, "step": 99165 }, { - "epoch": 3.49, - "learning_rate": 1.1063673844974318e-05, - "loss": 0.2624, + "epoch": 3.5740800807294484, + "grad_norm": 0.2232057750225067, + "learning_rate": 9.94298471178963e-06, + "loss": 0.3694, "step": 99170 }, { - "epoch": 3.49, - "learning_rate": 1.1061308902897922e-05, - "loss": 0.2456, + "epoch": 3.574260280390673, + "grad_norm": 0.20639793574810028, + "learning_rate": 9.94065531364925e-06, + "loss": 0.3668, "step": 99175 }, { - "epoch": 3.49, - "learning_rate": 1.1058944141806243e-05, - "loss": 0.2458, + "epoch": 3.5744404800518974, + "grad_norm": 0.2616964876651764, + "learning_rate": 9.938326120689534e-06, + "loss": 0.3901, "step": 99180 }, { - "epoch": 3.49, - "learning_rate": 1.1056579561729976e-05, - "loss": 0.27, + "epoch": 3.574620679713122, + "grad_norm": 0.2324073314666748, + "learning_rate": 9.93599713294221e-06, + "loss": 0.3556, "step": 99185 }, { - "epoch": 3.49, - "learning_rate": 1.1054215162699846e-05, - "loss": 0.2613, + "epoch": 3.574800879374347, + "grad_norm": 0.23615048825740814, + "learning_rate": 9.933668350439008e-06, + "loss": 0.3786, "step": 99190 }, { - "epoch": 3.49, - "learning_rate": 1.1051850944746543e-05, - "loss": 0.2632, + "epoch": 3.574981079035571, + "grad_norm": 0.21338878571987152, + "learning_rate": 9.931339773211657e-06, + "loss": 0.3847, "step": 99195 }, { - "epoch": 3.49, - "learning_rate": 1.1049486907900752e-05, - "loss": 0.2627, + "epoch": 3.575161278696796, + "grad_norm": 0.2527485489845276, + "learning_rate": 9.929011401291877e-06, + "loss": 0.384, "step": 99200 }, { - "epoch": 3.49, - "learning_rate": 1.104712305219319e-05, - "loss": 0.2485, + "epoch": 3.5753414783580206, + "grad_norm": 0.20677971839904785, + "learning_rate": 9.926683234711406e-06, + "loss": 0.4078, "step": 99205 }, { - "epoch": 3.49, - "learning_rate": 1.104475937765453e-05, - "loss": 0.2587, + "epoch": 3.5755216780192454, + "grad_norm": 0.2947938144207001, + "learning_rate": 9.92435527350196e-06, + "loss": 0.3631, "step": 99210 }, { - "epoch": 3.49, - "learning_rate": 1.1042395884315481e-05, - "loss": 0.2514, + "epoch": 3.57570187768047, + "grad_norm": 0.21625371277332306, + "learning_rate": 9.922027517695253e-06, + "loss": 0.3614, "step": 99215 }, { - "epoch": 3.49, - "learning_rate": 1.1040032572206724e-05, - "loss": 0.2589, + "epoch": 3.575882077341695, + "grad_norm": 0.21706773340702057, + "learning_rate": 9.919699967323001e-06, + "loss": 0.3756, "step": 99220 }, { - "epoch": 3.49, - "learning_rate": 1.1037669441358933e-05, - "loss": 0.2746, + "epoch": 3.576062277002919, + "grad_norm": 0.24126207828521729, + "learning_rate": 9.917372622416912e-06, + "loss": 0.3679, "step": 99225 }, { - "epoch": 3.49, - 
"learning_rate": 1.1035306491802815e-05, - "loss": 0.2571, + "epoch": 3.576242476664144, + "grad_norm": 0.20370317995548248, + "learning_rate": 9.915045483008706e-06, + "loss": 0.3477, "step": 99230 }, { - "epoch": 3.49, - "learning_rate": 1.1032943723569039e-05, - "loss": 0.2562, + "epoch": 3.5764226763253686, + "grad_norm": 0.2700237035751343, + "learning_rate": 9.912718549130088e-06, + "loss": 0.3631, "step": 99235 }, { - "epoch": 3.49, - "learning_rate": 1.103058113668828e-05, - "loss": 0.2581, + "epoch": 3.576602875986593, + "grad_norm": 0.21943305432796478, + "learning_rate": 9.910391820812756e-06, + "loss": 0.3704, "step": 99240 }, { - "epoch": 3.49, - "learning_rate": 1.1028218731191215e-05, - "loss": 0.2801, + "epoch": 3.5767830756478176, + "grad_norm": 0.22363239526748657, + "learning_rate": 9.908065298088414e-06, + "loss": 0.3915, "step": 99245 }, { - "epoch": 3.49, - "learning_rate": 1.1025856507108529e-05, - "loss": 0.2654, + "epoch": 3.5769632753090423, + "grad_norm": 0.2681196928024292, + "learning_rate": 9.905738980988763e-06, + "loss": 0.3785, "step": 99250 }, { - "epoch": 3.49, - "learning_rate": 1.1023494464470885e-05, - "loss": 0.2478, + "epoch": 3.577143474970267, + "grad_norm": 0.2544713318347931, + "learning_rate": 9.903412869545484e-06, + "loss": 0.3965, "step": 99255 }, { - "epoch": 3.49, - "learning_rate": 1.1021132603308949e-05, - "loss": 0.2558, + "epoch": 3.577323674631492, + "grad_norm": 0.254402220249176, + "learning_rate": 9.901086963790294e-06, + "loss": 0.4389, "step": 99260 }, { - "epoch": 3.49, - "learning_rate": 1.1018770923653391e-05, - "loss": 0.269, + "epoch": 3.5775038742927165, + "grad_norm": 0.2791866064071655, + "learning_rate": 9.89876126375487e-06, + "loss": 0.4082, "step": 99265 }, { - "epoch": 3.49, - "learning_rate": 1.1016409425534888e-05, - "loss": 0.2642, + "epoch": 3.577684073953941, + "grad_norm": 0.2096082866191864, + "learning_rate": 9.896435769470897e-06, + "loss": 0.3456, "step": 99270 }, { - "epoch": 3.49, - "learning_rate": 1.1014048108984093e-05, - "loss": 0.2572, + "epoch": 3.5778642736151656, + "grad_norm": 0.20750059187412262, + "learning_rate": 9.894110480970064e-06, + "loss": 0.3667, "step": 99275 }, { - "epoch": 3.49, - "learning_rate": 1.1011686974031668e-05, - "loss": 0.2507, + "epoch": 3.5780444732763903, + "grad_norm": 0.22872807085514069, + "learning_rate": 9.891785398284045e-06, + "loss": 0.348, "step": 99280 }, { - "epoch": 3.49, - "learning_rate": 1.1009326020708257e-05, - "loss": 0.2368, + "epoch": 3.578224672937615, + "grad_norm": 0.2133398801088333, + "learning_rate": 9.889460521444541e-06, + "loss": 0.3775, "step": 99285 }, { - "epoch": 3.49, - "learning_rate": 1.100696524904454e-05, - "loss": 0.2572, + "epoch": 3.5784048725988393, + "grad_norm": 0.24460799992084503, + "learning_rate": 9.8871358504832e-06, + "loss": 0.3606, "step": 99290 }, { - "epoch": 3.49, - "learning_rate": 1.1004604659071155e-05, - "loss": 0.2535, + "epoch": 3.578585072260064, + "grad_norm": 0.25514861941337585, + "learning_rate": 9.884811385431703e-06, + "loss": 0.3779, "step": 99295 }, { - "epoch": 3.49, - "learning_rate": 1.1002244250818746e-05, - "loss": 0.2556, + "epoch": 3.578765271921289, + "grad_norm": 0.2582806944847107, + "learning_rate": 9.88248712632173e-06, + "loss": 0.3921, "step": 99300 }, { - "epoch": 3.49, - "learning_rate": 1.099988402431798e-05, - "loss": 0.2535, + "epoch": 3.5789454715825135, + "grad_norm": 0.21507005393505096, + "learning_rate": 9.880163073184945e-06, + "loss": 0.3696, "step": 99305 }, { - "epoch": 
3.49, - "learning_rate": 1.0997523979599494e-05, - "loss": 0.2563, + "epoch": 3.5791256712437383, + "grad_norm": 0.2389105260372162, + "learning_rate": 9.87783922605301e-06, + "loss": 0.3775, "step": 99310 }, { - "epoch": 3.49, - "learning_rate": 1.0995164116693924e-05, - "loss": 0.261, + "epoch": 3.5793058709049625, + "grad_norm": 0.24397464096546173, + "learning_rate": 9.875515584957587e-06, + "loss": 0.4102, "step": 99315 }, { - "epoch": 3.49, - "learning_rate": 1.0992804435631915e-05, - "loss": 0.2546, + "epoch": 3.5794860705661873, + "grad_norm": 0.262635201215744, + "learning_rate": 9.87319214993033e-06, + "loss": 0.3625, "step": 99320 }, { - "epoch": 3.49, - "learning_rate": 1.099044493644412e-05, - "loss": 0.2593, + "epoch": 3.579666270227412, + "grad_norm": 0.23571620881557465, + "learning_rate": 9.870868921002908e-06, + "loss": 0.4029, "step": 99325 }, { - "epoch": 3.49, - "learning_rate": 1.0988085619161165e-05, - "loss": 0.2948, + "epoch": 3.5798464698886368, + "grad_norm": 0.20199549198150635, + "learning_rate": 9.868545898206968e-06, + "loss": 0.3573, "step": 99330 }, { - "epoch": 3.49, - "learning_rate": 1.0985726483813683e-05, - "loss": 0.2426, + "epoch": 3.580026669549861, + "grad_norm": 0.262723445892334, + "learning_rate": 9.86622308157416e-06, + "loss": 0.3837, "step": 99335 }, { - "epoch": 3.5, - "learning_rate": 1.0983367530432298e-05, - "loss": 0.2673, + "epoch": 3.5802068692110858, + "grad_norm": 0.2516125440597534, + "learning_rate": 9.863900471136135e-06, + "loss": 0.367, "step": 99340 }, { - "epoch": 3.5, - "learning_rate": 1.098100875904766e-05, - "loss": 0.2426, + "epoch": 3.5803870688723105, + "grad_norm": 0.2330031394958496, + "learning_rate": 9.861578066924524e-06, + "loss": 0.3491, "step": 99345 }, { - "epoch": 3.5, - "learning_rate": 1.097865016969038e-05, - "loss": 0.259, + "epoch": 3.5805672685335352, + "grad_norm": 0.2417108565568924, + "learning_rate": 9.85925586897099e-06, + "loss": 0.4232, "step": 99350 }, { - "epoch": 3.5, - "learning_rate": 1.0976291762391092e-05, - "loss": 0.2576, + "epoch": 3.58074746819476, + "grad_norm": 0.3049721121788025, + "learning_rate": 9.856933877307173e-06, + "loss": 0.3506, "step": 99355 }, { - "epoch": 3.5, - "learning_rate": 1.09739335371804e-05, - "loss": 0.2639, + "epoch": 3.5809276678559843, + "grad_norm": 0.20108012855052948, + "learning_rate": 9.854612091964683e-06, + "loss": 0.3765, "step": 99360 }, { - "epoch": 3.5, - "learning_rate": 1.0971575494088948e-05, - "loss": 0.2561, + "epoch": 3.581107867517209, + "grad_norm": 0.18804548680782318, + "learning_rate": 9.852290512975179e-06, + "loss": 0.3871, "step": 99365 }, { - "epoch": 3.5, - "learning_rate": 1.0969217633147336e-05, - "loss": 0.2426, + "epoch": 3.5812880671784337, + "grad_norm": 0.24155212938785553, + "learning_rate": 9.849969140370286e-06, + "loss": 0.3778, "step": 99370 }, { - "epoch": 3.5, - "learning_rate": 1.0966859954386194e-05, - "loss": 0.2757, + "epoch": 3.5814682668396585, + "grad_norm": 0.2706049680709839, + "learning_rate": 9.847647974181626e-06, + "loss": 0.3831, "step": 99375 }, { - "epoch": 3.5, - "learning_rate": 1.0964502457836118e-05, - "loss": 0.2638, + "epoch": 3.5816484665008828, + "grad_norm": 0.26998743414878845, + "learning_rate": 9.845327014440834e-06, + "loss": 0.4087, "step": 99380 }, { - "epoch": 3.5, - "learning_rate": 1.0962145143527735e-05, - "loss": 0.2468, + "epoch": 3.5818286661621075, + "grad_norm": 0.21241755783557892, + "learning_rate": 9.843006261179513e-06, + "loss": 0.388, "step": 99385 }, { - "epoch": 3.5, - 
"learning_rate": 1.0959788011491649e-05, - "loss": 0.2661, + "epoch": 3.5820088658233322, + "grad_norm": 0.261828750371933, + "learning_rate": 9.84068571442931e-06, + "loss": 0.3801, "step": 99390 }, { - "epoch": 3.5, - "learning_rate": 1.095743106175846e-05, - "loss": 0.2553, + "epoch": 3.582189065484557, + "grad_norm": 0.26328206062316895, + "learning_rate": 9.838365374221827e-06, + "loss": 0.4188, "step": 99395 }, { - "epoch": 3.5, - "learning_rate": 1.0955074294358769e-05, - "loss": 0.2562, + "epoch": 3.5823692651457817, + "grad_norm": 0.26441022753715515, + "learning_rate": 9.836045240588684e-06, + "loss": 0.3694, "step": 99400 }, { - "epoch": 3.5, - "learning_rate": 1.095271770932319e-05, - "loss": 0.256, + "epoch": 3.5825494648070064, + "grad_norm": 0.21081748604774475, + "learning_rate": 9.833725313561487e-06, + "loss": 0.33, "step": 99405 }, { - "epoch": 3.5, - "learning_rate": 1.0950361306682313e-05, - "loss": 0.2602, + "epoch": 3.5827296644682307, + "grad_norm": 0.23671510815620422, + "learning_rate": 9.831405593171836e-06, + "loss": 0.4302, "step": 99410 }, { - "epoch": 3.5, - "learning_rate": 1.0948005086466737e-05, - "loss": 0.2556, + "epoch": 3.5829098641294554, + "grad_norm": 0.2610369920730591, + "learning_rate": 9.829086079451357e-06, + "loss": 0.368, "step": 99415 }, { - "epoch": 3.5, - "learning_rate": 1.0945649048707044e-05, - "loss": 0.2603, + "epoch": 3.58309006379068, + "grad_norm": 0.22345009446144104, + "learning_rate": 9.826766772431643e-06, + "loss": 0.3939, "step": 99420 }, { - "epoch": 3.5, - "learning_rate": 1.0943293193433834e-05, - "loss": 0.2637, + "epoch": 3.5832702634519045, + "grad_norm": 0.28314733505249023, + "learning_rate": 9.824447672144293e-06, + "loss": 0.3636, "step": 99425 }, { - "epoch": 3.5, - "learning_rate": 1.094093752067771e-05, - "loss": 0.2701, + "epoch": 3.583450463113129, + "grad_norm": 0.26532143354415894, + "learning_rate": 9.822128778620907e-06, + "loss": 0.386, "step": 99430 }, { - "epoch": 3.5, - "learning_rate": 1.0938582030469238e-05, - "loss": 0.2635, + "epoch": 3.583630662774354, + "grad_norm": 0.23965269327163696, + "learning_rate": 9.819810091893078e-06, + "loss": 0.4179, "step": 99435 }, { - "epoch": 3.5, - "learning_rate": 1.0936226722839018e-05, - "loss": 0.2507, + "epoch": 3.5838108624355787, + "grad_norm": 0.23256781697273254, + "learning_rate": 9.817491611992386e-06, + "loss": 0.3622, "step": 99440 }, { - "epoch": 3.5, - "learning_rate": 1.093387159781763e-05, - "loss": 0.2497, + "epoch": 3.5839910620968034, + "grad_norm": 0.2113410234451294, + "learning_rate": 9.815173338950442e-06, + "loss": 0.3808, "step": 99445 }, { - "epoch": 3.5, - "learning_rate": 1.0931516655435645e-05, - "loss": 0.253, + "epoch": 3.584171261758028, + "grad_norm": 0.21958471834659576, + "learning_rate": 9.812855272798822e-06, + "loss": 0.3558, "step": 99450 }, { - "epoch": 3.5, - "learning_rate": 1.0929161895723639e-05, - "loss": 0.2477, + "epoch": 3.5843514614192524, + "grad_norm": 0.21652553975582123, + "learning_rate": 9.810537413569107e-06, + "loss": 0.3874, "step": 99455 }, { - "epoch": 3.5, - "learning_rate": 1.0926807318712199e-05, - "loss": 0.2548, + "epoch": 3.584531661080477, + "grad_norm": 0.23006175458431244, + "learning_rate": 9.80821976129288e-06, + "loss": 0.3678, "step": 99460 }, { - "epoch": 3.5, - "learning_rate": 1.0924452924431895e-05, - "loss": 0.2588, + "epoch": 3.584711860741702, + "grad_norm": 0.22379863262176514, + "learning_rate": 9.805902316001712e-06, + "loss": 0.3814, "step": 99465 }, { - "epoch": 3.5, - 
"learning_rate": 1.0922098712913289e-05, - "loss": 0.2448, + "epoch": 3.584892060402926, + "grad_norm": 0.26252281665802, + "learning_rate": 9.803585077727195e-06, + "loss": 0.3827, "step": 99470 }, { - "epoch": 3.5, - "learning_rate": 1.0919744684186947e-05, - "loss": 0.2646, + "epoch": 3.585072260064151, + "grad_norm": 0.2835235893726349, + "learning_rate": 9.801268046500884e-06, + "loss": 0.3678, "step": 99475 }, { - "epoch": 3.5, - "learning_rate": 1.0917390838283442e-05, - "loss": 0.2531, + "epoch": 3.5852524597253757, + "grad_norm": 0.25666457414627075, + "learning_rate": 9.798951222354344e-06, + "loss": 0.3539, "step": 99480 }, { - "epoch": 3.5, - "learning_rate": 1.0915037175233345e-05, - "loss": 0.2822, + "epoch": 3.5854326593866004, + "grad_norm": 0.26046478748321533, + "learning_rate": 9.796634605319158e-06, + "loss": 0.3828, "step": 99485 }, { - "epoch": 3.5, - "learning_rate": 1.0912683695067205e-05, - "loss": 0.2679, + "epoch": 3.585612859047825, + "grad_norm": 0.23460224270820618, + "learning_rate": 9.794318195426882e-06, + "loss": 0.3791, "step": 99490 }, { - "epoch": 3.5, - "learning_rate": 1.0910330397815576e-05, - "loss": 0.2748, + "epoch": 3.58579305870905, + "grad_norm": 0.28312990069389343, + "learning_rate": 9.79200199270908e-06, + "loss": 0.4117, "step": 99495 }, { - "epoch": 3.5, - "learning_rate": 1.0907977283509032e-05, - "loss": 0.2566, + "epoch": 3.585973258370274, + "grad_norm": 0.2613278925418854, + "learning_rate": 9.7896859971973e-06, + "loss": 0.3978, "step": 99500 }, { - "epoch": 3.5, - "eval_loss": 0.25208380818367004, - "eval_runtime": 10.545, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 3.585973258370274, + "eval_loss": 0.4298833906650543, + "eval_runtime": 3.5272, + "eval_samples_per_second": 28.351, + "eval_steps_per_second": 7.088, "step": 99500 }, { - "epoch": 3.5, - "learning_rate": 1.0905624352178115e-05, - "loss": 0.2539, + "epoch": 3.586153458031499, + "grad_norm": 0.2599956691265106, + "learning_rate": 9.787370208923099e-06, + "loss": 0.393, "step": 99505 }, { - "epoch": 3.5, - "learning_rate": 1.0903271603853376e-05, - "loss": 0.2521, + "epoch": 3.5863336576927236, + "grad_norm": 0.2776644229888916, + "learning_rate": 9.785054627918044e-06, + "loss": 0.4205, "step": 99510 }, { - "epoch": 3.5, - "learning_rate": 1.0900919038565357e-05, - "loss": 0.2631, + "epoch": 3.586513857353948, + "grad_norm": 0.27284055948257446, + "learning_rate": 9.782739254213672e-06, + "loss": 0.3999, "step": 99515 }, { - "epoch": 3.5, - "learning_rate": 1.0898566656344622e-05, - "loss": 0.269, + "epoch": 3.5866940570151726, + "grad_norm": 0.24026501178741455, + "learning_rate": 9.780424087841533e-06, + "loss": 0.378, "step": 99520 }, { - "epoch": 3.5, - "learning_rate": 1.0896214457221706e-05, - "loss": 0.2594, + "epoch": 3.5868742566763974, + "grad_norm": 0.26283571124076843, + "learning_rate": 9.778109128833166e-06, + "loss": 0.3963, "step": 99525 }, { - "epoch": 3.5, - "learning_rate": 1.089386244122714e-05, - "loss": 0.2525, + "epoch": 3.587054456337622, + "grad_norm": 0.22547613084316254, + "learning_rate": 9.77579437722011e-06, + "loss": 0.3896, "step": 99530 }, { - "epoch": 3.5, - "learning_rate": 1.0891510608391475e-05, - "loss": 0.2582, + "epoch": 3.587234655998847, + "grad_norm": 0.2306743711233139, + "learning_rate": 9.773479833033913e-06, + "loss": 0.4029, "step": 99535 }, { - "epoch": 3.5, - "learning_rate": 1.0889158958745257e-05, - "loss": 0.2852, + "epoch": 3.5874148556600716, + "grad_norm": 0.24043156206607819, + 
"learning_rate": 9.771165496306118e-06, + "loss": 0.3885, "step": 99540 }, { - "epoch": 3.5, - "learning_rate": 1.088680749231901e-05, - "loss": 0.2508, + "epoch": 3.587595055321296, + "grad_norm": 0.18582390248775482, + "learning_rate": 9.768851367068224e-06, + "loss": 0.3691, "step": 99545 }, { - "epoch": 3.5, - "learning_rate": 1.0884456209143257e-05, - "loss": 0.263, + "epoch": 3.5877752549825206, + "grad_norm": 0.24359643459320068, + "learning_rate": 9.766537445351792e-06, + "loss": 0.402, "step": 99550 }, { - "epoch": 3.5, - "learning_rate": 1.0882105109248547e-05, - "loss": 0.2546, + "epoch": 3.5879554546437453, + "grad_norm": 0.16479088366031647, + "learning_rate": 9.764223731188337e-06, + "loss": 0.398, "step": 99555 }, { - "epoch": 3.5, - "learning_rate": 1.0879754192665397e-05, - "loss": 0.2612, + "epoch": 3.58813565430497, + "grad_norm": 0.23543262481689453, + "learning_rate": 9.761910224609374e-06, + "loss": 0.3722, "step": 99560 }, { - "epoch": 3.5, - "learning_rate": 1.0877403459424335e-05, - "loss": 0.2559, + "epoch": 3.5883158539661943, + "grad_norm": 0.26243576407432556, + "learning_rate": 9.759596925646456e-06, + "loss": 0.3961, "step": 99565 }, { - "epoch": 3.5, - "learning_rate": 1.0875052909555872e-05, - "loss": 0.2799, + "epoch": 3.588496053627419, + "grad_norm": 0.28197166323661804, + "learning_rate": 9.757283834331057e-06, + "loss": 0.3743, "step": 99570 }, { - "epoch": 3.5, - "learning_rate": 1.0872702543090547e-05, - "loss": 0.2395, + "epoch": 3.588676253288644, + "grad_norm": 0.24312157928943634, + "learning_rate": 9.754970950694725e-06, + "loss": 0.4079, "step": 99575 }, { - "epoch": 3.5, - "learning_rate": 1.087035236005887e-05, - "loss": 0.251, + "epoch": 3.5888564529498685, + "grad_norm": 0.21838116645812988, + "learning_rate": 9.752658274768964e-06, + "loss": 0.365, "step": 99580 }, { - "epoch": 3.5, - "learning_rate": 1.0868002360491344e-05, - "loss": 0.2693, + "epoch": 3.5890366526110933, + "grad_norm": 0.2255815863609314, + "learning_rate": 9.75034580658528e-06, + "loss": 0.3944, "step": 99585 }, { - "epoch": 3.5, - "learning_rate": 1.0865652544418506e-05, - "loss": 0.2612, + "epoch": 3.5892168522723176, + "grad_norm": 0.2602185308933258, + "learning_rate": 9.748033546175182e-06, + "loss": 0.3865, "step": 99590 }, { - "epoch": 3.5, - "learning_rate": 1.0863302911870846e-05, - "loss": 0.2632, + "epoch": 3.5893970519335423, + "grad_norm": 0.1989058256149292, + "learning_rate": 9.745721493570176e-06, + "loss": 0.3411, "step": 99595 }, { - "epoch": 3.5, - "learning_rate": 1.0860953462878889e-05, - "loss": 0.2443, + "epoch": 3.589577251594767, + "grad_norm": 0.25254446268081665, + "learning_rate": 9.743409648801749e-06, + "loss": 0.389, "step": 99600 }, { - "epoch": 3.5, - "learning_rate": 1.0858604197473135e-05, - "loss": 0.2425, + "epoch": 3.5897574512559918, + "grad_norm": 0.2625841498374939, + "learning_rate": 9.741098011901423e-06, + "loss": 0.379, "step": 99605 }, { - "epoch": 3.5, - "learning_rate": 1.0856255115684075e-05, - "loss": 0.2392, + "epoch": 3.589937650917216, + "grad_norm": 0.215394988656044, + "learning_rate": 9.738786582900684e-06, + "loss": 0.3997, "step": 99610 }, { - "epoch": 3.5, - "learning_rate": 1.0853906217542231e-05, - "loss": 0.2635, + "epoch": 3.590117850578441, + "grad_norm": 0.2351488173007965, + "learning_rate": 9.736475361831019e-06, + "loss": 0.3684, "step": 99615 }, { - "epoch": 3.5, - "learning_rate": 1.085155750307809e-05, - "loss": 0.2571, + "epoch": 3.5902980502396655, + "grad_norm": 0.2385423183441162, + 
"learning_rate": 9.734164348723922e-06, + "loss": 0.3604, "step": 99620 }, { - "epoch": 3.51, - "learning_rate": 1.0849208972322156e-05, - "loss": 0.2595, + "epoch": 3.5904782499008903, + "grad_norm": 0.17893251776695251, + "learning_rate": 9.731853543610876e-06, + "loss": 0.3638, "step": 99625 }, { - "epoch": 3.51, - "learning_rate": 1.0846860625304906e-05, - "loss": 0.2321, + "epoch": 3.590658449562115, + "grad_norm": 0.23732203245162964, + "learning_rate": 9.729542946523376e-06, + "loss": 0.3772, "step": 99630 }, { - "epoch": 3.51, - "learning_rate": 1.084451246205685e-05, - "loss": 0.2596, + "epoch": 3.5908386492233397, + "grad_norm": 0.21697406470775604, + "learning_rate": 9.727232557492896e-06, + "loss": 0.3953, "step": 99635 }, { - "epoch": 3.51, - "learning_rate": 1.0842164482608463e-05, - "loss": 0.2488, + "epoch": 3.591018848884564, + "grad_norm": 0.21853552758693695, + "learning_rate": 9.724922376550915e-06, + "loss": 0.3793, "step": 99640 }, { - "epoch": 3.51, - "learning_rate": 1.083981668699025e-05, - "loss": 0.2511, + "epoch": 3.5911990485457888, + "grad_norm": 0.23639477789402008, + "learning_rate": 9.722612403728912e-06, + "loss": 0.4109, "step": 99645 }, { - "epoch": 3.51, - "learning_rate": 1.0837469075232678e-05, - "loss": 0.2557, + "epoch": 3.5913792482070135, + "grad_norm": 0.26770761609077454, + "learning_rate": 9.720302639058349e-06, + "loss": 0.3729, "step": 99650 }, { - "epoch": 3.51, - "learning_rate": 1.0835121647366243e-05, - "loss": 0.2735, + "epoch": 3.5915594478682378, + "grad_norm": 0.21606463193893433, + "learning_rate": 9.717993082570716e-06, + "loss": 0.3727, "step": 99655 }, { - "epoch": 3.51, - "learning_rate": 1.083277440342142e-05, - "loss": 0.2476, + "epoch": 3.5917396475294625, + "grad_norm": 0.24439482390880585, + "learning_rate": 9.715683734297466e-06, + "loss": 0.3812, "step": 99660 }, { - "epoch": 3.51, - "learning_rate": 1.0830427343428673e-05, - "loss": 0.2678, + "epoch": 3.5919198471906872, + "grad_norm": 0.24273943901062012, + "learning_rate": 9.713374594270055e-06, + "loss": 0.4021, "step": 99665 }, { - "epoch": 3.51, - "learning_rate": 1.08280804674185e-05, - "loss": 0.2673, + "epoch": 3.592100046851912, + "grad_norm": 0.2814248502254486, + "learning_rate": 9.711065662519964e-06, + "loss": 0.385, "step": 99670 }, { - "epoch": 3.51, - "learning_rate": 1.0825733775421362e-05, - "loss": 0.2607, + "epoch": 3.5922802465131367, + "grad_norm": 0.23897801339626312, + "learning_rate": 9.708756939078634e-06, + "loss": 0.3566, "step": 99675 }, { - "epoch": 3.51, - "learning_rate": 1.0823387267467733e-05, - "loss": 0.2653, + "epoch": 3.5924604461743614, + "grad_norm": 0.24239744246006012, + "learning_rate": 9.70644842397755e-06, + "loss": 0.4005, "step": 99680 }, { - "epoch": 3.51, - "learning_rate": 1.0821040943588066e-05, - "loss": 0.2476, + "epoch": 3.5926406458355857, + "grad_norm": 0.2517867684364319, + "learning_rate": 9.704140117248134e-06, + "loss": 0.3916, "step": 99685 }, { - "epoch": 3.51, - "learning_rate": 1.0818694803812846e-05, - "loss": 0.2518, + "epoch": 3.5928208454968105, + "grad_norm": 0.315687894821167, + "learning_rate": 9.701832018921839e-06, + "loss": 0.3683, "step": 99690 }, { - "epoch": 3.51, - "learning_rate": 1.0816348848172528e-05, - "loss": 0.2531, + "epoch": 3.593001045158035, + "grad_norm": 0.29640886187553406, + "learning_rate": 9.69952412903013e-06, + "loss": 0.375, "step": 99695 }, { - "epoch": 3.51, - "learning_rate": 1.0814003076697565e-05, - "loss": 0.2656, + "epoch": 3.5931812448192595, + "grad_norm": 
0.22682395577430725, + "learning_rate": 9.697216447604444e-06, + "loss": 0.3987, "step": 99700 }, { - "epoch": 3.51, - "learning_rate": 1.0811657489418423e-05, - "loss": 0.2415, + "epoch": 3.5933614444804842, + "grad_norm": 0.30215203762054443, + "learning_rate": 9.69490897467622e-06, + "loss": 0.3749, "step": 99705 }, { - "epoch": 3.51, - "learning_rate": 1.0809312086365566e-05, - "loss": 0.2528, + "epoch": 3.593541644141709, + "grad_norm": 0.23141373693943024, + "learning_rate": 9.692601710276897e-06, + "loss": 0.3501, "step": 99710 }, { - "epoch": 3.51, - "learning_rate": 1.0806966867569441e-05, - "loss": 0.2445, + "epoch": 3.5937218438029337, + "grad_norm": 0.238219752907753, + "learning_rate": 9.690294654437907e-06, + "loss": 0.3775, "step": 99715 }, { - "epoch": 3.51, - "learning_rate": 1.0804621833060496e-05, - "loss": 0.2759, + "epoch": 3.5939020434641584, + "grad_norm": 0.1689501702785492, + "learning_rate": 9.687987807190693e-06, + "loss": 0.3641, "step": 99720 }, { - "epoch": 3.51, - "learning_rate": 1.0802276982869176e-05, - "loss": 0.2405, + "epoch": 3.594082243125383, + "grad_norm": 0.21029123663902283, + "learning_rate": 9.685681168566683e-06, + "loss": 0.3353, "step": 99725 }, { - "epoch": 3.51, - "learning_rate": 1.0799932317025937e-05, - "loss": 0.262, + "epoch": 3.5942624427866074, + "grad_norm": 0.22430087625980377, + "learning_rate": 9.6833747385973e-06, + "loss": 0.3909, "step": 99730 }, { - "epoch": 3.51, - "learning_rate": 1.079758783556122e-05, - "loss": 0.2699, + "epoch": 3.594442642447832, + "grad_norm": 0.21610116958618164, + "learning_rate": 9.681068517313973e-06, + "loss": 0.3581, "step": 99735 }, { - "epoch": 3.51, - "learning_rate": 1.0795243538505468e-05, - "loss": 0.2695, + "epoch": 3.594622842109057, + "grad_norm": 0.2914960980415344, + "learning_rate": 9.678762504748121e-06, + "loss": 0.3932, "step": 99740 }, { - "epoch": 3.51, - "learning_rate": 1.0792899425889105e-05, - "loss": 0.2483, + "epoch": 3.594803041770281, + "grad_norm": 0.24280034005641937, + "learning_rate": 9.676456700931152e-06, + "loss": 0.3773, "step": 99745 }, { - "epoch": 3.51, - "learning_rate": 1.079055549774259e-05, - "loss": 0.2657, + "epoch": 3.594983241431506, + "grad_norm": 0.2517662048339844, + "learning_rate": 9.674151105894516e-06, + "loss": 0.3723, "step": 99750 }, { - "epoch": 3.51, - "learning_rate": 1.078821175409634e-05, - "loss": 0.2653, + "epoch": 3.5951634410927307, + "grad_norm": 0.23000794649124146, + "learning_rate": 9.671845719669584e-06, + "loss": 0.3735, "step": 99755 }, { - "epoch": 3.51, - "learning_rate": 1.0785868194980803e-05, - "loss": 0.2405, + "epoch": 3.5953436407539554, + "grad_norm": 0.2036644071340561, + "learning_rate": 9.669540542287795e-06, + "loss": 0.3355, "step": 99760 }, { - "epoch": 3.51, - "learning_rate": 1.078352482042639e-05, - "loss": 0.268, + "epoch": 3.59552384041518, + "grad_norm": 0.210072860121727, + "learning_rate": 9.667235573780547e-06, + "loss": 0.3654, "step": 99765 }, { - "epoch": 3.51, - "learning_rate": 1.0781181630463547e-05, - "loss": 0.2504, + "epoch": 3.595704040076405, + "grad_norm": 0.23076112568378448, + "learning_rate": 9.664930814179248e-06, + "loss": 0.381, "step": 99770 }, { - "epoch": 3.51, - "learning_rate": 1.0778838625122692e-05, - "loss": 0.2692, + "epoch": 3.595884239737629, + "grad_norm": 0.22967948019504547, + "learning_rate": 9.662626263515298e-06, + "loss": 0.3657, "step": 99775 }, { - "epoch": 3.51, - "learning_rate": 1.0776495804434233e-05, - "loss": 0.2375, + "epoch": 3.596064439398854, + "grad_norm": 
0.21021999418735504, + "learning_rate": 9.660321921820095e-06, + "loss": 0.3699, "step": 99780 }, { - "epoch": 3.51, - "learning_rate": 1.0774153168428616e-05, - "loss": 0.2549, + "epoch": 3.5962446390600786, + "grad_norm": 0.2712535858154297, + "learning_rate": 9.658017789125026e-06, + "loss": 0.4032, "step": 99785 }, { - "epoch": 3.51, - "learning_rate": 1.0771810717136242e-05, - "loss": 0.2362, + "epoch": 3.5964248387213034, + "grad_norm": 0.19251175224781036, + "learning_rate": 9.655713865461505e-06, + "loss": 0.3645, "step": 99790 }, { - "epoch": 3.51, - "learning_rate": 1.0769468450587528e-05, - "loss": 0.2604, + "epoch": 3.5966050383825277, + "grad_norm": 0.22021165490150452, + "learning_rate": 9.653410150860912e-06, + "loss": 0.3683, "step": 99795 }, { - "epoch": 3.51, - "learning_rate": 1.076712636881288e-05, - "loss": 0.2494, + "epoch": 3.5967852380437524, + "grad_norm": 0.20990362763404846, + "learning_rate": 9.651106645354632e-06, + "loss": 0.3702, "step": 99800 }, { - "epoch": 3.51, - "learning_rate": 1.0764784471842726e-05, - "loss": 0.2472, + "epoch": 3.596965437704977, + "grad_norm": 0.25066566467285156, + "learning_rate": 9.648803348974054e-06, + "loss": 0.3669, "step": 99805 }, { - "epoch": 3.51, - "learning_rate": 1.0762442759707456e-05, - "loss": 0.233, + "epoch": 3.597145637366202, + "grad_norm": 0.21980679035186768, + "learning_rate": 9.64650026175055e-06, + "loss": 0.3826, "step": 99810 }, { - "epoch": 3.51, - "learning_rate": 1.0760101232437492e-05, - "loss": 0.2798, + "epoch": 3.5973258370274266, + "grad_norm": 0.20478756725788116, + "learning_rate": 9.644197383715514e-06, + "loss": 0.3417, "step": 99815 }, { - "epoch": 3.51, - "learning_rate": 1.0757759890063219e-05, - "loss": 0.253, + "epoch": 3.597506036688651, + "grad_norm": 0.2611001431941986, + "learning_rate": 9.641894714900316e-06, + "loss": 0.3901, "step": 99820 }, { - "epoch": 3.51, - "learning_rate": 1.0755418732615059e-05, - "loss": 0.243, + "epoch": 3.5976862363498756, + "grad_norm": 0.2687753140926361, + "learning_rate": 9.63959225533633e-06, + "loss": 0.3791, "step": 99825 }, { - "epoch": 3.51, - "learning_rate": 1.0753077760123399e-05, - "loss": 0.2646, + "epoch": 3.5978664360111003, + "grad_norm": 0.20144647359848022, + "learning_rate": 9.637290005054928e-06, + "loss": 0.3827, "step": 99830 }, { - "epoch": 3.51, - "learning_rate": 1.0750736972618633e-05, - "loss": 0.2589, + "epoch": 3.598046635672325, + "grad_norm": 0.2408836930990219, + "learning_rate": 9.634987964087464e-06, + "loss": 0.3959, "step": 99835 }, { - "epoch": 3.51, - "learning_rate": 1.0748396370131147e-05, - "loss": 0.2586, + "epoch": 3.5982268353335494, + "grad_norm": 0.23702372610569, + "learning_rate": 9.632686132465321e-06, + "loss": 0.404, "step": 99840 }, { - "epoch": 3.51, - "learning_rate": 1.0746055952691353e-05, - "loss": 0.2589, + "epoch": 3.598407034994774, + "grad_norm": 0.2729646563529968, + "learning_rate": 9.630384510219867e-06, + "loss": 0.3612, "step": 99845 }, { - "epoch": 3.51, - "learning_rate": 1.0743715720329628e-05, - "loss": 0.2689, + "epoch": 3.598587234655999, + "grad_norm": 0.25669410824775696, + "learning_rate": 9.628083097382428e-06, + "loss": 0.373, "step": 99850 }, { - "epoch": 3.51, - "learning_rate": 1.074137567307636e-05, - "loss": 0.257, + "epoch": 3.5987674343172236, + "grad_norm": 0.18720833957195282, + "learning_rate": 9.625781893984392e-06, + "loss": 0.3708, "step": 99855 }, { - "epoch": 3.51, - "learning_rate": 1.0739035810961923e-05, - "loss": 0.2485, + "epoch": 3.5989476339784483, + 
"grad_norm": 0.2443101853132248, + "learning_rate": 9.623480900057092e-06, + "loss": 0.38, "step": 99860 }, { - "epoch": 3.51, - "learning_rate": 1.0736696134016708e-05, - "loss": 0.2514, + "epoch": 3.5991278336396726, + "grad_norm": 0.24887430667877197, + "learning_rate": 9.621180115631904e-06, + "loss": 0.3652, "step": 99865 }, { - "epoch": 3.51, - "learning_rate": 1.07343566422711e-05, - "loss": 0.2608, + "epoch": 3.5993080333008973, + "grad_norm": 0.22554521262645721, + "learning_rate": 9.61887954074015e-06, + "loss": 0.3778, "step": 99870 }, { - "epoch": 3.51, - "learning_rate": 1.0732017335755473e-05, - "loss": 0.2521, + "epoch": 3.599488232962122, + "grad_norm": 0.2344188541173935, + "learning_rate": 9.616579175413176e-06, + "loss": 0.3619, "step": 99875 }, { - "epoch": 3.51, - "learning_rate": 1.0729678214500189e-05, - "loss": 0.2462, + "epoch": 3.599668432623347, + "grad_norm": 0.30154213309288025, + "learning_rate": 9.614279019682343e-06, + "loss": 0.4012, "step": 99880 }, { - "epoch": 3.51, - "learning_rate": 1.0727339278535636e-05, - "loss": 0.2633, + "epoch": 3.599848632284571, + "grad_norm": 0.2288644164800644, + "learning_rate": 9.611979073578977e-06, + "loss": 0.3882, "step": 99885 }, { - "epoch": 3.51, - "learning_rate": 1.072500052789218e-05, - "loss": 0.2736, + "epoch": 3.600028831945796, + "grad_norm": 0.20435230433940887, + "learning_rate": 9.60967933713442e-06, + "loss": 0.408, "step": 99890 }, { - "epoch": 3.51, - "learning_rate": 1.0722661962600169e-05, - "loss": 0.2583, + "epoch": 3.6002090316070205, + "grad_norm": 0.24859078228473663, + "learning_rate": 9.607379810379998e-06, + "loss": 0.3833, "step": 99895 }, { - "epoch": 3.51, - "learning_rate": 1.0720323582689998e-05, - "loss": 0.2506, + "epoch": 3.6003892312682453, + "grad_norm": 0.28969287872314453, + "learning_rate": 9.605080493347041e-06, + "loss": 0.3902, "step": 99900 }, { - "epoch": 3.51, - "learning_rate": 1.0717985388192012e-05, - "loss": 0.2657, + "epoch": 3.60056943092947, + "grad_norm": 0.2676686644554138, + "learning_rate": 9.602781386066889e-06, + "loss": 0.3858, "step": 99905 }, { - "epoch": 3.52, - "learning_rate": 1.0715647379136575e-05, - "loss": 0.2707, + "epoch": 3.6007496305906947, + "grad_norm": 0.2682179808616638, + "learning_rate": 9.600482488570862e-06, + "loss": 0.3932, "step": 99910 }, { - "epoch": 3.52, - "learning_rate": 1.0713309555554032e-05, - "loss": 0.2525, + "epoch": 3.600929830251919, + "grad_norm": 0.21893753111362457, + "learning_rate": 9.598183800890276e-06, + "loss": 0.3843, "step": 99915 }, { - "epoch": 3.52, - "learning_rate": 1.0710971917474752e-05, - "loss": 0.2493, + "epoch": 3.6011100299131438, + "grad_norm": 0.2323058843612671, + "learning_rate": 9.595885323056455e-06, + "loss": 0.3797, "step": 99920 }, { - "epoch": 3.52, - "learning_rate": 1.0708634464929091e-05, - "loss": 0.2662, + "epoch": 3.6012902295743685, + "grad_norm": 0.23504719138145447, + "learning_rate": 9.593587055100717e-06, + "loss": 0.3858, "step": 99925 }, { - "epoch": 3.52, - "learning_rate": 1.0706297197947393e-05, - "loss": 0.2626, + "epoch": 3.601470429235593, + "grad_norm": 0.2661448121070862, + "learning_rate": 9.591288997054359e-06, + "loss": 0.3477, "step": 99930 }, { - "epoch": 3.52, - "learning_rate": 1.0703960116559999e-05, - "loss": 0.2644, + "epoch": 3.6016506288968175, + "grad_norm": 0.261135458946228, + "learning_rate": 9.588991148948726e-06, + "loss": 0.3728, "step": 99935 }, { - "epoch": 3.52, - "learning_rate": 1.0701623220797272e-05, - "loss": 0.256, + "epoch": 
3.6018308285580423, + "grad_norm": 0.2633829712867737, + "learning_rate": 9.586693510815088e-06, + "loss": 0.3891, "step": 99940 }, { - "epoch": 3.52, - "learning_rate": 1.0699286510689541e-05, - "loss": 0.2534, + "epoch": 3.602011028219267, + "grad_norm": 0.23481690883636475, + "learning_rate": 9.584396082684775e-06, + "loss": 0.3821, "step": 99945 }, { - "epoch": 3.52, - "learning_rate": 1.0696949986267152e-05, - "loss": 0.2299, + "epoch": 3.6021912278804917, + "grad_norm": 0.23687255382537842, + "learning_rate": 9.582098864589078e-06, + "loss": 0.3825, "step": 99950 }, { - "epoch": 3.52, - "learning_rate": 1.069461364756043e-05, - "loss": 0.2468, + "epoch": 3.6023714275417165, + "grad_norm": 0.24096225202083588, + "learning_rate": 9.579801856559292e-06, + "loss": 0.35, "step": 99955 }, { - "epoch": 3.52, - "learning_rate": 1.0692277494599734e-05, - "loss": 0.2435, + "epoch": 3.6025516272029408, + "grad_norm": 0.23858147859573364, + "learning_rate": 9.577505058626738e-06, + "loss": 0.3736, "step": 99960 }, { - "epoch": 3.52, - "learning_rate": 1.0689941527415384e-05, - "loss": 0.2582, + "epoch": 3.6027318268641655, + "grad_norm": 0.24173545837402344, + "learning_rate": 9.575208470822682e-06, + "loss": 0.3797, "step": 99965 }, { - "epoch": 3.52, - "learning_rate": 1.0687605746037705e-05, - "loss": 0.2606, + "epoch": 3.60291202652539, + "grad_norm": 0.19363081455230713, + "learning_rate": 9.572912093178419e-06, + "loss": 0.3779, "step": 99970 }, { - "epoch": 3.52, - "learning_rate": 1.0685270150497032e-05, - "loss": 0.2559, + "epoch": 3.6030922261866145, + "grad_norm": 0.2636258006095886, + "learning_rate": 9.570615925725246e-06, + "loss": 0.4388, "step": 99975 }, { - "epoch": 3.52, - "learning_rate": 1.06829347408237e-05, - "loss": 0.2676, + "epoch": 3.6032724258478392, + "grad_norm": 0.2344840168952942, + "learning_rate": 9.568319968494446e-06, + "loss": 0.3784, "step": 99980 }, { - "epoch": 3.52, - "learning_rate": 1.0681066546929894e-05, - "loss": 0.2738, + "epoch": 3.603452625509064, + "grad_norm": 0.22249801456928253, + "learning_rate": 9.5660242215173e-06, + "loss": 0.3607, "step": 99985 }, { - "epoch": 3.52, - "learning_rate": 1.0678731471894177e-05, - "loss": 0.2751, + "epoch": 3.6036328251702887, + "grad_norm": 0.19639311730861664, + "learning_rate": 9.563728684825082e-06, + "loss": 0.3788, "step": 99990 }, { - "epoch": 3.52, - "learning_rate": 1.0676396582810693e-05, - "loss": 0.268, + "epoch": 3.6038130248315134, + "grad_norm": 0.2106081247329712, + "learning_rate": 9.561433358449068e-06, + "loss": 0.3919, "step": 99995 }, { - "epoch": 3.52, - "learning_rate": 1.0674061879709762e-05, - "loss": 0.2599, + "epoch": 3.603993224492738, + "grad_norm": 0.2347351461648941, + "learning_rate": 9.559138242420544e-06, + "loss": 0.3608, "step": 100000 }, { - "epoch": 3.52, - "eval_loss": 0.2520838975906372, - "eval_runtime": 10.5365, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 3.603993224492738, + "eval_loss": 0.42966705560684204, + "eval_runtime": 3.5294, + "eval_samples_per_second": 28.334, + "eval_steps_per_second": 7.083, "step": 100000 }, { - "epoch": 3.52, - "learning_rate": 1.0671727362621686e-05, - "loss": 0.2531, + "epoch": 3.6041734241539625, + "grad_norm": 0.23393255472183228, + "learning_rate": 9.556843336770768e-06, + "loss": 0.3584, "step": 100005 }, { - "epoch": 3.52, - "learning_rate": 1.0669393031576788e-05, - "loss": 0.2713, + "epoch": 3.604353623815187, + "grad_norm": 0.22265978157520294, + "learning_rate": 9.554548641531014e-06, + 
"loss": 0.355, "step": 100010 }, { - "epoch": 3.52, - "learning_rate": 1.0667058886605388e-05, - "loss": 0.2459, + "epoch": 3.604533823476412, + "grad_norm": 0.2124675214290619, + "learning_rate": 9.552254156732543e-06, + "loss": 0.3352, "step": 100015 }, { - "epoch": 3.52, - "learning_rate": 1.0664724927737783e-05, - "loss": 0.2487, + "epoch": 3.6047140231376362, + "grad_norm": 0.20015810430049896, + "learning_rate": 9.549959882406611e-06, + "loss": 0.3809, "step": 100020 }, { - "epoch": 3.52, - "learning_rate": 1.0662391155004265e-05, - "loss": 0.2499, + "epoch": 3.604894222798861, + "grad_norm": 0.24774090945720673, + "learning_rate": 9.547665818584491e-06, + "loss": 0.3974, "step": 100025 }, { - "epoch": 3.52, - "learning_rate": 1.0660057568435158e-05, - "loss": 0.2693, + "epoch": 3.6050744224600857, + "grad_norm": 0.23569168150424957, + "learning_rate": 9.545371965297445e-06, + "loss": 0.3889, "step": 100030 }, { - "epoch": 3.52, - "learning_rate": 1.0657724168060756e-05, - "loss": 0.282, + "epoch": 3.6052546221213104, + "grad_norm": 0.25077682733535767, + "learning_rate": 9.543078322576696e-06, + "loss": 0.3749, "step": 100035 }, { - "epoch": 3.52, - "learning_rate": 1.0655390953911352e-05, - "loss": 0.2716, + "epoch": 3.605434821782535, + "grad_norm": 0.30293580889701843, + "learning_rate": 9.54078489045352e-06, + "loss": 0.3622, "step": 100040 }, { - "epoch": 3.52, - "learning_rate": 1.0653057926017235e-05, - "loss": 0.2725, + "epoch": 3.60561502144376, + "grad_norm": 0.23128095269203186, + "learning_rate": 9.538491668959146e-06, + "loss": 0.3876, "step": 100045 }, { - "epoch": 3.52, - "learning_rate": 1.0650725084408713e-05, - "loss": 0.2495, + "epoch": 3.605795221104984, + "grad_norm": 0.20295816659927368, + "learning_rate": 9.536198658124849e-06, + "loss": 0.3585, "step": 100050 }, { - "epoch": 3.52, - "learning_rate": 1.0648392429116072e-05, - "loss": 0.232, + "epoch": 3.605975420766209, + "grad_norm": 0.23228393495082855, + "learning_rate": 9.533905857981843e-06, + "loss": 0.4119, "step": 100055 }, { - "epoch": 3.52, - "learning_rate": 1.0646059960169585e-05, - "loss": 0.2373, + "epoch": 3.6061556204274337, + "grad_norm": 0.2623964548110962, + "learning_rate": 9.531613268561365e-06, + "loss": 0.3794, "step": 100060 }, { - "epoch": 3.52, - "learning_rate": 1.0643727677599553e-05, - "loss": 0.2393, + "epoch": 3.6063358200886584, + "grad_norm": 0.21798618137836456, + "learning_rate": 9.529320889894671e-06, + "loss": 0.391, "step": 100065 }, { - "epoch": 3.52, - "learning_rate": 1.0641395581436264e-05, - "loss": 0.2558, + "epoch": 3.6065160197498827, + "grad_norm": 0.21800731122493744, + "learning_rate": 9.527028722012985e-06, + "loss": 0.3605, "step": 100070 }, { - "epoch": 3.52, - "learning_rate": 1.063906367170999e-05, - "loss": 0.2498, + "epoch": 3.6066962194111074, + "grad_norm": 0.2361879199743271, + "learning_rate": 9.524736764947537e-06, + "loss": 0.3766, "step": 100075 }, { - "epoch": 3.52, - "learning_rate": 1.0636731948451001e-05, - "loss": 0.2549, + "epoch": 3.606876419072332, + "grad_norm": 0.23150278627872467, + "learning_rate": 9.522445018729556e-06, + "loss": 0.3902, "step": 100080 }, { - "epoch": 3.52, - "learning_rate": 1.063440041168959e-05, - "loss": 0.2481, + "epoch": 3.607056618733557, + "grad_norm": 0.25107669830322266, + "learning_rate": 9.520153483390253e-06, + "loss": 0.3765, "step": 100085 }, { - "epoch": 3.52, - "learning_rate": 1.0632069061456024e-05, - "loss": 0.2654, + "epoch": 3.6072368183947816, + "grad_norm": 0.22123172879219055, + 
"learning_rate": 9.517862158960873e-06, + "loss": 0.3759, "step": 100090 }, { - "epoch": 3.52, - "learning_rate": 1.0629737897780574e-05, - "loss": 0.2545, + "epoch": 3.607417018056006, + "grad_norm": 0.2164187878370285, + "learning_rate": 9.515571045472623e-06, + "loss": 0.3721, "step": 100095 }, { - "epoch": 3.52, - "learning_rate": 1.0627406920693494e-05, - "loss": 0.2416, + "epoch": 3.6075972177172306, + "grad_norm": 0.23865751922130585, + "learning_rate": 9.513280142956718e-06, + "loss": 0.3667, "step": 100100 }, { - "epoch": 3.52, - "learning_rate": 1.0625076130225075e-05, - "loss": 0.2567, + "epoch": 3.6077774173784554, + "grad_norm": 0.22169511020183563, + "learning_rate": 9.510989451444374e-06, + "loss": 0.3906, "step": 100105 }, { - "epoch": 3.52, - "learning_rate": 1.0622745526405566e-05, - "loss": 0.2655, + "epoch": 3.60795761703968, + "grad_norm": 0.2418065220117569, + "learning_rate": 9.5086989709668e-06, + "loss": 0.3893, "step": 100110 }, { - "epoch": 3.52, - "learning_rate": 1.0620415109265228e-05, - "loss": 0.2459, + "epoch": 3.6081378167009044, + "grad_norm": 0.274650514125824, + "learning_rate": 9.506408701555197e-06, + "loss": 0.3756, "step": 100115 }, { - "epoch": 3.52, - "learning_rate": 1.061808487883432e-05, - "loss": 0.2647, + "epoch": 3.608318016362129, + "grad_norm": 0.25587350130081177, + "learning_rate": 9.504118643240781e-06, + "loss": 0.3975, "step": 100120 }, { - "epoch": 3.52, - "learning_rate": 1.061575483514311e-05, - "loss": 0.2402, + "epoch": 3.608498216023354, + "grad_norm": 0.35312119126319885, + "learning_rate": 9.501828796054751e-06, + "loss": 0.3939, "step": 100125 }, { - "epoch": 3.52, - "learning_rate": 1.0613424978221845e-05, - "loss": 0.2637, + "epoch": 3.6086784156845786, + "grad_norm": 0.24378515779972076, + "learning_rate": 9.499539160028301e-06, + "loss": 0.4169, "step": 100130 }, { - "epoch": 3.52, - "learning_rate": 1.0611095308100774e-05, - "loss": 0.2624, + "epoch": 3.6088586153458033, + "grad_norm": 0.2463681548833847, + "learning_rate": 9.497249735192628e-06, + "loss": 0.3423, "step": 100135 }, { - "epoch": 3.52, - "learning_rate": 1.060876582481014e-05, - "loss": 0.2645, + "epoch": 3.609038815007028, + "grad_norm": 0.2869650423526764, + "learning_rate": 9.494960521578922e-06, + "loss": 0.3543, "step": 100140 }, { - "epoch": 3.52, - "learning_rate": 1.0606436528380207e-05, - "loss": 0.2463, + "epoch": 3.6092190146682523, + "grad_norm": 0.29217445850372314, + "learning_rate": 9.49267151921839e-06, + "loss": 0.37, "step": 100145 }, { - "epoch": 3.52, - "learning_rate": 1.0604107418841209e-05, - "loss": 0.2536, + "epoch": 3.609399214329477, + "grad_norm": 0.21554695069789886, + "learning_rate": 9.4903827281422e-06, + "loss": 0.3741, "step": 100150 }, { - "epoch": 3.52, - "learning_rate": 1.0601778496223389e-05, - "loss": 0.2507, + "epoch": 3.609579413990702, + "grad_norm": 0.17673182487487793, + "learning_rate": 9.488094148381533e-06, + "loss": 0.3679, "step": 100155 }, { - "epoch": 3.52, - "learning_rate": 1.0599449760556978e-05, - "loss": 0.2748, + "epoch": 3.609759613651926, + "grad_norm": 0.2148004174232483, + "learning_rate": 9.485805779967588e-06, + "loss": 0.3473, "step": 100160 }, { - "epoch": 3.52, - "learning_rate": 1.0597121211872232e-05, - "loss": 0.253, + "epoch": 3.609939813313151, + "grad_norm": 0.33437803387641907, + "learning_rate": 9.483517622931534e-06, + "loss": 0.3883, "step": 100165 }, { - "epoch": 3.52, - "learning_rate": 1.0594792850199363e-05, - "loss": 0.2503, + "epoch": 3.6101200129743756, + "grad_norm": 
0.31750041246414185, + "learning_rate": 9.48122967730455e-06, + "loss": 0.3693, "step": 100170 }, { - "epoch": 3.52, - "learning_rate": 1.0592464675568627e-05, - "loss": 0.2762, + "epoch": 3.6103002126356003, + "grad_norm": 0.2354346662759781, + "learning_rate": 9.478941943117809e-06, + "loss": 0.388, "step": 100175 }, { - "epoch": 3.52, - "learning_rate": 1.0590136688010232e-05, - "loss": 0.2657, + "epoch": 3.610480412296825, + "grad_norm": 0.2933277189731598, + "learning_rate": 9.47665442040247e-06, + "loss": 0.4218, "step": 100180 }, { - "epoch": 3.52, - "learning_rate": 1.0587808887554427e-05, - "loss": 0.2391, + "epoch": 3.6106606119580498, + "grad_norm": 0.2739449441432953, + "learning_rate": 9.474367109189716e-06, + "loss": 0.3977, "step": 100185 }, { - "epoch": 3.52, - "learning_rate": 1.0585481274231421e-05, - "loss": 0.2671, + "epoch": 3.610840811619274, + "grad_norm": 0.2737182676792145, + "learning_rate": 9.472080009510707e-06, + "loss": 0.3892, "step": 100190 }, { - "epoch": 3.53, - "learning_rate": 1.0583153848071437e-05, - "loss": 0.2434, + "epoch": 3.611021011280499, + "grad_norm": 0.2394844889640808, + "learning_rate": 9.469793121396597e-06, + "loss": 0.366, "step": 100195 }, { - "epoch": 3.53, - "learning_rate": 1.0580826609104704e-05, - "loss": 0.2563, + "epoch": 3.6112012109417235, + "grad_norm": 0.20772317051887512, + "learning_rate": 9.46796376325298e-06, + "loss": 0.333, "step": 100200 }, { - "epoch": 3.53, - "learning_rate": 1.0578499557361438e-05, - "loss": 0.2772, + "epoch": 3.611381410602948, + "grad_norm": 0.19267010688781738, + "learning_rate": 9.465677256034215e-06, + "loss": 0.3884, "step": 100205 }, { - "epoch": 3.53, - "learning_rate": 1.057617269287185e-05, - "loss": 0.2472, + "epoch": 3.6115616102641726, + "grad_norm": 0.28297874331474304, + "learning_rate": 9.463390960467583e-06, + "loss": 0.3827, "step": 100210 }, { - "epoch": 3.53, - "learning_rate": 1.0573846015666145e-05, - "loss": 0.2396, + "epoch": 3.6117418099253973, + "grad_norm": 0.25464269518852234, + "learning_rate": 9.461104876584256e-06, + "loss": 0.3932, "step": 100215 }, { - "epoch": 3.53, - "learning_rate": 1.057151952577455e-05, - "loss": 0.2741, + "epoch": 3.611922009586622, + "grad_norm": 0.22311927378177643, + "learning_rate": 9.458819004415361e-06, + "loss": 0.37, "step": 100220 }, { - "epoch": 3.53, - "learning_rate": 1.0569193223227259e-05, - "loss": 0.2355, + "epoch": 3.6121022092478468, + "grad_norm": 0.22702719271183014, + "learning_rate": 9.456533343992039e-06, + "loss": 0.4145, "step": 100225 }, { - "epoch": 3.53, - "learning_rate": 1.056686710805449e-05, - "loss": 0.2578, + "epoch": 3.6122824089090715, + "grad_norm": 0.20826251804828644, + "learning_rate": 9.454247895345447e-06, + "loss": 0.3434, "step": 100230 }, { - "epoch": 3.53, - "learning_rate": 1.0564541180286434e-05, - "loss": 0.2587, + "epoch": 3.6124626085702958, + "grad_norm": 0.2535870671272278, + "learning_rate": 9.451962658506722e-06, + "loss": 0.3997, "step": 100235 }, { - "epoch": 3.53, - "learning_rate": 1.0562215439953302e-05, - "loss": 0.2684, + "epoch": 3.6126428082315205, + "grad_norm": 0.27190902829170227, + "learning_rate": 9.449677633506992e-06, + "loss": 0.3874, "step": 100240 }, { - "epoch": 3.53, - "learning_rate": 1.0559889887085288e-05, - "loss": 0.2392, + "epoch": 3.6128230078927452, + "grad_norm": 0.2581278383731842, + "learning_rate": 9.447392820377397e-06, + "loss": 0.3805, "step": 100245 }, { - "epoch": 3.53, - "learning_rate": 1.0557564521712593e-05, - "loss": 0.2883, + "epoch": 
3.6130032075539695, + "grad_norm": 0.2271093875169754, + "learning_rate": 9.445108219149052e-06, + "loss": 0.3962, "step": 100250 }, { - "epoch": 3.53, - "learning_rate": 1.0555239343865392e-05, - "loss": 0.2483, + "epoch": 3.6131834072151943, + "grad_norm": 0.251756489276886, + "learning_rate": 9.442823829853103e-06, + "loss": 0.3868, "step": 100255 }, { - "epoch": 3.53, - "learning_rate": 1.0552914353573897e-05, - "loss": 0.2482, + "epoch": 3.613363606876419, + "grad_norm": 0.20969605445861816, + "learning_rate": 9.440539652520671e-06, + "loss": 0.3637, "step": 100260 }, { - "epoch": 3.53, - "learning_rate": 1.0550589550868289e-05, - "loss": 0.293, + "epoch": 3.6135438065376437, + "grad_norm": 0.2561705410480499, + "learning_rate": 9.438255687182873e-06, + "loss": 0.3433, "step": 100265 }, { - "epoch": 3.53, - "learning_rate": 1.0548264935778754e-05, - "loss": 0.2732, + "epoch": 3.6137240061988685, + "grad_norm": 0.19795164465904236, + "learning_rate": 9.435971933870827e-06, + "loss": 0.3794, "step": 100270 }, { - "epoch": 3.53, - "learning_rate": 1.0545940508335464e-05, - "loss": 0.2652, + "epoch": 3.613904205860093, + "grad_norm": 0.267392098903656, + "learning_rate": 9.433688392615644e-06, + "loss": 0.3855, "step": 100275 }, { - "epoch": 3.53, - "learning_rate": 1.0544081101506425e-05, - "loss": 0.2416, + "epoch": 3.6140844055213175, + "grad_norm": 0.27009448409080505, + "learning_rate": 9.431405063448451e-06, + "loss": 0.3847, "step": 100280 }, { - "epoch": 3.53, - "learning_rate": 1.0541757011902451e-05, - "loss": 0.2487, + "epoch": 3.6142646051825422, + "grad_norm": 0.22548210620880127, + "learning_rate": 9.429121946400358e-06, + "loss": 0.3959, "step": 100285 }, { - "epoch": 3.53, - "learning_rate": 1.0539433110029242e-05, - "loss": 0.2596, + "epoch": 3.614444804843767, + "grad_norm": 0.2359258383512497, + "learning_rate": 9.426839041502444e-06, + "loss": 0.3798, "step": 100290 }, { - "epoch": 3.53, - "learning_rate": 1.0537109395916966e-05, - "loss": 0.2557, + "epoch": 3.6146250045049917, + "grad_norm": 0.20709139108657837, + "learning_rate": 9.424556348785846e-06, + "loss": 0.3797, "step": 100295 }, { - "epoch": 3.53, - "learning_rate": 1.0534785869595793e-05, - "loss": 0.2529, + "epoch": 3.614805204166216, + "grad_norm": 0.22372165322303772, + "learning_rate": 9.422273868281639e-06, + "loss": 0.3794, "step": 100300 }, { - "epoch": 3.53, - "learning_rate": 1.0532462531095883e-05, - "loss": 0.2673, + "epoch": 3.6149854038274407, + "grad_norm": 0.17889077961444855, + "learning_rate": 9.419991600020947e-06, + "loss": 0.3699, "step": 100305 }, { - "epoch": 3.53, - "learning_rate": 1.0530139380447421e-05, - "loss": 0.2484, + "epoch": 3.6151656034886654, + "grad_norm": 0.17842791974544525, + "learning_rate": 9.417709544034862e-06, + "loss": 0.3769, "step": 100310 }, { - "epoch": 3.53, - "learning_rate": 1.0527816417680555e-05, - "loss": 0.2821, + "epoch": 3.61534580314989, + "grad_norm": 0.2319183498620987, + "learning_rate": 9.415427700354446e-06, + "loss": 0.3625, "step": 100315 }, { - "epoch": 3.53, - "learning_rate": 1.0525493642825462e-05, - "loss": 0.251, + "epoch": 3.615526002811115, + "grad_norm": 0.22687672078609467, + "learning_rate": 9.413146069010822e-06, + "loss": 0.387, "step": 100320 }, { - "epoch": 3.53, - "learning_rate": 1.052317105591229e-05, - "loss": 0.266, + "epoch": 3.615706202472339, + "grad_norm": 0.25115665793418884, + "learning_rate": 9.410864650035064e-06, + "loss": 0.3584, "step": 100325 }, { - "epoch": 3.53, - "learning_rate": 1.0520848656971208e-05, - 
"loss": 0.3008, + "epoch": 3.615886402133564, + "grad_norm": 0.23549871146678925, + "learning_rate": 9.408583443458257e-06, + "loss": 0.3989, "step": 100330 }, { - "epoch": 3.53, - "learning_rate": 1.0518526446032365e-05, - "loss": 0.2552, + "epoch": 3.6160666017947887, + "grad_norm": 0.19801755249500275, + "learning_rate": 9.406302449311485e-06, + "loss": 0.3762, "step": 100335 }, { - "epoch": 3.53, - "learning_rate": 1.051620442312591e-05, - "loss": 0.2618, + "epoch": 3.6162468014560134, + "grad_norm": 0.25041335821151733, + "learning_rate": 9.404021667625812e-06, + "loss": 0.3706, "step": 100340 }, { - "epoch": 3.53, - "learning_rate": 1.051388258828199e-05, - "loss": 0.2678, + "epoch": 3.6164270011172377, + "grad_norm": 0.2323254495859146, + "learning_rate": 9.401741098432332e-06, + "loss": 0.3995, "step": 100345 }, { - "epoch": 3.53, - "learning_rate": 1.0511560941530765e-05, - "loss": 0.2644, + "epoch": 3.6166072007784624, + "grad_norm": 0.20460768043994904, + "learning_rate": 9.399460741762111e-06, + "loss": 0.3853, "step": 100350 }, { - "epoch": 3.53, - "learning_rate": 1.0509239482902373e-05, - "loss": 0.2646, + "epoch": 3.616787400439687, + "grad_norm": 0.30746299028396606, + "learning_rate": 9.397180597646218e-06, + "loss": 0.387, "step": 100355 }, { - "epoch": 3.53, - "learning_rate": 1.0506918212426956e-05, - "loss": 0.3034, + "epoch": 3.616967600100912, + "grad_norm": 0.24463587999343872, + "learning_rate": 9.394900666115716e-06, + "loss": 0.3957, "step": 100360 }, { - "epoch": 3.53, - "learning_rate": 1.0504597130134647e-05, - "loss": 0.2515, + "epoch": 3.6171477997621366, + "grad_norm": 0.19743074476718903, + "learning_rate": 9.392620947201675e-06, + "loss": 0.3711, "step": 100365 }, { - "epoch": 3.53, - "learning_rate": 1.0502276236055589e-05, - "loss": 0.2508, + "epoch": 3.617327999423361, + "grad_norm": 0.23998847603797913, + "learning_rate": 9.390341440935138e-06, + "loss": 0.3986, "step": 100370 }, { - "epoch": 3.53, - "learning_rate": 1.049995553021993e-05, - "loss": 0.2688, + "epoch": 3.6175081990845857, + "grad_norm": 0.2761986553668976, + "learning_rate": 9.38806214734719e-06, + "loss": 0.3685, "step": 100375 }, { - "epoch": 3.53, - "learning_rate": 1.0497635012657783e-05, - "loss": 0.2463, + "epoch": 3.6176883987458104, + "grad_norm": 0.209417924284935, + "learning_rate": 9.38578306646887e-06, + "loss": 0.3551, "step": 100380 }, { - "epoch": 3.53, - "learning_rate": 1.0495314683399298e-05, - "loss": 0.2498, + "epoch": 3.617868598407035, + "grad_norm": 0.2800207734107971, + "learning_rate": 9.383504198331233e-06, + "loss": 0.38, "step": 100385 }, { - "epoch": 3.53, - "learning_rate": 1.0492994542474593e-05, - "loss": 0.2574, + "epoch": 3.6180487980682594, + "grad_norm": 0.2673444151878357, + "learning_rate": 9.38122554296533e-06, + "loss": 0.3892, "step": 100390 }, { - "epoch": 3.53, - "learning_rate": 1.0490674589913794e-05, - "loss": 0.2614, + "epoch": 3.618228997729484, + "grad_norm": 0.26103273034095764, + "learning_rate": 9.378947100402194e-06, + "loss": 0.3775, "step": 100395 }, { - "epoch": 3.53, - "learning_rate": 1.0488354825747013e-05, - "loss": 0.2473, + "epoch": 3.618409197390709, + "grad_norm": 0.20183852314949036, + "learning_rate": 9.376668870672897e-06, + "loss": 0.3525, "step": 100400 }, { - "epoch": 3.53, - "learning_rate": 1.0486035250004392e-05, - "loss": 0.2613, + "epoch": 3.6185893970519336, + "grad_norm": 0.22087271511554718, + "learning_rate": 9.374390853808454e-06, + "loss": 0.3442, "step": 100405 }, { - "epoch": 3.53, - "learning_rate": 
1.0483715862716037e-05, - "loss": 0.2537, + "epoch": 3.6187695967131583, + "grad_norm": 0.23035688698291779, + "learning_rate": 9.372113049839903e-06, + "loss": 0.3829, "step": 100410 }, { - "epoch": 3.53, - "learning_rate": 1.0481396663912063e-05, - "loss": 0.2646, + "epoch": 3.618949796374383, + "grad_norm": 0.199892058968544, + "learning_rate": 9.369835458798293e-06, + "loss": 0.3724, "step": 100415 }, { - "epoch": 3.53, - "learning_rate": 1.0479077653622579e-05, - "loss": 0.2528, + "epoch": 3.6191299960356074, + "grad_norm": 0.2462371587753296, + "learning_rate": 9.367558080714639e-06, + "loss": 0.3667, "step": 100420 }, { - "epoch": 3.53, - "learning_rate": 1.047675883187771e-05, - "loss": 0.2698, + "epoch": 3.619310195696832, + "grad_norm": 0.21908694505691528, + "learning_rate": 9.36528091562e-06, + "loss": 0.3298, "step": 100425 }, { - "epoch": 3.53, - "learning_rate": 1.0474440198707546e-05, - "loss": 0.2347, + "epoch": 3.619490395358057, + "grad_norm": 0.22547689080238342, + "learning_rate": 9.36300396354537e-06, + "loss": 0.3721, "step": 100430 }, { - "epoch": 3.53, - "learning_rate": 1.0472121754142213e-05, - "loss": 0.2728, + "epoch": 3.619670595019281, + "grad_norm": 0.28035834431648254, + "learning_rate": 9.360727224521775e-06, + "loss": 0.3714, "step": 100435 }, { - "epoch": 3.53, - "learning_rate": 1.0469803498211797e-05, - "loss": 0.2621, + "epoch": 3.619850794680506, + "grad_norm": 0.2472325563430786, + "learning_rate": 9.358450698580254e-06, + "loss": 0.3513, "step": 100440 }, { - "epoch": 3.53, - "learning_rate": 1.0467485430946416e-05, - "loss": 0.2744, + "epoch": 3.6200309943417306, + "grad_norm": 0.24515065550804138, + "learning_rate": 9.356174385751815e-06, + "loss": 0.3279, "step": 100445 }, { - "epoch": 3.53, - "learning_rate": 1.0465167552376159e-05, - "loss": 0.2605, + "epoch": 3.6202111940029553, + "grad_norm": 0.29932889342308044, + "learning_rate": 9.35389828606747e-06, + "loss": 0.3565, "step": 100450 }, { - "epoch": 3.53, - "learning_rate": 1.046284986253112e-05, - "loss": 0.2494, + "epoch": 3.62039139366418, + "grad_norm": 0.2718527913093567, + "learning_rate": 9.35162239955823e-06, + "loss": 0.3859, "step": 100455 }, { - "epoch": 3.53, - "learning_rate": 1.0460532361441391e-05, - "loss": 0.25, + "epoch": 3.620571593325405, + "grad_norm": 0.21936644613742828, + "learning_rate": 9.349346726255098e-06, + "loss": 0.3857, "step": 100460 }, { - "epoch": 3.53, - "learning_rate": 1.0458215049137074e-05, - "loss": 0.2382, + "epoch": 3.620751792986629, + "grad_norm": 0.2328893095254898, + "learning_rate": 9.347071266189095e-06, + "loss": 0.333, "step": 100465 }, { - "epoch": 3.53, - "learning_rate": 1.045589792564825e-05, - "loss": 0.2719, + "epoch": 3.620931992647854, + "grad_norm": 0.22715623676776886, + "learning_rate": 9.344796019391217e-06, + "loss": 0.3534, "step": 100470 }, { - "epoch": 3.53, - "learning_rate": 1.0453580991004999e-05, - "loss": 0.2378, + "epoch": 3.6211121923090785, + "grad_norm": 0.2767705023288727, + "learning_rate": 9.342520985892461e-06, + "loss": 0.3693, "step": 100475 }, { - "epoch": 3.54, - "learning_rate": 1.0451264245237422e-05, - "loss": 0.2533, + "epoch": 3.621292391970303, + "grad_norm": 0.22328001260757446, + "learning_rate": 9.340246165723826e-06, + "loss": 0.4045, "step": 100480 }, { - "epoch": 3.54, - "learning_rate": 1.044894768837558e-05, - "loss": 0.2508, + "epoch": 3.6214725916315276, + "grad_norm": 0.25353187322616577, + "learning_rate": 9.337971558916295e-06, + "loss": 0.3343, "step": 100485 }, { - "epoch": 3.54, - 
"learning_rate": 1.0446631320449571e-05, - "loss": 0.2745, + "epoch": 3.6216527912927523, + "grad_norm": 0.2742082178592682, + "learning_rate": 9.335697165500878e-06, + "loss": 0.3792, "step": 100490 }, { - "epoch": 3.54, - "learning_rate": 1.0444315141489458e-05, - "loss": 0.248, + "epoch": 3.621832990953977, + "grad_norm": 0.28070586919784546, + "learning_rate": 9.333422985508563e-06, + "loss": 0.3781, "step": 100495 }, { - "epoch": 3.54, - "learning_rate": 1.0441999151525326e-05, - "loss": 0.2534, + "epoch": 3.6220131906152018, + "grad_norm": 0.20703862607479095, + "learning_rate": 9.331149018970311e-06, + "loss": 0.3754, "step": 100500 }, { - "epoch": 3.54, - "eval_loss": 0.25199830532073975, - "eval_runtime": 10.5358, - "eval_samples_per_second": 9.491, - "eval_steps_per_second": 9.491, + "epoch": 3.6220131906152018, + "eval_loss": 0.42909446358680725, + "eval_runtime": 3.5291, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 7.084, "step": 100500 }, { - "epoch": 3.54, - "learning_rate": 1.0439683350587243e-05, - "loss": 0.2719, + "epoch": 3.6221933902764265, + "grad_norm": 0.23609666526317596, + "learning_rate": 9.328875265917128e-06, + "loss": 0.3835, "step": 100505 }, { - "epoch": 3.54, - "learning_rate": 1.0437367738705273e-05, - "loss": 0.2638, + "epoch": 3.622373589937651, + "grad_norm": 0.22293473780155182, + "learning_rate": 9.326601726379986e-06, + "loss": 0.4105, "step": 100510 }, { - "epoch": 3.54, - "learning_rate": 1.043505231590948e-05, - "loss": 0.2566, + "epoch": 3.6225537895988755, + "grad_norm": 0.251966267824173, + "learning_rate": 9.324328400389858e-06, + "loss": 0.3797, "step": 100515 }, { - "epoch": 3.54, - "learning_rate": 1.043273708222994e-05, - "loss": 0.256, + "epoch": 3.6227339892601003, + "grad_norm": 0.22561317682266235, + "learning_rate": 9.322055287977724e-06, + "loss": 0.3619, "step": 100520 }, { - "epoch": 3.54, - "learning_rate": 1.0430422037696713e-05, - "loss": 0.2569, + "epoch": 3.6229141889213246, + "grad_norm": 0.22810781002044678, + "learning_rate": 9.319782389174542e-06, + "loss": 0.3784, "step": 100525 }, { - "epoch": 3.54, - "learning_rate": 1.042810718233984e-05, - "loss": 0.2712, + "epoch": 3.6230943885825493, + "grad_norm": 0.20727618038654327, + "learning_rate": 9.317509704011298e-06, + "loss": 0.3684, "step": 100530 }, { - "epoch": 3.54, - "learning_rate": 1.0425792516189406e-05, - "loss": 0.2612, + "epoch": 3.623274588243774, + "grad_norm": 0.3055703341960907, + "learning_rate": 9.315237232518948e-06, + "loss": 0.3809, "step": 100535 }, { - "epoch": 3.54, - "learning_rate": 1.0423478039275438e-05, - "loss": 0.2688, + "epoch": 3.6234547879049988, + "grad_norm": 0.2843535840511322, + "learning_rate": 9.312964974728453e-06, + "loss": 0.3996, "step": 100540 }, { - "epoch": 3.54, - "learning_rate": 1.0421163751628016e-05, - "loss": 0.2562, + "epoch": 3.6236349875662235, + "grad_norm": 0.22682887315750122, + "learning_rate": 9.310692930670773e-06, + "loss": 0.3469, "step": 100545 }, { - "epoch": 3.54, - "learning_rate": 1.0418849653277171e-05, - "loss": 0.2504, + "epoch": 3.623815187227448, + "grad_norm": 0.19296708703041077, + "learning_rate": 9.308421100376865e-06, + "loss": 0.3504, "step": 100550 }, { - "epoch": 3.54, - "learning_rate": 1.0416535744252945e-05, - "loss": 0.2534, + "epoch": 3.6239953868886725, + "grad_norm": 0.3073079586029053, + "learning_rate": 9.306149483877674e-06, + "loss": 0.4082, "step": 100555 }, { - "epoch": 3.54, - "learning_rate": 1.0414222024585402e-05, - "loss": 0.2655, + "epoch": 
3.6241755865498972, + "grad_norm": 0.2964288294315338, + "learning_rate": 9.303878081204165e-06, + "loss": 0.4123, "step": 100560 }, { - "epoch": 3.54, - "learning_rate": 1.041190849430457e-05, - "loss": 0.2611, + "epoch": 3.624355786211122, + "grad_norm": 0.2711564600467682, + "learning_rate": 9.301606892387276e-06, + "loss": 0.3638, "step": 100565 }, { - "epoch": 3.54, - "learning_rate": 1.0409595153440497e-05, - "loss": 0.2421, + "epoch": 3.6245359858723467, + "grad_norm": 0.2122730016708374, + "learning_rate": 9.299335917457958e-06, + "loss": 0.376, "step": 100570 }, { - "epoch": 3.54, - "learning_rate": 1.0407282002023203e-05, - "loss": 0.2549, + "epoch": 3.624716185533571, + "grad_norm": 0.21361200511455536, + "learning_rate": 9.297065156447147e-06, + "loss": 0.3711, "step": 100575 }, { - "epoch": 3.54, - "learning_rate": 1.0404969040082746e-05, - "loss": 0.2332, + "epoch": 3.6248963851947957, + "grad_norm": 0.24864503741264343, + "learning_rate": 9.294794609385773e-06, + "loss": 0.3661, "step": 100580 }, { - "epoch": 3.54, - "learning_rate": 1.0402656267649148e-05, - "loss": 0.23, + "epoch": 3.6250765848560205, + "grad_norm": 0.23032629489898682, + "learning_rate": 9.292524276304792e-06, + "loss": 0.3687, "step": 100585 }, { - "epoch": 3.54, - "learning_rate": 1.0400343684752428e-05, - "loss": 0.2538, + "epoch": 3.625256784517245, + "grad_norm": 0.31852540373802185, + "learning_rate": 9.290254157235135e-06, + "loss": 0.3777, "step": 100590 }, { - "epoch": 3.54, - "learning_rate": 1.0398031291422621e-05, - "loss": 0.2485, + "epoch": 3.62543698417847, + "grad_norm": 0.196120947599411, + "learning_rate": 9.287984252207707e-06, + "loss": 0.3826, "step": 100595 }, { - "epoch": 3.54, - "learning_rate": 1.0395719087689768e-05, - "loss": 0.2665, + "epoch": 3.6256171838396942, + "grad_norm": 0.17373962700366974, + "learning_rate": 9.285714561253458e-06, + "loss": 0.3801, "step": 100600 }, { - "epoch": 3.54, - "learning_rate": 1.0393407073583877e-05, - "loss": 0.2479, + "epoch": 3.625797383500919, + "grad_norm": 0.21692752838134766, + "learning_rate": 9.2834450844033e-06, + "loss": 0.3712, "step": 100605 }, { - "epoch": 3.54, - "learning_rate": 1.0391095249134969e-05, - "loss": 0.2536, + "epoch": 3.6259775831621437, + "grad_norm": 0.23615795373916626, + "learning_rate": 9.281175821688177e-06, + "loss": 0.3827, "step": 100610 }, { - "epoch": 3.54, - "learning_rate": 1.0388783614373052e-05, - "loss": 0.2444, + "epoch": 3.6261577828233684, + "grad_norm": 0.21718338131904602, + "learning_rate": 9.278906773138979e-06, + "loss": 0.3901, "step": 100615 }, { - "epoch": 3.54, - "learning_rate": 1.0386472169328159e-05, - "loss": 0.2464, + "epoch": 3.6263379824845927, + "grad_norm": 0.2522786557674408, + "learning_rate": 9.276637938786626e-06, + "loss": 0.342, "step": 100620 }, { - "epoch": 3.54, - "learning_rate": 1.0384160914030297e-05, - "loss": 0.2735, + "epoch": 3.6265181821458174, + "grad_norm": 0.31845128536224365, + "learning_rate": 9.274369318662044e-06, + "loss": 0.3978, "step": 100625 }, { - "epoch": 3.54, - "learning_rate": 1.0381849848509461e-05, - "loss": 0.2655, + "epoch": 3.626698381807042, + "grad_norm": 0.2796033024787903, + "learning_rate": 9.272100912796137e-06, + "loss": 0.3601, "step": 100630 }, { - "epoch": 3.54, - "learning_rate": 1.0379538972795683e-05, - "loss": 0.259, + "epoch": 3.626878581468267, + "grad_norm": 0.22418884932994843, + "learning_rate": 9.26983272121981e-06, + "loss": 0.3808, "step": 100635 }, { - "epoch": 3.54, - "learning_rate": 1.0377228286918956e-05, - 
"loss": 0.2766, + "epoch": 3.6270587811294916, + "grad_norm": 0.2793503403663635, + "learning_rate": 9.267564743963963e-06, + "loss": 0.4101, "step": 100640 }, { - "epoch": 3.54, - "learning_rate": 1.0374917790909272e-05, - "loss": 0.2667, + "epoch": 3.6272389807907164, + "grad_norm": 0.23813951015472412, + "learning_rate": 9.265296981059496e-06, + "loss": 0.4087, "step": 100645 }, { - "epoch": 3.54, - "learning_rate": 1.0372607484796643e-05, - "loss": 0.2517, + "epoch": 3.6274191804519407, + "grad_norm": 0.27661383152008057, + "learning_rate": 9.263029432537317e-06, + "loss": 0.3816, "step": 100650 }, { - "epoch": 3.54, - "learning_rate": 1.0370297368611073e-05, - "loss": 0.28, + "epoch": 3.6275993801131654, + "grad_norm": 0.25448471307754517, + "learning_rate": 9.260762098428319e-06, + "loss": 0.3711, "step": 100655 }, { - "epoch": 3.54, - "learning_rate": 1.0367987442382554e-05, - "loss": 0.2532, + "epoch": 3.62777957977439, + "grad_norm": 0.18283936381340027, + "learning_rate": 9.258494978763387e-06, + "loss": 0.3951, "step": 100660 }, { - "epoch": 3.54, - "learning_rate": 1.0365677706141072e-05, - "loss": 0.2579, + "epoch": 3.6279597794356144, + "grad_norm": 0.22228430211544037, + "learning_rate": 9.256228073573413e-06, + "loss": 0.3452, "step": 100665 }, { - "epoch": 3.54, - "learning_rate": 1.0363368159916614e-05, - "loss": 0.2502, + "epoch": 3.628139979096839, + "grad_norm": 0.185472771525383, + "learning_rate": 9.253961382889278e-06, + "loss": 0.4253, "step": 100670 }, { - "epoch": 3.54, - "learning_rate": 1.0361058803739182e-05, - "loss": 0.2588, + "epoch": 3.628320178758064, + "grad_norm": 0.18725287914276123, + "learning_rate": 9.251694906741879e-06, + "loss": 0.3513, "step": 100675 }, { - "epoch": 3.54, - "learning_rate": 1.0358749637638754e-05, - "loss": 0.2443, + "epoch": 3.6285003784192886, + "grad_norm": 0.22151315212249756, + "learning_rate": 9.249428645162095e-06, + "loss": 0.3352, "step": 100680 }, { - "epoch": 3.54, - "learning_rate": 1.0356440661645313e-05, - "loss": 0.2402, + "epoch": 3.6286805780805134, + "grad_norm": 0.22003361582756042, + "learning_rate": 9.247162598180777e-06, + "loss": 0.3763, "step": 100685 }, { - "epoch": 3.54, - "learning_rate": 1.0354131875788833e-05, - "loss": 0.2557, + "epoch": 3.628860777741738, + "grad_norm": 0.280362993478775, + "learning_rate": 9.244896765828831e-06, + "loss": 0.3839, "step": 100690 }, { - "epoch": 3.54, - "learning_rate": 1.0351823280099304e-05, - "loss": 0.2556, + "epoch": 3.6290409774029624, + "grad_norm": 0.24455983936786652, + "learning_rate": 9.242631148137114e-06, + "loss": 0.3854, "step": 100695 }, { - "epoch": 3.54, - "learning_rate": 1.0349514874606692e-05, - "loss": 0.2534, + "epoch": 3.629221177064187, + "grad_norm": 0.28553998470306396, + "learning_rate": 9.240365745136498e-06, + "loss": 0.4387, "step": 100700 }, { - "epoch": 3.54, - "learning_rate": 1.034720665934098e-05, - "loss": 0.255, + "epoch": 3.629401376725412, + "grad_norm": 0.22999370098114014, + "learning_rate": 9.238100556857847e-06, + "loss": 0.3486, "step": 100705 }, { - "epoch": 3.54, - "learning_rate": 1.0344898634332123e-05, - "loss": 0.258, + "epoch": 3.629581576386636, + "grad_norm": 0.24775420129299164, + "learning_rate": 9.235835583332017e-06, + "loss": 0.4171, "step": 100710 }, { - "epoch": 3.54, - "learning_rate": 1.0342590799610111e-05, - "loss": 0.2445, + "epoch": 3.629761776047861, + "grad_norm": 0.30555811524391174, + "learning_rate": 9.233570824589881e-06, + "loss": 0.4051, "step": 100715 }, { - "epoch": 3.54, - "learning_rate": 
1.0340283155204895e-05, - "loss": 0.2602, + "epoch": 3.6299419757090856, + "grad_norm": 0.22852908074855804, + "learning_rate": 9.23130628066229e-06, + "loss": 0.3686, "step": 100720 }, { - "epoch": 3.54, - "learning_rate": 1.033797570114644e-05, - "loss": 0.2574, + "epoch": 3.6301221753703103, + "grad_norm": 0.20418477058410645, + "learning_rate": 9.229041951580101e-06, + "loss": 0.3467, "step": 100725 }, { - "epoch": 3.54, - "learning_rate": 1.0335668437464705e-05, - "loss": 0.2721, + "epoch": 3.630302375031535, + "grad_norm": 0.2210427224636078, + "learning_rate": 9.226777837374163e-06, + "loss": 0.3732, "step": 100730 }, { - "epoch": 3.54, - "learning_rate": 1.0333361364189652e-05, - "loss": 0.2454, + "epoch": 3.63048257469276, + "grad_norm": 0.2393556535243988, + "learning_rate": 9.224513938075318e-06, + "loss": 0.4281, "step": 100735 }, { - "epoch": 3.54, - "learning_rate": 1.0331054481351244e-05, - "loss": 0.2615, + "epoch": 3.630662774353984, + "grad_norm": 0.2593443691730499, + "learning_rate": 9.222250253714413e-06, + "loss": 0.3624, "step": 100740 }, { - "epoch": 3.54, - "learning_rate": 1.0328747788979412e-05, - "loss": 0.2755, + "epoch": 3.630842974015209, + "grad_norm": 0.2505268454551697, + "learning_rate": 9.219986784322299e-06, + "loss": 0.3829, "step": 100745 }, { - "epoch": 3.54, - "learning_rate": 1.0326441287104132e-05, - "loss": 0.244, + "epoch": 3.6310231736764336, + "grad_norm": 0.2201002836227417, + "learning_rate": 9.217723529929812e-06, + "loss": 0.405, "step": 100750 }, { - "epoch": 3.54, - "learning_rate": 1.0324134975755334e-05, - "loss": 0.2653, + "epoch": 3.631203373337658, + "grad_norm": 0.2460431605577469, + "learning_rate": 9.215460490567784e-06, + "loss": 0.3749, "step": 100755 }, { - "epoch": 3.55, - "learning_rate": 1.0321828854962978e-05, - "loss": 0.2529, + "epoch": 3.6313835729988826, + "grad_norm": 0.2341911494731903, + "learning_rate": 9.213197666267052e-06, + "loss": 0.4055, "step": 100760 }, { - "epoch": 3.55, - "learning_rate": 1.0319522924756994e-05, - "loss": 0.2557, + "epoch": 3.6315637726601073, + "grad_norm": 0.25453901290893555, + "learning_rate": 9.210935057058437e-06, + "loss": 0.4276, "step": 100765 }, { - "epoch": 3.55, - "learning_rate": 1.0317217185167338e-05, - "loss": 0.2609, + "epoch": 3.631743972321332, + "grad_norm": 0.22719573974609375, + "learning_rate": 9.208672662972783e-06, + "loss": 0.3768, "step": 100770 }, { - "epoch": 3.55, - "learning_rate": 1.0314911636223942e-05, - "loss": 0.2805, + "epoch": 3.631924171982557, + "grad_norm": 0.22030307352542877, + "learning_rate": 9.206410484040914e-06, + "loss": 0.3629, "step": 100775 }, { - "epoch": 3.55, - "learning_rate": 1.0312606277956738e-05, - "loss": 0.2377, + "epoch": 3.6321043716437815, + "grad_norm": 0.27278268337249756, + "learning_rate": 9.204148520293629e-06, + "loss": 0.3757, "step": 100780 }, { - "epoch": 3.55, - "learning_rate": 1.0310301110395657e-05, - "loss": 0.2782, + "epoch": 3.632284571305006, + "grad_norm": 0.2132975459098816, + "learning_rate": 9.201886771761772e-06, + "loss": 0.3837, "step": 100785 }, { - "epoch": 3.55, - "learning_rate": 1.030799613357064e-05, - "loss": 0.2543, + "epoch": 3.6324647709662305, + "grad_norm": 0.2718239724636078, + "learning_rate": 9.199625238476139e-06, + "loss": 0.3942, "step": 100790 }, { - "epoch": 3.55, - "learning_rate": 1.0305691347511612e-05, - "loss": 0.2685, + "epoch": 3.6326449706274553, + "grad_norm": 0.2489858865737915, + "learning_rate": 9.19736392046757e-06, + "loss": 0.3506, "step": 100795 }, { - "epoch": 
3.55, - "learning_rate": 1.0303386752248497e-05, - "loss": 0.2632, + "epoch": 3.63282517028868, + "grad_norm": 0.2690345048904419, + "learning_rate": 9.195102817766851e-06, + "loss": 0.365, "step": 100800 }, { - "epoch": 3.55, - "learning_rate": 1.0301082347811214e-05, - "loss": 0.2473, + "epoch": 3.6330053699499043, + "grad_norm": 0.25727832317352295, + "learning_rate": 9.192841930404786e-06, + "loss": 0.3702, "step": 100805 }, { - "epoch": 3.55, - "learning_rate": 1.0298778134229689e-05, - "loss": 0.2704, + "epoch": 3.633185569611129, + "grad_norm": 0.28264495730400085, + "learning_rate": 9.190581258412198e-06, + "loss": 0.3825, "step": 100810 }, { - "epoch": 3.55, - "learning_rate": 1.0296474111533849e-05, - "loss": 0.2618, + "epoch": 3.6333657692723538, + "grad_norm": 0.2584122121334076, + "learning_rate": 9.18832080181988e-06, + "loss": 0.4279, "step": 100815 }, { - "epoch": 3.55, - "learning_rate": 1.0294170279753602e-05, - "loss": 0.2552, + "epoch": 3.6335459689335785, + "grad_norm": 0.2514478862285614, + "learning_rate": 9.18606056065863e-06, + "loss": 0.3661, "step": 100820 }, { - "epoch": 3.55, - "learning_rate": 1.0291866638918854e-05, - "loss": 0.2588, + "epoch": 3.6337261685948032, + "grad_norm": 0.2434535175561905, + "learning_rate": 9.183800534959245e-06, + "loss": 0.3543, "step": 100825 }, { - "epoch": 3.55, - "learning_rate": 1.0289563189059534e-05, - "loss": 0.2607, + "epoch": 3.6339063682560275, + "grad_norm": 0.225295290350914, + "learning_rate": 9.181540724752504e-06, + "loss": 0.3573, "step": 100830 }, { - "epoch": 3.55, - "learning_rate": 1.0287259930205542e-05, - "loss": 0.2637, + "epoch": 3.6340865679172523, + "grad_norm": 0.24885191023349762, + "learning_rate": 9.179281130069217e-06, + "loss": 0.3833, "step": 100835 }, { - "epoch": 3.55, - "learning_rate": 1.0284956862386783e-05, - "loss": 0.2253, + "epoch": 3.634266767578477, + "grad_norm": 0.2610785961151123, + "learning_rate": 9.177021750940162e-06, + "loss": 0.3577, "step": 100840 }, { - "epoch": 3.55, - "learning_rate": 1.0282653985633154e-05, - "loss": 0.2524, + "epoch": 3.6344469672397017, + "grad_norm": 0.2569064497947693, + "learning_rate": 9.174762587396124e-06, + "loss": 0.3981, "step": 100845 }, { - "epoch": 3.55, - "learning_rate": 1.0280351299974572e-05, - "loss": 0.243, + "epoch": 3.634627166900926, + "grad_norm": 0.308135986328125, + "learning_rate": 9.17250363946788e-06, + "loss": 0.4227, "step": 100850 }, { - "epoch": 3.55, - "learning_rate": 1.0278048805440926e-05, - "loss": 0.2449, + "epoch": 3.6348073665621508, + "grad_norm": 0.2591281235218048, + "learning_rate": 9.170244907186201e-06, + "loss": 0.3902, "step": 100855 }, { - "epoch": 3.55, - "learning_rate": 1.0275746502062108e-05, - "loss": 0.2679, + "epoch": 3.6349875662233755, + "grad_norm": 0.24686266481876373, + "learning_rate": 9.167986390581879e-06, + "loss": 0.3907, "step": 100860 }, { - "epoch": 3.55, - "learning_rate": 1.0273444389868025e-05, - "loss": 0.2617, + "epoch": 3.6351677658846, + "grad_norm": 0.21753914654254913, + "learning_rate": 9.165728089685687e-06, + "loss": 0.398, "step": 100865 }, { - "epoch": 3.55, - "learning_rate": 1.0271142468888553e-05, - "loss": 0.2594, + "epoch": 3.635347965545825, + "grad_norm": 0.23476707935333252, + "learning_rate": 9.163470004528366e-06, + "loss": 0.3879, "step": 100870 }, { - "epoch": 3.55, - "learning_rate": 1.0268840739153598e-05, - "loss": 0.2612, + "epoch": 3.6355281652070492, + "grad_norm": 0.2765940725803375, + "learning_rate": 9.16121213514071e-06, + "loss": 0.4109, "step": 
100875 }, { - "epoch": 3.55, - "learning_rate": 1.026653920069303e-05, - "loss": 0.2597, + "epoch": 3.635708364868274, + "grad_norm": 0.2919420301914215, + "learning_rate": 9.15895448155347e-06, + "loss": 0.3816, "step": 100880 }, { - "epoch": 3.55, - "learning_rate": 1.0264237853536748e-05, - "loss": 0.2657, + "epoch": 3.6358885645294987, + "grad_norm": 0.2056131362915039, + "learning_rate": 9.156697043797402e-06, + "loss": 0.3565, "step": 100885 }, { - "epoch": 3.55, - "learning_rate": 1.0261936697714628e-05, - "loss": 0.259, + "epoch": 3.6360687641907234, + "grad_norm": 0.29402926564216614, + "learning_rate": 9.154439821903286e-06, + "loss": 0.4164, "step": 100890 }, { - "epoch": 3.55, - "learning_rate": 1.0259635733256548e-05, - "loss": 0.254, + "epoch": 3.6362489638519477, + "grad_norm": 0.2530692517757416, + "learning_rate": 9.152182815901841e-06, + "loss": 0.3703, "step": 100895 }, { - "epoch": 3.55, - "learning_rate": 1.0257334960192372e-05, - "loss": 0.2554, + "epoch": 3.6364291635131725, + "grad_norm": 0.24608898162841797, + "learning_rate": 9.149926025823846e-06, + "loss": 0.3821, "step": 100900 }, { - "epoch": 3.55, - "learning_rate": 1.0255034378551994e-05, - "loss": 0.2457, + "epoch": 3.636609363174397, + "grad_norm": 0.1967172771692276, + "learning_rate": 9.147669451700042e-06, + "loss": 0.3682, "step": 100905 }, { - "epoch": 3.55, - "learning_rate": 1.0252733988365281e-05, - "loss": 0.2942, + "epoch": 3.636789562835622, + "grad_norm": 0.2919192910194397, + "learning_rate": 9.14541309356117e-06, + "loss": 0.379, "step": 100910 }, { - "epoch": 3.55, - "learning_rate": 1.0250433789662087e-05, - "loss": 0.2503, + "epoch": 3.6369697624968467, + "grad_norm": 0.22142526507377625, + "learning_rate": 9.143156951437976e-06, + "loss": 0.3938, "step": 100915 }, { - "epoch": 3.55, - "learning_rate": 1.0248133782472299e-05, - "loss": 0.2548, + "epoch": 3.6371499621580714, + "grad_norm": 0.22058270871639252, + "learning_rate": 9.140901025361198e-06, + "loss": 0.3691, "step": 100920 }, { - "epoch": 3.55, - "learning_rate": 1.0245833966825766e-05, - "loss": 0.2613, + "epoch": 3.6373301618192957, + "grad_norm": 0.2393786460161209, + "learning_rate": 9.138645315361566e-06, + "loss": 0.3929, "step": 100925 }, { - "epoch": 3.55, - "learning_rate": 1.0243534342752362e-05, - "loss": 0.2564, + "epoch": 3.6375103614805204, + "grad_norm": 0.26111987233161926, + "learning_rate": 9.13638982146983e-06, + "loss": 0.4086, "step": 100930 }, { - "epoch": 3.55, - "learning_rate": 1.0241234910281938e-05, - "loss": 0.2618, + "epoch": 3.637690561141745, + "grad_norm": 0.2450508177280426, + "learning_rate": 9.134134543716711e-06, + "loss": 0.3776, "step": 100935 }, { - "epoch": 3.55, - "learning_rate": 1.0238935669444347e-05, - "loss": 0.2408, + "epoch": 3.6378707608029694, + "grad_norm": 0.26415956020355225, + "learning_rate": 9.131879482132935e-06, + "loss": 0.4035, "step": 100940 }, { - "epoch": 3.55, - "learning_rate": 1.0236636620269455e-05, - "loss": 0.2501, + "epoch": 3.638050960464194, + "grad_norm": 0.21556943655014038, + "learning_rate": 9.12962463674923e-06, + "loss": 0.3617, "step": 100945 }, { - "epoch": 3.55, - "learning_rate": 1.0234337762787107e-05, - "loss": 0.2572, + "epoch": 3.638231160125419, + "grad_norm": 0.26129695773124695, + "learning_rate": 9.127370007596308e-06, + "loss": 0.3823, "step": 100950 }, { - "epoch": 3.55, - "learning_rate": 1.0232039097027151e-05, - "loss": 0.2402, + "epoch": 3.6384113597866437, + "grad_norm": 0.2311808317899704, + "learning_rate": 9.125115594704905e-06, + 
"loss": 0.3735, "step": 100955 }, { - "epoch": 3.55, - "learning_rate": 1.0229740623019429e-05, - "loss": 0.2628, + "epoch": 3.6385915594478684, + "grad_norm": 0.24276402592658997, + "learning_rate": 9.122861398105736e-06, + "loss": 0.3719, "step": 100960 }, { - "epoch": 3.55, - "learning_rate": 1.0227442340793799e-05, - "loss": 0.2499, + "epoch": 3.638771759109093, + "grad_norm": 0.2854870557785034, + "learning_rate": 9.12060741782949e-06, + "loss": 0.3717, "step": 100965 }, { - "epoch": 3.55, - "learning_rate": 1.0225144250380086e-05, - "loss": 0.2416, + "epoch": 3.6389519587703174, + "grad_norm": 0.2724928557872772, + "learning_rate": 9.118353653906905e-06, + "loss": 0.4111, "step": 100970 }, { - "epoch": 3.55, - "learning_rate": 1.0222846351808146e-05, - "loss": 0.2722, + "epoch": 3.639132158431542, + "grad_norm": 0.22794464230537415, + "learning_rate": 9.116100106368667e-06, + "loss": 0.3872, "step": 100975 }, { - "epoch": 3.55, - "learning_rate": 1.02205486451078e-05, - "loss": 0.251, + "epoch": 3.639312358092767, + "grad_norm": 0.2465512901544571, + "learning_rate": 9.113846775245506e-06, + "loss": 0.3988, "step": 100980 }, { - "epoch": 3.55, - "learning_rate": 1.0218251130308898e-05, - "loss": 0.2553, + "epoch": 3.639492557753991, + "grad_norm": 0.24606558680534363, + "learning_rate": 9.111593660568099e-06, + "loss": 0.3761, "step": 100985 }, { - "epoch": 3.55, - "learning_rate": 1.0215953807441262e-05, - "loss": 0.2722, + "epoch": 3.639672757415216, + "grad_norm": 0.1907157152891159, + "learning_rate": 9.109340762367141e-06, + "loss": 0.3817, "step": 100990 }, { - "epoch": 3.55, - "learning_rate": 1.0213656676534717e-05, - "loss": 0.2825, + "epoch": 3.6398529570764406, + "grad_norm": 0.2616572976112366, + "learning_rate": 9.107088080673351e-06, + "loss": 0.4135, "step": 100995 }, { - "epoch": 3.55, - "learning_rate": 1.02113597376191e-05, - "loss": 0.2753, + "epoch": 3.6400331567376654, + "grad_norm": 0.2173600196838379, + "learning_rate": 9.104835615517406e-06, + "loss": 0.3634, "step": 101000 }, { - "epoch": 3.55, - "eval_loss": 0.2520409822463989, - "eval_runtime": 10.5341, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 3.6400331567376654, + "eval_loss": 0.4288954436779022, + "eval_runtime": 3.5295, + "eval_samples_per_second": 28.333, + "eval_steps_per_second": 7.083, "step": 101000 }, { - "epoch": 3.55, - "learning_rate": 1.0209062990724233e-05, - "loss": 0.2884, + "epoch": 3.64021335639889, + "grad_norm": 0.2579386234283447, + "learning_rate": 9.10258336693e-06, + "loss": 0.3848, "step": 101005 }, { - "epoch": 3.55, - "learning_rate": 1.0206766435879937e-05, - "loss": 0.2577, + "epoch": 3.640393556060115, + "grad_norm": 0.21844492852687836, + "learning_rate": 9.10033133494182e-06, + "loss": 0.4074, "step": 101010 }, { - "epoch": 3.55, - "learning_rate": 1.0204470073116021e-05, - "loss": 0.2315, + "epoch": 3.640573755721339, + "grad_norm": 0.25436314940452576, + "learning_rate": 9.098079519583536e-06, + "loss": 0.3451, "step": 101015 }, { - "epoch": 3.55, - "learning_rate": 1.0202173902462317e-05, - "loss": 0.2592, + "epoch": 3.640753955382564, + "grad_norm": 0.237601637840271, + "learning_rate": 9.095827920885847e-06, + "loss": 0.3692, "step": 101020 }, { - "epoch": 3.55, - "learning_rate": 1.0199877923948634e-05, - "loss": 0.2662, + "epoch": 3.6409341550437886, + "grad_norm": 0.1813516467809677, + "learning_rate": 9.093576538879425e-06, + "loss": 0.408, "step": 101025 }, { - "epoch": 3.55, - "learning_rate": 1.0197582137604773e-05, - 
"loss": 0.2575, + "epoch": 3.641114354705013, + "grad_norm": 0.23129382729530334, + "learning_rate": 9.091325373594944e-06, + "loss": 0.3885, "step": 101030 }, { - "epoch": 3.55, - "learning_rate": 1.0195286543460553e-05, - "loss": 0.2617, + "epoch": 3.6412945543662376, + "grad_norm": 0.24837276339530945, + "learning_rate": 9.089074425063074e-06, + "loss": 0.4024, "step": 101035 }, { - "epoch": 3.55, - "learning_rate": 1.0192991141545788e-05, - "loss": 0.2393, + "epoch": 3.6414747540274623, + "grad_norm": 0.23222190141677856, + "learning_rate": 9.086823693314476e-06, + "loss": 0.3736, "step": 101040 }, { - "epoch": 3.56, - "learning_rate": 1.0190695931890277e-05, - "loss": 0.2726, + "epoch": 3.641654953688687, + "grad_norm": 0.2549395263195038, + "learning_rate": 9.084573178379832e-06, + "loss": 0.3688, "step": 101045 }, { - "epoch": 3.56, - "learning_rate": 1.0188400914523818e-05, - "loss": 0.2583, + "epoch": 3.641835153349912, + "grad_norm": 0.3221529722213745, + "learning_rate": 9.082322880289798e-06, + "loss": 0.4044, "step": 101050 }, { - "epoch": 3.56, - "learning_rate": 1.0186106089476205e-05, - "loss": 0.2676, + "epoch": 3.6420153530111365, + "grad_norm": 0.19385772943496704, + "learning_rate": 9.080072799075033e-06, + "loss": 0.3395, "step": 101055 }, { - "epoch": 3.56, - "learning_rate": 1.018381145677725e-05, - "loss": 0.2549, + "epoch": 3.642195552672361, + "grad_norm": 0.21461910009384155, + "learning_rate": 9.077822934766194e-06, + "loss": 0.3626, "step": 101060 }, { - "epoch": 3.56, - "learning_rate": 1.0181517016456738e-05, - "loss": 0.2461, + "epoch": 3.6423757523335856, + "grad_norm": 0.2746890187263489, + "learning_rate": 9.075573287393935e-06, + "loss": 0.3974, "step": 101065 }, { - "epoch": 3.56, - "learning_rate": 1.0179222768544463e-05, - "loss": 0.2421, + "epoch": 3.6425559519948103, + "grad_norm": 0.22592508792877197, + "learning_rate": 9.073323856988898e-06, + "loss": 0.3628, "step": 101070 }, { - "epoch": 3.56, - "learning_rate": 1.0176928713070205e-05, - "loss": 0.2284, + "epoch": 3.642736151656035, + "grad_norm": 0.17644916474819183, + "learning_rate": 9.071074643581757e-06, + "loss": 0.3879, "step": 101075 }, { - "epoch": 3.56, - "learning_rate": 1.0174634850063765e-05, - "loss": 0.2318, + "epoch": 3.6429163513172593, + "grad_norm": 0.20565937459468842, + "learning_rate": 9.068825647203122e-06, + "loss": 0.3596, "step": 101080 }, { - "epoch": 3.56, - "learning_rate": 1.0172341179554915e-05, - "loss": 0.2498, + "epoch": 3.643096550978484, + "grad_norm": 0.2097647488117218, + "learning_rate": 9.066576867883664e-06, + "loss": 0.3634, "step": 101085 }, { - "epoch": 3.56, - "learning_rate": 1.017004770157344e-05, - "loss": 0.2441, + "epoch": 3.643276750639709, + "grad_norm": 0.24162964522838593, + "learning_rate": 9.06432830565401e-06, + "loss": 0.3715, "step": 101090 }, { - "epoch": 3.56, - "learning_rate": 1.0167754416149134e-05, - "loss": 0.2806, + "epoch": 3.6434569503009335, + "grad_norm": 0.2646183371543884, + "learning_rate": 9.062079960544798e-06, + "loss": 0.3756, "step": 101095 }, { - "epoch": 3.56, - "learning_rate": 1.0165461323311762e-05, - "loss": 0.2589, + "epoch": 3.6436371499621583, + "grad_norm": 0.20252090692520142, + "learning_rate": 9.05983183258666e-06, + "loss": 0.3804, "step": 101100 }, { - "epoch": 3.56, - "learning_rate": 1.0163168423091096e-05, - "loss": 0.2643, + "epoch": 3.6438173496233826, + "grad_norm": 0.2239752858877182, + "learning_rate": 9.057583921810225e-06, + "loss": 0.34, "step": 101105 }, { - "epoch": 3.56, - "learning_rate": 
1.0160875715516902e-05, - "loss": 0.2619, + "epoch": 3.6439975492846073, + "grad_norm": 0.23255230486392975, + "learning_rate": 9.055336228246119e-06, + "loss": 0.3719, "step": 101110 }, { - "epoch": 3.56, - "learning_rate": 1.0158583200618967e-05, - "loss": 0.2684, + "epoch": 3.644177748945832, + "grad_norm": 0.23270903527736664, + "learning_rate": 9.053088751924976e-06, + "loss": 0.3866, "step": 101115 }, { - "epoch": 3.56, - "learning_rate": 1.015629087842705e-05, - "loss": 0.2496, + "epoch": 3.6443579486070568, + "grad_norm": 0.27508658170700073, + "learning_rate": 9.05084149287741e-06, + "loss": 0.4123, "step": 101120 }, { - "epoch": 3.56, - "learning_rate": 1.015399874897091e-05, - "loss": 0.2581, + "epoch": 3.644538148268281, + "grad_norm": 0.19277596473693848, + "learning_rate": 9.048594451134042e-06, + "loss": 0.3398, "step": 101125 }, { - "epoch": 3.56, - "learning_rate": 1.0151706812280306e-05, - "loss": 0.2518, + "epoch": 3.6447183479295058, + "grad_norm": 0.28608769178390503, + "learning_rate": 9.046347626725487e-06, + "loss": 0.4222, "step": 101130 }, { - "epoch": 3.56, - "learning_rate": 1.0149415068385013e-05, - "loss": 0.2583, + "epoch": 3.6448985475907305, + "grad_norm": 0.21985885500907898, + "learning_rate": 9.04410101968235e-06, + "loss": 0.3681, "step": 101135 }, { - "epoch": 3.56, - "learning_rate": 1.0147123517314765e-05, - "loss": 0.2328, + "epoch": 3.6450787472519552, + "grad_norm": 0.2108525186777115, + "learning_rate": 9.04185463003525e-06, + "loss": 0.338, "step": 101140 }, { - "epoch": 3.56, - "learning_rate": 1.0144832159099343e-05, - "loss": 0.2369, + "epoch": 3.64525894691318, + "grad_norm": 0.28811293840408325, + "learning_rate": 9.039608457814805e-06, + "loss": 0.4011, "step": 101145 }, { - "epoch": 3.56, - "learning_rate": 1.0142540993768476e-05, - "loss": 0.2563, + "epoch": 3.6454391465744047, + "grad_norm": 0.2849256992340088, + "learning_rate": 9.037362503051585e-06, + "loss": 0.4098, "step": 101150 }, { - "epoch": 3.56, - "learning_rate": 1.0140250021351932e-05, - "loss": 0.2836, + "epoch": 3.645619346235629, + "grad_norm": 0.2216586172580719, + "learning_rate": 9.035116765776223e-06, + "loss": 0.355, "step": 101155 }, { - "epoch": 3.56, - "learning_rate": 1.013795924187945e-05, - "loss": 0.2418, + "epoch": 3.6457995458968537, + "grad_norm": 0.22768335044384003, + "learning_rate": 9.032871246019292e-06, + "loss": 0.368, "step": 101160 }, { - "epoch": 3.56, - "learning_rate": 1.013566865538077e-05, - "loss": 0.2433, + "epoch": 3.6459797455580785, + "grad_norm": 0.23428839445114136, + "learning_rate": 9.030625943811408e-06, + "loss": 0.3678, "step": 101165 }, { - "epoch": 3.56, - "learning_rate": 1.0133378261885626e-05, - "loss": 0.2598, + "epoch": 3.6461599452193028, + "grad_norm": 0.2779921889305115, + "learning_rate": 9.028380859183164e-06, + "loss": 0.3978, "step": 101170 }, { - "epoch": 3.56, - "learning_rate": 1.0131088061423779e-05, - "loss": 0.2743, + "epoch": 3.6463401448805275, + "grad_norm": 0.30054476857185364, + "learning_rate": 9.02613599216512e-06, + "loss": 0.3748, "step": 101175 }, { - "epoch": 3.56, - "learning_rate": 1.0128798054024954e-05, - "loss": 0.2507, + "epoch": 3.6465203445417522, + "grad_norm": 0.27546700835227966, + "learning_rate": 9.023891342787888e-06, + "loss": 0.3651, "step": 101180 }, { - "epoch": 3.56, - "learning_rate": 1.0126508239718888e-05, - "loss": 0.2571, + "epoch": 3.646700544202977, + "grad_norm": 0.23770831525325775, + "learning_rate": 9.021646911082044e-06, + "loss": 0.3828, "step": 101185 }, { - "epoch": 
3.56, - "learning_rate": 1.01242186185353e-05, - "loss": 0.2496, + "epoch": 3.6468807438642017, + "grad_norm": 0.2967265248298645, + "learning_rate": 9.019402697078167e-06, + "loss": 0.3796, "step": 101190 }, { - "epoch": 3.56, - "learning_rate": 1.0121929190503929e-05, - "loss": 0.2665, + "epoch": 3.6470609435254264, + "grad_norm": 0.22645854949951172, + "learning_rate": 9.017158700806835e-06, + "loss": 0.3777, "step": 101195 }, { - "epoch": 3.56, - "learning_rate": 1.0119639955654512e-05, - "loss": 0.2489, + "epoch": 3.6472411431866507, + "grad_norm": 0.21501436829566956, + "learning_rate": 9.014914922298612e-06, + "loss": 0.3483, "step": 101200 }, { - "epoch": 3.56, - "learning_rate": 1.0117350914016754e-05, - "loss": 0.2636, + "epoch": 3.6474213428478754, + "grad_norm": 0.25435328483581543, + "learning_rate": 9.012671361584088e-06, + "loss": 0.4062, "step": 101205 }, { - "epoch": 3.56, - "learning_rate": 1.0115062065620398e-05, - "loss": 0.2412, + "epoch": 3.6476015425091, + "grad_norm": 0.21039626002311707, + "learning_rate": 9.010428018693823e-06, + "loss": 0.3534, "step": 101210 }, { - "epoch": 3.56, - "learning_rate": 1.011277341049515e-05, - "loss": 0.2654, + "epoch": 3.6477817421703245, + "grad_norm": 0.24590708315372467, + "learning_rate": 9.008184893658378e-06, + "loss": 0.3897, "step": 101215 }, { - "epoch": 3.56, - "learning_rate": 1.0110484948670726e-05, - "loss": 0.2726, + "epoch": 3.647961941831549, + "grad_norm": 0.23743712902069092, + "learning_rate": 9.005941986508318e-06, + "loss": 0.375, "step": 101220 }, { - "epoch": 3.56, - "learning_rate": 1.0108196680176835e-05, - "loss": 0.251, + "epoch": 3.648142141492774, + "grad_norm": 0.23340876400470734, + "learning_rate": 9.0036992972742e-06, + "loss": 0.368, "step": 101225 }, { - "epoch": 3.56, - "learning_rate": 1.0105908605043204e-05, - "loss": 0.2586, + "epoch": 3.6483223411539987, + "grad_norm": 0.2544988691806793, + "learning_rate": 9.001456825986579e-06, + "loss": 0.3809, "step": 101230 }, { - "epoch": 3.56, - "learning_rate": 1.0103620723299537e-05, - "loss": 0.2657, + "epoch": 3.6485025408152234, + "grad_norm": 0.24296405911445618, + "learning_rate": 8.999214572676015e-06, + "loss": 0.3803, "step": 101235 }, { - "epoch": 3.56, - "learning_rate": 1.0101333034975535e-05, - "loss": 0.2426, + "epoch": 3.648682740476448, + "grad_norm": 0.2581973969936371, + "learning_rate": 8.996972537373057e-06, + "loss": 0.364, "step": 101240 }, { - "epoch": 3.56, - "learning_rate": 1.0099045540100898e-05, - "loss": 0.261, + "epoch": 3.6488629401376724, + "grad_norm": 0.22508592903614044, + "learning_rate": 8.99473072010825e-06, + "loss": 0.3848, "step": 101245 }, { - "epoch": 3.56, - "learning_rate": 1.0096758238705334e-05, - "loss": 0.2671, + "epoch": 3.649043139798897, + "grad_norm": 0.2491467446088791, + "learning_rate": 8.992489120912138e-06, + "loss": 0.3736, "step": 101250 }, { - "epoch": 3.56, - "learning_rate": 1.009447113081855e-05, - "loss": 0.2714, + "epoch": 3.649223339460122, + "grad_norm": 0.24909654259681702, + "learning_rate": 8.990247739815252e-06, + "loss": 0.3896, "step": 101255 }, { - "epoch": 3.56, - "learning_rate": 1.0092184216470238e-05, - "loss": 0.254, + "epoch": 3.649403539121346, + "grad_norm": 0.25092822313308716, + "learning_rate": 8.988006576848159e-06, + "loss": 0.3498, "step": 101260 }, { - "epoch": 3.56, - "learning_rate": 1.0089897495690079e-05, - "loss": 0.2622, + "epoch": 3.649583738782571, + "grad_norm": 0.2587062120437622, + "learning_rate": 8.98576563204136e-06, + "loss": 0.3853, "step": 101265 
}, { - "epoch": 3.56, - "learning_rate": 1.0087610968507786e-05, - "loss": 0.2505, + "epoch": 3.6497639384437957, + "grad_norm": 0.27524474263191223, + "learning_rate": 8.983524905425413e-06, + "loss": 0.3667, "step": 101270 }, { - "epoch": 3.56, - "learning_rate": 1.0085324634953033e-05, - "loss": 0.2544, + "epoch": 3.6499441381050204, + "grad_norm": 0.2666233479976654, + "learning_rate": 8.981284397030837e-06, + "loss": 0.37, "step": 101275 }, { - "epoch": 3.56, - "learning_rate": 1.0083038495055511e-05, - "loss": 0.2556, + "epoch": 3.650124337766245, + "grad_norm": 0.21499104797840118, + "learning_rate": 8.979044106888152e-06, + "loss": 0.3844, "step": 101280 }, { - "epoch": 3.56, - "learning_rate": 1.0080752548844895e-05, - "loss": 0.241, + "epoch": 3.65030453742747, + "grad_norm": 0.22629836201667786, + "learning_rate": 8.976804035027905e-06, + "loss": 0.3803, "step": 101285 }, { - "epoch": 3.56, - "learning_rate": 1.0078466796350885e-05, - "loss": 0.2487, + "epoch": 3.650484737088694, + "grad_norm": 0.24577264487743378, + "learning_rate": 8.974564181480594e-06, + "loss": 0.3886, "step": 101290 }, { - "epoch": 3.56, - "learning_rate": 1.0076181237603152e-05, - "loss": 0.2585, + "epoch": 3.650664936749919, + "grad_norm": 0.22071078419685364, + "learning_rate": 8.972324546276733e-06, + "loss": 0.402, "step": 101295 }, { - "epoch": 3.56, - "learning_rate": 1.0073895872631359e-05, - "loss": 0.265, + "epoch": 3.6508451364111436, + "grad_norm": 0.2849634289741516, + "learning_rate": 8.97008512944686e-06, + "loss": 0.3814, "step": 101300 }, { - "epoch": 3.56, - "learning_rate": 1.00716107014652e-05, - "loss": 0.2519, + "epoch": 3.6510253360723683, + "grad_norm": 0.22348150610923767, + "learning_rate": 8.967845931021469e-06, + "loss": 0.375, "step": 101305 }, { - "epoch": 3.56, - "learning_rate": 1.0069325724134328e-05, - "loss": 0.2672, + "epoch": 3.6512055357335926, + "grad_norm": 0.25602954626083374, + "learning_rate": 8.965606951031074e-06, + "loss": 0.3992, "step": 101310 }, { - "epoch": 3.56, - "learning_rate": 1.0067040940668435e-05, - "loss": 0.241, + "epoch": 3.6513857353948174, + "grad_norm": 0.270134836435318, + "learning_rate": 8.96336818950618e-06, + "loss": 0.3896, "step": 101315 }, { - "epoch": 3.56, - "learning_rate": 1.0064756351097163e-05, - "loss": 0.2439, + "epoch": 3.651565935056042, + "grad_norm": 0.2468041628599167, + "learning_rate": 8.961129646477281e-06, + "loss": 0.3846, "step": 101320 }, { - "epoch": 3.56, - "learning_rate": 1.0062471955450194e-05, - "loss": 0.2713, + "epoch": 3.651746134717267, + "grad_norm": 0.22694005072116852, + "learning_rate": 8.958891321974896e-06, + "loss": 0.3719, "step": 101325 }, { - "epoch": 3.57, - "learning_rate": 1.0060187753757183e-05, - "loss": 0.25, + "epoch": 3.6519263343784916, + "grad_norm": 0.3231867253780365, + "learning_rate": 8.956653216029509e-06, + "loss": 0.4239, "step": 101330 }, { - "epoch": 3.57, - "learning_rate": 1.005790374604779e-05, - "loss": 0.2872, + "epoch": 3.652106534039716, + "grad_norm": 0.23950925469398499, + "learning_rate": 8.954415328671617e-06, + "loss": 0.3259, "step": 101335 }, { - "epoch": 3.57, - "learning_rate": 1.005561993235166e-05, - "loss": 0.2552, + "epoch": 3.6522867337009406, + "grad_norm": 0.30801981687545776, + "learning_rate": 8.95217765993171e-06, + "loss": 0.3592, "step": 101340 }, { - "epoch": 3.57, - "learning_rate": 1.0053336312698466e-05, - "loss": 0.2638, + "epoch": 3.6524669333621653, + "grad_norm": 0.23010605573654175, + "learning_rate": 8.949940209840266e-06, + "loss": 0.3641, 
"step": 101345 }, { - "epoch": 3.57, - "learning_rate": 1.0051052887117848e-05, - "loss": 0.2532, + "epoch": 3.65264713302339, + "grad_norm": 0.2432517111301422, + "learning_rate": 8.947702978427786e-06, + "loss": 0.3871, "step": 101350 }, { - "epoch": 3.57, - "learning_rate": 1.004876965563945e-05, - "loss": 0.2728, + "epoch": 3.6528273326846143, + "grad_norm": 0.21140114963054657, + "learning_rate": 8.945465965724756e-06, + "loss": 0.3849, "step": 101355 }, { - "epoch": 3.57, - "learning_rate": 1.0046486618292933e-05, - "loss": 0.2583, + "epoch": 3.653007532345839, + "grad_norm": 0.263443261384964, + "learning_rate": 8.943229171761628e-06, + "loss": 0.4131, "step": 101360 }, { - "epoch": 3.57, - "learning_rate": 1.0044203775107921e-05, - "loss": 0.2902, + "epoch": 3.653187732007064, + "grad_norm": 0.19522389769554138, + "learning_rate": 8.940992596568897e-06, + "loss": 0.3953, "step": 101365 }, { - "epoch": 3.57, - "learning_rate": 1.0041921126114079e-05, - "loss": 0.2379, + "epoch": 3.6533679316682885, + "grad_norm": 0.26923754811286926, + "learning_rate": 8.938756240177037e-06, + "loss": 0.3825, "step": 101370 }, { - "epoch": 3.57, - "learning_rate": 1.0039638671341031e-05, - "loss": 0.2592, + "epoch": 3.6535481313295133, + "grad_norm": 0.24230341613292694, + "learning_rate": 8.936520102616513e-06, + "loss": 0.3723, "step": 101375 }, { - "epoch": 3.57, - "learning_rate": 1.003735641081841e-05, - "loss": 0.2424, + "epoch": 3.6537283309907376, + "grad_norm": 0.27861183881759644, + "learning_rate": 8.934284183917793e-06, + "loss": 0.3888, "step": 101380 }, { - "epoch": 3.57, - "learning_rate": 1.0035074344575863e-05, - "loss": 0.2278, + "epoch": 3.6539085306519623, + "grad_norm": 0.18770521879196167, + "learning_rate": 8.932048484111333e-06, + "loss": 0.3635, "step": 101385 }, { - "epoch": 3.57, - "learning_rate": 1.003279247264301e-05, - "loss": 0.2786, + "epoch": 3.654088730313187, + "grad_norm": 0.2484419047832489, + "learning_rate": 8.929813003227608e-06, + "loss": 0.3887, "step": 101390 }, { - "epoch": 3.57, - "learning_rate": 1.0030510795049486e-05, - "loss": 0.2683, + "epoch": 3.6542689299744118, + "grad_norm": 0.21241828799247742, + "learning_rate": 8.927577741297072e-06, + "loss": 0.3797, "step": 101395 }, { - "epoch": 3.57, - "learning_rate": 1.0028229311824904e-05, - "loss": 0.2549, + "epoch": 3.654449129635636, + "grad_norm": 0.2524920701980591, + "learning_rate": 8.925342698350175e-06, + "loss": 0.3909, "step": 101400 }, { - "epoch": 3.57, - "learning_rate": 1.0025948022998908e-05, - "loss": 0.2537, + "epoch": 3.654629329296861, + "grad_norm": 0.2676452696323395, + "learning_rate": 8.923107874417372e-06, + "loss": 0.3485, "step": 101405 }, { - "epoch": 3.57, - "learning_rate": 1.00236669286011e-05, - "loss": 0.2422, + "epoch": 3.6548095289580855, + "grad_norm": 0.21999165415763855, + "learning_rate": 8.920873269529112e-06, + "loss": 0.3712, "step": 101410 }, { - "epoch": 3.57, - "learning_rate": 1.0021386028661115e-05, - "loss": 0.2395, + "epoch": 3.6549897286193103, + "grad_norm": 0.23482544720172882, + "learning_rate": 8.918638883715832e-06, + "loss": 0.3802, "step": 101415 }, { - "epoch": 3.57, - "learning_rate": 1.0019105323208549e-05, - "loss": 0.2659, + "epoch": 3.655169928280535, + "grad_norm": 0.2594209313392639, + "learning_rate": 8.916404717007992e-06, + "loss": 0.3737, "step": 101420 }, { - "epoch": 3.57, - "learning_rate": 1.0016824812273042e-05, - "loss": 0.2536, + "epoch": 3.6553501279417597, + "grad_norm": 0.20219695568084717, + "learning_rate": 
8.914170769436025e-06, + "loss": 0.3846, "step": 101425 }, { - "epoch": 3.57, - "learning_rate": 1.0014544495884188e-05, - "loss": 0.2819, + "epoch": 3.655530327602984, + "grad_norm": 0.24486181139945984, + "learning_rate": 8.911937041030365e-06, + "loss": 0.3891, "step": 101430 }, { - "epoch": 3.57, - "learning_rate": 1.001226437407159e-05, - "loss": 0.2659, + "epoch": 3.6557105272642088, + "grad_norm": 0.19330917298793793, + "learning_rate": 8.90970353182145e-06, + "loss": 0.3704, "step": 101435 }, { - "epoch": 3.57, - "learning_rate": 1.000998444686487e-05, - "loss": 0.2414, + "epoch": 3.6558907269254335, + "grad_norm": 0.21298684179782867, + "learning_rate": 8.907470241839703e-06, + "loss": 0.3495, "step": 101440 }, { - "epoch": 3.57, - "learning_rate": 1.0007704714293623e-05, - "loss": 0.2716, + "epoch": 3.6560709265866578, + "grad_norm": 0.25250443816185, + "learning_rate": 8.905237171115563e-06, + "loss": 0.4116, "step": 101445 }, { - "epoch": 3.57, - "learning_rate": 1.0005425176387448e-05, - "loss": 0.2525, + "epoch": 3.6562511262478825, + "grad_norm": 0.20300345122814178, + "learning_rate": 8.90300431967945e-06, + "loss": 0.3616, "step": 101450 }, { - "epoch": 3.57, - "learning_rate": 1.000314583317594e-05, - "loss": 0.2505, + "epoch": 3.6564313259091072, + "grad_norm": 0.2571481466293335, + "learning_rate": 8.900771687561787e-06, + "loss": 0.3844, "step": 101455 }, { - "epoch": 3.57, - "learning_rate": 1.0000866684688706e-05, - "loss": 0.2487, + "epoch": 3.656611525570332, + "grad_norm": 0.18634074926376343, + "learning_rate": 8.898539274792997e-06, + "loss": 0.3538, "step": 101460 }, { - "epoch": 3.57, - "learning_rate": 9.998587730955336e-06, - "loss": 0.2531, + "epoch": 3.6567917252315567, + "grad_norm": 0.2489531934261322, + "learning_rate": 8.896307081403479e-06, + "loss": 0.3721, "step": 101465 }, { - "epoch": 3.57, - "learning_rate": 9.996308972005408e-06, - "loss": 0.2609, + "epoch": 3.6569719248927814, + "grad_norm": 0.2489020973443985, + "learning_rate": 8.894075107423678e-06, + "loss": 0.3765, "step": 101470 }, { - "epoch": 3.57, - "learning_rate": 9.99403040786852e-06, - "loss": 0.26, + "epoch": 3.6571521245540057, + "grad_norm": 0.19710054993629456, + "learning_rate": 8.891843352883978e-06, + "loss": 0.3843, "step": 101475 }, { - "epoch": 3.57, - "learning_rate": 9.991752038574267e-06, - "loss": 0.2382, + "epoch": 3.6573323242152305, + "grad_norm": 0.22611059248447418, + "learning_rate": 8.889611817814783e-06, + "loss": 0.3832, "step": 101480 }, { - "epoch": 3.57, - "learning_rate": 9.989473864152221e-06, - "loss": 0.2853, + "epoch": 3.657512523876455, + "grad_norm": 0.2789013683795929, + "learning_rate": 8.887380502246517e-06, + "loss": 0.3685, "step": 101485 }, { - "epoch": 3.57, - "learning_rate": 9.987195884631967e-06, - "loss": 0.2495, + "epoch": 3.6576927235376795, + "grad_norm": 0.17864690721035004, + "learning_rate": 8.885149406209573e-06, + "loss": 0.3661, "step": 101490 }, { - "epoch": 3.57, - "learning_rate": 9.984918100043072e-06, - "loss": 0.2519, + "epoch": 3.6578729231989042, + "grad_norm": 0.23747530579566956, + "learning_rate": 8.882918529734346e-06, + "loss": 0.4134, "step": 101495 }, { - "epoch": 3.57, - "learning_rate": 9.982640510415128e-06, - "loss": 0.2421, + "epoch": 3.658053122860129, + "grad_norm": 0.25165247917175293, + "learning_rate": 8.880687872851237e-06, + "loss": 0.3606, "step": 101500 }, { - "epoch": 3.57, - "eval_loss": 0.25163590908050537, - "eval_runtime": 10.5398, - "eval_samples_per_second": 9.488, - "eval_steps_per_second": 
9.488, + "epoch": 3.658053122860129, + "eval_loss": 0.4290784001350403, + "eval_runtime": 3.529, + "eval_samples_per_second": 28.337, + "eval_steps_per_second": 7.084, "step": 101500 }, { - "epoch": 3.57, - "learning_rate": 9.9803631157777e-06, - "loss": 0.2317, + "epoch": 3.6582333225213537, + "grad_norm": 0.21503256261348724, + "learning_rate": 8.878457435590626e-06, + "loss": 0.3689, "step": 101505 }, { - "epoch": 3.57, - "learning_rate": 9.978085916160359e-06, - "loss": 0.265, + "epoch": 3.6584135221825784, + "grad_norm": 0.1987353265285492, + "learning_rate": 8.876227217982916e-06, + "loss": 0.3373, "step": 101510 }, { - "epoch": 3.57, - "learning_rate": 9.975808911592662e-06, - "loss": 0.2652, + "epoch": 3.658593721843803, + "grad_norm": 0.29664599895477295, + "learning_rate": 8.87399722005849e-06, + "loss": 0.3837, "step": 101515 }, { - "epoch": 3.57, - "learning_rate": 9.973532102104197e-06, - "loss": 0.2534, + "epoch": 3.6587739215050274, + "grad_norm": 0.22414027154445648, + "learning_rate": 8.871767441847734e-06, + "loss": 0.3662, "step": 101520 }, { - "epoch": 3.57, - "learning_rate": 9.971255487724504e-06, - "loss": 0.2545, + "epoch": 3.658954121166252, + "grad_norm": 0.2331104874610901, + "learning_rate": 8.869537883381022e-06, + "loss": 0.3745, "step": 101525 }, { - "epoch": 3.57, - "learning_rate": 9.968979068483164e-06, - "loss": 0.2848, + "epoch": 3.659134320827477, + "grad_norm": 0.2712317407131195, + "learning_rate": 8.86730854468872e-06, + "loss": 0.3619, "step": 101530 }, { - "epoch": 3.57, - "learning_rate": 9.966702844409714e-06, - "loss": 0.2729, + "epoch": 3.659314520488701, + "grad_norm": 0.22348475456237793, + "learning_rate": 8.865079425801228e-06, + "loss": 0.3921, "step": 101535 }, { - "epoch": 3.57, - "learning_rate": 9.964426815533732e-06, - "loss": 0.2398, + "epoch": 3.659494720149926, + "grad_norm": 0.2932550311088562, + "learning_rate": 8.862850526748917e-06, + "loss": 0.3714, "step": 101540 }, { - "epoch": 3.57, - "learning_rate": 9.962150981884757e-06, - "loss": 0.2565, + "epoch": 3.6596749198111507, + "grad_norm": 0.26502054929733276, + "learning_rate": 8.860621847562123e-06, + "loss": 0.3736, "step": 101545 }, { - "epoch": 3.57, - "learning_rate": 9.959875343492334e-06, - "loss": 0.2747, + "epoch": 3.6598551194723754, + "grad_norm": 0.25092580914497375, + "learning_rate": 8.858393388271238e-06, + "loss": 0.3978, "step": 101550 }, { - "epoch": 3.57, - "learning_rate": 9.957599900386024e-06, - "loss": 0.2581, + "epoch": 3.6600353191336, + "grad_norm": 0.2842051684856415, + "learning_rate": 8.85616514890662e-06, + "loss": 0.3674, "step": 101555 }, { - "epoch": 3.57, - "learning_rate": 9.955324652595369e-06, - "loss": 0.2666, + "epoch": 3.660215518794825, + "grad_norm": 0.2661273777484894, + "learning_rate": 8.853937129498627e-06, + "loss": 0.3835, "step": 101560 }, { - "epoch": 3.57, - "learning_rate": 9.953049600149908e-06, - "loss": 0.2615, + "epoch": 3.660395718456049, + "grad_norm": 0.23841172456741333, + "learning_rate": 8.851709330077615e-06, + "loss": 0.3654, "step": 101565 }, { - "epoch": 3.57, - "learning_rate": 9.950774743079172e-06, - "loss": 0.2477, + "epoch": 3.660575918117274, + "grad_norm": 0.20587527751922607, + "learning_rate": 8.849481750673927e-06, + "loss": 0.383, "step": 101570 }, { - "epoch": 3.57, - "learning_rate": 9.948500081412717e-06, - "loss": 0.2734, + "epoch": 3.6607561177784986, + "grad_norm": 0.2173176258802414, + "learning_rate": 8.84725439131793e-06, + "loss": 0.4062, "step": 101575 }, { - "epoch": 3.57, - 
"learning_rate": 9.94622561518006e-06, - "loss": 0.2722, + "epoch": 3.6609363174397234, + "grad_norm": 0.201410710811615, + "learning_rate": 8.845027252039969e-06, + "loss": 0.3691, "step": 101580 }, { - "epoch": 3.57, - "learning_rate": 9.943951344410751e-06, - "loss": 0.248, + "epoch": 3.6611165171009477, + "grad_norm": 0.24154742062091827, + "learning_rate": 8.842800332870382e-06, + "loss": 0.3661, "step": 101585 }, { - "epoch": 3.57, - "learning_rate": 9.941677269134303e-06, - "loss": 0.259, + "epoch": 3.6612967167621724, + "grad_norm": 0.18455544114112854, + "learning_rate": 8.84057363383951e-06, + "loss": 0.3598, "step": 101590 }, { - "epoch": 3.57, - "learning_rate": 9.939403389380261e-06, - "loss": 0.2713, + "epoch": 3.661476916423397, + "grad_norm": 0.24699454009532928, + "learning_rate": 8.838347154977697e-06, + "loss": 0.3765, "step": 101595 }, { - "epoch": 3.57, - "learning_rate": 9.937129705178142e-06, - "loss": 0.2572, + "epoch": 3.661657116084622, + "grad_norm": 0.20563150942325592, + "learning_rate": 8.836120896315267e-06, + "loss": 0.3507, "step": 101600 }, { - "epoch": 3.57, - "learning_rate": 9.934856216557465e-06, - "loss": 0.2642, + "epoch": 3.6618373157458466, + "grad_norm": 0.21455486118793488, + "learning_rate": 8.833894857882566e-06, + "loss": 0.365, "step": 101605 }, { - "epoch": 3.57, - "learning_rate": 9.932582923547745e-06, - "loss": 0.2454, + "epoch": 3.662017515407071, + "grad_norm": 0.237900048494339, + "learning_rate": 8.83166903970992e-06, + "loss": 0.3663, "step": 101610 }, { - "epoch": 3.58, - "learning_rate": 9.930309826178513e-06, - "loss": 0.2612, + "epoch": 3.6621977150682956, + "grad_norm": 0.24725571274757385, + "learning_rate": 8.82944344182765e-06, + "loss": 0.382, "step": 101615 }, { - "epoch": 3.58, - "learning_rate": 9.928036924479276e-06, - "loss": 0.2666, + "epoch": 3.6623779147295203, + "grad_norm": 0.28484535217285156, + "learning_rate": 8.827218064266085e-06, + "loss": 0.4055, "step": 101620 }, { - "epoch": 3.58, - "learning_rate": 9.925764218479549e-06, - "loss": 0.2315, + "epoch": 3.662558114390745, + "grad_norm": 0.2544311583042145, + "learning_rate": 8.824992907055534e-06, + "loss": 0.3613, "step": 101625 }, { - "epoch": 3.58, - "learning_rate": 9.923491708208829e-06, - "loss": 0.2454, + "epoch": 3.6627383140519694, + "grad_norm": 0.21593233942985535, + "learning_rate": 8.822767970226332e-06, + "loss": 0.3756, "step": 101630 }, { - "epoch": 3.58, - "learning_rate": 9.921219393696634e-06, - "loss": 0.2555, + "epoch": 3.662918513713194, + "grad_norm": 0.2815852463245392, + "learning_rate": 8.820543253808783e-06, + "loss": 0.4064, "step": 101635 }, { - "epoch": 3.58, - "learning_rate": 9.918947274972476e-06, - "loss": 0.2696, + "epoch": 3.663098713374419, + "grad_norm": 0.299152135848999, + "learning_rate": 8.8183187578332e-06, + "loss": 0.4224, "step": 101640 }, { - "epoch": 3.58, - "learning_rate": 9.916675352065849e-06, - "loss": 0.2541, + "epoch": 3.6632789130356436, + "grad_norm": 0.24634763598442078, + "learning_rate": 8.81609448232989e-06, + "loss": 0.3834, "step": 101645 }, { - "epoch": 3.58, - "learning_rate": 9.914403625006245e-06, - "loss": 0.244, + "epoch": 3.6634591126968683, + "grad_norm": 0.22788767516613007, + "learning_rate": 8.813870427329155e-06, + "loss": 0.3795, "step": 101650 }, { - "epoch": 3.58, - "learning_rate": 9.912132093823173e-06, - "loss": 0.2263, + "epoch": 3.663639312358093, + "grad_norm": 0.26291418075561523, + "learning_rate": 8.811646592861314e-06, + "loss": 0.376, "step": 101655 }, { - "epoch": 3.58, 
- "learning_rate": 9.909860758546128e-06, - "loss": 0.2481, + "epoch": 3.6638195120193173, + "grad_norm": 0.191041499376297, + "learning_rate": 8.809422978956647e-06, + "loss": 0.3674, "step": 101660 }, { - "epoch": 3.58, - "learning_rate": 9.907589619204583e-06, - "loss": 0.2358, + "epoch": 3.663999711680542, + "grad_norm": 0.28010818362236023, + "learning_rate": 8.80719958564545e-06, + "loss": 0.4071, "step": 101665 }, { - "epoch": 3.58, - "learning_rate": 9.905318675828052e-06, - "loss": 0.2522, + "epoch": 3.664179911341767, + "grad_norm": 0.21684035658836365, + "learning_rate": 8.804976412958029e-06, + "loss": 0.3826, "step": 101670 }, { - "epoch": 3.58, - "learning_rate": 9.903047928446007e-06, - "loss": 0.2481, + "epoch": 3.664360111002991, + "grad_norm": 0.26138171553611755, + "learning_rate": 8.802753460924674e-06, + "loss": 0.3812, "step": 101675 }, { - "epoch": 3.58, - "learning_rate": 9.900777377087939e-06, - "loss": 0.2524, + "epoch": 3.664540310664216, + "grad_norm": 0.20288319885730743, + "learning_rate": 8.800530729575665e-06, + "loss": 0.3448, "step": 101680 }, { - "epoch": 3.58, - "learning_rate": 9.898507021783315e-06, - "loss": 0.2466, + "epoch": 3.6647205103254406, + "grad_norm": 0.2686285674571991, + "learning_rate": 8.798308218941287e-06, + "loss": 0.414, "step": 101685 }, { - "epoch": 3.58, - "learning_rate": 9.896236862561625e-06, - "loss": 0.2591, + "epoch": 3.6649007099866653, + "grad_norm": 0.26645994186401367, + "learning_rate": 8.796085929051814e-06, + "loss": 0.3848, "step": 101690 }, { - "epoch": 3.58, - "learning_rate": 9.893966899452356e-06, - "loss": 0.2576, + "epoch": 3.66508090964789, + "grad_norm": 0.22347326576709747, + "learning_rate": 8.793863859937543e-06, + "loss": 0.3875, "step": 101695 }, { - "epoch": 3.58, - "learning_rate": 9.89169713248497e-06, - "loss": 0.2258, + "epoch": 3.6652611093091148, + "grad_norm": 0.21688346564769745, + "learning_rate": 8.79164201162874e-06, + "loss": 0.3964, "step": 101700 }, { - "epoch": 3.58, - "learning_rate": 9.889427561688933e-06, - "loss": 0.2534, + "epoch": 3.665441308970339, + "grad_norm": 0.2277715802192688, + "learning_rate": 8.789420384155675e-06, + "loss": 0.388, "step": 101705 }, { - "epoch": 3.58, - "learning_rate": 9.88715818709373e-06, - "loss": 0.2556, + "epoch": 3.6656215086315638, + "grad_norm": 0.23789222538471222, + "learning_rate": 8.78719897754862e-06, + "loss": 0.3821, "step": 101710 }, { - "epoch": 3.58, - "learning_rate": 9.884889008728814e-06, - "loss": 0.239, + "epoch": 3.6658017082927885, + "grad_norm": 0.24796278774738312, + "learning_rate": 8.784977791837831e-06, + "loss": 0.3832, "step": 101715 }, { - "epoch": 3.58, - "learning_rate": 9.882620026623657e-06, - "loss": 0.2524, + "epoch": 3.665981907954013, + "grad_norm": 0.28839823603630066, + "learning_rate": 8.782756827053588e-06, + "loss": 0.3859, "step": 101720 }, { - "epoch": 3.58, - "learning_rate": 9.880351240807707e-06, - "loss": 0.2553, + "epoch": 3.6661621076152375, + "grad_norm": 0.2025182694196701, + "learning_rate": 8.780536083226154e-06, + "loss": 0.3639, "step": 101725 }, { - "epoch": 3.58, - "learning_rate": 9.878082651310439e-06, - "loss": 0.2408, + "epoch": 3.6663423072764623, + "grad_norm": 0.24186572432518005, + "learning_rate": 8.778315560385756e-06, + "loss": 0.3622, "step": 101730 }, { - "epoch": 3.58, - "learning_rate": 9.875814258161303e-06, - "loss": 0.2744, + "epoch": 3.666522506937687, + "grad_norm": 0.19818326830863953, + "learning_rate": 8.776095258562677e-06, + "loss": 0.4044, "step": 101735 }, { - 
"epoch": 3.58, - "learning_rate": 9.873546061389743e-06, - "loss": 0.2682, + "epoch": 3.6667027065989117, + "grad_norm": 0.2045317143201828, + "learning_rate": 8.773875177787161e-06, + "loss": 0.3493, "step": 101740 }, { - "epoch": 3.58, - "learning_rate": 9.871278061025227e-06, - "loss": 0.2772, + "epoch": 3.6668829062601365, + "grad_norm": 0.2028067409992218, + "learning_rate": 8.771655318089445e-06, + "loss": 0.3869, "step": 101745 }, { - "epoch": 3.58, - "learning_rate": 9.869010257097185e-06, - "loss": 0.2773, + "epoch": 3.6670631059213608, + "grad_norm": 0.24124419689178467, + "learning_rate": 8.769435679499798e-06, + "loss": 0.3822, "step": 101750 }, { - "epoch": 3.58, - "learning_rate": 9.866742649635085e-06, - "loss": 0.2547, + "epoch": 3.6672433055825855, + "grad_norm": 0.2072051465511322, + "learning_rate": 8.767216262048433e-06, + "loss": 0.3652, "step": 101755 }, { - "epoch": 3.58, - "learning_rate": 9.864475238668358e-06, - "loss": 0.2462, + "epoch": 3.6674235052438102, + "grad_norm": 0.22216108441352844, + "learning_rate": 8.764997065765612e-06, + "loss": 0.404, "step": 101760 }, { - "epoch": 3.58, - "learning_rate": 9.862208024226435e-06, - "loss": 0.2654, + "epoch": 3.6676037049050345, + "grad_norm": 0.23680134117603302, + "learning_rate": 8.762778090681562e-06, + "loss": 0.3964, "step": 101765 }, { - "epoch": 3.58, - "learning_rate": 9.859941006338773e-06, - "loss": 0.2604, + "epoch": 3.6677839045662592, + "grad_norm": 0.29067909717559814, + "learning_rate": 8.760559336826519e-06, + "loss": 0.3897, "step": 101770 }, { - "epoch": 3.58, - "learning_rate": 9.8576741850348e-06, - "loss": 0.258, + "epoch": 3.667964104227484, + "grad_norm": 0.244569793343544, + "learning_rate": 8.758340804230709e-06, + "loss": 0.3949, "step": 101775 }, { - "epoch": 3.58, - "learning_rate": 9.855407560343941e-06, - "loss": 0.2535, + "epoch": 3.6681443038887087, + "grad_norm": 0.22354869544506073, + "learning_rate": 8.75612249292436e-06, + "loss": 0.3871, "step": 101780 }, { - "epoch": 3.58, - "learning_rate": 9.853141132295643e-06, - "loss": 0.2421, + "epoch": 3.6683245035499334, + "grad_norm": 0.2619014084339142, + "learning_rate": 8.75390440293769e-06, + "loss": 0.4095, "step": 101785 }, { - "epoch": 3.58, - "learning_rate": 9.850874900919325e-06, - "loss": 0.2586, + "epoch": 3.668504703211158, + "grad_norm": 0.2473766952753067, + "learning_rate": 8.751686534300934e-06, + "loss": 0.37, "step": 101790 }, { - "epoch": 3.58, - "learning_rate": 9.848608866244403e-06, - "loss": 0.2664, + "epoch": 3.6686849028723825, + "grad_norm": 0.24038511514663696, + "learning_rate": 8.749468887044308e-06, + "loss": 0.3576, "step": 101795 }, { - "epoch": 3.58, - "learning_rate": 9.846343028300318e-06, - "loss": 0.2558, + "epoch": 3.668865102533607, + "grad_norm": 0.25563234090805054, + "learning_rate": 8.747251461198016e-06, + "loss": 0.4066, "step": 101800 }, { - "epoch": 3.58, - "learning_rate": 9.844077387116474e-06, - "loss": 0.2521, + "epoch": 3.669045302194832, + "grad_norm": 0.2780168354511261, + "learning_rate": 8.745034256792281e-06, + "loss": 0.3688, "step": 101805 }, { - "epoch": 3.58, - "learning_rate": 9.841811942722306e-06, - "loss": 0.2885, + "epoch": 3.6692255018560567, + "grad_norm": 0.2796255350112915, + "learning_rate": 8.742817273857295e-06, + "loss": 0.3828, "step": 101810 }, { - "epoch": 3.58, - "learning_rate": 9.83954669514722e-06, - "loss": 0.2562, + "epoch": 3.669405701517281, + "grad_norm": 0.2719496190547943, + "learning_rate": 8.740600512423289e-06, + "loss": 0.3553, "step": 101815 
}, { - "epoch": 3.58, - "learning_rate": 9.837281644420621e-06, - "loss": 0.2398, + "epoch": 3.6695859011785057, + "grad_norm": 0.22607314586639404, + "learning_rate": 8.73838397252046e-06, + "loss": 0.368, "step": 101820 }, { - "epoch": 3.58, - "learning_rate": 9.835016790571935e-06, - "loss": 0.2481, + "epoch": 3.6697661008397304, + "grad_norm": 0.20450633764266968, + "learning_rate": 8.73616765417898e-06, + "loss": 0.3893, "step": 101825 }, { - "epoch": 3.58, - "learning_rate": 9.832752133630563e-06, - "loss": 0.279, + "epoch": 3.669946300500955, + "grad_norm": 0.20725052058696747, + "learning_rate": 8.73395155742908e-06, + "loss": 0.3655, "step": 101830 }, { - "epoch": 3.58, - "learning_rate": 9.830487673625907e-06, - "loss": 0.2565, + "epoch": 3.67012650016218, + "grad_norm": 0.2661026418209076, + "learning_rate": 8.731735682300932e-06, + "loss": 0.3748, "step": 101835 }, { - "epoch": 3.58, - "learning_rate": 9.828223410587364e-06, - "loss": 0.2632, + "epoch": 3.670306699823404, + "grad_norm": 0.20922096073627472, + "learning_rate": 8.72952002882475e-06, + "loss": 0.3746, "step": 101840 }, { - "epoch": 3.58, - "learning_rate": 9.825959344544353e-06, - "loss": 0.2412, + "epoch": 3.670486899484629, + "grad_norm": 0.2124052345752716, + "learning_rate": 8.7273045970307e-06, + "loss": 0.3466, "step": 101845 }, { - "epoch": 3.58, - "learning_rate": 9.823695475526245e-06, - "loss": 0.2662, + "epoch": 3.6706670991458537, + "grad_norm": 0.24056388437747955, + "learning_rate": 8.725089386948967e-06, + "loss": 0.3639, "step": 101850 }, { - "epoch": 3.58, - "learning_rate": 9.821431803562465e-06, - "loss": 0.2596, + "epoch": 3.6708472988070784, + "grad_norm": 0.21469198167324066, + "learning_rate": 8.722874398609749e-06, + "loss": 0.4017, "step": 101855 }, { - "epoch": 3.58, - "learning_rate": 9.819168328682377e-06, - "loss": 0.2805, + "epoch": 3.6710274984683027, + "grad_norm": 0.23535849153995514, + "learning_rate": 8.720659632043207e-06, + "loss": 0.4005, "step": 101860 }, { - "epoch": 3.58, - "learning_rate": 9.816905050915392e-06, - "loss": 0.2598, + "epoch": 3.6712076981295274, + "grad_norm": 0.2353014349937439, + "learning_rate": 8.718445087279541e-06, + "loss": 0.341, "step": 101865 }, { - "epoch": 3.58, - "learning_rate": 9.814641970290891e-06, - "loss": 0.2627, + "epoch": 3.671387897790752, + "grad_norm": 0.24978400766849518, + "learning_rate": 8.716230764348901e-06, + "loss": 0.3621, "step": 101870 }, { - "epoch": 3.58, - "learning_rate": 9.812379086838255e-06, - "loss": 0.2407, + "epoch": 3.671568097451977, + "grad_norm": 0.20733198523521423, + "learning_rate": 8.714016663281457e-06, + "loss": 0.3481, "step": 101875 }, { - "epoch": 3.58, - "learning_rate": 9.810116400586861e-06, - "loss": 0.2531, + "epoch": 3.6717482971132016, + "grad_norm": 0.23899643123149872, + "learning_rate": 8.71180278410739e-06, + "loss": 0.3813, "step": 101880 }, { - "epoch": 3.58, - "learning_rate": 9.807853911566103e-06, - "loss": 0.2484, + "epoch": 3.671928496774426, + "grad_norm": 0.27464571595191956, + "learning_rate": 8.709589126856857e-06, + "loss": 0.4055, "step": 101885 }, { - "epoch": 3.58, - "learning_rate": 9.80559161980535e-06, - "loss": 0.2519, + "epoch": 3.6721086964356506, + "grad_norm": 0.27018100023269653, + "learning_rate": 8.707375691560018e-06, + "loss": 0.3866, "step": 101890 }, { - "epoch": 3.58, - "learning_rate": 9.803329525333977e-06, - "loss": 0.2567, + "epoch": 3.6722888960968754, + "grad_norm": 0.22291234135627747, + "learning_rate": 8.70516247824703e-06, + "loss": 0.3894, "step": 
101895 }, { - "epoch": 3.59, - "learning_rate": 9.801067628181343e-06, - "loss": 0.2459, + "epoch": 3.6724690957581, + "grad_norm": 0.24809099733829498, + "learning_rate": 8.702949486948042e-06, + "loss": 0.4137, "step": 101900 }, { - "epoch": 3.59, - "learning_rate": 9.798805928376839e-06, - "loss": 0.2837, + "epoch": 3.6726492954193244, + "grad_norm": 0.27931684255599976, + "learning_rate": 8.70073671769322e-06, + "loss": 0.3755, "step": 101905 }, { - "epoch": 3.59, - "learning_rate": 9.796544425949817e-06, - "loss": 0.2564, + "epoch": 3.672829495080549, + "grad_norm": 0.2072281837463379, + "learning_rate": 8.698524170512703e-06, + "loss": 0.4101, "step": 101910 }, { - "epoch": 3.59, - "learning_rate": 9.794283120929643e-06, - "loss": 0.2478, + "epoch": 3.673009694741774, + "grad_norm": 0.20641744136810303, + "learning_rate": 8.696311845436641e-06, + "loss": 0.3667, "step": 101915 }, { - "epoch": 3.59, - "learning_rate": 9.792022013345692e-06, - "loss": 0.2534, + "epoch": 3.6731898944029986, + "grad_norm": 0.28100159764289856, + "learning_rate": 8.69409974249517e-06, + "loss": 0.3929, "step": 101920 }, { - "epoch": 3.59, - "learning_rate": 9.78976110322731e-06, - "loss": 0.2457, + "epoch": 3.6733700940642233, + "grad_norm": 0.2308221310377121, + "learning_rate": 8.691887861718437e-06, + "loss": 0.3521, "step": 101925 }, { - "epoch": 3.59, - "learning_rate": 9.78750039060386e-06, - "loss": 0.2703, + "epoch": 3.673550293725448, + "grad_norm": 0.23538222908973694, + "learning_rate": 8.689676203136565e-06, + "loss": 0.3795, "step": 101930 }, { - "epoch": 3.59, - "learning_rate": 9.785239875504682e-06, - "loss": 0.2379, + "epoch": 3.6737304933866723, + "grad_norm": 0.25849369168281555, + "learning_rate": 8.687464766779712e-06, + "loss": 0.3653, "step": 101935 }, { - "epoch": 3.59, - "learning_rate": 9.782979557959142e-06, - "loss": 0.2682, + "epoch": 3.673910693047897, + "grad_norm": 0.19717663526535034, + "learning_rate": 8.685253552677978e-06, + "loss": 0.3281, "step": 101940 }, { - "epoch": 3.59, - "learning_rate": 9.780719437996588e-06, - "loss": 0.2586, + "epoch": 3.674090892709122, + "grad_norm": 0.21411502361297607, + "learning_rate": 8.683042560861512e-06, + "loss": 0.367, "step": 101945 }, { - "epoch": 3.59, - "learning_rate": 9.778459515646363e-06, - "loss": 0.2426, + "epoch": 3.674271092370346, + "grad_norm": 0.23891699314117432, + "learning_rate": 8.680831791360433e-06, + "loss": 0.3794, "step": 101950 }, { - "epoch": 3.59, - "learning_rate": 9.776199790937798e-06, - "loss": 0.2492, + "epoch": 3.674451292031571, + "grad_norm": 0.2534111440181732, + "learning_rate": 8.678621244204863e-06, + "loss": 0.4035, "step": 101955 }, { - "epoch": 3.59, - "learning_rate": 9.773940263900258e-06, - "loss": 0.2677, + "epoch": 3.6746314916927956, + "grad_norm": 0.21654155850410461, + "learning_rate": 8.676410919424917e-06, + "loss": 0.3668, "step": 101960 }, { - "epoch": 3.59, - "learning_rate": 9.77168093456306e-06, - "loss": 0.2639, + "epoch": 3.6748116913540203, + "grad_norm": 0.2257722020149231, + "learning_rate": 8.674200817050712e-06, + "loss": 0.3829, "step": 101965 }, { - "epoch": 3.59, - "learning_rate": 9.769421802955556e-06, - "loss": 0.2521, + "epoch": 3.674991891015245, + "grad_norm": 0.20194071531295776, + "learning_rate": 8.671990937112354e-06, + "loss": 0.3617, "step": 101970 }, { - "epoch": 3.59, - "learning_rate": 9.767162869107063e-06, - "loss": 0.2546, + "epoch": 3.6751720906764698, + "grad_norm": 0.2552418112754822, + "learning_rate": 8.669781279639963e-06, + "loss": 0.3767, 
"step": 101975 }, { - "epoch": 3.59, - "learning_rate": 9.764904133046934e-06, - "loss": 0.2585, + "epoch": 3.675352290337694, + "grad_norm": 0.1989719569683075, + "learning_rate": 8.667571844663644e-06, + "loss": 0.3989, "step": 101980 }, { - "epoch": 3.59, - "learning_rate": 9.762645594804481e-06, - "loss": 0.2574, + "epoch": 3.675532489998919, + "grad_norm": 0.2214037925004959, + "learning_rate": 8.665362632213494e-06, + "loss": 0.3783, "step": 101985 }, { - "epoch": 3.59, - "learning_rate": 9.760387254409034e-06, - "loss": 0.2536, + "epoch": 3.6757126896601435, + "grad_norm": 0.23372018337249756, + "learning_rate": 8.663153642319616e-06, + "loss": 0.3871, "step": 101990 }, { - "epoch": 3.59, - "learning_rate": 9.758129111889908e-06, - "loss": 0.2695, + "epoch": 3.675892889321368, + "grad_norm": 0.20797449350357056, + "learning_rate": 8.6609448750121e-06, + "loss": 0.3736, "step": 101995 }, { - "epoch": 3.59, - "learning_rate": 9.75587116727644e-06, - "loss": 0.2769, + "epoch": 3.6760730889825926, + "grad_norm": 0.2792084515094757, + "learning_rate": 8.658736330321051e-06, + "loss": 0.3913, "step": 102000 }, { - "epoch": 3.59, - "eval_loss": 0.25186842679977417, - "eval_runtime": 10.5576, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 3.6760730889825926, + "eval_loss": 0.4288567304611206, + "eval_runtime": 3.5476, + "eval_samples_per_second": 28.188, + "eval_steps_per_second": 7.047, "step": 102000 }, { - "epoch": 3.59, - "learning_rate": 9.753613420597935e-06, - "loss": 0.2926, + "epoch": 3.6762532886438173, + "grad_norm": 0.2272174209356308, + "learning_rate": 8.656528008276568e-06, + "loss": 0.3661, "step": 102005 }, { - "epoch": 3.59, - "learning_rate": 9.751355871883713e-06, - "loss": 0.2503, + "epoch": 3.676433488305042, + "grad_norm": 0.21868175268173218, + "learning_rate": 8.654319908908709e-06, + "loss": 0.3666, "step": 102010 }, { - "epoch": 3.59, - "learning_rate": 9.74909852116308e-06, - "loss": 0.2665, + "epoch": 3.6766136879662668, + "grad_norm": 0.23744815587997437, + "learning_rate": 8.652112032247587e-06, + "loss": 0.369, "step": 102015 }, { - "epoch": 3.59, - "learning_rate": 9.746841368465351e-06, - "loss": 0.2326, + "epoch": 3.6767938876274915, + "grad_norm": 0.26380473375320435, + "learning_rate": 8.649904378323262e-06, + "loss": 0.3666, "step": 102020 }, { - "epoch": 3.59, - "learning_rate": 9.74458441381984e-06, - "loss": 0.2563, + "epoch": 3.6769740872887158, + "grad_norm": 0.2755199372768402, + "learning_rate": 8.647696947165834e-06, + "loss": 0.3548, "step": 102025 }, { - "epoch": 3.59, - "learning_rate": 9.742327657255843e-06, - "loss": 0.248, + "epoch": 3.6771542869499405, + "grad_norm": 0.2914251685142517, + "learning_rate": 8.645489738805376e-06, + "loss": 0.3747, "step": 102030 }, { - "epoch": 3.59, - "learning_rate": 9.740071098802672e-06, - "loss": 0.2442, + "epoch": 3.6773344866111652, + "grad_norm": 0.18029101192951202, + "learning_rate": 8.64328275327194e-06, + "loss": 0.3884, "step": 102035 }, { - "epoch": 3.59, - "learning_rate": 9.73781473848962e-06, - "loss": 0.2555, + "epoch": 3.6775146862723895, + "grad_norm": 0.22522252798080444, + "learning_rate": 8.641075990595615e-06, + "loss": 0.3904, "step": 102040 }, { - "epoch": 3.59, - "learning_rate": 9.735558576345988e-06, - "loss": 0.2531, + "epoch": 3.6776948859336143, + "grad_norm": 0.2504473924636841, + "learning_rate": 8.638869450806455e-06, + "loss": 0.3892, "step": 102045 }, { - "epoch": 3.59, - "learning_rate": 9.733302612401058e-06, - "loss": 0.2493, + 
"epoch": 3.677875085594839, + "grad_norm": 0.28689128160476685, + "learning_rate": 8.63666313393455e-06, + "loss": 0.3805, "step": 102050 }, { - "epoch": 3.59, - "learning_rate": 9.731046846684142e-06, - "loss": 0.2575, + "epoch": 3.6780552852560637, + "grad_norm": 0.3135242164134979, + "learning_rate": 8.634457040009932e-06, + "loss": 0.373, "step": 102055 }, { - "epoch": 3.59, - "learning_rate": 9.728791279224519e-06, - "loss": 0.2549, + "epoch": 3.6782354849172885, + "grad_norm": 0.2535361051559448, + "learning_rate": 8.63225116906266e-06, + "loss": 0.3851, "step": 102060 }, { - "epoch": 3.59, - "learning_rate": 9.726535910051477e-06, - "loss": 0.2429, + "epoch": 3.678415684578513, + "grad_norm": 0.27366381883621216, + "learning_rate": 8.630045521122806e-06, + "loss": 0.3767, "step": 102065 }, { - "epoch": 3.59, - "learning_rate": 9.724280739194294e-06, - "loss": 0.2302, + "epoch": 3.6785958842397375, + "grad_norm": 0.2666814923286438, + "learning_rate": 8.62784009622041e-06, + "loss": 0.3867, "step": 102070 }, { - "epoch": 3.59, - "learning_rate": 9.722025766682254e-06, - "loss": 0.2686, + "epoch": 3.6787760839009622, + "grad_norm": 0.27532005310058594, + "learning_rate": 8.625634894385525e-06, + "loss": 0.3659, "step": 102075 }, { - "epoch": 3.59, - "learning_rate": 9.719770992544655e-06, - "loss": 0.2265, + "epoch": 3.678956283562187, + "grad_norm": 0.2169187366962433, + "learning_rate": 8.623429915648195e-06, + "loss": 0.3966, "step": 102080 }, { - "epoch": 3.59, - "learning_rate": 9.717516416810756e-06, - "loss": 0.2452, + "epoch": 3.6791364832234117, + "grad_norm": 0.20126007497310638, + "learning_rate": 8.62122516003845e-06, + "loss": 0.3569, "step": 102085 }, { - "epoch": 3.59, - "learning_rate": 9.715262039509829e-06, - "loss": 0.2526, + "epoch": 3.679316682884636, + "grad_norm": 0.22194866836071014, + "learning_rate": 8.619020627586355e-06, + "loss": 0.3845, "step": 102090 }, { - "epoch": 3.59, - "learning_rate": 9.713007860671158e-06, - "loss": 0.2746, + "epoch": 3.6794968825458607, + "grad_norm": 0.2033412605524063, + "learning_rate": 8.61681631832193e-06, + "loss": 0.3834, "step": 102095 }, { - "epoch": 3.59, - "learning_rate": 9.710753880324005e-06, - "loss": 0.2545, + "epoch": 3.6796770822070854, + "grad_norm": 0.23441757261753082, + "learning_rate": 8.61461223227521e-06, + "loss": 0.4229, "step": 102100 }, { - "epoch": 3.59, - "learning_rate": 9.708500098497638e-06, - "loss": 0.2662, + "epoch": 3.67985728186831, + "grad_norm": 0.2877858877182007, + "learning_rate": 8.612408369476225e-06, + "loss": 0.3905, "step": 102105 }, { - "epoch": 3.59, - "learning_rate": 9.706246515221312e-06, - "loss": 0.2388, + "epoch": 3.680037481529535, + "grad_norm": 0.2478102594614029, + "learning_rate": 8.610204729955005e-06, + "loss": 0.3757, "step": 102110 }, { - "epoch": 3.59, - "learning_rate": 9.703993130524302e-06, - "loss": 0.2357, + "epoch": 3.680217681190759, + "grad_norm": 0.2399631142616272, + "learning_rate": 8.608001313741562e-06, + "loss": 0.3803, "step": 102115 }, { - "epoch": 3.59, - "learning_rate": 9.701739944435864e-06, - "loss": 0.2314, + "epoch": 3.680397880851984, + "grad_norm": 0.2504538595676422, + "learning_rate": 8.605798120865946e-06, + "loss": 0.381, "step": 102120 }, { - "epoch": 3.59, - "learning_rate": 9.69948695698524e-06, - "loss": 0.249, + "epoch": 3.6805780805132087, + "grad_norm": 0.21540558338165283, + "learning_rate": 8.60359515135814e-06, + "loss": 0.4004, "step": 102125 }, { - "epoch": 3.59, - "learning_rate": 9.697234168201696e-06, - "loss": 0.262, + 
"epoch": 3.6807582801744334, + "grad_norm": 0.28474679589271545, + "learning_rate": 8.601392405248186e-06, + "loss": 0.4054, "step": 102130 }, { - "epoch": 3.59, - "learning_rate": 9.69498157811449e-06, - "loss": 0.2401, + "epoch": 3.6809384798356577, + "grad_norm": 0.24228844046592712, + "learning_rate": 8.59918988256608e-06, + "loss": 0.4117, "step": 102135 }, { - "epoch": 3.59, - "learning_rate": 9.69272918675286e-06, - "loss": 0.2485, + "epoch": 3.6811186794968824, + "grad_norm": 0.2137162685394287, + "learning_rate": 8.596987583341842e-06, + "loss": 0.3891, "step": 102140 }, { - "epoch": 3.59, - "learning_rate": 9.690476994146044e-06, - "loss": 0.2577, + "epoch": 3.681298879158107, + "grad_norm": 0.22238872945308685, + "learning_rate": 8.594785507605468e-06, + "loss": 0.3851, "step": 102145 }, { - "epoch": 3.59, - "learning_rate": 9.688225000323306e-06, - "loss": 0.2478, + "epoch": 3.681479078819332, + "grad_norm": 0.21729840338230133, + "learning_rate": 8.592583655386969e-06, + "loss": 0.3649, "step": 102150 }, { - "epoch": 3.59, - "learning_rate": 9.685973205313876e-06, - "loss": 0.2574, + "epoch": 3.6816592784805566, + "grad_norm": 0.205953449010849, + "learning_rate": 8.590382026716331e-06, + "loss": 0.3353, "step": 102155 }, { - "epoch": 3.59, - "learning_rate": 9.683721609146989e-06, - "loss": 0.255, + "epoch": 3.6818394781417814, + "grad_norm": 0.2748815715312958, + "learning_rate": 8.58818062162357e-06, + "loss": 0.425, "step": 102160 }, { - "epoch": 3.59, - "learning_rate": 9.681470211851878e-06, - "loss": 0.2443, + "epoch": 3.6820196778030057, + "grad_norm": 0.29391714930534363, + "learning_rate": 8.58597944013867e-06, + "loss": 0.4082, "step": 102165 }, { - "epoch": 3.59, - "learning_rate": 9.679219013457789e-06, - "loss": 0.2414, + "epoch": 3.6821998774642304, + "grad_norm": 0.24622267484664917, + "learning_rate": 8.583778482291621e-06, + "loss": 0.3797, "step": 102170 }, { - "epoch": 3.59, - "learning_rate": 9.676968013993943e-06, - "loss": 0.2342, + "epoch": 3.682380077125455, + "grad_norm": 0.27090001106262207, + "learning_rate": 8.581577748112416e-06, + "loss": 0.37, "step": 102175 }, { - "epoch": 3.59, - "learning_rate": 9.67471721348956e-06, - "loss": 0.2552, + "epoch": 3.6825602767866794, + "grad_norm": 0.24336892366409302, + "learning_rate": 8.579377237631022e-06, + "loss": 0.3865, "step": 102180 }, { - "epoch": 3.6, - "learning_rate": 9.672466611973882e-06, - "loss": 0.2323, + "epoch": 3.682740476447904, + "grad_norm": 0.18202580511569977, + "learning_rate": 8.577176950877444e-06, + "loss": 0.3838, "step": 102185 }, { - "epoch": 3.6, - "learning_rate": 9.670216209476118e-06, - "loss": 0.2672, + "epoch": 3.682920676109129, + "grad_norm": 0.25919443368911743, + "learning_rate": 8.574976887881653e-06, + "loss": 0.379, "step": 102190 }, { - "epoch": 3.6, - "learning_rate": 9.6679660060255e-06, - "loss": 0.259, + "epoch": 3.6831008757703536, + "grad_norm": 0.2314002364873886, + "learning_rate": 8.572777048673619e-06, + "loss": 0.3772, "step": 102195 }, { - "epoch": 3.6, - "learning_rate": 9.66571600165124e-06, - "loss": 0.2511, + "epoch": 3.6832810754315783, + "grad_norm": 0.22203220427036285, + "learning_rate": 8.57057743328332e-06, + "loss": 0.3834, "step": 102200 }, { - "epoch": 3.6, - "learning_rate": 9.663466196382544e-06, - "loss": 0.2588, + "epoch": 3.683461275092803, + "grad_norm": 0.24144263565540314, + "learning_rate": 8.568378041740712e-06, + "loss": 0.3648, "step": 102205 }, { - "epoch": 3.6, - "learning_rate": 9.66121659024864e-06, - "loss": 0.2726, + 
"epoch": 3.6836414747540274, + "grad_norm": 0.22700710594654083, + "learning_rate": 8.566178874075781e-06, + "loss": 0.4007, "step": 102210 }, { - "epoch": 3.6, - "learning_rate": 9.658967183278731e-06, - "loss": 0.2671, + "epoch": 3.683821674415252, + "grad_norm": 0.22427678108215332, + "learning_rate": 8.563979930318489e-06, + "loss": 0.3775, "step": 102215 }, { - "epoch": 3.6, - "learning_rate": 9.656717975502023e-06, - "loss": 0.2524, + "epoch": 3.684001874076477, + "grad_norm": 0.2833479344844818, + "learning_rate": 8.561781210498773e-06, + "loss": 0.3957, "step": 102220 }, { - "epoch": 3.6, - "learning_rate": 9.654468966947711e-06, - "loss": 0.2584, + "epoch": 3.684182073737701, + "grad_norm": 0.25942087173461914, + "learning_rate": 8.559582714646613e-06, + "loss": 0.3475, "step": 102225 }, { - "epoch": 3.6, - "learning_rate": 9.652220157645014e-06, - "loss": 0.2484, + "epoch": 3.684362273398926, + "grad_norm": 0.2398940771818161, + "learning_rate": 8.557384442791947e-06, + "loss": 0.3867, "step": 102230 }, { - "epoch": 3.6, - "learning_rate": 9.649971547623119e-06, - "loss": 0.2767, + "epoch": 3.6845424730601506, + "grad_norm": 0.25691062211990356, + "learning_rate": 8.55518639496475e-06, + "loss": 0.4184, "step": 102235 }, { - "epoch": 3.6, - "learning_rate": 9.647723136911233e-06, - "loss": 0.2603, + "epoch": 3.6847226727213753, + "grad_norm": 0.3172163665294647, + "learning_rate": 8.552988571194948e-06, + "loss": 0.4036, "step": 102240 }, { - "epoch": 3.6, - "learning_rate": 9.645474925538536e-06, - "loss": 0.2558, + "epoch": 3.6849028723826, + "grad_norm": 0.2886956036090851, + "learning_rate": 8.550790971512484e-06, + "loss": 0.3484, "step": 102245 }, { - "epoch": 3.6, - "learning_rate": 9.643226913534236e-06, - "loss": 0.2626, + "epoch": 3.685083072043825, + "grad_norm": 0.26184308528900146, + "learning_rate": 8.548593595947315e-06, + "loss": 0.3872, "step": 102250 }, { - "epoch": 3.6, - "learning_rate": 9.640979100927517e-06, - "loss": 0.267, + "epoch": 3.685263271705049, + "grad_norm": 0.21521615982055664, + "learning_rate": 8.546396444529375e-06, + "loss": 0.3831, "step": 102255 }, { - "epoch": 3.6, - "learning_rate": 9.63873148774755e-06, - "loss": 0.2714, + "epoch": 3.685443471366274, + "grad_norm": 0.2815273404121399, + "learning_rate": 8.544199517288599e-06, + "loss": 0.3915, "step": 102260 }, { - "epoch": 3.6, - "learning_rate": 9.636484074023542e-06, - "loss": 0.2546, + "epoch": 3.6856236710274985, + "grad_norm": 0.24685360491275787, + "learning_rate": 8.542002814254918e-06, + "loss": 0.3755, "step": 102265 }, { - "epoch": 3.6, - "learning_rate": 9.634236859784663e-06, - "loss": 0.2572, + "epoch": 3.685803870688723, + "grad_norm": 0.21425403654575348, + "learning_rate": 8.539806335458253e-06, + "loss": 0.3594, "step": 102270 }, { - "epoch": 3.6, - "learning_rate": 9.631989845060094e-06, - "loss": 0.2706, + "epoch": 3.6859840703499476, + "grad_norm": 0.20560602843761444, + "learning_rate": 8.537610080928548e-06, + "loss": 0.3628, "step": 102275 }, { - "epoch": 3.6, - "learning_rate": 9.629743029878995e-06, - "loss": 0.2668, + "epoch": 3.6861642700111723, + "grad_norm": 0.2760855555534363, + "learning_rate": 8.53541405069572e-06, + "loss": 0.4007, "step": 102280 }, { - "epoch": 3.6, - "learning_rate": 9.627496414270568e-06, - "loss": 0.2564, + "epoch": 3.686344469672397, + "grad_norm": 0.24321669340133667, + "learning_rate": 8.533218244789683e-06, + "loss": 0.3586, "step": 102285 }, { - "epoch": 3.6, - "learning_rate": 9.625249998263958e-06, - "loss": 0.245, + "epoch": 
3.6865246693336218, + "grad_norm": 0.23994041979312897, + "learning_rate": 8.531022663240366e-06, + "loss": 0.3742, "step": 102290 }, { - "epoch": 3.6, - "learning_rate": 9.62300378188835e-06, - "loss": 0.2856, + "epoch": 3.6867048689948465, + "grad_norm": 0.23098506033420563, + "learning_rate": 8.528827306077672e-06, + "loss": 0.3835, "step": 102295 }, { - "epoch": 3.6, - "learning_rate": 9.6207577651729e-06, - "loss": 0.2775, + "epoch": 3.686885068656071, + "grad_norm": 0.21299470961093903, + "learning_rate": 8.526632173331511e-06, + "loss": 0.3864, "step": 102300 }, { - "epoch": 3.6, - "learning_rate": 9.618511948146783e-06, - "loss": 0.281, + "epoch": 3.6870652683172955, + "grad_norm": 0.20803943276405334, + "learning_rate": 8.524437265031815e-06, + "loss": 0.4014, "step": 102305 }, { - "epoch": 3.6, - "learning_rate": 9.61626633083915e-06, - "loss": 0.2514, + "epoch": 3.6872454679785203, + "grad_norm": 0.26612281799316406, + "learning_rate": 8.522242581208451e-06, + "loss": 0.3805, "step": 102310 }, { - "epoch": 3.6, - "learning_rate": 9.614020913279161e-06, - "loss": 0.2598, + "epoch": 3.687425667639745, + "grad_norm": 0.26680296659469604, + "learning_rate": 8.520048121891352e-06, + "loss": 0.3989, "step": 102315 }, { - "epoch": 3.6, - "learning_rate": 9.611775695495961e-06, - "loss": 0.2906, + "epoch": 3.6876058673009693, + "grad_norm": 0.26344892382621765, + "learning_rate": 8.517853887110408e-06, + "loss": 0.3972, "step": 102320 }, { - "epoch": 3.6, - "learning_rate": 9.609530677518724e-06, - "loss": 0.2555, + "epoch": 3.687786066962194, + "grad_norm": 0.2039593607187271, + "learning_rate": 8.515659876895504e-06, + "loss": 0.3739, "step": 102325 }, { - "epoch": 3.6, - "learning_rate": 9.607285859376587e-06, - "loss": 0.2633, + "epoch": 3.6879662666234188, + "grad_norm": 0.24437233805656433, + "learning_rate": 8.51346609127656e-06, + "loss": 0.3616, "step": 102330 }, { - "epoch": 3.6, - "learning_rate": 9.605041241098697e-06, - "loss": 0.2663, + "epoch": 3.6881464662846435, + "grad_norm": 0.23033076524734497, + "learning_rate": 8.51127253028344e-06, + "loss": 0.4245, "step": 102335 }, { - "epoch": 3.6, - "learning_rate": 9.602796822714194e-06, - "loss": 0.2656, + "epoch": 3.688326665945868, + "grad_norm": 0.2611679136753082, + "learning_rate": 8.509079193946032e-06, + "loss": 0.3759, "step": 102340 }, { - "epoch": 3.6, - "learning_rate": 9.600552604252237e-06, - "loss": 0.257, + "epoch": 3.6885068656070925, + "grad_norm": 0.19574002921581268, + "learning_rate": 8.506886082294233e-06, + "loss": 0.3695, "step": 102345 }, { - "epoch": 3.6, - "learning_rate": 9.598308585741944e-06, - "loss": 0.2543, + "epoch": 3.6886870652683172, + "grad_norm": 0.2866314947605133, + "learning_rate": 8.504693195357921e-06, + "loss": 0.3685, "step": 102350 }, { - "epoch": 3.6, - "learning_rate": 9.596064767212477e-06, - "loss": 0.2677, + "epoch": 3.688867264929542, + "grad_norm": 0.23495450615882874, + "learning_rate": 8.502500533166968e-06, + "loss": 0.4131, "step": 102355 }, { - "epoch": 3.6, - "learning_rate": 9.593821148692945e-06, - "loss": 0.2726, + "epoch": 3.6890474645907667, + "grad_norm": 0.2283242791891098, + "learning_rate": 8.500308095751253e-06, + "loss": 0.3596, "step": 102360 }, { - "epoch": 3.6, - "learning_rate": 9.591577730212503e-06, - "loss": 0.2637, + "epoch": 3.689227664251991, + "grad_norm": 0.3276098370552063, + "learning_rate": 8.498115883140637e-06, + "loss": 0.3671, "step": 102365 }, { - "epoch": 3.6, - "learning_rate": 9.58933451180027e-06, - "loss": 0.2391, + "epoch": 
3.6894078639132157, + "grad_norm": 0.2699391543865204, + "learning_rate": 8.495923895365004e-06, + "loss": 0.3776, "step": 102370 }, { - "epoch": 3.6, - "learning_rate": 9.58709149348536e-06, - "loss": 0.2445, + "epoch": 3.6895880635744405, + "grad_norm": 0.2524622082710266, + "learning_rate": 8.493732132454215e-06, + "loss": 0.3558, "step": 102375 }, { - "epoch": 3.6, - "learning_rate": 9.58484867529692e-06, - "loss": 0.2422, + "epoch": 3.689768263235665, + "grad_norm": 0.26742005348205566, + "learning_rate": 8.49154059443813e-06, + "loss": 0.3457, "step": 102380 }, { - "epoch": 3.6, - "learning_rate": 9.582606057264062e-06, - "loss": 0.2746, + "epoch": 3.68994846289689, + "grad_norm": 0.2498975545167923, + "learning_rate": 8.489349281346606e-06, + "loss": 0.3697, "step": 102385 }, { - "epoch": 3.6, - "learning_rate": 9.580363639415905e-06, - "loss": 0.2527, + "epoch": 3.6901286625581142, + "grad_norm": 0.20344579219818115, + "learning_rate": 8.487158193209497e-06, + "loss": 0.3884, "step": 102390 }, { - "epoch": 3.6, - "learning_rate": 9.578121421781552e-06, - "loss": 0.2454, + "epoch": 3.690308862219339, + "grad_norm": 0.2646947205066681, + "learning_rate": 8.484967330056665e-06, + "loss": 0.3891, "step": 102395 }, { - "epoch": 3.6, - "learning_rate": 9.57587940439014e-06, - "loss": 0.2562, + "epoch": 3.6904890618805637, + "grad_norm": 0.23882755637168884, + "learning_rate": 8.482776691917966e-06, + "loss": 0.3553, "step": 102400 }, { - "epoch": 3.6, - "learning_rate": 9.573637587270759e-06, - "loss": 0.2745, + "epoch": 3.6906692615417884, + "grad_norm": 0.20374254882335663, + "learning_rate": 8.480586278823219e-06, + "loss": 0.3649, "step": 102405 }, { - "epoch": 3.6, - "learning_rate": 9.571395970452537e-06, - "loss": 0.2321, + "epoch": 3.6908494612030127, + "grad_norm": 0.18348126113414764, + "learning_rate": 8.478396090802294e-06, + "loss": 0.338, "step": 102410 }, { - "epoch": 3.6, - "learning_rate": 9.569154553964563e-06, - "loss": 0.2629, + "epoch": 3.6910296608642374, + "grad_norm": 0.22053396701812744, + "learning_rate": 8.476206127885026e-06, + "loss": 0.3812, "step": 102415 }, { - "epoch": 3.6, - "learning_rate": 9.566913337835956e-06, - "loss": 0.2616, + "epoch": 3.691209860525462, + "grad_norm": 0.19936566054821014, + "learning_rate": 8.474016390101247e-06, + "loss": 0.3908, "step": 102420 }, { - "epoch": 3.6, - "learning_rate": 9.564672322095808e-06, - "loss": 0.2612, + "epoch": 3.691390060186687, + "grad_norm": 0.20026947557926178, + "learning_rate": 8.471826877480795e-06, + "loss": 0.3673, "step": 102425 }, { - "epoch": 3.6, - "learning_rate": 9.562431506773217e-06, - "loss": 0.2632, + "epoch": 3.6915702598479117, + "grad_norm": 0.24228374660015106, + "learning_rate": 8.469637590053497e-06, + "loss": 0.3657, "step": 102430 }, { - "epoch": 3.6, - "learning_rate": 9.560190891897272e-06, - "loss": 0.2614, + "epoch": 3.6917504595091364, + "grad_norm": 0.2570672035217285, + "learning_rate": 8.467448527849192e-06, + "loss": 0.344, "step": 102435 }, { - "epoch": 3.6, - "learning_rate": 9.55795047749708e-06, - "loss": 0.244, + "epoch": 3.6919306591703607, + "grad_norm": 0.255535364151001, + "learning_rate": 8.4652596908977e-06, + "loss": 0.377, "step": 102440 }, { - "epoch": 3.6, - "learning_rate": 9.555710263601728e-06, - "loss": 0.2503, + "epoch": 3.6921108588315854, + "grad_norm": 0.2862887680530548, + "learning_rate": 8.463071079228846e-06, + "loss": 0.3874, "step": 102445 }, { - "epoch": 3.6, - "learning_rate": 9.553470250240296e-06, - "loss": 0.2715, + "epoch": 
3.69229105849281, + "grad_norm": 0.2560741901397705, + "learning_rate": 8.460882692872446e-06, + "loss": 0.3996, "step": 102450 }, { - "epoch": 3.6, - "learning_rate": 9.551230437441866e-06, - "loss": 0.2537, + "epoch": 3.6924712581540344, + "grad_norm": 0.1878066509962082, + "learning_rate": 8.458694531858308e-06, + "loss": 0.3576, "step": 102455 }, { - "epoch": 3.6, - "learning_rate": 9.548990825235527e-06, - "loss": 0.2516, + "epoch": 3.692651457815259, + "grad_norm": 0.2039957493543625, + "learning_rate": 8.456506596216262e-06, + "loss": 0.3603, "step": 102460 }, { - "epoch": 3.61, - "learning_rate": 9.546751413650368e-06, - "loss": 0.25, + "epoch": 3.692831657476484, + "grad_norm": 0.2823259234428406, + "learning_rate": 8.454318885976113e-06, + "loss": 0.3855, "step": 102465 }, { - "epoch": 3.61, - "learning_rate": 9.544512202715458e-06, - "loss": 0.2575, + "epoch": 3.6930118571377086, + "grad_norm": 0.20816364884376526, + "learning_rate": 8.452131401167665e-06, + "loss": 0.3641, "step": 102470 }, { - "epoch": 3.61, - "learning_rate": 9.542273192459863e-06, - "loss": 0.2565, + "epoch": 3.6931920567989334, + "grad_norm": 0.248849555850029, + "learning_rate": 8.449944141820723e-06, + "loss": 0.3656, "step": 102475 }, { - "epoch": 3.61, - "learning_rate": 9.540034382912673e-06, - "loss": 0.2556, + "epoch": 3.693372256460158, + "grad_norm": 0.21117424964904785, + "learning_rate": 8.447757107965088e-06, + "loss": 0.3622, "step": 102480 }, { - "epoch": 3.61, - "learning_rate": 9.537795774102948e-06, - "loss": 0.2792, + "epoch": 3.6935524561213824, + "grad_norm": 0.21693001687526703, + "learning_rate": 8.445570299630548e-06, + "loss": 0.3607, "step": 102485 }, { - "epoch": 3.61, - "learning_rate": 9.535557366059747e-06, - "loss": 0.236, + "epoch": 3.693732655782607, + "grad_norm": 0.24611179530620575, + "learning_rate": 8.443383716846917e-06, + "loss": 0.3802, "step": 102490 }, { - "epoch": 3.61, - "learning_rate": 9.533319158812148e-06, - "loss": 0.2493, + "epoch": 3.693912855443832, + "grad_norm": 0.199144646525383, + "learning_rate": 8.441197359643977e-06, + "loss": 0.3146, "step": 102495 }, { - "epoch": 3.61, - "learning_rate": 9.53108115238921e-06, - "loss": 0.2487, + "epoch": 3.694093055105056, + "grad_norm": 0.2069302648305893, + "learning_rate": 8.439011228051515e-06, + "loss": 0.3344, "step": 102500 }, { - "epoch": 3.61, - "eval_loss": 0.2519163489341736, - "eval_runtime": 10.5511, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 3.694093055105056, + "eval_loss": 0.4291780889034271, + "eval_runtime": 3.5329, + "eval_samples_per_second": 28.305, + "eval_steps_per_second": 7.076, "step": 102500 }, { - "epoch": 3.61, - "learning_rate": 9.528843346819989e-06, - "loss": 0.2756, + "epoch": 3.694273254766281, + "grad_norm": 0.22869972884655, + "learning_rate": 8.436825322099324e-06, + "loss": 0.402, "step": 102505 }, { - "epoch": 3.61, - "learning_rate": 9.526605742133529e-06, - "loss": 0.2386, + "epoch": 3.6944534544275056, + "grad_norm": 0.21545152366161346, + "learning_rate": 8.434639641817168e-06, + "loss": 0.3588, "step": 102510 }, { - "epoch": 3.61, - "learning_rate": 9.524368338358897e-06, - "loss": 0.2588, + "epoch": 3.6946336540887303, + "grad_norm": 0.20983858406543732, + "learning_rate": 8.43245418723486e-06, + "loss": 0.3756, "step": 102515 }, { - "epoch": 3.61, - "learning_rate": 9.522131135525153e-06, - "loss": 0.2416, + "epoch": 3.694813853749955, + "grad_norm": 0.23540320992469788, + "learning_rate": 8.430268958382146e-06, + "loss": 0.3386, 
"step": 102520 }, { - "epoch": 3.61, - "learning_rate": 9.519894133661337e-06, - "loss": 0.2571, + "epoch": 3.69499405341118, + "grad_norm": 0.2099265158176422, + "learning_rate": 8.428083955288801e-06, + "loss": 0.3688, "step": 102525 }, { - "epoch": 3.61, - "learning_rate": 9.517657332796485e-06, - "loss": 0.2645, + "epoch": 3.695174253072404, + "grad_norm": 0.2085736095905304, + "learning_rate": 8.425899177984611e-06, + "loss": 0.3659, "step": 102530 }, { - "epoch": 3.61, - "learning_rate": 9.515420732959654e-06, - "loss": 0.2454, + "epoch": 3.695354452733629, + "grad_norm": 0.23389971256256104, + "learning_rate": 8.423714626499338e-06, + "loss": 0.355, "step": 102535 }, { - "epoch": 3.61, - "learning_rate": 9.513184334179883e-06, - "loss": 0.2603, + "epoch": 3.6955346523948536, + "grad_norm": 0.23065683245658875, + "learning_rate": 8.421530300862743e-06, + "loss": 0.3645, "step": 102540 }, { - "epoch": 3.61, - "learning_rate": 9.510948136486209e-06, - "loss": 0.2355, + "epoch": 3.695714852056078, + "grad_norm": 0.21496419608592987, + "learning_rate": 8.419346201104588e-06, + "loss": 0.3849, "step": 102545 }, { - "epoch": 3.61, - "learning_rate": 9.508712139907654e-06, - "loss": 0.2529, + "epoch": 3.6958950517173026, + "grad_norm": 0.22192232310771942, + "learning_rate": 8.41716232725462e-06, + "loss": 0.3783, "step": 102550 }, { - "epoch": 3.61, - "learning_rate": 9.506476344473272e-06, - "loss": 0.2456, + "epoch": 3.6960752513785273, + "grad_norm": 0.21217724680900574, + "learning_rate": 8.414978679342617e-06, + "loss": 0.3721, "step": 102555 }, { - "epoch": 3.61, - "learning_rate": 9.504240750212088e-06, - "loss": 0.2642, + "epoch": 3.696255451039752, + "grad_norm": 0.27816903591156006, + "learning_rate": 8.412795257398318e-06, + "loss": 0.3908, "step": 102560 }, { - "epoch": 3.61, - "learning_rate": 9.502005357153113e-06, - "loss": 0.2516, + "epoch": 3.696435650700977, + "grad_norm": 0.25314852595329285, + "learning_rate": 8.410612061451473e-06, + "loss": 0.3816, "step": 102565 }, { - "epoch": 3.61, - "learning_rate": 9.499770165325386e-06, - "loss": 0.2802, + "epoch": 3.6966158503622015, + "grad_norm": 0.25173550844192505, + "learning_rate": 8.408429091531825e-06, + "loss": 0.3694, "step": 102570 }, { - "epoch": 3.61, - "learning_rate": 9.497535174757934e-06, - "loss": 0.2455, + "epoch": 3.696796050023426, + "grad_norm": 0.2536860704421997, + "learning_rate": 8.406246347669108e-06, + "loss": 0.3842, "step": 102575 }, { - "epoch": 3.61, - "learning_rate": 9.495300385479777e-06, - "loss": 0.2484, + "epoch": 3.6969762496846506, + "grad_norm": 0.22247686982154846, + "learning_rate": 8.404063829893083e-06, + "loss": 0.4039, "step": 102580 }, { - "epoch": 3.61, - "learning_rate": 9.493065797519923e-06, - "loss": 0.2567, + "epoch": 3.6971564493458753, + "grad_norm": 0.20821170508861542, + "learning_rate": 8.401881538233483e-06, + "loss": 0.3945, "step": 102585 }, { - "epoch": 3.61, - "learning_rate": 9.490831410907382e-06, - "loss": 0.2646, + "epoch": 3.6973366490071, + "grad_norm": 0.252389132976532, + "learning_rate": 8.399699472720019e-06, + "loss": 0.3411, "step": 102590 }, { - "epoch": 3.61, - "learning_rate": 9.488597225671184e-06, - "loss": 0.2687, + "epoch": 3.6975168486683243, + "grad_norm": 0.2539133131504059, + "learning_rate": 8.397517633382441e-06, + "loss": 0.3972, "step": 102595 }, { - "epoch": 3.61, - "learning_rate": 9.486363241840326e-06, - "loss": 0.2501, + "epoch": 3.697697048329549, + "grad_norm": 0.17788057029247284, + "learning_rate": 8.395336020250472e-06, + 
"loss": 0.3556, "step": 102600 }, { - "epoch": 3.61, - "learning_rate": 9.48412945944381e-06, - "loss": 0.2685, + "epoch": 3.6978772479907738, + "grad_norm": 0.20301634073257446, + "learning_rate": 8.393154633353825e-06, + "loss": 0.3843, "step": 102605 }, { - "epoch": 3.61, - "learning_rate": 9.481895878510658e-06, - "loss": 0.2516, + "epoch": 3.6980574476519985, + "grad_norm": 0.2251247763633728, + "learning_rate": 8.39097347272225e-06, + "loss": 0.3663, "step": 102610 }, { - "epoch": 3.61, - "learning_rate": 9.47966249906986e-06, - "loss": 0.264, + "epoch": 3.6982376473132232, + "grad_norm": 0.285604327917099, + "learning_rate": 8.388792538385429e-06, + "loss": 0.3637, "step": 102615 }, { - "epoch": 3.61, - "learning_rate": 9.477429321150409e-06, - "loss": 0.248, + "epoch": 3.6984178469744475, + "grad_norm": 0.2661183178424835, + "learning_rate": 8.3866118303731e-06, + "loss": 0.4143, "step": 102620 }, { - "epoch": 3.61, - "learning_rate": 9.475196344781315e-06, - "loss": 0.2655, + "epoch": 3.6985980466356723, + "grad_norm": 0.2364463359117508, + "learning_rate": 8.384431348714972e-06, + "loss": 0.3785, "step": 102625 }, { - "epoch": 3.61, - "learning_rate": 9.472963569991555e-06, - "loss": 0.2697, + "epoch": 3.698778246296897, + "grad_norm": 0.21531599760055542, + "learning_rate": 8.382251093440747e-06, + "loss": 0.3636, "step": 102630 }, { - "epoch": 3.61, - "learning_rate": 9.470730996810142e-06, - "loss": 0.2444, + "epoch": 3.6989584459581217, + "grad_norm": 0.23172800242900848, + "learning_rate": 8.380071064580133e-06, + "loss": 0.3799, "step": 102635 }, { - "epoch": 3.61, - "learning_rate": 9.468498625266051e-06, - "loss": 0.2492, + "epoch": 3.699138645619346, + "grad_norm": 0.2571316063404083, + "learning_rate": 8.377891262162827e-06, + "loss": 0.3758, "step": 102640 }, { - "epoch": 3.61, - "learning_rate": 9.466266455388262e-06, - "loss": 0.2685, + "epoch": 3.6993188452805708, + "grad_norm": 0.22325757145881653, + "learning_rate": 8.37571168621854e-06, + "loss": 0.3447, "step": 102645 }, { - "epoch": 3.61, - "learning_rate": 9.464034487205775e-06, - "loss": 0.2759, + "epoch": 3.6994990449417955, + "grad_norm": 0.2073674499988556, + "learning_rate": 8.373532336776965e-06, + "loss": 0.4042, "step": 102650 }, { - "epoch": 3.61, - "learning_rate": 9.461802720747562e-06, - "loss": 0.2395, + "epoch": 3.6996792446030202, + "grad_norm": 0.22034046053886414, + "learning_rate": 8.371353213867792e-06, + "loss": 0.386, "step": 102655 }, { - "epoch": 3.61, - "learning_rate": 9.459571156042599e-06, - "loss": 0.2341, + "epoch": 3.699859444264245, + "grad_norm": 0.23101651668548584, + "learning_rate": 8.369174317520714e-06, + "loss": 0.4023, "step": 102660 }, { - "epoch": 3.61, - "learning_rate": 9.457339793119858e-06, - "loss": 0.2506, + "epoch": 3.7000396439254697, + "grad_norm": 0.2687579095363617, + "learning_rate": 8.366995647765413e-06, + "loss": 0.3909, "step": 102665 }, { - "epoch": 3.61, - "learning_rate": 9.45510863200832e-06, - "loss": 0.2364, + "epoch": 3.700219843586694, + "grad_norm": 0.24219973385334015, + "learning_rate": 8.364817204631569e-06, + "loss": 0.3828, "step": 102670 }, { - "epoch": 3.61, - "learning_rate": 9.452877672736946e-06, - "loss": 0.2614, + "epoch": 3.7004000432479187, + "grad_norm": 0.2693396508693695, + "learning_rate": 8.36263898814888e-06, + "loss": 0.3629, "step": 102675 }, { - "epoch": 3.61, - "learning_rate": 9.450646915334718e-06, - "loss": 0.2472, + "epoch": 3.7005802429091434, + "grad_norm": 0.20992809534072876, + "learning_rate": 
8.36046099834701e-06, + "loss": 0.3739, "step": 102680 }, { - "epoch": 3.61, - "learning_rate": 9.448416359830583e-06, - "loss": 0.2616, + "epoch": 3.7007604425703677, + "grad_norm": 0.22777998447418213, + "learning_rate": 8.35828323525564e-06, + "loss": 0.3974, "step": 102685 }, { - "epoch": 3.61, - "learning_rate": 9.446186006253523e-06, - "loss": 0.258, + "epoch": 3.7009406422315925, + "grad_norm": 0.25267812609672546, + "learning_rate": 8.35610569890444e-06, + "loss": 0.3982, "step": 102690 }, { - "epoch": 3.61, - "learning_rate": 9.443955854632488e-06, - "loss": 0.2586, + "epoch": 3.701120841892817, + "grad_norm": 0.24532164633274078, + "learning_rate": 8.353928389323064e-06, + "loss": 0.3529, "step": 102695 }, { - "epoch": 3.61, - "learning_rate": 9.441725904996431e-06, - "loss": 0.2646, + "epoch": 3.701301041554042, + "grad_norm": 0.23484160006046295, + "learning_rate": 8.351751306541215e-06, + "loss": 0.3878, "step": 102700 }, { - "epoch": 3.61, - "learning_rate": 9.439496157374303e-06, - "loss": 0.2476, + "epoch": 3.7014812412152667, + "grad_norm": 0.2185060977935791, + "learning_rate": 8.349574450588518e-06, + "loss": 0.3754, "step": 102705 }, { - "epoch": 3.61, - "learning_rate": 9.437266611795073e-06, - "loss": 0.2542, + "epoch": 3.7016614408764914, + "grad_norm": 0.28905272483825684, + "learning_rate": 8.347397821494637e-06, + "loss": 0.3616, "step": 102710 }, { - "epoch": 3.61, - "learning_rate": 9.435037268287677e-06, - "loss": 0.2702, + "epoch": 3.7018416405377157, + "grad_norm": 0.25226137042045593, + "learning_rate": 8.345221419289247e-06, + "loss": 0.387, "step": 102715 }, { - "epoch": 3.61, - "learning_rate": 9.432808126881057e-06, - "loss": 0.2632, + "epoch": 3.7020218401989404, + "grad_norm": 0.22827744483947754, + "learning_rate": 8.343045244001982e-06, + "loss": 0.3774, "step": 102720 }, { - "epoch": 3.61, - "learning_rate": 9.430579187604174e-06, - "loss": 0.2578, + "epoch": 3.702202039860165, + "grad_norm": 0.2841804623603821, + "learning_rate": 8.340869295662517e-06, + "loss": 0.4008, "step": 102725 }, { - "epoch": 3.61, - "learning_rate": 9.428350450485951e-06, - "loss": 0.251, + "epoch": 3.7023822395213895, + "grad_norm": 0.2542993724346161, + "learning_rate": 8.338693574300474e-06, + "loss": 0.3553, "step": 102730 }, { - "epoch": 3.61, - "learning_rate": 9.426121915555341e-06, - "loss": 0.2416, + "epoch": 3.702562439182614, + "grad_norm": 0.21926385164260864, + "learning_rate": 8.336518079945497e-06, + "loss": 0.4037, "step": 102735 }, { - "epoch": 3.61, - "learning_rate": 9.423893582841268e-06, - "loss": 0.2561, + "epoch": 3.702742638843839, + "grad_norm": 0.2608276903629303, + "learning_rate": 8.334342812627244e-06, + "loss": 0.3645, "step": 102740 }, { - "epoch": 3.61, - "learning_rate": 9.42166545237268e-06, - "loss": 0.2564, + "epoch": 3.7029228385050637, + "grad_norm": 0.2661699950695038, + "learning_rate": 8.332167772375344e-06, + "loss": 0.3456, "step": 102745 }, { - "epoch": 3.62, - "learning_rate": 9.419437524178499e-06, - "loss": 0.2659, + "epoch": 3.7031030381662884, + "grad_norm": 0.18789133429527283, + "learning_rate": 8.32999295921943e-06, + "loss": 0.3741, "step": 102750 }, { - "epoch": 3.62, - "learning_rate": 9.417209798287654e-06, - "loss": 0.2413, + "epoch": 3.703283237827513, + "grad_norm": 0.276394248008728, + "learning_rate": 8.327818373189133e-06, + "loss": 0.3662, "step": 102755 }, { - "epoch": 3.62, - "learning_rate": 9.414982274729061e-06, - "loss": 0.2587, + "epoch": 3.7034634374887374, + "grad_norm": 0.20188158750534058, + 
"learning_rate": 8.325644014314077e-06, + "loss": 0.3959, "step": 102760 }, { - "epoch": 3.62, - "learning_rate": 9.412754953531663e-06, - "loss": 0.2574, + "epoch": 3.703643637149962, + "grad_norm": 0.22388440370559692, + "learning_rate": 8.323469882623899e-06, + "loss": 0.3678, "step": 102765 }, { - "epoch": 3.62, - "learning_rate": 9.410527834724369e-06, - "loss": 0.2834, + "epoch": 3.703823836811187, + "grad_norm": 0.2965695559978485, + "learning_rate": 8.321295978148217e-06, + "loss": 0.3772, "step": 102770 }, { - "epoch": 3.62, - "learning_rate": 9.408300918336097e-06, - "loss": 0.248, + "epoch": 3.704004036472411, + "grad_norm": 0.2891329824924469, + "learning_rate": 8.319122300916649e-06, + "loss": 0.3989, "step": 102775 }, { - "epoch": 3.62, - "learning_rate": 9.406074204395754e-06, - "loss": 0.2823, + "epoch": 3.704184236133636, + "grad_norm": 0.2619994580745697, + "learning_rate": 8.316948850958809e-06, + "loss": 0.3902, "step": 102780 }, { - "epoch": 3.62, - "learning_rate": 9.403847692932268e-06, - "loss": 0.2686, + "epoch": 3.7043644357948606, + "grad_norm": 0.24362410604953766, + "learning_rate": 8.31477562830431e-06, + "loss": 0.3854, "step": 102785 }, { - "epoch": 3.62, - "learning_rate": 9.401621383974532e-06, - "loss": 0.2514, + "epoch": 3.7045446354560854, + "grad_norm": 0.27109575271606445, + "learning_rate": 8.312602632982756e-06, + "loss": 0.373, "step": 102790 }, { - "epoch": 3.62, - "learning_rate": 9.399395277551473e-06, - "loss": 0.2579, + "epoch": 3.70472483511731, + "grad_norm": 0.24202807247638702, + "learning_rate": 8.310429865023775e-06, + "loss": 0.3879, "step": 102795 }, { - "epoch": 3.62, - "learning_rate": 9.397169373691977e-06, - "loss": 0.2614, + "epoch": 3.704905034778535, + "grad_norm": 0.245781809091568, + "learning_rate": 8.308257324456942e-06, + "loss": 0.3847, "step": 102800 }, { - "epoch": 3.62, - "learning_rate": 9.394943672424964e-06, - "loss": 0.2541, + "epoch": 3.705085234439759, + "grad_norm": 0.2280445694923401, + "learning_rate": 8.306085011311878e-06, + "loss": 0.3645, "step": 102805 }, { - "epoch": 3.62, - "learning_rate": 9.392718173779319e-06, - "loss": 0.2648, + "epoch": 3.705265434100984, + "grad_norm": 0.2140251249074936, + "learning_rate": 8.303912925618174e-06, + "loss": 0.3751, "step": 102810 }, { - "epoch": 3.62, - "learning_rate": 9.390492877783946e-06, - "loss": 0.2603, + "epoch": 3.7054456337622086, + "grad_norm": 0.3311702311038971, + "learning_rate": 8.301741067405424e-06, + "loss": 0.3822, "step": 102815 }, { - "epoch": 3.62, - "learning_rate": 9.388267784467725e-06, - "loss": 0.2555, + "epoch": 3.7056258334234333, + "grad_norm": 0.2583707273006439, + "learning_rate": 8.29956943670322e-06, + "loss": 0.3744, "step": 102820 }, { - "epoch": 3.62, - "learning_rate": 9.38604289385957e-06, - "loss": 0.2853, + "epoch": 3.7058060330846576, + "grad_norm": 0.2845836877822876, + "learning_rate": 8.297398033541137e-06, + "loss": 0.389, "step": 102825 }, { - "epoch": 3.62, - "learning_rate": 9.383818205988356e-06, - "loss": 0.2637, + "epoch": 3.7059862327458823, + "grad_norm": 0.23895393311977386, + "learning_rate": 8.295226857948785e-06, + "loss": 0.3877, "step": 102830 }, { - "epoch": 3.62, - "learning_rate": 9.381593720882961e-06, - "loss": 0.2786, + "epoch": 3.706166432407107, + "grad_norm": 0.25052353739738464, + "learning_rate": 8.29305590995573e-06, + "loss": 0.4122, "step": 102835 }, { - "epoch": 3.62, - "learning_rate": 9.379369438572292e-06, - "loss": 0.2633, + "epoch": 3.706346632068332, + "grad_norm": 0.23934602737426758, 
+ "learning_rate": 8.290885189591555e-06, + "loss": 0.4222, "step": 102840 }, { - "epoch": 3.62, - "learning_rate": 9.377145359085205e-06, - "loss": 0.2493, + "epoch": 3.7065268317295565, + "grad_norm": 0.24071356654167175, + "learning_rate": 8.288714696885835e-06, + "loss": 0.3534, "step": 102845 }, { - "epoch": 3.62, - "learning_rate": 9.374921482450597e-06, - "loss": 0.2476, + "epoch": 3.706707031390781, + "grad_norm": 0.250177264213562, + "learning_rate": 8.28654443186814e-06, + "loss": 0.3517, "step": 102850 }, { - "epoch": 3.62, - "learning_rate": 9.372697808697328e-06, - "loss": 0.2641, + "epoch": 3.7068872310520056, + "grad_norm": 0.25618675351142883, + "learning_rate": 8.284374394568034e-06, + "loss": 0.3571, "step": 102855 }, { - "epoch": 3.62, - "learning_rate": 9.37047433785429e-06, - "loss": 0.2473, + "epoch": 3.7070674307132303, + "grad_norm": 0.2533912658691406, + "learning_rate": 8.282204585015098e-06, + "loss": 0.384, "step": 102860 }, { - "epoch": 3.62, - "learning_rate": 9.36825106995034e-06, - "loss": 0.2858, + "epoch": 3.707247630374455, + "grad_norm": 0.24974510073661804, + "learning_rate": 8.280035003238889e-06, + "loss": 0.397, "step": 102865 }, { - "epoch": 3.62, - "learning_rate": 9.366028005014348e-06, - "loss": 0.2718, + "epoch": 3.7074278300356793, + "grad_norm": 0.2762242555618286, + "learning_rate": 8.277865649268965e-06, + "loss": 0.3575, "step": 102870 }, { - "epoch": 3.62, - "learning_rate": 9.363805143075175e-06, - "loss": 0.2565, + "epoch": 3.707608029696904, + "grad_norm": 0.28377920389175415, + "learning_rate": 8.275696523134885e-06, + "loss": 0.3622, "step": 102875 }, { - "epoch": 3.62, - "learning_rate": 9.36158248416169e-06, - "loss": 0.2571, + "epoch": 3.707788229358129, + "grad_norm": 0.2506207525730133, + "learning_rate": 8.273527624866192e-06, + "loss": 0.4064, "step": 102880 }, { - "epoch": 3.62, - "learning_rate": 9.359360028302755e-06, - "loss": 0.2721, + "epoch": 3.7079684290193535, + "grad_norm": 0.27845442295074463, + "learning_rate": 8.271358954492458e-06, + "loss": 0.3983, "step": 102885 }, { - "epoch": 3.62, - "learning_rate": 9.357137775527219e-06, - "loss": 0.2719, + "epoch": 3.7081486286805783, + "grad_norm": 0.1891564130783081, + "learning_rate": 8.269190512043226e-06, + "loss": 0.3888, "step": 102890 }, { - "epoch": 3.62, - "learning_rate": 9.354915725863936e-06, - "loss": 0.2675, + "epoch": 3.7083288283418026, + "grad_norm": 0.21435974538326263, + "learning_rate": 8.267022297548016e-06, + "loss": 0.3931, "step": 102895 }, { - "epoch": 3.62, - "learning_rate": 9.352693879341756e-06, - "loss": 0.2651, + "epoch": 3.7085090280030273, + "grad_norm": 0.2660645842552185, + "learning_rate": 8.264854311036399e-06, + "loss": 0.3827, "step": 102900 }, { - "epoch": 3.62, - "learning_rate": 9.350472235989547e-06, - "loss": 0.2441, + "epoch": 3.708689227664252, + "grad_norm": 0.23603610694408417, + "learning_rate": 8.262686552537894e-06, + "loss": 0.3614, "step": 102905 }, { - "epoch": 3.62, - "learning_rate": 9.348250795836141e-06, - "loss": 0.251, + "epoch": 3.7088694273254768, + "grad_norm": 0.24526463449001312, + "learning_rate": 8.260519022082058e-06, + "loss": 0.3742, "step": 102910 }, { - "epoch": 3.62, - "learning_rate": 9.346029558910375e-06, - "loss": 0.2562, + "epoch": 3.709049626986701, + "grad_norm": 0.24888326227664948, + "learning_rate": 8.258351719698401e-06, + "loss": 0.3826, "step": 102915 }, { - "epoch": 3.62, - "learning_rate": 9.343808525241108e-06, - "loss": 0.2616, + "epoch": 3.7092298266479258, + "grad_norm": 
0.2145439237356186, + "learning_rate": 8.256184645416453e-06, + "loss": 0.3911, "step": 102920 }, { - "epoch": 3.62, - "learning_rate": 9.341587694857168e-06, - "loss": 0.2518, + "epoch": 3.7094100263091505, + "grad_norm": 0.2232615351676941, + "learning_rate": 8.254017799265757e-06, + "loss": 0.3904, "step": 102925 }, { - "epoch": 3.62, - "learning_rate": 9.339367067787393e-06, - "loss": 0.2519, + "epoch": 3.7095902259703752, + "grad_norm": 0.2325001209974289, + "learning_rate": 8.251851181275824e-06, + "loss": 0.3509, "step": 102930 }, { - "epoch": 3.62, - "learning_rate": 9.33714664406061e-06, - "loss": 0.2531, + "epoch": 3.7097704256316, + "grad_norm": 0.2643667459487915, + "learning_rate": 8.249684791476178e-06, + "loss": 0.3894, "step": 102935 }, { - "epoch": 3.62, - "learning_rate": 9.334926423705662e-06, - "loss": 0.2655, + "epoch": 3.7099506252928247, + "grad_norm": 0.22574815154075623, + "learning_rate": 8.247518629896334e-06, + "loss": 0.3611, "step": 102940 }, { - "epoch": 3.62, - "learning_rate": 9.33270640675137e-06, - "loss": 0.2587, + "epoch": 3.710130824954049, + "grad_norm": 0.26316380500793457, + "learning_rate": 8.245352696565797e-06, + "loss": 0.3905, "step": 102945 }, { - "epoch": 3.62, - "learning_rate": 9.330486593226553e-06, - "loss": 0.2436, + "epoch": 3.7103110246152737, + "grad_norm": 0.20727650821208954, + "learning_rate": 8.243186991514092e-06, + "loss": 0.3788, "step": 102950 }, { - "epoch": 3.62, - "learning_rate": 9.32826698316004e-06, - "loss": 0.2479, + "epoch": 3.7104912242764985, + "grad_norm": 0.24111421406269073, + "learning_rate": 8.241021514770721e-06, + "loss": 0.364, "step": 102955 }, { - "epoch": 3.62, - "learning_rate": 9.32604757658066e-06, - "loss": 0.2617, + "epoch": 3.7106714239377228, + "grad_norm": 0.21225392818450928, + "learning_rate": 8.238856266365188e-06, + "loss": 0.3928, "step": 102960 }, { - "epoch": 3.62, - "learning_rate": 9.323828373517224e-06, - "loss": 0.2575, + "epoch": 3.7108516235989475, + "grad_norm": 0.27659305930137634, + "learning_rate": 8.236691246326994e-06, + "loss": 0.3714, "step": 102965 }, { - "epoch": 3.62, - "learning_rate": 9.321609373998538e-06, - "loss": 0.276, + "epoch": 3.7110318232601722, + "grad_norm": 0.26336660981178284, + "learning_rate": 8.234526454685634e-06, + "loss": 0.3199, "step": 102970 }, { - "epoch": 3.62, - "learning_rate": 9.319390578053427e-06, - "loss": 0.2469, + "epoch": 3.711212022921397, + "grad_norm": 0.2715068459510803, + "learning_rate": 8.232361891470599e-06, + "loss": 0.3795, "step": 102975 }, { - "epoch": 3.62, - "learning_rate": 9.3171719857107e-06, - "loss": 0.2463, + "epoch": 3.7113922225826217, + "grad_norm": 0.2276494801044464, + "learning_rate": 8.230197556711403e-06, + "loss": 0.3569, "step": 102980 }, { - "epoch": 3.62, - "learning_rate": 9.314953596999156e-06, - "loss": 0.2471, + "epoch": 3.7115724222438464, + "grad_norm": 0.25353577733039856, + "learning_rate": 8.228033450437503e-06, + "loss": 0.406, "step": 102985 }, { - "epoch": 3.62, - "learning_rate": 9.312735411947597e-06, - "loss": 0.2415, + "epoch": 3.7117526219050707, + "grad_norm": 0.20923548936843872, + "learning_rate": 8.225869572678405e-06, + "loss": 0.3721, "step": 102990 }, { - "epoch": 3.62, - "learning_rate": 9.310517430584836e-06, - "loss": 0.2512, + "epoch": 3.7119328215662954, + "grad_norm": 0.2902141809463501, + "learning_rate": 8.22370592346359e-06, + "loss": 0.4074, "step": 102995 }, { - "epoch": 3.62, - "learning_rate": 9.308299652939666e-06, - "loss": 0.2563, + "epoch": 3.71211302122752, + 
"grad_norm": 0.26280632615089417, + "learning_rate": 8.221542502822533e-06, + "loss": 0.3569, "step": 103000 }, { - "epoch": 3.62, - "eval_loss": 0.25192898511886597, - "eval_runtime": 10.5381, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 3.71211302122752, + "eval_loss": 0.42930299043655396, + "eval_runtime": 3.5329, + "eval_samples_per_second": 28.306, + "eval_steps_per_second": 7.076, "step": 103000 }, { - "epoch": 3.62, - "learning_rate": 9.306082079040879e-06, - "loss": 0.2784, + "epoch": 3.7122932208887445, + "grad_norm": 0.24260425567626953, + "learning_rate": 8.219379310784708e-06, + "loss": 0.3687, "step": 103005 }, { - "epoch": 3.62, - "learning_rate": 9.30386470891727e-06, - "loss": 0.2625, + "epoch": 3.712473420549969, + "grad_norm": 0.2277086079120636, + "learning_rate": 8.217216347379594e-06, + "loss": 0.392, "step": 103010 }, { - "epoch": 3.62, - "learning_rate": 9.301647542597643e-06, - "loss": 0.2697, + "epoch": 3.712653620211194, + "grad_norm": 0.28349313139915466, + "learning_rate": 8.21505361263665e-06, + "loss": 0.3905, "step": 103015 }, { - "epoch": 3.62, - "learning_rate": 9.299430580110776e-06, - "loss": 0.2611, + "epoch": 3.7128338198724187, + "grad_norm": 0.2919948399066925, + "learning_rate": 8.212891106585357e-06, + "loss": 0.3954, "step": 103020 }, { - "epoch": 3.62, - "learning_rate": 9.29721382148546e-06, - "loss": 0.2729, + "epoch": 3.7130140195336434, + "grad_norm": 0.20646890997886658, + "learning_rate": 8.210728829255173e-06, + "loss": 0.3824, "step": 103025 }, { - "epoch": 3.62, - "learning_rate": 9.29499726675046e-06, - "loss": 0.2328, + "epoch": 3.713194219194868, + "grad_norm": 0.2770639657974243, + "learning_rate": 8.208566780675561e-06, + "loss": 0.3688, "step": 103030 }, { - "epoch": 3.63, - "learning_rate": 9.292780915934585e-06, - "loss": 0.2722, + "epoch": 3.7133744188560924, + "grad_norm": 0.31089577078819275, + "learning_rate": 8.206404960875972e-06, + "loss": 0.3768, "step": 103035 }, { - "epoch": 3.63, - "learning_rate": 9.290564769066592e-06, - "loss": 0.278, + "epoch": 3.713554618517317, + "grad_norm": 0.20020338892936707, + "learning_rate": 8.204243369885859e-06, + "loss": 0.3446, "step": 103040 }, { - "epoch": 3.63, - "learning_rate": 9.288348826175266e-06, - "loss": 0.2406, + "epoch": 3.713734818178542, + "grad_norm": 0.25456637144088745, + "learning_rate": 8.202082007734684e-06, + "loss": 0.3762, "step": 103045 }, { - "epoch": 3.63, - "learning_rate": 9.28613308728937e-06, - "loss": 0.2726, + "epoch": 3.713915017839766, + "grad_norm": 0.23088417947292328, + "learning_rate": 8.19992087445189e-06, + "loss": 0.3943, "step": 103050 }, { - "epoch": 3.63, - "learning_rate": 9.283917552437684e-06, - "loss": 0.2624, + "epoch": 3.714095217500991, + "grad_norm": 0.26196154952049255, + "learning_rate": 8.197759970066923e-06, + "loss": 0.4015, "step": 103055 }, { - "epoch": 3.63, - "learning_rate": 9.281702221648966e-06, - "loss": 0.2461, + "epoch": 3.7142754171622157, + "grad_norm": 0.21619191765785217, + "learning_rate": 8.195599294609222e-06, + "loss": 0.3914, "step": 103060 }, { - "epoch": 3.63, - "learning_rate": 9.279487094951991e-06, - "loss": 0.2531, + "epoch": 3.7144556168234404, + "grad_norm": 0.29650604724884033, + "learning_rate": 8.19343884810822e-06, + "loss": 0.3818, "step": 103065 }, { - "epoch": 3.63, - "learning_rate": 9.277272172375511e-06, - "loss": 0.2718, + "epoch": 3.714635816484665, + "grad_norm": 0.26321521401405334, + "learning_rate": 8.191278630593367e-06, + "loss": 0.3734, "step": 
103070 }, { - "epoch": 3.63, - "learning_rate": 9.275057453948297e-06, - "loss": 0.2636, + "epoch": 3.71481601614589, + "grad_norm": 0.2702058255672455, + "learning_rate": 8.189118642094095e-06, + "loss": 0.3759, "step": 103075 }, { - "epoch": 3.63, - "learning_rate": 9.2728429396991e-06, - "loss": 0.2607, + "epoch": 3.714996215807114, + "grad_norm": 0.2815110683441162, + "learning_rate": 8.186958882639813e-06, + "loss": 0.3757, "step": 103080 }, { - "epoch": 3.63, - "learning_rate": 9.270628629656663e-06, - "loss": 0.2376, + "epoch": 3.715176415468339, + "grad_norm": 0.28946641087532043, + "learning_rate": 8.18479935225997e-06, + "loss": 0.3692, "step": 103085 }, { - "epoch": 3.63, - "learning_rate": 9.268414523849755e-06, - "loss": 0.2535, + "epoch": 3.7153566151295636, + "grad_norm": 0.3027191758155823, + "learning_rate": 8.18264005098397e-06, + "loss": 0.3738, "step": 103090 }, { - "epoch": 3.63, - "learning_rate": 9.26620062230712e-06, - "loss": 0.2569, + "epoch": 3.7155368147907883, + "grad_norm": 0.22006292641162872, + "learning_rate": 8.18048097884126e-06, + "loss": 0.398, "step": 103095 }, { - "epoch": 3.63, - "learning_rate": 9.263986925057497e-06, - "loss": 0.2596, + "epoch": 3.7157170144520126, + "grad_norm": 0.27258020639419556, + "learning_rate": 8.178322135861233e-06, + "loss": 0.3971, "step": 103100 }, { - "epoch": 3.63, - "learning_rate": 9.261773432129628e-06, - "loss": 0.2691, + "epoch": 3.7158972141132374, + "grad_norm": 0.22416990995407104, + "learning_rate": 8.176163522073302e-06, + "loss": 0.3712, "step": 103105 }, { - "epoch": 3.63, - "learning_rate": 9.259560143552265e-06, - "loss": 0.2405, + "epoch": 3.716077413774462, + "grad_norm": 0.22454127669334412, + "learning_rate": 8.174005137506894e-06, + "loss": 0.3709, "step": 103110 }, { - "epoch": 3.63, - "learning_rate": 9.257347059354132e-06, - "loss": 0.288, + "epoch": 3.716257613435687, + "grad_norm": 0.21970023214817047, + "learning_rate": 8.171846982191409e-06, + "loss": 0.3468, "step": 103115 }, { - "epoch": 3.63, - "learning_rate": 9.255134179563985e-06, - "loss": 0.2593, + "epoch": 3.7164378130969116, + "grad_norm": 0.23587019741535187, + "learning_rate": 8.169689056156249e-06, + "loss": 0.3688, "step": 103120 }, { - "epoch": 3.63, - "learning_rate": 9.252921504210533e-06, - "loss": 0.2573, + "epoch": 3.716618012758136, + "grad_norm": 0.2582648992538452, + "learning_rate": 8.167531359430815e-06, + "loss": 0.3482, "step": 103125 }, { - "epoch": 3.63, - "learning_rate": 9.250709033322524e-06, - "loss": 0.2445, + "epoch": 3.7167982124193606, + "grad_norm": 0.23627053201198578, + "learning_rate": 8.165373892044503e-06, + "loss": 0.3601, "step": 103130 }, { - "epoch": 3.63, - "learning_rate": 9.248496766928683e-06, - "loss": 0.2501, + "epoch": 3.7169784120805853, + "grad_norm": 0.2360253483057022, + "learning_rate": 8.163216654026721e-06, + "loss": 0.3759, "step": 103135 }, { - "epoch": 3.63, - "learning_rate": 9.246284705057728e-06, - "loss": 0.2482, + "epoch": 3.71715861174181, + "grad_norm": 0.32699692249298096, + "learning_rate": 8.16105964540685e-06, + "loss": 0.4162, "step": 103140 }, { - "epoch": 3.63, - "learning_rate": 9.244072847738377e-06, - "loss": 0.2378, + "epoch": 3.7173388114030343, + "grad_norm": 0.2977791726589203, + "learning_rate": 8.158902866214282e-06, + "loss": 0.3725, "step": 103145 }, { - "epoch": 3.63, - "learning_rate": 9.241861194999365e-06, - "loss": 0.2562, + "epoch": 3.717519011064259, + "grad_norm": 0.29648059606552124, + "learning_rate": 8.156746316478403e-06, + "loss": 0.412, 
"step": 103150 }, { - "epoch": 3.63, - "learning_rate": 9.2396497468694e-06, - "loss": 0.2551, + "epoch": 3.717699210725484, + "grad_norm": 0.2452070415019989, + "learning_rate": 8.154589996228595e-06, + "loss": 0.3724, "step": 103155 }, { - "epoch": 3.63, - "learning_rate": 9.237438503377196e-06, - "loss": 0.2423, + "epoch": 3.7178794103867085, + "grad_norm": 0.23447571694850922, + "learning_rate": 8.152433905494228e-06, + "loss": 0.3573, "step": 103160 }, { - "epoch": 3.63, - "learning_rate": 9.235227464551457e-06, - "loss": 0.2807, + "epoch": 3.7180596100479333, + "grad_norm": 0.22790288925170898, + "learning_rate": 8.150278044304702e-06, + "loss": 0.3927, "step": 103165 }, { - "epoch": 3.63, - "learning_rate": 9.233016630420898e-06, - "loss": 0.2741, + "epoch": 3.718239809709158, + "grad_norm": 0.24579019844532013, + "learning_rate": 8.14812241268936e-06, + "loss": 0.3558, "step": 103170 }, { - "epoch": 3.63, - "learning_rate": 9.230806001014238e-06, - "loss": 0.2461, + "epoch": 3.7184200093703823, + "grad_norm": 0.2221144735813141, + "learning_rate": 8.145967010677597e-06, + "loss": 0.3759, "step": 103175 }, { - "epoch": 3.63, - "learning_rate": 9.228595576360172e-06, - "loss": 0.2494, + "epoch": 3.718600209031607, + "grad_norm": 0.2838151454925537, + "learning_rate": 8.143811838298767e-06, + "loss": 0.3491, "step": 103180 }, { - "epoch": 3.63, - "learning_rate": 9.226385356487388e-06, - "loss": 0.2675, + "epoch": 3.7187804086928318, + "grad_norm": 0.20898236334323883, + "learning_rate": 8.141656895582233e-06, + "loss": 0.3515, "step": 103185 }, { - "epoch": 3.63, - "learning_rate": 9.2241753414246e-06, - "loss": 0.2761, + "epoch": 3.718960608354056, + "grad_norm": 0.2016228437423706, + "learning_rate": 8.139502182557373e-06, + "loss": 0.3518, "step": 103190 }, { - "epoch": 3.63, - "learning_rate": 9.221965531200504e-06, - "loss": 0.2476, + "epoch": 3.719140808015281, + "grad_norm": 0.29854917526245117, + "learning_rate": 8.137347699253526e-06, + "loss": 0.3839, "step": 103195 }, { - "epoch": 3.63, - "learning_rate": 9.219755925843776e-06, - "loss": 0.2674, + "epoch": 3.7193210076765055, + "grad_norm": 0.24756811559200287, + "learning_rate": 8.135193445700043e-06, + "loss": 0.3774, "step": 103200 }, { - "epoch": 3.63, - "learning_rate": 9.217546525383128e-06, - "loss": 0.2556, + "epoch": 3.7195012073377303, + "grad_norm": 0.25717464089393616, + "learning_rate": 8.133039421926291e-06, + "loss": 0.3913, "step": 103205 }, { - "epoch": 3.63, - "learning_rate": 9.215337329847238e-06, - "loss": 0.263, + "epoch": 3.719681406998955, + "grad_norm": 0.216623917222023, + "learning_rate": 8.130885627961612e-06, + "loss": 0.401, "step": 103210 }, { - "epoch": 3.63, - "learning_rate": 9.213128339264788e-06, - "loss": 0.2573, + "epoch": 3.7198616066601797, + "grad_norm": 0.2196447253227234, + "learning_rate": 8.12873206383535e-06, + "loss": 0.3996, "step": 103215 }, { - "epoch": 3.63, - "learning_rate": 9.210919553664454e-06, - "loss": 0.237, + "epoch": 3.720041806321404, + "grad_norm": 0.17287389934062958, + "learning_rate": 8.126578729576851e-06, + "loss": 0.3962, "step": 103220 }, { - "epoch": 3.63, - "learning_rate": 9.208710973074932e-06, - "loss": 0.2578, + "epoch": 3.7202220059826288, + "grad_norm": 0.23201312124729156, + "learning_rate": 8.124425625215437e-06, + "loss": 0.3583, "step": 103225 }, { - "epoch": 3.63, - "learning_rate": 9.206502597524886e-06, - "loss": 0.2596, + "epoch": 3.7204022056438535, + "grad_norm": 0.21326382458209991, + "learning_rate": 8.122272750780465e-06, + "loss": 
0.38, "step": 103230 }, { - "epoch": 3.63, - "learning_rate": 9.204294427043e-06, - "loss": 0.2492, + "epoch": 3.7205824053050778, + "grad_norm": 0.2922148108482361, + "learning_rate": 8.120120106301263e-06, + "loss": 0.3748, "step": 103235 }, { - "epoch": 3.63, - "learning_rate": 9.202086461657935e-06, - "loss": 0.229, + "epoch": 3.7207626049663025, + "grad_norm": 0.22092339396476746, + "learning_rate": 8.117967691807155e-06, + "loss": 0.4146, "step": 103240 }, { - "epoch": 3.63, - "learning_rate": 9.199878701398376e-06, - "loss": 0.2658, + "epoch": 3.7209428046275272, + "grad_norm": 0.2587362825870514, + "learning_rate": 8.115815507327465e-06, + "loss": 0.3996, "step": 103245 }, { - "epoch": 3.63, - "learning_rate": 9.197671146292977e-06, - "loss": 0.2651, + "epoch": 3.721123004288752, + "grad_norm": 0.2722143232822418, + "learning_rate": 8.113663552891516e-06, + "loss": 0.4322, "step": 103250 }, { - "epoch": 3.63, - "learning_rate": 9.195463796370405e-06, - "loss": 0.2714, + "epoch": 3.7213032039499767, + "grad_norm": 0.21530699729919434, + "learning_rate": 8.11151182852864e-06, + "loss": 0.3856, "step": 103255 }, { - "epoch": 3.63, - "learning_rate": 9.19325665165931e-06, - "loss": 0.2282, + "epoch": 3.7214834036112014, + "grad_norm": 0.22060318291187286, + "learning_rate": 8.10936033426815e-06, + "loss": 0.3657, "step": 103260 }, { - "epoch": 3.63, - "learning_rate": 9.191049712188371e-06, - "loss": 0.2691, + "epoch": 3.7216636032724257, + "grad_norm": 0.36303722858428955, + "learning_rate": 8.107209070139341e-06, + "loss": 0.4083, "step": 103265 }, { - "epoch": 3.63, - "learning_rate": 9.18884297798623e-06, - "loss": 0.2691, + "epoch": 3.7218438029336505, + "grad_norm": 0.189983531832695, + "learning_rate": 8.105058036171547e-06, + "loss": 0.3606, "step": 103270 }, { - "epoch": 3.63, - "learning_rate": 9.18663644908154e-06, - "loss": 0.252, + "epoch": 3.722024002594875, + "grad_norm": 0.2879542410373688, + "learning_rate": 8.102907232394057e-06, + "loss": 0.3928, "step": 103275 }, { - "epoch": 3.63, - "learning_rate": 9.184430125502958e-06, - "loss": 0.2643, + "epoch": 3.7222042022560995, + "grad_norm": 0.2692555785179138, + "learning_rate": 8.100756658836202e-06, + "loss": 0.3752, "step": 103280 }, { - "epoch": 3.63, - "learning_rate": 9.182224007279122e-06, - "loss": 0.2822, + "epoch": 3.7223844019173242, + "grad_norm": 0.298684298992157, + "learning_rate": 8.098606315527258e-06, + "loss": 0.3915, "step": 103285 }, { - "epoch": 3.63, - "learning_rate": 9.18001809443869e-06, - "loss": 0.2773, + "epoch": 3.722564601578549, + "grad_norm": 0.2624755799770355, + "learning_rate": 8.096456202496518e-06, + "loss": 0.3932, "step": 103290 }, { - "epoch": 3.63, - "learning_rate": 9.177812387010295e-06, - "loss": 0.2452, + "epoch": 3.7227448012397737, + "grad_norm": 0.25200989842414856, + "learning_rate": 8.094306319773304e-06, + "loss": 0.3696, "step": 103295 }, { - "epoch": 3.63, - "learning_rate": 9.175606885022575e-06, - "loss": 0.254, + "epoch": 3.7229250009009984, + "grad_norm": 0.26927831768989563, + "learning_rate": 8.092156667386891e-06, + "loss": 0.3891, "step": 103300 }, { - "epoch": 3.63, - "learning_rate": 9.173401588504177e-06, - "loss": 0.278, + "epoch": 3.723105200562223, + "grad_norm": 0.21889813244342804, + "learning_rate": 8.090007245366567e-06, + "loss": 0.3829, "step": 103305 }, { - "epoch": 3.63, - "learning_rate": 9.171196497483731e-06, - "loss": 0.2877, + "epoch": 3.7232854002234474, + "grad_norm": 0.24276183545589447, + "learning_rate": 8.087858053741626e-06, + "loss": 
0.3566, "step": 103310 }, { - "epoch": 3.63, - "learning_rate": 9.168991611989855e-06, - "loss": 0.2475, + "epoch": 3.723465599884672, + "grad_norm": 0.2647343873977661, + "learning_rate": 8.085709092541332e-06, + "loss": 0.3916, "step": 103315 }, { - "epoch": 3.64, - "learning_rate": 9.166786932051203e-06, - "loss": 0.2479, + "epoch": 3.723645799545897, + "grad_norm": 0.23813170194625854, + "learning_rate": 8.083560361794988e-06, + "loss": 0.3831, "step": 103320 }, { - "epoch": 3.64, - "learning_rate": 9.164582457696384e-06, - "loss": 0.2354, + "epoch": 3.7238259992071217, + "grad_norm": 0.21774239838123322, + "learning_rate": 8.081411861531856e-06, + "loss": 0.3904, "step": 103325 }, { - "epoch": 3.64, - "learning_rate": 9.162378188954027e-06, - "loss": 0.2533, + "epoch": 3.724006198868346, + "grad_norm": 0.2522357106208801, + "learning_rate": 8.079263591781213e-06, + "loss": 0.3394, "step": 103330 }, { - "epoch": 3.64, - "learning_rate": 9.160174125852745e-06, - "loss": 0.2438, + "epoch": 3.7241863985295707, + "grad_norm": 0.2412281036376953, + "learning_rate": 8.077115552572328e-06, + "loss": 0.3927, "step": 103335 }, { - "epoch": 3.64, - "learning_rate": 9.157970268421162e-06, - "loss": 0.2633, + "epoch": 3.7243665981907954, + "grad_norm": 0.20674298703670502, + "learning_rate": 8.074967743934466e-06, + "loss": 0.3638, "step": 103340 }, { - "epoch": 3.64, - "learning_rate": 9.155766616687902e-06, - "loss": 0.2715, + "epoch": 3.72454679785202, + "grad_norm": 0.2086464762687683, + "learning_rate": 8.072820165896886e-06, + "loss": 0.3662, "step": 103345 }, { - "epoch": 3.64, - "learning_rate": 9.153563170681573e-06, - "loss": 0.2678, + "epoch": 3.724726997513245, + "grad_norm": 0.24110788106918335, + "learning_rate": 8.07067281848886e-06, + "loss": 0.4261, "step": 103350 }, { - "epoch": 3.64, - "learning_rate": 9.151359930430772e-06, - "loss": 0.2665, + "epoch": 3.724907197174469, + "grad_norm": 0.2587282955646515, + "learning_rate": 8.06852570173964e-06, + "loss": 0.3886, "step": 103355 }, { - "epoch": 3.64, - "learning_rate": 9.149156895964128e-06, - "loss": 0.2473, + "epoch": 3.725087396835694, + "grad_norm": 0.20811423659324646, + "learning_rate": 8.06637881567848e-06, + "loss": 0.3289, "step": 103360 }, { - "epoch": 3.64, - "learning_rate": 9.146954067310237e-06, - "loss": 0.2512, + "epoch": 3.7252675964969186, + "grad_norm": 0.19246096909046173, + "learning_rate": 8.064232160334632e-06, + "loss": 0.351, "step": 103365 }, { - "epoch": 3.64, - "learning_rate": 9.144751444497698e-06, - "loss": 0.2683, + "epoch": 3.7254477961581434, + "grad_norm": 0.2847633361816406, + "learning_rate": 8.062085735737329e-06, + "loss": 0.3723, "step": 103370 }, { - "epoch": 3.64, - "learning_rate": 9.142549027555103e-06, - "loss": 0.2704, + "epoch": 3.7256279958193677, + "grad_norm": 0.21437424421310425, + "learning_rate": 8.059939541915847e-06, + "loss": 0.3805, "step": 103375 }, { - "epoch": 3.64, - "learning_rate": 9.140346816511069e-06, - "loss": 0.2578, + "epoch": 3.7258081954805924, + "grad_norm": 0.23143932223320007, + "learning_rate": 8.057793578899403e-06, + "loss": 0.3444, "step": 103380 }, { - "epoch": 3.64, - "learning_rate": 9.138144811394178e-06, - "loss": 0.2615, + "epoch": 3.725988395141817, + "grad_norm": 0.18867234885692596, + "learning_rate": 8.055647846717232e-06, + "loss": 0.367, "step": 103385 }, { - "epoch": 3.64, - "learning_rate": 9.135943012233017e-06, - "loss": 0.2751, + "epoch": 3.726168594803042, + "grad_norm": 0.22278887033462524, + "learning_rate": 8.053502345398583e-06, + 
"loss": 0.3791, "step": 103390 }, { - "epoch": 3.64, - "learning_rate": 9.13374141905618e-06, - "loss": 0.2699, + "epoch": 3.7263487944642666, + "grad_norm": 0.2264564037322998, + "learning_rate": 8.051357074972688e-06, + "loss": 0.3769, "step": 103395 }, { - "epoch": 3.64, - "learning_rate": 9.131540031892261e-06, - "loss": 0.2361, + "epoch": 3.726528994125491, + "grad_norm": 0.3085385262966156, + "learning_rate": 8.04921203546877e-06, + "loss": 0.3945, "step": 103400 }, { - "epoch": 3.64, - "learning_rate": 9.12933885076984e-06, - "loss": 0.2633, + "epoch": 3.7267091937867156, + "grad_norm": 0.1784570962190628, + "learning_rate": 8.047067226916058e-06, + "loss": 0.3429, "step": 103405 }, { - "epoch": 3.64, - "learning_rate": 9.127137875717492e-06, - "loss": 0.2574, + "epoch": 3.7268893934479403, + "grad_norm": 0.1708642989397049, + "learning_rate": 8.044922649343762e-06, + "loss": 0.3599, "step": 103410 }, { - "epoch": 3.64, - "learning_rate": 9.124937106763793e-06, - "loss": 0.2722, + "epoch": 3.727069593109165, + "grad_norm": 0.30440521240234375, + "learning_rate": 8.042778302781123e-06, + "loss": 0.4162, "step": 103415 }, { - "epoch": 3.64, - "learning_rate": 9.12273654393733e-06, - "loss": 0.2493, + "epoch": 3.7272497927703894, + "grad_norm": 0.28747257590293884, + "learning_rate": 8.040634187257345e-06, + "loss": 0.3721, "step": 103420 }, { - "epoch": 3.64, - "learning_rate": 9.120536187266667e-06, - "loss": 0.2745, + "epoch": 3.727429992431614, + "grad_norm": 0.24325698614120483, + "learning_rate": 8.038490302801641e-06, + "loss": 0.3838, "step": 103425 }, { - "epoch": 3.64, - "learning_rate": 9.118336036780373e-06, - "loss": 0.263, + "epoch": 3.727610192092839, + "grad_norm": 0.22188496589660645, + "learning_rate": 8.036346649443224e-06, + "loss": 0.3684, "step": 103430 }, { - "epoch": 3.64, - "learning_rate": 9.116136092507024e-06, - "loss": 0.2741, + "epoch": 3.7277903917540636, + "grad_norm": 0.23214270174503326, + "learning_rate": 8.034203227211292e-06, + "loss": 0.3609, "step": 103435 }, { - "epoch": 3.64, - "learning_rate": 9.113936354475183e-06, - "loss": 0.2627, + "epoch": 3.7279705914152883, + "grad_norm": 0.2736099660396576, + "learning_rate": 8.032060036135067e-06, + "loss": 0.3663, "step": 103440 }, { - "epoch": 3.64, - "learning_rate": 9.1117368227134e-06, - "loss": 0.2345, + "epoch": 3.728150791076513, + "grad_norm": 0.26921337842941284, + "learning_rate": 8.029917076243745e-06, + "loss": 0.3804, "step": 103445 }, { - "epoch": 3.64, - "learning_rate": 9.109537497250242e-06, - "loss": 0.2622, + "epoch": 3.7283309907377373, + "grad_norm": 0.25673896074295044, + "learning_rate": 8.027774347566499e-06, + "loss": 0.3767, "step": 103450 }, { - "epoch": 3.64, - "learning_rate": 9.107338378114277e-06, - "loss": 0.2581, + "epoch": 3.728511190398962, + "grad_norm": 0.288904070854187, + "learning_rate": 8.025631850132554e-06, + "loss": 0.3969, "step": 103455 }, { - "epoch": 3.64, - "learning_rate": 9.105139465334048e-06, - "loss": 0.2879, + "epoch": 3.728691390060187, + "grad_norm": 0.23645661771297455, + "learning_rate": 8.023489583971077e-06, + "loss": 0.3695, "step": 103460 }, { - "epoch": 3.64, - "learning_rate": 9.102940758938109e-06, - "loss": 0.2679, + "epoch": 3.728871589721411, + "grad_norm": 0.24535304307937622, + "learning_rate": 8.021347549111277e-06, + "loss": 0.4008, "step": 103465 }, { - "epoch": 3.64, - "learning_rate": 9.100742258955e-06, - "loss": 0.2528, + "epoch": 3.729051789382636, + "grad_norm": 0.25152429938316345, + "learning_rate": 
8.019205745582337e-06, + "loss": 0.3894, "step": 103470 }, { - "epoch": 3.64, - "learning_rate": 9.098543965413281e-06, - "loss": 0.2925, + "epoch": 3.7292319890438606, + "grad_norm": 0.2640352249145508, + "learning_rate": 8.017064173413416e-06, + "loss": 0.3442, "step": 103475 }, { - "epoch": 3.64, - "learning_rate": 9.096345878341492e-06, - "loss": 0.2465, + "epoch": 3.7294121887050853, + "grad_norm": 0.22095341980457306, + "learning_rate": 8.015351082277115e-06, + "loss": 0.3711, "step": 103480 }, { - "epoch": 3.64, - "learning_rate": 9.094147997768168e-06, - "loss": 0.271, + "epoch": 3.72959238836631, + "grad_norm": 0.23479041457176208, + "learning_rate": 8.013209926629791e-06, + "loss": 0.3823, "step": 103485 }, { - "epoch": 3.64, - "learning_rate": 9.091950323721842e-06, - "loss": 0.2876, + "epoch": 3.7297725880275348, + "grad_norm": 0.2575128972530365, + "learning_rate": 8.011069002424194e-06, + "loss": 0.3876, "step": 103490 }, { - "epoch": 3.64, - "learning_rate": 9.089752856231069e-06, - "loss": 0.2557, + "epoch": 3.729952787688759, + "grad_norm": 0.25867316126823425, + "learning_rate": 8.00892830968949e-06, + "loss": 0.3505, "step": 103495 }, { - "epoch": 3.64, - "learning_rate": 9.08755559532436e-06, - "loss": 0.2575, + "epoch": 3.7301329873499838, + "grad_norm": 0.24454909563064575, + "learning_rate": 8.00678784845485e-06, + "loss": 0.4008, "step": 103500 }, { - "epoch": 3.64, - "eval_loss": 0.2520705461502075, - "eval_runtime": 10.549, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 3.7301329873499838, + "eval_loss": 0.4287916421890259, + "eval_runtime": 3.5358, + "eval_samples_per_second": 28.282, + "eval_steps_per_second": 7.071, "step": 103500 }, { - "epoch": 3.64, - "learning_rate": 9.085358541030262e-06, - "loss": 0.2715, + "epoch": 3.7303131870112085, + "grad_norm": 0.2569969892501831, + "learning_rate": 8.004647618749424e-06, + "loss": 0.3734, "step": 103505 }, { - "epoch": 3.64, - "learning_rate": 9.083161693377288e-06, - "loss": 0.2492, + "epoch": 3.730493386672433, + "grad_norm": 0.26036691665649414, + "learning_rate": 8.002507620602398e-06, + "loss": 0.3745, "step": 103510 }, { - "epoch": 3.64, - "learning_rate": 9.080965052393974e-06, - "loss": 0.2546, + "epoch": 3.7306735863336575, + "grad_norm": 0.2165229618549347, + "learning_rate": 8.000367854042911e-06, + "loss": 0.3696, "step": 103515 }, { - "epoch": 3.64, - "learning_rate": 9.078768618108843e-06, - "loss": 0.2508, + "epoch": 3.7308537859948823, + "grad_norm": 0.20541858673095703, + "learning_rate": 7.99822831910012e-06, + "loss": 0.3753, "step": 103520 }, { - "epoch": 3.64, - "learning_rate": 9.07657239055041e-06, - "loss": 0.2488, + "epoch": 3.731033985656107, + "grad_norm": 0.23357829451560974, + "learning_rate": 7.99608901580318e-06, + "loss": 0.3534, "step": 103525 }, { - "epoch": 3.64, - "learning_rate": 9.074376369747176e-06, - "loss": 0.2609, + "epoch": 3.7312141853173317, + "grad_norm": 0.2972395420074463, + "learning_rate": 7.993949944181223e-06, + "loss": 0.3948, "step": 103530 }, { - "epoch": 3.64, - "learning_rate": 9.072180555727683e-06, - "loss": 0.2645, + "epoch": 3.7313943849785565, + "grad_norm": 0.22406528890132904, + "learning_rate": 7.991811104263417e-06, + "loss": 0.3747, "step": 103535 }, { - "epoch": 3.64, - "learning_rate": 9.069984948520428e-06, - "loss": 0.2386, + "epoch": 3.7315745846397808, + "grad_norm": 0.21077711880207062, + "learning_rate": 7.989672496078899e-06, + "loss": 0.3868, "step": 103540 }, { - "epoch": 3.64, - "learning_rate": 
9.06778954815391e-06, - "loss": 0.2696, + "epoch": 3.7317547843010055, + "grad_norm": 0.2398710399866104, + "learning_rate": 7.98753411965678e-06, + "loss": 0.3643, "step": 103545 }, { - "epoch": 3.64, - "learning_rate": 9.065594354656653e-06, - "loss": 0.2695, + "epoch": 3.7319349839622302, + "grad_norm": 0.29205483198165894, + "learning_rate": 7.985395975026227e-06, + "loss": 0.3413, "step": 103550 }, { - "epoch": 3.64, - "learning_rate": 9.063399368057143e-06, - "loss": 0.2459, + "epoch": 3.7321151836234545, + "grad_norm": 0.24142169952392578, + "learning_rate": 7.983258062216361e-06, + "loss": 0.3876, "step": 103555 }, { - "epoch": 3.64, - "learning_rate": 9.061204588383901e-06, - "loss": 0.2661, + "epoch": 3.7322953832846792, + "grad_norm": 0.2126045972108841, + "learning_rate": 7.981120381256308e-06, + "loss": 0.3878, "step": 103560 }, { - "epoch": 3.64, - "learning_rate": 9.059010015665403e-06, - "loss": 0.275, + "epoch": 3.732475582945904, + "grad_norm": 0.21912820637226105, + "learning_rate": 7.978982932175195e-06, + "loss": 0.3782, "step": 103565 }, { - "epoch": 3.64, - "learning_rate": 9.056815649930165e-06, - "loss": 0.266, + "epoch": 3.7326557826071287, + "grad_norm": 0.3162360191345215, + "learning_rate": 7.976845715002131e-06, + "loss": 0.4018, "step": 103570 }, { - "epoch": 3.64, - "learning_rate": 9.054621491206667e-06, - "loss": 0.2611, + "epoch": 3.7328359822683534, + "grad_norm": 0.20726588368415833, + "learning_rate": 7.974708729766262e-06, + "loss": 0.3311, "step": 103575 }, { - "epoch": 3.64, - "learning_rate": 9.052427539523401e-06, - "loss": 0.249, + "epoch": 3.733016181929578, + "grad_norm": 0.30441704392433167, + "learning_rate": 7.97257197649669e-06, + "loss": 0.3811, "step": 103580 }, { - "epoch": 3.64, - "learning_rate": 9.050233794908844e-06, - "loss": 0.2448, + "epoch": 3.7331963815908025, + "grad_norm": 0.23164492845535278, + "learning_rate": 7.970435455222528e-06, + "loss": 0.3566, "step": 103585 }, { - "epoch": 3.64, - "learning_rate": 9.048040257391501e-06, - "loss": 0.2596, + "epoch": 3.733376581252027, + "grad_norm": 0.26288750767707825, + "learning_rate": 7.968299165972884e-06, + "loss": 0.3842, "step": 103590 }, { - "epoch": 3.64, - "learning_rate": 9.04584692699984e-06, - "loss": 0.2596, + "epoch": 3.733556780913252, + "grad_norm": 0.23692747950553894, + "learning_rate": 7.966163108776866e-06, + "loss": 0.3784, "step": 103595 }, { - "epoch": 3.64, - "learning_rate": 9.043653803762345e-06, - "loss": 0.2496, + "epoch": 3.7337369805744767, + "grad_norm": 0.19637106359004974, + "learning_rate": 7.964027283663573e-06, + "loss": 0.3741, "step": 103600 }, { - "epoch": 3.65, - "learning_rate": 9.041460887707478e-06, - "loss": 0.2707, + "epoch": 3.733917180235701, + "grad_norm": 0.2366388440132141, + "learning_rate": 7.961891690662115e-06, + "loss": 0.3927, "step": 103605 }, { - "epoch": 3.65, - "learning_rate": 9.039268178863727e-06, - "loss": 0.2813, + "epoch": 3.7340973798969257, + "grad_norm": 0.24893350899219513, + "learning_rate": 7.959756329801588e-06, + "loss": 0.3664, "step": 103610 }, { - "epoch": 3.65, - "learning_rate": 9.037075677259569e-06, - "loss": 0.257, + "epoch": 3.7342775795581504, + "grad_norm": 0.26749271154403687, + "learning_rate": 7.957621201111079e-06, + "loss": 0.3876, "step": 103615 }, { - "epoch": 3.65, - "learning_rate": 9.034883382923459e-06, - "loss": 0.2434, + "epoch": 3.734457779219375, + "grad_norm": 0.23498766124248505, + "learning_rate": 7.95548630461968e-06, + "loss": 0.3765, "step": 103620 }, { - "epoch": 3.65, - 
"learning_rate": 9.032691295883863e-06, - "loss": 0.2592, + "epoch": 3.7346379788806, + "grad_norm": 0.23466086387634277, + "learning_rate": 7.953351640356475e-06, + "loss": 0.3897, "step": 103625 }, { - "epoch": 3.65, - "learning_rate": 9.030499416169256e-06, - "loss": 0.2716, + "epoch": 3.734818178541824, + "grad_norm": 0.27381378412246704, + "learning_rate": 7.951217208350567e-06, + "loss": 0.3869, "step": 103630 }, { - "epoch": 3.65, - "learning_rate": 9.028307743808087e-06, - "loss": 0.2672, + "epoch": 3.734998378203049, + "grad_norm": 0.23438720405101776, + "learning_rate": 7.94908300863102e-06, + "loss": 0.3729, "step": 103635 }, { - "epoch": 3.65, - "learning_rate": 9.026116278828816e-06, - "loss": 0.2589, + "epoch": 3.7351785778642737, + "grad_norm": 0.2471228539943695, + "learning_rate": 7.946949041226904e-06, + "loss": 0.4086, "step": 103640 }, { - "epoch": 3.65, - "learning_rate": 9.023925021259894e-06, - "loss": 0.244, + "epoch": 3.7353587775254984, + "grad_norm": 0.2612572908401489, + "learning_rate": 7.944815306167314e-06, + "loss": 0.3758, "step": 103645 }, { - "epoch": 3.65, - "learning_rate": 9.021733971129782e-06, - "loss": 0.2679, + "epoch": 3.7355389771867227, + "grad_norm": 0.31286609172821045, + "learning_rate": 7.942681803481309e-06, + "loss": 0.3795, "step": 103650 }, { - "epoch": 3.65, - "learning_rate": 9.019543128466922e-06, - "loss": 0.2594, + "epoch": 3.7357191768479474, + "grad_norm": 0.2046850472688675, + "learning_rate": 7.940548533197973e-06, + "loss": 0.3579, "step": 103655 }, { - "epoch": 3.65, - "learning_rate": 9.017352493299757e-06, - "loss": 0.2675, + "epoch": 3.735899376509172, + "grad_norm": 0.22326718270778656, + "learning_rate": 7.938415495346354e-06, + "loss": 0.3635, "step": 103660 }, { - "epoch": 3.65, - "learning_rate": 9.01560013458205e-06, - "loss": 0.2589, + "epoch": 3.736079576170397, + "grad_norm": 0.27218303084373474, + "learning_rate": 7.936282689955515e-06, + "loss": 0.3704, "step": 103665 }, { - "epoch": 3.65, - "learning_rate": 9.013409872978834e-06, - "loss": 0.2636, + "epoch": 3.7362597758316216, + "grad_norm": 0.2567422688007355, + "learning_rate": 7.934150117054528e-06, + "loss": 0.3358, "step": 103670 }, { - "epoch": 3.65, - "learning_rate": 9.011219818950945e-06, - "loss": 0.2685, + "epoch": 3.7364399754928463, + "grad_norm": 0.21654558181762695, + "learning_rate": 7.93201777667244e-06, + "loss": 0.3797, "step": 103675 }, { - "epoch": 3.65, - "learning_rate": 9.009029972526839e-06, - "loss": 0.2679, + "epoch": 3.7366201751540706, + "grad_norm": 0.26781266927719116, + "learning_rate": 7.929885668838305e-06, + "loss": 0.3711, "step": 103680 }, { - "epoch": 3.65, - "learning_rate": 9.00684033373494e-06, - "loss": 0.2409, + "epoch": 3.7368003748152954, + "grad_norm": 0.23572920262813568, + "learning_rate": 7.927753793581174e-06, + "loss": 0.3858, "step": 103685 }, { - "epoch": 3.65, - "learning_rate": 9.004650902603679e-06, - "loss": 0.2331, + "epoch": 3.73698057447652, + "grad_norm": 0.2272387593984604, + "learning_rate": 7.925622150930085e-06, + "loss": 0.4051, "step": 103690 }, { - "epoch": 3.65, - "learning_rate": 9.002461679161475e-06, - "loss": 0.2469, + "epoch": 3.7371607741377444, + "grad_norm": 0.24096395075321198, + "learning_rate": 7.923490740914097e-06, + "loss": 0.3699, "step": 103695 }, { - "epoch": 3.65, - "learning_rate": 9.000272663436765e-06, - "loss": 0.2537, + "epoch": 3.737340973798969, + "grad_norm": 0.2557925581932068, + "learning_rate": 7.92135956356224e-06, + "loss": 0.4079, "step": 103700 }, { - "epoch": 
3.65, - "learning_rate": 8.998083855457979e-06, - "loss": 0.247, + "epoch": 3.737521173460194, + "grad_norm": 0.2199949324131012, + "learning_rate": 7.919228618903551e-06, + "loss": 0.3751, "step": 103705 }, { - "epoch": 3.65, - "learning_rate": 8.995895255253529e-06, - "loss": 0.2679, + "epoch": 3.7377013731214186, + "grad_norm": 0.2655465602874756, + "learning_rate": 7.91709790696707e-06, + "loss": 0.4015, "step": 103710 }, { - "epoch": 3.65, - "learning_rate": 8.99370686285182e-06, - "loss": 0.2445, + "epoch": 3.7378815727826433, + "grad_norm": 0.25117290019989014, + "learning_rate": 7.91496742778181e-06, + "loss": 0.3899, "step": 103715 }, { - "epoch": 3.65, - "learning_rate": 8.991518678281292e-06, - "loss": 0.2439, + "epoch": 3.738061772443868, + "grad_norm": 0.30154553055763245, + "learning_rate": 7.91283718137682e-06, + "loss": 0.424, "step": 103720 }, { - "epoch": 3.65, - "learning_rate": 8.989330701570339e-06, - "loss": 0.2615, + "epoch": 3.7382419721050923, + "grad_norm": 0.27134832739830017, + "learning_rate": 7.910707167781126e-06, + "loss": 0.3404, "step": 103725 }, { - "epoch": 3.65, - "learning_rate": 8.98714293274737e-06, - "loss": 0.2648, + "epoch": 3.738422171766317, + "grad_norm": 0.2242002934217453, + "learning_rate": 7.908577387023719e-06, + "loss": 0.38, "step": 103730 }, { - "epoch": 3.65, - "learning_rate": 8.984955371840806e-06, - "loss": 0.2536, + "epoch": 3.738602371427542, + "grad_norm": 0.2798958122730255, + "learning_rate": 7.906447839133643e-06, + "loss": 0.3884, "step": 103735 }, { - "epoch": 3.65, - "learning_rate": 8.98276801887904e-06, - "loss": 0.2468, + "epoch": 3.738782571088766, + "grad_norm": 0.2313423752784729, + "learning_rate": 7.90431852413991e-06, + "loss": 0.3852, "step": 103740 }, { - "epoch": 3.65, - "learning_rate": 8.980580873890477e-06, - "loss": 0.2553, + "epoch": 3.738962770749991, + "grad_norm": 0.24077194929122925, + "learning_rate": 7.902189442071525e-06, + "loss": 0.3847, "step": 103745 }, { - "epoch": 3.65, - "learning_rate": 8.978393936903507e-06, - "loss": 0.2512, + "epoch": 3.7391429704112156, + "grad_norm": 0.24760626256465912, + "learning_rate": 7.900060592957498e-06, + "loss": 0.3997, "step": 103750 }, { - "epoch": 3.65, - "learning_rate": 8.97620720794653e-06, - "loss": 0.2637, + "epoch": 3.7393231700724403, + "grad_norm": 0.19511263072490692, + "learning_rate": 7.897931976826827e-06, + "loss": 0.3953, "step": 103755 }, { - "epoch": 3.65, - "learning_rate": 8.974020687047955e-06, - "loss": 0.2473, + "epoch": 3.739503369733665, + "grad_norm": 0.1964816004037857, + "learning_rate": 7.895803593708532e-06, + "loss": 0.3628, "step": 103760 }, { - "epoch": 3.65, - "learning_rate": 8.971834374236155e-06, - "loss": 0.2655, + "epoch": 3.7396835693948898, + "grad_norm": 0.2270769327878952, + "learning_rate": 7.893675443631599e-06, + "loss": 0.3912, "step": 103765 }, { - "epoch": 3.65, - "learning_rate": 8.969648269539515e-06, - "loss": 0.2595, + "epoch": 3.739863769056114, + "grad_norm": 0.21808400750160217, + "learning_rate": 7.891547526625026e-06, + "loss": 0.3762, "step": 103770 }, { - "epoch": 3.65, - "learning_rate": 8.96746237298644e-06, - "loss": 0.2466, + "epoch": 3.740043968717339, + "grad_norm": 0.29697415232658386, + "learning_rate": 7.889419842717807e-06, + "loss": 0.364, "step": 103775 }, { - "epoch": 3.65, - "learning_rate": 8.965276684605297e-06, - "loss": 0.2621, + "epoch": 3.7402241683785635, + "grad_norm": 0.26142972707748413, + "learning_rate": 7.88729239193893e-06, + "loss": 0.3906, "step": 103780 }, { - "epoch": 
3.65, - "learning_rate": 8.96309120442447e-06, - "loss": 0.2389, + "epoch": 3.740404368039788, + "grad_norm": 0.3567783534526825, + "learning_rate": 7.885165174317374e-06, + "loss": 0.3906, "step": 103785 }, { - "epoch": 3.65, - "learning_rate": 8.960905932472327e-06, - "loss": 0.2661, + "epoch": 3.7405845677010126, + "grad_norm": 0.2854664921760559, + "learning_rate": 7.883038189882137e-06, + "loss": 0.4332, "step": 103790 }, { - "epoch": 3.65, - "learning_rate": 8.95872086877726e-06, - "loss": 0.274, + "epoch": 3.7407647673622373, + "grad_norm": 0.2237059473991394, + "learning_rate": 7.880911438662186e-06, + "loss": 0.38, "step": 103795 }, { - "epoch": 3.65, - "learning_rate": 8.956536013367628e-06, - "loss": 0.2816, + "epoch": 3.740944967023462, + "grad_norm": 0.1987508237361908, + "learning_rate": 7.878784920686509e-06, + "loss": 0.4177, "step": 103800 }, { - "epoch": 3.65, - "learning_rate": 8.954351366271796e-06, - "loss": 0.2454, + "epoch": 3.7411251666846868, + "grad_norm": 0.2105659693479538, + "learning_rate": 7.876658635984065e-06, + "loss": 0.3688, "step": 103805 }, { - "epoch": 3.65, - "learning_rate": 8.952166927518146e-06, - "loss": 0.2581, + "epoch": 3.7413053663459115, + "grad_norm": 0.2706106901168823, + "learning_rate": 7.87453258458383e-06, + "loss": 0.3544, "step": 103810 }, { - "epoch": 3.65, - "learning_rate": 8.949982697135024e-06, - "loss": 0.2761, + "epoch": 3.7414855660071358, + "grad_norm": 0.2130729705095291, + "learning_rate": 7.872406766514779e-06, + "loss": 0.4017, "step": 103815 }, { - "epoch": 3.65, - "learning_rate": 8.947798675150806e-06, - "loss": 0.2835, + "epoch": 3.7416657656683605, + "grad_norm": 0.191060870885849, + "learning_rate": 7.870281181805877e-06, + "loss": 0.3528, "step": 103820 }, { - "epoch": 3.65, - "learning_rate": 8.945614861593843e-06, - "loss": 0.2625, + "epoch": 3.7418459653295852, + "grad_norm": 0.26372456550598145, + "learning_rate": 7.868155830486063e-06, + "loss": 0.363, "step": 103825 }, { - "epoch": 3.65, - "learning_rate": 8.943431256492484e-06, - "loss": 0.2508, + "epoch": 3.74202616499081, + "grad_norm": 0.2443712055683136, + "learning_rate": 7.866030712584318e-06, + "loss": 0.3847, "step": 103830 }, { - "epoch": 3.65, - "learning_rate": 8.941247859875095e-06, - "loss": 0.2554, + "epoch": 3.7422063646520343, + "grad_norm": 0.30589404702186584, + "learning_rate": 7.863905828129577e-06, + "loss": 0.3667, "step": 103835 }, { - "epoch": 3.65, - "learning_rate": 8.939064671770023e-06, - "loss": 0.2452, + "epoch": 3.742386564313259, + "grad_norm": 0.3021865785121918, + "learning_rate": 7.861781177150818e-06, + "loss": 0.4225, "step": 103840 }, { - "epoch": 3.65, - "learning_rate": 8.936881692205601e-06, - "loss": 0.2538, + "epoch": 3.7425667639744837, + "grad_norm": 0.21095475554466248, + "learning_rate": 7.859656759676964e-06, + "loss": 0.3872, "step": 103845 }, { - "epoch": 3.65, - "learning_rate": 8.934698921210195e-06, - "loss": 0.2836, + "epoch": 3.7427469636357085, + "grad_norm": 0.25870993733406067, + "learning_rate": 7.857532575736961e-06, + "loss": 0.3782, "step": 103850 }, { - "epoch": 3.65, - "learning_rate": 8.932516358812132e-06, - "loss": 0.2783, + "epoch": 3.742927163296933, + "grad_norm": 0.2628803849220276, + "learning_rate": 7.855408625359768e-06, + "loss": 0.3724, "step": 103855 }, { - "epoch": 3.65, - "learning_rate": 8.93033400503975e-06, - "loss": 0.2501, + "epoch": 3.7431073629581575, + "grad_norm": 0.2707422375679016, + "learning_rate": 7.85328490857431e-06, + "loss": 0.3451, "step": 103860 }, { - 
"epoch": 3.65, - "learning_rate": 8.928151859921397e-06, - "loss": 0.2647, + "epoch": 3.7432875626193822, + "grad_norm": 0.21672846376895905, + "learning_rate": 7.851161425409525e-06, + "loss": 0.3629, "step": 103865 }, { - "epoch": 3.65, - "learning_rate": 8.925969923485391e-06, - "loss": 0.2741, + "epoch": 3.743467762280607, + "grad_norm": 0.2119687795639038, + "learning_rate": 7.849038175894346e-06, + "loss": 0.3829, "step": 103870 }, { - "epoch": 3.65, - "learning_rate": 8.923788195760086e-06, - "loss": 0.2295, + "epoch": 3.7436479619418317, + "grad_norm": 0.3262704610824585, + "learning_rate": 7.846915160057694e-06, + "loss": 0.3918, "step": 103875 }, { - "epoch": 3.65, - "learning_rate": 8.921606676773791e-06, - "loss": 0.2456, + "epoch": 3.743828161603056, + "grad_norm": 0.2228257954120636, + "learning_rate": 7.84479237792851e-06, + "loss": 0.3837, "step": 103880 }, { - "epoch": 3.65, - "learning_rate": 8.919425366554832e-06, - "loss": 0.2479, + "epoch": 3.7440083612642807, + "grad_norm": 0.24818168580532074, + "learning_rate": 7.842669829535709e-06, + "loss": 0.3801, "step": 103885 }, { - "epoch": 3.66, - "learning_rate": 8.917244265131543e-06, - "loss": 0.2524, + "epoch": 3.7441885609255054, + "grad_norm": 0.18585266172885895, + "learning_rate": 7.840547514908209e-06, + "loss": 0.3744, "step": 103890 }, { - "epoch": 3.66, - "learning_rate": 8.915063372532242e-06, - "loss": 0.2658, + "epoch": 3.74436876058673, + "grad_norm": 0.25262919068336487, + "learning_rate": 7.838425434074925e-06, + "loss": 0.3952, "step": 103895 }, { - "epoch": 3.66, - "learning_rate": 8.91288268878524e-06, - "loss": 0.28, + "epoch": 3.744548960247955, + "grad_norm": 0.21435338258743286, + "learning_rate": 7.836303587064766e-06, + "loss": 0.3595, "step": 103900 }, { - "epoch": 3.66, - "learning_rate": 8.910702213918848e-06, - "loss": 0.2514, + "epoch": 3.744729159909179, + "grad_norm": 0.23021239042282104, + "learning_rate": 7.834181973906656e-06, + "loss": 0.4104, "step": 103905 }, { - "epoch": 3.66, - "learning_rate": 8.90852194796139e-06, - "loss": 0.2709, + "epoch": 3.744909359570404, + "grad_norm": 0.2677323818206787, + "learning_rate": 7.832060594629504e-06, + "loss": 0.3823, "step": 103910 }, { - "epoch": 3.66, - "learning_rate": 8.906341890941164e-06, - "loss": 0.2609, + "epoch": 3.7450895592316287, + "grad_norm": 0.22377659380435944, + "learning_rate": 7.82993944926218e-06, + "loss": 0.3476, "step": 103915 }, { - "epoch": 3.66, - "learning_rate": 8.904162042886487e-06, - "loss": 0.2631, + "epoch": 3.7452697588928534, + "grad_norm": 0.2559918761253357, + "learning_rate": 7.82781853783362e-06, + "loss": 0.3891, "step": 103920 }, { - "epoch": 3.66, - "learning_rate": 8.901982403825654e-06, - "loss": 0.2406, + "epoch": 3.7454499585540777, + "grad_norm": 0.201356440782547, + "learning_rate": 7.825697860372705e-06, + "loss": 0.3572, "step": 103925 }, { - "epoch": 3.66, - "learning_rate": 8.899802973786977e-06, - "loss": 0.2491, + "epoch": 3.7456301582153024, + "grad_norm": 0.26732808351516724, + "learning_rate": 7.823577416908325e-06, + "loss": 0.3886, "step": 103930 }, { - "epoch": 3.66, - "learning_rate": 8.89762375279875e-06, - "loss": 0.2554, + "epoch": 3.745810357876527, + "grad_norm": 0.22887802124023438, + "learning_rate": 7.821457207469392e-06, + "loss": 0.378, "step": 103935 }, { - "epoch": 3.66, - "learning_rate": 8.895444740889267e-06, - "loss": 0.2572, + "epoch": 3.745990557537752, + "grad_norm": 0.26050490140914917, + "learning_rate": 7.819337232084764e-06, + "loss": 0.3839, "step": 103940 }, 
{ - "epoch": 3.66, - "learning_rate": 8.893265938086808e-06, - "loss": 0.2622, + "epoch": 3.7461707571989766, + "grad_norm": 0.23908868432044983, + "learning_rate": 7.817217490783346e-06, + "loss": 0.4197, "step": 103945 }, { - "epoch": 3.66, - "learning_rate": 8.891087344419688e-06, - "loss": 0.2487, + "epoch": 3.7463509568602014, + "grad_norm": 0.23192131519317627, + "learning_rate": 7.815097983594016e-06, + "loss": 0.3885, "step": 103950 }, { - "epoch": 3.66, - "learning_rate": 8.888908959916182e-06, - "loss": 0.2296, + "epoch": 3.7465311565214257, + "grad_norm": 0.31525352597236633, + "learning_rate": 7.812978710545646e-06, + "loss": 0.3664, "step": 103955 }, { - "epoch": 3.66, - "learning_rate": 8.886730784604565e-06, - "loss": 0.2806, + "epoch": 3.7467113561826504, + "grad_norm": 0.2983095347881317, + "learning_rate": 7.810859671667118e-06, + "loss": 0.4093, "step": 103960 }, { - "epoch": 3.66, - "learning_rate": 8.884552818513143e-06, - "loss": 0.2633, + "epoch": 3.746891555843875, + "grad_norm": 0.30987173318862915, + "learning_rate": 7.808740866987293e-06, + "loss": 0.4, "step": 103965 }, { - "epoch": 3.66, - "learning_rate": 8.882375061670178e-06, - "loss": 0.2681, + "epoch": 3.7470717555050994, + "grad_norm": 0.2129722535610199, + "learning_rate": 7.80662229653504e-06, + "loss": 0.3731, "step": 103970 }, { - "epoch": 3.66, - "learning_rate": 8.880197514103944e-06, - "loss": 0.2587, + "epoch": 3.747251955166324, + "grad_norm": 0.2828996181488037, + "learning_rate": 7.804503960339238e-06, + "loss": 0.3715, "step": 103975 }, { - "epoch": 3.66, - "learning_rate": 8.878020175842722e-06, - "loss": 0.2471, + "epoch": 3.747432154827549, + "grad_norm": 0.26484811305999756, + "learning_rate": 7.80238585842874e-06, + "loss": 0.3518, "step": 103980 }, { - "epoch": 3.66, - "learning_rate": 8.875843046914792e-06, - "loss": 0.2488, + "epoch": 3.7476123544887736, + "grad_norm": 0.23784774541854858, + "learning_rate": 7.800267990832404e-06, + "loss": 0.4227, "step": 103985 }, { - "epoch": 3.66, - "learning_rate": 8.873666127348412e-06, - "loss": 0.258, + "epoch": 3.7477925541499983, + "grad_norm": 0.22352570295333862, + "learning_rate": 7.79815035757909e-06, + "loss": 0.3722, "step": 103990 }, { - "epoch": 3.66, - "learning_rate": 8.871489417171853e-06, - "loss": 0.2769, + "epoch": 3.747972753811223, + "grad_norm": 0.1883036494255066, + "learning_rate": 7.796032958697635e-06, + "loss": 0.3852, "step": 103995 }, { - "epoch": 3.66, - "learning_rate": 8.869312916413363e-06, - "loss": 0.2779, + "epoch": 3.7481529534724474, + "grad_norm": 0.29498735070228577, + "learning_rate": 7.79391579421691e-06, + "loss": 0.3987, "step": 104000 }, { - "epoch": 3.66, - "eval_loss": 0.25182366371154785, - "eval_runtime": 10.5433, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 3.7481529534724474, + "eval_loss": 0.4285476803779602, + "eval_runtime": 3.5328, + "eval_samples_per_second": 28.306, + "eval_steps_per_second": 7.076, "step": 104000 }, { - "epoch": 3.66, - "learning_rate": 8.867136625101225e-06, - "loss": 0.2541, + "epoch": 3.748333153133672, + "grad_norm": 0.2650872766971588, + "learning_rate": 7.79179886416576e-06, + "loss": 0.3802, "step": 104005 }, { - "epoch": 3.66, - "learning_rate": 8.864960543263684e-06, - "loss": 0.2648, + "epoch": 3.748513352794897, + "grad_norm": 0.19966596364974976, + "learning_rate": 7.789682168573004e-06, + "loss": 0.3448, "step": 104010 }, { - "epoch": 3.66, - "learning_rate": 8.862784670928998e-06, - "loss": 0.2607, + "epoch": 
3.748693552456121, + "grad_norm": 0.2707030177116394, + "learning_rate": 7.7875657074675e-06, + "loss": 0.3787, "step": 104015 }, { - "epoch": 3.66, - "learning_rate": 8.860609008125408e-06, - "loss": 0.2858, + "epoch": 3.748873752117346, + "grad_norm": 0.21822407841682434, + "learning_rate": 7.785449480878076e-06, + "loss": 0.4043, "step": 104020 }, { - "epoch": 3.66, - "learning_rate": 8.858433554881185e-06, - "loss": 0.2665, + "epoch": 3.7490539517785706, + "grad_norm": 0.23433195054531097, + "learning_rate": 7.783333488833585e-06, + "loss": 0.3704, "step": 104025 }, { - "epoch": 3.66, - "learning_rate": 8.856258311224556e-06, - "loss": 0.2776, + "epoch": 3.7492341514397953, + "grad_norm": 0.26820576190948486, + "learning_rate": 7.781217731362834e-06, + "loss": 0.372, "step": 104030 }, { - "epoch": 3.66, - "learning_rate": 8.85408327718378e-06, - "loss": 0.2577, + "epoch": 3.74941435110102, + "grad_norm": 0.27072396874427795, + "learning_rate": 7.779102208494648e-06, + "loss": 0.3665, "step": 104035 }, { - "epoch": 3.66, - "learning_rate": 8.851908452787086e-06, - "loss": 0.2582, + "epoch": 3.749594550762245, + "grad_norm": 0.20719735324382782, + "learning_rate": 7.776986920257873e-06, + "loss": 0.3721, "step": 104040 }, { - "epoch": 3.66, - "learning_rate": 8.849733838062724e-06, - "loss": 0.2478, + "epoch": 3.749774750423469, + "grad_norm": 0.2387949824333191, + "learning_rate": 7.774871866681313e-06, + "loss": 0.3456, "step": 104045 }, { - "epoch": 3.66, - "learning_rate": 8.847559433038927e-06, - "loss": 0.2542, + "epoch": 3.749954950084694, + "grad_norm": 0.23849116265773773, + "learning_rate": 7.772757047793792e-06, + "loss": 0.3932, "step": 104050 }, { - "epoch": 3.66, - "learning_rate": 8.845385237743924e-06, - "loss": 0.2527, + "epoch": 3.7501351497459186, + "grad_norm": 0.25094231963157654, + "learning_rate": 7.770642463624117e-06, + "loss": 0.3632, "step": 104055 }, { - "epoch": 3.66, - "learning_rate": 8.843211252205944e-06, - "loss": 0.2481, + "epoch": 3.750315349407143, + "grad_norm": 0.21101891994476318, + "learning_rate": 7.768528114201095e-06, + "loss": 0.3791, "step": 104060 }, { - "epoch": 3.66, - "learning_rate": 8.84103747645322e-06, - "loss": 0.2688, + "epoch": 3.7504955490683676, + "grad_norm": 0.2257104218006134, + "learning_rate": 7.766413999553552e-06, + "loss": 0.3644, "step": 104065 }, { - "epoch": 3.66, - "learning_rate": 8.83886391051398e-06, - "loss": 0.2292, + "epoch": 3.7506757487295923, + "grad_norm": 0.28515586256980896, + "learning_rate": 7.76430011971028e-06, + "loss": 0.3604, "step": 104070 }, { - "epoch": 3.66, - "learning_rate": 8.836690554416433e-06, - "loss": 0.2643, + "epoch": 3.750855948390817, + "grad_norm": 0.23402278125286102, + "learning_rate": 7.762186474700084e-06, + "loss": 0.3458, "step": 104075 }, { - "epoch": 3.66, - "learning_rate": 8.834517408188814e-06, - "loss": 0.2564, + "epoch": 3.7510361480520418, + "grad_norm": 0.239648699760437, + "learning_rate": 7.760073064551757e-06, + "loss": 0.3902, "step": 104080 }, { - "epoch": 3.66, - "learning_rate": 8.832344471859324e-06, - "loss": 0.2784, + "epoch": 3.7512163477132665, + "grad_norm": 0.19721876084804535, + "learning_rate": 7.75795988929409e-06, + "loss": 0.3715, "step": 104085 }, { - "epoch": 3.66, - "learning_rate": 8.830171745456195e-06, - "loss": 0.2447, + "epoch": 3.751396547374491, + "grad_norm": 0.29029005765914917, + "learning_rate": 7.755846948955889e-06, + "loss": 0.3953, "step": 104090 }, { - "epoch": 3.66, - "learning_rate": 8.827999229007622e-06, - "loss": 0.2457, + 
"epoch": 3.7515767470357155, + "grad_norm": 0.23127782344818115, + "learning_rate": 7.753734243565935e-06, + "loss": 0.3737, "step": 104095 }, { - "epoch": 3.66, - "learning_rate": 8.825826922541827e-06, - "loss": 0.2656, + "epoch": 3.7517569466969403, + "grad_norm": 0.2870927155017853, + "learning_rate": 7.751621773153014e-06, + "loss": 0.415, "step": 104100 }, { - "epoch": 3.66, - "learning_rate": 8.823654826087014e-06, - "loss": 0.2321, + "epoch": 3.751937146358165, + "grad_norm": 0.2417393922805786, + "learning_rate": 7.749509537745906e-06, + "loss": 0.3357, "step": 104105 }, { - "epoch": 3.66, - "learning_rate": 8.82148293967138e-06, - "loss": 0.251, + "epoch": 3.7521173460193893, + "grad_norm": 0.226087749004364, + "learning_rate": 7.74739753737339e-06, + "loss": 0.3768, "step": 104110 }, { - "epoch": 3.66, - "learning_rate": 8.819311263323121e-06, - "loss": 0.27, + "epoch": 3.752297545680614, + "grad_norm": 0.25405430793762207, + "learning_rate": 7.745285772064237e-06, + "loss": 0.3824, "step": 104115 }, { - "epoch": 3.66, - "learning_rate": 8.817139797070447e-06, - "loss": 0.2412, + "epoch": 3.7524777453418388, + "grad_norm": 0.24911725521087646, + "learning_rate": 7.743174241847237e-06, + "loss": 0.3721, "step": 104120 }, { - "epoch": 3.66, - "learning_rate": 8.814968540941551e-06, - "loss": 0.2398, + "epoch": 3.7526579450030635, + "grad_norm": 0.2514585852622986, + "learning_rate": 7.741062946751135e-06, + "loss": 0.4184, "step": 104125 }, { - "epoch": 3.66, - "learning_rate": 8.812797494964622e-06, - "loss": 0.2572, + "epoch": 3.7528381446642882, + "grad_norm": 0.2331729680299759, + "learning_rate": 7.738951886804713e-06, + "loss": 0.4288, "step": 104130 }, { - "epoch": 3.66, - "learning_rate": 8.810626659167842e-06, - "loss": 0.2635, + "epoch": 3.7530183443255125, + "grad_norm": 0.2459155172109604, + "learning_rate": 7.736841062036731e-06, + "loss": 0.3804, "step": 104135 }, { - "epoch": 3.66, - "learning_rate": 8.808456033579405e-06, - "loss": 0.2611, + "epoch": 3.7531985439867372, + "grad_norm": 0.2611176073551178, + "learning_rate": 7.734730472475948e-06, + "loss": 0.3821, "step": 104140 }, { - "epoch": 3.66, - "learning_rate": 8.806285618227503e-06, - "loss": 0.2458, + "epoch": 3.753378743647962, + "grad_norm": 0.29315176606178284, + "learning_rate": 7.73262011815112e-06, + "loss": 0.3522, "step": 104145 }, { - "epoch": 3.66, - "learning_rate": 8.80411541314031e-06, - "loss": 0.2607, + "epoch": 3.7535589433091867, + "grad_norm": 0.20626281201839447, + "learning_rate": 7.730509999090999e-06, + "loss": 0.3707, "step": 104150 }, { - "epoch": 3.66, - "learning_rate": 8.801945418345997e-06, - "loss": 0.2331, + "epoch": 3.753739142970411, + "grad_norm": 0.19073475897312164, + "learning_rate": 7.72840011532433e-06, + "loss": 0.3713, "step": 104155 }, { - "epoch": 3.66, - "learning_rate": 8.799775633872756e-06, - "loss": 0.2686, + "epoch": 3.7539193426316357, + "grad_norm": 0.2597355246543884, + "learning_rate": 7.726290466879872e-06, + "loss": 0.3736, "step": 104160 }, { - "epoch": 3.66, - "learning_rate": 8.797606059748754e-06, - "loss": 0.2482, + "epoch": 3.7540995422928605, + "grad_norm": 0.19545309245586395, + "learning_rate": 7.724181053786361e-06, + "loss": 0.3695, "step": 104165 }, { - "epoch": 3.66, - "learning_rate": 8.795436696002157e-06, - "loss": 0.2513, + "epoch": 3.754279741954085, + "grad_norm": 0.2396879643201828, + "learning_rate": 7.722071876072538e-06, + "loss": 0.4125, "step": 104170 }, { - "epoch": 3.67, - "learning_rate": 8.79326754266113e-06, - "loss": 
0.2615, + "epoch": 3.75445994161531, + "grad_norm": 0.22774870693683624, + "learning_rate": 7.71996293376714e-06, + "loss": 0.3802, "step": 104175 }, { - "epoch": 3.67, - "learning_rate": 8.791098599753848e-06, - "loss": 0.2588, + "epoch": 3.7546401412765347, + "grad_norm": 0.20402881503105164, + "learning_rate": 7.717854226898897e-06, + "loss": 0.34, "step": 104180 }, { - "epoch": 3.67, - "learning_rate": 8.78892986730847e-06, - "loss": 0.2491, + "epoch": 3.754820340937759, + "grad_norm": 0.22189156711101532, + "learning_rate": 7.71574575549655e-06, + "loss": 0.3903, "step": 104185 }, { - "epoch": 3.67, - "learning_rate": 8.786761345353148e-06, - "loss": 0.2622, + "epoch": 3.7550005405989837, + "grad_norm": 0.19797289371490479, + "learning_rate": 7.713637519588829e-06, + "loss": 0.3864, "step": 104190 }, { - "epoch": 3.67, - "learning_rate": 8.784593033916041e-06, - "loss": 0.2478, + "epoch": 3.7551807402602084, + "grad_norm": 0.24208073318004608, + "learning_rate": 7.711529519204435e-06, + "loss": 0.3533, "step": 104195 }, { - "epoch": 3.67, - "learning_rate": 8.782424933025317e-06, - "loss": 0.2468, + "epoch": 3.7553609399214327, + "grad_norm": 0.24616724252700806, + "learning_rate": 7.709421754372112e-06, + "loss": 0.3659, "step": 104200 }, { - "epoch": 3.67, - "learning_rate": 8.78025704270912e-06, - "loss": 0.2527, + "epoch": 3.7555411395826575, + "grad_norm": 0.2978845238685608, + "learning_rate": 7.70731422512056e-06, + "loss": 0.4112, "step": 104205 }, { - "epoch": 3.67, - "learning_rate": 8.778089362995585e-06, - "loss": 0.2665, + "epoch": 3.755721339243882, + "grad_norm": 0.22275815904140472, + "learning_rate": 7.705206931478512e-06, + "loss": 0.3825, "step": 104210 }, { - "epoch": 3.67, - "learning_rate": 8.775921893912878e-06, - "loss": 0.2505, + "epoch": 3.755901538905107, + "grad_norm": 0.3553224205970764, + "learning_rate": 7.703099873474681e-06, + "loss": 0.3811, "step": 104215 }, { - "epoch": 3.67, - "learning_rate": 8.773754635489135e-06, - "loss": 0.2476, + "epoch": 3.7560817385663317, + "grad_norm": 0.2287319004535675, + "learning_rate": 7.700993051137751e-06, + "loss": 0.3758, "step": 104220 }, { - "epoch": 3.67, - "learning_rate": 8.771587587752491e-06, - "loss": 0.2766, + "epoch": 3.7562619382275564, + "grad_norm": 0.22735942900180817, + "learning_rate": 7.698886464496446e-06, + "loss": 0.4105, "step": 104225 }, { - "epoch": 3.67, - "learning_rate": 8.76942075073108e-06, - "loss": 0.2894, + "epoch": 3.7564421378887807, + "grad_norm": 0.18838316202163696, + "learning_rate": 7.696780113579464e-06, + "loss": 0.3754, "step": 104230 }, { - "epoch": 3.67, - "learning_rate": 8.767254124453052e-06, - "loss": 0.2504, + "epoch": 3.7566223375500054, + "grad_norm": 0.2099224328994751, + "learning_rate": 7.694673998415503e-06, + "loss": 0.3815, "step": 104235 }, { - "epoch": 3.67, - "learning_rate": 8.765087708946534e-06, - "loss": 0.2704, + "epoch": 3.75680253721123, + "grad_norm": 0.3171490728855133, + "learning_rate": 7.692568119033258e-06, + "loss": 0.3913, "step": 104240 }, { - "epoch": 3.67, - "learning_rate": 8.762921504239643e-06, - "loss": 0.2435, + "epoch": 3.7569827368724544, + "grad_norm": 0.25509047508239746, + "learning_rate": 7.690462475461416e-06, + "loss": 0.4001, "step": 104245 }, { - "epoch": 3.67, - "learning_rate": 8.760755510360525e-06, - "loss": 0.2439, + "epoch": 3.757162936533679, + "grad_norm": 0.2789323627948761, + "learning_rate": 7.688357067728676e-06, + "loss": 0.3647, "step": 104250 }, { - "epoch": 3.67, - "learning_rate": 8.758589727337286e-06, - 
"loss": 0.2392, + "epoch": 3.757343136194904, + "grad_norm": 0.21449807286262512, + "learning_rate": 7.686251895863721e-06, + "loss": 0.3643, "step": 104255 }, { - "epoch": 3.67, - "learning_rate": 8.756424155198064e-06, - "loss": 0.2783, + "epoch": 3.7575233358561286, + "grad_norm": 0.25625357031822205, + "learning_rate": 7.684146959895233e-06, + "loss": 0.3751, "step": 104260 }, { - "epoch": 3.67, - "learning_rate": 8.754258793970968e-06, - "loss": 0.2382, + "epoch": 3.7577035355173534, + "grad_norm": 0.25232619047164917, + "learning_rate": 7.68204225985189e-06, + "loss": 0.415, "step": 104265 }, { - "epoch": 3.67, - "learning_rate": 8.752093643684109e-06, - "loss": 0.258, + "epoch": 3.757883735178578, + "grad_norm": 0.24988026916980743, + "learning_rate": 7.67993779576236e-06, + "loss": 0.3681, "step": 104270 }, { - "epoch": 3.67, - "learning_rate": 8.749928704365611e-06, - "loss": 0.2632, + "epoch": 3.7580639348398024, + "grad_norm": 0.21227630972862244, + "learning_rate": 7.677833567655331e-06, + "loss": 0.3763, "step": 104275 }, { - "epoch": 3.67, - "learning_rate": 8.747763976043585e-06, - "loss": 0.2571, + "epoch": 3.758244134501027, + "grad_norm": 0.2227819859981537, + "learning_rate": 7.675729575559468e-06, + "loss": 0.3833, "step": 104280 }, { - "epoch": 3.67, - "learning_rate": 8.74559945874613e-06, - "loss": 0.2561, + "epoch": 3.758424334162252, + "grad_norm": 0.3054330348968506, + "learning_rate": 7.673625819503433e-06, + "loss": 0.3833, "step": 104285 }, { - "epoch": 3.67, - "learning_rate": 8.743435152501344e-06, - "loss": 0.2723, + "epoch": 3.758604533823476, + "grad_norm": 0.2734309136867523, + "learning_rate": 7.671522299515893e-06, + "loss": 0.3916, "step": 104290 }, { - "epoch": 3.67, - "learning_rate": 8.741271057337352e-06, - "loss": 0.2665, + "epoch": 3.758784733484701, + "grad_norm": 0.24472157657146454, + "learning_rate": 7.669419015625507e-06, + "loss": 0.3786, "step": 104295 }, { - "epoch": 3.67, - "learning_rate": 8.73910717328223e-06, - "loss": 0.2651, + "epoch": 3.7589649331459256, + "grad_norm": 0.21005383133888245, + "learning_rate": 7.667315967860925e-06, + "loss": 0.38, "step": 104300 }, { - "epoch": 3.67, - "learning_rate": 8.736943500364092e-06, - "loss": 0.2573, + "epoch": 3.7591451328071503, + "grad_norm": 0.18134789168834686, + "learning_rate": 7.665213156250819e-06, + "loss": 0.3707, "step": 104305 }, { - "epoch": 3.67, - "learning_rate": 8.734780038611018e-06, - "loss": 0.2369, + "epoch": 3.759325332468375, + "grad_norm": 0.26828300952911377, + "learning_rate": 7.663110580823816e-06, + "loss": 0.3727, "step": 104310 }, { - "epoch": 3.67, - "learning_rate": 8.732616788051113e-06, - "loss": 0.2747, + "epoch": 3.7595055321296, + "grad_norm": 0.28588294982910156, + "learning_rate": 7.661008241608581e-06, + "loss": 0.3755, "step": 104315 }, { - "epoch": 3.67, - "learning_rate": 8.730453748712458e-06, - "loss": 0.2685, + "epoch": 3.759685731790824, + "grad_norm": 0.27312424778938293, + "learning_rate": 7.65890613863375e-06, + "loss": 0.4112, "step": 104320 }, { - "epoch": 3.67, - "learning_rate": 8.728290920623133e-06, - "loss": 0.2375, + "epoch": 3.759865931452049, + "grad_norm": 0.2866198420524597, + "learning_rate": 7.65680427192797e-06, + "loss": 0.4028, "step": 104325 }, { - "epoch": 3.67, - "learning_rate": 8.726128303811234e-06, - "loss": 0.2635, + "epoch": 3.7600461311132736, + "grad_norm": 0.2471124827861786, + "learning_rate": 7.654702641519871e-06, + "loss": 0.3696, "step": 104330 }, { - "epoch": 3.67, - "learning_rate": 8.723965898304833e-06, 
- "loss": 0.2421, + "epoch": 3.7602263307744983, + "grad_norm": 0.24191972613334656, + "learning_rate": 7.65260124743809e-06, + "loss": 0.3817, "step": 104335 }, { - "epoch": 3.67, - "learning_rate": 8.721803704132011e-06, - "loss": 0.2507, + "epoch": 3.7604065304357226, + "grad_norm": 0.22872976958751678, + "learning_rate": 7.650500089711252e-06, + "loss": 0.4256, "step": 104340 }, { - "epoch": 3.67, - "learning_rate": 8.719641721320831e-06, - "loss": 0.2881, + "epoch": 3.7605867300969473, + "grad_norm": 0.3062474727630615, + "learning_rate": 7.648399168367998e-06, + "loss": 0.3657, "step": 104345 }, { - "epoch": 3.67, - "learning_rate": 8.717479949899381e-06, - "loss": 0.2453, + "epoch": 3.760766929758172, + "grad_norm": 0.2952815592288971, + "learning_rate": 7.646298483436946e-06, + "loss": 0.3974, "step": 104350 }, { - "epoch": 3.67, - "learning_rate": 8.715318389895715e-06, - "loss": 0.2716, + "epoch": 3.760947129419397, + "grad_norm": 0.30336183309555054, + "learning_rate": 7.644198034946718e-06, + "loss": 0.369, "step": 104355 }, { - "epoch": 3.67, - "learning_rate": 8.713157041337919e-06, - "loss": 0.2413, + "epoch": 3.7611273290806215, + "grad_norm": 0.24275535345077515, + "learning_rate": 7.642097822925932e-06, + "loss": 0.3679, "step": 104360 }, { - "epoch": 3.67, - "learning_rate": 8.710995904254036e-06, - "loss": 0.2537, + "epoch": 3.761307528741846, + "grad_norm": 0.20958095788955688, + "learning_rate": 7.639997847403194e-06, + "loss": 0.4052, "step": 104365 }, { - "epoch": 3.67, - "learning_rate": 8.708834978672142e-06, - "loss": 0.2658, + "epoch": 3.7614877284030706, + "grad_norm": 0.27302390336990356, + "learning_rate": 7.637898108407132e-06, + "loss": 0.3837, "step": 104370 }, { - "epoch": 3.67, - "learning_rate": 8.70667426462029e-06, - "loss": 0.2662, + "epoch": 3.7616679280642953, + "grad_norm": 0.3011491894721985, + "learning_rate": 7.635798605966346e-06, + "loss": 0.4018, "step": 104375 }, { - "epoch": 3.67, - "learning_rate": 8.704513762126537e-06, - "loss": 0.2634, + "epoch": 3.76184812772552, + "grad_norm": 0.2879173159599304, + "learning_rate": 7.633699340109443e-06, + "loss": 0.4226, "step": 104380 }, { - "epoch": 3.67, - "learning_rate": 8.702353471218924e-06, - "loss": 0.2611, + "epoch": 3.7620283273867443, + "grad_norm": 0.2275601178407669, + "learning_rate": 7.631600310865025e-06, + "loss": 0.4083, "step": 104385 }, { - "epoch": 3.67, - "learning_rate": 8.70019339192552e-06, - "loss": 0.2609, + "epoch": 3.762208527047969, + "grad_norm": 0.208656445145607, + "learning_rate": 7.62950151826168e-06, + "loss": 0.3667, "step": 104390 }, { - "epoch": 3.67, - "learning_rate": 8.698033524274362e-06, - "loss": 0.2498, + "epoch": 3.7623887267091938, + "grad_norm": 0.2310076206922531, + "learning_rate": 7.627402962328026e-06, + "loss": 0.3664, "step": 104395 }, { - "epoch": 3.67, - "learning_rate": 8.695873868293495e-06, - "loss": 0.2725, + "epoch": 3.7625689263704185, + "grad_norm": 0.2891443073749542, + "learning_rate": 7.625304643092648e-06, + "loss": 0.3927, "step": 104400 }, { - "epoch": 3.67, - "learning_rate": 8.693714424010954e-06, - "loss": 0.2618, + "epoch": 3.7627491260316432, + "grad_norm": 0.254215806722641, + "learning_rate": 7.623206560584115e-06, + "loss": 0.3493, "step": 104405 }, { - "epoch": 3.67, - "learning_rate": 8.691555191454793e-06, - "loss": 0.2633, + "epoch": 3.7629293256928675, + "grad_norm": 0.19437402486801147, + "learning_rate": 7.621108714831038e-06, + "loss": 0.3739, "step": 104410 }, { - "epoch": 3.67, - "learning_rate": 
8.68939617065303e-06, - "loss": 0.2531, + "epoch": 3.7631095253540923, + "grad_norm": 0.23699922859668732, + "learning_rate": 7.619011105861987e-06, + "loss": 0.4086, "step": 104415 }, { - "epoch": 3.67, - "learning_rate": 8.687237361633719e-06, - "loss": 0.2509, + "epoch": 3.763289725015317, + "grad_norm": 0.21388274431228638, + "learning_rate": 7.616913733705547e-06, + "loss": 0.3858, "step": 104420 }, { - "epoch": 3.67, - "learning_rate": 8.68507876442487e-06, - "loss": 0.2635, + "epoch": 3.7634699246765417, + "grad_norm": 0.22161832451820374, + "learning_rate": 7.61481659839029e-06, + "loss": 0.3757, "step": 104425 }, { - "epoch": 3.67, - "learning_rate": 8.682920379054533e-06, - "loss": 0.2755, + "epoch": 3.763650124337766, + "grad_norm": 0.2681003212928772, + "learning_rate": 7.612719699944784e-06, + "loss": 0.4036, "step": 104430 }, { - "epoch": 3.67, - "learning_rate": 8.680762205550718e-06, - "loss": 0.2635, + "epoch": 3.7638303239989908, + "grad_norm": 0.2696530818939209, + "learning_rate": 7.610623038397613e-06, + "loss": 0.408, "step": 104435 }, { - "epoch": 3.67, - "learning_rate": 8.678604243941446e-06, - "loss": 0.2531, + "epoch": 3.7640105236602155, + "grad_norm": 0.25873225927352905, + "learning_rate": 7.608526613777339e-06, + "loss": 0.4039, "step": 104440 }, { - "epoch": 3.67, - "learning_rate": 8.676446494254748e-06, - "loss": 0.2754, + "epoch": 3.7641907233214402, + "grad_norm": 0.1770642250776291, + "learning_rate": 7.6064304261125205e-06, + "loss": 0.3661, "step": 104445 }, { - "epoch": 3.67, - "learning_rate": 8.674288956518633e-06, - "loss": 0.2745, + "epoch": 3.764370922982665, + "grad_norm": 0.25411689281463623, + "learning_rate": 7.604334475431721e-06, + "loss": 0.3954, "step": 104450 }, { - "epoch": 3.68, - "learning_rate": 8.672131630761119e-06, - "loss": 0.2658, + "epoch": 3.7645511226438897, + "grad_norm": 0.19758695363998413, + "learning_rate": 7.602238761763486e-06, + "loss": 0.3648, "step": 104455 }, { - "epoch": 3.68, - "learning_rate": 8.669974517010207e-06, - "loss": 0.2687, + "epoch": 3.764731322305114, + "grad_norm": 0.24316218495368958, + "learning_rate": 7.600143285136391e-06, + "loss": 0.4164, "step": 104460 }, { - "epoch": 3.68, - "learning_rate": 8.667817615293922e-06, - "loss": 0.2609, + "epoch": 3.7649115219663387, + "grad_norm": 0.23017525672912598, + "learning_rate": 7.598048045578973e-06, + "loss": 0.3761, "step": 104465 }, { - "epoch": 3.68, - "learning_rate": 8.665660925640254e-06, - "loss": 0.246, + "epoch": 3.7650917216275634, + "grad_norm": 0.2317330241203308, + "learning_rate": 7.595953043119783e-06, + "loss": 0.3907, "step": 104470 }, { - "epoch": 3.68, - "learning_rate": 8.663504448077222e-06, - "loss": 0.2521, + "epoch": 3.7652719212887877, + "grad_norm": 0.21798720955848694, + "learning_rate": 7.593858277787361e-06, + "loss": 0.3797, "step": 104475 }, { - "epoch": 3.68, - "learning_rate": 8.661348182632812e-06, - "loss": 0.2602, + "epoch": 3.7654521209500125, + "grad_norm": 0.18874719738960266, + "learning_rate": 7.59176374961025e-06, + "loss": 0.3733, "step": 104480 }, { - "epoch": 3.68, - "learning_rate": 8.659192129335034e-06, - "loss": 0.2542, + "epoch": 3.765632320611237, + "grad_norm": 0.24172750115394592, + "learning_rate": 7.58966945861698e-06, + "loss": 0.3748, "step": 104485 }, { - "epoch": 3.68, - "learning_rate": 8.657036288211878e-06, - "loss": 0.2705, + "epoch": 3.765812520272462, + "grad_norm": 0.32291868329048157, + "learning_rate": 7.5875754048361114e-06, + "loss": 0.4205, "step": 104490 }, { - "epoch": 3.68, - 
"learning_rate": 8.654880659291335e-06, - "loss": 0.2608, + "epoch": 3.7659927199336867, + "grad_norm": 0.23808036744594574, + "learning_rate": 7.5854815882961364e-06, + "loss": 0.3663, "step": 104495 }, { - "epoch": 3.68, - "learning_rate": 8.652725242601386e-06, - "loss": 0.256, + "epoch": 3.7661729195949114, + "grad_norm": 0.2094922959804535, + "learning_rate": 7.583388009025616e-06, + "loss": 0.3745, "step": 104500 }, { - "epoch": 3.68, - "eval_loss": 0.2519655227661133, - "eval_runtime": 10.5307, - "eval_samples_per_second": 9.496, - "eval_steps_per_second": 9.496, + "epoch": 3.7661729195949114, + "eval_loss": 0.4286915063858032, + "eval_runtime": 3.5312, + "eval_samples_per_second": 28.319, + "eval_steps_per_second": 7.08, "step": 104500 }, { - "epoch": 3.68, - "learning_rate": 8.650570038170037e-06, - "loss": 0.2585, + "epoch": 3.7663531192561357, + "grad_norm": 0.25394493341445923, + "learning_rate": 7.581294667053057e-06, + "loss": 0.3741, "step": 104505 }, { - "epoch": 3.68, - "learning_rate": 8.64841504602526e-06, - "loss": 0.2414, + "epoch": 3.7665333189173604, + "grad_norm": 0.22518283128738403, + "learning_rate": 7.579201562406982e-06, + "loss": 0.372, "step": 104510 }, { - "epoch": 3.68, - "learning_rate": 8.646260266195036e-06, - "loss": 0.2293, + "epoch": 3.766713518578585, + "grad_norm": 0.24221821129322052, + "learning_rate": 7.577108695115928e-06, + "loss": 0.4004, "step": 104515 }, { - "epoch": 3.68, - "learning_rate": 8.644105698707339e-06, - "loss": 0.2631, + "epoch": 3.7668937182398095, + "grad_norm": 0.25630852580070496, + "learning_rate": 7.575016065208385e-06, + "loss": 0.3834, "step": 104520 }, { - "epoch": 3.68, - "learning_rate": 8.641951343590149e-06, - "loss": 0.2891, + "epoch": 3.767073917901034, + "grad_norm": 0.21267221868038177, + "learning_rate": 7.5729236727128674e-06, + "loss": 0.3436, "step": 104525 }, { - "epoch": 3.68, - "learning_rate": 8.63979720087145e-06, - "loss": 0.2713, + "epoch": 3.767254117562259, + "grad_norm": 0.24416276812553406, + "learning_rate": 7.5708315176579e-06, + "loss": 0.4102, "step": 104530 }, { - "epoch": 3.68, - "learning_rate": 8.637643270579204e-06, - "loss": 0.2612, + "epoch": 3.7674343172234837, + "grad_norm": 0.2707764506340027, + "learning_rate": 7.568739600071978e-06, + "loss": 0.3707, "step": 104535 }, { - "epoch": 3.68, - "learning_rate": 8.635489552741366e-06, - "loss": 0.241, + "epoch": 3.7676145168847084, + "grad_norm": 0.2370613068342209, + "learning_rate": 7.566647919983602e-06, + "loss": 0.3701, "step": 104540 }, { - "epoch": 3.68, - "learning_rate": 8.633336047385921e-06, - "loss": 0.2513, + "epoch": 3.767794716545933, + "grad_norm": 0.2817910611629486, + "learning_rate": 7.564556477421275e-06, + "loss": 0.3784, "step": 104545 }, { - "epoch": 3.68, - "learning_rate": 8.63118275454082e-06, - "loss": 0.2451, + "epoch": 3.7679749162071574, + "grad_norm": 0.27115681767463684, + "learning_rate": 7.562465272413483e-06, + "loss": 0.4333, "step": 104550 }, { - "epoch": 3.68, - "learning_rate": 8.629029674234015e-06, - "loss": 0.2679, + "epoch": 3.768155115868382, + "grad_norm": 0.23443850874900818, + "learning_rate": 7.560374304988732e-06, + "loss": 0.4106, "step": 104555 }, { - "epoch": 3.68, - "learning_rate": 8.62687680649348e-06, - "loss": 0.2696, + "epoch": 3.768335315529607, + "grad_norm": 0.2954671382904053, + "learning_rate": 7.5582835751755064e-06, + "loss": 0.3693, "step": 104560 }, { - "epoch": 3.68, - "learning_rate": 8.62472415134716e-06, - "loss": 0.2559, + "epoch": 3.768515515190831, + "grad_norm": 
0.2391144335269928, + "learning_rate": 7.556193083002291e-06, + "loss": 0.3376, "step": 104565 }, { - "epoch": 3.68, - "learning_rate": 8.622571708823007e-06, - "loss": 0.2507, + "epoch": 3.768695714852056, + "grad_norm": 0.29020795226097107, + "learning_rate": 7.554102828497564e-06, + "loss": 0.4022, "step": 104570 }, { - "epoch": 3.68, - "learning_rate": 8.620419478948957e-06, - "loss": 0.2527, + "epoch": 3.7688759145132806, + "grad_norm": 0.24601967632770538, + "learning_rate": 7.552012811689804e-06, + "loss": 0.3811, "step": 104575 }, { - "epoch": 3.68, - "learning_rate": 8.618267461752965e-06, - "loss": 0.2299, + "epoch": 3.7690561141745054, + "grad_norm": 0.23416286706924438, + "learning_rate": 7.549923032607498e-06, + "loss": 0.3815, "step": 104580 }, { - "epoch": 3.68, - "learning_rate": 8.616115657262982e-06, - "loss": 0.2722, + "epoch": 3.76923631383573, + "grad_norm": 0.2573948800563812, + "learning_rate": 7.547833491279119e-06, + "loss": 0.3997, "step": 104585 }, { - "epoch": 3.68, - "learning_rate": 8.61396406550694e-06, - "loss": 0.2582, + "epoch": 3.769416513496955, + "grad_norm": 0.23901304602622986, + "learning_rate": 7.5457441877331145e-06, + "loss": 0.3577, "step": 104590 }, { - "epoch": 3.68, - "learning_rate": 8.611812686512769e-06, - "loss": 0.273, + "epoch": 3.769596713158179, + "grad_norm": 0.1927175372838974, + "learning_rate": 7.543655121997975e-06, + "loss": 0.3581, "step": 104595 }, { - "epoch": 3.68, - "learning_rate": 8.609661520308417e-06, - "loss": 0.2449, + "epoch": 3.769776912819404, + "grad_norm": 0.30516329407691956, + "learning_rate": 7.541566294102154e-06, + "loss": 0.3853, "step": 104600 }, { - "epoch": 3.68, - "learning_rate": 8.607510566921809e-06, - "loss": 0.2516, + "epoch": 3.7699571124806286, + "grad_norm": 0.22798460721969604, + "learning_rate": 7.539895403048933e-06, + "loss": 0.3772, "step": 104605 }, { - "epoch": 3.68, - "learning_rate": 8.605359826380873e-06, - "loss": 0.2548, + "epoch": 3.7701373121418533, + "grad_norm": 0.24452564120292664, + "learning_rate": 7.537807003335604e-06, + "loss": 0.3669, "step": 104610 }, { - "epoch": 3.68, - "learning_rate": 8.603209298713527e-06, - "loss": 0.2399, + "epoch": 3.7703175118030776, + "grad_norm": 0.22427910566329956, + "learning_rate": 7.5357188415412724e-06, + "loss": 0.3694, "step": 104615 }, { - "epoch": 3.68, - "learning_rate": 8.60105898394771e-06, - "loss": 0.232, + "epoch": 3.7704977114643023, + "grad_norm": 0.2747547924518585, + "learning_rate": 7.5336309176943845e-06, + "loss": 0.3733, "step": 104620 }, { - "epoch": 3.68, - "learning_rate": 8.598908882111336e-06, - "loss": 0.2542, + "epoch": 3.770677911125527, + "grad_norm": 0.22400154173374176, + "learning_rate": 7.531543231823399e-06, + "loss": 0.3961, "step": 104625 }, { - "epoch": 3.68, - "learning_rate": 8.596758993232312e-06, - "loss": 0.257, + "epoch": 3.770858110786752, + "grad_norm": 0.2119477391242981, + "learning_rate": 7.529455783956757e-06, + "loss": 0.3566, "step": 104630 }, { - "epoch": 3.68, - "learning_rate": 8.594609317338565e-06, - "loss": 0.2478, + "epoch": 3.7710383104479765, + "grad_norm": 0.22162562608718872, + "learning_rate": 7.5273685741228976e-06, + "loss": 0.3925, "step": 104635 }, { - "epoch": 3.68, - "learning_rate": 8.59245985445801e-06, - "loss": 0.255, + "epoch": 3.771218510109201, + "grad_norm": 0.20573687553405762, + "learning_rate": 7.525281602350259e-06, + "loss": 0.3899, "step": 104640 }, { - "epoch": 3.68, - "learning_rate": 8.59031060461855e-06, - "loss": 0.2627, + "epoch": 3.7713987097704256, + 
"grad_norm": 0.26889345049858093, + "learning_rate": 7.523194868667266e-06, + "loss": 0.407, "step": 104645 }, { - "epoch": 3.68, - "learning_rate": 8.588161567848095e-06, - "loss": 0.2289, + "epoch": 3.7715789094316503, + "grad_norm": 0.23474839329719543, + "learning_rate": 7.521108373102367e-06, + "loss": 0.3577, "step": 104650 }, { - "epoch": 3.68, - "learning_rate": 8.586012744174538e-06, - "loss": 0.255, + "epoch": 3.771759109092875, + "grad_norm": 0.2511729896068573, + "learning_rate": 7.519022115683994e-06, + "loss": 0.3994, "step": 104655 }, { - "epoch": 3.68, - "learning_rate": 8.583864133625794e-06, - "loss": 0.2671, + "epoch": 3.7719393087540993, + "grad_norm": 0.24130114912986755, + "learning_rate": 7.5169360964405415e-06, + "loss": 0.3788, "step": 104660 }, { - "epoch": 3.68, - "learning_rate": 8.581715736229756e-06, - "loss": 0.2716, + "epoch": 3.772119508415324, + "grad_norm": 0.2123662829399109, + "learning_rate": 7.514850315400457e-06, + "loss": 0.3786, "step": 104665 }, { - "epoch": 3.68, - "learning_rate": 8.579567552014312e-06, - "loss": 0.2696, + "epoch": 3.772299708076549, + "grad_norm": 0.21604010462760925, + "learning_rate": 7.512764772592151e-06, + "loss": 0.3807, "step": 104670 }, { - "epoch": 3.68, - "learning_rate": 8.57741958100737e-06, - "loss": 0.2524, + "epoch": 3.7724799077377735, + "grad_norm": 0.26000019907951355, + "learning_rate": 7.510679468044035e-06, + "loss": 0.3552, "step": 104675 }, { - "epoch": 3.68, - "learning_rate": 8.575271823236813e-06, - "loss": 0.265, + "epoch": 3.7726601073989983, + "grad_norm": 0.2929229140281677, + "learning_rate": 7.508594401784538e-06, + "loss": 0.3689, "step": 104680 }, { - "epoch": 3.68, - "learning_rate": 8.573124278730516e-06, - "loss": 0.2618, + "epoch": 3.772840307060223, + "grad_norm": 0.26968440413475037, + "learning_rate": 7.506509573842041e-06, + "loss": 0.3501, "step": 104685 }, { - "epoch": 3.68, - "learning_rate": 8.570976947516385e-06, - "loss": 0.2743, + "epoch": 3.7730205067214473, + "grad_norm": 0.2252168208360672, + "learning_rate": 7.5044249842449735e-06, + "loss": 0.3709, "step": 104690 }, { - "epoch": 3.68, - "learning_rate": 8.568829829622283e-06, - "loss": 0.2527, + "epoch": 3.773200706382672, + "grad_norm": 0.20150010287761688, + "learning_rate": 7.502340633021726e-06, + "loss": 0.3637, "step": 104695 }, { - "epoch": 3.68, - "learning_rate": 8.566682925076106e-06, - "loss": 0.2584, + "epoch": 3.7733809060438968, + "grad_norm": 0.2643103003501892, + "learning_rate": 7.500256520200702e-06, + "loss": 0.4388, "step": 104700 }, { - "epoch": 3.68, - "learning_rate": 8.564536233905718e-06, - "loss": 0.2596, + "epoch": 3.773561105705121, + "grad_norm": 0.2742420732975006, + "learning_rate": 7.498172645810292e-06, + "loss": 0.3994, "step": 104705 }, { - "epoch": 3.68, - "learning_rate": 8.562389756138988e-06, - "loss": 0.276, + "epoch": 3.7737413053663458, + "grad_norm": 0.3029593229293823, + "learning_rate": 7.496089009878884e-06, + "loss": 0.3941, "step": 104710 }, { - "epoch": 3.68, - "learning_rate": 8.560243491803805e-06, - "loss": 0.2643, + "epoch": 3.7739215050275705, + "grad_norm": 0.23510831594467163, + "learning_rate": 7.494005612434885e-06, + "loss": 0.3873, "step": 104715 }, { - "epoch": 3.68, - "learning_rate": 8.558097440928026e-06, - "loss": 0.2524, + "epoch": 3.7741017046887952, + "grad_norm": 0.2507026195526123, + "learning_rate": 7.49192245350667e-06, + "loss": 0.3756, "step": 104720 }, { - "epoch": 3.68, - "learning_rate": 8.555951603539513e-06, - "loss": 0.2529, + "epoch": 
3.77428190435002, + "grad_norm": 0.2193816602230072, + "learning_rate": 7.48983953312262e-06, + "loss": 0.3564, "step": 104725 }, { - "epoch": 3.68, - "learning_rate": 8.553805979666126e-06, - "loss": 0.2391, + "epoch": 3.7744621040112447, + "grad_norm": 0.2253447026014328, + "learning_rate": 7.487756851311114e-06, + "loss": 0.3607, "step": 104730 }, { - "epoch": 3.68, - "learning_rate": 8.551660569335737e-06, - "loss": 0.2702, + "epoch": 3.774642303672469, + "grad_norm": 0.27351540327072144, + "learning_rate": 7.485674408100535e-06, + "loss": 0.3961, "step": 104735 }, { - "epoch": 3.69, - "learning_rate": 8.54951537257619e-06, - "loss": 0.2369, + "epoch": 3.7748225033336937, + "grad_norm": 0.20876996219158173, + "learning_rate": 7.483592203519241e-06, + "loss": 0.3839, "step": 104740 }, { - "epoch": 3.69, - "learning_rate": 8.547370389415349e-06, - "loss": 0.2496, + "epoch": 3.7750027029949185, + "grad_norm": 0.23797959089279175, + "learning_rate": 7.481510237595621e-06, + "loss": 0.3904, "step": 104745 }, { - "epoch": 3.69, - "learning_rate": 8.545225619881053e-06, - "loss": 0.2909, + "epoch": 3.7751829026561428, + "grad_norm": 0.21280419826507568, + "learning_rate": 7.47942851035803e-06, + "loss": 0.3748, "step": 104750 }, { - "epoch": 3.69, - "learning_rate": 8.543081064001168e-06, - "loss": 0.2438, + "epoch": 3.7753631023173675, + "grad_norm": 0.22197531163692474, + "learning_rate": 7.477347021834838e-06, + "loss": 0.3514, "step": 104755 }, { - "epoch": 3.69, - "learning_rate": 8.54093672180353e-06, - "loss": 0.2567, + "epoch": 3.7755433019785922, + "grad_norm": 0.2479541003704071, + "learning_rate": 7.475265772054396e-06, + "loss": 0.3469, "step": 104760 }, { - "epoch": 3.69, - "learning_rate": 8.538792593315981e-06, - "loss": 0.2717, + "epoch": 3.775723501639817, + "grad_norm": 0.2767854630947113, + "learning_rate": 7.4731847610450604e-06, + "loss": 0.3672, "step": 104765 }, { - "epoch": 3.69, - "learning_rate": 8.536648678566353e-06, - "loss": 0.2731, + "epoch": 3.7759037013010417, + "grad_norm": 0.21771368384361267, + "learning_rate": 7.471103988835202e-06, + "loss": 0.3716, "step": 104770 }, { - "epoch": 3.69, - "learning_rate": 8.534504977582502e-06, - "loss": 0.2562, + "epoch": 3.7760839009622664, + "grad_norm": 0.25155210494995117, + "learning_rate": 7.46902345545315e-06, + "loss": 0.3624, "step": 104775 }, { - "epoch": 3.69, - "learning_rate": 8.532361490392252e-06, - "loss": 0.2409, + "epoch": 3.7762641006234907, + "grad_norm": 0.27159664034843445, + "learning_rate": 7.466943160927253e-06, + "loss": 0.3326, "step": 104780 }, { - "epoch": 3.69, - "learning_rate": 8.530218217023428e-06, - "loss": 0.2501, + "epoch": 3.7764443002847154, + "grad_norm": 0.22621986269950867, + "learning_rate": 7.464863105285868e-06, + "loss": 0.3833, "step": 104785 }, { - "epoch": 3.69, - "learning_rate": 8.528075157503873e-06, - "loss": 0.254, + "epoch": 3.77662449994594, + "grad_norm": 0.21097923815250397, + "learning_rate": 7.462783288557329e-06, + "loss": 0.378, "step": 104790 }, { - "epoch": 3.69, - "learning_rate": 8.525932311861399e-06, - "loss": 0.2703, + "epoch": 3.7768046996071645, + "grad_norm": 0.21815836429595947, + "learning_rate": 7.460703710769973e-06, + "loss": 0.364, "step": 104795 }, { - "epoch": 3.69, - "learning_rate": 8.523789680123844e-06, - "loss": 0.2657, + "epoch": 3.776984899268389, + "grad_norm": 0.22789250314235687, + "learning_rate": 7.458624371952133e-06, + "loss": 0.3781, "step": 104800 }, { - "epoch": 3.69, - "learning_rate": 8.521647262319016e-06, - "loss": 0.2476, 
+ "epoch": 3.777165098929614, + "grad_norm": 0.28179875016212463, + "learning_rate": 7.456545272132132e-06, + "loss": 0.417, "step": 104805 }, { - "epoch": 3.69, - "learning_rate": 8.519505058474747e-06, - "loss": 0.254, + "epoch": 3.7773452985908387, + "grad_norm": 0.21157479286193848, + "learning_rate": 7.45446641133831e-06, + "loss": 0.3719, "step": 104810 }, { - "epoch": 3.69, - "learning_rate": 8.517363068618842e-06, - "loss": 0.2672, + "epoch": 3.7775254982520634, + "grad_norm": 0.24585333466529846, + "learning_rate": 7.452387789598988e-06, + "loss": 0.399, "step": 104815 }, { - "epoch": 3.69, - "learning_rate": 8.515221292779115e-06, - "loss": 0.2628, + "epoch": 3.777705697913288, + "grad_norm": 0.2581721842288971, + "learning_rate": 7.450309406942488e-06, + "loss": 0.4285, "step": 104820 }, { - "epoch": 3.69, - "learning_rate": 8.51307973098337e-06, - "loss": 0.2794, + "epoch": 3.7778858975745124, + "grad_norm": 0.2738182842731476, + "learning_rate": 7.448231263397121e-06, + "loss": 0.4033, "step": 104825 }, { - "epoch": 3.69, - "learning_rate": 8.510938383259426e-06, - "loss": 0.267, + "epoch": 3.778066097235737, + "grad_norm": 0.22549976408481598, + "learning_rate": 7.446153358991198e-06, + "loss": 0.4013, "step": 104830 }, { - "epoch": 3.69, - "learning_rate": 8.508797249635078e-06, - "loss": 0.2309, + "epoch": 3.778246296896962, + "grad_norm": 0.20595544576644897, + "learning_rate": 7.444075693753044e-06, + "loss": 0.3896, "step": 104835 }, { - "epoch": 3.69, - "learning_rate": 8.506656330138132e-06, - "loss": 0.2529, + "epoch": 3.7784264965581866, + "grad_norm": 0.24291178584098816, + "learning_rate": 7.441998267710962e-06, + "loss": 0.3715, "step": 104840 }, { - "epoch": 3.69, - "learning_rate": 8.504515624796378e-06, - "loss": 0.2605, + "epoch": 3.778606696219411, + "grad_norm": 0.248873770236969, + "learning_rate": 7.439921080893253e-06, + "loss": 0.3732, "step": 104845 }, { - "epoch": 3.69, - "learning_rate": 8.502375133637621e-06, - "loss": 0.257, + "epoch": 3.7787868958806357, + "grad_norm": 0.28607362508773804, + "learning_rate": 7.43784413332822e-06, + "loss": 0.3769, "step": 104850 }, { - "epoch": 3.69, - "learning_rate": 8.500234856689646e-06, - "loss": 0.2644, + "epoch": 3.7789670955418604, + "grad_norm": 0.254102498292923, + "learning_rate": 7.43576742504416e-06, + "loss": 0.3671, "step": 104855 }, { - "epoch": 3.69, - "learning_rate": 8.498094793980254e-06, - "loss": 0.247, + "epoch": 3.779147295203085, + "grad_norm": 0.21698035299777985, + "learning_rate": 7.433690956069361e-06, + "loss": 0.3563, "step": 104860 }, { - "epoch": 3.69, - "learning_rate": 8.495954945537218e-06, - "loss": 0.2457, + "epoch": 3.77932749486431, + "grad_norm": 0.25203150510787964, + "learning_rate": 7.431614726432137e-06, + "loss": 0.3384, "step": 104865 }, { - "epoch": 3.69, - "learning_rate": 8.49381531138834e-06, - "loss": 0.2419, + "epoch": 3.779507694525534, + "grad_norm": 0.24114732444286346, + "learning_rate": 7.429538736160746e-06, + "loss": 0.4012, "step": 104870 }, { - "epoch": 3.69, - "learning_rate": 8.491675891561393e-06, - "loss": 0.2537, + "epoch": 3.779687894186759, + "grad_norm": 0.26797887682914734, + "learning_rate": 7.4274629852834955e-06, + "loss": 0.4196, "step": 104875 }, { - "epoch": 3.69, - "learning_rate": 8.489536686084152e-06, - "loss": 0.2623, + "epoch": 3.7798680938479836, + "grad_norm": 0.22066861391067505, + "learning_rate": 7.425387473828657e-06, + "loss": 0.3429, "step": 104880 }, { - "epoch": 3.69, - "learning_rate": 8.487397694984388e-06, - "loss": 
0.2529, + "epoch": 3.7800482935092083, + "grad_norm": 0.23529654741287231, + "learning_rate": 7.423312201824514e-06, + "loss": 0.3881, "step": 104885 }, { - "epoch": 3.69, - "learning_rate": 8.485258918289895e-06, - "loss": 0.2585, + "epoch": 3.7802284931704326, + "grad_norm": 0.2249959409236908, + "learning_rate": 7.421237169299341e-06, + "loss": 0.3611, "step": 104890 }, { - "epoch": 3.69, - "learning_rate": 8.483120356028429e-06, - "loss": 0.2664, + "epoch": 3.7804086928316574, + "grad_norm": 0.24566762149333954, + "learning_rate": 7.419162376281397e-06, + "loss": 0.4263, "step": 104895 }, { - "epoch": 3.69, - "learning_rate": 8.48098200822775e-06, - "loss": 0.2778, + "epoch": 3.780588892492882, + "grad_norm": 0.18927571177482605, + "learning_rate": 7.417087822798971e-06, + "loss": 0.3778, "step": 104900 }, { - "epoch": 3.69, - "learning_rate": 8.478843874915646e-06, - "loss": 0.2379, + "epoch": 3.780769092154107, + "grad_norm": 0.2276502400636673, + "learning_rate": 7.415013508880319e-06, + "loss": 0.3857, "step": 104905 }, { - "epoch": 3.69, - "learning_rate": 8.476705956119854e-06, - "loss": 0.2932, + "epoch": 3.7809492918153316, + "grad_norm": 0.2894652485847473, + "learning_rate": 7.412939434553707e-06, + "loss": 0.3817, "step": 104910 }, { - "epoch": 3.69, - "learning_rate": 8.474568251868157e-06, - "loss": 0.2642, + "epoch": 3.781129491476556, + "grad_norm": 0.2344924807548523, + "learning_rate": 7.410865599847386e-06, + "loss": 0.3711, "step": 104915 }, { - "epoch": 3.69, - "learning_rate": 8.472430762188289e-06, - "loss": 0.2508, + "epoch": 3.7813096911377806, + "grad_norm": 0.2306702584028244, + "learning_rate": 7.408792004789616e-06, + "loss": 0.3803, "step": 104920 }, { - "epoch": 3.69, - "learning_rate": 8.470293487108024e-06, - "loss": 0.2744, + "epoch": 3.7814898907990053, + "grad_norm": 0.25984832644462585, + "learning_rate": 7.4067186494086425e-06, + "loss": 0.4154, "step": 104925 }, { - "epoch": 3.69, - "learning_rate": 8.468156426655108e-06, - "loss": 0.275, + "epoch": 3.78167009046023, + "grad_norm": 0.23578700423240662, + "learning_rate": 7.404645533732729e-06, + "loss": 0.3523, "step": 104930 }, { - "epoch": 3.69, - "learning_rate": 8.466019580857284e-06, - "loss": 0.2619, + "epoch": 3.7818502901214543, + "grad_norm": 0.2648398280143738, + "learning_rate": 7.4025726577901135e-06, + "loss": 0.3804, "step": 104935 }, { - "epoch": 3.69, - "learning_rate": 8.463882949742292e-06, - "loss": 0.2579, + "epoch": 3.782030489782679, + "grad_norm": 0.22717751562595367, + "learning_rate": 7.400500021609038e-06, + "loss": 0.376, "step": 104940 }, { - "epoch": 3.69, - "learning_rate": 8.461746533337888e-06, - "loss": 0.2501, + "epoch": 3.782210689443904, + "grad_norm": 0.2581784129142761, + "learning_rate": 7.398427625217743e-06, + "loss": 0.3915, "step": 104945 }, { - "epoch": 3.69, - "learning_rate": 8.459610331671807e-06, - "loss": 0.2773, + "epoch": 3.7823908891051286, + "grad_norm": 0.263120174407959, + "learning_rate": 7.3963554686444556e-06, + "loss": 0.4054, "step": 104950 }, { - "epoch": 3.69, - "learning_rate": 8.457474344771787e-06, - "loss": 0.2684, + "epoch": 3.7825710887663533, + "grad_norm": 0.24363566935062408, + "learning_rate": 7.394283551917433e-06, + "loss": 0.3712, "step": 104955 }, { - "epoch": 3.69, - "learning_rate": 8.455338572665553e-06, - "loss": 0.256, + "epoch": 3.782751288427578, + "grad_norm": 0.2304638773202896, + "learning_rate": 7.39221187506488e-06, + "loss": 0.3958, "step": 104960 }, { - "epoch": 3.69, - "learning_rate": 8.453203015380842e-06, 
- "loss": 0.2457, + "epoch": 3.7829314880888023, + "grad_norm": 0.3181205093860626, + "learning_rate": 7.3901404381150255e-06, + "loss": 0.405, "step": 104965 }, { - "epoch": 3.69, - "learning_rate": 8.451067672945395e-06, - "loss": 0.2383, + "epoch": 3.783111687750027, + "grad_norm": 0.29112708568573, + "learning_rate": 7.3880692410961045e-06, + "loss": 0.38, "step": 104970 }, { - "epoch": 3.69, - "learning_rate": 8.448932545386928e-06, - "loss": 0.2567, + "epoch": 3.7832918874112518, + "grad_norm": 0.25521332025527954, + "learning_rate": 7.385998284036322e-06, + "loss": 0.4297, "step": 104975 }, { - "epoch": 3.69, - "learning_rate": 8.446797632733156e-06, - "loss": 0.2452, + "epoch": 3.783472087072476, + "grad_norm": 0.2132871150970459, + "learning_rate": 7.383927566963919e-06, + "loss": 0.3451, "step": 104980 }, { - "epoch": 3.69, - "learning_rate": 8.444662935011815e-06, - "loss": 0.2533, + "epoch": 3.783652286733701, + "grad_norm": 0.25532466173171997, + "learning_rate": 7.381857089907082e-06, + "loss": 0.4008, "step": 104985 }, { - "epoch": 3.69, - "learning_rate": 8.442528452250614e-06, - "loss": 0.2415, + "epoch": 3.7838324863949255, + "grad_norm": 0.28475886583328247, + "learning_rate": 7.379786852894027e-06, + "loss": 0.3948, "step": 104990 }, { - "epoch": 3.69, - "learning_rate": 8.440394184477274e-06, - "loss": 0.2855, + "epoch": 3.7840126860561503, + "grad_norm": 0.24241603910923004, + "learning_rate": 7.377716855952971e-06, + "loss": 0.3718, "step": 104995 }, { - "epoch": 3.69, - "learning_rate": 8.438260131719489e-06, - "loss": 0.2472, + "epoch": 3.784192885717375, + "grad_norm": 0.22656336426734924, + "learning_rate": 7.37564709911211e-06, + "loss": 0.3624, "step": 105000 }, { - "epoch": 3.69, - "eval_loss": 0.25183290243148804, - "eval_runtime": 10.5414, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 3.784192885717375, + "eval_loss": 0.4286513924598694, + "eval_runtime": 3.5365, + "eval_samples_per_second": 28.277, + "eval_steps_per_second": 7.069, "step": 105000 }, { - "epoch": 3.69, - "learning_rate": 8.436126294004992e-06, - "loss": 0.2619, + "epoch": 3.7843730853785997, + "grad_norm": 0.2644498944282532, + "learning_rate": 7.3735775823996465e-06, + "loss": 0.387, "step": 105005 }, { - "epoch": 3.69, - "learning_rate": 8.43399267136148e-06, - "loss": 0.2468, + "epoch": 3.784553285039824, + "grad_norm": 0.2483258843421936, + "learning_rate": 7.371508305843775e-06, + "loss": 0.3835, "step": 105010 }, { - "epoch": 3.69, - "learning_rate": 8.431859263816647e-06, - "loss": 0.2627, + "epoch": 3.7847334847010488, + "grad_norm": 0.22739523649215698, + "learning_rate": 7.3694392694726835e-06, + "loss": 0.3869, "step": 105015 }, { - "epoch": 3.69, - "learning_rate": 8.429726071398202e-06, - "loss": 0.2629, + "epoch": 3.7849136843622735, + "grad_norm": 0.306366503238678, + "learning_rate": 7.367370473314575e-06, + "loss": 0.3759, "step": 105020 }, { - "epoch": 3.7, - "learning_rate": 8.42759309413385e-06, - "loss": 0.2444, + "epoch": 3.785093884023498, + "grad_norm": 0.21845044195652008, + "learning_rate": 7.365301917397629e-06, + "loss": 0.3692, "step": 105025 }, { - "epoch": 3.7, - "learning_rate": 8.425460332051286e-06, - "loss": 0.2447, + "epoch": 3.7852740836847225, + "grad_norm": 0.2691507041454315, + "learning_rate": 7.363233601750033e-06, + "loss": 0.3929, "step": 105030 }, { - "epoch": 3.7, - "learning_rate": 8.423327785178183e-06, - "loss": 0.2685, + "epoch": 3.7854542833459472, + "grad_norm": 0.23008035123348236, + "learning_rate": 
7.361165526399963e-06, + "loss": 0.3841, "step": 105035 }, { - "epoch": 3.7, - "learning_rate": 8.421195453542255e-06, - "loss": 0.2732, + "epoch": 3.785634483007172, + "grad_norm": 0.21267521381378174, + "learning_rate": 7.359097691375596e-06, + "loss": 0.3342, "step": 105040 }, { - "epoch": 3.7, - "learning_rate": 8.419063337171176e-06, - "loss": 0.2691, + "epoch": 3.7858146826683967, + "grad_norm": 0.25716471672058105, + "learning_rate": 7.357030096705103e-06, + "loss": 0.4005, "step": 105045 }, { - "epoch": 3.7, - "learning_rate": 8.416931436092635e-06, - "loss": 0.2742, + "epoch": 3.7859948823296214, + "grad_norm": 0.25578397512435913, + "learning_rate": 7.354962742416674e-06, + "loss": 0.3783, "step": 105050 }, { - "epoch": 3.7, - "learning_rate": 8.4147997503343e-06, - "loss": 0.2566, + "epoch": 3.7861750819908457, + "grad_norm": 0.25349554419517517, + "learning_rate": 7.352895628538445e-06, + "loss": 0.3775, "step": 105055 }, { - "epoch": 3.7, - "learning_rate": 8.412668279923868e-06, - "loss": 0.248, + "epoch": 3.7863552816520705, + "grad_norm": 0.22552327811717987, + "learning_rate": 7.350828755098604e-06, + "loss": 0.3725, "step": 105060 }, { - "epoch": 3.7, - "learning_rate": 8.41053702488901e-06, - "loss": 0.2583, + "epoch": 3.786535481313295, + "grad_norm": 0.20090581476688385, + "learning_rate": 7.348762122125305e-06, + "loss": 0.3946, "step": 105065 }, { - "epoch": 3.7, - "learning_rate": 8.408405985257387e-06, - "loss": 0.2467, + "epoch": 3.7867156809745195, + "grad_norm": 0.24854734539985657, + "learning_rate": 7.346695729646705e-06, + "loss": 0.402, "step": 105070 }, { - "epoch": 3.7, - "learning_rate": 8.406275161056675e-06, - "loss": 0.2544, + "epoch": 3.7868958806357442, + "grad_norm": 0.2817821502685547, + "learning_rate": 7.344629577690956e-06, + "loss": 0.3533, "step": 105075 }, { - "epoch": 3.7, - "learning_rate": 8.404144552314555e-06, - "loss": 0.2703, + "epoch": 3.787076080296969, + "grad_norm": 0.24129050970077515, + "learning_rate": 7.3425636662862115e-06, + "loss": 0.3783, "step": 105080 }, { - "epoch": 3.7, - "learning_rate": 8.402014159058683e-06, - "loss": 0.2647, + "epoch": 3.7872562799581937, + "grad_norm": 0.2622377574443817, + "learning_rate": 7.340497995460613e-06, + "loss": 0.4121, "step": 105085 }, { - "epoch": 3.7, - "learning_rate": 8.399883981316713e-06, - "loss": 0.2664, + "epoch": 3.7874364796194184, + "grad_norm": 0.22572429478168488, + "learning_rate": 7.338432565242314e-06, + "loss": 0.3947, "step": 105090 }, { - "epoch": 3.7, - "learning_rate": 8.397754019116307e-06, - "loss": 0.2453, + "epoch": 3.787616679280643, + "grad_norm": 0.23703832924365997, + "learning_rate": 7.3363673756594555e-06, + "loss": 0.3583, "step": 105095 }, { - "epoch": 3.7, - "learning_rate": 8.395624272485128e-06, - "loss": 0.2837, + "epoch": 3.7877968789418675, + "grad_norm": 0.24465423822402954, + "learning_rate": 7.3343024267401725e-06, + "loss": 0.4008, "step": 105100 }, { - "epoch": 3.7, - "learning_rate": 8.393494741450828e-06, - "loss": 0.2511, + "epoch": 3.787977078603092, + "grad_norm": 0.22674879431724548, + "learning_rate": 7.332237718512594e-06, + "loss": 0.3801, "step": 105105 }, { - "epoch": 3.7, - "learning_rate": 8.391365426041054e-06, - "loss": 0.2686, + "epoch": 3.788157278264317, + "grad_norm": 0.24080802500247955, + "learning_rate": 7.330173251004851e-06, + "loss": 0.3732, "step": 105110 }, { - "epoch": 3.7, - "learning_rate": 8.389236326283445e-06, - "loss": 0.2467, + "epoch": 3.7883374779255417, + "grad_norm": 0.28059399127960205, + 
"learning_rate": 7.328109024245086e-06, + "loss": 0.4045, "step": 105115 }, { - "epoch": 3.7, - "learning_rate": 8.387107442205663e-06, - "loss": 0.2503, + "epoch": 3.788517677586766, + "grad_norm": 0.19681565463542938, + "learning_rate": 7.326045038261411e-06, + "loss": 0.3517, "step": 105120 }, { - "epoch": 3.7, - "learning_rate": 8.384978773835336e-06, - "loss": 0.247, + "epoch": 3.7886978772479907, + "grad_norm": 0.2852540612220764, + "learning_rate": 7.3239812930819524e-06, + "loss": 0.4153, "step": 105125 }, { - "epoch": 3.7, - "learning_rate": 8.38285032120012e-06, - "loss": 0.2505, + "epoch": 3.7888780769092154, + "grad_norm": 0.28406253457069397, + "learning_rate": 7.321917788734825e-06, + "loss": 0.3899, "step": 105130 }, { - "epoch": 3.7, - "learning_rate": 8.38072208432763e-06, - "loss": 0.2562, + "epoch": 3.78905827657044, + "grad_norm": 0.2611198127269745, + "learning_rate": 7.31985452524814e-06, + "loss": 0.3949, "step": 105135 }, { - "epoch": 3.7, - "learning_rate": 8.37859406324552e-06, - "loss": 0.2503, + "epoch": 3.789238476231665, + "grad_norm": 0.24562934041023254, + "learning_rate": 7.31779150265002e-06, + "loss": 0.3793, "step": 105140 }, { - "epoch": 3.7, - "learning_rate": 8.376466257981413e-06, - "loss": 0.259, + "epoch": 3.789418675892889, + "grad_norm": 0.27355340123176575, + "learning_rate": 7.315728720968576e-06, + "loss": 0.3941, "step": 105145 }, { - "epoch": 3.7, - "learning_rate": 8.374338668562928e-06, - "loss": 0.2572, + "epoch": 3.789598875554114, + "grad_norm": 0.20609021186828613, + "learning_rate": 7.313666180231888e-06, + "loss": 0.3649, "step": 105150 }, { - "epoch": 3.7, - "learning_rate": 8.372211295017707e-06, - "loss": 0.262, + "epoch": 3.7897790752153386, + "grad_norm": 0.21863946318626404, + "learning_rate": 7.311603880468082e-06, + "loss": 0.4042, "step": 105155 }, { - "epoch": 3.7, - "learning_rate": 8.370084137373363e-06, - "loss": 0.2496, + "epoch": 3.7899592748765634, + "grad_norm": 0.22044524550437927, + "learning_rate": 7.3095418217052405e-06, + "loss": 0.3829, "step": 105160 }, { - "epoch": 3.7, - "learning_rate": 8.367957195657517e-06, - "loss": 0.2508, + "epoch": 3.7901394745377877, + "grad_norm": 0.23821088671684265, + "learning_rate": 7.3074800039714844e-06, + "loss": 0.3746, "step": 105165 }, { - "epoch": 3.7, - "learning_rate": 8.365830469897778e-06, - "loss": 0.2497, + "epoch": 3.7903196741990124, + "grad_norm": 0.21885710954666138, + "learning_rate": 7.305418427294877e-06, + "loss": 0.342, "step": 105170 }, { - "epoch": 3.7, - "learning_rate": 8.363703960121777e-06, - "loss": 0.2672, + "epoch": 3.790499873860237, + "grad_norm": 0.26055842638015747, + "learning_rate": 7.303357091703511e-06, + "loss": 0.3668, "step": 105175 }, { - "epoch": 3.7, - "learning_rate": 8.36157766635711e-06, - "loss": 0.2408, + "epoch": 3.790680073521462, + "grad_norm": 0.21390856802463531, + "learning_rate": 7.301295997225488e-06, + "loss": 0.3783, "step": 105180 }, { - "epoch": 3.7, - "learning_rate": 8.359451588631397e-06, - "loss": 0.2604, + "epoch": 3.7908602731826866, + "grad_norm": 0.20063291490077972, + "learning_rate": 7.299235143888878e-06, + "loss": 0.373, "step": 105185 }, { - "epoch": 3.7, - "learning_rate": 8.357325726972231e-06, - "loss": 0.2554, + "epoch": 3.7910404728439113, + "grad_norm": 0.19434435665607452, + "learning_rate": 7.297174531721762e-06, + "loss": 0.4074, "step": 105190 }, { - "epoch": 3.7, - "learning_rate": 8.35520008140723e-06, - "loss": 0.2394, + "epoch": 3.7912206725051356, + "grad_norm": 0.2075873613357544, + 
"learning_rate": 7.295114160752217e-06, + "loss": 0.3656, "step": 105195 }, { - "epoch": 3.7, - "learning_rate": 8.353074651963987e-06, - "loss": 0.239, + "epoch": 3.7914008721663603, + "grad_norm": 0.2969796657562256, + "learning_rate": 7.293054031008306e-06, + "loss": 0.4043, "step": 105200 }, { - "epoch": 3.7, - "learning_rate": 8.3509494386701e-06, - "loss": 0.2537, + "epoch": 3.791581071827585, + "grad_norm": 0.3087260127067566, + "learning_rate": 7.290994142518115e-06, + "loss": 0.3599, "step": 105205 }, { - "epoch": 3.7, - "learning_rate": 8.348824441553155e-06, - "loss": 0.2547, + "epoch": 3.7917612714888094, + "grad_norm": 0.2378859966993332, + "learning_rate": 7.288934495309699e-06, + "loss": 0.3884, "step": 105210 }, { - "epoch": 3.7, - "learning_rate": 8.346699660640759e-06, - "loss": 0.2668, + "epoch": 3.791941471150034, + "grad_norm": 0.292811781167984, + "learning_rate": 7.286875089411119e-06, + "loss": 0.3768, "step": 105215 }, { - "epoch": 3.7, - "learning_rate": 8.344575095960491e-06, - "loss": 0.2744, + "epoch": 3.792121670811259, + "grad_norm": 0.2711928188800812, + "learning_rate": 7.284815924850441e-06, + "loss": 0.3985, "step": 105220 }, { - "epoch": 3.7, - "learning_rate": 8.34245074753994e-06, - "loss": 0.2562, + "epoch": 3.7923018704724836, + "grad_norm": 0.23554451763629913, + "learning_rate": 7.282757001655713e-06, + "loss": 0.3652, "step": 105225 }, { - "epoch": 3.7, - "learning_rate": 8.34032661540668e-06, - "loss": 0.2451, + "epoch": 3.7924820701337083, + "grad_norm": 0.24152545630931854, + "learning_rate": 7.280698319854984e-06, + "loss": 0.4081, "step": 105230 }, { - "epoch": 3.7, - "learning_rate": 8.338202699588299e-06, - "loss": 0.2601, + "epoch": 3.792662269794933, + "grad_norm": 0.30207541584968567, + "learning_rate": 7.2786398794763235e-06, + "loss": 0.4096, "step": 105235 }, { - "epoch": 3.7, - "learning_rate": 8.336079000112385e-06, - "loss": 0.244, + "epoch": 3.7928424694561573, + "grad_norm": 0.24256369471549988, + "learning_rate": 7.27658168054775e-06, + "loss": 0.3791, "step": 105240 }, { - "epoch": 3.7, - "learning_rate": 8.333955517006496e-06, - "loss": 0.266, + "epoch": 3.793022669117382, + "grad_norm": 0.221241295337677, + "learning_rate": 7.274523723097329e-06, + "loss": 0.3753, "step": 105245 }, { - "epoch": 3.7, - "learning_rate": 8.331832250298215e-06, - "loss": 0.2506, + "epoch": 3.793202868778607, + "grad_norm": 0.2517562508583069, + "learning_rate": 7.272466007153086e-06, + "loss": 0.4043, "step": 105250 }, { - "epoch": 3.7, - "learning_rate": 8.329709200015111e-06, - "loss": 0.269, + "epoch": 3.793383068439831, + "grad_norm": 0.2802235782146454, + "learning_rate": 7.270408532743059e-06, + "loss": 0.3501, "step": 105255 }, { - "epoch": 3.7, - "learning_rate": 8.327586366184747e-06, - "loss": 0.2455, + "epoch": 3.793563268101056, + "grad_norm": 0.28124722838401794, + "learning_rate": 7.268351299895295e-06, + "loss": 0.3683, "step": 105260 }, { - "epoch": 3.7, - "learning_rate": 8.325463748834677e-06, - "loss": 0.2603, + "epoch": 3.7937434677622806, + "grad_norm": 0.21457895636558533, + "learning_rate": 7.266294308637805e-06, + "loss": 0.3829, "step": 105265 }, { - "epoch": 3.7, - "learning_rate": 8.323341347992481e-06, - "loss": 0.2528, + "epoch": 3.7939236674235053, + "grad_norm": 0.2182459533214569, + "learning_rate": 7.264237558998615e-06, + "loss": 0.3997, "step": 105270 }, { - "epoch": 3.7, - "learning_rate": 8.321219163685706e-06, - "loss": 0.2537, + "epoch": 3.79410386708473, + "grad_norm": 0.22549273073673248, + 
"learning_rate": 7.262181051005762e-06, + "loss": 0.3851, "step": 105275 }, { - "epoch": 3.7, - "learning_rate": 8.31909719594191e-06, - "loss": 0.2582, + "epoch": 3.7942840667459548, + "grad_norm": 0.23273131251335144, + "learning_rate": 7.260124784687256e-06, + "loss": 0.4087, "step": 105280 }, { - "epoch": 3.7, - "learning_rate": 8.316975444788632e-06, - "loss": 0.254, + "epoch": 3.794464266407179, + "grad_norm": 0.22543184459209442, + "learning_rate": 7.258068760071115e-06, + "loss": 0.3824, "step": 105285 }, { - "epoch": 3.7, - "learning_rate": 8.314853910253445e-06, - "loss": 0.2834, + "epoch": 3.7946444660684038, + "grad_norm": 0.39207062125205994, + "learning_rate": 7.256012977185356e-06, + "loss": 0.4229, "step": 105290 }, { - "epoch": 3.7, - "learning_rate": 8.312732592363876e-06, - "loss": 0.2707, + "epoch": 3.7948246657296285, + "grad_norm": 0.24338999390602112, + "learning_rate": 7.253957436057973e-06, + "loss": 0.3539, "step": 105295 }, { - "epoch": 3.7, - "learning_rate": 8.310611491147482e-06, - "loss": 0.2676, + "epoch": 3.795004865390853, + "grad_norm": 0.20618541538715363, + "learning_rate": 7.251902136716996e-06, + "loss": 0.3645, "step": 105300 }, { - "epoch": 3.7, - "learning_rate": 8.308490606631794e-06, - "loss": 0.2672, + "epoch": 3.7951850650520775, + "grad_norm": 0.24109040200710297, + "learning_rate": 7.249847079190414e-06, + "loss": 0.3805, "step": 105305 }, { - "epoch": 3.71, - "learning_rate": 8.306369938844364e-06, - "loss": 0.2639, + "epoch": 3.7953652647133023, + "grad_norm": 0.2463836818933487, + "learning_rate": 7.247792263506228e-06, + "loss": 0.3682, "step": 105310 }, { - "epoch": 3.71, - "learning_rate": 8.304249487812715e-06, - "loss": 0.2749, + "epoch": 3.795545464374527, + "grad_norm": 0.23828503489494324, + "learning_rate": 7.2457376896924365e-06, + "loss": 0.3785, "step": 105315 }, { - "epoch": 3.71, - "learning_rate": 8.302129253564385e-06, - "loss": 0.2716, + "epoch": 3.7957256640357517, + "grad_norm": 0.2362687587738037, + "learning_rate": 7.243683357777023e-06, + "loss": 0.3743, "step": 105320 }, { - "epoch": 3.71, - "learning_rate": 8.300009236126897e-06, - "loss": 0.2357, + "epoch": 3.7959058636969765, + "grad_norm": 0.2598309814929962, + "learning_rate": 7.2416292677879946e-06, + "loss": 0.3675, "step": 105325 }, { - "epoch": 3.71, - "learning_rate": 8.29788943552779e-06, - "loss": 0.2526, + "epoch": 3.7960860633582008, + "grad_norm": 0.2267717868089676, + "learning_rate": 7.239575419753339e-06, + "loss": 0.3687, "step": 105330 }, { - "epoch": 3.71, - "learning_rate": 8.29576985179458e-06, - "loss": 0.2675, + "epoch": 3.7962662630194255, + "grad_norm": 0.20179912447929382, + "learning_rate": 7.237521813701012e-06, + "loss": 0.3649, "step": 105335 }, { - "epoch": 3.71, - "learning_rate": 8.293650484954782e-06, - "loss": 0.2506, + "epoch": 3.7964464626806502, + "grad_norm": 0.29818323254585266, + "learning_rate": 7.235468449659019e-06, + "loss": 0.3729, "step": 105340 }, { - "epoch": 3.71, - "learning_rate": 8.291531335035932e-06, - "loss": 0.2853, + "epoch": 3.796626662341875, + "grad_norm": 0.26517441868782043, + "learning_rate": 7.233415327655321e-06, + "loss": 0.3762, "step": 105345 }, { - "epoch": 3.71, - "learning_rate": 8.289412402065527e-06, - "loss": 0.2501, + "epoch": 3.7968068620030992, + "grad_norm": 0.22521330416202545, + "learning_rate": 7.231362447717915e-06, + "loss": 0.3459, "step": 105350 }, { - "epoch": 3.71, - "learning_rate": 8.287293686071099e-06, - "loss": 0.2818, + "epoch": 3.796987061664324, + "grad_norm": 
0.20847706496715546, + "learning_rate": 7.229309809874749e-06, + "loss": 0.3975, "step": 105355 }, { - "epoch": 3.71, - "learning_rate": 8.28517518708014e-06, - "loss": 0.2614, + "epoch": 3.7971672613255487, + "grad_norm": 0.22983822226524353, + "learning_rate": 7.22725741415379e-06, + "loss": 0.3864, "step": 105360 }, { - "epoch": 3.71, - "learning_rate": 8.283056905120173e-06, - "loss": 0.2488, + "epoch": 3.7973474609867734, + "grad_norm": 0.24204209446907043, + "learning_rate": 7.225205260583013e-06, + "loss": 0.3853, "step": 105365 }, { - "epoch": 3.71, - "learning_rate": 8.280938840218697e-06, - "loss": 0.2743, + "epoch": 3.797527660647998, + "grad_norm": 0.3181317448616028, + "learning_rate": 7.223153349190373e-06, + "loss": 0.388, "step": 105370 }, { - "epoch": 3.71, - "learning_rate": 8.278820992403207e-06, - "loss": 0.2463, + "epoch": 3.7977078603092225, + "grad_norm": 0.22117455303668976, + "learning_rate": 7.221101680003828e-06, + "loss": 0.4012, "step": 105375 }, { - "epoch": 3.71, - "learning_rate": 8.276703361701205e-06, - "loss": 0.2606, + "epoch": 3.797888059970447, + "grad_norm": 0.1853034645318985, + "learning_rate": 7.219050253051329e-06, + "loss": 0.3584, "step": 105380 }, { - "epoch": 3.71, - "learning_rate": 8.274585948140193e-06, - "loss": 0.262, + "epoch": 3.798068259631672, + "grad_norm": 0.20408424735069275, + "learning_rate": 7.216999068360822e-06, + "loss": 0.3564, "step": 105385 }, { - "epoch": 3.71, - "learning_rate": 8.272468751747661e-06, - "loss": 0.2712, + "epoch": 3.7982484592928967, + "grad_norm": 0.2169559895992279, + "learning_rate": 7.214948125960266e-06, + "loss": 0.3362, "step": 105390 }, { - "epoch": 3.71, - "learning_rate": 8.270351772551096e-06, - "loss": 0.2589, + "epoch": 3.798428658954121, + "grad_norm": 0.2969491481781006, + "learning_rate": 7.212897425877599e-06, + "loss": 0.3837, "step": 105395 }, { - "epoch": 3.71, - "learning_rate": 8.268235010577985e-06, - "loss": 0.2597, + "epoch": 3.7986088586153457, + "grad_norm": 0.2599042057991028, + "learning_rate": 7.2108469681407605e-06, + "loss": 0.3692, "step": 105400 }, { - "epoch": 3.71, - "learning_rate": 8.266118465855813e-06, - "loss": 0.2744, + "epoch": 3.7987890582765704, + "grad_norm": 0.19389642775058746, + "learning_rate": 7.208796752777691e-06, + "loss": 0.356, "step": 105405 }, { - "epoch": 3.71, - "learning_rate": 8.264002138412071e-06, - "loss": 0.2636, + "epoch": 3.798969257937795, + "grad_norm": 0.22755640745162964, + "learning_rate": 7.206746779816317e-06, + "loss": 0.4012, "step": 105410 }, { - "epoch": 3.71, - "learning_rate": 8.261886028274232e-06, - "loss": 0.2698, + "epoch": 3.79914945759902, + "grad_norm": 0.25583744049072266, + "learning_rate": 7.204697049284567e-06, + "loss": 0.3835, "step": 105415 }, { - "epoch": 3.71, - "learning_rate": 8.259770135469765e-06, - "loss": 0.255, + "epoch": 3.799329657260244, + "grad_norm": 0.26095953583717346, + "learning_rate": 7.202647561210382e-06, + "loss": 0.373, "step": 105420 }, { - "epoch": 3.71, - "learning_rate": 8.257654460026162e-06, - "loss": 0.2355, + "epoch": 3.799509856921469, + "grad_norm": 0.26644977927207947, + "learning_rate": 7.200598315621679e-06, + "loss": 0.4386, "step": 105425 }, { - "epoch": 3.71, - "learning_rate": 8.255539001970877e-06, - "loss": 0.2424, + "epoch": 3.7996900565826937, + "grad_norm": 0.22700461745262146, + "learning_rate": 7.198549312546379e-06, + "loss": 0.4308, "step": 105430 }, { - "epoch": 3.71, - "learning_rate": 8.253423761331386e-06, - "loss": 0.2442, + "epoch": 3.7998702562439184, + 
"grad_norm": 0.20378275215625763, + "learning_rate": 7.196500552012397e-06, + "loss": 0.3812, "step": 105435 }, { - "epoch": 3.71, - "learning_rate": 8.251308738135144e-06, - "loss": 0.2664, + "epoch": 3.8000504559051427, + "grad_norm": 0.2918340861797333, + "learning_rate": 7.194452034047639e-06, + "loss": 0.3411, "step": 105440 }, { - "epoch": 3.71, - "learning_rate": 8.249193932409626e-06, - "loss": 0.262, + "epoch": 3.8002306555663674, + "grad_norm": 0.19135360419750214, + "learning_rate": 7.1924037586800415e-06, + "loss": 0.3652, "step": 105445 }, { - "epoch": 3.71, - "learning_rate": 8.247079344182284e-06, - "loss": 0.2698, + "epoch": 3.800410855227592, + "grad_norm": 0.24061061441898346, + "learning_rate": 7.190355725937487e-06, + "loss": 0.3725, "step": 105450 }, { - "epoch": 3.71, - "learning_rate": 8.244964973480573e-06, - "loss": 0.2772, + "epoch": 3.800591054888817, + "grad_norm": 0.27103692293167114, + "learning_rate": 7.1883079358478825e-06, + "loss": 0.3687, "step": 105455 }, { - "epoch": 3.71, - "learning_rate": 8.242850820331946e-06, - "loss": 0.2527, + "epoch": 3.8007712545500416, + "grad_norm": 0.2294936627149582, + "learning_rate": 7.186260388439137e-06, + "loss": 0.3556, "step": 105460 }, { - "epoch": 3.71, - "learning_rate": 8.240736884763866e-06, - "loss": 0.2283, + "epoch": 3.8009514542112663, + "grad_norm": 0.2741897702217102, + "learning_rate": 7.184213083739147e-06, + "loss": 0.3703, "step": 105465 }, { - "epoch": 3.71, - "learning_rate": 8.238623166803775e-06, - "loss": 0.251, + "epoch": 3.8011316538724906, + "grad_norm": 0.2626726031303406, + "learning_rate": 7.182166021775805e-06, + "loss": 0.379, "step": 105470 }, { - "epoch": 3.71, - "learning_rate": 8.236509666479106e-06, - "loss": 0.2775, + "epoch": 3.8013118535337154, + "grad_norm": 0.292074054479599, + "learning_rate": 7.1801192025770015e-06, + "loss": 0.3627, "step": 105475 }, { - "epoch": 3.71, - "learning_rate": 8.23439638381732e-06, - "loss": 0.2506, + "epoch": 3.80149205319494, + "grad_norm": 0.25104910135269165, + "learning_rate": 7.178072626170615e-06, + "loss": 0.3784, "step": 105480 }, { - "epoch": 3.71, - "learning_rate": 8.232283318845849e-06, - "loss": 0.2641, + "epoch": 3.8016722528561644, + "grad_norm": 0.24408739805221558, + "learning_rate": 7.176026292584548e-06, + "loss": 0.3792, "step": 105485 }, { - "epoch": 3.71, - "learning_rate": 8.230170471592127e-06, - "loss": 0.2681, + "epoch": 3.801852452517389, + "grad_norm": 0.238882914185524, + "learning_rate": 7.1739802018466695e-06, + "loss": 0.3824, "step": 105490 }, { - "epoch": 3.71, - "learning_rate": 8.228057842083584e-06, - "loss": 0.2476, + "epoch": 3.802032652178614, + "grad_norm": 0.22461140155792236, + "learning_rate": 7.171934353984863e-06, + "loss": 0.3616, "step": 105495 }, { - "epoch": 3.71, - "learning_rate": 8.225945430347662e-06, - "loss": 0.2553, + "epoch": 3.8022128518398386, + "grad_norm": 0.2754288911819458, + "learning_rate": 7.169888749026995e-06, + "loss": 0.3749, "step": 105500 }, { - "epoch": 3.71, - "eval_loss": 0.2516160011291504, - "eval_runtime": 10.5476, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 3.8022128518398386, + "eval_loss": 0.4286453127861023, + "eval_runtime": 3.5307, + "eval_samples_per_second": 28.323, + "eval_steps_per_second": 7.081, "step": 105500 }, { - "epoch": 3.71, - "learning_rate": 8.223833236411784e-06, - "loss": 0.2675, + "epoch": 3.8023930515010633, + "grad_norm": 0.1857483983039856, + "learning_rate": 7.167843387000936e-06, + "loss": 0.4028, "step": 
105505 }, { - "epoch": 3.71, - "learning_rate": 8.221721260303373e-06, - "loss": 0.2427, + "epoch": 3.802573251162288, + "grad_norm": 0.18854497373104095, + "learning_rate": 7.165798267934565e-06, + "loss": 0.3894, "step": 105510 }, { - "epoch": 3.71, - "learning_rate": 8.219609502049846e-06, - "loss": 0.242, + "epoch": 3.8027534508235123, + "grad_norm": 0.2537713050842285, + "learning_rate": 7.163753391855749e-06, + "loss": 0.3789, "step": 105515 }, { - "epoch": 3.71, - "learning_rate": 8.217497961678644e-06, - "loss": 0.2726, + "epoch": 3.802933650484737, + "grad_norm": 0.2627655565738678, + "learning_rate": 7.161708758792324e-06, + "loss": 0.3729, "step": 105520 }, { - "epoch": 3.71, - "learning_rate": 8.21538663921717e-06, - "loss": 0.2623, + "epoch": 3.803113850145962, + "grad_norm": 0.2551818788051605, + "learning_rate": 7.15966436877217e-06, + "loss": 0.3955, "step": 105525 }, { - "epoch": 3.71, - "learning_rate": 8.213275534692841e-06, - "loss": 0.2397, + "epoch": 3.803294049807186, + "grad_norm": 0.2461671382188797, + "learning_rate": 7.157620221823127e-06, + "loss": 0.3978, "step": 105530 }, { - "epoch": 3.71, - "learning_rate": 8.211164648133055e-06, - "loss": 0.2779, + "epoch": 3.803474249468411, + "grad_norm": 0.29957154393196106, + "learning_rate": 7.155576317973061e-06, + "loss": 0.4128, "step": 105535 }, { - "epoch": 3.71, - "learning_rate": 8.20905397956524e-06, - "loss": 0.2635, + "epoch": 3.8036544491296356, + "grad_norm": 0.2094818353652954, + "learning_rate": 7.153532657249823e-06, + "loss": 0.3306, "step": 105540 }, { - "epoch": 3.71, - "learning_rate": 8.206943529016794e-06, - "loss": 0.2533, + "epoch": 3.8038346487908603, + "grad_norm": 0.21276958286762238, + "learning_rate": 7.1514892396812335e-06, + "loss": 0.3973, "step": 105545 }, { - "epoch": 3.71, - "learning_rate": 8.204833296515122e-06, - "loss": 0.2796, + "epoch": 3.804014848452085, + "grad_norm": 0.2489434778690338, + "learning_rate": 7.149446065295151e-06, + "loss": 0.3623, "step": 105550 }, { - "epoch": 3.71, - "learning_rate": 8.20272328208761e-06, - "loss": 0.2774, + "epoch": 3.8041950481133098, + "grad_norm": 0.2391531616449356, + "learning_rate": 7.147403134119412e-06, + "loss": 0.3777, "step": 105555 }, { - "epoch": 3.71, - "learning_rate": 8.200613485761677e-06, - "loss": 0.2724, + "epoch": 3.804375247774534, + "grad_norm": 0.2393515408039093, + "learning_rate": 7.145360446181848e-06, + "loss": 0.3775, "step": 105560 }, { - "epoch": 3.71, - "learning_rate": 8.198503907564694e-06, - "loss": 0.2661, + "epoch": 3.804555447435759, + "grad_norm": 0.26830413937568665, + "learning_rate": 7.1433180015102936e-06, + "loss": 0.3694, "step": 105565 }, { - "epoch": 3.71, - "learning_rate": 8.196394547524078e-06, - "loss": 0.2391, + "epoch": 3.8047356470969835, + "grad_norm": 0.3024479150772095, + "learning_rate": 7.141275800132563e-06, + "loss": 0.3865, "step": 105570 }, { - "epoch": 3.71, - "learning_rate": 8.194285405667193e-06, - "loss": 0.2699, + "epoch": 3.804915846758208, + "grad_norm": 0.21776083111763, + "learning_rate": 7.1392338420765005e-06, + "loss": 0.3866, "step": 105575 }, { - "epoch": 3.71, - "learning_rate": 8.192176482021446e-06, - "loss": 0.2464, + "epoch": 3.8050960464194326, + "grad_norm": 0.24332380294799805, + "learning_rate": 7.137192127369921e-06, + "loss": 0.3977, "step": 105580 }, { - "epoch": 3.71, - "learning_rate": 8.190067776614214e-06, - "loss": 0.2504, + "epoch": 3.8052762460806573, + "grad_norm": 0.1978878378868103, + "learning_rate": 7.13515065604064e-06, + "loss": 0.3775, 
"step": 105585 }, { - "epoch": 3.71, - "learning_rate": 8.187959289472869e-06, - "loss": 0.256, + "epoch": 3.805456445741882, + "grad_norm": 0.22366763651371002, + "learning_rate": 7.1331094281164715e-06, + "loss": 0.3749, "step": 105590 }, { - "epoch": 3.72, - "learning_rate": 8.185851020624788e-06, - "loss": 0.2432, + "epoch": 3.8056366454031068, + "grad_norm": 0.20870821177959442, + "learning_rate": 7.13106844362523e-06, + "loss": 0.4025, "step": 105595 }, { - "epoch": 3.72, - "learning_rate": 8.183742970097357e-06, - "loss": 0.2638, + "epoch": 3.8058168450643315, + "grad_norm": 0.2318304181098938, + "learning_rate": 7.129027702594713e-06, + "loss": 0.4165, "step": 105600 }, { - "epoch": 3.72, - "learning_rate": 8.181635137917942e-06, - "loss": 0.2533, + "epoch": 3.8059970447255558, + "grad_norm": 0.22088027000427246, + "learning_rate": 7.126987205052738e-06, + "loss": 0.3893, "step": 105605 }, { - "epoch": 3.72, - "learning_rate": 8.179527524113904e-06, - "loss": 0.244, + "epoch": 3.8061772443867805, + "grad_norm": 0.27171891927719116, + "learning_rate": 7.124946951027103e-06, + "loss": 0.3769, "step": 105610 }, { - "epoch": 3.72, - "learning_rate": 8.17742012871262e-06, - "loss": 0.2672, + "epoch": 3.8063574440480052, + "grad_norm": 0.21456721425056458, + "learning_rate": 7.12290694054561e-06, + "loss": 0.3962, "step": 105615 }, { - "epoch": 3.72, - "learning_rate": 8.175312951741446e-06, - "loss": 0.2632, + "epoch": 3.80653764370923, + "grad_norm": 0.2533538043498993, + "learning_rate": 7.120867173636042e-06, + "loss": 0.3834, "step": 105620 }, { - "epoch": 3.72, - "learning_rate": 8.173205993227751e-06, - "loss": 0.259, + "epoch": 3.8067178433704543, + "grad_norm": 0.2441474199295044, + "learning_rate": 7.118827650326193e-06, + "loss": 0.3718, "step": 105625 }, { - "epoch": 3.72, - "learning_rate": 8.17109925319888e-06, - "loss": 0.2579, + "epoch": 3.806898043031679, + "grad_norm": 0.2645309865474701, + "learning_rate": 7.116788370643873e-06, + "loss": 0.3572, "step": 105630 }, { - "epoch": 3.72, - "learning_rate": 8.168992731682202e-06, - "loss": 0.2476, + "epoch": 3.8070782426929037, + "grad_norm": 0.2198515087366104, + "learning_rate": 7.1147493346168385e-06, + "loss": 0.3757, "step": 105635 }, { - "epoch": 3.72, - "learning_rate": 8.166886428705058e-06, - "loss": 0.2701, + "epoch": 3.8072584423541285, + "grad_norm": 0.23640519380569458, + "learning_rate": 7.112710542272874e-06, + "loss": 0.3703, "step": 105640 }, { - "epoch": 3.72, - "learning_rate": 8.164780344294806e-06, - "loss": 0.2374, + "epoch": 3.807438642015353, + "grad_norm": 0.28243306279182434, + "learning_rate": 7.110671993639772e-06, + "loss": 0.427, "step": 105645 }, { - "epoch": 3.72, - "learning_rate": 8.162674478478774e-06, - "loss": 0.2599, + "epoch": 3.8076188416765775, + "grad_norm": 0.2905183434486389, + "learning_rate": 7.108633688745303e-06, + "loss": 0.4278, "step": 105650 }, { - "epoch": 3.72, - "learning_rate": 8.16056883128433e-06, - "loss": 0.2632, + "epoch": 3.8077990413378022, + "grad_norm": 0.2647572457790375, + "learning_rate": 7.106595627617235e-06, + "loss": 0.3962, "step": 105655 }, { - "epoch": 3.72, - "learning_rate": 8.158463402738798e-06, - "loss": 0.227, + "epoch": 3.807979240999027, + "grad_norm": 0.22848838567733765, + "learning_rate": 7.104557810283338e-06, + "loss": 0.3891, "step": 105660 }, { - "epoch": 3.72, - "learning_rate": 8.156358192869521e-06, - "loss": 0.2211, + "epoch": 3.8081594406602517, + "grad_norm": 0.23743629455566406, + "learning_rate": 7.102520236771368e-06, + "loss": 
0.3594, "step": 105665 }, { - "epoch": 3.72, - "learning_rate": 8.154253201703824e-06, - "loss": 0.2356, + "epoch": 3.808339640321476, + "grad_norm": 0.2258959710597992, + "learning_rate": 7.100482907109102e-06, + "loss": 0.3635, "step": 105670 }, { - "epoch": 3.72, - "learning_rate": 8.15214842926905e-06, - "loss": 0.2578, + "epoch": 3.8085198399827007, + "grad_norm": 0.2394886165857315, + "learning_rate": 7.098445821324293e-06, + "loss": 0.3791, "step": 105675 }, { - "epoch": 3.72, - "learning_rate": 8.15004387559253e-06, - "loss": 0.2664, + "epoch": 3.8087000396439255, + "grad_norm": 0.2250211089849472, + "learning_rate": 7.096408979444691e-06, + "loss": 0.403, "step": 105680 }, { - "epoch": 3.72, - "learning_rate": 8.147939540701588e-06, - "loss": 0.2439, + "epoch": 3.80888023930515, + "grad_norm": 0.1884530633687973, + "learning_rate": 7.094372381498052e-06, + "loss": 0.3308, "step": 105685 }, { - "epoch": 3.72, - "learning_rate": 8.145835424623539e-06, - "loss": 0.2558, + "epoch": 3.809060438966375, + "grad_norm": 0.24882015585899353, + "learning_rate": 7.092336027512115e-06, + "loss": 0.3892, "step": 105690 }, { - "epoch": 3.72, - "learning_rate": 8.143731527385711e-06, - "loss": 0.2669, + "epoch": 3.8092406386275997, + "grad_norm": 0.21837691962718964, + "learning_rate": 7.0902999175146396e-06, + "loss": 0.3885, "step": 105695 }, { - "epoch": 3.72, - "learning_rate": 8.141627849015427e-06, - "loss": 0.2658, + "epoch": 3.809420838288824, + "grad_norm": 0.27154818177223206, + "learning_rate": 7.0882640515333585e-06, + "loss": 0.3667, "step": 105700 }, { - "epoch": 3.72, - "learning_rate": 8.139524389539993e-06, - "loss": 0.2569, + "epoch": 3.8096010379500487, + "grad_norm": 0.19823803007602692, + "learning_rate": 7.0862284295960144e-06, + "loss": 0.3472, "step": 105705 }, { - "epoch": 3.72, - "learning_rate": 8.137421148986716e-06, - "loss": 0.251, + "epoch": 3.8097812376112734, + "grad_norm": 0.2487536519765854, + "learning_rate": 7.08419305173034e-06, + "loss": 0.3867, "step": 105710 }, { - "epoch": 3.72, - "learning_rate": 8.135318127382917e-06, - "loss": 0.2741, + "epoch": 3.8099614372724977, + "grad_norm": 0.2396322339773178, + "learning_rate": 7.082157917964058e-06, + "loss": 0.3602, "step": 105715 }, { - "epoch": 3.72, - "learning_rate": 8.133215324755903e-06, - "loss": 0.2686, + "epoch": 3.8101416369337224, + "grad_norm": 0.22832445800304413, + "learning_rate": 7.0801230283249145e-06, + "loss": 0.3922, "step": 105720 }, { - "epoch": 3.72, - "learning_rate": 8.131112741132962e-06, - "loss": 0.2404, + "epoch": 3.810321836594947, + "grad_norm": 0.22182156145572662, + "learning_rate": 7.078088382840631e-06, + "loss": 0.3613, "step": 105725 }, { - "epoch": 3.72, - "learning_rate": 8.129010376541412e-06, - "loss": 0.271, + "epoch": 3.810502036256172, + "grad_norm": 0.26968759298324585, + "learning_rate": 7.0760539815389075e-06, + "loss": 0.4139, "step": 105730 }, { - "epoch": 3.72, - "learning_rate": 8.126908231008536e-06, - "loss": 0.2655, + "epoch": 3.8106822359173966, + "grad_norm": 0.19509482383728027, + "learning_rate": 7.0740198244474895e-06, + "loss": 0.356, "step": 105735 }, { - "epoch": 3.72, - "learning_rate": 8.124806304561647e-06, - "loss": 0.2301, + "epoch": 3.8108624355786214, + "grad_norm": 0.24696923792362213, + "learning_rate": 7.071985911594078e-06, + "loss": 0.3766, "step": 105740 }, { - "epoch": 3.72, - "learning_rate": 8.122704597228017e-06, - "loss": 0.2741, + "epoch": 3.8110426352398457, + "grad_norm": 0.2208133488893509, + "learning_rate": 
7.069952243006389e-06, + "loss": 0.3709, "step": 105745 }, { - "epoch": 3.72, - "learning_rate": 8.120603109034954e-06, - "loss": 0.2468, + "epoch": 3.8112228349010704, + "grad_norm": 0.2347523719072342, + "learning_rate": 7.067918818712127e-06, + "loss": 0.3837, "step": 105750 }, { - "epoch": 3.72, - "learning_rate": 8.118501840009738e-06, - "loss": 0.269, + "epoch": 3.811403034562295, + "grad_norm": 0.25173941254615784, + "learning_rate": 7.065885638738995e-06, + "loss": 0.4095, "step": 105755 }, { - "epoch": 3.72, - "learning_rate": 8.116400790179648e-06, - "loss": 0.2589, + "epoch": 3.8115832342235194, + "grad_norm": 0.19811011850833893, + "learning_rate": 7.063852703114704e-06, + "loss": 0.4196, "step": 105760 }, { - "epoch": 3.72, - "learning_rate": 8.114299959571959e-06, - "loss": 0.2432, + "epoch": 3.811763433884744, + "grad_norm": 0.22163262963294983, + "learning_rate": 7.061820011866949e-06, + "loss": 0.3861, "step": 105765 }, { - "epoch": 3.72, - "learning_rate": 8.112199348213965e-06, - "loss": 0.2401, + "epoch": 3.811943633545969, + "grad_norm": 0.21935562789440155, + "learning_rate": 7.059787565023421e-06, + "loss": 0.3796, "step": 105770 }, { - "epoch": 3.72, - "learning_rate": 8.110098956132934e-06, - "loss": 0.2524, + "epoch": 3.8121238332071936, + "grad_norm": 0.1883041262626648, + "learning_rate": 7.0577553626118145e-06, + "loss": 0.3783, "step": 105775 }, { - "epoch": 3.72, - "learning_rate": 8.107998783356128e-06, - "loss": 0.2679, + "epoch": 3.8123040328684183, + "grad_norm": 0.2626688778400421, + "learning_rate": 7.05572340465982e-06, + "loss": 0.4043, "step": 105780 }, { - "epoch": 3.72, - "learning_rate": 8.105898829910833e-06, - "loss": 0.2746, + "epoch": 3.812484232529643, + "grad_norm": 0.2265564501285553, + "learning_rate": 7.053691691195111e-06, + "loss": 0.3838, "step": 105785 }, { - "epoch": 3.72, - "learning_rate": 8.1037990958243e-06, - "loss": 0.253, + "epoch": 3.8126644321908674, + "grad_norm": 0.3275195360183716, + "learning_rate": 7.051660222245388e-06, + "loss": 0.3578, "step": 105790 }, { - "epoch": 3.72, - "learning_rate": 8.101699581123809e-06, - "loss": 0.2622, + "epoch": 3.812844631852092, + "grad_norm": 0.24849045276641846, + "learning_rate": 7.049628997838315e-06, + "loss": 0.3943, "step": 105795 }, { - "epoch": 3.72, - "learning_rate": 8.099600285836614e-06, - "loss": 0.2589, + "epoch": 3.813024831513317, + "grad_norm": 0.22452738881111145, + "learning_rate": 7.047598018001575e-06, + "loss": 0.3827, "step": 105800 }, { - "epoch": 3.72, - "learning_rate": 8.097501209989963e-06, - "loss": 0.2516, + "epoch": 3.813205031174541, + "grad_norm": 0.2721640467643738, + "learning_rate": 7.045567282762836e-06, + "loss": 0.3972, "step": 105805 }, { - "epoch": 3.72, - "learning_rate": 8.095402353611125e-06, - "loss": 0.2557, + "epoch": 3.813385230835766, + "grad_norm": 0.22152206301689148, + "learning_rate": 7.043536792149757e-06, + "loss": 0.354, "step": 105810 }, { - "epoch": 3.72, - "learning_rate": 8.093303716727351e-06, - "loss": 0.2641, + "epoch": 3.8135654304969906, + "grad_norm": 0.19255799055099487, + "learning_rate": 7.04150654619003e-06, + "loss": 0.3711, "step": 105815 }, { - "epoch": 3.72, - "learning_rate": 8.091205299365884e-06, - "loss": 0.2774, + "epoch": 3.8137456301582153, + "grad_norm": 0.2830601930618286, + "learning_rate": 7.039476544911291e-06, + "loss": 0.38, "step": 105820 }, { - "epoch": 3.72, - "learning_rate": 8.089107101553964e-06, - "loss": 0.2497, + "epoch": 3.81392582981944, + "grad_norm": 0.20766660571098328, + 
"learning_rate": 7.037446788341198e-06, + "loss": 0.3799, "step": 105825 }, { - "epoch": 3.72, - "learning_rate": 8.087009123318853e-06, - "loss": 0.2542, + "epoch": 3.814106029480665, + "grad_norm": 0.2430235594511032, + "learning_rate": 7.035417276507425e-06, + "loss": 0.3873, "step": 105830 }, { - "epoch": 3.72, - "learning_rate": 8.084911364687784e-06, - "loss": 0.2485, + "epoch": 3.814286229141889, + "grad_norm": 0.29118457436561584, + "learning_rate": 7.0333880094376055e-06, + "loss": 0.3836, "step": 105835 }, { - "epoch": 3.72, - "learning_rate": 8.082813825687982e-06, - "loss": 0.2743, + "epoch": 3.814466428803114, + "grad_norm": 0.20183739066123962, + "learning_rate": 7.031358987159409e-06, + "loss": 0.3727, "step": 105840 }, { - "epoch": 3.72, - "learning_rate": 8.080716506346696e-06, - "loss": 0.2506, + "epoch": 3.8146466284643386, + "grad_norm": 0.22086220979690552, + "learning_rate": 7.029330209700463e-06, + "loss": 0.3697, "step": 105845 }, { - "epoch": 3.72, - "learning_rate": 8.078619406691163e-06, - "loss": 0.2794, + "epoch": 3.8148268281255633, + "grad_norm": 0.24414370954036713, + "learning_rate": 7.0273016770884045e-06, + "loss": 0.3763, "step": 105850 }, { - "epoch": 3.72, - "learning_rate": 8.076522526748605e-06, - "loss": 0.261, + "epoch": 3.8150070277867876, + "grad_norm": 0.20783692598342896, + "learning_rate": 7.025273389350886e-06, + "loss": 0.3389, "step": 105855 }, { - "epoch": 3.72, - "learning_rate": 8.074425866546242e-06, - "loss": 0.238, + "epoch": 3.8151872274480123, + "grad_norm": 0.29034414887428284, + "learning_rate": 7.023245346515541e-06, + "loss": 0.4052, "step": 105860 }, { - "epoch": 3.72, - "learning_rate": 8.072329426111313e-06, - "loss": 0.2335, + "epoch": 3.815367427109237, + "grad_norm": 0.27145785093307495, + "learning_rate": 7.021217548609999e-06, + "loss": 0.3902, "step": 105865 }, { - "epoch": 3.72, - "learning_rate": 8.070233205471031e-06, - "loss": 0.2563, + "epoch": 3.8155476267704618, + "grad_norm": 0.2717854976654053, + "learning_rate": 7.019189995661884e-06, + "loss": 0.374, "step": 105870 }, { - "epoch": 3.72, - "learning_rate": 8.068137204652612e-06, - "loss": 0.2484, + "epoch": 3.8157278264316865, + "grad_norm": 0.30218860507011414, + "learning_rate": 7.017162687698817e-06, + "loss": 0.3719, "step": 105875 }, { - "epoch": 3.73, - "learning_rate": 8.066041423683265e-06, - "loss": 0.2721, + "epoch": 3.815908026092911, + "grad_norm": 0.2221565693616867, + "learning_rate": 7.015135624748434e-06, + "loss": 0.3678, "step": 105880 }, { - "epoch": 3.73, - "learning_rate": 8.063945862590219e-06, - "loss": 0.2546, + "epoch": 3.8160882257541355, + "grad_norm": 0.2444906085729599, + "learning_rate": 7.013108806838348e-06, + "loss": 0.4113, "step": 105885 }, { - "epoch": 3.73, - "learning_rate": 8.061850521400676e-06, - "loss": 0.2546, + "epoch": 3.8162684254153603, + "grad_norm": 0.2825545370578766, + "learning_rate": 7.011082233996169e-06, + "loss": 0.3969, "step": 105890 }, { - "epoch": 3.73, - "learning_rate": 8.059755400141833e-06, - "loss": 0.2708, + "epoch": 3.816448625076585, + "grad_norm": 0.226121723651886, + "learning_rate": 7.009055906249515e-06, + "loss": 0.3798, "step": 105895 }, { - "epoch": 3.73, - "learning_rate": 8.0576604988409e-06, - "loss": 0.2522, + "epoch": 3.8166288247378093, + "grad_norm": 0.2496255487203598, + "learning_rate": 7.007029823625982e-06, + "loss": 0.3724, "step": 105900 }, { - "epoch": 3.73, - "learning_rate": 8.055565817525088e-06, - "loss": 0.2467, + "epoch": 3.816809024399034, + "grad_norm": 
0.2551683187484741, + "learning_rate": 7.0050039861531915e-06, + "loss": 0.3912, "step": 105905 }, { - "epoch": 3.73, - "learning_rate": 8.053471356221589e-06, - "loss": 0.2535, + "epoch": 3.8169892240602588, + "grad_norm": 0.24302713572978973, + "learning_rate": 7.002978393858747e-06, + "loss": 0.376, "step": 105910 }, { - "epoch": 3.73, - "learning_rate": 8.051377114957595e-06, - "loss": 0.2481, + "epoch": 3.8171694237214835, + "grad_norm": 0.25622105598449707, + "learning_rate": 7.00095304677022e-06, + "loss": 0.374, "step": 105915 }, { - "epoch": 3.73, - "learning_rate": 8.049283093760294e-06, - "loss": 0.2395, + "epoch": 3.8173496233827082, + "grad_norm": 0.23344071209430695, + "learning_rate": 6.9989279449152316e-06, + "loss": 0.3773, "step": 105920 }, { - "epoch": 3.73, - "learning_rate": 8.047189292656885e-06, - "loss": 0.222, + "epoch": 3.8175298230439325, + "grad_norm": 0.24558846652507782, + "learning_rate": 6.996903088321366e-06, + "loss": 0.3956, "step": 105925 }, { - "epoch": 3.73, - "learning_rate": 8.045095711674553e-06, - "loss": 0.278, + "epoch": 3.8177100227051572, + "grad_norm": 0.2149125039577484, + "learning_rate": 6.994878477016209e-06, + "loss": 0.3839, "step": 105930 }, { - "epoch": 3.73, - "learning_rate": 8.043002350840479e-06, - "loss": 0.2369, + "epoch": 3.817890222366382, + "grad_norm": 0.2317209243774414, + "learning_rate": 6.992854111027347e-06, + "loss": 0.403, "step": 105935 }, { - "epoch": 3.73, - "learning_rate": 8.040909210181838e-06, - "loss": 0.2597, + "epoch": 3.8180704220276067, + "grad_norm": 0.29228413105010986, + "learning_rate": 6.9908299903823555e-06, + "loss": 0.3927, "step": 105940 }, { - "epoch": 3.73, - "learning_rate": 8.038816289725821e-06, - "loss": 0.2459, + "epoch": 3.818250621688831, + "grad_norm": 0.20377010107040405, + "learning_rate": 6.988806115108826e-06, + "loss": 0.3415, "step": 105945 }, { - "epoch": 3.73, - "learning_rate": 8.036723589499586e-06, - "loss": 0.2616, + "epoch": 3.8184308213500557, + "grad_norm": 0.24830646812915802, + "learning_rate": 6.986782485234322e-06, + "loss": 0.38, "step": 105950 }, { - "epoch": 3.73, - "learning_rate": 8.034631109530319e-06, - "loss": 0.2529, + "epoch": 3.8186110210112805, + "grad_norm": 0.2752242982387543, + "learning_rate": 6.9847591007864225e-06, + "loss": 0.3827, "step": 105955 }, { - "epoch": 3.73, - "learning_rate": 8.032538849845195e-06, - "loss": 0.2854, + "epoch": 3.818791220672505, + "grad_norm": 0.2355724275112152, + "learning_rate": 6.9827359617926945e-06, + "loss": 0.3878, "step": 105960 }, { - "epoch": 3.73, - "learning_rate": 8.030446810471371e-06, - "loss": 0.2783, + "epoch": 3.81897142033373, + "grad_norm": 0.21855174005031586, + "learning_rate": 6.980713068280698e-06, + "loss": 0.3636, "step": 105965 }, { - "epoch": 3.73, - "learning_rate": 8.028354991436014e-06, - "loss": 0.2637, + "epoch": 3.8191516199949547, + "grad_norm": 0.24916981160640717, + "learning_rate": 6.978690420277989e-06, + "loss": 0.3912, "step": 105970 }, { - "epoch": 3.73, - "learning_rate": 8.02626339276627e-06, - "loss": 0.2745, + "epoch": 3.819331819656179, + "grad_norm": 0.25882112979888916, + "learning_rate": 6.976668017812144e-06, + "loss": 0.4247, "step": 105975 }, { - "epoch": 3.73, - "learning_rate": 8.02417201448932e-06, - "loss": 0.2596, + "epoch": 3.8195120193174037, + "grad_norm": 0.24076975882053375, + "learning_rate": 6.974645860910706e-06, + "loss": 0.3626, "step": 105980 }, { - "epoch": 3.73, - "learning_rate": 8.022080856632312e-06, - "loss": 0.261, + "epoch": 3.8196922189786284, + 
"grad_norm": 0.25258931517601013, + "learning_rate": 6.9726239496012285e-06, + "loss": 0.3866, "step": 105985 }, { - "epoch": 3.73, - "learning_rate": 8.019989919222393e-06, - "loss": 0.2877, + "epoch": 3.8198724186398527, + "grad_norm": 0.24138487875461578, + "learning_rate": 6.97060228391126e-06, + "loss": 0.3732, "step": 105990 }, { - "epoch": 3.73, - "learning_rate": 8.017899202286705e-06, - "loss": 0.2326, + "epoch": 3.8200526183010775, + "grad_norm": 0.23699279129505157, + "learning_rate": 6.968580863868334e-06, + "loss": 0.3936, "step": 105995 }, { - "epoch": 3.73, - "learning_rate": 8.015808705852412e-06, - "loss": 0.2643, + "epoch": 3.820232817962302, + "grad_norm": 0.28243473172187805, + "learning_rate": 6.9665596895000155e-06, + "loss": 0.3651, "step": 106000 }, { - "epoch": 3.73, - "eval_loss": 0.25173264741897583, - "eval_runtime": 10.5541, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, + "epoch": 3.820232817962302, + "eval_loss": 0.42869871854782104, + "eval_runtime": 3.5415, + "eval_samples_per_second": 28.237, + "eval_steps_per_second": 7.059, "step": 106000 }, { - "epoch": 3.73, - "learning_rate": 8.013718429946645e-06, - "loss": 0.2656, + "epoch": 3.820413017623527, + "grad_norm": 0.2416362762451172, + "learning_rate": 6.964538760833836e-06, + "loss": 0.3882, "step": 106005 }, { - "epoch": 3.73, - "learning_rate": 8.011628374596555e-06, - "loss": 0.2496, + "epoch": 3.8205932172847517, + "grad_norm": 0.2267039567232132, + "learning_rate": 6.962518077897306e-06, + "loss": 0.392, "step": 106010 }, { - "epoch": 3.73, - "learning_rate": 8.009538539829267e-06, - "loss": 0.2472, + "epoch": 3.8207734169459764, + "grad_norm": 0.24580471217632294, + "learning_rate": 6.960497640717986e-06, + "loss": 0.3695, "step": 106015 }, { - "epoch": 3.73, - "learning_rate": 8.007448925671934e-06, - "loss": 0.2544, + "epoch": 3.8209536166072007, + "grad_norm": 0.23100297152996063, + "learning_rate": 6.958477449323384e-06, + "loss": 0.389, "step": 106020 }, { - "epoch": 3.73, - "learning_rate": 8.005359532151677e-06, - "loss": 0.2508, + "epoch": 3.8211338162684254, + "grad_norm": 0.25452324748039246, + "learning_rate": 6.956457503741049e-06, + "loss": 0.3714, "step": 106025 }, { - "epoch": 3.73, - "learning_rate": 8.003270359295628e-06, - "loss": 0.2504, + "epoch": 3.82131401592965, + "grad_norm": 0.20147554576396942, + "learning_rate": 6.954437803998479e-06, + "loss": 0.3673, "step": 106030 }, { - "epoch": 3.73, - "learning_rate": 8.001181407130903e-06, - "loss": 0.2553, + "epoch": 3.8214942155908744, + "grad_norm": 0.26207849383354187, + "learning_rate": 6.952418350123194e-06, + "loss": 0.3893, "step": 106035 }, { - "epoch": 3.73, - "learning_rate": 7.99909267568464e-06, - "loss": 0.2288, + "epoch": 3.821674415252099, + "grad_norm": 0.25192660093307495, + "learning_rate": 6.950399142142722e-06, + "loss": 0.4313, "step": 106040 }, { - "epoch": 3.73, - "learning_rate": 7.99700416498396e-06, - "loss": 0.2656, + "epoch": 3.821854614913324, + "grad_norm": 0.2420777529478073, + "learning_rate": 6.9483801800845685e-06, + "loss": 0.3608, "step": 106045 }, { - "epoch": 3.73, - "learning_rate": 7.994915875055971e-06, - "loss": 0.2688, + "epoch": 3.8220348145745486, + "grad_norm": 0.19895605742931366, + "learning_rate": 6.9463614639762366e-06, + "loss": 0.3602, "step": 106050 }, { - "epoch": 3.73, - "learning_rate": 7.992827805927785e-06, - "loss": 0.2497, + "epoch": 3.8222150142357734, + "grad_norm": 0.3234860599040985, + "learning_rate": 6.944342993845237e-06, + "loss": 0.3955, 
"step": 106055 }, { - "epoch": 3.73, - "learning_rate": 7.990739957626525e-06, - "loss": 0.2433, + "epoch": 3.822395213896998, + "grad_norm": 0.23543211817741394, + "learning_rate": 6.942324769719061e-06, + "loss": 0.3795, "step": 106060 }, { - "epoch": 3.73, - "learning_rate": 7.988652330179303e-06, - "loss": 0.268, + "epoch": 3.8225754135582224, + "grad_norm": 0.18973630666732788, + "learning_rate": 6.9403067916252205e-06, + "loss": 0.3613, "step": 106065 }, { - "epoch": 3.73, - "learning_rate": 7.986564923613214e-06, - "loss": 0.2484, + "epoch": 3.822755613219447, + "grad_norm": 0.242222398519516, + "learning_rate": 6.938289059591205e-06, + "loss": 0.3813, "step": 106070 }, { - "epoch": 3.73, - "learning_rate": 7.98447773795537e-06, - "loss": 0.2794, + "epoch": 3.822935812880672, + "grad_norm": 0.23931699991226196, + "learning_rate": 6.936271573644501e-06, + "loss": 0.3862, "step": 106075 }, { - "epoch": 3.73, - "learning_rate": 7.982390773232873e-06, - "loss": 0.2717, + "epoch": 3.823116012541896, + "grad_norm": 0.23971091210842133, + "learning_rate": 6.934254333812601e-06, + "loss": 0.4044, "step": 106080 }, { - "epoch": 3.73, - "learning_rate": 7.980304029472816e-06, - "loss": 0.2683, + "epoch": 3.823296212203121, + "grad_norm": 0.26793283224105835, + "learning_rate": 6.932237340122982e-06, + "loss": 0.4126, "step": 106085 }, { - "epoch": 3.73, - "learning_rate": 7.978217506702284e-06, - "loss": 0.2694, + "epoch": 3.8234764118643456, + "grad_norm": 0.24026966094970703, + "learning_rate": 6.930220592603137e-06, + "loss": 0.4094, "step": 106090 }, { - "epoch": 3.73, - "learning_rate": 7.976131204948393e-06, - "loss": 0.2623, + "epoch": 3.8236566115255703, + "grad_norm": 0.24495449662208557, + "learning_rate": 6.928204091280549e-06, + "loss": 0.4011, "step": 106095 }, { - "epoch": 3.73, - "learning_rate": 7.974045124238217e-06, - "loss": 0.2564, + "epoch": 3.823836811186795, + "grad_norm": 0.20799663662910461, + "learning_rate": 6.926187836182663e-06, + "loss": 0.3974, "step": 106100 }, { - "epoch": 3.73, - "learning_rate": 7.971959264598844e-06, - "loss": 0.2586, + "epoch": 3.82401701084802, + "grad_norm": 0.2459585964679718, + "learning_rate": 6.924171827336975e-06, + "loss": 0.3619, "step": 106105 }, { - "epoch": 3.73, - "learning_rate": 7.969873626057348e-06, - "loss": 0.2623, + "epoch": 3.824197210509244, + "grad_norm": 0.2440251260995865, + "learning_rate": 6.9221560647709485e-06, + "loss": 0.3862, "step": 106110 }, { - "epoch": 3.73, - "learning_rate": 7.967788208640823e-06, - "loss": 0.2551, + "epoch": 3.824377410170469, + "grad_norm": 0.29908034205436707, + "learning_rate": 6.920140548512038e-06, + "loss": 0.4097, "step": 106115 }, { - "epoch": 3.73, - "learning_rate": 7.96570301237635e-06, - "loss": 0.2527, + "epoch": 3.8245576098316936, + "grad_norm": 0.304340660572052, + "learning_rate": 6.9181252785877285e-06, + "loss": 0.3573, "step": 106120 }, { - "epoch": 3.73, - "learning_rate": 7.963618037290999e-06, - "loss": 0.2826, + "epoch": 3.8247378094929183, + "grad_norm": 0.20545925199985504, + "learning_rate": 6.916110255025443e-06, + "loss": 0.365, "step": 106125 }, { - "epoch": 3.73, - "learning_rate": 7.961533283411829e-06, - "loss": 0.2554, + "epoch": 3.8249180091541426, + "grad_norm": 0.20274123549461365, + "learning_rate": 6.914095477852664e-06, + "loss": 0.3642, "step": 106130 }, { - "epoch": 3.73, - "learning_rate": 7.95944875076593e-06, - "loss": 0.2638, + "epoch": 3.8250982088153673, + "grad_norm": 0.21836356818675995, + "learning_rate": 6.912080947096833e-06, + 
"loss": 0.3782, "step": 106135 }, { - "epoch": 3.73, - "learning_rate": 7.95736443938036e-06, - "loss": 0.27, + "epoch": 3.825278408476592, + "grad_norm": 0.21938830614089966, + "learning_rate": 6.910066662785394e-06, + "loss": 0.389, "step": 106140 }, { - "epoch": 3.73, - "learning_rate": 7.955280349282183e-06, - "loss": 0.2426, + "epoch": 3.825458608137817, + "grad_norm": 0.22280332446098328, + "learning_rate": 6.908052624945796e-06, + "loss": 0.3447, "step": 106145 }, { - "epoch": 3.73, - "learning_rate": 7.953196480498446e-06, - "loss": 0.247, + "epoch": 3.8256388077990415, + "grad_norm": 0.25814318656921387, + "learning_rate": 6.90603883360548e-06, + "loss": 0.3924, "step": 106150 }, { - "epoch": 3.73, - "learning_rate": 7.951112833056226e-06, - "loss": 0.2563, + "epoch": 3.825819007460266, + "grad_norm": 0.28167471289634705, + "learning_rate": 6.904025288791874e-06, + "loss": 0.3889, "step": 106155 }, { - "epoch": 3.74, - "learning_rate": 7.949029406982574e-06, - "loss": 0.2629, + "epoch": 3.8259992071214906, + "grad_norm": 0.21864792704582214, + "learning_rate": 6.902011990532425e-06, + "loss": 0.3953, "step": 106160 }, { - "epoch": 3.74, - "learning_rate": 7.946946202304528e-06, - "loss": 0.253, + "epoch": 3.8261794067827153, + "grad_norm": 0.23761038482189178, + "learning_rate": 6.89999893885456e-06, + "loss": 0.3828, "step": 106165 }, { - "epoch": 3.74, - "learning_rate": 7.944863219049155e-06, - "loss": 0.2575, + "epoch": 3.82635960644394, + "grad_norm": 0.23446708917617798, + "learning_rate": 6.8979861337857055e-06, + "loss": 0.3481, "step": 106170 }, { - "epoch": 3.74, - "learning_rate": 7.942780457243487e-06, - "loss": 0.268, + "epoch": 3.8265398061051643, + "grad_norm": 0.25455090403556824, + "learning_rate": 6.895973575353287e-06, + "loss": 0.38, "step": 106175 }, { - "epoch": 3.74, - "learning_rate": 7.94069791691458e-06, - "loss": 0.2613, + "epoch": 3.826720005766389, + "grad_norm": 0.2646729648113251, + "learning_rate": 6.893961263584714e-06, + "loss": 0.3828, "step": 106180 }, { - "epoch": 3.74, - "learning_rate": 7.93861559808946e-06, - "loss": 0.2466, + "epoch": 3.8269002054276138, + "grad_norm": 0.25168514251708984, + "learning_rate": 6.891949198507419e-06, + "loss": 0.376, "step": 106185 }, { - "epoch": 3.74, - "learning_rate": 7.936533500795182e-06, - "loss": 0.2552, + "epoch": 3.8270804050888385, + "grad_norm": 0.24938587844371796, + "learning_rate": 6.88993738014882e-06, + "loss": 0.3898, "step": 106190 }, { - "epoch": 3.74, - "learning_rate": 7.934451625058769e-06, - "loss": 0.2545, + "epoch": 3.8272606047500632, + "grad_norm": 0.24917519092559814, + "learning_rate": 6.8879258085363025e-06, + "loss": 0.3947, "step": 106195 }, { - "epoch": 3.74, - "learning_rate": 7.932369970907256e-06, - "loss": 0.2565, + "epoch": 3.827440804411288, + "grad_norm": 0.2233772575855255, + "learning_rate": 6.8859144836972976e-06, + "loss": 0.3719, "step": 106200 }, { - "epoch": 3.74, - "learning_rate": 7.930288538367665e-06, - "loss": 0.2552, + "epoch": 3.8276210040725123, + "grad_norm": 0.2717728018760681, + "learning_rate": 6.883903405659192e-06, + "loss": 0.3885, "step": 106205 }, { - "epoch": 3.74, - "learning_rate": 7.928207327467033e-06, - "loss": 0.2505, + "epoch": 3.827801203733737, + "grad_norm": 0.24201686680316925, + "learning_rate": 6.881892574449411e-06, + "loss": 0.3567, "step": 106210 }, { - "epoch": 3.74, - "learning_rate": 7.926126338232378e-06, - "loss": 0.2556, + "epoch": 3.8279814033949617, + "grad_norm": 0.22028078138828278, + "learning_rate": 
6.87988199009533e-06, + "loss": 0.3652, "step": 106215 }, { - "epoch": 3.74, - "learning_rate": 7.924045570690714e-06, - "loss": 0.2675, + "epoch": 3.828161603056186, + "grad_norm": 0.21828703582286835, + "learning_rate": 6.87787165262434e-06, + "loss": 0.3752, "step": 106220 }, { - "epoch": 3.74, - "learning_rate": 7.921965024869074e-06, - "loss": 0.2662, + "epoch": 3.8283418027174108, + "grad_norm": 0.22384034097194672, + "learning_rate": 6.875861562063846e-06, + "loss": 0.387, "step": 106225 }, { - "epoch": 3.74, - "learning_rate": 7.919884700794452e-06, - "loss": 0.2472, + "epoch": 3.8285220023786355, + "grad_norm": 0.38978999853134155, + "learning_rate": 6.873851718441232e-06, + "loss": 0.3725, "step": 106230 }, { - "epoch": 3.74, - "learning_rate": 7.91780459849388e-06, - "loss": 0.2526, + "epoch": 3.8287022020398602, + "grad_norm": 0.30686643719673157, + "learning_rate": 6.8718421217838754e-06, + "loss": 0.3894, "step": 106235 }, { - "epoch": 3.74, - "learning_rate": 7.91572471799436e-06, - "loss": 0.2511, + "epoch": 3.828882401701085, + "grad_norm": 0.24971897900104523, + "learning_rate": 6.869832772119164e-06, + "loss": 0.3785, "step": 106240 }, { - "epoch": 3.74, - "learning_rate": 7.913645059322886e-06, - "loss": 0.244, + "epoch": 3.8290626013623097, + "grad_norm": 0.26769372820854187, + "learning_rate": 6.86782366947446e-06, + "loss": 0.3451, "step": 106245 }, { - "epoch": 3.74, - "learning_rate": 7.911565622506478e-06, - "loss": 0.2755, + "epoch": 3.829242801023534, + "grad_norm": 0.30026867985725403, + "learning_rate": 6.865814813877158e-06, + "loss": 0.4113, "step": 106250 }, { - "epoch": 3.74, - "learning_rate": 7.909486407572132e-06, - "loss": 0.2588, + "epoch": 3.8294230006847587, + "grad_norm": 0.2826428711414337, + "learning_rate": 6.863806205354617e-06, + "loss": 0.4001, "step": 106255 }, { - "epoch": 3.74, - "learning_rate": 7.907407414546836e-06, - "loss": 0.2589, + "epoch": 3.8296032003459834, + "grad_norm": 0.24307632446289062, + "learning_rate": 6.861797843934206e-06, + "loss": 0.3537, "step": 106260 }, { - "epoch": 3.74, - "learning_rate": 7.905328643457586e-06, - "loss": 0.2673, + "epoch": 3.8297834000072077, + "grad_norm": 0.277296245098114, + "learning_rate": 6.859789729643287e-06, + "loss": 0.3807, "step": 106265 }, { - "epoch": 3.74, - "learning_rate": 7.903250094331385e-06, - "loss": 0.2476, + "epoch": 3.8299635996684325, + "grad_norm": 0.2420312762260437, + "learning_rate": 6.857781862509221e-06, + "loss": 0.3795, "step": 106270 }, { - "epoch": 3.74, - "learning_rate": 7.901171767195215e-06, - "loss": 0.2565, + "epoch": 3.830143799329657, + "grad_norm": 0.17696630954742432, + "learning_rate": 6.855774242559359e-06, + "loss": 0.3511, "step": 106275 }, { - "epoch": 3.74, - "learning_rate": 7.899093662076052e-06, - "loss": 0.2415, + "epoch": 3.830323998990882, + "grad_norm": 0.26137152314186096, + "learning_rate": 6.853766869821066e-06, + "loss": 0.3418, "step": 106280 }, { - "epoch": 3.74, - "learning_rate": 7.897015779000887e-06, - "loss": 0.2595, + "epoch": 3.8305041986521067, + "grad_norm": 0.21335354447364807, + "learning_rate": 6.851759744321687e-06, + "loss": 0.3742, "step": 106285 }, { - "epoch": 3.74, - "learning_rate": 7.894938117996711e-06, - "loss": 0.2557, + "epoch": 3.8306843983133314, + "grad_norm": 0.23139753937721252, + "learning_rate": 6.849752866088566e-06, + "loss": 0.3503, "step": 106290 }, { - "epoch": 3.74, - "learning_rate": 7.89286067909049e-06, - "loss": 0.2347, + "epoch": 3.8308645979745557, + "grad_norm": 0.21770185232162476, + 
"learning_rate": 6.84774623514905e-06, + "loss": 0.3839, "step": 106295 }, { - "epoch": 3.74, - "learning_rate": 7.890783462309193e-06, - "loss": 0.2701, + "epoch": 3.8310447976357804, + "grad_norm": 0.2551557719707489, + "learning_rate": 6.845739851530469e-06, + "loss": 0.3913, "step": 106300 }, { - "epoch": 3.74, - "learning_rate": 7.888706467679805e-06, - "loss": 0.2512, + "epoch": 3.831224997297005, + "grad_norm": 0.2286597192287445, + "learning_rate": 6.843733715260181e-06, + "loss": 0.413, "step": 106305 }, { - "epoch": 3.74, - "learning_rate": 7.886629695229284e-06, - "loss": 0.2537, + "epoch": 3.8314051969582295, + "grad_norm": 0.22169090807437897, + "learning_rate": 6.841727826365493e-06, + "loss": 0.379, "step": 106310 }, { - "epoch": 3.74, - "learning_rate": 7.884553144984599e-06, - "loss": 0.264, + "epoch": 3.831585396619454, + "grad_norm": 0.240523561835289, + "learning_rate": 6.839722184873757e-06, + "loss": 0.3675, "step": 106315 }, { - "epoch": 3.74, - "learning_rate": 7.882476816972706e-06, - "loss": 0.2409, + "epoch": 3.831765596280679, + "grad_norm": 0.2571893632411957, + "learning_rate": 6.8377167908122876e-06, + "loss": 0.382, "step": 106320 }, { - "epoch": 3.74, - "learning_rate": 7.880400711220576e-06, - "loss": 0.2468, + "epoch": 3.8319457959419037, + "grad_norm": 0.24433764815330505, + "learning_rate": 6.8357116442084115e-06, + "loss": 0.3755, "step": 106325 }, { - "epoch": 3.74, - "learning_rate": 7.878324827755163e-06, - "loss": 0.2689, + "epoch": 3.8321259956031284, + "grad_norm": 0.1973889172077179, + "learning_rate": 6.833706745089446e-06, + "loss": 0.3574, "step": 106330 }, { - "epoch": 3.74, - "learning_rate": 7.876249166603408e-06, - "loss": 0.2537, + "epoch": 3.832306195264353, + "grad_norm": 0.28018200397491455, + "learning_rate": 6.831702093482711e-06, + "loss": 0.3909, "step": 106335 }, { - "epoch": 3.74, - "learning_rate": 7.874173727792275e-06, - "loss": 0.2725, + "epoch": 3.8324863949255774, + "grad_norm": 0.2281164973974228, + "learning_rate": 6.82969768941551e-06, + "loss": 0.368, "step": 106340 }, { - "epoch": 3.74, - "learning_rate": 7.872098511348716e-06, - "loss": 0.26, + "epoch": 3.832666594586802, + "grad_norm": 0.25604715943336487, + "learning_rate": 6.827693532915166e-06, + "loss": 0.3783, "step": 106345 }, { - "epoch": 3.74, - "learning_rate": 7.87002351729967e-06, - "loss": 0.258, + "epoch": 3.832846794248027, + "grad_norm": 0.21953798830509186, + "learning_rate": 6.82568962400898e-06, + "loss": 0.377, "step": 106350 }, { - "epoch": 3.74, - "learning_rate": 7.86794874567208e-06, - "loss": 0.2454, + "epoch": 3.8330269939092516, + "grad_norm": 0.22131898999214172, + "learning_rate": 6.823685962724255e-06, + "loss": 0.3739, "step": 106355 }, { - "epoch": 3.74, - "learning_rate": 7.865874196492878e-06, - "loss": 0.2593, + "epoch": 3.833207193570476, + "grad_norm": 0.3169558048248291, + "learning_rate": 6.8216825490882875e-06, + "loss": 0.4028, "step": 106360 }, { - "epoch": 3.74, - "learning_rate": 7.863799869789016e-06, - "loss": 0.2799, + "epoch": 3.8333873932317006, + "grad_norm": 0.2238478660583496, + "learning_rate": 6.819679383128372e-06, + "loss": 0.4006, "step": 106365 }, { - "epoch": 3.74, - "learning_rate": 7.861725765587419e-06, - "loss": 0.2782, + "epoch": 3.8335675928929254, + "grad_norm": 0.28951504826545715, + "learning_rate": 6.817676464871808e-06, + "loss": 0.361, "step": 106370 }, { - "epoch": 3.74, - "learning_rate": 7.859651883915018e-06, - "loss": 0.2602, + "epoch": 3.83374779255415, + "grad_norm": 0.2536328136920929, + 
"learning_rate": 6.815673794345895e-06, + "loss": 0.3859, "step": 106375 }, { - "epoch": 3.74, - "learning_rate": 7.857578224798731e-06, - "loss": 0.25, + "epoch": 3.833927992215375, + "grad_norm": 0.2552483379840851, + "learning_rate": 6.8136713715778875e-06, + "loss": 0.4012, "step": 106380 }, { - "epoch": 3.74, - "learning_rate": 7.855504788265505e-06, - "loss": 0.2351, + "epoch": 3.834108191876599, + "grad_norm": 0.2368234395980835, + "learning_rate": 6.811669196595094e-06, + "loss": 0.3601, "step": 106385 }, { - "epoch": 3.74, - "learning_rate": 7.85343157434224e-06, - "loss": 0.2632, + "epoch": 3.834288391537824, + "grad_norm": 0.2486155778169632, + "learning_rate": 6.80966726942478e-06, + "loss": 0.3964, "step": 106390 }, { - "epoch": 3.74, - "learning_rate": 7.851358583055875e-06, - "loss": 0.2442, + "epoch": 3.8344685911990486, + "grad_norm": 0.2343214601278305, + "learning_rate": 6.807665590094242e-06, + "loss": 0.408, "step": 106395 }, { - "epoch": 3.74, - "learning_rate": 7.849285814433312e-06, - "loss": 0.2653, + "epoch": 3.8346487908602733, + "grad_norm": 0.22257140278816223, + "learning_rate": 6.805664158630728e-06, + "loss": 0.3616, "step": 106400 }, { - "epoch": 3.74, - "learning_rate": 7.847213268501478e-06, - "loss": 0.2266, + "epoch": 3.8348289905214976, + "grad_norm": 0.26378926634788513, + "learning_rate": 6.803662975061515e-06, + "loss": 0.4036, "step": 106405 }, { - "epoch": 3.74, - "learning_rate": 7.845140945287271e-06, - "loss": 0.2704, + "epoch": 3.8350091901827223, + "grad_norm": 0.2561560273170471, + "learning_rate": 6.801662039413875e-06, + "loss": 0.3486, "step": 106410 }, { - "epoch": 3.74, - "learning_rate": 7.843068844817597e-06, - "loss": 0.2662, + "epoch": 3.835189389843947, + "grad_norm": 0.24770770967006683, + "learning_rate": 6.799661351715067e-06, + "loss": 0.417, "step": 106415 }, { - "epoch": 3.74, - "learning_rate": 7.840996967119377e-06, - "loss": 0.2551, + "epoch": 3.835369589505172, + "grad_norm": 0.27001357078552246, + "learning_rate": 6.797660911992351e-06, + "loss": 0.398, "step": 106420 }, { - "epoch": 3.74, - "learning_rate": 7.838925312219503e-06, - "loss": 0.2598, + "epoch": 3.8355497891663966, + "grad_norm": 0.2153816670179367, + "learning_rate": 6.795660720272978e-06, + "loss": 0.3642, "step": 106425 }, { - "epoch": 3.74, - "learning_rate": 7.836853880144874e-06, - "loss": 0.2275, + "epoch": 3.835729988827621, + "grad_norm": 0.2308131456375122, + "learning_rate": 6.7936607765841985e-06, + "loss": 0.4077, "step": 106430 }, { - "epoch": 3.74, - "learning_rate": 7.834782670922377e-06, - "loss": 0.2677, + "epoch": 3.8359101884888456, + "grad_norm": 0.19024352729320526, + "learning_rate": 6.791661080953274e-06, + "loss": 0.3708, "step": 106435 }, { - "epoch": 3.74, - "learning_rate": 7.832711684578925e-06, - "loss": 0.279, + "epoch": 3.8360903881500703, + "grad_norm": 0.15937311947345734, + "learning_rate": 6.78966163340744e-06, + "loss": 0.3621, "step": 106440 }, { - "epoch": 3.75, - "learning_rate": 7.830640921141388e-06, - "loss": 0.2623, + "epoch": 3.836270587811295, + "grad_norm": 0.29799744486808777, + "learning_rate": 6.78766243397394e-06, + "loss": 0.371, "step": 106445 }, { - "epoch": 3.75, - "learning_rate": 7.828570380636672e-06, - "loss": 0.2451, + "epoch": 3.8364507874725193, + "grad_norm": 0.20933188498020172, + "learning_rate": 6.785663482680016e-06, + "loss": 0.3618, "step": 106450 }, { - "epoch": 3.75, - "learning_rate": 7.826500063091646e-06, - "loss": 0.2776, + "epoch": 3.836630987133744, + "grad_norm": 
0.25528839230537415, + "learning_rate": 6.7836647795528976e-06, + "loss": 0.3922, "step": 106455 }, { - "epoch": 3.75, - "learning_rate": 7.824429968533207e-06, - "loss": 0.2688, + "epoch": 3.836811186794969, + "grad_norm": 0.231916606426239, + "learning_rate": 6.781666324619815e-06, + "loss": 0.379, "step": 106460 }, { - "epoch": 3.75, - "learning_rate": 7.822360096988224e-06, - "loss": 0.2786, + "epoch": 3.8369913864561935, + "grad_norm": 0.21450571715831757, + "learning_rate": 6.779668117908008e-06, + "loss": 0.3953, "step": 106465 }, { - "epoch": 3.75, - "learning_rate": 7.820290448483574e-06, - "loss": 0.2598, + "epoch": 3.8371715861174183, + "grad_norm": 0.23598310351371765, + "learning_rate": 6.777670159444696e-06, + "loss": 0.367, "step": 106470 }, { - "epoch": 3.75, - "learning_rate": 7.818221023046127e-06, - "loss": 0.2687, + "epoch": 3.837351785778643, + "grad_norm": 0.18300804495811462, + "learning_rate": 6.775672449257098e-06, + "loss": 0.3378, "step": 106475 }, { - "epoch": 3.75, - "learning_rate": 7.816151820702761e-06, - "loss": 0.2457, + "epoch": 3.8375319854398673, + "grad_norm": 0.27081814408302307, + "learning_rate": 6.773674987372436e-06, + "loss": 0.389, "step": 106480 }, { - "epoch": 3.75, - "learning_rate": 7.81408284148034e-06, - "loss": 0.2553, + "epoch": 3.837712185101092, + "grad_norm": 0.22525940835475922, + "learning_rate": 6.771677773817917e-06, + "loss": 0.398, "step": 106485 }, { - "epoch": 3.75, - "learning_rate": 7.812014085405728e-06, - "loss": 0.2522, + "epoch": 3.8378923847623168, + "grad_norm": 0.23411720991134644, + "learning_rate": 6.769680808620774e-06, + "loss": 0.4028, "step": 106490 }, { - "epoch": 3.75, - "learning_rate": 7.809945552505779e-06, - "loss": 0.2643, + "epoch": 3.838072584423541, + "grad_norm": 0.2922147214412689, + "learning_rate": 6.7676840918081825e-06, + "loss": 0.4001, "step": 106495 }, { - "epoch": 3.75, - "learning_rate": 7.807877242807357e-06, - "loss": 0.2496, + "epoch": 3.838252784084766, + "grad_norm": 0.23292958736419678, + "learning_rate": 6.765687623407377e-06, + "loss": 0.3461, "step": 106500 }, { - "epoch": 3.75, - "eval_loss": 0.2516409158706665, - "eval_runtime": 10.5352, - "eval_samples_per_second": 9.492, - "eval_steps_per_second": 9.492, + "epoch": 3.838252784084766, + "eval_loss": 0.4283367097377777, + "eval_runtime": 3.5256, + "eval_samples_per_second": 28.364, + "eval_steps_per_second": 7.091, "step": 106500 }, { - "epoch": 3.75, - "learning_rate": 7.805809156337327e-06, - "loss": 0.2422, + "epoch": 3.8384329837459905, + "grad_norm": 0.2138739675283432, + "learning_rate": 6.763691403445543e-06, + "loss": 0.3657, "step": 106505 }, { - "epoch": 3.75, - "learning_rate": 7.803741293122533e-06, - "loss": 0.2345, + "epoch": 3.8386131834072152, + "grad_norm": 0.20448392629623413, + "learning_rate": 6.761695431949888e-06, + "loss": 0.3634, "step": 106510 }, { - "epoch": 3.75, - "learning_rate": 7.801673653189818e-06, - "loss": 0.2534, + "epoch": 3.83879338306844, + "grad_norm": 0.269741415977478, + "learning_rate": 6.759699708947598e-06, + "loss": 0.3845, "step": 106515 }, { - "epoch": 3.75, - "learning_rate": 7.799606236566045e-06, - "loss": 0.271, + "epoch": 3.8389735827296647, + "grad_norm": 0.20338128507137299, + "learning_rate": 6.757704234465869e-06, + "loss": 0.3784, "step": 106520 }, { - "epoch": 3.75, - "learning_rate": 7.797539043278048e-06, - "loss": 0.2369, + "epoch": 3.839153782390889, + "grad_norm": 0.2528837025165558, + "learning_rate": 6.75570900853188e-06, + "loss": 0.4003, "step": 106525 }, { - 
"epoch": 3.75, - "learning_rate": 7.795472073352664e-06, - "loss": 0.2472, + "epoch": 3.8393339820521137, + "grad_norm": 0.22932448983192444, + "learning_rate": 6.753714031172834e-06, + "loss": 0.3484, "step": 106530 }, { - "epoch": 3.75, - "learning_rate": 7.793405326816744e-06, - "loss": 0.2607, + "epoch": 3.8395141817133385, + "grad_norm": 0.2735101282596588, + "learning_rate": 6.751719302415898e-06, + "loss": 0.4048, "step": 106535 }, { - "epoch": 3.75, - "learning_rate": 7.791338803697115e-06, - "loss": 0.2395, + "epoch": 3.8396943813745628, + "grad_norm": 0.2353210300207138, + "learning_rate": 6.749724822288256e-06, + "loss": 0.3939, "step": 106540 }, { - "epoch": 3.75, - "learning_rate": 7.789272504020611e-06, - "loss": 0.2539, + "epoch": 3.8398745810357875, + "grad_norm": 0.2255113422870636, + "learning_rate": 6.747730590817078e-06, + "loss": 0.3748, "step": 106545 }, { - "epoch": 3.75, - "learning_rate": 7.787206427814053e-06, - "loss": 0.2468, + "epoch": 3.8400547806970122, + "grad_norm": 0.2259177565574646, + "learning_rate": 6.7457366080295296e-06, + "loss": 0.3593, "step": 106550 }, { - "epoch": 3.75, - "learning_rate": 7.785140575104277e-06, - "loss": 0.2856, + "epoch": 3.840234980358237, + "grad_norm": 0.27952998876571655, + "learning_rate": 6.743742873952794e-06, + "loss": 0.3721, "step": 106555 }, { - "epoch": 3.75, - "learning_rate": 7.783074945918115e-06, - "loss": 0.2525, + "epoch": 3.8404151800194617, + "grad_norm": 0.23785199224948883, + "learning_rate": 6.741749388614027e-06, + "loss": 0.3657, "step": 106560 }, { - "epoch": 3.75, - "learning_rate": 7.781009540282378e-06, - "loss": 0.2338, + "epoch": 3.8405953796806864, + "grad_norm": 0.19771380722522736, + "learning_rate": 6.739756152040391e-06, + "loss": 0.3827, "step": 106565 }, { - "epoch": 3.75, - "learning_rate": 7.778944358223875e-06, - "loss": 0.2603, + "epoch": 3.8407755793419107, + "grad_norm": 0.23672428727149963, + "learning_rate": 6.7377631642590395e-06, + "loss": 0.4001, "step": 106570 }, { - "epoch": 3.75, - "learning_rate": 7.776879399769438e-06, - "loss": 0.2385, + "epoch": 3.8409557790031355, + "grad_norm": 0.28637582063674927, + "learning_rate": 6.7357704252971245e-06, + "loss": 0.381, "step": 106575 }, { - "epoch": 3.75, - "learning_rate": 7.77481466494587e-06, - "loss": 0.2633, + "epoch": 3.84113597866436, + "grad_norm": 0.23448318243026733, + "learning_rate": 6.733777935181806e-06, + "loss": 0.4347, "step": 106580 }, { - "epoch": 3.75, - "learning_rate": 7.772750153779985e-06, - "loss": 0.2625, + "epoch": 3.8413161783255845, + "grad_norm": 0.24527902901172638, + "learning_rate": 6.731785693940237e-06, + "loss": 0.405, "step": 106585 }, { - "epoch": 3.75, - "learning_rate": 7.770685866298577e-06, - "loss": 0.2417, + "epoch": 3.841496377986809, + "grad_norm": 0.2648589015007019, + "learning_rate": 6.729793701599535e-06, + "loss": 0.3911, "step": 106590 }, { - "epoch": 3.75, - "learning_rate": 7.768621802528467e-06, - "loss": 0.2505, + "epoch": 3.841676577648034, + "grad_norm": 0.28334563970565796, + "learning_rate": 6.727801958186864e-06, + "loss": 0.4194, "step": 106595 }, { - "epoch": 3.75, - "learning_rate": 7.766557962496445e-06, - "loss": 0.2446, + "epoch": 3.8418567773092587, + "grad_norm": 0.23570017516613007, + "learning_rate": 6.725810463729354e-06, + "loss": 0.3873, "step": 106600 }, { - "epoch": 3.75, - "learning_rate": 7.764494346229307e-06, - "loss": 0.2627, + "epoch": 3.8420369769704834, + "grad_norm": 0.254119873046875, + "learning_rate": 6.723819218254138e-06, + "loss": 0.3425, 
"step": 106605 }, { - "epoch": 3.75, - "learning_rate": 7.762430953753855e-06, - "loss": 0.2617, + "epoch": 3.842217176631708, + "grad_norm": 0.23003673553466797, + "learning_rate": 6.721828221788346e-06, + "loss": 0.3899, "step": 106610 }, { - "epoch": 3.75, - "learning_rate": 7.760367785096869e-06, - "loss": 0.2554, + "epoch": 3.8423973762929324, + "grad_norm": 0.21980217099189758, + "learning_rate": 6.719837474359098e-06, + "loss": 0.372, "step": 106615 }, { - "epoch": 3.75, - "learning_rate": 7.758304840285158e-06, - "loss": 0.253, + "epoch": 3.842577575954157, + "grad_norm": 0.25185590982437134, + "learning_rate": 6.717846975993536e-06, + "loss": 0.4302, "step": 106620 }, { - "epoch": 3.75, - "learning_rate": 7.75624211934549e-06, - "loss": 0.2655, + "epoch": 3.842757775615382, + "grad_norm": 0.23943038284778595, + "learning_rate": 6.715856726718767e-06, + "loss": 0.4065, "step": 106625 }, { - "epoch": 3.75, - "learning_rate": 7.75417962230465e-06, - "loss": 0.2666, + "epoch": 3.8429379752766066, + "grad_norm": 0.2441278100013733, + "learning_rate": 6.71386672656191e-06, + "loss": 0.3898, "step": 106630 }, { - "epoch": 3.75, - "learning_rate": 7.75211734918943e-06, - "loss": 0.2492, + "epoch": 3.843118174937831, + "grad_norm": 0.2758260667324066, + "learning_rate": 6.711876975550077e-06, + "loss": 0.3623, "step": 106635 }, { - "epoch": 3.75, - "learning_rate": 7.750055300026599e-06, - "loss": 0.2651, + "epoch": 3.8432983745990557, + "grad_norm": 0.20836462080478668, + "learning_rate": 6.7098874737103825e-06, + "loss": 0.372, "step": 106640 }, { - "epoch": 3.75, - "learning_rate": 7.747993474842923e-06, - "loss": 0.286, + "epoch": 3.8434785742602804, + "grad_norm": 0.25424110889434814, + "learning_rate": 6.707898221069922e-06, + "loss": 0.3941, "step": 106645 }, { - "epoch": 3.75, - "learning_rate": 7.745931873665194e-06, - "loss": 0.2526, + "epoch": 3.843658773921505, + "grad_norm": 0.25770169496536255, + "learning_rate": 6.705909217655815e-06, + "loss": 0.3841, "step": 106650 }, { - "epoch": 3.75, - "learning_rate": 7.743870496520165e-06, - "loss": 0.246, + "epoch": 3.84383897358273, + "grad_norm": 0.296792596578598, + "learning_rate": 6.703920463495151e-06, + "loss": 0.3746, "step": 106655 }, { - "epoch": 3.75, - "learning_rate": 7.741809343434598e-06, - "loss": 0.254, + "epoch": 3.844019173243954, + "grad_norm": 0.2855359613895416, + "learning_rate": 6.701931958615029e-06, + "loss": 0.4001, "step": 106660 }, { - "epoch": 3.75, - "learning_rate": 7.739748414435272e-06, - "loss": 0.2589, + "epoch": 3.844199372905179, + "grad_norm": 0.2668587267398834, + "learning_rate": 6.699943703042541e-06, + "loss": 0.3571, "step": 106665 }, { - "epoch": 3.75, - "learning_rate": 7.737687709548927e-06, - "loss": 0.2474, + "epoch": 3.8443795725664036, + "grad_norm": 0.2563954293727875, + "learning_rate": 6.69795569680477e-06, + "loss": 0.3489, "step": 106670 }, { - "epoch": 3.75, - "learning_rate": 7.735627228802342e-06, - "loss": 0.2452, + "epoch": 3.8445597722276283, + "grad_norm": 0.3133787214756012, + "learning_rate": 6.6959679399288265e-06, + "loss": 0.3753, "step": 106675 }, { - "epoch": 3.75, - "learning_rate": 7.733566972222259e-06, - "loss": 0.2682, + "epoch": 3.8447399718888526, + "grad_norm": 0.22209054231643677, + "learning_rate": 6.693980432441757e-06, + "loss": 0.3987, "step": 106680 }, { - "epoch": 3.75, - "learning_rate": 7.73150693983542e-06, - "loss": 0.2653, + "epoch": 3.8449201715500774, + "grad_norm": 0.22669175267219543, + "learning_rate": 6.69199317437067e-06, + "loss": 
0.3612, "step": 106685 }, { - "epoch": 3.75, - "learning_rate": 7.729447131668593e-06, - "loss": 0.2691, + "epoch": 3.845100371211302, + "grad_norm": 0.24316006898880005, + "learning_rate": 6.6900061657426325e-06, + "loss": 0.3768, "step": 106690 }, { - "epoch": 3.75, - "learning_rate": 7.727387547748513e-06, - "loss": 0.2659, + "epoch": 3.845280570872527, + "grad_norm": 0.26429635286331177, + "learning_rate": 6.688019406584706e-06, + "loss": 0.3953, "step": 106695 }, { - "epoch": 3.75, - "learning_rate": 7.725328188101925e-06, - "loss": 0.2828, + "epoch": 3.8454607705337516, + "grad_norm": 0.24208183586597443, + "learning_rate": 6.686032896923986e-06, + "loss": 0.3652, "step": 106700 }, { - "epoch": 3.75, - "learning_rate": 7.723269052755553e-06, - "loss": 0.2698, + "epoch": 3.8456409701949763, + "grad_norm": 0.21602210402488708, + "learning_rate": 6.684046636787514e-06, + "loss": 0.3948, "step": 106705 }, { - "epoch": 3.75, - "learning_rate": 7.721210141736156e-06, - "loss": 0.2539, + "epoch": 3.8458211698562006, + "grad_norm": 0.21090590953826904, + "learning_rate": 6.682060626202355e-06, + "loss": 0.3688, "step": 106710 }, { - "epoch": 3.75, - "learning_rate": 7.719151455070462e-06, - "loss": 0.2575, + "epoch": 3.8460013695174253, + "grad_norm": 0.23994767665863037, + "learning_rate": 6.6800748651955795e-06, + "loss": 0.3735, "step": 106715 }, { - "epoch": 3.75, - "learning_rate": 7.717092992785185e-06, - "loss": 0.2708, + "epoch": 3.84618156917865, + "grad_norm": 0.24536341428756714, + "learning_rate": 6.678089353794237e-06, + "loss": 0.3965, "step": 106720 }, { - "epoch": 3.75, - "learning_rate": 7.715034754907068e-06, - "loss": 0.2411, + "epoch": 3.8463617688398744, + "grad_norm": 0.24873821437358856, + "learning_rate": 6.676104092025378e-06, + "loss": 0.3989, "step": 106725 }, { - "epoch": 3.76, - "learning_rate": 7.71297674146284e-06, - "loss": 0.2614, + "epoch": 3.846541968501099, + "grad_norm": 0.2966354191303253, + "learning_rate": 6.674119079916056e-06, + "loss": 0.404, "step": 106730 }, { - "epoch": 3.76, - "learning_rate": 7.71091895247922e-06, - "loss": 0.2691, + "epoch": 3.846722168162324, + "grad_norm": 0.24333995580673218, + "learning_rate": 6.672134317493303e-06, + "loss": 0.3602, "step": 106735 }, { - "epoch": 3.76, - "learning_rate": 7.708861387982921e-06, - "loss": 0.2591, + "epoch": 3.8469023678235486, + "grad_norm": 0.2882740795612335, + "learning_rate": 6.670149804784181e-06, + "loss": 0.3889, "step": 106740 }, { - "epoch": 3.76, - "learning_rate": 7.706804048000654e-06, - "loss": 0.2456, + "epoch": 3.8470825674847733, + "grad_norm": 0.2220769077539444, + "learning_rate": 6.668165541815721e-06, + "loss": 0.3715, "step": 106745 }, { - "epoch": 3.76, - "learning_rate": 7.704746932559148e-06, - "loss": 0.2298, + "epoch": 3.847262767145998, + "grad_norm": 0.26409411430358887, + "learning_rate": 6.666181528614954e-06, + "loss": 0.356, "step": 106750 }, { - "epoch": 3.76, - "learning_rate": 7.702690041685105e-06, - "loss": 0.2538, + "epoch": 3.8474429668072223, + "grad_norm": 0.24379925429821014, + "learning_rate": 6.6641977652089155e-06, + "loss": 0.3866, "step": 106755 }, { - "epoch": 3.76, - "learning_rate": 7.70063337540523e-06, - "loss": 0.2592, + "epoch": 3.847623166468447, + "grad_norm": 0.2612653076648712, + "learning_rate": 6.662214251624624e-06, + "loss": 0.3836, "step": 106760 }, { - "epoch": 3.76, - "learning_rate": 7.698576933746225e-06, - "loss": 0.2535, + "epoch": 3.8478033661296718, + "grad_norm": 0.20070244371891022, + "learning_rate": 
6.660230987889121e-06, + "loss": 0.3629, "step": 106765 }, { - "epoch": 3.76, - "learning_rate": 7.696520716734804e-06, - "loss": 0.2577, + "epoch": 3.847983565790896, + "grad_norm": 0.1985878348350525, + "learning_rate": 6.658247974029427e-06, + "loss": 0.3356, "step": 106770 }, { - "epoch": 3.76, - "learning_rate": 7.694464724397649e-06, - "loss": 0.256, + "epoch": 3.848163765452121, + "grad_norm": 0.24218730628490448, + "learning_rate": 6.656265210072537e-06, + "loss": 0.396, "step": 106775 }, { - "epoch": 3.76, - "learning_rate": 7.692408956761466e-06, - "loss": 0.2563, + "epoch": 3.8483439651133455, + "grad_norm": 0.29759272933006287, + "learning_rate": 6.6542826960454915e-06, + "loss": 0.3804, "step": 106780 }, { - "epoch": 3.76, - "learning_rate": 7.690353413852952e-06, - "loss": 0.2454, + "epoch": 3.8485241647745703, + "grad_norm": 0.340660035610199, + "learning_rate": 6.652300431975292e-06, + "loss": 0.4202, "step": 106785 }, { - "epoch": 3.76, - "learning_rate": 7.688298095698792e-06, - "loss": 0.2379, + "epoch": 3.848704364435795, + "grad_norm": 0.24566273391246796, + "learning_rate": 6.650318417888948e-06, + "loss": 0.396, "step": 106790 }, { - "epoch": 3.76, - "learning_rate": 7.686243002325675e-06, - "loss": 0.2398, + "epoch": 3.8488845640970197, + "grad_norm": 0.2540030777454376, + "learning_rate": 6.648336653813461e-06, + "loss": 0.3947, "step": 106795 }, { - "epoch": 3.76, - "learning_rate": 7.684188133760272e-06, - "loss": 0.2724, + "epoch": 3.849064763758244, + "grad_norm": 0.27518022060394287, + "learning_rate": 6.646355139775828e-06, + "loss": 0.3916, "step": 106800 }, { - "epoch": 3.76, - "learning_rate": 7.682133490029286e-06, - "loss": 0.246, + "epoch": 3.8492449634194688, + "grad_norm": 0.19195982813835144, + "learning_rate": 6.644373875803059e-06, + "loss": 0.3582, "step": 106805 }, { - "epoch": 3.76, - "learning_rate": 7.680079071159379e-06, - "loss": 0.2528, + "epoch": 3.8494251630806935, + "grad_norm": 0.17849093675613403, + "learning_rate": 6.642392861922145e-06, + "loss": 0.4018, "step": 106810 }, { - "epoch": 3.76, - "learning_rate": 7.678024877177234e-06, - "loss": 0.26, + "epoch": 3.849605362741918, + "grad_norm": 0.20975105464458466, + "learning_rate": 6.64041209816007e-06, + "loss": 0.3944, "step": 106815 }, { - "epoch": 3.76, - "learning_rate": 7.675970908109512e-06, - "loss": 0.2595, + "epoch": 3.8497855624031425, + "grad_norm": 0.20660048723220825, + "learning_rate": 6.638431584543827e-06, + "loss": 0.3759, "step": 106820 }, { - "epoch": 3.76, - "learning_rate": 7.6739171639829e-06, - "loss": 0.2548, + "epoch": 3.8499657620643672, + "grad_norm": 0.20970280468463898, + "learning_rate": 6.636451321100401e-06, + "loss": 0.3418, "step": 106825 }, { - "epoch": 3.76, - "learning_rate": 7.671863644824045e-06, - "loss": 0.2683, + "epoch": 3.850145961725592, + "grad_norm": 0.20445165038108826, + "learning_rate": 6.634471307856763e-06, + "loss": 0.3869, "step": 106830 }, { - "epoch": 3.76, - "learning_rate": 7.669810350659629e-06, - "loss": 0.2656, + "epoch": 3.8503261613868167, + "grad_norm": 0.2625582814216614, + "learning_rate": 6.632491544839903e-06, + "loss": 0.4001, "step": 106835 }, { - "epoch": 3.76, - "learning_rate": 7.667757281516296e-06, - "loss": 0.2655, + "epoch": 3.8505063610480414, + "grad_norm": 0.2839062213897705, + "learning_rate": 6.6305120320767885e-06, + "loss": 0.3926, "step": 106840 }, { - "epoch": 3.76, - "learning_rate": 7.665704437420724e-06, - "loss": 0.2698, + "epoch": 3.8506865607092657, + "grad_norm": 0.22392812371253967, + 
"learning_rate": 6.628532769594395e-06, + "loss": 0.4024, "step": 106845 }, { - "epoch": 3.76, - "learning_rate": 7.663651818399553e-06, - "loss": 0.2581, + "epoch": 3.8508667603704905, + "grad_norm": 0.2890661060810089, + "learning_rate": 6.626553757419682e-06, + "loss": 0.3862, "step": 106850 }, { - "epoch": 3.76, - "learning_rate": 7.661599424479437e-06, - "loss": 0.2746, + "epoch": 3.851046960031715, + "grad_norm": 0.29909321665763855, + "learning_rate": 6.62457499557961e-06, + "loss": 0.3887, "step": 106855 }, { - "epoch": 3.76, - "learning_rate": 7.659547255687023e-06, - "loss": 0.2813, + "epoch": 3.85122715969294, + "grad_norm": 0.22355806827545166, + "learning_rate": 6.622596484101156e-06, + "loss": 0.388, "step": 106860 }, { - "epoch": 3.76, - "learning_rate": 7.657495312048963e-06, - "loss": 0.2596, + "epoch": 3.8514073593541642, + "grad_norm": 0.2812059223651886, + "learning_rate": 6.620618223011274e-06, + "loss": 0.3845, "step": 106865 }, { - "epoch": 3.76, - "learning_rate": 7.6554435935919e-06, - "loss": 0.2277, + "epoch": 3.851587559015389, + "grad_norm": 0.28787899017333984, + "learning_rate": 6.618640212336893e-06, + "loss": 0.3785, "step": 106870 }, { - "epoch": 3.76, - "learning_rate": 7.65339210034247e-06, - "loss": 0.2635, + "epoch": 3.8517677586766137, + "grad_norm": 0.21656519174575806, + "learning_rate": 6.616662452104991e-06, + "loss": 0.393, "step": 106875 }, { - "epoch": 3.76, - "learning_rate": 7.651340832327305e-06, - "loss": 0.2561, + "epoch": 3.8519479583378384, + "grad_norm": 0.2218136340379715, + "learning_rate": 6.6146849423424946e-06, + "loss": 0.3646, "step": 106880 }, { - "epoch": 3.76, - "learning_rate": 7.649289789573044e-06, - "loss": 0.2629, + "epoch": 3.852128157999063, + "grad_norm": 0.2481302171945572, + "learning_rate": 6.612707683076369e-06, + "loss": 0.3755, "step": 106885 }, { - "epoch": 3.76, - "learning_rate": 7.64723897210633e-06, - "loss": 0.2495, + "epoch": 3.8523083576602875, + "grad_norm": 0.23201486468315125, + "learning_rate": 6.610730674333537e-06, + "loss": 0.3652, "step": 106890 }, { - "epoch": 3.76, - "learning_rate": 7.645188379953774e-06, - "loss": 0.2622, + "epoch": 3.852488557321512, + "grad_norm": 0.2411048710346222, + "learning_rate": 6.6087539161409305e-06, + "loss": 0.3625, "step": 106895 }, { - "epoch": 3.76, - "learning_rate": 7.643138013142018e-06, - "loss": 0.2551, + "epoch": 3.852668756982737, + "grad_norm": 0.24224650859832764, + "learning_rate": 6.6067774085254995e-06, + "loss": 0.354, "step": 106900 }, { - "epoch": 3.76, - "learning_rate": 7.641087871697677e-06, - "loss": 0.2712, + "epoch": 3.8528489566439617, + "grad_norm": 0.25247785449028015, + "learning_rate": 6.6048011515141646e-06, + "loss": 0.3811, "step": 106905 }, { - "epoch": 3.76, - "learning_rate": 7.63903795564737e-06, - "loss": 0.2408, + "epoch": 3.853029156305186, + "grad_norm": 0.2523113191127777, + "learning_rate": 6.602825145133854e-06, + "loss": 0.4067, "step": 106910 }, { - "epoch": 3.76, - "learning_rate": 7.636988265017702e-06, - "loss": 0.2498, + "epoch": 3.8532093559664107, + "grad_norm": 0.21480904519557953, + "learning_rate": 6.600849389411487e-06, + "loss": 0.388, "step": 106915 }, { - "epoch": 3.76, - "learning_rate": 7.634938799835312e-06, - "loss": 0.2667, + "epoch": 3.8533895556276354, + "grad_norm": 0.2217121422290802, + "learning_rate": 6.59887388437398e-06, + "loss": 0.3805, "step": 106920 }, { - "epoch": 3.76, - "learning_rate": 7.632889560126794e-06, - "loss": 0.2521, + "epoch": 3.85356975528886, + "grad_norm": 
0.2531042695045471, + "learning_rate": 6.5968986300482595e-06, + "loss": 0.3892, "step": 106925 }, { - "epoch": 3.76, - "learning_rate": 7.630840545918761e-06, - "loss": 0.2465, + "epoch": 3.853749954950085, + "grad_norm": 0.27398139238357544, + "learning_rate": 6.594923626461233e-06, + "loss": 0.3802, "step": 106930 }, { - "epoch": 3.76, - "learning_rate": 7.628791757237811e-06, - "loss": 0.2646, + "epoch": 3.853930154611309, + "grad_norm": 0.24913030862808228, + "learning_rate": 6.592948873639807e-06, + "loss": 0.3898, "step": 106935 }, { - "epoch": 3.76, - "learning_rate": 7.626743194110547e-06, - "loss": 0.241, + "epoch": 3.854110354272534, + "grad_norm": 0.23287741839885712, + "learning_rate": 6.590974371610889e-06, + "loss": 0.3705, "step": 106940 }, { - "epoch": 3.76, - "learning_rate": 7.624694856563585e-06, - "loss": 0.2578, + "epoch": 3.8542905539337586, + "grad_norm": 0.23462989926338196, + "learning_rate": 6.589000120401375e-06, + "loss": 0.3583, "step": 106945 }, { - "epoch": 3.76, - "learning_rate": 7.6226467446235075e-06, - "loss": 0.2587, + "epoch": 3.8544707535949834, + "grad_norm": 0.2538296580314636, + "learning_rate": 6.587026120038178e-06, + "loss": 0.393, "step": 106950 }, { - "epoch": 3.76, - "learning_rate": 7.620598858316905e-06, - "loss": 0.263, + "epoch": 3.8546509532562077, + "grad_norm": 0.23619996011257172, + "learning_rate": 6.585052370548192e-06, + "loss": 0.3613, "step": 106955 }, { - "epoch": 3.76, - "learning_rate": 7.618551197670376e-06, - "loss": 0.2499, + "epoch": 3.8548311529174324, + "grad_norm": 0.20444704592227936, + "learning_rate": 6.583078871958287e-06, + "loss": 0.3983, "step": 106960 }, { - "epoch": 3.76, - "learning_rate": 7.616503762710509e-06, - "loss": 0.2554, + "epoch": 3.855011352578657, + "grad_norm": 0.2089342325925827, + "learning_rate": 6.581105624295372e-06, + "loss": 0.3489, "step": 106965 }, { - "epoch": 3.76, - "learning_rate": 7.6144565534638835e-06, - "loss": 0.2798, + "epoch": 3.855191552239882, + "grad_norm": 0.2452775537967682, + "learning_rate": 6.579132627586329e-06, + "loss": 0.3685, "step": 106970 }, { - "epoch": 3.76, - "learning_rate": 7.612409569957074e-06, - "loss": 0.2447, + "epoch": 3.8553717519011066, + "grad_norm": 0.23251672089099884, + "learning_rate": 6.577159881858031e-06, + "loss": 0.3592, "step": 106975 }, { - "epoch": 3.76, - "learning_rate": 7.610362812216676e-06, - "loss": 0.2339, + "epoch": 3.8555519515623313, + "grad_norm": 0.21264806389808655, + "learning_rate": 6.575187387137377e-06, + "loss": 0.3709, "step": 106980 }, { - "epoch": 3.76, - "learning_rate": 7.608316280269253e-06, - "loss": 0.236, + "epoch": 3.8557321512235556, + "grad_norm": 0.20880858600139618, + "learning_rate": 6.5732151434512115e-06, + "loss": 0.36, "step": 106985 }, { - "epoch": 3.76, - "learning_rate": 7.6062699741413775e-06, - "loss": 0.2403, + "epoch": 3.8559123508847803, + "grad_norm": 0.2800189256668091, + "learning_rate": 6.571243150826431e-06, + "loss": 0.3584, "step": 106990 }, { - "epoch": 3.76, - "learning_rate": 7.604223893859622e-06, - "loss": 0.2719, + "epoch": 3.856092550546005, + "grad_norm": 0.27210569381713867, + "learning_rate": 6.569271409289895e-06, + "loss": 0.3647, "step": 106995 }, { - "epoch": 3.76, - "learning_rate": 7.602178039450563e-06, - "loss": 0.2621, + "epoch": 3.8562727502072294, + "grad_norm": 0.22882099449634552, + "learning_rate": 6.567299918868467e-06, + "loss": 0.3507, "step": 107000 }, { - "epoch": 3.76, - "eval_loss": 0.251350462436676, - "eval_runtime": 10.5372, - 
"eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 3.8562727502072294, + "eval_loss": 0.42817628383636475, + "eval_runtime": 3.5212, + "eval_samples_per_second": 28.399, + "eval_steps_per_second": 7.1, "step": 107000 }, { - "epoch": 3.76, - "learning_rate": 7.600132410940758e-06, - "loss": 0.271, + "epoch": 3.856452949868454, + "grad_norm": 0.22581203281879425, + "learning_rate": 6.565328679589008e-06, + "loss": 0.3751, "step": 107005 }, { - "epoch": 3.76, - "learning_rate": 7.598087008356758e-06, - "loss": 0.2491, + "epoch": 3.856633149529679, + "grad_norm": 0.23361347615718842, + "learning_rate": 6.563357691478378e-06, + "loss": 0.3699, "step": 107010 }, { - "epoch": 3.77, - "learning_rate": 7.596041831725137e-06, - "loss": 0.2713, + "epoch": 3.8568133491909036, + "grad_norm": 0.2552787959575653, + "learning_rate": 6.561386954563423e-06, + "loss": 0.368, "step": 107015 }, { - "epoch": 3.77, - "learning_rate": 7.593996881072443e-06, - "loss": 0.2486, + "epoch": 3.8569935488521283, + "grad_norm": 0.19680064916610718, + "learning_rate": 6.559416468871008e-06, + "loss": 0.4097, "step": 107020 }, { - "epoch": 3.77, - "learning_rate": 7.591952156425231e-06, - "loss": 0.2624, + "epoch": 3.857173748513353, + "grad_norm": 0.300270676612854, + "learning_rate": 6.557446234427972e-06, + "loss": 0.3709, "step": 107025 }, { - "epoch": 3.77, - "learning_rate": 7.589907657810039e-06, - "loss": 0.2571, + "epoch": 3.8573539481745773, + "grad_norm": 0.2358800619840622, + "learning_rate": 6.555476251261161e-06, + "loss": 0.3658, "step": 107030 }, { - "epoch": 3.77, - "learning_rate": 7.587863385253427e-06, - "loss": 0.2679, + "epoch": 3.857534147835802, + "grad_norm": 0.2335239201784134, + "learning_rate": 6.553506519397417e-06, + "loss": 0.3828, "step": 107035 }, { - "epoch": 3.77, - "learning_rate": 7.585819338781938e-06, - "loss": 0.2466, + "epoch": 3.857714347497027, + "grad_norm": 0.3097918629646301, + "learning_rate": 6.551537038863567e-06, + "loss": 0.3893, "step": 107040 }, { - "epoch": 3.77, - "learning_rate": 7.583775518422098e-06, - "loss": 0.2415, + "epoch": 3.857894547158251, + "grad_norm": 0.22679081559181213, + "learning_rate": 6.549567809686458e-06, + "loss": 0.3852, "step": 107045 }, { - "epoch": 3.77, - "learning_rate": 7.581731924200464e-06, - "loss": 0.2564, + "epoch": 3.858074746819476, + "grad_norm": 0.23170988261699677, + "learning_rate": 6.547598831892926e-06, + "loss": 0.3755, "step": 107050 }, { - "epoch": 3.77, - "learning_rate": 7.579688556143555e-06, - "loss": 0.2617, + "epoch": 3.8582549464807006, + "grad_norm": 0.24494971334934235, + "learning_rate": 6.545630105509771e-06, + "loss": 0.3614, "step": 107055 }, { - "epoch": 3.77, - "learning_rate": 7.577645414277917e-06, - "loss": 0.2695, + "epoch": 3.8584351461419253, + "grad_norm": 0.22469878196716309, + "learning_rate": 6.543661630563841e-06, + "loss": 0.37, "step": 107060 }, { - "epoch": 3.77, - "learning_rate": 7.575602498630072e-06, - "loss": 0.2421, + "epoch": 3.85861534580315, + "grad_norm": 0.27316057682037354, + "learning_rate": 6.541693407081939e-06, + "loss": 0.3548, "step": 107065 }, { - "epoch": 3.77, - "learning_rate": 7.573559809226536e-06, - "loss": 0.2467, + "epoch": 3.8587955454643748, + "grad_norm": 0.2721122205257416, + "learning_rate": 6.53972543509091e-06, + "loss": 0.3677, "step": 107070 }, { - "epoch": 3.77, - "learning_rate": 7.571517346093851e-06, - "loss": 0.2511, + "epoch": 3.858975745125599, + "grad_norm": 0.2867799699306488, + "learning_rate": 6.537757714617537e-06, + 
"loss": 0.367, "step": 107075 }, { - "epoch": 3.77, - "learning_rate": 7.569475109258525e-06, - "loss": 0.2721, + "epoch": 3.8591559447868238, + "grad_norm": 0.2439620941877365, + "learning_rate": 6.535790245688633e-06, + "loss": 0.3661, "step": 107080 }, { - "epoch": 3.77, - "learning_rate": 7.567433098747079e-06, - "loss": 0.2477, + "epoch": 3.8593361444480485, + "grad_norm": 0.21324770152568817, + "learning_rate": 6.533823028331021e-06, + "loss": 0.3907, "step": 107085 }, { - "epoch": 3.77, - "learning_rate": 7.565391314586018e-06, - "loss": 0.2662, + "epoch": 3.859516344109273, + "grad_norm": 0.24692684412002563, + "learning_rate": 6.5318560625714924e-06, + "loss": 0.3767, "step": 107090 }, { - "epoch": 3.77, - "learning_rate": 7.563349756801869e-06, - "loss": 0.2606, + "epoch": 3.8596965437704975, + "grad_norm": 0.2432447373867035, + "learning_rate": 6.529889348436852e-06, + "loss": 0.3643, "step": 107095 }, { - "epoch": 3.77, - "learning_rate": 7.561308425421124e-06, - "loss": 0.2439, + "epoch": 3.8598767434317223, + "grad_norm": 0.2154054492712021, + "learning_rate": 6.5279228859538935e-06, + "loss": 0.3863, "step": 107100 }, { - "epoch": 3.77, - "learning_rate": 7.559267320470304e-06, - "loss": 0.2411, + "epoch": 3.860056943092947, + "grad_norm": 0.23707520961761475, + "learning_rate": 6.5259566751494e-06, + "loss": 0.3823, "step": 107105 }, { - "epoch": 3.77, - "learning_rate": 7.557226441975898e-06, - "loss": 0.2375, + "epoch": 3.8602371427541717, + "grad_norm": 0.23373231291770935, + "learning_rate": 6.523990716050179e-06, + "loss": 0.393, "step": 107110 }, { - "epoch": 3.77, - "learning_rate": 7.555185789964417e-06, - "loss": 0.258, + "epoch": 3.8604173424153965, + "grad_norm": 0.26256629824638367, + "learning_rate": 6.522025008683008e-06, + "loss": 0.3853, "step": 107115 }, { - "epoch": 3.77, - "learning_rate": 7.5531453644623525e-06, - "loss": 0.2532, + "epoch": 3.8605975420766208, + "grad_norm": 0.23125460743904114, + "learning_rate": 6.520059553074667e-06, + "loss": 0.3477, "step": 107120 }, { - "epoch": 3.77, - "learning_rate": 7.5511051654961915e-06, - "loss": 0.2477, + "epoch": 3.8607777417378455, + "grad_norm": 0.23365911841392517, + "learning_rate": 6.518094349251938e-06, + "loss": 0.3883, "step": 107125 }, { - "epoch": 3.77, - "learning_rate": 7.549065193092436e-06, - "loss": 0.2492, + "epoch": 3.8609579413990702, + "grad_norm": 0.18686018884181976, + "learning_rate": 6.516129397241588e-06, + "loss": 0.376, "step": 107130 }, { - "epoch": 3.77, - "learning_rate": 7.54702544727757e-06, - "loss": 0.2551, + "epoch": 3.861138141060295, + "grad_norm": 0.23002989590168, + "learning_rate": 6.5141646970704015e-06, + "loss": 0.3877, "step": 107135 }, { - "epoch": 3.77, - "learning_rate": 7.544985928078077e-06, - "loss": 0.2316, + "epoch": 3.8613183407215192, + "grad_norm": 0.3117145299911499, + "learning_rate": 6.512200248765146e-06, + "loss": 0.4153, "step": 107140 }, { - "epoch": 3.77, - "learning_rate": 7.542946635520429e-06, - "loss": 0.2486, + "epoch": 3.861498540382744, + "grad_norm": 0.2764013111591339, + "learning_rate": 6.510236052352581e-06, + "loss": 0.398, "step": 107145 }, { - "epoch": 3.77, - "learning_rate": 7.5409075696311225e-06, - "loss": 0.2684, + "epoch": 3.8616787400439687, + "grad_norm": 0.3027382493019104, + "learning_rate": 6.508272107859467e-06, + "loss": 0.3804, "step": 107150 }, { - "epoch": 3.77, - "learning_rate": 7.538868730436627e-06, - "loss": 0.2515, + "epoch": 3.8618589397051934, + "grad_norm": 0.20171962678432465, + "learning_rate": 
6.50630841531257e-06, + "loss": 0.3427, "step": 107155 }, { - "epoch": 3.77, - "learning_rate": 7.536830117963403e-06, - "loss": 0.2779, + "epoch": 3.862039139366418, + "grad_norm": 0.2870292663574219, + "learning_rate": 6.50434497473863e-06, + "loss": 0.3651, "step": 107160 }, { - "epoch": 3.77, - "learning_rate": 7.534791732237934e-06, - "loss": 0.2644, + "epoch": 3.8622193390276425, + "grad_norm": 0.2380640059709549, + "learning_rate": 6.502381786164424e-06, + "loss": 0.3692, "step": 107165 }, { - "epoch": 3.77, - "learning_rate": 7.532753573286691e-06, - "loss": 0.2374, + "epoch": 3.862399538688867, + "grad_norm": 0.2691294550895691, + "learning_rate": 6.500418849616671e-06, + "loss": 0.3916, "step": 107170 }, { - "epoch": 3.77, - "learning_rate": 7.530715641136129e-06, - "loss": 0.2743, + "epoch": 3.862579738350092, + "grad_norm": 0.27460694313049316, + "learning_rate": 6.498456165122139e-06, + "loss": 0.379, "step": 107175 }, { - "epoch": 3.77, - "learning_rate": 7.528677935812714e-06, - "loss": 0.2512, + "epoch": 3.8627599380113167, + "grad_norm": 0.24331451952457428, + "learning_rate": 6.496493732707556e-06, + "loss": 0.3881, "step": 107180 }, { - "epoch": 3.77, - "learning_rate": 7.526640457342896e-06, - "loss": 0.2593, + "epoch": 3.862940137672541, + "grad_norm": 0.2511681020259857, + "learning_rate": 6.494531552399666e-06, + "loss": 0.371, "step": 107185 }, { - "epoch": 3.77, - "learning_rate": 7.5246032057531415e-06, - "loss": 0.246, + "epoch": 3.8631203373337657, + "grad_norm": 0.23705346882343292, + "learning_rate": 6.4925696242251975e-06, + "loss": 0.3931, "step": 107190 }, { - "epoch": 3.77, - "learning_rate": 7.522566181069899e-06, - "loss": 0.2391, + "epoch": 3.8633005369949904, + "grad_norm": 0.24382388591766357, + "learning_rate": 6.490607948210889e-06, + "loss": 0.3663, "step": 107195 }, { - "epoch": 3.77, - "learning_rate": 7.520529383319616e-06, - "loss": 0.2643, + "epoch": 3.863480736656215, + "grad_norm": 0.29807329177856445, + "learning_rate": 6.488646524383454e-06, + "loss": 0.3872, "step": 107200 }, { - "epoch": 3.77, - "learning_rate": 7.518492812528732e-06, - "loss": 0.2446, + "epoch": 3.86366093631744, + "grad_norm": 0.18684354424476624, + "learning_rate": 6.486685352769634e-06, + "loss": 0.363, "step": 107205 }, { - "epoch": 3.77, - "learning_rate": 7.516456468723707e-06, - "loss": 0.2642, + "epoch": 3.8638411359786646, + "grad_norm": 0.20785370469093323, + "learning_rate": 6.484724433396141e-06, + "loss": 0.3894, "step": 107210 }, { - "epoch": 3.77, - "learning_rate": 7.514420351930962e-06, - "loss": 0.284, + "epoch": 3.864021335639889, + "grad_norm": 0.2543993890285492, + "learning_rate": 6.482763766289693e-06, + "loss": 0.3472, "step": 107215 }, { - "epoch": 3.77, - "learning_rate": 7.512384462176955e-06, - "loss": 0.2778, + "epoch": 3.8642015353011137, + "grad_norm": 0.31936973333358765, + "learning_rate": 6.480803351477005e-06, + "loss": 0.3948, "step": 107220 }, { - "epoch": 3.77, - "learning_rate": 7.510348799488104e-06, - "loss": 0.2793, + "epoch": 3.8643817349623384, + "grad_norm": 0.33362480998039246, + "learning_rate": 6.478843188984776e-06, + "loss": 0.3973, "step": 107225 }, { - "epoch": 3.77, - "learning_rate": 7.508313363890853e-06, - "loss": 0.2586, + "epoch": 3.8645619346235627, + "grad_norm": 0.2270936667919159, + "learning_rate": 6.476883278839732e-06, + "loss": 0.3745, "step": 107230 }, { - "epoch": 3.77, - "learning_rate": 7.506278155411628e-06, - "loss": 0.2428, + "epoch": 3.8647421342847874, + "grad_norm": 0.22412335872650146, + 
"learning_rate": 6.474923621068574e-06, + "loss": 0.3999, "step": 107235 }, { - "epoch": 3.77, - "learning_rate": 7.504243174076841e-06, - "loss": 0.2601, + "epoch": 3.864922333946012, + "grad_norm": 0.23207208514213562, + "learning_rate": 6.472964215697982e-06, + "loss": 0.4043, "step": 107240 }, { - "epoch": 3.77, - "learning_rate": 7.502208419912937e-06, - "loss": 0.2664, + "epoch": 3.865102533607237, + "grad_norm": 0.23929710686206818, + "learning_rate": 6.47100506275467e-06, + "loss": 0.3865, "step": 107245 }, { - "epoch": 3.77, - "learning_rate": 7.500173892946328e-06, - "loss": 0.2744, + "epoch": 3.8652827332684616, + "grad_norm": 0.22255569696426392, + "learning_rate": 6.469046162265322e-06, + "loss": 0.3684, "step": 107250 }, { - "epoch": 3.77, - "learning_rate": 7.498139593203424e-06, - "loss": 0.2545, + "epoch": 3.8654629329296863, + "grad_norm": 0.2753349840641022, + "learning_rate": 6.467087514256645e-06, + "loss": 0.3796, "step": 107255 }, { - "epoch": 3.77, - "learning_rate": 7.496105520710636e-06, - "loss": 0.2582, + "epoch": 3.8656431325909106, + "grad_norm": 0.2791849374771118, + "learning_rate": 6.465129118755309e-06, + "loss": 0.3707, "step": 107260 }, { - "epoch": 3.77, - "learning_rate": 7.494071675494391e-06, - "loss": 0.263, + "epoch": 3.8658233322521354, + "grad_norm": 0.2520217299461365, + "learning_rate": 6.4631709757879885e-06, + "loss": 0.3908, "step": 107265 }, { - "epoch": 3.77, - "learning_rate": 7.49203805758108e-06, - "loss": 0.2693, + "epoch": 3.86600353191336, + "grad_norm": 0.2589038014411926, + "learning_rate": 6.461213085381384e-06, + "loss": 0.3644, "step": 107270 }, { - "epoch": 3.77, - "learning_rate": 7.490004666997125e-06, - "loss": 0.2386, + "epoch": 3.8661837315745844, + "grad_norm": 0.21371279656887054, + "learning_rate": 6.459255447562155e-06, + "loss": 0.3742, "step": 107275 }, { - "epoch": 3.77, - "learning_rate": 7.487971503768909e-06, - "loss": 0.2503, + "epoch": 3.866363931235809, + "grad_norm": 0.24295492470264435, + "learning_rate": 6.457298062356995e-06, + "loss": 0.3742, "step": 107280 }, { - "epoch": 3.77, - "learning_rate": 7.485938567922854e-06, - "loss": 0.2404, + "epoch": 3.866544130897034, + "grad_norm": 0.2283410131931305, + "learning_rate": 6.45534092979255e-06, + "loss": 0.3884, "step": 107285 }, { - "epoch": 3.77, - "learning_rate": 7.483905859485344e-06, - "loss": 0.2657, + "epoch": 3.8667243305582586, + "grad_norm": 0.20489716529846191, + "learning_rate": 6.453384049895489e-06, + "loss": 0.3688, "step": 107290 }, { - "epoch": 3.77, - "learning_rate": 7.481873378482771e-06, - "loss": 0.2587, + "epoch": 3.8669045302194833, + "grad_norm": 0.3497338891029358, + "learning_rate": 6.451427422692485e-06, + "loss": 0.3847, "step": 107295 }, { - "epoch": 3.78, - "learning_rate": 7.47984112494152e-06, - "loss": 0.2653, + "epoch": 3.867084729880708, + "grad_norm": 0.29012593626976013, + "learning_rate": 6.449471048210193e-06, + "loss": 0.3362, "step": 107300 }, { - "epoch": 3.78, - "learning_rate": 7.477809098887994e-06, - "loss": 0.2493, + "epoch": 3.8672649295419324, + "grad_norm": 0.24133215844631195, + "learning_rate": 6.4475149264752675e-06, + "loss": 0.3658, "step": 107305 }, { - "epoch": 3.78, - "learning_rate": 7.475777300348568e-06, - "loss": 0.2677, + "epoch": 3.867445129203157, + "grad_norm": 0.26846662163734436, + "learning_rate": 6.445559057514358e-06, + "loss": 0.3976, "step": 107310 }, { - "epoch": 3.78, - "learning_rate": 7.473745729349627e-06, - "loss": 0.2736, + "epoch": 3.867625328864382, + "grad_norm": 
0.2922466993331909, + "learning_rate": 6.443603441354107e-06, + "loss": 0.404, "step": 107315 }, { - "epoch": 3.78, - "learning_rate": 7.471714385917539e-06, - "loss": 0.253, + "epoch": 3.867805528525606, + "grad_norm": 0.24670326709747314, + "learning_rate": 6.441648078021173e-06, + "loss": 0.3723, "step": 107320 }, { - "epoch": 3.78, - "learning_rate": 7.46968327007869e-06, - "loss": 0.2603, + "epoch": 3.867985728186831, + "grad_norm": 0.29100772738456726, + "learning_rate": 6.439692967542191e-06, + "loss": 0.4027, "step": 107325 }, { - "epoch": 3.78, - "learning_rate": 7.467652381859455e-06, - "loss": 0.2642, + "epoch": 3.8681659278480556, + "grad_norm": 0.21996453404426575, + "learning_rate": 6.4377381099438e-06, + "loss": 0.4048, "step": 107330 }, { - "epoch": 3.78, - "learning_rate": 7.4656217212862e-06, - "loss": 0.241, + "epoch": 3.8683461275092803, + "grad_norm": 0.2679681181907654, + "learning_rate": 6.435783505252632e-06, + "loss": 0.3685, "step": 107335 }, { - "epoch": 3.78, - "learning_rate": 7.463591288385285e-06, - "loss": 0.2435, + "epoch": 3.868526327170505, + "grad_norm": 0.2263759821653366, + "learning_rate": 6.4338291534953215e-06, + "loss": 0.3657, "step": 107340 }, { - "epoch": 3.78, - "learning_rate": 7.461561083183088e-06, - "loss": 0.2613, + "epoch": 3.8687065268317298, + "grad_norm": 0.2599135637283325, + "learning_rate": 6.431875054698486e-06, + "loss": 0.3653, "step": 107345 }, { - "epoch": 3.78, - "learning_rate": 7.459531105705964e-06, - "loss": 0.2122, + "epoch": 3.868886726492954, + "grad_norm": 0.28875550627708435, + "learning_rate": 6.429921208888773e-06, + "loss": 0.3754, "step": 107350 }, { - "epoch": 3.78, - "learning_rate": 7.457501355980256e-06, - "loss": 0.2452, + "epoch": 3.869066926154179, + "grad_norm": 0.23925039172172546, + "learning_rate": 6.4279676160927725e-06, + "loss": 0.4144, "step": 107355 }, { - "epoch": 3.78, - "learning_rate": 7.455471834032343e-06, - "loss": 0.2441, + "epoch": 3.8692471258154035, + "grad_norm": 0.3068399727344513, + "learning_rate": 6.426014276337125e-06, + "loss": 0.3856, "step": 107360 }, { - "epoch": 3.78, - "learning_rate": 7.453442539888566e-06, - "loss": 0.2422, + "epoch": 3.8694273254766283, + "grad_norm": 0.18254274129867554, + "learning_rate": 6.4240611896484365e-06, + "loss": 0.3931, "step": 107365 }, { - "epoch": 3.78, - "learning_rate": 7.451413473575275e-06, - "loss": 0.258, + "epoch": 3.8696075251378526, + "grad_norm": 0.20957380533218384, + "learning_rate": 6.422108356053319e-06, + "loss": 0.3773, "step": 107370 }, { - "epoch": 3.78, - "learning_rate": 7.449384635118805e-06, - "loss": 0.2488, + "epoch": 3.8697877247990773, + "grad_norm": 0.20488250255584717, + "learning_rate": 6.420155775578379e-06, + "loss": 0.3621, "step": 107375 }, { - "epoch": 3.78, - "learning_rate": 7.44735602454551e-06, - "loss": 0.2523, + "epoch": 3.869967924460302, + "grad_norm": 0.30153316259384155, + "learning_rate": 6.418203448250218e-06, + "loss": 0.4062, "step": 107380 }, { - "epoch": 3.78, - "learning_rate": 7.445327641881739e-06, - "loss": 0.2399, + "epoch": 3.8701481241215268, + "grad_norm": 0.27384403347969055, + "learning_rate": 6.416251374095431e-06, + "loss": 0.4122, "step": 107385 }, { - "epoch": 3.78, - "learning_rate": 7.443299487153818e-06, - "loss": 0.2626, + "epoch": 3.8703283237827515, + "grad_norm": 0.317130446434021, + "learning_rate": 6.414299553140629e-06, + "loss": 0.4039, "step": 107390 }, { - "epoch": 3.78, - "learning_rate": 7.4412715603880745e-06, - "loss": 0.274, + "epoch": 3.870508523443976, + 
"grad_norm": 0.21292860805988312, + "learning_rate": 6.412347985412395e-06, + "loss": 0.3729, "step": 107395 }, { - "epoch": 3.78, - "learning_rate": 7.439243861610856e-06, - "loss": 0.2693, + "epoch": 3.8706887231052005, + "grad_norm": 0.2204897701740265, + "learning_rate": 6.410396670937325e-06, + "loss": 0.3625, "step": 107400 }, { - "epoch": 3.78, - "learning_rate": 7.437216390848484e-06, - "loss": 0.2787, + "epoch": 3.8708689227664252, + "grad_norm": 0.2745380997657776, + "learning_rate": 6.4084456097419976e-06, + "loss": 0.4188, "step": 107405 }, { - "epoch": 3.78, - "learning_rate": 7.435189148127283e-06, - "loss": 0.2647, + "epoch": 3.87104912242765, + "grad_norm": 0.28390398621559143, + "learning_rate": 6.4064948018529915e-06, + "loss": 0.39, "step": 107410 }, { - "epoch": 3.78, - "learning_rate": 7.433162133473568e-06, - "loss": 0.2889, + "epoch": 3.8712293220888743, + "grad_norm": 0.2293710857629776, + "learning_rate": 6.404544247296903e-06, + "loss": 0.3506, "step": 107415 }, { - "epoch": 3.78, - "learning_rate": 7.43113534691367e-06, - "loss": 0.2834, + "epoch": 3.871409521750099, + "grad_norm": 0.22023165225982666, + "learning_rate": 6.4025939461003075e-06, + "loss": 0.3471, "step": 107420 }, { - "epoch": 3.78, - "learning_rate": 7.429108788473904e-06, - "loss": 0.2552, + "epoch": 3.8715897214113237, + "grad_norm": 0.25889191031455994, + "learning_rate": 6.400643898289751e-06, + "loss": 0.3967, "step": 107425 }, { - "epoch": 3.78, - "learning_rate": 7.427082458180573e-06, - "loss": 0.2517, + "epoch": 3.8717699210725485, + "grad_norm": 0.2643846273422241, + "learning_rate": 6.3986941038918305e-06, + "loss": 0.394, "step": 107430 }, { - "epoch": 3.78, - "learning_rate": 7.425056356059995e-06, - "loss": 0.2573, + "epoch": 3.871950120733773, + "grad_norm": 0.205980584025383, + "learning_rate": 6.396744562933094e-06, + "loss": 0.3505, "step": 107435 }, { - "epoch": 3.78, - "learning_rate": 7.423030482138485e-06, - "loss": 0.2585, + "epoch": 3.8721303203949975, + "grad_norm": 0.24297955632209778, + "learning_rate": 6.394795275440118e-06, + "loss": 0.3931, "step": 107440 }, { - "epoch": 3.78, - "learning_rate": 7.421004836442341e-06, - "loss": 0.2431, + "epoch": 3.8723105200562222, + "grad_norm": 0.22801950573921204, + "learning_rate": 6.392846241439462e-06, + "loss": 0.3853, "step": 107445 }, { - "epoch": 3.78, - "learning_rate": 7.418979418997865e-06, - "loss": 0.2421, + "epoch": 3.872490719717447, + "grad_norm": 0.20344804227352142, + "learning_rate": 6.390897460957657e-06, + "loss": 0.3943, "step": 107450 }, { - "epoch": 3.78, - "learning_rate": 7.416954229831347e-06, - "loss": 0.2353, + "epoch": 3.8726709193786717, + "grad_norm": 0.24186420440673828, + "learning_rate": 6.38894893402128e-06, + "loss": 0.368, "step": 107455 }, { - "epoch": 3.78, - "learning_rate": 7.414929268969095e-06, - "loss": 0.2701, + "epoch": 3.872851119039896, + "grad_norm": 0.28862589597702026, + "learning_rate": 6.387000660656869e-06, + "loss": 0.4025, "step": 107460 }, { - "epoch": 3.78, - "learning_rate": 7.412904536437401e-06, - "loss": 0.2685, + "epoch": 3.8730313187011207, + "grad_norm": 0.2653532922267914, + "learning_rate": 6.385052640890973e-06, + "loss": 0.4228, "step": 107465 }, { - "epoch": 3.78, - "learning_rate": 7.410880032262543e-06, - "loss": 0.2601, + "epoch": 3.8732115183623455, + "grad_norm": 0.20793728530406952, + "learning_rate": 6.383104874750129e-06, + "loss": 0.3892, "step": 107470 }, { - "epoch": 3.78, - "learning_rate": 7.408855756470823e-06, - "loss": 0.249, + "epoch": 
3.87339171802357, + "grad_norm": 0.20338398218154907, + "learning_rate": 6.38115736226087e-06, + "loss": 0.3623, "step": 107475 }, { - "epoch": 3.78, - "learning_rate": 7.406831709088519e-06, - "loss": 0.2613, + "epoch": 3.873571917684795, + "grad_norm": 0.27274417877197266, + "learning_rate": 6.3792101034497454e-06, + "loss": 0.4013, "step": 107480 }, { - "epoch": 3.78, - "learning_rate": 7.4048078901419045e-06, - "loss": 0.2371, + "epoch": 3.8737521173460197, + "grad_norm": 0.21529635787010193, + "learning_rate": 6.3772630983432776e-06, + "loss": 0.3646, "step": 107485 }, { - "epoch": 3.78, - "learning_rate": 7.40278429965727e-06, - "loss": 0.2581, + "epoch": 3.873932317007244, + "grad_norm": 0.24685950577259064, + "learning_rate": 6.375316346967994e-06, + "loss": 0.3973, "step": 107490 }, { - "epoch": 3.78, - "learning_rate": 7.400760937660875e-06, - "loss": 0.28, + "epoch": 3.8741125166684687, + "grad_norm": 0.1983158439397812, + "learning_rate": 6.373369849350419e-06, + "loss": 0.368, "step": 107495 }, { - "epoch": 3.78, - "learning_rate": 7.39873780417901e-06, - "loss": 0.2608, + "epoch": 3.8742927163296934, + "grad_norm": 0.27724209427833557, + "learning_rate": 6.371423605517066e-06, + "loss": 0.3422, "step": 107500 }, { - "epoch": 3.78, - "eval_loss": 0.25113674998283386, - "eval_runtime": 10.5445, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 3.8742927163296934, + "eval_loss": 0.4283193349838257, + "eval_runtime": 3.5283, + "eval_samples_per_second": 28.342, + "eval_steps_per_second": 7.086, "step": 107500 }, { - "epoch": 3.78, - "learning_rate": 7.3967148992379356e-06, - "loss": 0.2622, + "epoch": 3.8744729159909177, + "grad_norm": 0.25930055975914, + "learning_rate": 6.369477615494468e-06, + "loss": 0.3701, "step": 107505 }, { - "epoch": 3.78, - "learning_rate": 7.394692222863908e-06, - "loss": 0.249, + "epoch": 3.8746531156521424, + "grad_norm": 0.25892987847328186, + "learning_rate": 6.3675318793091295e-06, + "loss": 0.3995, "step": 107510 }, { - "epoch": 3.78, - "learning_rate": 7.3926697750832105e-06, - "loss": 0.2459, + "epoch": 3.874833315313367, + "grad_norm": 0.22906829416751862, + "learning_rate": 6.365586396987563e-06, + "loss": 0.3731, "step": 107515 }, { - "epoch": 3.78, - "learning_rate": 7.390647555922089e-06, - "loss": 0.2483, + "epoch": 3.875013514974592, + "grad_norm": 0.2710282802581787, + "learning_rate": 6.363641168556275e-06, + "loss": 0.4196, "step": 107520 }, { - "epoch": 3.78, - "learning_rate": 7.388625565406806e-06, - "loss": 0.2719, + "epoch": 3.8751937146358166, + "grad_norm": 0.24969340860843658, + "learning_rate": 6.361696194041766e-06, + "loss": 0.4083, "step": 107525 }, { - "epoch": 3.78, - "learning_rate": 7.386603803563605e-06, - "loss": 0.2514, + "epoch": 3.8753739142970414, + "grad_norm": 0.28726011514663696, + "learning_rate": 6.359751473470532e-06, + "loss": 0.4036, "step": 107530 }, { - "epoch": 3.78, - "learning_rate": 7.384582270418753e-06, - "loss": 0.2525, + "epoch": 3.8755541139582657, + "grad_norm": 0.2590154707431793, + "learning_rate": 6.35780700686909e-06, + "loss": 0.3776, "step": 107535 }, { - "epoch": 3.78, - "learning_rate": 7.382560965998483e-06, - "loss": 0.2545, + "epoch": 3.8757343136194904, + "grad_norm": 0.22933447360992432, + "learning_rate": 6.355862794263903e-06, + "loss": 0.3781, "step": 107540 }, { - "epoch": 3.78, - "learning_rate": 7.380539890329058e-06, - "loss": 0.2962, + "epoch": 3.875914513280715, + "grad_norm": 0.24526602029800415, + "learning_rate": 6.353918835681483e-06, + 
"loss": 0.3529, "step": 107545 }, { - "epoch": 3.78, - "learning_rate": 7.378519043436704e-06, - "loss": 0.2555, + "epoch": 3.8760947129419394, + "grad_norm": 0.24101245403289795, + "learning_rate": 6.351975131148308e-06, + "loss": 0.3841, "step": 107550 }, { - "epoch": 3.78, - "learning_rate": 7.3764984253476724e-06, - "loss": 0.2605, + "epoch": 3.876274912603164, + "grad_norm": 0.23179291188716888, + "learning_rate": 6.350031680690854e-06, + "loss": 0.3638, "step": 107555 }, { - "epoch": 3.78, - "learning_rate": 7.374478036088195e-06, - "loss": 0.2702, + "epoch": 3.876455112264389, + "grad_norm": 0.25088703632354736, + "learning_rate": 6.348088484335624e-06, + "loss": 0.3617, "step": 107560 }, { - "epoch": 3.78, - "learning_rate": 7.3724578756845076e-06, - "loss": 0.2675, + "epoch": 3.8766353119256136, + "grad_norm": 0.2836175262928009, + "learning_rate": 6.346145542109069e-06, + "loss": 0.3649, "step": 107565 }, { - "epoch": 3.78, - "learning_rate": 7.370437944162825e-06, - "loss": 0.2398, + "epoch": 3.8768155115868383, + "grad_norm": 0.22787512838840485, + "learning_rate": 6.344202854037662e-06, + "loss": 0.3655, "step": 107570 }, { - "epoch": 3.78, - "learning_rate": 7.368418241549399e-06, - "loss": 0.272, + "epoch": 3.876995711248063, + "grad_norm": 0.24610376358032227, + "learning_rate": 6.342260420147889e-06, + "loss": 0.3755, "step": 107575 }, { - "epoch": 3.78, - "learning_rate": 7.366398767870441e-06, - "loss": 0.2472, + "epoch": 3.8771759109092874, + "grad_norm": 0.270090788602829, + "learning_rate": 6.340318240466203e-06, + "loss": 0.3885, "step": 107580 }, { - "epoch": 3.79, - "learning_rate": 7.3643795231521675e-06, - "loss": 0.2548, + "epoch": 3.877356110570512, + "grad_norm": 0.24736785888671875, + "learning_rate": 6.3383763150190675e-06, + "loss": 0.3802, "step": 107585 }, { - "epoch": 3.79, - "learning_rate": 7.362360507420809e-06, - "loss": 0.2573, + "epoch": 3.877536310231737, + "grad_norm": 0.22846059501171112, + "learning_rate": 6.336434643832942e-06, + "loss": 0.3796, "step": 107590 }, { - "epoch": 3.79, - "learning_rate": 7.360341720702577e-06, - "loss": 0.2434, + "epoch": 3.877716509892961, + "grad_norm": 0.20484845340251923, + "learning_rate": 6.334493226934276e-06, + "loss": 0.3389, "step": 107595 }, { - "epoch": 3.79, - "learning_rate": 7.358323163023675e-06, - "loss": 0.2234, + "epoch": 3.877896709554186, + "grad_norm": 0.2559860050678253, + "learning_rate": 6.3325520643495314e-06, + "loss": 0.3823, "step": 107600 }, { - "epoch": 3.79, - "learning_rate": 7.356304834410321e-06, - "loss": 0.2456, + "epoch": 3.8780769092154106, + "grad_norm": 0.20136897265911102, + "learning_rate": 6.330611156105151e-06, + "loss": 0.38, "step": 107605 }, { - "epoch": 3.79, - "learning_rate": 7.354286734888729e-06, - "loss": 0.2644, + "epoch": 3.8782571088766353, + "grad_norm": 0.20288234949111938, + "learning_rate": 6.328670502227579e-06, + "loss": 0.3778, "step": 107610 }, { - "epoch": 3.79, - "learning_rate": 7.352268864485093e-06, - "loss": 0.2681, + "epoch": 3.87843730853786, + "grad_norm": 0.2542252838611603, + "learning_rate": 6.326730102743259e-06, + "loss": 0.3886, "step": 107615 }, { - "epoch": 3.79, - "learning_rate": 7.350251223225619e-06, - "loss": 0.2773, + "epoch": 3.878617508199085, + "grad_norm": 0.2526249587535858, + "learning_rate": 6.324789957678617e-06, + "loss": 0.3643, "step": 107620 }, { - "epoch": 3.79, - "learning_rate": 7.34823381113649e-06, - "loss": 0.2726, + "epoch": 3.878797707860309, + "grad_norm": 0.20341147482395172, + "learning_rate": 
6.3228500670601015e-06, + "loss": 0.389, "step": 107625 }, { - "epoch": 3.79, - "learning_rate": 7.346216628243921e-06, - "loss": 0.2355, + "epoch": 3.878977907521534, + "grad_norm": 0.2713795602321625, + "learning_rate": 6.320910430914148e-06, + "loss": 0.4011, "step": 107630 }, { - "epoch": 3.79, - "learning_rate": 7.344199674574093e-06, - "loss": 0.2691, + "epoch": 3.8791581071827586, + "grad_norm": 0.2428542822599411, + "learning_rate": 6.318971049267159e-06, + "loss": 0.3911, "step": 107635 }, { - "epoch": 3.79, - "learning_rate": 7.342182950153198e-06, - "loss": 0.2433, + "epoch": 3.8793383068439833, + "grad_norm": 0.2431861311197281, + "learning_rate": 6.317031922145577e-06, + "loss": 0.4293, "step": 107640 }, { - "epoch": 3.79, - "learning_rate": 7.340166455007414e-06, - "loss": 0.2368, + "epoch": 3.8795185065052076, + "grad_norm": 0.23291923105716705, + "learning_rate": 6.315093049575821e-06, + "loss": 0.3808, "step": 107645 }, { - "epoch": 3.79, - "learning_rate": 7.338150189162937e-06, - "loss": 0.2437, + "epoch": 3.8796987061664323, + "grad_norm": 0.287264347076416, + "learning_rate": 6.313154431584303e-06, + "loss": 0.4031, "step": 107650 }, { - "epoch": 3.79, - "learning_rate": 7.336134152645932e-06, - "loss": 0.264, + "epoch": 3.879878905827657, + "grad_norm": 0.24084652960300446, + "learning_rate": 6.31121606819744e-06, + "loss": 0.3864, "step": 107655 }, { - "epoch": 3.79, - "learning_rate": 7.334118345482593e-06, - "loss": 0.2433, + "epoch": 3.8800591054888818, + "grad_norm": 0.25108230113983154, + "learning_rate": 6.309277959441629e-06, + "loss": 0.4094, "step": 107660 }, { - "epoch": 3.79, - "learning_rate": 7.332102767699076e-06, - "loss": 0.2492, + "epoch": 3.8802393051501065, + "grad_norm": 0.24399013817310333, + "learning_rate": 6.307340105343298e-06, + "loss": 0.3851, "step": 107665 }, { - "epoch": 3.79, - "learning_rate": 7.3300874193215715e-06, - "loss": 0.2647, + "epoch": 3.880419504811331, + "grad_norm": 0.20561757683753967, + "learning_rate": 6.305402505928837e-06, + "loss": 0.3682, "step": 107670 }, { - "epoch": 3.79, - "learning_rate": 7.328072300376234e-06, - "loss": 0.2439, + "epoch": 3.8805997044725555, + "grad_norm": 0.2230025976896286, + "learning_rate": 6.3034651612246476e-06, + "loss": 0.4135, "step": 107675 }, { - "epoch": 3.79, - "learning_rate": 7.326057410889231e-06, - "loss": 0.2655, + "epoch": 3.8807799041337803, + "grad_norm": 0.2320537567138672, + "learning_rate": 6.301528071257127e-06, + "loss": 0.3782, "step": 107680 }, { - "epoch": 3.79, - "learning_rate": 7.32404275088672e-06, - "loss": 0.2685, + "epoch": 3.880960103795005, + "grad_norm": 0.28236204385757446, + "learning_rate": 6.299591236052657e-06, + "loss": 0.4315, "step": 107685 }, { - "epoch": 3.79, - "learning_rate": 7.32202832039487e-06, - "loss": 0.2541, + "epoch": 3.8811403034562293, + "grad_norm": 0.24441473186016083, + "learning_rate": 6.297654655637644e-06, + "loss": 0.3819, "step": 107690 }, { - "epoch": 3.79, - "learning_rate": 7.320014119439833e-06, - "loss": 0.254, + "epoch": 3.881320503117454, + "grad_norm": 0.2429056018590927, + "learning_rate": 6.295718330038466e-06, + "loss": 0.4267, "step": 107695 }, { - "epoch": 3.79, - "learning_rate": 7.318000148047754e-06, - "loss": 0.2618, + "epoch": 3.8815007027786788, + "grad_norm": 0.26638370752334595, + "learning_rate": 6.293782259281503e-06, + "loss": 0.3785, "step": 107700 }, { - "epoch": 3.79, - "learning_rate": 7.315986406244798e-06, - "loss": 0.255, + "epoch": 3.8816809024399035, + "grad_norm": 0.23976294696331024, + 
"learning_rate": 6.291846443393137e-06, + "loss": 0.3971, "step": 107705 }, { - "epoch": 3.79, - "learning_rate": 7.313972894057094e-06, - "loss": 0.2475, + "epoch": 3.8818611021011282, + "grad_norm": 0.2684692144393921, + "learning_rate": 6.289910882399741e-06, + "loss": 0.4069, "step": 107710 }, { - "epoch": 3.79, - "learning_rate": 7.311959611510807e-06, - "loss": 0.2343, + "epoch": 3.882041301762353, + "grad_norm": 0.2549930512905121, + "learning_rate": 6.287975576327679e-06, + "loss": 0.379, "step": 107715 }, { - "epoch": 3.79, - "learning_rate": 7.309946558632061e-06, - "loss": 0.2428, + "epoch": 3.8822215014235772, + "grad_norm": 0.24471694231033325, + "learning_rate": 6.286040525203332e-06, + "loss": 0.3727, "step": 107720 }, { - "epoch": 3.79, - "learning_rate": 7.307933735447006e-06, - "loss": 0.2493, + "epoch": 3.882401701084802, + "grad_norm": 0.2403818666934967, + "learning_rate": 6.284105729053064e-06, + "loss": 0.3695, "step": 107725 }, { - "epoch": 3.79, - "learning_rate": 7.305921141981772e-06, - "loss": 0.2648, + "epoch": 3.8825819007460267, + "grad_norm": 0.24353429675102234, + "learning_rate": 6.28217118790323e-06, + "loss": 0.394, "step": 107730 }, { - "epoch": 3.79, - "learning_rate": 7.303908778262491e-06, - "loss": 0.2529, + "epoch": 3.882762100407251, + "grad_norm": 0.17580963671207428, + "learning_rate": 6.28023690178019e-06, + "loss": 0.3487, "step": 107735 }, { - "epoch": 3.79, - "learning_rate": 7.301896644315287e-06, - "loss": 0.2375, + "epoch": 3.8829423000684757, + "grad_norm": 0.21670427918434143, + "learning_rate": 6.27830287071029e-06, + "loss": 0.3557, "step": 107740 }, { - "epoch": 3.79, - "learning_rate": 7.299884740166297e-06, - "loss": 0.2622, + "epoch": 3.8831224997297005, + "grad_norm": 0.26354971528053284, + "learning_rate": 6.276369094719903e-06, + "loss": 0.3939, "step": 107745 }, { - "epoch": 3.79, - "learning_rate": 7.29787306584164e-06, - "loss": 0.2582, + "epoch": 3.883302699390925, + "grad_norm": 0.24392694234848022, + "learning_rate": 6.274435573835355e-06, + "loss": 0.3638, "step": 107750 }, { - "epoch": 3.79, - "learning_rate": 7.295861621367434e-06, - "loss": 0.2783, + "epoch": 3.88348289905215, + "grad_norm": 0.2409679889678955, + "learning_rate": 6.272502308082989e-06, + "loss": 0.3691, "step": 107755 }, { - "epoch": 3.79, - "learning_rate": 7.293850406769789e-06, - "loss": 0.2627, + "epoch": 3.8836630987133747, + "grad_norm": 0.24243175983428955, + "learning_rate": 6.270569297489162e-06, + "loss": 0.3579, "step": 107760 }, { - "epoch": 3.79, - "learning_rate": 7.2918394220748306e-06, - "loss": 0.2553, + "epoch": 3.883843298374599, + "grad_norm": 0.19919145107269287, + "learning_rate": 6.2686365420802e-06, + "loss": 0.3814, "step": 107765 }, { - "epoch": 3.79, - "learning_rate": 7.289828667308673e-06, - "loss": 0.2623, + "epoch": 3.8840234980358237, + "grad_norm": 0.2675543427467346, + "learning_rate": 6.266704041882443e-06, + "loss": 0.3378, "step": 107770 }, { - "epoch": 3.79, - "learning_rate": 7.287818142497418e-06, - "loss": 0.2408, + "epoch": 3.8842036976970484, + "grad_norm": 0.2584518492221832, + "learning_rate": 6.264771796922212e-06, + "loss": 0.361, "step": 107775 }, { - "epoch": 3.79, - "learning_rate": 7.285807847667167e-06, - "loss": 0.2554, + "epoch": 3.8843838973582727, + "grad_norm": 0.23442451655864716, + "learning_rate": 6.262839807225834e-06, + "loss": 0.3821, "step": 107780 }, { - "epoch": 3.79, - "learning_rate": 7.283797782844032e-06, - "loss": 0.2503, + "epoch": 3.8845640970194975, + "grad_norm": 
0.24751421809196472, + "learning_rate": 6.260908072819641e-06, + "loss": 0.3798, "step": 107785 }, { - "epoch": 3.79, - "learning_rate": 7.281787948054109e-06, - "loss": 0.2431, + "epoch": 3.884744296680722, + "grad_norm": 0.2060132473707199, + "learning_rate": 6.258976593729948e-06, + "loss": 0.3664, "step": 107790 }, { - "epoch": 3.79, - "learning_rate": 7.27977834332349e-06, - "loss": 0.2696, + "epoch": 3.884924496341947, + "grad_norm": 0.18985074758529663, + "learning_rate": 6.2570453699830725e-06, + "loss": 0.417, "step": 107795 }, { - "epoch": 3.79, - "learning_rate": 7.277768968678264e-06, - "loss": 0.2607, + "epoch": 3.8851046960031717, + "grad_norm": 0.21168529987335205, + "learning_rate": 6.2551144016053224e-06, + "loss": 0.371, "step": 107800 }, { - "epoch": 3.79, - "learning_rate": 7.27575982414454e-06, - "loss": 0.2608, + "epoch": 3.8852848956643964, + "grad_norm": 0.21965354681015015, + "learning_rate": 6.253183688623002e-06, + "loss": 0.3586, "step": 107805 }, { - "epoch": 3.79, - "learning_rate": 7.2737509097483894e-06, - "loss": 0.274, + "epoch": 3.8854650953256207, + "grad_norm": 0.2647111117839813, + "learning_rate": 6.251253231062435e-06, + "loss": 0.4069, "step": 107810 }, { - "epoch": 3.79, - "learning_rate": 7.271742225515896e-06, - "loss": 0.2475, + "epoch": 3.8856452949868454, + "grad_norm": 0.2614315152168274, + "learning_rate": 6.249323028949916e-06, + "loss": 0.4049, "step": 107815 }, { - "epoch": 3.79, - "learning_rate": 7.269733771473147e-06, - "loss": 0.2381, + "epoch": 3.88582549464807, + "grad_norm": 0.24572202563285828, + "learning_rate": 6.247393082311728e-06, + "loss": 0.4089, "step": 107820 }, { - "epoch": 3.79, - "learning_rate": 7.2677255476462255e-06, - "loss": 0.2625, + "epoch": 3.8860056943092944, + "grad_norm": 0.21743349730968475, + "learning_rate": 6.245463391174186e-06, + "loss": 0.3464, "step": 107825 }, { - "epoch": 3.79, - "learning_rate": 7.265717554061205e-06, - "loss": 0.248, + "epoch": 3.886185893970519, + "grad_norm": 0.21934011578559875, + "learning_rate": 6.243533955563574e-06, + "loss": 0.3717, "step": 107830 }, { - "epoch": 3.79, - "learning_rate": 7.2637097907441444e-06, - "loss": 0.2609, + "epoch": 3.886366093631744, + "grad_norm": 0.2820332646369934, + "learning_rate": 6.241604775506174e-06, + "loss": 0.3887, "step": 107835 }, { - "epoch": 3.79, - "learning_rate": 7.261702257721131e-06, - "loss": 0.2537, + "epoch": 3.8865462932929686, + "grad_norm": 0.2698332965373993, + "learning_rate": 6.2396758510282895e-06, + "loss": 0.3956, "step": 107840 }, { - "epoch": 3.79, - "learning_rate": 7.259694955018226e-06, - "loss": 0.2211, + "epoch": 3.8867264929541934, + "grad_norm": 0.2532370090484619, + "learning_rate": 6.237747182156178e-06, + "loss": 0.3619, "step": 107845 }, { - "epoch": 3.79, - "learning_rate": 7.25768788266149e-06, - "loss": 0.2568, + "epoch": 3.886906692615418, + "grad_norm": 0.275337278842926, + "learning_rate": 6.2358187689161356e-06, + "loss": 0.3903, "step": 107850 }, { - "epoch": 3.79, - "learning_rate": 7.255681040676976e-06, - "loss": 0.2581, + "epoch": 3.8870868922766424, + "grad_norm": 0.2451581060886383, + "learning_rate": 6.233890611334428e-06, + "loss": 0.3933, "step": 107855 }, { - "epoch": 3.79, - "learning_rate": 7.253674429090756e-06, - "loss": 0.2605, + "epoch": 3.887267091937867, + "grad_norm": 0.2677507698535919, + "learning_rate": 6.231962709437328e-06, + "loss": 0.3806, "step": 107860 }, { - "epoch": 3.79, - "learning_rate": 7.25166804792888e-06, - "loss": 0.2435, + "epoch": 3.887447291599092, + 
"grad_norm": 0.26012924313545227, + "learning_rate": 6.230035063251102e-06, + "loss": 0.3458, "step": 107865 }, { - "epoch": 3.8, - "learning_rate": 7.249661897217391e-06, - "loss": 0.26, + "epoch": 3.8876274912603166, + "grad_norm": 0.2477564662694931, + "learning_rate": 6.2281076728020085e-06, + "loss": 0.394, "step": 107870 }, { - "epoch": 3.8, - "learning_rate": 7.247655976982343e-06, - "loss": 0.2477, + "epoch": 3.887807690921541, + "grad_norm": 0.2495788335800171, + "learning_rate": 6.226180538116319e-06, + "loss": 0.3995, "step": 107875 }, { - "epoch": 3.8, - "learning_rate": 7.245650287249791e-06, - "loss": 0.2548, + "epoch": 3.8879878905827656, + "grad_norm": 0.2793433666229248, + "learning_rate": 6.224253659220286e-06, + "loss": 0.3702, "step": 107880 }, { - "epoch": 3.8, - "learning_rate": 7.243644828045768e-06, - "loss": 0.2486, + "epoch": 3.8881680902439903, + "grad_norm": 0.19013087451457977, + "learning_rate": 6.22232703614016e-06, + "loss": 0.3745, "step": 107885 }, { - "epoch": 3.8, - "learning_rate": 7.241639599396316e-06, - "loss": 0.2293, + "epoch": 3.888348289905215, + "grad_norm": 0.2680392265319824, + "learning_rate": 6.220400668902196e-06, + "loss": 0.3855, "step": 107890 }, { - "epoch": 3.8, - "learning_rate": 7.2396346013274635e-06, - "loss": 0.2371, + "epoch": 3.88852848956644, + "grad_norm": 0.29071950912475586, + "learning_rate": 6.218474557532633e-06, + "loss": 0.4051, "step": 107895 }, { - "epoch": 3.8, - "learning_rate": 7.237629833865259e-06, - "loss": 0.2498, + "epoch": 3.888708689227664, + "grad_norm": 0.27325236797332764, + "learning_rate": 6.216548702057715e-06, + "loss": 0.3856, "step": 107900 }, { - "epoch": 3.8, - "learning_rate": 7.235625297035725e-06, - "loss": 0.2501, + "epoch": 3.888888888888889, + "grad_norm": 0.23505853116512299, + "learning_rate": 6.214623102503689e-06, + "loss": 0.3783, "step": 107905 }, { - "epoch": 3.8, - "learning_rate": 7.233620990864889e-06, - "loss": 0.2624, + "epoch": 3.8890690885501136, + "grad_norm": 0.23786808550357819, + "learning_rate": 6.212697758896788e-06, + "loss": 0.3829, "step": 107910 }, { - "epoch": 3.8, - "learning_rate": 7.231616915378769e-06, - "loss": 0.2747, + "epoch": 3.8892492882113383, + "grad_norm": 0.4009203612804413, + "learning_rate": 6.2107726712632404e-06, + "loss": 0.3708, "step": 107915 }, { - "epoch": 3.8, - "learning_rate": 7.2296130706033996e-06, - "loss": 0.2571, + "epoch": 3.8894294878725626, + "grad_norm": 0.20991608500480652, + "learning_rate": 6.208847839629278e-06, + "loss": 0.3835, "step": 107920 }, { - "epoch": 3.8, - "learning_rate": 7.227609456564788e-06, - "loss": 0.2528, + "epoch": 3.8896096875337873, + "grad_norm": 0.22346235811710358, + "learning_rate": 6.206923264021119e-06, + "loss": 0.3753, "step": 107925 }, { - "epoch": 3.8, - "learning_rate": 7.225606073288962e-06, - "loss": 0.2426, + "epoch": 3.889789887195012, + "grad_norm": 0.22620312869548798, + "learning_rate": 6.204998944465007e-06, + "loss": 0.3604, "step": 107930 }, { - "epoch": 3.8, - "learning_rate": 7.223602920801919e-06, - "loss": 0.2591, + "epoch": 3.889970086856237, + "grad_norm": 0.2643108665943146, + "learning_rate": 6.203074880987137e-06, + "loss": 0.3826, "step": 107935 }, { - "epoch": 3.8, - "learning_rate": 7.221599999129686e-06, - "loss": 0.2634, + "epoch": 3.8901502865174615, + "grad_norm": 0.25667524337768555, + "learning_rate": 6.201151073613726e-06, + "loss": 0.3917, "step": 107940 }, { - "epoch": 3.8, - "learning_rate": 7.219597308298259e-06, - "loss": 0.2832, + "epoch": 3.890330486178686, + 
"grad_norm": 0.20191670954227448, + "learning_rate": 6.199227522371001e-06, + "loss": 0.3856, "step": 107945 }, { - "epoch": 3.8, - "learning_rate": 7.217594848333636e-06, - "loss": 0.2459, + "epoch": 3.8905106858399106, + "grad_norm": 0.25496187806129456, + "learning_rate": 6.197304227285158e-06, + "loss": 0.3649, "step": 107950 }, { - "epoch": 3.8, - "learning_rate": 7.215592619261832e-06, - "loss": 0.257, + "epoch": 3.8906908855011353, + "grad_norm": 0.2135312408208847, + "learning_rate": 6.195381188382407e-06, + "loss": 0.3425, "step": 107955 }, { - "epoch": 3.8, - "learning_rate": 7.21359062110884e-06, - "loss": 0.2516, + "epoch": 3.89087108516236, + "grad_norm": 0.20012623071670532, + "learning_rate": 6.193458405688945e-06, + "loss": 0.3803, "step": 107960 }, { - "epoch": 3.8, - "learning_rate": 7.21158885390065e-06, - "loss": 0.2523, + "epoch": 3.8910512848235843, + "grad_norm": 0.2358315885066986, + "learning_rate": 6.191535879230964e-06, + "loss": 0.3862, "step": 107965 }, { - "epoch": 3.8, - "learning_rate": 7.209587317663246e-06, - "loss": 0.2596, + "epoch": 3.891231484484809, + "grad_norm": 0.2576768398284912, + "learning_rate": 6.1896136090346755e-06, + "loss": 0.407, "step": 107970 }, { - "epoch": 3.8, - "learning_rate": 7.207586012422637e-06, - "loss": 0.2645, + "epoch": 3.891411684146034, + "grad_norm": 0.23550166189670563, + "learning_rate": 6.187691595126255e-06, + "loss": 0.3778, "step": 107975 }, { - "epoch": 3.8, - "learning_rate": 7.20558493820479e-06, - "loss": 0.2577, + "epoch": 3.8915918838072585, + "grad_norm": 0.22159500420093536, + "learning_rate": 6.185769837531899e-06, + "loss": 0.3908, "step": 107980 }, { - "epoch": 3.8, - "learning_rate": 7.203584095035704e-06, - "loss": 0.2718, + "epoch": 3.8917720834684832, + "grad_norm": 0.27781954407691956, + "learning_rate": 6.183848336277784e-06, + "loss": 0.3946, "step": 107985 }, { - "epoch": 3.8, - "learning_rate": 7.2015834829413406e-06, - "loss": 0.246, + "epoch": 3.891952283129708, + "grad_norm": 0.23066557943820953, + "learning_rate": 6.1819270913900826e-06, + "loss": 0.394, "step": 107990 }, { - "epoch": 3.8, - "learning_rate": 7.199583101947696e-06, - "loss": 0.2511, + "epoch": 3.8921324827909323, + "grad_norm": 0.26918134093284607, + "learning_rate": 6.1800061028949916e-06, + "loss": 0.3916, "step": 107995 }, { - "epoch": 3.8, - "learning_rate": 7.197582952080731e-06, - "loss": 0.2522, + "epoch": 3.892312682452157, + "grad_norm": 0.24214857816696167, + "learning_rate": 6.178085370818676e-06, + "loss": 0.3739, "step": 108000 }, { - "epoch": 3.8, - "eval_loss": 0.25110575556755066, - "eval_runtime": 10.5564, - "eval_samples_per_second": 9.473, - "eval_steps_per_second": 9.473, + "epoch": 3.892312682452157, + "eval_loss": 0.42808079719543457, + "eval_runtime": 3.5205, + "eval_samples_per_second": 28.405, + "eval_steps_per_second": 7.101, "step": 108000 }, { - "epoch": 3.8, - "learning_rate": 7.195583033366421e-06, - "loss": 0.252, + "epoch": 3.8924928821133817, + "grad_norm": 0.18488271534442902, + "learning_rate": 6.1761648951873e-06, + "loss": 0.3708, "step": 108005 }, { - "epoch": 3.8, - "learning_rate": 7.193583345830723e-06, - "loss": 0.2597, + "epoch": 3.892673081774606, + "grad_norm": 0.3259855806827545, + "learning_rate": 6.174244676027033e-06, + "loss": 0.36, "step": 108010 }, { - "epoch": 3.8, - "learning_rate": 7.19158388949962e-06, - "loss": 0.2421, + "epoch": 3.8928532814358308, + "grad_norm": 0.25019964575767517, + "learning_rate": 6.172324713364039e-06, + "loss": 0.3678, "step": 108015 }, { - 
"epoch": 3.8, - "learning_rate": 7.189584664399063e-06, - "loss": 0.2783, + "epoch": 3.8930334810970555, + "grad_norm": 0.24694189429283142, + "learning_rate": 6.170405007224467e-06, + "loss": 0.3751, "step": 108020 }, { - "epoch": 3.8, - "learning_rate": 7.187585670555011e-06, - "loss": 0.2669, + "epoch": 3.8932136807582802, + "grad_norm": 0.1833256483078003, + "learning_rate": 6.168485557634496e-06, + "loss": 0.3654, "step": 108025 }, { - "epoch": 3.8, - "learning_rate": 7.185586907993413e-06, - "loss": 0.2566, + "epoch": 3.893393880419505, + "grad_norm": 0.2395724505186081, + "learning_rate": 6.166566364620249e-06, + "loss": 0.3517, "step": 108030 }, { - "epoch": 3.8, - "learning_rate": 7.1835883767402385e-06, - "loss": 0.2678, + "epoch": 3.8935740800807297, + "grad_norm": 0.2357412576675415, + "learning_rate": 6.164647428207896e-06, + "loss": 0.3885, "step": 108035 }, { - "epoch": 3.8, - "learning_rate": 7.181590076821418e-06, - "loss": 0.2312, + "epoch": 3.893754279741954, + "grad_norm": 0.25304263830184937, + "learning_rate": 6.162728748423577e-06, + "loss": 0.3779, "step": 108040 }, { - "epoch": 3.8, - "learning_rate": 7.1795920082629144e-06, - "loss": 0.2733, + "epoch": 3.8939344794031787, + "grad_norm": 0.23290584981441498, + "learning_rate": 6.160810325293429e-06, + "loss": 0.3866, "step": 108045 }, { - "epoch": 3.8, - "learning_rate": 7.177594171090657e-06, - "loss": 0.2529, + "epoch": 3.8941146790644035, + "grad_norm": 0.25585252046585083, + "learning_rate": 6.158892158843593e-06, + "loss": 0.3682, "step": 108050 }, { - "epoch": 3.8, - "learning_rate": 7.1755965653306026e-06, - "loss": 0.2586, + "epoch": 3.8942948787256277, + "grad_norm": 0.24071577191352844, + "learning_rate": 6.1569742491002054e-06, + "loss": 0.3762, "step": 108055 }, { - "epoch": 3.8, - "learning_rate": 7.173599191008678e-06, - "loss": 0.2694, + "epoch": 3.8944750783868525, + "grad_norm": 0.21371334791183472, + "learning_rate": 6.155056596089387e-06, + "loss": 0.3549, "step": 108060 }, { - "epoch": 3.8, - "learning_rate": 7.171602048150816e-06, - "loss": 0.2616, + "epoch": 3.894655278048077, + "grad_norm": 0.23668819665908813, + "learning_rate": 6.153139199837282e-06, + "loss": 0.383, "step": 108065 }, { - "epoch": 3.8, - "learning_rate": 7.169605136782956e-06, - "loss": 0.2726, + "epoch": 3.894835477709302, + "grad_norm": 0.23611487448215485, + "learning_rate": 6.151222060370007e-06, + "loss": 0.3706, "step": 108070 }, { - "epoch": 3.8, - "learning_rate": 7.167608456931024e-06, - "loss": 0.2387, + "epoch": 3.8950156773705267, + "grad_norm": 0.22055992484092712, + "learning_rate": 6.149305177713682e-06, + "loss": 0.4095, "step": 108075 }, { - "epoch": 3.8, - "learning_rate": 7.165612008620945e-06, - "loss": 0.2768, + "epoch": 3.8951958770317514, + "grad_norm": 0.20526739954948425, + "learning_rate": 6.147388551894423e-06, + "loss": 0.3806, "step": 108080 }, { - "epoch": 3.8, - "learning_rate": 7.163615791878634e-06, - "loss": 0.2449, + "epoch": 3.8953760766929757, + "grad_norm": 0.27778366208076477, + "learning_rate": 6.145472182938339e-06, + "loss": 0.4111, "step": 108085 }, { - "epoch": 3.8, - "learning_rate": 7.1616198067300215e-06, - "loss": 0.2485, + "epoch": 3.8955562763542004, + "grad_norm": 0.2274710088968277, + "learning_rate": 6.143556070871554e-06, + "loss": 0.3905, "step": 108090 }, { - "epoch": 3.8, - "learning_rate": 7.1596240532010136e-06, - "loss": 0.2662, + "epoch": 3.895736476015425, + "grad_norm": 0.29294195771217346, + "learning_rate": 6.141640215720165e-06, + "loss": 0.3839, "step": 
108095 }, { - "epoch": 3.8, - "learning_rate": 7.157628531317539e-06, - "loss": 0.239, + "epoch": 3.8959166756766495, + "grad_norm": 0.2358580380678177, + "learning_rate": 6.139724617510279e-06, + "loss": 0.3886, "step": 108100 }, { - "epoch": 3.8, - "learning_rate": 7.1556332411054915e-06, - "loss": 0.2711, + "epoch": 3.896096875337874, + "grad_norm": 0.2260809987783432, + "learning_rate": 6.137809276267992e-06, + "loss": 0.348, "step": 108105 }, { - "epoch": 3.8, - "learning_rate": 7.153638182590794e-06, - "loss": 0.2437, + "epoch": 3.896277074999099, + "grad_norm": 0.22863958775997162, + "learning_rate": 6.1358941920193955e-06, + "loss": 0.368, "step": 108110 }, { - "epoch": 3.8, - "learning_rate": 7.1516433557993426e-06, - "loss": 0.2689, + "epoch": 3.8964572746603237, + "grad_norm": 0.22534294426441193, + "learning_rate": 6.133979364790601e-06, + "loss": 0.3866, "step": 108115 }, { - "epoch": 3.8, - "learning_rate": 7.149648760757041e-06, - "loss": 0.2491, + "epoch": 3.8966374743215484, + "grad_norm": 0.22448866069316864, + "learning_rate": 6.132064794607681e-06, + "loss": 0.3379, "step": 108120 }, { - "epoch": 3.8, - "learning_rate": 7.147654397489778e-06, - "loss": 0.2512, + "epoch": 3.896817673982773, + "grad_norm": 0.3200676143169403, + "learning_rate": 6.130150481496716e-06, + "loss": 0.3729, "step": 108125 }, { - "epoch": 3.8, - "learning_rate": 7.145660266023466e-06, - "loss": 0.2689, + "epoch": 3.8969978736439974, + "grad_norm": 0.2216705083847046, + "learning_rate": 6.128236425483805e-06, + "loss": 0.3849, "step": 108130 }, { - "epoch": 3.8, - "learning_rate": 7.1436663663839855e-06, - "loss": 0.2484, + "epoch": 3.897178073305222, + "grad_norm": 0.3131025433540344, + "learning_rate": 6.12632262659501e-06, + "loss": 0.358, "step": 108135 }, { - "epoch": 3.8, - "learning_rate": 7.1416726985972264e-06, - "loss": 0.2558, + "epoch": 3.897358272966447, + "grad_norm": 0.23903706669807434, + "learning_rate": 6.124409084856433e-06, + "loss": 0.3908, "step": 108140 }, { - "epoch": 3.8, - "learning_rate": 7.139679262689083e-06, - "loss": 0.274, + "epoch": 3.8975384726276716, + "grad_norm": 0.3344826400279999, + "learning_rate": 6.122495800294117e-06, + "loss": 0.4013, "step": 108145 }, { - "epoch": 3.81, - "learning_rate": 7.137686058685428e-06, - "loss": 0.2741, + "epoch": 3.897718672288896, + "grad_norm": 0.248307466506958, + "learning_rate": 6.120582772934136e-06, + "loss": 0.377, "step": 108150 }, { - "epoch": 3.81, - "learning_rate": 7.135693086612155e-06, - "loss": 0.2981, + "epoch": 3.8978988719501206, + "grad_norm": 0.21172760426998138, + "learning_rate": 6.118670002802568e-06, + "loss": 0.3875, "step": 108155 }, { - "epoch": 3.81, - "learning_rate": 7.133700346495134e-06, - "loss": 0.2628, + "epoch": 3.8980790716113454, + "grad_norm": 0.24647603929042816, + "learning_rate": 6.116757489925462e-06, + "loss": 0.3586, "step": 108160 }, { - "epoch": 3.81, - "learning_rate": 7.131707838360233e-06, - "loss": 0.2518, + "epoch": 3.89825927127257, + "grad_norm": 0.20428353548049927, + "learning_rate": 6.114845234328881e-06, + "loss": 0.3424, "step": 108165 }, { - "epoch": 3.81, - "learning_rate": 7.1297155622333356e-06, - "loss": 0.2673, + "epoch": 3.898439470933795, + "grad_norm": 0.2739085257053375, + "learning_rate": 6.112933236038878e-06, + "loss": 0.3761, "step": 108170 }, { - "epoch": 3.81, - "learning_rate": 7.1277235181403054e-06, - "loss": 0.2507, + "epoch": 3.898619670595019, + "grad_norm": 0.21551024913787842, + "learning_rate": 6.111021495081496e-06, + "loss": 0.3632, "step": 
108175 }, { - "epoch": 3.81, - "learning_rate": 7.1257317061070004e-06, - "loss": 0.2517, + "epoch": 3.898799870256244, + "grad_norm": 0.2526455819606781, + "learning_rate": 6.109110011482797e-06, + "loss": 0.3779, "step": 108180 }, { - "epoch": 3.81, - "learning_rate": 7.123740126159298e-06, - "loss": 0.2719, + "epoch": 3.8989800699174686, + "grad_norm": 0.2527839243412018, + "learning_rate": 6.107198785268814e-06, + "loss": 0.3568, "step": 108185 }, { - "epoch": 3.81, - "learning_rate": 7.121748778323048e-06, - "loss": 0.2713, + "epoch": 3.8991602695786933, + "grad_norm": 0.24524226784706116, + "learning_rate": 6.105287816465591e-06, + "loss": 0.3872, "step": 108190 }, { - "epoch": 3.81, - "learning_rate": 7.119757662624107e-06, - "loss": 0.2425, + "epoch": 3.8993404692399176, + "grad_norm": 0.20940566062927246, + "learning_rate": 6.103377105099165e-06, + "loss": 0.3772, "step": 108195 }, { - "epoch": 3.81, - "learning_rate": 7.117766779088322e-06, - "loss": 0.2567, + "epoch": 3.8995206689011424, + "grad_norm": 0.2865338921546936, + "learning_rate": 6.101466651195564e-06, + "loss": 0.4084, "step": 108200 }, { - "epoch": 3.81, - "learning_rate": 7.115776127741553e-06, - "loss": 0.2699, + "epoch": 3.899700868562367, + "grad_norm": 0.2509433925151825, + "learning_rate": 6.099556454780817e-06, + "loss": 0.3579, "step": 108205 }, { - "epoch": 3.81, - "learning_rate": 7.113785708609649e-06, - "loss": 0.2453, + "epoch": 3.899881068223592, + "grad_norm": 0.271535187959671, + "learning_rate": 6.097646515880967e-06, + "loss": 0.4306, "step": 108210 }, { - "epoch": 3.81, - "learning_rate": 7.11179552171845e-06, - "loss": 0.277, + "epoch": 3.9000612678848166, + "grad_norm": 0.2970694899559021, + "learning_rate": 6.095736834522009e-06, + "loss": 0.3645, "step": 108215 }, { - "epoch": 3.81, - "learning_rate": 7.109805567093792e-06, - "loss": 0.2807, + "epoch": 3.9002414675460413, + "grad_norm": 0.2571132779121399, + "learning_rate": 6.093827410729985e-06, + "loss": 0.4004, "step": 108220 }, { - "epoch": 3.81, - "learning_rate": 7.107815844761525e-06, - "loss": 0.2494, + "epoch": 3.9004216672072656, + "grad_norm": 0.23718644678592682, + "learning_rate": 6.091918244530903e-06, + "loss": 0.3924, "step": 108225 }, { - "epoch": 3.81, - "learning_rate": 7.10582635474748e-06, - "loss": 0.255, + "epoch": 3.9006018668684903, + "grad_norm": 0.24922458827495575, + "learning_rate": 6.090009335950772e-06, + "loss": 0.3443, "step": 108230 }, { - "epoch": 3.81, - "learning_rate": 7.103837097077484e-06, - "loss": 0.2396, + "epoch": 3.900782066529715, + "grad_norm": 0.2874678075313568, + "learning_rate": 6.088100685015605e-06, + "loss": 0.3818, "step": 108235 }, { - "epoch": 3.81, - "learning_rate": 7.101848071777364e-06, - "loss": 0.2441, + "epoch": 3.9009622661909393, + "grad_norm": 0.23355595767498016, + "learning_rate": 6.0861922917514024e-06, + "loss": 0.391, "step": 108240 }, { - "epoch": 3.81, - "learning_rate": 7.099859278872958e-06, - "loss": 0.2754, + "epoch": 3.901142465852164, + "grad_norm": 0.2503456473350525, + "learning_rate": 6.084284156184161e-06, + "loss": 0.3889, "step": 108245 }, { - "epoch": 3.81, - "learning_rate": 7.097870718390084e-06, - "loss": 0.2485, + "epoch": 3.901322665513389, + "grad_norm": 0.2787312865257263, + "learning_rate": 6.082376278339893e-06, + "loss": 0.4176, "step": 108250 }, { - "epoch": 3.81, - "learning_rate": 7.095882390354553e-06, - "loss": 0.2541, + "epoch": 3.9015028651746135, + "grad_norm": 0.24109527468681335, + "learning_rate": 6.080468658244587e-06, + "loss": 
0.3879, "step": 108255 }, { - "epoch": 3.81, - "learning_rate": 7.093894294792186e-06, - "loss": 0.2545, + "epoch": 3.9016830648358383, + "grad_norm": 0.24064235389232635, + "learning_rate": 6.078561295924232e-06, + "loss": 0.4142, "step": 108260 }, { - "epoch": 3.81, - "learning_rate": 7.091906431728812e-06, - "loss": 0.2585, + "epoch": 3.901863264497063, + "grad_norm": 0.20909632742404938, + "learning_rate": 6.076654191404815e-06, + "loss": 0.3915, "step": 108265 }, { - "epoch": 3.81, - "learning_rate": 7.089918801190232e-06, - "loss": 0.2477, + "epoch": 3.9020434641582873, + "grad_norm": 0.21806110441684723, + "learning_rate": 6.074747344712314e-06, + "loss": 0.3774, "step": 108270 }, { - "epoch": 3.81, - "learning_rate": 7.087931403202253e-06, - "loss": 0.2772, + "epoch": 3.902223663819512, + "grad_norm": 0.21959331631660461, + "learning_rate": 6.0728407558727246e-06, + "loss": 0.375, "step": 108275 }, { - "epoch": 3.81, - "learning_rate": 7.08594423779067e-06, - "loss": 0.2597, + "epoch": 3.9024038634807368, + "grad_norm": 0.20421810448169708, + "learning_rate": 6.070934424912014e-06, + "loss": 0.3855, "step": 108280 }, { - "epoch": 3.81, - "learning_rate": 7.083957304981303e-06, - "loss": 0.2318, + "epoch": 3.902584063141961, + "grad_norm": 0.18650266528129578, + "learning_rate": 6.06902835185616e-06, + "loss": 0.3661, "step": 108285 }, { - "epoch": 3.81, - "learning_rate": 7.081970604799945e-06, - "loss": 0.2652, + "epoch": 3.902764262803186, + "grad_norm": 0.23381759226322174, + "learning_rate": 6.067122536731126e-06, + "loss": 0.4026, "step": 108290 }, { - "epoch": 3.81, - "learning_rate": 7.079984137272383e-06, - "loss": 0.2515, + "epoch": 3.9029444624644105, + "grad_norm": 0.19069373607635498, + "learning_rate": 6.065216979562877e-06, + "loss": 0.3993, "step": 108295 }, { - "epoch": 3.81, - "learning_rate": 7.0779979024244245e-06, - "loss": 0.2409, + "epoch": 3.9031246621256352, + "grad_norm": 0.22146674990653992, + "learning_rate": 6.063311680377387e-06, + "loss": 0.3903, "step": 108300 }, { - "epoch": 3.81, - "learning_rate": 7.076011900281851e-06, - "loss": 0.2472, + "epoch": 3.90330486178686, + "grad_norm": 0.24637946486473083, + "learning_rate": 6.061406639200617e-06, + "loss": 0.3661, "step": 108305 }, { - "epoch": 3.81, - "learning_rate": 7.074026130870445e-06, - "loss": 0.264, + "epoch": 3.9034850614480847, + "grad_norm": 0.2656784653663635, + "learning_rate": 6.0595018560585e-06, + "loss": 0.3357, "step": 108310 }, { - "epoch": 3.81, - "learning_rate": 7.072040594215995e-06, - "loss": 0.2683, + "epoch": 3.903665261109309, + "grad_norm": 0.21832560002803802, + "learning_rate": 6.057597330977011e-06, + "loss": 0.348, "step": 108315 }, { - "epoch": 3.81, - "learning_rate": 7.070055290344291e-06, - "loss": 0.2634, + "epoch": 3.9038454607705337, + "grad_norm": 0.215951070189476, + "learning_rate": 6.055693063982082e-06, + "loss": 0.3295, "step": 108320 }, { - "epoch": 3.81, - "learning_rate": 7.0680702192811e-06, - "loss": 0.2489, + "epoch": 3.9040256604317585, + "grad_norm": 0.23906680941581726, + "learning_rate": 6.053789055099685e-06, + "loss": 0.3876, "step": 108325 }, { - "epoch": 3.81, - "learning_rate": 7.066085381052204e-06, - "loss": 0.2795, + "epoch": 3.9042058600929828, + "grad_norm": 0.2423383742570877, + "learning_rate": 6.051885304355734e-06, + "loss": 0.3528, "step": 108330 }, { - "epoch": 3.81, - "learning_rate": 7.064100775683358e-06, - "loss": 0.24, + "epoch": 3.9043860597542075, + "grad_norm": 0.20754733681678772, + "learning_rate": 6.04998181177617e-06, + 
"loss": 0.3529, "step": 108335 }, { - "epoch": 3.81, - "learning_rate": 7.0621164032003555e-06, - "loss": 0.2562, + "epoch": 3.9045662594154322, + "grad_norm": 0.23662780225276947, + "learning_rate": 6.048078577386945e-06, + "loss": 0.3659, "step": 108340 }, { - "epoch": 3.81, - "learning_rate": 7.060132263628946e-06, - "loss": 0.261, + "epoch": 3.904746459076657, + "grad_norm": 0.3288610577583313, + "learning_rate": 6.046175601213977e-06, + "loss": 0.4009, "step": 108345 }, { - "epoch": 3.81, - "learning_rate": 7.058148356994898e-06, - "loss": 0.2728, + "epoch": 3.9049266587378817, + "grad_norm": 0.30411192774772644, + "learning_rate": 6.044272883283198e-06, + "loss": 0.3728, "step": 108350 }, { - "epoch": 3.81, - "learning_rate": 7.056164683323962e-06, - "loss": 0.2721, + "epoch": 3.9051068583991064, + "grad_norm": 0.2672629654407501, + "learning_rate": 6.04237042362053e-06, + "loss": 0.3629, "step": 108355 }, { - "epoch": 3.81, - "learning_rate": 7.054181242641911e-06, - "loss": 0.276, + "epoch": 3.9052870580603307, + "grad_norm": 0.22953931987285614, + "learning_rate": 6.040468222251891e-06, + "loss": 0.3809, "step": 108360 }, { - "epoch": 3.81, - "learning_rate": 7.05219803497448e-06, - "loss": 0.2347, + "epoch": 3.9054672577215555, + "grad_norm": 0.22573940455913544, + "learning_rate": 6.038566279203206e-06, + "loss": 0.3482, "step": 108365 }, { - "epoch": 3.81, - "learning_rate": 7.05021506034744e-06, - "loss": 0.2393, + "epoch": 3.90564745738278, + "grad_norm": 0.29565203189849854, + "learning_rate": 6.036664594500385e-06, + "loss": 0.3928, "step": 108370 }, { - "epoch": 3.81, - "learning_rate": 7.048232318786519e-06, - "loss": 0.2607, + "epoch": 3.905827657044005, + "grad_norm": 0.2615247070789337, + "learning_rate": 6.0347631681693384e-06, + "loss": 0.3485, "step": 108375 }, { - "epoch": 3.81, - "learning_rate": 7.046249810317476e-06, - "loss": 0.2788, + "epoch": 3.906007856705229, + "grad_norm": 0.3057554364204407, + "learning_rate": 6.0328620002359695e-06, + "loss": 0.3922, "step": 108380 }, { - "epoch": 3.81, - "learning_rate": 7.044267534966051e-06, - "loss": 0.248, + "epoch": 3.906188056366454, + "grad_norm": 0.23635023832321167, + "learning_rate": 6.030961090726186e-06, + "loss": 0.4425, "step": 108385 }, { - "epoch": 3.81, - "learning_rate": 7.042285492757975e-06, - "loss": 0.2547, + "epoch": 3.9063682560276787, + "grad_norm": 0.16913343966007233, + "learning_rate": 6.029060439665876e-06, + "loss": 0.3574, "step": 108390 }, { - "epoch": 3.81, - "learning_rate": 7.040303683718982e-06, - "loss": 0.2323, + "epoch": 3.9065484556889034, + "grad_norm": 0.20368297398090363, + "learning_rate": 6.0271600470809595e-06, + "loss": 0.3818, "step": 108395 }, { - "epoch": 3.81, - "learning_rate": 7.038322107874817e-06, - "loss": 0.2419, + "epoch": 3.906728655350128, + "grad_norm": 0.27825161814689636, + "learning_rate": 6.0252599129973e-06, + "loss": 0.3998, "step": 108400 }, { - "epoch": 3.81, - "learning_rate": 7.036340765251204e-06, - "loss": 0.2638, + "epoch": 3.9069088550113524, + "grad_norm": 0.2604582905769348, + "learning_rate": 6.023360037440809e-06, + "loss": 0.4151, "step": 108405 }, { - "epoch": 3.81, - "learning_rate": 7.034359655873857e-06, - "loss": 0.2656, + "epoch": 3.907089054672577, + "grad_norm": 0.27497240900993347, + "learning_rate": 6.021460420437364e-06, + "loss": 0.382, "step": 108410 }, { - "epoch": 3.81, - "learning_rate": 7.03237877976852e-06, - "loss": 0.2576, + "epoch": 3.907269254333802, + "grad_norm": 0.18511195480823517, + "learning_rate": 
6.0195610620128354e-06, + "loss": 0.389, "step": 108415 }, { - "epoch": 3.81, - "learning_rate": 7.030398136960894e-06, - "loss": 0.2746, + "epoch": 3.9074494539950266, + "grad_norm": 0.24932751059532166, + "learning_rate": 6.01766196219313e-06, + "loss": 0.3824, "step": 108420 }, { - "epoch": 3.81, - "learning_rate": 7.0284177274767135e-06, - "loss": 0.2601, + "epoch": 3.907629653656251, + "grad_norm": 0.24645911157131195, + "learning_rate": 6.0157631210040975e-06, + "loss": 0.3741, "step": 108425 }, { - "epoch": 3.81, - "learning_rate": 7.0264375513416776e-06, - "loss": 0.2409, + "epoch": 3.9078098533174757, + "grad_norm": 0.24892014265060425, + "learning_rate": 6.0138645384716085e-06, + "loss": 0.368, "step": 108430 }, { - "epoch": 3.82, - "learning_rate": 7.024457608581514e-06, - "loss": 0.2737, + "epoch": 3.9079900529787004, + "grad_norm": 0.22853389382362366, + "learning_rate": 6.0119662146215475e-06, + "loss": 0.3671, "step": 108435 }, { - "epoch": 3.82, - "learning_rate": 7.0224778992219214e-06, - "loss": 0.2879, + "epoch": 3.908170252639925, + "grad_norm": 0.2502167820930481, + "learning_rate": 6.010068149479772e-06, + "loss": 0.4101, "step": 108440 }, { - "epoch": 3.82, - "learning_rate": 7.020498423288604e-06, - "loss": 0.2708, + "epoch": 3.90835045230115, + "grad_norm": 0.25325870513916016, + "learning_rate": 6.008170343072139e-06, + "loss": 0.3676, "step": 108445 }, { - "epoch": 3.82, - "learning_rate": 7.018519180807262e-06, - "loss": 0.2499, + "epoch": 3.908530651962374, + "grad_norm": 0.269979864358902, + "learning_rate": 6.006272795424511e-06, + "loss": 0.3602, "step": 108450 }, { - "epoch": 3.82, - "learning_rate": 7.016540171803601e-06, - "loss": 0.2447, + "epoch": 3.908710851623599, + "grad_norm": 0.21623685956001282, + "learning_rate": 6.004375506562729e-06, + "loss": 0.3969, "step": 108455 }, { - "epoch": 3.82, - "learning_rate": 7.014561396303318e-06, - "loss": 0.2602, + "epoch": 3.9088910512848236, + "grad_norm": 0.20743024349212646, + "learning_rate": 6.002478476512663e-06, + "loss": 0.3587, "step": 108460 }, { - "epoch": 3.82, - "learning_rate": 7.012582854332101e-06, - "loss": 0.2647, + "epoch": 3.9090712509460483, + "grad_norm": 0.26383909583091736, + "learning_rate": 6.0005817053001485e-06, + "loss": 0.3759, "step": 108465 }, { - "epoch": 3.82, - "learning_rate": 7.0106045459156334e-06, - "loss": 0.2633, + "epoch": 3.9092514506072726, + "grad_norm": 0.22323699295520782, + "learning_rate": 5.998685192951029e-06, + "loss": 0.371, "step": 108470 }, { - "epoch": 3.82, - "learning_rate": 7.008626471079619e-06, - "loss": 0.2714, + "epoch": 3.9094316502684974, + "grad_norm": 0.2346304953098297, + "learning_rate": 5.996788939491143e-06, + "loss": 0.3776, "step": 108475 }, { - "epoch": 3.82, - "learning_rate": 7.006648629849724e-06, - "loss": 0.2677, + "epoch": 3.909611849929722, + "grad_norm": 0.25308549404144287, + "learning_rate": 5.994892944946326e-06, + "loss": 0.3947, "step": 108480 }, { - "epoch": 3.82, - "learning_rate": 7.004671022251649e-06, - "loss": 0.2508, + "epoch": 3.909792049590947, + "grad_norm": 0.24598398804664612, + "learning_rate": 5.992997209342416e-06, + "loss": 0.3543, "step": 108485 }, { - "epoch": 3.82, - "learning_rate": 7.002693648311051e-06, - "loss": 0.2567, + "epoch": 3.9099722492521716, + "grad_norm": 0.21500460803508759, + "learning_rate": 5.991101732705248e-06, + "loss": 0.3669, "step": 108490 }, { - "epoch": 3.82, - "learning_rate": 7.000716508053623e-06, - "loss": 0.2636, + "epoch": 3.9101524489133963, + "grad_norm": 
0.2395174652338028, + "learning_rate": 5.989206515060625e-06, + "loss": 0.3834, "step": 108495 }, { - "epoch": 3.82, - "learning_rate": 6.99873960150503e-06, - "loss": 0.2607, + "epoch": 3.9103326485746206, + "grad_norm": 0.20296072959899902, + "learning_rate": 5.987311556434391e-06, + "loss": 0.4221, "step": 108500 }, { - "epoch": 3.82, - "eval_loss": 0.25093138217926025, - "eval_runtime": 10.5444, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 3.9103326485746206, + "eval_loss": 0.4281059205532074, + "eval_runtime": 3.5316, + "eval_samples_per_second": 28.316, + "eval_steps_per_second": 7.079, "step": 108500 }, { - "epoch": 3.82, - "learning_rate": 6.9967629286909416e-06, - "loss": 0.241, + "epoch": 3.9105128482358453, + "grad_norm": 0.22557717561721802, + "learning_rate": 5.985416856852347e-06, + "loss": 0.3589, "step": 108505 }, { - "epoch": 3.82, - "learning_rate": 6.994786489637012e-06, - "loss": 0.26, + "epoch": 3.91069304789707, + "grad_norm": 0.22067588567733765, + "learning_rate": 5.9835224163403315e-06, + "loss": 0.3639, "step": 108510 }, { - "epoch": 3.82, - "learning_rate": 6.9928102843689225e-06, - "loss": 0.2456, + "epoch": 3.9108732475582944, + "grad_norm": 0.2571333050727844, + "learning_rate": 5.9816282349241335e-06, + "loss": 0.3619, "step": 108515 }, { - "epoch": 3.82, - "learning_rate": 6.990834312912323e-06, - "loss": 0.2521, + "epoch": 3.911053447219519, + "grad_norm": 0.24989810585975647, + "learning_rate": 5.979734312629562e-06, + "loss": 0.4091, "step": 108520 }, { - "epoch": 3.82, - "learning_rate": 6.988858575292867e-06, - "loss": 0.2463, + "epoch": 3.911233646880744, + "grad_norm": 0.2283269464969635, + "learning_rate": 5.977840649482435e-06, + "loss": 0.3804, "step": 108525 }, { - "epoch": 3.82, - "learning_rate": 6.9868830715362185e-06, - "loss": 0.2718, + "epoch": 3.9114138465419686, + "grad_norm": 0.2447991967201233, + "learning_rate": 5.975947245508548e-06, + "loss": 0.3941, "step": 108530 }, { - "epoch": 3.82, - "learning_rate": 6.98530283692935e-06, - "loss": 0.2599, + "epoch": 3.9115940462031933, + "grad_norm": 0.27993232011795044, + "learning_rate": 5.9740541007336974e-06, + "loss": 0.3952, "step": 108535 }, { - "epoch": 3.82, - "learning_rate": 6.983327754190374e-06, - "loss": 0.2566, + "epoch": 3.911774245864418, + "grad_norm": 0.21602769196033478, + "learning_rate": 5.972161215183672e-06, + "loss": 0.3507, "step": 108540 }, { - "epoch": 3.82, - "learning_rate": 6.981352905386021e-06, - "loss": 0.2568, + "epoch": 3.9119544455256423, + "grad_norm": 0.2476750910282135, + "learning_rate": 5.970268588884262e-06, + "loss": 0.3867, "step": 108545 }, { - "epoch": 3.82, - "learning_rate": 6.979378290541927e-06, - "loss": 0.2647, + "epoch": 3.912134645186867, + "grad_norm": 0.2227691113948822, + "learning_rate": 5.968376221861266e-06, + "loss": 0.3658, "step": 108550 }, { - "epoch": 3.82, - "learning_rate": 6.97740390968373e-06, - "loss": 0.2363, + "epoch": 3.9123148448480918, + "grad_norm": 0.19756188988685608, + "learning_rate": 5.9664841141404585e-06, + "loss": 0.3666, "step": 108555 }, { - "epoch": 3.82, - "learning_rate": 6.975429762837058e-06, - "loss": 0.2646, + "epoch": 3.912495044509316, + "grad_norm": 0.2740320563316345, + "learning_rate": 5.964592265747617e-06, + "loss": 0.3738, "step": 108560 }, { - "epoch": 3.82, - "learning_rate": 6.97345585002755e-06, - "loss": 0.258, + "epoch": 3.912675244170541, + "grad_norm": 0.28433406352996826, + "learning_rate": 5.9627006767085244e-06, + "loss": 0.3794, "step": 108565 }, { 
- "epoch": 3.82, - "learning_rate": 6.971482171280846e-06, - "loss": 0.2637, + "epoch": 3.9128554438317655, + "grad_norm": 0.3014092445373535, + "learning_rate": 5.960809347048948e-06, + "loss": 0.3592, "step": 108570 }, { - "epoch": 3.82, - "learning_rate": 6.969508726622565e-06, - "loss": 0.2563, + "epoch": 3.9130356434929903, + "grad_norm": 0.2842918038368225, + "learning_rate": 5.95891827679465e-06, + "loss": 0.3823, "step": 108575 }, { - "epoch": 3.82, - "learning_rate": 6.967535516078325e-06, - "loss": 0.2778, + "epoch": 3.913215843154215, + "grad_norm": 0.27905863523483276, + "learning_rate": 5.957027465971413e-06, + "loss": 0.3583, "step": 108580 }, { - "epoch": 3.82, - "learning_rate": 6.965562539673756e-06, - "loss": 0.2618, + "epoch": 3.9133960428154397, + "grad_norm": 0.27240729331970215, + "learning_rate": 5.955136914604989e-06, + "loss": 0.3527, "step": 108585 }, { - "epoch": 3.82, - "learning_rate": 6.963589797434475e-06, - "loss": 0.2527, + "epoch": 3.913576242476664, + "grad_norm": 0.2016393542289734, + "learning_rate": 5.953246622721137e-06, + "loss": 0.3709, "step": 108590 }, { - "epoch": 3.82, - "learning_rate": 6.9616172893860885e-06, - "loss": 0.2589, + "epoch": 3.9137564421378888, + "grad_norm": 0.24369855225086212, + "learning_rate": 5.951356590345611e-06, + "loss": 0.3883, "step": 108595 }, { - "epoch": 3.82, - "learning_rate": 6.959645015554217e-06, - "loss": 0.2867, + "epoch": 3.9139366417991135, + "grad_norm": 0.20250004529953003, + "learning_rate": 5.94946681750416e-06, + "loss": 0.3649, "step": 108600 }, { - "epoch": 3.82, - "learning_rate": 6.957672975964468e-06, - "loss": 0.2487, + "epoch": 3.914116841460338, + "grad_norm": 0.26689743995666504, + "learning_rate": 5.9475773042225455e-06, + "loss": 0.3699, "step": 108605 }, { - "epoch": 3.82, - "learning_rate": 6.955701170642445e-06, - "loss": 0.267, + "epoch": 3.9142970411215625, + "grad_norm": 0.24879872798919678, + "learning_rate": 5.945688050526496e-06, + "loss": 0.3921, "step": 108610 }, { - "epoch": 3.82, - "learning_rate": 6.953729599613743e-06, - "loss": 0.2538, + "epoch": 3.9144772407827872, + "grad_norm": 0.20798632502555847, + "learning_rate": 5.943799056441751e-06, + "loss": 0.3478, "step": 108615 }, { - "epoch": 3.82, - "learning_rate": 6.951758262903976e-06, - "loss": 0.2294, + "epoch": 3.914657440444012, + "grad_norm": 0.2264632135629654, + "learning_rate": 5.941910321994063e-06, + "loss": 0.3668, "step": 108620 }, { - "epoch": 3.82, - "learning_rate": 6.949787160538729e-06, - "loss": 0.2474, + "epoch": 3.9148376401052367, + "grad_norm": 0.26185837388038635, + "learning_rate": 5.940021847209157e-06, + "loss": 0.3667, "step": 108625 }, { - "epoch": 3.82, - "learning_rate": 6.947816292543605e-06, - "loss": 0.2506, + "epoch": 3.9150178397664614, + "grad_norm": 0.20407342910766602, + "learning_rate": 5.93813363211276e-06, + "loss": 0.3491, "step": 108630 }, { - "epoch": 3.82, - "learning_rate": 6.945845658944183e-06, - "loss": 0.2478, + "epoch": 3.9151980394276857, + "grad_norm": 0.21757426857948303, + "learning_rate": 5.936245676730603e-06, + "loss": 0.3534, "step": 108635 }, { - "epoch": 3.82, - "learning_rate": 6.943875259766064e-06, - "loss": 0.295, + "epoch": 3.9153782390889105, + "grad_norm": 0.23258854448795319, + "learning_rate": 5.934357981088403e-06, + "loss": 0.4074, "step": 108640 }, { - "epoch": 3.82, - "learning_rate": 6.941905095034826e-06, - "loss": 0.2573, + "epoch": 3.915558438750135, + "grad_norm": 0.31023046374320984, + "learning_rate": 5.932470545211891e-06, + "loss": 0.4156, 
"step": 108645 }, { - "epoch": 3.82, - "learning_rate": 6.9399351647760485e-06, - "loss": 0.253, + "epoch": 3.91573863841136, + "grad_norm": 0.2955925166606903, + "learning_rate": 5.930583369126774e-06, + "loss": 0.4184, "step": 108650 }, { - "epoch": 3.82, - "learning_rate": 6.937965469015304e-06, - "loss": 0.2792, + "epoch": 3.9159188380725842, + "grad_norm": 0.25111132860183716, + "learning_rate": 5.928696452858768e-06, + "loss": 0.3435, "step": 108655 }, { - "epoch": 3.82, - "learning_rate": 6.935996007778184e-06, - "loss": 0.251, + "epoch": 3.916099037733809, + "grad_norm": 0.25099194049835205, + "learning_rate": 5.92680979643358e-06, + "loss": 0.3505, "step": 108660 }, { - "epoch": 3.82, - "learning_rate": 6.93402678109025e-06, - "loss": 0.2515, + "epoch": 3.9162792373950337, + "grad_norm": 0.23228740692138672, + "learning_rate": 5.924923399876908e-06, + "loss": 0.3887, "step": 108665 }, { - "epoch": 3.82, - "learning_rate": 6.932057788977062e-06, - "loss": 0.264, + "epoch": 3.9164594370562584, + "grad_norm": 0.26973646879196167, + "learning_rate": 5.9230372632144705e-06, + "loss": 0.3786, "step": 108670 }, { - "epoch": 3.82, - "learning_rate": 6.930089031464207e-06, - "loss": 0.2479, + "epoch": 3.916639636717483, + "grad_norm": 0.2194148302078247, + "learning_rate": 5.921151386471962e-06, + "loss": 0.3936, "step": 108675 }, { - "epoch": 3.82, - "learning_rate": 6.928120508577229e-06, - "loss": 0.2568, + "epoch": 3.9168198363787075, + "grad_norm": 0.21613870561122894, + "learning_rate": 5.919265769675059e-06, + "loss": 0.3644, "step": 108680 }, { - "epoch": 3.82, - "learning_rate": 6.926152220341703e-06, - "loss": 0.2518, + "epoch": 3.917000036039932, + "grad_norm": 0.2264699786901474, + "learning_rate": 5.917380412849474e-06, + "loss": 0.3825, "step": 108685 }, { - "epoch": 3.82, - "learning_rate": 6.924184166783182e-06, - "loss": 0.2506, + "epoch": 3.917180235701157, + "grad_norm": 0.21265465021133423, + "learning_rate": 5.9154953160208776e-06, + "loss": 0.3764, "step": 108690 }, { - "epoch": 3.82, - "learning_rate": 6.922216347927208e-06, - "loss": 0.2643, + "epoch": 3.9173604353623817, + "grad_norm": 0.2905406653881073, + "learning_rate": 5.913610479214976e-06, + "loss": 0.3926, "step": 108695 }, { - "epoch": 3.82, - "learning_rate": 6.920248763799348e-06, - "loss": 0.2458, + "epoch": 3.917540635023606, + "grad_norm": 0.30452674627304077, + "learning_rate": 5.911725902457432e-06, + "loss": 0.3959, "step": 108700 }, { - "epoch": 3.82, - "learning_rate": 6.918281414425143e-06, - "loss": 0.2576, + "epoch": 3.9177208346848307, + "grad_norm": 0.20968550443649292, + "learning_rate": 5.909841585773917e-06, + "loss": 0.3671, "step": 108705 }, { - "epoch": 3.82, - "learning_rate": 6.916314299830129e-06, - "loss": 0.2654, + "epoch": 3.9179010343460554, + "grad_norm": 0.20306989550590515, + "learning_rate": 5.9079575291901254e-06, + "loss": 0.3766, "step": 108710 }, { - "epoch": 3.82, - "learning_rate": 6.914347420039865e-06, - "loss": 0.2416, + "epoch": 3.91808123400728, + "grad_norm": 0.20528307557106018, + "learning_rate": 5.906073732731712e-06, + "loss": 0.3609, "step": 108715 }, { - "epoch": 3.83, - "learning_rate": 6.912380775079882e-06, - "loss": 0.2236, + "epoch": 3.918261433668505, + "grad_norm": 0.32778772711753845, + "learning_rate": 5.90419019642435e-06, + "loss": 0.3861, "step": 108720 }, { - "epoch": 3.83, - "learning_rate": 6.9104143649757065e-06, - "loss": 0.2305, + "epoch": 3.9184416333297296, + "grad_norm": 0.25821900367736816, + "learning_rate": 5.9023069202936984e-06, + 
"loss": 0.3839, "step": 108725 }, { - "epoch": 3.83, - "learning_rate": 6.9084481897528835e-06, - "loss": 0.2546, + "epoch": 3.918621832990954, + "grad_norm": 0.2765897810459137, + "learning_rate": 5.90042390436541e-06, + "loss": 0.3741, "step": 108730 }, { - "epoch": 3.83, - "learning_rate": 6.906482249436933e-06, - "loss": 0.2442, + "epoch": 3.9188020326521786, + "grad_norm": 0.22229650616645813, + "learning_rate": 5.898541148665154e-06, + "loss": 0.3783, "step": 108735 }, { - "epoch": 3.83, - "learning_rate": 6.904516544053394e-06, - "loss": 0.2563, + "epoch": 3.9189822323134034, + "grad_norm": 0.32153648138046265, + "learning_rate": 5.896658653218578e-06, + "loss": 0.3865, "step": 108740 }, { - "epoch": 3.83, - "learning_rate": 6.902551073627783e-06, - "loss": 0.2573, + "epoch": 3.9191624319746277, + "grad_norm": 0.23668645322322845, + "learning_rate": 5.8947764180513316e-06, + "loss": 0.3962, "step": 108745 }, { - "epoch": 3.83, - "learning_rate": 6.9005858381856105e-06, - "loss": 0.2586, + "epoch": 3.9193426316358524, + "grad_norm": 0.26345136761665344, + "learning_rate": 5.892894443189056e-06, + "loss": 0.3455, "step": 108750 }, { - "epoch": 3.83, - "learning_rate": 6.898620837752412e-06, - "loss": 0.2463, + "epoch": 3.919522831297077, + "grad_norm": 0.23843061923980713, + "learning_rate": 5.891012728657394e-06, + "loss": 0.3981, "step": 108755 }, { - "epoch": 3.83, - "learning_rate": 6.896656072353691e-06, - "loss": 0.2651, + "epoch": 3.919703030958302, + "grad_norm": 0.25555703043937683, + "learning_rate": 5.889131274481977e-06, + "loss": 0.3639, "step": 108760 }, { - "epoch": 3.83, - "learning_rate": 6.894691542014961e-06, - "loss": 0.2895, + "epoch": 3.9198832306195266, + "grad_norm": 0.18954876065254211, + "learning_rate": 5.887250080688453e-06, + "loss": 0.3554, "step": 108765 }, { - "epoch": 3.83, - "learning_rate": 6.892727246761721e-06, - "loss": 0.2465, + "epoch": 3.9200634302807513, + "grad_norm": 0.2582615613937378, + "learning_rate": 5.885369147302447e-06, + "loss": 0.4017, "step": 108770 }, { - "epoch": 3.83, - "learning_rate": 6.890763186619495e-06, - "loss": 0.2639, + "epoch": 3.9202436299419756, + "grad_norm": 0.20174746215343475, + "learning_rate": 5.883488474349586e-06, + "loss": 0.381, "step": 108775 }, { - "epoch": 3.83, - "learning_rate": 6.888799361613771e-06, - "loss": 0.2673, + "epoch": 3.9204238296032003, + "grad_norm": 0.2733262777328491, + "learning_rate": 5.881608061855492e-06, + "loss": 0.39, "step": 108780 }, { - "epoch": 3.83, - "learning_rate": 6.8868357717700446e-06, - "loss": 0.2721, + "epoch": 3.920604029264425, + "grad_norm": 0.21888548135757446, + "learning_rate": 5.879727909845781e-06, + "loss": 0.3547, "step": 108785 }, { - "epoch": 3.83, - "learning_rate": 6.884872417113822e-06, - "loss": 0.2598, + "epoch": 3.9207842289256494, + "grad_norm": 0.26759272813796997, + "learning_rate": 5.8778480183460885e-06, + "loss": 0.4001, "step": 108790 }, { - "epoch": 3.83, - "learning_rate": 6.882909297670598e-06, - "loss": 0.2458, + "epoch": 3.920964428586874, + "grad_norm": 0.26541879773139954, + "learning_rate": 5.875968387382008e-06, + "loss": 0.342, "step": 108795 }, { - "epoch": 3.83, - "learning_rate": 6.880946413465858e-06, - "loss": 0.2658, + "epoch": 3.921144628248099, + "grad_norm": 0.2672159671783447, + "learning_rate": 5.874089016979151e-06, + "loss": 0.3564, "step": 108800 }, { - "epoch": 3.83, - "learning_rate": 6.878983764525085e-06, - "loss": 0.2494, + "epoch": 3.9213248279093236, + "grad_norm": 0.18863865733146667, + "learning_rate": 
5.872209907163132e-06, + "loss": 0.3994, "step": 108805 }, { - "epoch": 3.83, - "learning_rate": 6.87702135087376e-06, - "loss": 0.256, + "epoch": 3.9215050275705483, + "grad_norm": 0.2630614638328552, + "learning_rate": 5.870331057959552e-06, + "loss": 0.3765, "step": 108810 }, { - "epoch": 3.83, - "learning_rate": 6.875059172537379e-06, - "loss": 0.2424, + "epoch": 3.921685227231773, + "grad_norm": 0.2330998033285141, + "learning_rate": 5.868452469394006e-06, + "loss": 0.3701, "step": 108815 }, { - "epoch": 3.83, - "learning_rate": 6.873097229541406e-06, - "loss": 0.2528, + "epoch": 3.9218654268929973, + "grad_norm": 0.20276689529418945, + "learning_rate": 5.86657414149209e-06, + "loss": 0.3582, "step": 108820 }, { - "epoch": 3.83, - "learning_rate": 6.871135521911315e-06, - "loss": 0.2658, + "epoch": 3.922045626554222, + "grad_norm": 0.2714731991291046, + "learning_rate": 5.864696074279394e-06, + "loss": 0.3724, "step": 108825 }, { - "epoch": 3.83, - "learning_rate": 6.869174049672589e-06, - "loss": 0.2566, + "epoch": 3.922225826215447, + "grad_norm": 0.2654849886894226, + "learning_rate": 5.862818267781514e-06, + "loss": 0.3798, "step": 108830 }, { - "epoch": 3.83, - "learning_rate": 6.8672128128506874e-06, - "loss": 0.2662, + "epoch": 3.922406025876671, + "grad_norm": 0.18133863806724548, + "learning_rate": 5.860940722024027e-06, + "loss": 0.4061, "step": 108835 }, { - "epoch": 3.83, - "learning_rate": 6.865251811471071e-06, - "loss": 0.2603, + "epoch": 3.922586225537896, + "grad_norm": 0.2509717345237732, + "learning_rate": 5.8590634370325224e-06, + "loss": 0.4112, "step": 108840 }, { - "epoch": 3.83, - "learning_rate": 6.863291045559209e-06, - "loss": 0.2534, + "epoch": 3.9227664251991206, + "grad_norm": 0.23945482075214386, + "learning_rate": 5.857186412832569e-06, + "loss": 0.3412, "step": 108845 }, { - "epoch": 3.83, - "learning_rate": 6.861330515140568e-06, - "loss": 0.257, + "epoch": 3.9229466248603453, + "grad_norm": 0.30383405089378357, + "learning_rate": 5.855309649449739e-06, + "loss": 0.4108, "step": 108850 }, { - "epoch": 3.83, - "learning_rate": 6.859370220240594e-06, - "loss": 0.2316, + "epoch": 3.92312682452157, + "grad_norm": 0.20874951779842377, + "learning_rate": 5.853433146909618e-06, + "loss": 0.4059, "step": 108855 }, { - "epoch": 3.83, - "learning_rate": 6.857410160884745e-06, - "loss": 0.2485, + "epoch": 3.9233070241827948, + "grad_norm": 0.3230457901954651, + "learning_rate": 5.851556905237768e-06, + "loss": 0.4288, "step": 108860 }, { - "epoch": 3.83, - "learning_rate": 6.8554503370984595e-06, - "loss": 0.2623, + "epoch": 3.923487223844019, + "grad_norm": 0.28123703598976135, + "learning_rate": 5.849680924459733e-06, + "loss": 0.4122, "step": 108865 }, { - "epoch": 3.83, - "learning_rate": 6.853490748907204e-06, - "loss": 0.264, + "epoch": 3.923667423505244, + "grad_norm": 0.2393689751625061, + "learning_rate": 5.847805204601095e-06, + "loss": 0.3825, "step": 108870 }, { - "epoch": 3.83, - "learning_rate": 6.8515313963364105e-06, - "loss": 0.2575, + "epoch": 3.9238476231664685, + "grad_norm": 0.23288077116012573, + "learning_rate": 5.845929745687398e-06, + "loss": 0.3605, "step": 108875 }, { - "epoch": 3.83, - "learning_rate": 6.84957227941152e-06, - "loss": 0.269, + "epoch": 3.9240278228276932, + "grad_norm": 0.23712435364723206, + "learning_rate": 5.844054547744204e-06, + "loss": 0.3832, "step": 108880 }, { - "epoch": 3.83, - "learning_rate": 6.847613398157965e-06, - "loss": 0.2572, + "epoch": 3.9242080224889175, + "grad_norm": 0.23456718027591705, + 
"learning_rate": 5.842179610797066e-06, + "loss": 0.3643, "step": 108885 }, { - "epoch": 3.83, - "learning_rate": 6.845654752601197e-06, - "loss": 0.2508, + "epoch": 3.9243882221501423, + "grad_norm": 0.2397494912147522, + "learning_rate": 5.84030493487151e-06, + "loss": 0.3329, "step": 108890 }, { - "epoch": 3.83, - "learning_rate": 6.843696342766628e-06, - "loss": 0.247, + "epoch": 3.924568421811367, + "grad_norm": 0.22742457687854767, + "learning_rate": 5.838430519993096e-06, + "loss": 0.3847, "step": 108895 }, { - "epoch": 3.83, - "learning_rate": 6.841738168679704e-06, - "loss": 0.2632, + "epoch": 3.9247486214725917, + "grad_norm": 0.21573381125926971, + "learning_rate": 5.836556366187354e-06, + "loss": 0.3471, "step": 108900 }, { - "epoch": 3.83, - "learning_rate": 6.839780230365836e-06, - "loss": 0.2564, + "epoch": 3.9249288211338165, + "grad_norm": 0.2524566650390625, + "learning_rate": 5.834682473479824e-06, + "loss": 0.35, "step": 108905 }, { - "epoch": 3.83, - "learning_rate": 6.837822527850463e-06, - "loss": 0.2442, + "epoch": 3.9251090207950408, + "grad_norm": 0.22824294865131378, + "learning_rate": 5.832808841896034e-06, + "loss": 0.3783, "step": 108910 }, { - "epoch": 3.83, - "learning_rate": 6.835865061158994e-06, - "loss": 0.2493, + "epoch": 3.9252892204562655, + "grad_norm": 0.2146436721086502, + "learning_rate": 5.830935471461504e-06, + "loss": 0.3896, "step": 108915 }, { - "epoch": 3.83, - "learning_rate": 6.833907830316846e-06, - "loss": 0.2528, + "epoch": 3.9254694201174902, + "grad_norm": 0.2600581645965576, + "learning_rate": 5.829062362201776e-06, + "loss": 0.3775, "step": 108920 }, { - "epoch": 3.83, - "learning_rate": 6.831950835349427e-06, - "loss": 0.2608, + "epoch": 3.925649619778715, + "grad_norm": 0.3017957806587219, + "learning_rate": 5.827189514142361e-06, + "loss": 0.4071, "step": 108925 }, { - "epoch": 3.83, - "learning_rate": 6.829994076282159e-06, - "loss": 0.2436, + "epoch": 3.9258298194399393, + "grad_norm": 0.2875199615955353, + "learning_rate": 5.8253169273087774e-06, + "loss": 0.4051, "step": 108930 }, { - "epoch": 3.83, - "learning_rate": 6.828037553140443e-06, - "loss": 0.2438, + "epoch": 3.926010019101164, + "grad_norm": 0.22084161639213562, + "learning_rate": 5.823444601726538e-06, + "loss": 0.3576, "step": 108935 }, { - "epoch": 3.83, - "learning_rate": 6.826081265949677e-06, - "loss": 0.2773, + "epoch": 3.9261902187623887, + "grad_norm": 0.19947287440299988, + "learning_rate": 5.8215725374211525e-06, + "loss": 0.3679, "step": 108940 }, { - "epoch": 3.83, - "learning_rate": 6.824125214735274e-06, - "loss": 0.2594, + "epoch": 3.9263704184236135, + "grad_norm": 0.2296856790781021, + "learning_rate": 5.819700734418121e-06, + "loss": 0.4062, "step": 108945 }, { - "epoch": 3.83, - "learning_rate": 6.822169399522624e-06, - "loss": 0.2415, + "epoch": 3.926550618084838, + "grad_norm": 0.258223295211792, + "learning_rate": 5.817829192742963e-06, + "loss": 0.3719, "step": 108950 }, { - "epoch": 3.83, - "learning_rate": 6.820213820337129e-06, - "loss": 0.272, + "epoch": 3.9267308177460625, + "grad_norm": 0.27443790435791016, + "learning_rate": 5.8159579124211665e-06, + "loss": 0.3654, "step": 108955 }, { - "epoch": 3.83, - "learning_rate": 6.81825847720417e-06, - "loss": 0.2607, + "epoch": 3.926911017407287, + "grad_norm": 0.23533061146736145, + "learning_rate": 5.8140868934782295e-06, + "loss": 0.3518, "step": 108960 }, { - "epoch": 3.83, - "learning_rate": 6.816303370149149e-06, - "loss": 0.2469, + "epoch": 3.927091217068512, + "grad_norm": 
0.21623454988002777, + "learning_rate": 5.812216135939644e-06, + "loss": 0.3867, "step": 108965 }, { - "epoch": 3.83, - "learning_rate": 6.814348499197445e-06, - "loss": 0.2684, + "epoch": 3.9272714167297367, + "grad_norm": 0.2719837725162506, + "learning_rate": 5.810345639830891e-06, + "loss": 0.3659, "step": 108970 }, { - "epoch": 3.83, - "learning_rate": 6.812393864374442e-06, - "loss": 0.2724, + "epoch": 3.927451616390961, + "grad_norm": 0.2205159217119217, + "learning_rate": 5.80847540517748e-06, + "loss": 0.3625, "step": 108975 }, { - "epoch": 3.83, - "learning_rate": 6.810439465705512e-06, - "loss": 0.2911, + "epoch": 3.9276318160521857, + "grad_norm": 0.23744551837444305, + "learning_rate": 5.806605432004866e-06, + "loss": 0.3772, "step": 108980 }, { - "epoch": 3.83, - "learning_rate": 6.808485303216044e-06, - "loss": 0.2469, + "epoch": 3.9278120157134104, + "grad_norm": 0.235315203666687, + "learning_rate": 5.804735720338531e-06, + "loss": 0.3935, "step": 108985 }, { - "epoch": 3.83, - "learning_rate": 6.806531376931408e-06, - "loss": 0.2637, + "epoch": 3.927992215374635, + "grad_norm": 0.26385051012039185, + "learning_rate": 5.802866270203964e-06, + "loss": 0.3864, "step": 108990 }, { - "epoch": 3.83, - "learning_rate": 6.80457768687697e-06, - "loss": 0.2581, + "epoch": 3.92817241503586, + "grad_norm": 0.234380304813385, + "learning_rate": 5.800997081626619e-06, + "loss": 0.3736, "step": 108995 }, { - "epoch": 3.83, - "learning_rate": 6.802624233078092e-06, - "loss": 0.2617, + "epoch": 3.9283526146970846, + "grad_norm": 0.21200546622276306, + "learning_rate": 5.799128154631986e-06, + "loss": 0.3632, "step": 109000 }, { - "epoch": 3.83, - "eval_loss": 0.2511043846607208, - "eval_runtime": 10.5525, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 3.9283526146970846, + "eval_loss": 0.42800185084342957, + "eval_runtime": 3.5264, + "eval_samples_per_second": 28.357, + "eval_steps_per_second": 7.089, "step": 109000 }, { - "epoch": 3.84, - "learning_rate": 6.800671015560145e-06, - "loss": 0.2452, + "epoch": 3.928532814358309, + "grad_norm": 0.24245581030845642, + "learning_rate": 5.797259489245508e-06, + "loss": 0.3871, "step": 109005 }, { - "epoch": 3.84, - "learning_rate": 6.7987180343485005e-06, - "loss": 0.25, + "epoch": 3.9287130140195337, + "grad_norm": 0.29158252477645874, + "learning_rate": 5.795391085492644e-06, + "loss": 0.4042, "step": 109010 }, { - "epoch": 3.84, - "learning_rate": 6.796765289468507e-06, - "loss": 0.26, + "epoch": 3.9288932136807584, + "grad_norm": 0.2737109959125519, + "learning_rate": 5.793522943398863e-06, + "loss": 0.4132, "step": 109015 }, { - "epoch": 3.84, - "learning_rate": 6.7948127809455115e-06, - "loss": 0.2422, + "epoch": 3.9290734133419827, + "grad_norm": 0.255136638879776, + "learning_rate": 5.791655062989618e-06, + "loss": 0.3969, "step": 109020 }, { - "epoch": 3.84, - "learning_rate": 6.792860508804882e-06, - "loss": 0.2714, + "epoch": 3.9292536130032074, + "grad_norm": 0.2431129515171051, + "learning_rate": 5.789787444290351e-06, + "loss": 0.3924, "step": 109025 }, { - "epoch": 3.84, - "learning_rate": 6.790908473071961e-06, - "loss": 0.3057, + "epoch": 3.929433812664432, + "grad_norm": 0.27715519070625305, + "learning_rate": 5.787920087326512e-06, + "loss": 0.3818, "step": 109030 }, { - "epoch": 3.84, - "learning_rate": 6.7889566737720896e-06, - "loss": 0.2673, + "epoch": 3.929614012325657, + "grad_norm": 0.24491608142852783, + "learning_rate": 5.786052992123533e-06, + "loss": 0.3742, "step": 109035 }, { - 
"epoch": 3.84, - "learning_rate": 6.787005110930611e-06, - "loss": 0.2768, + "epoch": 3.9297942119868816, + "grad_norm": 0.21611642837524414, + "learning_rate": 5.784186158706872e-06, + "loss": 0.3943, "step": 109040 }, { - "epoch": 3.84, - "learning_rate": 6.785053784572873e-06, - "loss": 0.2606, + "epoch": 3.9299744116481063, + "grad_norm": 0.2787199914455414, + "learning_rate": 5.782319587101953e-06, + "loss": 0.3888, "step": 109045 }, { - "epoch": 3.84, - "learning_rate": 6.783102694724211e-06, - "loss": 0.2553, + "epoch": 3.9301546113093306, + "grad_norm": 0.26569807529449463, + "learning_rate": 5.780453277334208e-06, + "loss": 0.3812, "step": 109050 }, { - "epoch": 3.84, - "learning_rate": 6.781151841409944e-06, - "loss": 0.2538, + "epoch": 3.9303348109705554, + "grad_norm": 0.25157374143600464, + "learning_rate": 5.778587229429069e-06, + "loss": 0.4124, "step": 109055 }, { - "epoch": 3.84, - "learning_rate": 6.7792012246554235e-06, - "loss": 0.2512, + "epoch": 3.93051501063178, + "grad_norm": 0.21944718062877655, + "learning_rate": 5.776721443411956e-06, + "loss": 0.3392, "step": 109060 }, { - "epoch": 3.84, - "learning_rate": 6.7772508444859575e-06, - "loss": 0.2475, + "epoch": 3.9306952102930044, + "grad_norm": 0.25366106629371643, + "learning_rate": 5.774855919308284e-06, + "loss": 0.4012, "step": 109065 }, { - "epoch": 3.84, - "learning_rate": 6.77530070092689e-06, - "loss": 0.2504, + "epoch": 3.930875409954229, + "grad_norm": 0.24010884761810303, + "learning_rate": 5.772990657143492e-06, + "loss": 0.3748, "step": 109070 }, { - "epoch": 3.84, - "learning_rate": 6.773350794003522e-06, - "loss": 0.2552, + "epoch": 3.931055609615454, + "grad_norm": 0.2395809143781662, + "learning_rate": 5.771125656942966e-06, + "loss": 0.3855, "step": 109075 }, { - "epoch": 3.84, - "learning_rate": 6.771401123741192e-06, - "loss": 0.2618, + "epoch": 3.9312358092766786, + "grad_norm": 0.22961130738258362, + "learning_rate": 5.769260918732139e-06, + "loss": 0.3671, "step": 109080 }, { - "epoch": 3.84, - "learning_rate": 6.7694516901652045e-06, - "loss": 0.2591, + "epoch": 3.9314160089379033, + "grad_norm": 0.21910971403121948, + "learning_rate": 5.767396442536404e-06, + "loss": 0.3404, "step": 109085 }, { - "epoch": 3.84, - "learning_rate": 6.767502493300873e-06, - "loss": 0.2501, + "epoch": 3.931596208599128, + "grad_norm": 0.2396189421415329, + "learning_rate": 5.765532228381174e-06, + "loss": 0.404, "step": 109090 }, { - "epoch": 3.84, - "learning_rate": 6.765553533173497e-06, - "loss": 0.2763, + "epoch": 3.9317764082603524, + "grad_norm": 0.25804564356803894, + "learning_rate": 5.763668276291842e-06, + "loss": 0.3575, "step": 109095 }, { - "epoch": 3.84, - "learning_rate": 6.7636048098084e-06, - "loss": 0.2461, + "epoch": 3.931956607921577, + "grad_norm": 0.2125440388917923, + "learning_rate": 5.761804586293798e-06, + "loss": 0.3658, "step": 109100 }, { - "epoch": 3.84, - "learning_rate": 6.761656323230875e-06, - "loss": 0.2704, + "epoch": 3.932136807582802, + "grad_norm": 0.20259714126586914, + "learning_rate": 5.759941158412449e-06, + "loss": 0.3936, "step": 109105 }, { - "epoch": 3.84, - "learning_rate": 6.759708073466217e-06, - "loss": 0.2636, + "epoch": 3.932317007244026, + "grad_norm": 0.26884329319000244, + "learning_rate": 5.758077992673175e-06, + "loss": 0.3642, "step": 109110 }, { - "epoch": 3.84, - "learning_rate": 6.7577600605397365e-06, - "loss": 0.2704, + "epoch": 3.932497206905251, + "grad_norm": 0.22424453496932983, + "learning_rate": 5.756215089101366e-06, + "loss": 0.404, 
"step": 109115 }, { - "epoch": 3.84, - "learning_rate": 6.755812284476712e-06, - "loss": 0.2373, + "epoch": 3.9326774065664756, + "grad_norm": 0.3316144645214081, + "learning_rate": 5.7543524477224e-06, + "loss": 0.3845, "step": 109120 }, { - "epoch": 3.84, - "learning_rate": 6.753864745302449e-06, - "loss": 0.2521, + "epoch": 3.9328576062277003, + "grad_norm": 0.2477588653564453, + "learning_rate": 5.752490068561653e-06, + "loss": 0.3847, "step": 109125 }, { - "epoch": 3.84, - "learning_rate": 6.7519174430422275e-06, - "loss": 0.2599, + "epoch": 3.933037805888925, + "grad_norm": 0.21736137568950653, + "learning_rate": 5.750627951644496e-06, + "loss": 0.3864, "step": 109130 }, { - "epoch": 3.84, - "learning_rate": 6.749970377721323e-06, - "loss": 0.2376, + "epoch": 3.9332180055501498, + "grad_norm": 0.253451406955719, + "learning_rate": 5.748766096996316e-06, + "loss": 0.3711, "step": 109135 }, { - "epoch": 3.84, - "learning_rate": 6.748023549365034e-06, - "loss": 0.2462, + "epoch": 3.933398205211374, + "grad_norm": 0.2794763147830963, + "learning_rate": 5.74690450464247e-06, + "loss": 0.3803, "step": 109140 }, { - "epoch": 3.84, - "learning_rate": 6.746076957998631e-06, - "loss": 0.2321, + "epoch": 3.933578404872599, + "grad_norm": 0.24337390065193176, + "learning_rate": 5.745043174608322e-06, + "loss": 0.3818, "step": 109145 }, { - "epoch": 3.84, - "learning_rate": 6.744130603647392e-06, - "loss": 0.2397, + "epoch": 3.9337586045338235, + "grad_norm": 0.2820989489555359, + "learning_rate": 5.7431821069192345e-06, + "loss": 0.4063, "step": 109150 }, { - "epoch": 3.84, - "learning_rate": 6.742184486336576e-06, - "loss": 0.2366, + "epoch": 3.9339388041950483, + "grad_norm": 0.27026164531707764, + "learning_rate": 5.741321301600553e-06, + "loss": 0.3897, "step": 109155 }, { - "epoch": 3.84, - "learning_rate": 6.740238606091468e-06, - "loss": 0.2545, + "epoch": 3.9341190038562726, + "grad_norm": 0.24505634605884552, + "learning_rate": 5.7394607586776475e-06, + "loss": 0.3734, "step": 109160 }, { - "epoch": 3.84, - "learning_rate": 6.738292962937323e-06, - "loss": 0.2597, + "epoch": 3.9342992035174973, + "grad_norm": 0.2610364258289337, + "learning_rate": 5.73760047817587e-06, + "loss": 0.3896, "step": 109165 }, { - "epoch": 3.84, - "learning_rate": 6.736347556899414e-06, - "loss": 0.2432, + "epoch": 3.934479403178722, + "grad_norm": 0.2157258540391922, + "learning_rate": 5.735740460120539e-06, + "loss": 0.3791, "step": 109170 }, { - "epoch": 3.84, - "learning_rate": 6.734402388002992e-06, - "loss": 0.2496, + "epoch": 3.9346596028399468, + "grad_norm": 0.2157232165336609, + "learning_rate": 5.733880704537023e-06, + "loss": 0.3816, "step": 109175 }, { - "epoch": 3.84, - "learning_rate": 6.7324574562733225e-06, - "loss": 0.2766, + "epoch": 3.9348398025011715, + "grad_norm": 0.22246497869491577, + "learning_rate": 5.732021211450647e-06, + "loss": 0.3887, "step": 109180 }, { - "epoch": 3.84, - "learning_rate": 6.730512761735655e-06, - "loss": 0.242, + "epoch": 3.935020002162396, + "grad_norm": 0.1965298354625702, + "learning_rate": 5.730161980886764e-06, + "loss": 0.373, "step": 109185 }, { - "epoch": 3.84, - "learning_rate": 6.728568304415231e-06, - "loss": 0.262, + "epoch": 3.9352002018236205, + "grad_norm": 0.2733939588069916, + "learning_rate": 5.7283030128706865e-06, + "loss": 0.3991, "step": 109190 }, { - "epoch": 3.84, - "learning_rate": 6.726624084337313e-06, - "loss": 0.2766, + "epoch": 3.9353804014848452, + "grad_norm": 0.22759440541267395, + "learning_rate": 5.726444307427742e-06, + 
"loss": 0.3825, "step": 109195 }, { - "epoch": 3.84, - "learning_rate": 6.724680101527139e-06, - "loss": 0.253, + "epoch": 3.93556060114607, + "grad_norm": 0.24440236389636993, + "learning_rate": 5.724585864583271e-06, + "loss": 0.3675, "step": 109200 }, { - "epoch": 3.84, - "learning_rate": 6.722736356009951e-06, - "loss": 0.2514, + "epoch": 3.9357408008072943, + "grad_norm": 0.24098166823387146, + "learning_rate": 5.722727684362583e-06, + "loss": 0.3957, "step": 109205 }, { - "epoch": 3.84, - "learning_rate": 6.720792847810977e-06, - "loss": 0.2806, + "epoch": 3.935921000468519, + "grad_norm": 0.1858464926481247, + "learning_rate": 5.720869766790999e-06, + "loss": 0.3768, "step": 109210 }, { - "epoch": 3.84, - "learning_rate": 6.718849576955471e-06, - "loss": 0.2738, + "epoch": 3.9361012001297437, + "grad_norm": 0.2523440420627594, + "learning_rate": 5.719012111893832e-06, + "loss": 0.3604, "step": 109215 }, { - "epoch": 3.84, - "learning_rate": 6.716906543468657e-06, - "loss": 0.2753, + "epoch": 3.9362813997909685, + "grad_norm": 0.3191049098968506, + "learning_rate": 5.7171547196963854e-06, + "loss": 0.3716, "step": 109220 }, { - "epoch": 3.84, - "learning_rate": 6.714963747375752e-06, - "loss": 0.2665, + "epoch": 3.936461599452193, + "grad_norm": 0.22533096373081207, + "learning_rate": 5.715297590223981e-06, + "loss": 0.3648, "step": 109225 }, { - "epoch": 3.84, - "learning_rate": 6.713021188701993e-06, - "loss": 0.2478, + "epoch": 3.936641799113418, + "grad_norm": 0.23449839651584625, + "learning_rate": 5.713440723501912e-06, + "loss": 0.4066, "step": 109230 }, { - "epoch": 3.84, - "learning_rate": 6.711078867472612e-06, - "loss": 0.2545, + "epoch": 3.9368219987746422, + "grad_norm": 0.2854231894016266, + "learning_rate": 5.711584119555477e-06, + "loss": 0.3489, "step": 109235 }, { - "epoch": 3.84, - "learning_rate": 6.709136783712816e-06, - "loss": 0.2623, + "epoch": 3.937002198435867, + "grad_norm": 0.21907241642475128, + "learning_rate": 5.709727778409976e-06, + "loss": 0.378, "step": 109240 }, { - "epoch": 3.84, - "learning_rate": 6.707194937447825e-06, - "loss": 0.2383, + "epoch": 3.9371823980970917, + "grad_norm": 0.2457626909017563, + "learning_rate": 5.7078717000907e-06, + "loss": 0.3533, "step": 109245 }, { - "epoch": 3.84, - "learning_rate": 6.705253328702846e-06, - "loss": 0.2522, + "epoch": 3.937362597758316, + "grad_norm": 0.24565333127975464, + "learning_rate": 5.706015884622928e-06, + "loss": 0.3805, "step": 109250 }, { - "epoch": 3.84, - "learning_rate": 6.703311957503103e-06, - "loss": 0.2479, + "epoch": 3.9375427974195407, + "grad_norm": 0.23259061574935913, + "learning_rate": 5.704160332031969e-06, + "loss": 0.373, "step": 109255 }, { - "epoch": 3.84, - "learning_rate": 6.7013708238737954e-06, - "loss": 0.2549, + "epoch": 3.9377229970807655, + "grad_norm": 0.2456471472978592, + "learning_rate": 5.702305042343075e-06, + "loss": 0.357, "step": 109260 }, { - "epoch": 3.84, - "learning_rate": 6.699429927840131e-06, - "loss": 0.254, + "epoch": 3.93790319674199, + "grad_norm": 0.27309781312942505, + "learning_rate": 5.700450015581549e-06, + "loss": 0.3559, "step": 109265 }, { - "epoch": 3.84, - "learning_rate": 6.697489269427298e-06, - "loss": 0.2509, + "epoch": 3.938083396403215, + "grad_norm": 0.2628590166568756, + "learning_rate": 5.698595251772651e-06, + "loss": 0.3645, "step": 109270 }, { - "epoch": 3.84, - "learning_rate": 6.695548848660515e-06, - "loss": 0.249, + "epoch": 3.9382635960644397, + "grad_norm": 0.19301342964172363, + "learning_rate": 
5.696740750941651e-06, + "loss": 0.3669, "step": 109275 }, { - "epoch": 3.84, - "learning_rate": 6.693608665564957e-06, - "loss": 0.2779, + "epoch": 3.938443795725664, + "grad_norm": 0.2241153120994568, + "learning_rate": 5.6948865131138344e-06, + "loss": 0.3542, "step": 109280 }, { - "epoch": 3.84, - "learning_rate": 6.691668720165836e-06, - "loss": 0.2439, + "epoch": 3.9386239953868887, + "grad_norm": 0.21480773389339447, + "learning_rate": 5.6930325383144386e-06, + "loss": 0.3573, "step": 109285 }, { - "epoch": 3.85, - "learning_rate": 6.6897290124883216e-06, - "loss": 0.2579, + "epoch": 3.9388041950481134, + "grad_norm": 0.27999842166900635, + "learning_rate": 5.6911788265687435e-06, + "loss": 0.4112, "step": 109290 }, { - "epoch": 3.85, - "learning_rate": 6.687789542557618e-06, - "loss": 0.2478, + "epoch": 3.9389843947093377, + "grad_norm": 0.2639464735984802, + "learning_rate": 5.689325377901997e-06, + "loss": 0.3861, "step": 109295 }, { - "epoch": 3.85, - "learning_rate": 6.6858503103989e-06, - "loss": 0.2513, + "epoch": 3.9391645943705624, + "grad_norm": 0.21820655465126038, + "learning_rate": 5.6874721923394545e-06, + "loss": 0.3692, "step": 109300 }, { - "epoch": 3.85, - "learning_rate": 6.683911316037336e-06, - "loss": 0.2413, + "epoch": 3.939344794031787, + "grad_norm": 0.2508092522621155, + "learning_rate": 5.685619269906364e-06, + "loss": 0.3893, "step": 109305 }, { - "epoch": 3.85, - "learning_rate": 6.681972559498123e-06, - "loss": 0.2327, + "epoch": 3.939524993693012, + "grad_norm": 0.22443386912345886, + "learning_rate": 5.683766610627972e-06, + "loss": 0.3788, "step": 109310 }, { - "epoch": 3.85, - "learning_rate": 6.680034040806421e-06, - "loss": 0.256, + "epoch": 3.9397051933542366, + "grad_norm": 0.23551128804683685, + "learning_rate": 5.681914214529513e-06, + "loss": 0.3793, "step": 109315 }, { - "epoch": 3.85, - "learning_rate": 6.6780957599874075e-06, - "loss": 0.2294, + "epoch": 3.9398853930154614, + "grad_norm": 0.2493138313293457, + "learning_rate": 5.6800620816362395e-06, + "loss": 0.3987, "step": 109320 }, { - "epoch": 3.85, - "learning_rate": 6.676157717066237e-06, - "loss": 0.2734, + "epoch": 3.9400655926766857, + "grad_norm": 0.27576449513435364, + "learning_rate": 5.678210211973378e-06, + "loss": 0.3981, "step": 109325 }, { - "epoch": 3.85, - "learning_rate": 6.6742199120680885e-06, - "loss": 0.2602, + "epoch": 3.9402457923379104, + "grad_norm": 0.2820398509502411, + "learning_rate": 5.676358605566165e-06, + "loss": 0.346, "step": 109330 }, { - "epoch": 3.85, - "learning_rate": 6.672282345018111e-06, - "loss": 0.2599, + "epoch": 3.940425991999135, + "grad_norm": 0.21052910387516022, + "learning_rate": 5.674507262439821e-06, + "loss": 0.3848, "step": 109335 }, { - "epoch": 3.85, - "learning_rate": 6.670345015941476e-06, - "loss": 0.2409, + "epoch": 3.9406061916603594, + "grad_norm": 0.2435581535100937, + "learning_rate": 5.67265618261957e-06, + "loss": 0.3897, "step": 109340 }, { - "epoch": 3.85, - "learning_rate": 6.668407924863324e-06, - "loss": 0.2501, + "epoch": 3.940786391321584, + "grad_norm": 0.23726899921894073, + "learning_rate": 5.670805366130644e-06, + "loss": 0.3792, "step": 109345 }, { - "epoch": 3.85, - "learning_rate": 6.666471071808822e-06, - "loss": 0.2568, + "epoch": 3.940966590982809, + "grad_norm": 0.26930510997772217, + "learning_rate": 5.6689548129982565e-06, + "loss": 0.3823, "step": 109350 }, { - "epoch": 3.85, - "learning_rate": 6.664534456803109e-06, - "loss": 0.2652, + "epoch": 3.9411467906440336, + "grad_norm": 
0.2104661613702774, + "learning_rate": 5.667104523247605e-06, + "loss": 0.4122, "step": 109355 }, { - "epoch": 3.85, - "learning_rate": 6.662598079871335e-06, - "loss": 0.2532, + "epoch": 3.9413269903052583, + "grad_norm": 0.2654925286769867, + "learning_rate": 5.665254496903919e-06, + "loss": 0.3602, "step": 109360 }, { - "epoch": 3.85, - "learning_rate": 6.660661941038632e-06, - "loss": 0.2475, + "epoch": 3.941507189966483, + "grad_norm": 0.26603254675865173, + "learning_rate": 5.663404733992389e-06, + "loss": 0.3944, "step": 109365 }, { - "epoch": 3.85, - "learning_rate": 6.658726040330155e-06, - "loss": 0.2356, + "epoch": 3.9416873896277074, + "grad_norm": 0.25029662251472473, + "learning_rate": 5.6615552345382405e-06, + "loss": 0.3695, "step": 109370 }, { - "epoch": 3.85, - "learning_rate": 6.6567903777710326e-06, - "loss": 0.2341, + "epoch": 3.941867589288932, + "grad_norm": 0.28742098808288574, + "learning_rate": 5.65970599856665e-06, + "loss": 0.3896, "step": 109375 }, { - "epoch": 3.85, - "learning_rate": 6.6548549533863995e-06, - "loss": 0.2409, + "epoch": 3.942047788950157, + "grad_norm": 0.24512720108032227, + "learning_rate": 5.657857026102814e-06, + "loss": 0.3886, "step": 109380 }, { - "epoch": 3.85, - "learning_rate": 6.652919767201377e-06, - "loss": 0.2327, + "epoch": 3.9422279886113816, + "grad_norm": 0.23668235540390015, + "learning_rate": 5.65600831717194e-06, + "loss": 0.3648, "step": 109385 }, { - "epoch": 3.85, - "learning_rate": 6.650984819241099e-06, - "loss": 0.2545, + "epoch": 3.942408188272606, + "grad_norm": 0.22627753019332886, + "learning_rate": 5.654159871799206e-06, + "loss": 0.3937, "step": 109390 }, { - "epoch": 3.85, - "learning_rate": 6.6490501095307e-06, - "loss": 0.2491, + "epoch": 3.9425883879338306, + "grad_norm": 0.2681983709335327, + "learning_rate": 5.652311690009798e-06, + "loss": 0.365, "step": 109395 }, { - "epoch": 3.85, - "learning_rate": 6.647115638095288e-06, - "loss": 0.2579, + "epoch": 3.9427685875950553, + "grad_norm": 0.19054733216762543, + "learning_rate": 5.650463771828898e-06, + "loss": 0.3927, "step": 109400 }, { - "epoch": 3.85, - "learning_rate": 6.6451814049599825e-06, - "loss": 0.2572, + "epoch": 3.94294878725628, + "grad_norm": 0.279351145029068, + "learning_rate": 5.648616117281677e-06, + "loss": 0.3602, "step": 109405 }, { - "epoch": 3.85, - "learning_rate": 6.643247410149903e-06, - "loss": 0.2585, + "epoch": 3.943128986917505, + "grad_norm": 0.20294272899627686, + "learning_rate": 5.64676872639332e-06, + "loss": 0.365, "step": 109410 }, { - "epoch": 3.85, - "learning_rate": 6.641313653690162e-06, - "loss": 0.2468, + "epoch": 3.943309186578729, + "grad_norm": 0.3039834201335907, + "learning_rate": 5.6449215991889934e-06, + "loss": 0.3659, "step": 109415 }, { - "epoch": 3.85, - "learning_rate": 6.639380135605855e-06, - "loss": 0.2684, + "epoch": 3.943489386239954, + "grad_norm": 0.3200457990169525, + "learning_rate": 5.643074735693862e-06, + "loss": 0.3948, "step": 109420 }, { - "epoch": 3.85, - "learning_rate": 6.6374468559221025e-06, - "loss": 0.2608, + "epoch": 3.9436695859011786, + "grad_norm": 0.2976410984992981, + "learning_rate": 5.641228135933091e-06, + "loss": 0.353, "step": 109425 }, { - "epoch": 3.85, - "learning_rate": 6.635513814664004e-06, - "loss": 0.255, + "epoch": 3.9438497855624033, + "grad_norm": 0.23301257193088531, + "learning_rate": 5.6393817999318365e-06, + "loss": 0.3733, "step": 109430 }, { - "epoch": 3.85, - "learning_rate": 6.6335810118566545e-06, - "loss": 0.2507, + "epoch": 3.9440299852236276, + 
"grad_norm": 0.2382785677909851, + "learning_rate": 5.637535727715251e-06, + "loss": 0.3598, "step": 109435 }, { - "epoch": 3.85, - "learning_rate": 6.631648447525144e-06, - "loss": 0.2536, + "epoch": 3.9442101848848523, + "grad_norm": 0.2509336471557617, + "learning_rate": 5.635689919308498e-06, + "loss": 0.3935, "step": 109440 }, { - "epoch": 3.85, - "learning_rate": 6.6297161216945735e-06, - "loss": 0.2414, + "epoch": 3.944390384546077, + "grad_norm": 0.2762444317340851, + "learning_rate": 5.633844374736724e-06, + "loss": 0.4127, "step": 109445 }, { - "epoch": 3.85, - "learning_rate": 6.6277840343900415e-06, - "loss": 0.2569, + "epoch": 3.9445705842073018, + "grad_norm": 0.2096366137266159, + "learning_rate": 5.6319990940250696e-06, + "loss": 0.393, "step": 109450 }, { - "epoch": 3.85, - "learning_rate": 6.625852185636625e-06, - "loss": 0.2659, + "epoch": 3.9447507838685265, + "grad_norm": 0.23750926554203033, + "learning_rate": 5.630154077198677e-06, + "loss": 0.371, "step": 109455 }, { - "epoch": 3.85, - "learning_rate": 6.623920575459403e-06, - "loss": 0.2423, + "epoch": 3.944930983529751, + "grad_norm": 0.22931966185569763, + "learning_rate": 5.628309324282676e-06, + "loss": 0.3814, "step": 109460 }, { - "epoch": 3.85, - "learning_rate": 6.6219892038834666e-06, - "loss": 0.2552, + "epoch": 3.9451111831909755, + "grad_norm": 0.23210839927196503, + "learning_rate": 5.626464835302228e-06, + "loss": 0.3851, "step": 109465 }, { - "epoch": 3.85, - "learning_rate": 6.62005807093389e-06, - "loss": 0.2629, + "epoch": 3.9452913828522003, + "grad_norm": 0.3315199911594391, + "learning_rate": 5.62462061028243e-06, + "loss": 0.3627, "step": 109470 }, { - "epoch": 3.85, - "learning_rate": 6.618127176635747e-06, - "loss": 0.2408, + "epoch": 3.945471582513425, + "grad_norm": 0.20502960681915283, + "learning_rate": 5.6227766492484315e-06, + "loss": 0.391, "step": 109475 }, { - "epoch": 3.85, - "learning_rate": 6.6161965210141e-06, - "loss": 0.2507, + "epoch": 3.9456517821746493, + "grad_norm": 0.23437687754631042, + "learning_rate": 5.62093295222535e-06, + "loss": 0.3435, "step": 109480 }, { - "epoch": 3.85, - "learning_rate": 6.614266104094035e-06, - "loss": 0.2448, + "epoch": 3.945831981835874, + "grad_norm": 0.2661973237991333, + "learning_rate": 5.619089519238305e-06, + "loss": 0.3835, "step": 109485 }, { - "epoch": 3.85, - "learning_rate": 6.612335925900606e-06, - "loss": 0.2761, + "epoch": 3.9460121814970988, + "grad_norm": 0.2659934163093567, + "learning_rate": 5.617246350312414e-06, + "loss": 0.4016, "step": 109490 }, { - "epoch": 3.85, - "learning_rate": 6.6104059864588705e-06, - "loss": 0.2613, + "epoch": 3.9461923811583235, + "grad_norm": 0.24436159431934357, + "learning_rate": 5.6154034454727885e-06, + "loss": 0.3518, "step": 109495 }, { - "epoch": 3.85, - "learning_rate": 6.608476285793902e-06, - "loss": 0.2673, + "epoch": 3.9463725808195482, + "grad_norm": 0.23105373978614807, + "learning_rate": 5.6135608047445306e-06, + "loss": 0.3574, "step": 109500 }, { - "epoch": 3.85, - "eval_loss": 0.2506560981273651, - "eval_runtime": 10.5745, - "eval_samples_per_second": 9.457, - "eval_steps_per_second": 9.457, + "epoch": 3.9463725808195482, + "eval_loss": 0.4281553030014038, + "eval_runtime": 3.5325, + "eval_samples_per_second": 28.309, + "eval_steps_per_second": 7.077, "step": 109500 }, { - "epoch": 3.85, - "learning_rate": 6.60654682393074e-06, - "loss": 0.2786, + "epoch": 3.946552780480773, + "grad_norm": 0.23463231325149536, + "learning_rate": 5.611718428152759e-06, + "loss": 0.372, 
"step": 109505 }, { - "epoch": 3.85, - "learning_rate": 6.6046176008944555e-06, - "loss": 0.2617, + "epoch": 3.9467329801419972, + "grad_norm": 0.19474101066589355, + "learning_rate": 5.609876315722573e-06, + "loss": 0.3851, "step": 109510 }, { - "epoch": 3.85, - "learning_rate": 6.602688616710087e-06, - "loss": 0.2634, + "epoch": 3.946913179803222, + "grad_norm": 0.27828896045684814, + "learning_rate": 5.6080344674790676e-06, + "loss": 0.389, "step": 109515 }, { - "epoch": 3.85, - "learning_rate": 6.600759871402673e-06, - "loss": 0.251, + "epoch": 3.9470933794644467, + "grad_norm": 0.22116512060165405, + "learning_rate": 5.606192883447336e-06, + "loss": 0.3621, "step": 109520 }, { - "epoch": 3.85, - "learning_rate": 6.598831364997276e-06, - "loss": 0.2717, + "epoch": 3.947273579125671, + "grad_norm": 0.27116748690605164, + "learning_rate": 5.604351563652466e-06, + "loss": 0.3818, "step": 109525 }, { - "epoch": 3.85, - "learning_rate": 6.596903097518928e-06, - "loss": 0.2611, + "epoch": 3.9474537787868957, + "grad_norm": 0.2014303356409073, + "learning_rate": 5.60251050811956e-06, + "loss": 0.3935, "step": 109530 }, { - "epoch": 3.85, - "learning_rate": 6.594975068992657e-06, - "loss": 0.2771, + "epoch": 3.9476339784481205, + "grad_norm": 0.30118808150291443, + "learning_rate": 5.600669716873697e-06, + "loss": 0.4039, "step": 109535 }, { - "epoch": 3.85, - "learning_rate": 6.593047279443512e-06, - "loss": 0.2661, + "epoch": 3.947814178109345, + "grad_norm": 0.23722492158412933, + "learning_rate": 5.5988291899399385e-06, + "loss": 0.4329, "step": 109540 }, { - "epoch": 3.85, - "learning_rate": 6.5911197288965165e-06, - "loss": 0.2525, + "epoch": 3.94799437777057, + "grad_norm": 0.25418469309806824, + "learning_rate": 5.596988927343386e-06, + "loss": 0.3885, "step": 109545 }, { - "epoch": 3.85, - "learning_rate": 6.589192417376694e-06, - "loss": 0.2556, + "epoch": 3.9481745774317947, + "grad_norm": 0.21690379083156586, + "learning_rate": 5.595148929109095e-06, + "loss": 0.3706, "step": 109550 }, { - "epoch": 3.85, - "learning_rate": 6.58726534490908e-06, - "loss": 0.2301, + "epoch": 3.948354777093019, + "grad_norm": 0.2371807098388672, + "learning_rate": 5.593309195262156e-06, + "loss": 0.401, "step": 109555 }, { - "epoch": 3.85, - "learning_rate": 6.585338511518685e-06, - "loss": 0.2549, + "epoch": 3.9485349767542437, + "grad_norm": 0.2515186369419098, + "learning_rate": 5.591469725827614e-06, + "loss": 0.3795, "step": 109560 }, { - "epoch": 3.85, - "learning_rate": 6.583411917230542e-06, - "loss": 0.2957, + "epoch": 3.9487151764154684, + "grad_norm": 0.22983525693416595, + "learning_rate": 5.589630520830536e-06, + "loss": 0.4015, "step": 109565 }, { - "epoch": 3.85, - "learning_rate": 6.581485562069656e-06, - "loss": 0.227, + "epoch": 3.9488953760766927, + "grad_norm": 0.2533568739891052, + "learning_rate": 5.587791580295987e-06, + "loss": 0.3962, "step": 109570 }, { - "epoch": 3.86, - "learning_rate": 6.579559446061037e-06, - "loss": 0.2694, + "epoch": 3.9490755757379175, + "grad_norm": 0.24301256239414215, + "learning_rate": 5.58595290424902e-06, + "loss": 0.3958, "step": 109575 }, { - "epoch": 3.86, - "learning_rate": 6.577633569229705e-06, - "loss": 0.2556, + "epoch": 3.949255775399142, + "grad_norm": 0.2538679838180542, + "learning_rate": 5.5841144927146875e-06, + "loss": 0.4044, "step": 109580 }, { - "epoch": 3.86, - "learning_rate": 6.575707931600661e-06, - "loss": 0.2774, + "epoch": 3.949435975060367, + "grad_norm": 0.2607662081718445, + "learning_rate": 5.582276345718037e-06, + 
"loss": 0.3572, "step": 109585 }, { - "epoch": 3.86, - "learning_rate": 6.573782533198908e-06, - "loss": 0.2739, + "epoch": 3.9496161747215917, + "grad_norm": 0.23929943144321442, + "learning_rate": 5.5804384632841044e-06, + "loss": 0.3594, "step": 109590 }, { - "epoch": 3.86, - "learning_rate": 6.5718573740494365e-06, - "loss": 0.2586, + "epoch": 3.9497963743828164, + "grad_norm": 0.29421868920326233, + "learning_rate": 5.578600845437942e-06, + "loss": 0.3578, "step": 109595 }, { - "epoch": 3.86, - "learning_rate": 6.569932454177261e-06, - "loss": 0.2453, + "epoch": 3.9499765740440407, + "grad_norm": 0.19604270160198212, + "learning_rate": 5.576763492204587e-06, + "loss": 0.3804, "step": 109600 }, { - "epoch": 3.86, - "learning_rate": 6.568007773607357e-06, - "loss": 0.2472, + "epoch": 3.9501567737052654, + "grad_norm": 0.2703362703323364, + "learning_rate": 5.574926403609066e-06, + "loss": 0.41, "step": 109605 }, { - "epoch": 3.86, - "learning_rate": 6.566083332364734e-06, - "loss": 0.277, + "epoch": 3.95033697336649, + "grad_norm": 0.336032509803772, + "learning_rate": 5.573089579676413e-06, + "loss": 0.3946, "step": 109610 }, { - "epoch": 3.86, - "learning_rate": 6.5641591304743625e-06, - "loss": 0.2531, + "epoch": 3.9505171730277144, + "grad_norm": 0.3432498574256897, + "learning_rate": 5.5712530204316545e-06, + "loss": 0.3912, "step": 109615 }, { - "epoch": 3.86, - "learning_rate": 6.5622351679612424e-06, - "loss": 0.2533, + "epoch": 3.950697372688939, + "grad_norm": 0.24581557512283325, + "learning_rate": 5.569416725899804e-06, + "loss": 0.3591, "step": 109620 }, { - "epoch": 3.86, - "learning_rate": 6.560311444850351e-06, - "loss": 0.2605, + "epoch": 3.950877572350164, + "grad_norm": 0.2500128448009491, + "learning_rate": 5.567580696105895e-06, + "loss": 0.4141, "step": 109625 }, { - "epoch": 3.86, - "learning_rate": 6.558387961166659e-06, - "loss": 0.2622, + "epoch": 3.9510577720113886, + "grad_norm": 0.2446908950805664, + "learning_rate": 5.565744931074937e-06, + "loss": 0.3828, "step": 109630 }, { - "epoch": 3.86, - "learning_rate": 6.55646471693514e-06, - "loss": 0.2792, + "epoch": 3.9512379716726134, + "grad_norm": 0.22978517413139343, + "learning_rate": 5.563909430831943e-06, + "loss": 0.3831, "step": 109635 }, { - "epoch": 3.86, - "learning_rate": 6.554541712180778e-06, - "loss": 0.2643, + "epoch": 3.951418171333838, + "grad_norm": 0.2268865555524826, + "learning_rate": 5.56207419540192e-06, + "loss": 0.3924, "step": 109640 }, { - "epoch": 3.86, - "learning_rate": 6.5526189469285375e-06, - "loss": 0.2605, + "epoch": 3.9515983709950624, + "grad_norm": 0.22920677065849304, + "learning_rate": 5.560239224809866e-06, + "loss": 0.3564, "step": 109645 }, { - "epoch": 3.86, - "learning_rate": 6.5506964212033754e-06, - "loss": 0.2747, + "epoch": 3.951778570656287, + "grad_norm": 0.24135887622833252, + "learning_rate": 5.558404519080801e-06, + "loss": 0.3767, "step": 109650 }, { - "epoch": 3.86, - "learning_rate": 6.548774135030269e-06, - "loss": 0.2637, + "epoch": 3.951958770317512, + "grad_norm": 0.27055665850639343, + "learning_rate": 5.556570078239703e-06, + "loss": 0.4004, "step": 109655 }, { - "epoch": 3.86, - "learning_rate": 6.546852088434172e-06, - "loss": 0.2543, + "epoch": 3.9521389699787366, + "grad_norm": 0.19351933896541595, + "learning_rate": 5.554735902311567e-06, + "loss": 0.3738, "step": 109660 }, { - "epoch": 3.86, - "learning_rate": 6.544930281440028e-06, - "loss": 0.2289, + "epoch": 3.952319169639961, + "grad_norm": 0.2451026439666748, + "learning_rate": 
5.552901991321399e-06, + "loss": 0.3815, "step": 109665 }, { - "epoch": 3.86, - "learning_rate": 6.543008714072804e-06, - "loss": 0.2327, + "epoch": 3.9524993693011856, + "grad_norm": 0.2261337786912918, + "learning_rate": 5.5510683452941745e-06, + "loss": 0.3699, "step": 109670 }, { - "epoch": 3.86, - "learning_rate": 6.541087386357459e-06, - "loss": 0.2571, + "epoch": 3.9526795689624104, + "grad_norm": 0.22415819764137268, + "learning_rate": 5.549234964254879e-06, + "loss": 0.383, "step": 109675 }, { - "epoch": 3.86, - "learning_rate": 6.539166298318924e-06, - "loss": 0.2668, + "epoch": 3.952859768623635, + "grad_norm": 0.23757857084274292, + "learning_rate": 5.547401848228489e-06, + "loss": 0.3905, "step": 109680 }, { - "epoch": 3.86, - "learning_rate": 6.5372454499821515e-06, - "loss": 0.246, + "epoch": 3.95303996828486, + "grad_norm": 0.22890903055667877, + "learning_rate": 5.545568997239978e-06, + "loss": 0.3978, "step": 109685 }, { - "epoch": 3.86, - "learning_rate": 6.53532484137207e-06, - "loss": 0.2797, + "epoch": 3.953220167946084, + "grad_norm": 0.24089115858078003, + "learning_rate": 5.543736411314329e-06, + "loss": 0.3726, "step": 109690 }, { - "epoch": 3.86, - "learning_rate": 6.5334044725136375e-06, - "loss": 0.2634, + "epoch": 3.953400367607309, + "grad_norm": 0.2351144254207611, + "learning_rate": 5.541904090476505e-06, + "loss": 0.3726, "step": 109695 }, { - "epoch": 3.86, - "learning_rate": 6.5314843434317764e-06, - "loss": 0.2428, + "epoch": 3.9535805672685336, + "grad_norm": 0.20729920268058777, + "learning_rate": 5.54007203475147e-06, + "loss": 0.3672, "step": 109700 }, { - "epoch": 3.86, - "learning_rate": 6.529564454151418e-06, - "loss": 0.2481, + "epoch": 3.9537607669297583, + "grad_norm": 0.231421560049057, + "learning_rate": 5.538240244164186e-06, + "loss": 0.3376, "step": 109705 }, { - "epoch": 3.86, - "learning_rate": 6.527644804697489e-06, - "loss": 0.2572, + "epoch": 3.9539409665909826, + "grad_norm": 0.2679100036621094, + "learning_rate": 5.536408718739605e-06, + "loss": 0.3995, "step": 109710 }, { - "epoch": 3.86, - "learning_rate": 6.525725395094923e-06, - "loss": 0.254, + "epoch": 3.9541211662522073, + "grad_norm": 0.2754489481449127, + "learning_rate": 5.534577458502693e-06, + "loss": 0.3953, "step": 109715 }, { - "epoch": 3.86, - "learning_rate": 6.5238062253686314e-06, - "loss": 0.2514, + "epoch": 3.954301365913432, + "grad_norm": 0.21285389363765717, + "learning_rate": 5.532746463478403e-06, + "loss": 0.3766, "step": 109720 }, { - "epoch": 3.86, - "learning_rate": 6.5218872955435465e-06, - "loss": 0.2573, + "epoch": 3.954481565574657, + "grad_norm": 0.24366551637649536, + "learning_rate": 5.5309157336916575e-06, + "loss": 0.3781, "step": 109725 }, { - "epoch": 3.86, - "learning_rate": 6.519968605644569e-06, - "loss": 0.2663, + "epoch": 3.9546617652358815, + "grad_norm": 0.20494620501995087, + "learning_rate": 5.529085269167422e-06, + "loss": 0.3657, "step": 109730 }, { - "epoch": 3.86, - "learning_rate": 6.518050155696631e-06, - "loss": 0.2713, + "epoch": 3.9548419648971063, + "grad_norm": 0.268721342086792, + "learning_rate": 5.527255069930626e-06, + "loss": 0.3734, "step": 109735 }, { - "epoch": 3.86, - "learning_rate": 6.516131945724629e-06, - "loss": 0.279, + "epoch": 3.9550221645583306, + "grad_norm": 0.22930163145065308, + "learning_rate": 5.525425136006216e-06, + "loss": 0.368, "step": 109740 }, { - "epoch": 3.86, - "learning_rate": 6.514213975753472e-06, - "loss": 0.2773, + "epoch": 3.9552023642195553, + "grad_norm": 0.25767385959625244, + 
"learning_rate": 5.5235954674191235e-06, + "loss": 0.3905, "step": 109745 }, { - "epoch": 3.86, - "learning_rate": 6.5122962458080586e-06, - "loss": 0.2522, + "epoch": 3.95538256388078, + "grad_norm": 0.24452823400497437, + "learning_rate": 5.5217660641942585e-06, + "loss": 0.38, "step": 109750 }, { - "epoch": 3.86, - "learning_rate": 6.510378755913302e-06, - "loss": 0.2602, + "epoch": 3.9555627635420043, + "grad_norm": 0.2989307940006256, + "learning_rate": 5.519936926356567e-06, + "loss": 0.3936, "step": 109755 }, { - "epoch": 3.86, - "learning_rate": 6.5084615060940905e-06, - "loss": 0.2345, + "epoch": 3.955742963203229, + "grad_norm": 0.23451735079288483, + "learning_rate": 5.518108053930962e-06, + "loss": 0.3972, "step": 109760 }, { - "epoch": 3.86, - "learning_rate": 6.5065444963753134e-06, - "loss": 0.2481, + "epoch": 3.955923162864454, + "grad_norm": 0.27269425988197327, + "learning_rate": 5.516279446942366e-06, + "loss": 0.379, "step": 109765 }, { - "epoch": 3.86, - "learning_rate": 6.504627726781875e-06, - "loss": 0.2813, + "epoch": 3.9561033625256785, + "grad_norm": 0.2326863706111908, + "learning_rate": 5.5144511054156885e-06, + "loss": 0.3866, "step": 109770 }, { - "epoch": 3.86, - "learning_rate": 6.502711197338651e-06, - "loss": 0.2379, + "epoch": 3.9562835621869032, + "grad_norm": 0.2507339119911194, + "learning_rate": 5.512623029375835e-06, + "loss": 0.3986, "step": 109775 }, { - "epoch": 3.86, - "learning_rate": 6.500794908070537e-06, - "loss": 0.247, + "epoch": 3.956463761848128, + "grad_norm": 0.23485083878040314, + "learning_rate": 5.510795218847725e-06, + "loss": 0.4092, "step": 109780 }, { - "epoch": 3.86, - "learning_rate": 6.498878859002405e-06, - "loss": 0.2438, + "epoch": 3.9566439615093523, + "grad_norm": 0.22537368535995483, + "learning_rate": 5.5089676738562606e-06, + "loss": 0.3579, "step": 109785 }, { - "epoch": 3.86, - "learning_rate": 6.496963050159144e-06, - "loss": 0.2464, + "epoch": 3.956824161170577, + "grad_norm": 0.2544284760951996, + "learning_rate": 5.507140394426335e-06, + "loss": 0.3826, "step": 109790 }, { - "epoch": 3.86, - "learning_rate": 6.495047481565625e-06, - "loss": 0.253, + "epoch": 3.9570043608318017, + "grad_norm": 0.2154984176158905, + "learning_rate": 5.50531338058285e-06, + "loss": 0.3775, "step": 109795 }, { - "epoch": 3.86, - "learning_rate": 6.493132153246717e-06, - "loss": 0.2706, + "epoch": 3.957184560493026, + "grad_norm": 0.29244160652160645, + "learning_rate": 5.503486632350696e-06, + "loss": 0.3983, "step": 109800 }, { - "epoch": 3.86, - "learning_rate": 6.491217065227287e-06, - "loss": 0.2471, + "epoch": 3.9573647601542508, + "grad_norm": 0.21299979090690613, + "learning_rate": 5.501660149754753e-06, + "loss": 0.36, "step": 109805 }, { - "epoch": 3.86, - "learning_rate": 6.489302217532212e-06, - "loss": 0.25, + "epoch": 3.9575449598154755, + "grad_norm": 0.21175287663936615, + "learning_rate": 5.4998339328199255e-06, + "loss": 0.3463, "step": 109810 }, { - "epoch": 3.86, - "learning_rate": 6.487387610186346e-06, - "loss": 0.2396, + "epoch": 3.9577251594767002, + "grad_norm": 0.19887837767601013, + "learning_rate": 5.498007981571082e-06, + "loss": 0.3646, "step": 109815 }, { - "epoch": 3.86, - "learning_rate": 6.485473243214554e-06, - "loss": 0.2484, + "epoch": 3.957905359137925, + "grad_norm": 0.22215895354747772, + "learning_rate": 5.4961822960331064e-06, + "loss": 0.3875, "step": 109820 }, { - "epoch": 3.86, - "learning_rate": 6.483559116641683e-06, - "loss": 0.2551, + "epoch": 3.9580855587991497, + "grad_norm": 
0.2197951376438141, + "learning_rate": 5.494356876230869e-06, + "loss": 0.3831, "step": 109825 }, { - "epoch": 3.86, - "learning_rate": 6.481645230492594e-06, - "loss": 0.272, + "epoch": 3.958265758460374, + "grad_norm": 0.27231666445732117, + "learning_rate": 5.492531722189237e-06, + "loss": 0.3833, "step": 109830 }, { - "epoch": 3.86, - "learning_rate": 6.479731584792145e-06, - "loss": 0.2526, + "epoch": 3.9584459581215987, + "grad_norm": 0.22005848586559296, + "learning_rate": 5.490706833933096e-06, + "loss": 0.3738, "step": 109835 }, { - "epoch": 3.86, - "learning_rate": 6.477818179565176e-06, - "loss": 0.2607, + "epoch": 3.9586261577828235, + "grad_norm": 0.19976355135440826, + "learning_rate": 5.488882211487292e-06, + "loss": 0.4119, "step": 109840 }, { - "epoch": 3.86, - "learning_rate": 6.475905014836525e-06, - "loss": 0.2703, + "epoch": 3.9588063574440477, + "grad_norm": 0.24296000599861145, + "learning_rate": 5.487057854876682e-06, + "loss": 0.3737, "step": 109845 }, { - "epoch": 3.86, - "learning_rate": 6.4739920906310456e-06, - "loss": 0.2606, + "epoch": 3.9589865571052725, + "grad_norm": 0.32272353768348694, + "learning_rate": 5.485233764126138e-06, + "loss": 0.421, "step": 109850 }, { - "epoch": 3.87, - "learning_rate": 6.472079406973569e-06, - "loss": 0.2238, + "epoch": 3.959166756766497, + "grad_norm": 0.23841454088687897, + "learning_rate": 5.4834099392605e-06, + "loss": 0.3983, "step": 109855 }, { - "epoch": 3.87, - "learning_rate": 6.470166963888932e-06, - "loss": 0.2683, + "epoch": 3.959346956427722, + "grad_norm": 0.2716634273529053, + "learning_rate": 5.481586380304637e-06, + "loss": 0.3676, "step": 109860 }, { - "epoch": 3.87, - "learning_rate": 6.4682547614019596e-06, - "loss": 0.2836, + "epoch": 3.9595271560889467, + "grad_norm": 0.2504705488681793, + "learning_rate": 5.479763087283374e-06, + "loss": 0.3868, "step": 109865 }, { - "epoch": 3.87, - "learning_rate": 6.466342799537492e-06, - "loss": 0.2543, + "epoch": 3.9597073557501714, + "grad_norm": 0.2552550137042999, + "learning_rate": 5.4779400602215505e-06, + "loss": 0.3778, "step": 109870 }, { - "epoch": 3.87, - "learning_rate": 6.4644310783203485e-06, - "loss": 0.2591, + "epoch": 3.9598875554113957, + "grad_norm": 0.24792692065238953, + "learning_rate": 5.476117299144024e-06, + "loss": 0.3571, "step": 109875 }, { - "epoch": 3.87, - "learning_rate": 6.462519597775346e-06, - "loss": 0.2443, + "epoch": 3.9600677550726204, + "grad_norm": 0.2851993441581726, + "learning_rate": 5.474294804075617e-06, + "loss": 0.3578, "step": 109880 }, { - "epoch": 3.87, - "learning_rate": 6.460608357927309e-06, - "loss": 0.2622, + "epoch": 3.960247954733845, + "grad_norm": 0.28467345237731934, + "learning_rate": 5.472472575041166e-06, + "loss": 0.3829, "step": 109885 }, { - "epoch": 3.87, - "learning_rate": 6.458697358801061e-06, - "loss": 0.2791, + "epoch": 3.96042815439507, + "grad_norm": 0.26601019501686096, + "learning_rate": 5.470650612065492e-06, + "loss": 0.3478, "step": 109890 }, { - "epoch": 3.87, - "learning_rate": 6.456786600421411e-06, - "loss": 0.2643, + "epoch": 3.960608354056294, + "grad_norm": 0.24324598908424377, + "learning_rate": 5.468828915173416e-06, + "loss": 0.3622, "step": 109895 }, { - "epoch": 3.87, - "learning_rate": 6.454876082813158e-06, - "loss": 0.2539, + "epoch": 3.960788553717519, + "grad_norm": 0.21530494093894958, + "learning_rate": 5.4670074843897715e-06, + "loss": 0.3667, "step": 109900 }, { - "epoch": 3.87, - "learning_rate": 6.452965806001124e-06, - "loss": 0.2467, + "epoch": 
3.9609687533787437, + "grad_norm": 0.2549532651901245, + "learning_rate": 5.465186319739371e-06, + "loss": 0.4011, "step": 109905 }, { - "epoch": 3.87, - "learning_rate": 6.451055770010106e-06, - "loss": 0.277, + "epoch": 3.9611489530399684, + "grad_norm": 0.26102617383003235, + "learning_rate": 5.463365421247024e-06, + "loss": 0.3918, "step": 109910 }, { - "epoch": 3.87, - "learning_rate": 6.449145974864906e-06, - "loss": 0.2755, + "epoch": 3.961329152701193, + "grad_norm": 0.24327853322029114, + "learning_rate": 5.46154478893754e-06, + "loss": 0.3286, "step": 109915 }, { - "epoch": 3.87, - "learning_rate": 6.447236420590311e-06, - "loss": 0.2534, + "epoch": 3.9615093523624174, + "grad_norm": 0.26538389921188354, + "learning_rate": 5.459724422835716e-06, + "loss": 0.3881, "step": 109920 }, { - "epoch": 3.87, - "learning_rate": 6.445327107211132e-06, - "loss": 0.2553, + "epoch": 3.961689552023642, + "grad_norm": 0.18152877688407898, + "learning_rate": 5.457904322966373e-06, + "loss": 0.4096, "step": 109925 }, { - "epoch": 3.87, - "learning_rate": 6.443418034752155e-06, - "loss": 0.277, + "epoch": 3.961869751684867, + "grad_norm": 0.2658418118953705, + "learning_rate": 5.456084489354307e-06, + "loss": 0.3956, "step": 109930 }, { - "epoch": 3.87, - "learning_rate": 6.441509203238155e-06, - "loss": 0.2739, + "epoch": 3.9620499513460916, + "grad_norm": 0.23971152305603027, + "learning_rate": 5.454264922024294e-06, + "loss": 0.4017, "step": 109935 }, { - "epoch": 3.87, - "learning_rate": 6.439600612693935e-06, - "loss": 0.2784, + "epoch": 3.962230151007316, + "grad_norm": 0.2468896061182022, + "learning_rate": 5.4524456210011425e-06, + "loss": 0.3612, "step": 109940 }, { - "epoch": 3.87, - "learning_rate": 6.437692263144266e-06, - "loss": 0.2548, + "epoch": 3.9624103506685406, + "grad_norm": 0.29942211508750916, + "learning_rate": 5.450626586309634e-06, + "loss": 0.3639, "step": 109945 }, { - "epoch": 3.87, - "learning_rate": 6.435784154613933e-06, - "loss": 0.2584, + "epoch": 3.9625905503297654, + "grad_norm": 0.22130388021469116, + "learning_rate": 5.448807817974555e-06, + "loss": 0.3656, "step": 109950 }, { - "epoch": 3.87, - "learning_rate": 6.43387628712771e-06, - "loss": 0.2587, + "epoch": 3.96277074999099, + "grad_norm": 0.24621397256851196, + "learning_rate": 5.4469893160206815e-06, + "loss": 0.3605, "step": 109955 }, { - "epoch": 3.87, - "learning_rate": 6.431968660710361e-06, - "loss": 0.2411, + "epoch": 3.962950949652215, + "grad_norm": 0.2520377039909363, + "learning_rate": 5.4451710804727885e-06, + "loss": 0.3968, "step": 109960 }, { - "epoch": 3.87, - "learning_rate": 6.430061275386667e-06, - "loss": 0.2401, + "epoch": 3.963131149313439, + "grad_norm": 0.20365937054157257, + "learning_rate": 5.443353111355659e-06, + "loss": 0.3776, "step": 109965 }, { - "epoch": 3.87, - "learning_rate": 6.428154131181393e-06, - "loss": 0.2648, + "epoch": 3.963311348974664, + "grad_norm": 0.2284347116947174, + "learning_rate": 5.44153540869406e-06, + "loss": 0.3964, "step": 109970 }, { - "epoch": 3.87, - "learning_rate": 6.426247228119297e-06, - "loss": 0.252, + "epoch": 3.9634915486358886, + "grad_norm": 0.2807764708995819, + "learning_rate": 5.439717972512751e-06, + "loss": 0.3695, "step": 109975 }, { - "epoch": 3.87, - "learning_rate": 6.424340566225132e-06, - "loss": 0.2508, + "epoch": 3.9636717482971133, + "grad_norm": 0.2894369959831238, + "learning_rate": 5.437900802836499e-06, + "loss": 0.377, "step": 109980 }, { - "epoch": 3.87, - "learning_rate": 6.422434145523671e-06, - "loss": 0.2496, + 
"epoch": 3.9638519479583376, + "grad_norm": 0.2694891095161438, + "learning_rate": 5.436083899690061e-06, + "loss": 0.3832, "step": 109985 }, { - "epoch": 3.87, - "learning_rate": 6.420527966039652e-06, - "loss": 0.2349, + "epoch": 3.9640321476195624, + "grad_norm": 0.2883671224117279, + "learning_rate": 5.4342672630981865e-06, + "loss": 0.3855, "step": 109990 }, { - "epoch": 3.87, - "learning_rate": 6.418622027797838e-06, - "loss": 0.2563, + "epoch": 3.964212347280787, + "grad_norm": 0.2609039545059204, + "learning_rate": 5.432450893085639e-06, + "loss": 0.3817, "step": 109995 }, { - "epoch": 3.87, - "learning_rate": 6.416716330822967e-06, - "loss": 0.2675, + "epoch": 3.964392546942012, + "grad_norm": 0.20613409578800201, + "learning_rate": 5.430634789677158e-06, + "loss": 0.3783, "step": 110000 }, { - "epoch": 3.87, - "eval_loss": 0.2505649924278259, - "eval_runtime": 10.5578, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 3.964392546942012, + "eval_loss": 0.427830308675766, + "eval_runtime": 3.5225, + "eval_samples_per_second": 28.389, + "eval_steps_per_second": 7.097, "step": 110000 }, { - "epoch": 3.87, - "learning_rate": 6.414810875139796e-06, - "loss": 0.2671, + "epoch": 3.9645727466032366, + "grad_norm": 0.23606936633586884, + "learning_rate": 5.428818952897491e-06, + "loss": 0.3744, "step": 110005 }, { - "epoch": 3.87, - "learning_rate": 6.4129056607730555e-06, - "loss": 0.2459, + "epoch": 3.9647529462644613, + "grad_norm": 0.2058301568031311, + "learning_rate": 5.427003382771376e-06, + "loss": 0.3679, "step": 110010 }, { - "epoch": 3.87, - "learning_rate": 6.411000687747478e-06, - "loss": 0.2513, + "epoch": 3.9649331459256856, + "grad_norm": 0.2392200529575348, + "learning_rate": 5.425188079323543e-06, + "loss": 0.3732, "step": 110015 }, { - "epoch": 3.87, - "learning_rate": 6.409095956087813e-06, - "loss": 0.2715, + "epoch": 3.9651133455869103, + "grad_norm": 0.22312918305397034, + "learning_rate": 5.423373042578742e-06, + "loss": 0.3765, "step": 110020 }, { - "epoch": 3.87, - "learning_rate": 6.407191465818788e-06, - "loss": 0.2547, + "epoch": 3.965293545248135, + "grad_norm": 0.2611244320869446, + "learning_rate": 5.421558272561697e-06, + "loss": 0.3729, "step": 110025 }, { - "epoch": 3.87, - "learning_rate": 6.405287216965125e-06, - "loss": 0.2419, + "epoch": 3.9654737449093593, + "grad_norm": 0.22937054932117462, + "learning_rate": 5.419743769297117e-06, + "loss": 0.3881, "step": 110030 }, { - "epoch": 3.87, - "learning_rate": 6.403383209551547e-06, - "loss": 0.2474, + "epoch": 3.965653944570584, + "grad_norm": 0.2127593755722046, + "learning_rate": 5.417929532809746e-06, + "loss": 0.3301, "step": 110035 }, { - "epoch": 3.87, - "learning_rate": 6.401479443602792e-06, - "loss": 0.2438, + "epoch": 3.965834144231809, + "grad_norm": 0.25653398036956787, + "learning_rate": 5.416115563124283e-06, + "loss": 0.3757, "step": 110040 }, { - "epoch": 3.87, - "learning_rate": 6.39957591914356e-06, - "loss": 0.2548, + "epoch": 3.9660143438930335, + "grad_norm": 0.23030655086040497, + "learning_rate": 5.414301860265472e-06, + "loss": 0.3658, "step": 110045 }, { - "epoch": 3.87, - "learning_rate": 6.397672636198584e-06, - "loss": 0.2529, + "epoch": 3.9661945435542583, + "grad_norm": 0.28496742248535156, + "learning_rate": 5.412488424257997e-06, + "loss": 0.3403, "step": 110050 }, { - "epoch": 3.87, - "learning_rate": 6.395769594792564e-06, - "loss": 0.2649, + "epoch": 3.966374743215483, + "grad_norm": 0.21361872553825378, + "learning_rate": 
5.410675255126568e-06, + "loss": 0.3534, "step": 110055 }, { - "epoch": 3.87, - "learning_rate": 6.3938667949502205e-06, - "loss": 0.2417, + "epoch": 3.9665549428767073, + "grad_norm": 0.22324657440185547, + "learning_rate": 5.408862352895905e-06, + "loss": 0.3898, "step": 110060 }, { - "epoch": 3.87, - "learning_rate": 6.3919642366962545e-06, - "loss": 0.2702, + "epoch": 3.966735142537932, + "grad_norm": 0.2588847875595093, + "learning_rate": 5.407049717590698e-06, + "loss": 0.3481, "step": 110065 }, { - "epoch": 3.87, - "learning_rate": 6.3900619200553705e-06, - "loss": 0.2615, + "epoch": 3.9669153421991568, + "grad_norm": 0.23955267667770386, + "learning_rate": 5.405237349235645e-06, + "loss": 0.3918, "step": 110070 }, { - "epoch": 3.87, - "learning_rate": 6.38815984505226e-06, - "loss": 0.2561, + "epoch": 3.967095541860381, + "grad_norm": 0.2915831208229065, + "learning_rate": 5.40342524785544e-06, + "loss": 0.3471, "step": 110075 }, { - "epoch": 3.87, - "learning_rate": 6.386258011711638e-06, - "loss": 0.2494, + "epoch": 3.967275741521606, + "grad_norm": 0.24253830313682556, + "learning_rate": 5.401613413474762e-06, + "loss": 0.371, "step": 110080 }, { - "epoch": 3.87, - "learning_rate": 6.384356420058188e-06, - "loss": 0.2514, + "epoch": 3.9674559411828305, + "grad_norm": 0.25190943479537964, + "learning_rate": 5.399801846118318e-06, + "loss": 0.3664, "step": 110085 }, { - "epoch": 3.87, - "learning_rate": 6.382455070116599e-06, - "loss": 0.2737, + "epoch": 3.9676361408440552, + "grad_norm": 0.2514355480670929, + "learning_rate": 5.397990545810777e-06, + "loss": 0.3753, "step": 110090 }, { - "epoch": 3.87, - "learning_rate": 6.380553961911556e-06, - "loss": 0.2449, + "epoch": 3.96781634050528, + "grad_norm": 0.2575605809688568, + "learning_rate": 5.396179512576821e-06, + "loss": 0.3553, "step": 110095 }, { - "epoch": 3.87, - "learning_rate": 6.378653095467757e-06, - "loss": 0.2506, + "epoch": 3.9679965401665047, + "grad_norm": 0.24077442288398743, + "learning_rate": 5.394368746441125e-06, + "loss": 0.3611, "step": 110100 }, { - "epoch": 3.87, - "learning_rate": 6.376752470809869e-06, - "loss": 0.2576, + "epoch": 3.968176739827729, + "grad_norm": 0.2745268940925598, + "learning_rate": 5.39255824742835e-06, + "loss": 0.3721, "step": 110105 }, { - "epoch": 3.87, - "learning_rate": 6.374852087962574e-06, - "loss": 0.243, + "epoch": 3.9683569394889537, + "grad_norm": 0.22541183233261108, + "learning_rate": 5.39074801556318e-06, + "loss": 0.373, "step": 110110 }, { - "epoch": 3.87, - "learning_rate": 6.372951946950559e-06, - "loss": 0.2739, + "epoch": 3.9685371391501785, + "grad_norm": 0.21690218150615692, + "learning_rate": 5.388938050870279e-06, + "loss": 0.3838, "step": 110115 }, { - "epoch": 3.87, - "learning_rate": 6.37105204779849e-06, - "loss": 0.2525, + "epoch": 3.9687173388114028, + "grad_norm": 0.26479387283325195, + "learning_rate": 5.387128353374285e-06, + "loss": 0.3713, "step": 110120 }, { - "epoch": 3.87, - "learning_rate": 6.369152390531031e-06, - "loss": 0.2615, + "epoch": 3.9688975384726275, + "grad_norm": 0.2510485351085663, + "learning_rate": 5.385318923099877e-06, + "loss": 0.3695, "step": 110125 }, { - "epoch": 3.87, - "learning_rate": 6.367252975172841e-06, - "loss": 0.2457, + "epoch": 3.9690777381338522, + "grad_norm": 0.23300889134407043, + "learning_rate": 5.383509760071703e-06, + "loss": 0.3741, "step": 110130 }, { - "epoch": 3.87, - "learning_rate": 6.3653538017486e-06, - "loss": 0.2682, + "epoch": 3.969257937795077, + "grad_norm": 0.21996130049228668, + 
"learning_rate": 5.3817008643144096e-06, + "loss": 0.3319, "step": 110135 }, { - "epoch": 3.88, - "learning_rate": 6.363454870282959e-06, - "loss": 0.266, + "epoch": 3.9694381374563017, + "grad_norm": 0.24466222524642944, + "learning_rate": 5.379892235852646e-06, + "loss": 0.3745, "step": 110140 }, { - "epoch": 3.88, - "learning_rate": 6.361556180800577e-06, - "loss": 0.2501, + "epoch": 3.9696183371175264, + "grad_norm": 0.2463037371635437, + "learning_rate": 5.378083874711043e-06, + "loss": 0.3904, "step": 110145 }, { - "epoch": 3.88, - "learning_rate": 6.3596577333260935e-06, - "loss": 0.261, + "epoch": 3.9697985367787507, + "grad_norm": 0.23316043615341187, + "learning_rate": 5.3762757809142555e-06, + "loss": 0.3767, "step": 110150 }, { - "epoch": 3.88, - "learning_rate": 6.35775952788418e-06, - "loss": 0.2517, + "epoch": 3.9699787364399755, + "grad_norm": 0.24817629158496857, + "learning_rate": 5.374467954486911e-06, + "loss": 0.3823, "step": 110155 }, { - "epoch": 3.88, - "learning_rate": 6.355861564499466e-06, - "loss": 0.283, + "epoch": 3.9701589361012, + "grad_norm": 0.26238492131233215, + "learning_rate": 5.372660395453641e-06, + "loss": 0.4208, "step": 110160 }, { - "epoch": 3.88, - "learning_rate": 6.353963843196609e-06, - "loss": 0.2687, + "epoch": 3.970339135762425, + "grad_norm": 0.2136494368314743, + "learning_rate": 5.370853103839071e-06, + "loss": 0.3898, "step": 110165 }, { - "epoch": 3.88, - "learning_rate": 6.352066364000236e-06, - "loss": 0.2655, + "epoch": 3.970519335423649, + "grad_norm": 0.28680065274238586, + "learning_rate": 5.36904607966783e-06, + "loss": 0.4224, "step": 110170 }, { - "epoch": 3.88, - "learning_rate": 6.350169126935001e-06, - "loss": 0.2467, + "epoch": 3.970699535084874, + "grad_norm": 0.35283222794532776, + "learning_rate": 5.367239322964529e-06, + "loss": 0.3185, "step": 110175 }, { - "epoch": 3.88, - "learning_rate": 6.348272132025526e-06, - "loss": 0.2478, + "epoch": 3.9708797347460987, + "grad_norm": 0.2509290277957916, + "learning_rate": 5.365432833753797e-06, + "loss": 0.3933, "step": 110180 }, { - "epoch": 3.88, - "learning_rate": 6.3463753792964485e-06, - "loss": 0.2508, + "epoch": 3.9710599344073234, + "grad_norm": 0.2476135492324829, + "learning_rate": 5.363626612060241e-06, + "loss": 0.3734, "step": 110185 }, { - "epoch": 3.88, - "learning_rate": 6.344478868772386e-06, - "loss": 0.2584, + "epoch": 3.971240134068548, + "grad_norm": 0.2526455223560333, + "learning_rate": 5.36182065790847e-06, + "loss": 0.3917, "step": 110190 }, { - "epoch": 3.88, - "learning_rate": 6.342582600477975e-06, - "loss": 0.2522, + "epoch": 3.9714203337297724, + "grad_norm": 0.2520613670349121, + "learning_rate": 5.360014971323094e-06, + "loss": 0.413, "step": 110195 }, { - "epoch": 3.88, - "learning_rate": 6.340686574437835e-06, - "loss": 0.2353, + "epoch": 3.971600533390997, + "grad_norm": 0.24791032075881958, + "learning_rate": 5.358209552328699e-06, + "loss": 0.3903, "step": 110200 }, { - "epoch": 3.88, - "learning_rate": 6.338790790676585e-06, - "loss": 0.2505, + "epoch": 3.971780733052222, + "grad_norm": 0.22999384999275208, + "learning_rate": 5.356404400949908e-06, + "loss": 0.3685, "step": 110205 }, { - "epoch": 3.88, - "learning_rate": 6.336895249218827e-06, - "loss": 0.2479, + "epoch": 3.9719609327134466, + "grad_norm": 0.25010523200035095, + "learning_rate": 5.354599517211309e-06, + "loss": 0.3803, "step": 110210 }, { - "epoch": 3.88, - "learning_rate": 6.334999950089188e-06, - "loss": 0.2476, + "epoch": 3.972141132374671, + "grad_norm": 
0.2954365313053131, + "learning_rate": 5.3527949011374724e-06, + "loss": 0.4006, "step": 110215 }, { - "epoch": 3.88, - "learning_rate": 6.33310489331228e-06, - "loss": 0.2702, + "epoch": 3.9723213320358957, + "grad_norm": 0.23827119171619415, + "learning_rate": 5.350990552753013e-06, + "loss": 0.347, "step": 110220 }, { - "epoch": 3.88, - "learning_rate": 6.3312100789126975e-06, - "loss": 0.2398, + "epoch": 3.9725015316971204, + "grad_norm": 0.23179586231708527, + "learning_rate": 5.349186472082493e-06, + "loss": 0.3856, "step": 110225 }, { - "epoch": 3.88, - "learning_rate": 6.329315506915054e-06, - "loss": 0.2507, + "epoch": 3.972681731358345, + "grad_norm": 0.21170353889465332, + "learning_rate": 5.347382659150516e-06, + "loss": 0.3922, "step": 110230 }, { - "epoch": 3.88, - "learning_rate": 6.327421177343945e-06, - "loss": 0.2615, + "epoch": 3.97286193101957, + "grad_norm": 0.2426007241010666, + "learning_rate": 5.345579113981641e-06, + "loss": 0.3793, "step": 110235 }, { - "epoch": 3.88, - "learning_rate": 6.325527090223965e-06, - "loss": 0.2568, + "epoch": 3.9730421306807946, + "grad_norm": 0.28864866495132446, + "learning_rate": 5.343775836600437e-06, + "loss": 0.3969, "step": 110240 }, { - "epoch": 3.88, - "learning_rate": 6.323633245579702e-06, - "loss": 0.2687, + "epoch": 3.973222330342019, + "grad_norm": 0.27137455344200134, + "learning_rate": 5.341972827031488e-06, + "loss": 0.4164, "step": 110245 }, { - "epoch": 3.88, - "learning_rate": 6.321739643435762e-06, - "loss": 0.2595, + "epoch": 3.9734025300032436, + "grad_norm": 0.286211758852005, + "learning_rate": 5.340170085299353e-06, + "loss": 0.393, "step": 110250 }, { - "epoch": 3.88, - "learning_rate": 6.319846283816721e-06, - "loss": 0.2578, + "epoch": 3.9735827296644683, + "grad_norm": 0.21608895063400269, + "learning_rate": 5.338367611428596e-06, + "loss": 0.3818, "step": 110255 }, { - "epoch": 3.88, - "learning_rate": 6.317953166747168e-06, - "loss": 0.2479, + "epoch": 3.9737629293256926, + "grad_norm": 0.24834184348583221, + "learning_rate": 5.336565405443772e-06, + "loss": 0.3731, "step": 110260 }, { - "epoch": 3.88, - "learning_rate": 6.316060292251675e-06, - "loss": 0.2583, + "epoch": 3.9739431289869174, + "grad_norm": 0.24085558950901031, + "learning_rate": 5.334763467369433e-06, + "loss": 0.3548, "step": 110265 }, { - "epoch": 3.88, - "learning_rate": 6.3141676603548226e-06, - "loss": 0.2528, + "epoch": 3.974123328648142, + "grad_norm": 0.28163549304008484, + "learning_rate": 5.332961797230138e-06, + "loss": 0.3742, "step": 110270 }, { - "epoch": 3.88, - "learning_rate": 6.312275271081197e-06, - "loss": 0.2585, + "epoch": 3.974303528309367, + "grad_norm": 0.26481056213378906, + "learning_rate": 5.331160395050433e-06, + "loss": 0.4114, "step": 110275 }, { - "epoch": 3.88, - "learning_rate": 6.31038312445536e-06, - "loss": 0.2335, + "epoch": 3.9744837279705916, + "grad_norm": 0.252930223941803, + "learning_rate": 5.3293592608548606e-06, + "loss": 0.3308, "step": 110280 }, { - "epoch": 3.88, - "learning_rate": 6.308491220501874e-06, - "loss": 0.2517, + "epoch": 3.9746639276318163, + "grad_norm": 0.2916033864021301, + "learning_rate": 5.327558394667958e-06, + "loss": 0.3712, "step": 110285 }, { - "epoch": 3.88, - "learning_rate": 6.306599559245322e-06, - "loss": 0.2758, + "epoch": 3.9748441272930406, + "grad_norm": 0.27334362268447876, + "learning_rate": 5.325757796514258e-06, + "loss": 0.3883, "step": 110290 }, { - "epoch": 3.88, - "learning_rate": 6.304708140710253e-06, - "loss": 0.2717, + "epoch": 
3.9750243269542653, + "grad_norm": 0.2750318944454193, + "learning_rate": 5.323957466418303e-06, + "loss": 0.3864, "step": 110295 }, { - "epoch": 3.88, - "learning_rate": 6.302816964921226e-06, - "loss": 0.2278, + "epoch": 3.97520452661549, + "grad_norm": 0.2912689447402954, + "learning_rate": 5.322157404404629e-06, + "loss": 0.3845, "step": 110300 }, { - "epoch": 3.88, - "learning_rate": 6.300926031902796e-06, - "loss": 0.2578, + "epoch": 3.9753847262767144, + "grad_norm": 0.26538118720054626, + "learning_rate": 5.320357610497734e-06, + "loss": 0.3879, "step": 110305 }, { - "epoch": 3.88, - "learning_rate": 6.299035341679524e-06, - "loss": 0.2574, + "epoch": 3.975564925937939, + "grad_norm": 0.20100893080234528, + "learning_rate": 5.3185580847221635e-06, + "loss": 0.4083, "step": 110310 }, { - "epoch": 3.88, - "learning_rate": 6.297144894275952e-06, - "loss": 0.2487, + "epoch": 3.975745125599164, + "grad_norm": 0.2257593870162964, + "learning_rate": 5.31675882710243e-06, + "loss": 0.3935, "step": 110315 }, { - "epoch": 3.88, - "learning_rate": 6.29525468971662e-06, - "loss": 0.2582, + "epoch": 3.9759253252603886, + "grad_norm": 0.21260130405426025, + "learning_rate": 5.314959837663039e-06, + "loss": 0.3592, "step": 110320 }, { - "epoch": 3.88, - "learning_rate": 6.293364728026082e-06, - "loss": 0.2558, + "epoch": 3.9761055249216133, + "grad_norm": 0.2437485307455063, + "learning_rate": 5.313161116428522e-06, + "loss": 0.3784, "step": 110325 }, { - "epoch": 3.88, - "learning_rate": 6.291475009228881e-06, - "loss": 0.2433, + "epoch": 3.976285724582838, + "grad_norm": 0.25405043363571167, + "learning_rate": 5.311362663423361e-06, + "loss": 0.3999, "step": 110330 }, { - "epoch": 3.88, - "learning_rate": 6.289585533349548e-06, - "loss": 0.2345, + "epoch": 3.9764659242440623, + "grad_norm": 0.20990101993083954, + "learning_rate": 5.30956447867208e-06, + "loss": 0.351, "step": 110335 }, { - "epoch": 3.88, - "learning_rate": 6.287696300412607e-06, - "loss": 0.2536, + "epoch": 3.976646123905287, + "grad_norm": 0.22888407111167908, + "learning_rate": 5.307766562199168e-06, + "loss": 0.4054, "step": 110340 }, { - "epoch": 3.88, - "learning_rate": 6.2858073104426066e-06, - "loss": 0.2563, + "epoch": 3.976826323566512, + "grad_norm": 0.19908618927001953, + "learning_rate": 5.305968914029125e-06, + "loss": 0.3972, "step": 110345 }, { - "epoch": 3.88, - "learning_rate": 6.283918563464067e-06, - "loss": 0.2594, + "epoch": 3.977006523227736, + "grad_norm": 0.26034486293792725, + "learning_rate": 5.3041715341864415e-06, + "loss": 0.345, "step": 110350 }, { - "epoch": 3.88, - "learning_rate": 6.282030059501506e-06, - "loss": 0.2484, + "epoch": 3.977186722888961, + "grad_norm": 0.2160859853029251, + "learning_rate": 5.3023744226956106e-06, + "loss": 0.3957, "step": 110355 }, { - "epoch": 3.88, - "learning_rate": 6.280141798579445e-06, - "loss": 0.2491, + "epoch": 3.9773669225501855, + "grad_norm": 0.23834609985351562, + "learning_rate": 5.300577579581107e-06, + "loss": 0.387, "step": 110360 }, { - "epoch": 3.88, - "learning_rate": 6.2782537807224125e-06, - "loss": 0.2373, + "epoch": 3.9775471222114103, + "grad_norm": 0.2771296203136444, + "learning_rate": 5.298781004867425e-06, + "loss": 0.383, "step": 110365 }, { - "epoch": 3.88, - "learning_rate": 6.276366005954917e-06, - "loss": 0.2352, + "epoch": 3.977727321872635, + "grad_norm": 0.2532643973827362, + "learning_rate": 5.296984698579038e-06, + "loss": 0.3822, "step": 110370 }, { - "epoch": 3.88, - "learning_rate": 6.274478474301463e-06, - "loss": 0.2756, 
+ "epoch": 3.9779075215338597, + "grad_norm": 0.24019809067249298, + "learning_rate": 5.295188660740422e-06, + "loss": 0.3729, "step": 110375 }, { - "epoch": 3.88, - "learning_rate": 6.27259118578657e-06, - "loss": 0.2509, + "epoch": 3.978087721195084, + "grad_norm": 0.20523947477340698, + "learning_rate": 5.2933928913760426e-06, + "loss": 0.3622, "step": 110380 }, { - "epoch": 3.88, - "learning_rate": 6.270704140434733e-06, - "loss": 0.2575, + "epoch": 3.9782679208563088, + "grad_norm": 0.21291251480579376, + "learning_rate": 5.291597390510364e-06, + "loss": 0.3355, "step": 110385 }, { - "epoch": 3.88, - "learning_rate": 6.268817338270466e-06, - "loss": 0.2689, + "epoch": 3.9784481205175335, + "grad_norm": 0.18715035915374756, + "learning_rate": 5.289802158167862e-06, + "loss": 0.3336, "step": 110390 }, { - "epoch": 3.88, - "learning_rate": 6.266930779318262e-06, - "loss": 0.2609, + "epoch": 3.9786283201787582, + "grad_norm": 0.24453963339328766, + "learning_rate": 5.288007194372996e-06, + "loss": 0.4144, "step": 110395 }, { - "epoch": 3.88, - "learning_rate": 6.26504446360261e-06, - "loss": 0.2579, + "epoch": 3.9788085198399825, + "grad_norm": 0.24042107164859772, + "learning_rate": 5.2862124991502035e-06, + "loss": 0.3996, "step": 110400 }, { - "epoch": 3.88, - "learning_rate": 6.2631583911480164e-06, - "loss": 0.2455, + "epoch": 3.9789887195012072, + "grad_norm": 0.2552798390388489, + "learning_rate": 5.284418072523953e-06, + "loss": 0.3834, "step": 110405 }, { - "epoch": 3.88, - "learning_rate": 6.261272561978962e-06, - "loss": 0.2613, + "epoch": 3.979168919162432, + "grad_norm": 0.273781418800354, + "learning_rate": 5.282623914518686e-06, + "loss": 0.3961, "step": 110410 }, { - "epoch": 3.88, - "learning_rate": 6.259386976119932e-06, - "loss": 0.2372, + "epoch": 3.9793491188236567, + "grad_norm": 0.2784363329410553, + "learning_rate": 5.280830025158861e-06, + "loss": 0.3527, "step": 110415 }, { - "epoch": 3.88, - "learning_rate": 6.257501633595408e-06, - "loss": 0.2716, + "epoch": 3.9795293184848815, + "grad_norm": 0.28789547085762024, + "learning_rate": 5.279036404468904e-06, + "loss": 0.3491, "step": 110420 }, { - "epoch": 3.89, - "learning_rate": 6.2556165344298785e-06, - "loss": 0.2526, + "epoch": 3.9797095181461057, + "grad_norm": 0.20534764230251312, + "learning_rate": 5.277243052473252e-06, + "loss": 0.3759, "step": 110425 }, { - "epoch": 3.89, - "learning_rate": 6.253731678647809e-06, - "loss": 0.2588, + "epoch": 3.9798897178073305, + "grad_norm": 0.29807037115097046, + "learning_rate": 5.27544996919635e-06, + "loss": 0.3851, "step": 110430 }, { - "epoch": 3.89, - "learning_rate": 6.251847066273686e-06, - "loss": 0.2479, + "epoch": 3.980069917468555, + "grad_norm": 0.2329607754945755, + "learning_rate": 5.273657154662626e-06, + "loss": 0.3944, "step": 110435 }, { - "epoch": 3.89, - "learning_rate": 6.249962697331968e-06, - "loss": 0.2687, + "epoch": 3.98025011712978, + "grad_norm": 0.2554316520690918, + "learning_rate": 5.271864608896501e-06, + "loss": 0.4016, "step": 110440 }, { - "epoch": 3.89, - "learning_rate": 6.248078571847133e-06, - "loss": 0.2849, + "epoch": 3.9804303167910042, + "grad_norm": 0.2084946185350418, + "learning_rate": 5.270072331922405e-06, + "loss": 0.3763, "step": 110445 }, { - "epoch": 3.89, - "learning_rate": 6.24619468984364e-06, - "loss": 0.2562, + "epoch": 3.980610516452229, + "grad_norm": 0.2905483841896057, + "learning_rate": 5.268280323764744e-06, + "loss": 0.3805, "step": 110450 }, { - "epoch": 3.89, - "learning_rate": 6.244311051345941e-06, - 
"loss": 0.2657, + "epoch": 3.9807907161134537, + "grad_norm": 0.2571609616279602, + "learning_rate": 5.266488584447949e-06, + "loss": 0.3814, "step": 110455 }, { - "epoch": 3.89, - "learning_rate": 6.242427656378514e-06, - "loss": 0.2758, + "epoch": 3.9809709157746784, + "grad_norm": 0.25238341093063354, + "learning_rate": 5.264697113996428e-06, + "loss": 0.396, "step": 110460 }, { - "epoch": 3.89, - "learning_rate": 6.240544504965798e-06, - "loss": 0.2533, + "epoch": 3.981151115435903, + "grad_norm": 0.21064473688602448, + "learning_rate": 5.2629059124345875e-06, + "loss": 0.3894, "step": 110465 }, { - "epoch": 3.89, - "learning_rate": 6.23866159713225e-06, - "loss": 0.2474, + "epoch": 3.9813313150971275, + "grad_norm": 0.19198505580425262, + "learning_rate": 5.261114979786832e-06, + "loss": 0.3529, "step": 110470 }, { - "epoch": 3.89, - "learning_rate": 6.23677893290231e-06, - "loss": 0.2443, + "epoch": 3.981511514758352, + "grad_norm": 0.20793737471103668, + "learning_rate": 5.259324316077561e-06, + "loss": 0.3754, "step": 110475 }, { - "epoch": 3.89, - "learning_rate": 6.234896512300437e-06, - "loss": 0.2682, + "epoch": 3.981691714419577, + "grad_norm": 0.2338850349187851, + "learning_rate": 5.257533921331176e-06, + "loss": 0.3651, "step": 110480 }, { - "epoch": 3.89, - "learning_rate": 6.233014335351059e-06, - "loss": 0.2568, + "epoch": 3.9818719140808017, + "grad_norm": 0.2929813265800476, + "learning_rate": 5.255743795572071e-06, + "loss": 0.3788, "step": 110485 }, { - "epoch": 3.89, - "learning_rate": 6.231132402078629e-06, - "loss": 0.2572, + "epoch": 3.982052113742026, + "grad_norm": 0.22587592899799347, + "learning_rate": 5.253953938824635e-06, + "loss": 0.3693, "step": 110490 }, { - "epoch": 3.89, - "learning_rate": 6.22925071250757e-06, - "loss": 0.262, + "epoch": 3.9822323134032507, + "grad_norm": 0.25955894589424133, + "learning_rate": 5.252164351113254e-06, + "loss": 0.3895, "step": 110495 }, { - "epoch": 3.89, - "learning_rate": 6.2273692666623255e-06, - "loss": 0.269, + "epoch": 3.9824125130644754, + "grad_norm": 0.22721461951732635, + "learning_rate": 5.250375032462307e-06, + "loss": 0.3641, "step": 110500 }, { - "epoch": 3.89, - "eval_loss": 0.25054579973220825, - "eval_runtime": 10.5376, - "eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 3.9824125130644754, + "eval_loss": 0.4280896484851837, + "eval_runtime": 3.5308, + "eval_samples_per_second": 28.322, + "eval_steps_per_second": 7.081, "step": 110500 }, { - "epoch": 3.89, - "learning_rate": 6.225488064567322e-06, - "loss": 0.2626, + "epoch": 3.9825927127257, + "grad_norm": 0.21194064617156982, + "learning_rate": 5.248585982896173e-06, + "loss": 0.3549, "step": 110505 }, { - "epoch": 3.89, - "learning_rate": 6.223607106246981e-06, - "loss": 0.2529, + "epoch": 3.982772912386925, + "grad_norm": 0.24252504110336304, + "learning_rate": 5.2467972024392435e-06, + "loss": 0.3785, "step": 110510 }, { - "epoch": 3.89, - "learning_rate": 6.2217263917257215e-06, - "loss": 0.2658, + "epoch": 3.9829531120481496, + "grad_norm": 0.28674301505088806, + "learning_rate": 5.245008691115863e-06, + "loss": 0.3665, "step": 110515 }, { - "epoch": 3.89, - "learning_rate": 6.219845921027975e-06, - "loss": 0.2449, + "epoch": 3.983133311709374, + "grad_norm": 0.25369998812675476, + "learning_rate": 5.243220448950423e-06, + "loss": 0.3448, "step": 110520 }, { - "epoch": 3.89, - "learning_rate": 6.217965694178152e-06, - "loss": 0.263, + "epoch": 3.9833135113705986, + "grad_norm": 0.19632992148399353, + "learning_rate": 
5.241432475967278e-06, + "loss": 0.3655, "step": 110525 }, { - "epoch": 3.89, - "learning_rate": 6.21608571120067e-06, - "loss": 0.2426, + "epoch": 3.9834937110318234, + "grad_norm": 0.2441524714231491, + "learning_rate": 5.239644772190791e-06, + "loss": 0.3927, "step": 110530 }, { - "epoch": 3.89, - "learning_rate": 6.214205972119924e-06, - "loss": 0.2637, + "epoch": 3.9836739106930477, + "grad_norm": 0.2212579846382141, + "learning_rate": 5.237857337645319e-06, + "loss": 0.3874, "step": 110535 }, { - "epoch": 3.89, - "learning_rate": 6.212326476960345e-06, - "loss": 0.267, + "epoch": 3.9838541103542724, + "grad_norm": 0.2771473228931427, + "learning_rate": 5.236070172355212e-06, + "loss": 0.3673, "step": 110540 }, { - "epoch": 3.89, - "learning_rate": 6.210447225746316e-06, - "loss": 0.2568, + "epoch": 3.984034310015497, + "grad_norm": 0.2671329379081726, + "learning_rate": 5.234283276344818e-06, + "loss": 0.3517, "step": 110545 }, { - "epoch": 3.89, - "learning_rate": 6.208568218502253e-06, - "loss": 0.2487, + "epoch": 3.984214509676722, + "grad_norm": 0.20370586216449738, + "learning_rate": 5.232496649638494e-06, + "loss": 0.3667, "step": 110550 }, { - "epoch": 3.89, - "learning_rate": 6.206689455252543e-06, - "loss": 0.2381, + "epoch": 3.9843947093379466, + "grad_norm": 0.25102975964546204, + "learning_rate": 5.230710292260576e-06, + "loss": 0.382, "step": 110555 }, { - "epoch": 3.89, - "learning_rate": 6.20481093602159e-06, - "loss": 0.2342, + "epoch": 3.9845749089991713, + "grad_norm": 0.2473146915435791, + "learning_rate": 5.2289242042354025e-06, + "loss": 0.4072, "step": 110560 }, { - "epoch": 3.89, - "learning_rate": 6.202932660833783e-06, - "loss": 0.2762, + "epoch": 3.9847551086603956, + "grad_norm": 0.2885691523551941, + "learning_rate": 5.22713838558731e-06, + "loss": 0.3524, "step": 110565 }, { - "epoch": 3.89, - "learning_rate": 6.201054629713507e-06, - "loss": 0.2464, + "epoch": 3.9849353083216204, + "grad_norm": 0.2779085338115692, + "learning_rate": 5.225352836340622e-06, + "loss": 0.3521, "step": 110570 }, { - "epoch": 3.89, - "learning_rate": 6.199176842685139e-06, - "loss": 0.2611, + "epoch": 3.985115507982845, + "grad_norm": 0.2686401903629303, + "learning_rate": 5.223567556519679e-06, + "loss": 0.3986, "step": 110575 }, { - "epoch": 3.89, - "learning_rate": 6.197299299773077e-06, - "loss": 0.2453, + "epoch": 3.9852957076440694, + "grad_norm": 0.20243895053863525, + "learning_rate": 5.221782546148804e-06, + "loss": 0.358, "step": 110580 }, { - "epoch": 3.89, - "learning_rate": 6.195422001001694e-06, - "loss": 0.2531, + "epoch": 3.985475907305294, + "grad_norm": 0.2190602421760559, + "learning_rate": 5.219997805252303e-06, + "loss": 0.3865, "step": 110585 }, { - "epoch": 3.89, - "learning_rate": 6.193544946395355e-06, - "loss": 0.2641, + "epoch": 3.985656106966519, + "grad_norm": 0.2563530504703522, + "learning_rate": 5.2182133338545074e-06, + "loss": 0.3668, "step": 110590 }, { - "epoch": 3.89, - "learning_rate": 6.191668135978448e-06, - "loss": 0.2675, + "epoch": 3.9858363066277436, + "grad_norm": 0.26702284812927246, + "learning_rate": 5.216429131979717e-06, + "loss": 0.3666, "step": 110595 }, { - "epoch": 3.89, - "learning_rate": 6.189791569775327e-06, - "loss": 0.2526, + "epoch": 3.9860165062889683, + "grad_norm": 0.243973970413208, + "learning_rate": 5.214645199652257e-06, + "loss": 0.3657, "step": 110600 }, { - "epoch": 3.89, - "learning_rate": 6.187915247810375e-06, - "loss": 0.2349, + "epoch": 3.986196705950193, + "grad_norm": 0.22171549499034882, + 
"learning_rate": 5.212861536896435e-06, + "loss": 0.4154, "step": 110605 }, { - "epoch": 3.89, - "learning_rate": 6.1860391701079376e-06, - "loss": 0.253, + "epoch": 3.9863769056114173, + "grad_norm": 0.224653422832489, + "learning_rate": 5.211078143736528e-06, + "loss": 0.3901, "step": 110610 }, { - "epoch": 3.89, - "learning_rate": 6.184163336692392e-06, - "loss": 0.2703, + "epoch": 3.986557105272642, + "grad_norm": 0.2409631460905075, + "learning_rate": 5.209295020196855e-06, + "loss": 0.3659, "step": 110615 }, { - "epoch": 3.89, - "learning_rate": 6.182287747588084e-06, - "loss": 0.2514, + "epoch": 3.986737304933867, + "grad_norm": 0.21369391679763794, + "learning_rate": 5.207512166301709e-06, + "loss": 0.3474, "step": 110620 }, { - "epoch": 3.89, - "learning_rate": 6.180412402819366e-06, - "loss": 0.2467, + "epoch": 3.986917504595091, + "grad_norm": 0.24725468456745148, + "learning_rate": 5.205729582075375e-06, + "loss": 0.3711, "step": 110625 }, { - "epoch": 3.89, - "learning_rate": 6.178537302410587e-06, - "loss": 0.2518, + "epoch": 3.987097704256316, + "grad_norm": 0.23448602855205536, + "learning_rate": 5.203947267542145e-06, + "loss": 0.3823, "step": 110630 }, { - "epoch": 3.89, - "learning_rate": 6.1766624463861e-06, - "loss": 0.2674, + "epoch": 3.9872779039175406, + "grad_norm": 0.20930229127407074, + "learning_rate": 5.202165222726294e-06, + "loss": 0.3424, "step": 110635 }, { - "epoch": 3.89, - "learning_rate": 6.174787834770251e-06, - "loss": 0.2545, + "epoch": 3.9874581035787653, + "grad_norm": 0.2615196406841278, + "learning_rate": 5.200383447652115e-06, + "loss": 0.3677, "step": 110640 }, { - "epoch": 3.89, - "learning_rate": 6.172913467587374e-06, - "loss": 0.259, + "epoch": 3.98763830323999, + "grad_norm": 0.27675536274909973, + "learning_rate": 5.198601942343878e-06, + "loss": 0.3756, "step": 110645 }, { - "epoch": 3.89, - "learning_rate": 6.171039344861798e-06, - "loss": 0.2691, + "epoch": 3.9878185029012148, + "grad_norm": 0.2948794364929199, + "learning_rate": 5.196820706825856e-06, + "loss": 0.4064, "step": 110650 }, { - "epoch": 3.89, - "learning_rate": 6.169165466617871e-06, - "loss": 0.2591, + "epoch": 3.987998702562439, + "grad_norm": 0.2301834374666214, + "learning_rate": 5.195039741122318e-06, + "loss": 0.337, "step": 110655 }, { - "epoch": 3.89, - "learning_rate": 6.167291832879926e-06, - "loss": 0.2503, + "epoch": 3.988178902223664, + "grad_norm": 0.27218741178512573, + "learning_rate": 5.193259045257523e-06, + "loss": 0.4075, "step": 110660 }, { - "epoch": 3.89, - "learning_rate": 6.165418443672285e-06, - "loss": 0.2583, + "epoch": 3.9883591018848885, + "grad_norm": 0.2404523491859436, + "learning_rate": 5.191478619255746e-06, + "loss": 0.3845, "step": 110665 }, { - "epoch": 3.89, - "learning_rate": 6.163545299019266e-06, - "loss": 0.2442, + "epoch": 3.9885393015461132, + "grad_norm": 0.2734452486038208, + "learning_rate": 5.189698463141237e-06, + "loss": 0.3988, "step": 110670 }, { - "epoch": 3.89, - "learning_rate": 6.1616723989452064e-06, - "loss": 0.282, + "epoch": 3.9887195012073375, + "grad_norm": 0.2366897016763687, + "learning_rate": 5.18791857693825e-06, + "loss": 0.3859, "step": 110675 }, { - "epoch": 3.89, - "learning_rate": 6.159799743474415e-06, - "loss": 0.2496, + "epoch": 3.9888997008685623, + "grad_norm": 0.22594159841537476, + "learning_rate": 5.186138960671039e-06, + "loss": 0.3872, "step": 110680 }, { - "epoch": 3.89, - "learning_rate": 6.1579273326312085e-06, - "loss": 0.2592, + "epoch": 3.989079900529787, + "grad_norm": 
0.24368955194950104, + "learning_rate": 5.184359614363846e-06, + "loss": 0.3196, "step": 110685 }, { - "epoch": 3.89, - "learning_rate": 6.156055166439889e-06, - "loss": 0.248, + "epoch": 3.9892601001910117, + "grad_norm": 0.25890570878982544, + "learning_rate": 5.18258053804091e-06, + "loss": 0.3746, "step": 110690 }, { - "epoch": 3.89, - "learning_rate": 6.154183244924783e-06, - "loss": 0.271, + "epoch": 3.9894402998522365, + "grad_norm": 0.23485039174556732, + "learning_rate": 5.180801731726493e-06, + "loss": 0.364, "step": 110695 }, { - "epoch": 3.89, - "learning_rate": 6.152311568110186e-06, - "loss": 0.2695, + "epoch": 3.9896204995134608, + "grad_norm": 0.18861232697963715, + "learning_rate": 5.179023195444802e-06, + "loss": 0.3839, "step": 110700 }, { - "epoch": 3.89, - "learning_rate": 6.150440136020397e-06, - "loss": 0.2529, + "epoch": 3.9898006991746855, + "grad_norm": 0.23490877449512482, + "learning_rate": 5.177244929220088e-06, + "loss": 0.3705, "step": 110705 }, { - "epoch": 3.9, - "learning_rate": 6.14856894867972e-06, - "loss": 0.2594, + "epoch": 3.9899808988359102, + "grad_norm": 0.2072230577468872, + "learning_rate": 5.175466933076573e-06, + "loss": 0.3797, "step": 110710 }, { - "epoch": 3.9, - "learning_rate": 6.1466980061124586e-06, - "loss": 0.2701, + "epoch": 3.990161098497135, + "grad_norm": 0.2669602334499359, + "learning_rate": 5.173689207038479e-06, + "loss": 0.3559, "step": 110715 }, { - "epoch": 3.9, - "learning_rate": 6.144827308342899e-06, - "loss": 0.2438, + "epoch": 3.9903412981583593, + "grad_norm": 0.2188437432050705, + "learning_rate": 5.17191175113004e-06, + "loss": 0.3732, "step": 110720 }, { - "epoch": 3.9, - "learning_rate": 6.142956855395324e-06, - "loss": 0.2585, + "epoch": 3.990521497819584, + "grad_norm": 0.2595865726470947, + "learning_rate": 5.1701345653754615e-06, + "loss": 0.3975, "step": 110725 }, { - "epoch": 3.9, - "learning_rate": 6.141086647294034e-06, - "loss": 0.2527, + "epoch": 3.9907016974808087, + "grad_norm": 0.22355596721172333, + "learning_rate": 5.168357649798952e-06, + "loss": 0.3346, "step": 110730 }, { - "epoch": 3.9, - "learning_rate": 6.139216684063307e-06, - "loss": 0.2485, + "epoch": 3.9908818971420335, + "grad_norm": 0.19183094799518585, + "learning_rate": 5.1665810044247395e-06, + "loss": 0.3851, "step": 110735 }, { - "epoch": 3.9, - "learning_rate": 6.137346965727417e-06, - "loss": 0.2606, + "epoch": 3.991062096803258, + "grad_norm": 0.2600308060646057, + "learning_rate": 5.164804629277018e-06, + "loss": 0.3942, "step": 110740 }, { - "epoch": 3.9, - "learning_rate": 6.135477492310643e-06, - "loss": 0.257, + "epoch": 3.991242296464483, + "grad_norm": 0.2572632431983948, + "learning_rate": 5.163028524379995e-06, + "loss": 0.4191, "step": 110745 }, { - "epoch": 3.9, - "learning_rate": 6.133608263837265e-06, - "loss": 0.2391, + "epoch": 3.991422496125707, + "grad_norm": 0.21313636004924774, + "learning_rate": 5.161252689757867e-06, + "loss": 0.3822, "step": 110750 }, { - "epoch": 3.9, - "learning_rate": 6.131739280331553e-06, - "loss": 0.2296, + "epoch": 3.991602695786932, + "grad_norm": 0.24319961667060852, + "learning_rate": 5.159477125434822e-06, + "loss": 0.3934, "step": 110755 }, { - "epoch": 3.9, - "learning_rate": 6.129870541817765e-06, - "loss": 0.2693, + "epoch": 3.9917828954481567, + "grad_norm": 0.2313625067472458, + "learning_rate": 5.157701831435069e-06, + "loss": 0.3736, "step": 110760 }, { - "epoch": 3.9, - "learning_rate": 6.128002048320169e-06, - "loss": 0.2424, + "epoch": 3.991963095109381, + "grad_norm": 
0.2143564522266388, + "learning_rate": 5.155926807782785e-06, + "loss": 0.3559, "step": 110765 }, { - "epoch": 3.9, - "learning_rate": 6.12613379986304e-06, - "loss": 0.2491, + "epoch": 3.9921432947706057, + "grad_norm": 0.28823596239089966, + "learning_rate": 5.15415205450216e-06, + "loss": 0.3423, "step": 110770 }, { - "epoch": 3.9, - "learning_rate": 6.124265796470624e-06, - "loss": 0.2508, + "epoch": 3.9923234944318304, + "grad_norm": 0.21266886591911316, + "learning_rate": 5.152377571617368e-06, + "loss": 0.367, "step": 110775 }, { - "epoch": 3.9, - "learning_rate": 6.122398038167176e-06, - "loss": 0.2627, + "epoch": 3.992503694093055, + "grad_norm": 0.25234442949295044, + "learning_rate": 5.150603359152581e-06, + "loss": 0.3763, "step": 110780 }, { - "epoch": 3.9, - "learning_rate": 6.120530524976942e-06, - "loss": 0.2383, + "epoch": 3.99268389375428, + "grad_norm": 0.2379533350467682, + "learning_rate": 5.14882941713199e-06, + "loss": 0.3801, "step": 110785 }, { - "epoch": 3.9, - "learning_rate": 6.118663256924187e-06, - "loss": 0.262, + "epoch": 3.9928640934155046, + "grad_norm": 0.26140525937080383, + "learning_rate": 5.147410458251575e-06, + "loss": 0.3709, "step": 110790 }, { - "epoch": 3.9, - "learning_rate": 6.116796234033143e-06, - "loss": 0.2547, + "epoch": 3.993044293076729, + "grad_norm": 0.24928440153598785, + "learning_rate": 5.1456370030914245e-06, + "loss": 0.3754, "step": 110795 }, { - "epoch": 3.9, - "learning_rate": 6.114929456328056e-06, - "loss": 0.2659, + "epoch": 3.9932244927379537, + "grad_norm": 0.2500706613063812, + "learning_rate": 5.1438638184431175e-06, + "loss": 0.3691, "step": 110800 }, { - "epoch": 3.9, - "learning_rate": 6.113062923833157e-06, - "loss": 0.2456, + "epoch": 3.9934046923991784, + "grad_norm": 0.2307901233434677, + "learning_rate": 5.142090904330829e-06, + "loss": 0.3391, "step": 110805 }, { - "epoch": 3.9, - "learning_rate": 6.111196636572697e-06, - "loss": 0.2617, + "epoch": 3.9935848920604027, + "grad_norm": 0.2293684333562851, + "learning_rate": 5.1403182607787065e-06, + "loss": 0.3591, "step": 110810 }, { - "epoch": 3.9, - "learning_rate": 6.109330594570892e-06, - "loss": 0.2244, + "epoch": 3.9937650917216274, + "grad_norm": 0.21017733216285706, + "learning_rate": 5.138545887810903e-06, + "loss": 0.3675, "step": 110815 }, { - "epoch": 3.9, - "learning_rate": 6.107464797851989e-06, - "loss": 0.2801, + "epoch": 3.993945291382852, + "grad_norm": 0.26224109530448914, + "learning_rate": 5.136773785451565e-06, + "loss": 0.3531, "step": 110820 }, { - "epoch": 3.9, - "learning_rate": 6.105599246440197e-06, - "loss": 0.2624, + "epoch": 3.994125491044077, + "grad_norm": 0.20869319140911102, + "learning_rate": 5.135001953724833e-06, + "loss": 0.3558, "step": 110825 }, { - "epoch": 3.9, - "learning_rate": 6.103733940359754e-06, - "loss": 0.2427, + "epoch": 3.9943056907053016, + "grad_norm": 0.3143329918384552, + "learning_rate": 5.133230392654861e-06, + "loss": 0.3956, "step": 110830 }, { - "epoch": 3.9, - "learning_rate": 6.101868879634873e-06, - "loss": 0.265, + "epoch": 3.9944858903665263, + "grad_norm": 0.25546544790267944, + "learning_rate": 5.131459102265779e-06, + "loss": 0.3808, "step": 110835 }, { - "epoch": 3.9, - "learning_rate": 6.10000406428976e-06, - "loss": 0.2538, + "epoch": 3.9946660900277506, + "grad_norm": 0.20834296941757202, + "learning_rate": 5.1296880825817185e-06, + "loss": 0.3834, "step": 110840 }, { - "epoch": 3.9, - "learning_rate": 6.098139494348651e-06, - "loss": 0.249, + "epoch": 3.9948462896889754, + "grad_norm": 
0.2101668119430542, + "learning_rate": 5.127917333626811e-06, + "loss": 0.3633, "step": 110845 }, { - "epoch": 3.9, - "learning_rate": 6.096275169835742e-06, - "loss": 0.2571, + "epoch": 3.9950264893502, + "grad_norm": 0.2534985840320587, + "learning_rate": 5.126146855425176e-06, + "loss": 0.3666, "step": 110850 }, { - "epoch": 3.9, - "learning_rate": 6.09441109077524e-06, - "loss": 0.2541, + "epoch": 3.9952066890114244, + "grad_norm": 0.1858637034893036, + "learning_rate": 5.12437664800095e-06, + "loss": 0.3767, "step": 110855 }, { - "epoch": 3.9, - "learning_rate": 6.092547257191344e-06, - "loss": 0.2631, + "epoch": 3.995386888672649, + "grad_norm": 0.25924816727638245, + "learning_rate": 5.12260671137825e-06, + "loss": 0.3932, "step": 110860 }, { - "epoch": 3.9, - "learning_rate": 6.0906836691082685e-06, - "loss": 0.2344, + "epoch": 3.995567088333874, + "grad_norm": 0.2486696094274521, + "learning_rate": 5.12083704558117e-06, + "loss": 0.3783, "step": 110865 }, { - "epoch": 3.9, - "learning_rate": 6.088820326550199e-06, - "loss": 0.2398, + "epoch": 3.9957472879950986, + "grad_norm": 0.24414058029651642, + "learning_rate": 5.119067650633847e-06, + "loss": 0.3898, "step": 110870 }, { - "epoch": 3.9, - "learning_rate": 6.0869572295413405e-06, - "loss": 0.2803, + "epoch": 3.9959274876563233, + "grad_norm": 0.22701597213745117, + "learning_rate": 5.117298526560374e-06, + "loss": 0.3538, "step": 110875 }, { - "epoch": 3.9, - "learning_rate": 6.085094378105868e-06, - "loss": 0.2657, + "epoch": 3.996107687317548, + "grad_norm": 0.2523609697818756, + "learning_rate": 5.115529673384861e-06, + "loss": 0.3745, "step": 110880 }, { - "epoch": 3.9, - "learning_rate": 6.083231772267991e-06, - "loss": 0.2532, + "epoch": 3.9962878869787724, + "grad_norm": 0.21400229632854462, + "learning_rate": 5.113761091131408e-06, + "loss": 0.3759, "step": 110885 }, { - "epoch": 3.9, - "learning_rate": 6.081369412051882e-06, - "loss": 0.2559, + "epoch": 3.996468086639997, + "grad_norm": 0.2355312556028366, + "learning_rate": 5.111992779824101e-06, + "loss": 0.3256, "step": 110890 }, { - "epoch": 3.9, - "learning_rate": 6.079507297481723e-06, - "loss": 0.2637, + "epoch": 3.996648286301222, + "grad_norm": 0.231663778424263, + "learning_rate": 5.110224739487051e-06, + "loss": 0.4152, "step": 110895 }, { - "epoch": 3.9, - "learning_rate": 6.0776454285816856e-06, - "loss": 0.2566, + "epoch": 3.9968284859624466, + "grad_norm": 0.22095178067684174, + "learning_rate": 5.108456970144338e-06, + "loss": 0.3665, "step": 110900 }, { - "epoch": 3.9, - "learning_rate": 6.075783805375956e-06, - "loss": 0.2534, + "epoch": 3.997008685623671, + "grad_norm": 0.31155142188072205, + "learning_rate": 5.106689471820045e-06, + "loss": 0.3996, "step": 110905 }, { - "epoch": 3.9, - "learning_rate": 6.073922427888707e-06, - "loss": 0.2611, + "epoch": 3.9971888852848956, + "grad_norm": 0.18749769032001495, + "learning_rate": 5.10492224453826e-06, + "loss": 0.3583, "step": 110910 }, { - "epoch": 3.9, - "learning_rate": 6.072061296144099e-06, - "loss": 0.2659, + "epoch": 3.9973690849461203, + "grad_norm": 0.28036782145500183, + "learning_rate": 5.1031552883230475e-06, + "loss": 0.3738, "step": 110915 }, { - "epoch": 3.9, - "learning_rate": 6.070200410166297e-06, - "loss": 0.2797, + "epoch": 3.997549284607345, + "grad_norm": 0.2641852796077728, + "learning_rate": 5.101388603198501e-06, + "loss": 0.3823, "step": 110920 }, { - "epoch": 3.9, - "learning_rate": 6.068339769979465e-06, - "loss": 0.2757, + "epoch": 3.9977294842685698, + "grad_norm": 
0.22281073033809662, + "learning_rate": 5.099622189188679e-06, + "loss": 0.3982, "step": 110925 }, { - "epoch": 3.9, - "learning_rate": 6.066479375607773e-06, - "loss": 0.2475, + "epoch": 3.997909683929794, + "grad_norm": 0.2702135741710663, + "learning_rate": 5.097856046317656e-06, + "loss": 0.3821, "step": 110930 }, { - "epoch": 3.9, - "learning_rate": 6.06461922707536e-06, - "loss": 0.2344, + "epoch": 3.998089883591019, + "grad_norm": 0.25473445653915405, + "learning_rate": 5.096090174609489e-06, + "loss": 0.3727, "step": 110935 }, { - "epoch": 3.9, - "learning_rate": 6.062759324406397e-06, - "loss": 0.2698, + "epoch": 3.9982700832522435, + "grad_norm": 0.2873947024345398, + "learning_rate": 5.0943245740882414e-06, + "loss": 0.3805, "step": 110940 }, { - "epoch": 3.9, - "learning_rate": 6.060899667625023e-06, - "loss": 0.2541, + "epoch": 3.9984502829134683, + "grad_norm": 0.211199089884758, + "learning_rate": 5.092559244777958e-06, + "loss": 0.358, "step": 110945 }, { - "epoch": 3.9, - "learning_rate": 6.059040256755386e-06, - "loss": 0.2707, + "epoch": 3.9986304825746926, + "grad_norm": 0.256854772567749, + "learning_rate": 5.090794186702711e-06, + "loss": 0.3848, "step": 110950 }, { - "epoch": 3.9, - "learning_rate": 6.057181091821621e-06, - "loss": 0.2364, + "epoch": 3.9988106822359173, + "grad_norm": 0.22104725241661072, + "learning_rate": 5.0890293998865355e-06, + "loss": 0.3522, "step": 110955 }, { - "epoch": 3.9, - "learning_rate": 6.055322172847882e-06, - "loss": 0.2434, + "epoch": 3.998990881897142, + "grad_norm": 0.2235444188117981, + "learning_rate": 5.087264884353482e-06, + "loss": 0.3764, "step": 110960 }, { - "epoch": 3.9, - "learning_rate": 6.0534634998583e-06, - "loss": 0.2586, + "epoch": 3.9991710815583668, + "grad_norm": 0.21866776049137115, + "learning_rate": 5.085500640127588e-06, + "loss": 0.366, "step": 110965 }, { - "epoch": 3.9, - "learning_rate": 6.051605072877009e-06, - "loss": 0.2427, + "epoch": 3.9993512812195915, + "grad_norm": 0.2265380173921585, + "learning_rate": 5.083736667232886e-06, + "loss": 0.3968, "step": 110970 }, { - "epoch": 3.9, - "learning_rate": 6.049746891928132e-06, - "loss": 0.2499, + "epoch": 3.999531480880816, + "grad_norm": 0.19983336329460144, + "learning_rate": 5.081972965693429e-06, + "loss": 0.3938, "step": 110975 }, { - "epoch": 3.9, - "learning_rate": 6.047888957035808e-06, - "loss": 0.2687, + "epoch": 3.9997116805420405, + "grad_norm": 0.2481551617383957, + "learning_rate": 5.080209535533229e-06, + "loss": 0.3913, "step": 110980 }, { - "epoch": 3.9, - "learning_rate": 6.04603126822415e-06, - "loss": 0.2442, + "epoch": 3.9998918802032652, + "grad_norm": 0.20951221883296967, + "learning_rate": 5.078446376776308e-06, + "loss": 0.3391, "step": 110985 }, { - "epoch": 3.9, - "learning_rate": 6.0441738255172905e-06, - "loss": 0.2566, + "epoch": 4.0000720798644895, + "grad_norm": 0.25803592801094055, + "learning_rate": 5.076683489446707e-06, + "loss": 0.3555, "step": 110990 }, { - "epoch": 3.91, - "learning_rate": 6.042316628939337e-06, - "loss": 0.2521, + "epoch": 4.000252279525714, + "grad_norm": 0.17808286845684052, + "learning_rate": 5.074920873568434e-06, + "loss": 0.3527, "step": 110995 }, { - "epoch": 3.91, - "learning_rate": 6.0404596785144155e-06, - "loss": 0.2526, + "epoch": 4.000432479186939, + "grad_norm": 0.2654048502445221, + "learning_rate": 5.073158529165508e-06, + "loss": 0.3552, "step": 111000 }, { - "epoch": 3.91, - "eval_loss": 0.25022944808006287, - "eval_runtime": 10.5381, - "eval_samples_per_second": 9.489, - 
"eval_steps_per_second": 9.489, + "epoch": 4.000432479186939, + "eval_loss": 0.42817291617393494, + "eval_runtime": 3.521, + "eval_samples_per_second": 28.401, + "eval_steps_per_second": 7.1, "step": 111000 }, { - "epoch": 3.91, - "learning_rate": 6.038602974266627e-06, - "loss": 0.2202, + "epoch": 4.000612678848164, + "grad_norm": 0.34447306394577026, + "learning_rate": 5.07139645626194e-06, + "loss": 0.3778, "step": 111005 }, { - "epoch": 3.91, - "learning_rate": 6.036746516220088e-06, - "loss": 0.2684, + "epoch": 4.0007928785093885, + "grad_norm": 0.22575746476650238, + "learning_rate": 5.069634654881728e-06, + "loss": 0.4257, "step": 111010 }, { - "epoch": 3.91, - "learning_rate": 6.034890304398888e-06, - "loss": 0.2788, + "epoch": 4.000973078170613, + "grad_norm": 0.23802292346954346, + "learning_rate": 5.067873125048894e-06, + "loss": 0.3794, "step": 111015 }, { - "epoch": 3.91, - "learning_rate": 6.033034338827146e-06, - "loss": 0.2533, + "epoch": 4.001153277831838, + "grad_norm": 0.20055340230464935, + "learning_rate": 5.066111866787429e-06, + "loss": 0.3623, "step": 111020 }, { - "epoch": 3.91, - "learning_rate": 6.031178619528951e-06, - "loss": 0.2626, + "epoch": 4.001333477493063, + "grad_norm": 0.23983781039714813, + "learning_rate": 5.06435088012133e-06, + "loss": 0.3751, "step": 111025 }, { - "epoch": 3.91, - "learning_rate": 6.0293231465283986e-06, - "loss": 0.2392, + "epoch": 4.0015136771542865, + "grad_norm": 0.27245986461639404, + "learning_rate": 5.062590165074591e-06, + "loss": 0.3687, "step": 111030 }, { - "epoch": 3.91, - "learning_rate": 6.0274679198495856e-06, - "loss": 0.236, + "epoch": 4.001693876815511, + "grad_norm": 0.20769277215003967, + "learning_rate": 5.060829721671193e-06, + "loss": 0.365, "step": 111035 }, { - "epoch": 3.91, - "learning_rate": 6.025612939516595e-06, - "loss": 0.2892, + "epoch": 4.001874076476736, + "grad_norm": 0.2103443592786789, + "learning_rate": 5.059069549935139e-06, + "loss": 0.349, "step": 111040 }, { - "epoch": 3.91, - "learning_rate": 6.0237582055535205e-06, - "loss": 0.2554, + "epoch": 4.002054276137961, + "grad_norm": 0.26758506894111633, + "learning_rate": 5.057309649890407e-06, + "loss": 0.3724, "step": 111045 }, { - "epoch": 3.91, - "learning_rate": 6.021903717984434e-06, - "loss": 0.2808, + "epoch": 4.0022344757991855, + "grad_norm": 0.22477580606937408, + "learning_rate": 5.055550021560956e-06, + "loss": 0.3396, "step": 111050 }, { - "epoch": 3.91, - "learning_rate": 6.020049476833428e-06, - "loss": 0.2824, + "epoch": 4.00241467546041, + "grad_norm": 0.2671130299568176, + "learning_rate": 5.053790664970781e-06, + "loss": 0.3947, "step": 111055 }, { - "epoch": 3.91, - "learning_rate": 6.0181954821245695e-06, - "loss": 0.2596, + "epoch": 4.002594875121635, + "grad_norm": 0.24974533915519714, + "learning_rate": 5.052031580143848e-06, + "loss": 0.3721, "step": 111060 }, { - "epoch": 3.91, - "learning_rate": 6.016341733881936e-06, - "loss": 0.2509, + "epoch": 4.00277507478286, + "grad_norm": 0.2837058901786804, + "learning_rate": 5.050272767104114e-06, + "loss": 0.3729, "step": 111065 }, { - "epoch": 3.91, - "learning_rate": 6.014488232129584e-06, - "loss": 0.2612, + "epoch": 4.002955274444084, + "grad_norm": 0.29399046301841736, + "learning_rate": 5.048514225875567e-06, + "loss": 0.4003, "step": 111070 }, { - "epoch": 3.91, - "learning_rate": 6.012634976891598e-06, - "loss": 0.2241, + "epoch": 4.003135474105309, + "grad_norm": 0.2518676221370697, + "learning_rate": 5.046755956482135e-06, + "loss": 0.386, "step": 111075 }, { - 
"epoch": 3.91, - "learning_rate": 6.010781968192037e-06, - "loss": 0.2892, + "epoch": 4.003315673766533, + "grad_norm": 0.3091764748096466, + "learning_rate": 5.044997958947801e-06, + "loss": 0.3661, "step": 111080 }, { - "epoch": 3.91, - "learning_rate": 6.008929206054956e-06, - "loss": 0.2441, + "epoch": 4.003495873427758, + "grad_norm": 0.24037063121795654, + "learning_rate": 5.0432402332965005e-06, + "loss": 0.3483, "step": 111085 }, { - "epoch": 3.91, - "learning_rate": 6.007076690504404e-06, - "loss": 0.2606, + "epoch": 4.003676073088982, + "grad_norm": 0.2921180725097656, + "learning_rate": 5.041482779552192e-06, + "loss": 0.3889, "step": 111090 }, { - "epoch": 3.91, - "learning_rate": 6.0052244215644484e-06, - "loss": 0.2838, + "epoch": 4.003856272750207, + "grad_norm": 0.20005546510219574, + "learning_rate": 5.039725597738815e-06, + "loss": 0.3702, "step": 111095 }, { - "epoch": 3.91, - "learning_rate": 6.003372399259138e-06, - "loss": 0.2421, + "epoch": 4.004036472411432, + "grad_norm": 0.25480973720550537, + "learning_rate": 5.037968687880306e-06, + "loss": 0.363, "step": 111100 }, { - "epoch": 3.91, - "learning_rate": 6.001520623612522e-06, - "loss": 0.2662, + "epoch": 4.004216672072657, + "grad_norm": 0.2529853284358978, + "learning_rate": 5.036212050000616e-06, + "loss": 0.3698, "step": 111105 }, { - "epoch": 3.91, - "learning_rate": 5.999669094648633e-06, - "loss": 0.2496, + "epoch": 4.004396871733881, + "grad_norm": 0.2999667525291443, + "learning_rate": 5.034455684123673e-06, + "loss": 0.3507, "step": 111110 }, { - "epoch": 3.91, - "learning_rate": 5.997817812391523e-06, - "loss": 0.2536, + "epoch": 4.004577071395106, + "grad_norm": 0.21827971935272217, + "learning_rate": 5.032699590273404e-06, + "loss": 0.3973, "step": 111115 }, { - "epoch": 3.91, - "learning_rate": 5.995966776865228e-06, - "loss": 0.2571, + "epoch": 4.004757271056331, + "grad_norm": 0.2813790440559387, + "learning_rate": 5.030943768473736e-06, + "loss": 0.3708, "step": 111120 }, { - "epoch": 3.91, - "learning_rate": 5.994115988093782e-06, - "loss": 0.2573, + "epoch": 4.004937470717555, + "grad_norm": 0.28172221779823303, + "learning_rate": 5.029188218748595e-06, + "loss": 0.38, "step": 111125 }, { - "epoch": 3.91, - "learning_rate": 5.992265446101203e-06, - "loss": 0.2578, + "epoch": 4.005117670378779, + "grad_norm": 0.21738965809345245, + "learning_rate": 5.027432941121893e-06, + "loss": 0.3671, "step": 111130 }, { - "epoch": 3.91, - "learning_rate": 5.990415150911541e-06, - "loss": 0.297, + "epoch": 4.005297870040004, + "grad_norm": 0.24970416724681854, + "learning_rate": 5.025677935617554e-06, + "loss": 0.3419, "step": 111135 }, { - "epoch": 3.91, - "learning_rate": 5.98856510254881e-06, - "loss": 0.2555, + "epoch": 4.005478069701229, + "grad_norm": Infinity, + "learning_rate": 5.024274127158257e-06, + "loss": 0.3606, "step": 111140 }, { - "epoch": 3.91, - "learning_rate": 5.986715301037024e-06, - "loss": 0.2434, + "epoch": 4.005658269362454, + "grad_norm": 0.289569228887558, + "learning_rate": 5.0225196115344124e-06, + "loss": 0.3755, "step": 111145 }, { - "epoch": 3.91, - "learning_rate": 5.984865746400211e-06, - "loss": 0.266, + "epoch": 4.005838469023678, + "grad_norm": 0.2360028475522995, + "learning_rate": 5.020765368099878e-06, + "loss": 0.3216, "step": 111150 }, { - "epoch": 3.91, - "learning_rate": 5.983016438662392e-06, - "loss": 0.2509, + "epoch": 4.006018668684903, + "grad_norm": 0.26487815380096436, + "learning_rate": 5.019011396878548e-06, + "loss": 0.3865, "step": 111155 }, { - "epoch": 
3.91, - "learning_rate": 5.981167377847571e-06, - "loss": 0.2096, + "epoch": 4.006198868346128, + "grad_norm": 0.19243910908699036, + "learning_rate": 5.017257697894321e-06, + "loss": 0.3547, "step": 111160 }, { - "epoch": 3.91, - "learning_rate": 5.97931856397975e-06, - "loss": 0.2772, + "epoch": 4.0063790680073526, + "grad_norm": 0.2855333983898163, + "learning_rate": 5.015504271171087e-06, + "loss": 0.3976, "step": 111165 }, { - "epoch": 3.91, - "learning_rate": 5.977469997082949e-06, - "loss": 0.2639, + "epoch": 4.006559267668576, + "grad_norm": 0.2589762508869171, + "learning_rate": 5.013751116732732e-06, + "loss": 0.3747, "step": 111170 }, { - "epoch": 3.91, - "learning_rate": 5.975621677181165e-06, - "loss": 0.2814, + "epoch": 4.006739467329801, + "grad_norm": 0.22709418833255768, + "learning_rate": 5.011998234603155e-06, + "loss": 0.357, "step": 111175 }, { - "epoch": 3.91, - "learning_rate": 5.973773604298397e-06, - "loss": 0.2605, + "epoch": 4.006919666991026, + "grad_norm": 0.2505840063095093, + "learning_rate": 5.010245624806232e-06, + "loss": 0.3812, "step": 111180 }, { - "epoch": 3.91, - "learning_rate": 5.9719257784586316e-06, - "loss": 0.265, + "epoch": 4.007099866652251, + "grad_norm": 0.23559889197349548, + "learning_rate": 5.008493287365842e-06, + "loss": 0.3925, "step": 111185 }, { - "epoch": 3.91, - "learning_rate": 5.970078199685877e-06, - "loss": 0.2361, + "epoch": 4.007280066313475, + "grad_norm": 0.22356297075748444, + "learning_rate": 5.006741222305861e-06, + "loss": 0.3622, "step": 111190 }, { - "epoch": 3.91, - "learning_rate": 5.968230868004115e-06, - "loss": 0.258, + "epoch": 4.0074602659747, + "grad_norm": 0.1752263903617859, + "learning_rate": 5.004989429650161e-06, + "loss": 0.3287, "step": 111195 }, { - "epoch": 3.91, - "learning_rate": 5.966383783437329e-06, - "loss": 0.2375, + "epoch": 4.007640465635925, + "grad_norm": 0.21405918896198273, + "learning_rate": 5.003237909422601e-06, + "loss": 0.3745, "step": 111200 }, { - "epoch": 3.91, - "learning_rate": 5.964536946009502e-06, - "loss": 0.241, + "epoch": 4.0078206652971495, + "grad_norm": 0.27426615357398987, + "learning_rate": 5.001486661647059e-06, + "loss": 0.401, "step": 111205 }, { - "epoch": 3.91, - "learning_rate": 5.962690355744627e-06, - "loss": 0.2537, + "epoch": 4.008000864958374, + "grad_norm": 0.21843719482421875, + "learning_rate": 4.99973568634739e-06, + "loss": 0.3893, "step": 111210 }, { - "epoch": 3.91, - "learning_rate": 5.960844012666672e-06, - "loss": 0.2603, + "epoch": 4.008181064619598, + "grad_norm": 0.20329011976718903, + "learning_rate": 4.9979849835474515e-06, + "loss": 0.3976, "step": 111215 }, { - "epoch": 3.91, - "learning_rate": 5.958997916799611e-06, - "loss": 0.2443, + "epoch": 4.008361264280823, + "grad_norm": 0.21530331671237946, + "learning_rate": 4.996234553271092e-06, + "loss": 0.3503, "step": 111220 }, { - "epoch": 3.91, - "learning_rate": 5.957152068167404e-06, - "loss": 0.2563, + "epoch": 4.008541463942048, + "grad_norm": 0.23485726118087769, + "learning_rate": 4.994484395542159e-06, + "loss": 0.3453, "step": 111225 }, { - "epoch": 3.91, - "learning_rate": 5.9553064667940366e-06, - "loss": 0.2483, + "epoch": 4.008721663603272, + "grad_norm": 0.22897350788116455, + "learning_rate": 4.992734510384511e-06, + "loss": 0.3664, "step": 111230 }, { - "epoch": 3.91, - "learning_rate": 5.953461112703462e-06, - "loss": 0.2582, + "epoch": 4.008901863264497, + "grad_norm": 0.2363205850124359, + "learning_rate": 4.990984897821988e-06, + "loss": 0.3858, "step": 111235 }, { - 
"epoch": 3.91, - "learning_rate": 5.951616005919644e-06, - "loss": 0.2488, + "epoch": 4.009082062925722, + "grad_norm": 0.2695505917072296, + "learning_rate": 4.989235557878408e-06, + "loss": 0.3598, "step": 111240 }, { - "epoch": 3.91, - "learning_rate": 5.949771146466532e-06, - "loss": 0.2573, + "epoch": 4.0092622625869465, + "grad_norm": 0.22455400228500366, + "learning_rate": 4.987486490577626e-06, + "loss": 0.3521, "step": 111245 }, { - "epoch": 3.91, - "learning_rate": 5.947926534368093e-06, - "loss": 0.2767, + "epoch": 4.009442462248171, + "grad_norm": 0.23414896428585052, + "learning_rate": 4.985737695943457e-06, + "loss": 0.3757, "step": 111250 }, { - "epoch": 3.91, - "learning_rate": 5.9460821696482645e-06, - "loss": 0.2423, + "epoch": 4.009622661909396, + "grad_norm": 0.26559486985206604, + "learning_rate": 4.9839891739997535e-06, + "loss": 0.3847, "step": 111255 }, { - "epoch": 3.91, - "learning_rate": 5.944238052331011e-06, - "loss": 0.2467, + "epoch": 4.00980286157062, + "grad_norm": 0.2793545126914978, + "learning_rate": 4.982240924770315e-06, + "loss": 0.4138, "step": 111260 }, { - "epoch": 3.91, - "learning_rate": 5.942394182440261e-06, - "loss": 0.2543, + "epoch": 4.009983061231845, + "grad_norm": 0.28454217314720154, + "learning_rate": 4.980492948278961e-06, + "loss": 0.3402, "step": 111265 }, { - "epoch": 3.91, - "learning_rate": 5.940550559999969e-06, - "loss": 0.2854, + "epoch": 4.010163260893069, + "grad_norm": 0.1981167495250702, + "learning_rate": 4.978745244549521e-06, + "loss": 0.3759, "step": 111270 }, { - "epoch": 3.91, - "learning_rate": 5.938707185034067e-06, - "loss": 0.2625, + "epoch": 4.010343460554294, + "grad_norm": 0.2657647132873535, + "learning_rate": 4.976997813605802e-06, + "loss": 0.3501, "step": 111275 }, { - "epoch": 3.92, - "learning_rate": 5.936864057566485e-06, - "loss": 0.255, + "epoch": 4.010523660215519, + "grad_norm": 0.20731034874916077, + "learning_rate": 4.975250655471611e-06, + "loss": 0.3307, "step": 111280 }, { - "epoch": 3.92, - "learning_rate": 5.9350211776211665e-06, - "loss": 0.2553, + "epoch": 4.0107038598767435, + "grad_norm": 0.3427305221557617, + "learning_rate": 4.973503770170751e-06, + "loss": 0.3818, "step": 111285 }, { - "epoch": 3.92, - "learning_rate": 5.933178545222034e-06, - "loss": 0.2503, + "epoch": 4.010884059537968, + "grad_norm": 0.2292642891407013, + "learning_rate": 4.971757157727019e-06, + "loss": 0.3537, "step": 111290 }, { - "epoch": 3.92, - "learning_rate": 5.931336160393014e-06, - "loss": 0.2601, + "epoch": 4.011064259199193, + "grad_norm": 0.2728843092918396, + "learning_rate": 4.970010818164225e-06, + "loss": 0.3926, "step": 111295 }, { - "epoch": 3.92, - "learning_rate": 5.929494023158016e-06, - "loss": 0.2451, + "epoch": 4.011244458860418, + "grad_norm": 0.2139047533273697, + "learning_rate": 4.968264751506157e-06, + "loss": 0.3357, "step": 111300 }, { - "epoch": 3.92, - "learning_rate": 5.927652133540981e-06, - "loss": 0.2553, + "epoch": 4.0114246585216415, + "grad_norm": 0.2937036156654358, + "learning_rate": 4.966518957776603e-06, + "loss": 0.3816, "step": 111305 }, { - "epoch": 3.92, - "learning_rate": 5.9258104915658054e-06, - "loss": 0.2729, + "epoch": 4.011604858182866, + "grad_norm": 0.31727510690689087, + "learning_rate": 4.964773436999348e-06, + "loss": 0.3726, "step": 111310 }, { - "epoch": 3.92, - "learning_rate": 5.9239690972564184e-06, - "loss": 0.2575, + "epoch": 4.011785057844091, + "grad_norm": 0.2226351499557495, + "learning_rate": 4.96302818919818e-06, + "loss": 0.3629, "step": 
111315 }, { - "epoch": 3.92, - "learning_rate": 5.922127950636711e-06, - "loss": 0.2593, + "epoch": 4.011965257505316, + "grad_norm": 0.28360915184020996, + "learning_rate": 4.961283214396864e-06, + "loss": 0.3353, "step": 111320 }, { - "epoch": 3.92, - "learning_rate": 5.920287051730608e-06, - "loss": 0.2671, + "epoch": 4.0121454571665405, + "grad_norm": 0.25234130024909973, + "learning_rate": 4.959538512619197e-06, + "loss": 0.3666, "step": 111325 }, { - "epoch": 3.92, - "learning_rate": 5.918446400562005e-06, - "loss": 0.2497, + "epoch": 4.012325656827765, + "grad_norm": 0.357035368680954, + "learning_rate": 4.9577940838889255e-06, + "loss": 0.3633, "step": 111330 }, { - "epoch": 3.92, - "learning_rate": 5.9166059971548e-06, - "loss": 0.2913, + "epoch": 4.01250585648899, + "grad_norm": 0.26695460081100464, + "learning_rate": 4.956049928229836e-06, + "loss": 0.371, "step": 111335 }, { - "epoch": 3.92, - "learning_rate": 5.914765841532883e-06, - "loss": 0.2321, + "epoch": 4.012686056150215, + "grad_norm": 0.24045346677303314, + "learning_rate": 4.954306045665686e-06, + "loss": 0.384, "step": 111340 }, { - "epoch": 3.92, - "learning_rate": 5.912925933720159e-06, - "loss": 0.2588, + "epoch": 4.012866255811439, + "grad_norm": 0.22014078497886658, + "learning_rate": 4.952562436220234e-06, + "loss": 0.3773, "step": 111345 }, { - "epoch": 3.92, - "learning_rate": 5.911086273740512e-06, - "loss": 0.2605, + "epoch": 4.013046455472664, + "grad_norm": 0.22839348018169403, + "learning_rate": 4.950819099917239e-06, + "loss": 0.3454, "step": 111350 }, { - "epoch": 3.92, - "learning_rate": 5.909246861617834e-06, - "loss": 0.2494, + "epoch": 4.013226655133888, + "grad_norm": 0.2525436282157898, + "learning_rate": 4.949076036780445e-06, + "loss": 0.3878, "step": 111355 }, { - "epoch": 3.92, - "learning_rate": 5.907407697375994e-06, - "loss": 0.2646, + "epoch": 4.013406854795113, + "grad_norm": 0.23207542300224304, + "learning_rate": 4.947333246833613e-06, + "loss": 0.3541, "step": 111360 }, { - "epoch": 3.92, - "learning_rate": 5.905568781038884e-06, - "loss": 0.2306, + "epoch": 4.0135870544563375, + "grad_norm": 0.2273051142692566, + "learning_rate": 4.945590730100486e-06, + "loss": 0.3643, "step": 111365 }, { - "epoch": 3.92, - "learning_rate": 5.903730112630385e-06, - "loss": 0.2534, + "epoch": 4.013767254117562, + "grad_norm": 0.31350627541542053, + "learning_rate": 4.943848486604802e-06, + "loss": 0.3552, "step": 111370 }, { - "epoch": 3.92, - "learning_rate": 5.901891692174366e-06, - "loss": 0.258, + "epoch": 4.013947453778787, + "grad_norm": 0.26370009779930115, + "learning_rate": 4.942106516370298e-06, + "loss": 0.3816, "step": 111375 }, { - "epoch": 3.92, - "learning_rate": 5.90005351969469e-06, - "loss": 0.2528, + "epoch": 4.014127653440012, + "grad_norm": 0.24444960057735443, + "learning_rate": 4.940364819420709e-06, + "loss": 0.38, "step": 111380 }, { - "epoch": 3.92, - "learning_rate": 5.898215595215239e-06, - "loss": 0.2637, + "epoch": 4.014307853101236, + "grad_norm": 0.1919577717781067, + "learning_rate": 4.93862339577976e-06, + "loss": 0.3385, "step": 111385 }, { - "epoch": 3.92, - "learning_rate": 5.896377918759871e-06, - "loss": 0.2638, + "epoch": 4.014488052762461, + "grad_norm": 0.2629943788051605, + "learning_rate": 4.936882245471192e-06, + "loss": 0.3585, "step": 111390 }, { - "epoch": 3.92, - "learning_rate": 5.8945404903524404e-06, - "loss": 0.268, + "epoch": 4.014668252423686, + "grad_norm": 0.2678518295288086, + "learning_rate": 4.935141368518717e-06, + "loss": 0.3823, "step": 
111395 }, { - "epoch": 3.92, - "learning_rate": 5.892703310016814e-06, - "loss": 0.2561, + "epoch": 4.01484845208491, + "grad_norm": 0.2841361165046692, + "learning_rate": 4.933400764946056e-06, + "loss": 0.3928, "step": 111400 }, { - "epoch": 3.92, - "learning_rate": 5.890866377776844e-06, - "loss": 0.247, + "epoch": 4.015028651746134, + "grad_norm": 0.2338554561138153, + "learning_rate": 4.931660434776925e-06, + "loss": 0.3745, "step": 111405 }, { - "epoch": 3.92, - "learning_rate": 5.889029693656381e-06, - "loss": 0.2453, + "epoch": 4.015208851407359, + "grad_norm": 0.2238939106464386, + "learning_rate": 4.929920378035027e-06, + "loss": 0.3918, "step": 111410 }, { - "epoch": 3.92, - "learning_rate": 5.887193257679266e-06, - "loss": 0.2482, + "epoch": 4.015389051068584, + "grad_norm": 0.28193482756614685, + "learning_rate": 4.9281805947440865e-06, + "loss": 0.3874, "step": 111415 }, { - "epoch": 3.92, - "learning_rate": 5.88535706986936e-06, - "loss": 0.2626, + "epoch": 4.015569250729809, + "grad_norm": 0.216234028339386, + "learning_rate": 4.926441084927805e-06, + "loss": 0.3888, "step": 111420 }, { - "epoch": 3.92, - "learning_rate": 5.883521130250485e-06, - "loss": 0.2708, + "epoch": 4.015749450391033, + "grad_norm": 0.22301048040390015, + "learning_rate": 4.924701848609864e-06, + "loss": 0.3606, "step": 111425 }, { - "epoch": 3.92, - "learning_rate": 5.881685438846498e-06, - "loss": 0.253, + "epoch": 4.015929650052258, + "grad_norm": 0.24179665744304657, + "learning_rate": 4.922962885813981e-06, + "loss": 0.3597, "step": 111430 }, { - "epoch": 3.92, - "learning_rate": 5.879849995681219e-06, - "loss": 0.2609, + "epoch": 4.016109849713483, + "grad_norm": 0.24188639223575592, + "learning_rate": 4.9212241965638365e-06, + "loss": 0.3479, "step": 111435 }, { - "epoch": 3.92, - "learning_rate": 5.878014800778495e-06, - "loss": 0.2458, + "epoch": 4.016290049374708, + "grad_norm": 0.26836642622947693, + "learning_rate": 4.919485780883135e-06, + "loss": 0.3606, "step": 111440 }, { - "epoch": 3.92, - "learning_rate": 5.876179854162145e-06, - "loss": 0.2702, + "epoch": 4.016470249035931, + "grad_norm": 0.2224971503019333, + "learning_rate": 4.9177476387955475e-06, + "loss": 0.3699, "step": 111445 }, { - "epoch": 3.92, - "learning_rate": 5.874345155855998e-06, - "loss": 0.277, + "epoch": 4.016650448697156, + "grad_norm": 0.27810531854629517, + "learning_rate": 4.916009770324753e-06, + "loss": 0.3726, "step": 111450 }, { - "epoch": 3.92, - "learning_rate": 5.872510705883868e-06, - "loss": 0.2534, + "epoch": 4.016830648358381, + "grad_norm": 0.21480928361415863, + "learning_rate": 4.9142721754944446e-06, + "loss": 0.3529, "step": 111455 }, { - "epoch": 3.92, - "learning_rate": 5.870676504269587e-06, - "loss": 0.275, + "epoch": 4.017010848019606, + "grad_norm": 0.24362391233444214, + "learning_rate": 4.9125348543282875e-06, + "loss": 0.3412, "step": 111460 }, { - "epoch": 3.92, - "learning_rate": 5.868842551036966e-06, - "loss": 0.2626, + "epoch": 4.01719104768083, + "grad_norm": 0.2782408893108368, + "learning_rate": 4.910797806849956e-06, + "loss": 0.3839, "step": 111465 }, { - "epoch": 3.92, - "learning_rate": 5.867008846209809e-06, - "loss": 0.2749, + "epoch": 4.017371247342055, + "grad_norm": 0.22316765785217285, + "learning_rate": 4.909061033083112e-06, + "loss": 0.3899, "step": 111470 }, { - "epoch": 3.92, - "learning_rate": 5.865175389811942e-06, - "loss": 0.2634, + "epoch": 4.01755144700328, + "grad_norm": 0.2562723457813263, + "learning_rate": 4.907324533051419e-06, + "loss": 0.3515, 
"step": 111475 }, { - "epoch": 3.92, - "learning_rate": 5.8633421818671515e-06, - "loss": 0.2567, + "epoch": 4.0177316466645046, + "grad_norm": 0.2701679468154907, + "learning_rate": 4.905588306778544e-06, + "loss": 0.3532, "step": 111480 }, { - "epoch": 3.92, - "learning_rate": 5.861509222399261e-06, - "loss": 0.2652, + "epoch": 4.017911846325729, + "grad_norm": 0.20503626763820648, + "learning_rate": 4.9038523542881355e-06, + "loss": 0.3923, "step": 111485 }, { - "epoch": 3.92, - "learning_rate": 5.8596765114320605e-06, - "loss": 0.2739, + "epoch": 4.018092045986953, + "grad_norm": 0.2625948488712311, + "learning_rate": 4.902116675603852e-06, + "loss": 0.3843, "step": 111490 }, { - "epoch": 3.92, - "learning_rate": 5.8578440489893385e-06, - "loss": 0.2396, + "epoch": 4.018272245648178, + "grad_norm": 0.24299366772174835, + "learning_rate": 4.900381270749335e-06, + "loss": 0.3681, "step": 111495 }, { - "epoch": 3.92, - "learning_rate": 5.856011835094902e-06, - "loss": 0.2528, + "epoch": 4.018452445309403, + "grad_norm": 0.19307754933834076, + "learning_rate": 4.898646139748234e-06, + "loss": 0.3657, "step": 111500 }, { - "epoch": 3.92, - "eval_loss": 0.2502204477787018, - "eval_runtime": 10.5596, - "eval_samples_per_second": 9.47, - "eval_steps_per_second": 9.47, + "epoch": 4.018452445309403, + "eval_loss": 0.42932766675949097, + "eval_runtime": 3.5358, + "eval_samples_per_second": 28.283, + "eval_steps_per_second": 7.071, "step": 111500 }, { - "epoch": 3.92, - "learning_rate": 5.8541798697725384e-06, - "loss": 0.2533, + "epoch": 4.018632644970627, + "grad_norm": 0.22898030281066895, + "learning_rate": 4.896911282624178e-06, + "loss": 0.3802, "step": 111505 }, { - "epoch": 3.92, - "learning_rate": 5.852348153046022e-06, - "loss": 0.2447, + "epoch": 4.018812844631852, + "grad_norm": 0.21475541591644287, + "learning_rate": 4.8951766994008274e-06, + "loss": 0.3608, "step": 111510 }, { - "epoch": 3.92, - "learning_rate": 5.8505166849391555e-06, - "loss": 0.259, + "epoch": 4.018993044293077, + "grad_norm": 0.2441704273223877, + "learning_rate": 4.893442390101791e-06, + "loss": 0.3579, "step": 111515 }, { - "epoch": 3.92, - "learning_rate": 5.848685465475709e-06, - "loss": 0.2621, + "epoch": 4.0191732439543015, + "grad_norm": 0.24041332304477692, + "learning_rate": 4.891708354750716e-06, + "loss": 0.3862, "step": 111520 }, { - "epoch": 3.92, - "learning_rate": 5.84685449467946e-06, - "loss": 0.2557, + "epoch": 4.019353443615526, + "grad_norm": 0.262669175863266, + "learning_rate": 4.889974593371218e-06, + "loss": 0.3809, "step": 111525 }, { - "epoch": 3.92, - "learning_rate": 5.845023772574177e-06, - "loss": 0.2671, + "epoch": 4.019533643276751, + "grad_norm": 0.3033019006252289, + "learning_rate": 4.888241105986918e-06, + "loss": 0.401, "step": 111530 }, { - "epoch": 3.92, - "learning_rate": 5.843193299183636e-06, - "loss": 0.2392, + "epoch": 4.019713842937975, + "grad_norm": 0.22395041584968567, + "learning_rate": 4.886507892621453e-06, + "loss": 0.3718, "step": 111535 }, { - "epoch": 3.92, - "learning_rate": 5.841363074531611e-06, - "loss": 0.2612, + "epoch": 4.0198940425992, + "grad_norm": 0.2800545394420624, + "learning_rate": 4.88477495329841e-06, + "loss": 0.3998, "step": 111540 }, { - "epoch": 3.92, - "learning_rate": 5.8398990739176875e-06, - "loss": 0.2569, + "epoch": 4.020074242260424, + "grad_norm": 0.2535509765148163, + "learning_rate": 4.883042288041423e-06, + "loss": 0.3666, "step": 111545 }, { - "epoch": 3.92, - "learning_rate": 5.838069297054866e-06, - "loss": 0.2386, + 
"epoch": 4.020254441921649, + "grad_norm": 0.26017653942108154, + "learning_rate": 4.881309896874087e-06, + "loss": 0.4077, "step": 111550 }, { - "epoch": 3.92, - "learning_rate": 5.836239768997084e-06, - "loss": 0.2383, + "epoch": 4.020434641582874, + "grad_norm": 0.2357221096754074, + "learning_rate": 4.879577779820007e-06, + "loss": 0.3764, "step": 111555 }, { - "epoch": 3.92, - "learning_rate": 5.834410489768097e-06, - "loss": 0.252, + "epoch": 4.0206148412440985, + "grad_norm": 0.24376040697097778, + "learning_rate": 4.877845936902789e-06, + "loss": 0.3291, "step": 111560 }, { - "epoch": 3.93, - "learning_rate": 5.832581459391662e-06, - "loss": 0.2548, + "epoch": 4.020795040905323, + "grad_norm": 0.22934941947460175, + "learning_rate": 4.87611436814602e-06, + "loss": 0.3561, "step": 111565 }, { - "epoch": 3.93, - "learning_rate": 5.830752677891521e-06, - "loss": 0.2536, + "epoch": 4.020975240566548, + "grad_norm": 0.25413674116134644, + "learning_rate": 4.874383073573294e-06, + "loss": 0.3447, "step": 111570 }, { - "epoch": 3.93, - "learning_rate": 5.8289241452914285e-06, - "loss": 0.246, + "epoch": 4.021155440227773, + "grad_norm": 0.23448479175567627, + "learning_rate": 4.872652053208207e-06, + "loss": 0.3587, "step": 111575 }, { - "epoch": 3.93, - "learning_rate": 5.827095861615114e-06, - "loss": 0.2682, + "epoch": 4.0213356398889974, + "grad_norm": 0.2433057725429535, + "learning_rate": 4.870921307074339e-06, + "loss": 0.3762, "step": 111580 }, { - "epoch": 3.93, - "learning_rate": 5.825267826886333e-06, - "loss": 0.2444, + "epoch": 4.021515839550221, + "grad_norm": 0.22767940163612366, + "learning_rate": 4.86919083519527e-06, + "loss": 0.383, "step": 111585 }, { - "epoch": 3.93, - "learning_rate": 5.823440041128813e-06, - "loss": 0.2722, + "epoch": 4.021696039211446, + "grad_norm": 0.24196745455265045, + "learning_rate": 4.867460637594579e-06, + "loss": 0.3517, "step": 111590 }, { - "epoch": 3.93, - "learning_rate": 5.821612504366286e-06, - "loss": 0.2466, + "epoch": 4.021876238872671, + "grad_norm": 0.27073025703430176, + "learning_rate": 4.865730714295832e-06, + "loss": 0.3839, "step": 111595 }, { - "epoch": 3.93, - "learning_rate": 5.819785216622473e-06, - "loss": 0.2542, + "epoch": 4.0220564385338955, + "grad_norm": 0.30549201369285583, + "learning_rate": 4.864001065322616e-06, + "loss": 0.377, "step": 111600 }, { - "epoch": 3.93, - "learning_rate": 5.817958177921118e-06, - "loss": 0.2506, + "epoch": 4.02223663819512, + "grad_norm": 0.23959122598171234, + "learning_rate": 4.862271690698489e-06, + "loss": 0.3811, "step": 111605 }, { - "epoch": 3.93, - "learning_rate": 5.816131388285934e-06, - "loss": 0.2846, + "epoch": 4.022416837856345, + "grad_norm": 0.2952626943588257, + "learning_rate": 4.8605425904470005e-06, + "loss": 0.3632, "step": 111610 }, { - "epoch": 3.93, - "learning_rate": 5.814304847740631e-06, - "loss": 0.2556, + "epoch": 4.02259703751757, + "grad_norm": 0.24323879182338715, + "learning_rate": 4.858813764591727e-06, + "loss": 0.4091, "step": 111615 }, { - "epoch": 3.93, - "learning_rate": 5.812478556308948e-06, - "loss": 0.2741, + "epoch": 4.022777237178794, + "grad_norm": 0.23510761559009552, + "learning_rate": 4.857085213156209e-06, + "loss": 0.3739, "step": 111620 }, { - "epoch": 3.93, - "learning_rate": 5.810652514014575e-06, - "loss": 0.2559, + "epoch": 4.022957436840019, + "grad_norm": 0.2673884630203247, + "learning_rate": 4.855356936164018e-06, + "loss": 0.3963, "step": 111625 }, { - "epoch": 3.93, - "learning_rate": 5.808826720881242e-06, - "loss": 
0.236, + "epoch": 4.023137636501243, + "grad_norm": 0.25060799717903137, + "learning_rate": 4.853628933638682e-06, + "loss": 0.3889, "step": 111630 }, { - "epoch": 3.93, - "learning_rate": 5.807001176932644e-06, - "loss": 0.2694, + "epoch": 4.023317836162468, + "grad_norm": 0.2960754930973053, + "learning_rate": 4.851901205603746e-06, + "loss": 0.3883, "step": 111635 }, { - "epoch": 3.93, - "learning_rate": 5.805175882192484e-06, - "loss": 0.259, + "epoch": 4.0234980358236925, + "grad_norm": 0.2322222739458084, + "learning_rate": 4.85017375208276e-06, + "loss": 0.3718, "step": 111640 }, { - "epoch": 3.93, - "learning_rate": 5.8033508366844705e-06, - "loss": 0.2497, + "epoch": 4.023678235484917, + "grad_norm": 0.2459927797317505, + "learning_rate": 4.848446573099258e-06, + "loss": 0.3736, "step": 111645 }, { - "epoch": 3.93, - "learning_rate": 5.801526040432295e-06, - "loss": 0.2416, + "epoch": 4.023858435146142, + "grad_norm": 0.26722821593284607, + "learning_rate": 4.8467196686767695e-06, + "loss": 0.3832, "step": 111650 }, { - "epoch": 3.93, - "learning_rate": 5.79970149345965e-06, - "loss": 0.243, + "epoch": 4.024038634807367, + "grad_norm": 0.2405874878168106, + "learning_rate": 4.8449930388388215e-06, + "loss": 0.3748, "step": 111655 }, { - "epoch": 3.93, - "learning_rate": 5.797877195790222e-06, - "loss": 0.2511, + "epoch": 4.024218834468591, + "grad_norm": 0.2542579472064972, + "learning_rate": 4.843266683608936e-06, + "loss": 0.395, "step": 111660 }, { - "epoch": 3.93, - "learning_rate": 5.796053147447711e-06, - "loss": 0.2293, + "epoch": 4.024399034129816, + "grad_norm": 0.24059006571769714, + "learning_rate": 4.8415406030106465e-06, + "loss": 0.381, "step": 111665 }, { - "epoch": 3.93, - "learning_rate": 5.794229348455796e-06, - "loss": 0.2502, + "epoch": 4.024579233791041, + "grad_norm": 0.2607281804084778, + "learning_rate": 4.839814797067463e-06, + "loss": 0.3987, "step": 111670 }, { - "epoch": 3.93, - "learning_rate": 5.792405798838147e-06, - "loss": 0.2516, + "epoch": 4.024759433452265, + "grad_norm": 0.2265724539756775, + "learning_rate": 4.838089265802901e-06, + "loss": 0.3876, "step": 111675 }, { - "epoch": 3.93, - "learning_rate": 5.790582498618449e-06, - "loss": 0.2522, + "epoch": 4.0249396331134895, + "grad_norm": 0.28544244170188904, + "learning_rate": 4.836364009240465e-06, + "loss": 0.3896, "step": 111680 }, { - "epoch": 3.93, - "learning_rate": 5.788759447820386e-06, - "loss": 0.2582, + "epoch": 4.025119832774714, + "grad_norm": 0.21306416392326355, + "learning_rate": 4.83463902740367e-06, + "loss": 0.3687, "step": 111685 }, { - "epoch": 3.93, - "learning_rate": 5.786936646467622e-06, - "loss": 0.2405, + "epoch": 4.025300032435939, + "grad_norm": 0.22182679176330566, + "learning_rate": 4.832914320316006e-06, + "loss": 0.3602, "step": 111690 }, { - "epoch": 3.93, - "learning_rate": 5.785114094583815e-06, - "loss": 0.2332, + "epoch": 4.025480232097164, + "grad_norm": 0.2823246121406555, + "learning_rate": 4.831189888000986e-06, + "loss": 0.3927, "step": 111695 }, { - "epoch": 3.93, - "learning_rate": 5.783291792192647e-06, - "loss": 0.2697, + "epoch": 4.025660431758388, + "grad_norm": 0.2230641394853592, + "learning_rate": 4.8294657304821e-06, + "loss": 0.3698, "step": 111700 }, { - "epoch": 3.93, - "learning_rate": 5.781469739317769e-06, - "loss": 0.2611, + "epoch": 4.025840631419613, + "grad_norm": 0.2776728570461273, + "learning_rate": 4.827741847782838e-06, + "loss": 0.3692, "step": 111705 }, { - "epoch": 3.93, - "learning_rate": 5.779647935982843e-06, - "loss": 
0.2595, + "epoch": 4.026020831080838, + "grad_norm": 0.2971155643463135, + "learning_rate": 4.826018239926689e-06, + "loss": 0.3711, "step": 111710 }, { - "epoch": 3.93, - "learning_rate": 5.777826382211513e-06, - "loss": 0.2782, + "epoch": 4.026201030742063, + "grad_norm": 0.23264330625534058, + "learning_rate": 4.824294906937126e-06, + "loss": 0.4002, "step": 111715 }, { - "epoch": 3.93, - "learning_rate": 5.776005078027447e-06, - "loss": 0.2558, + "epoch": 4.026381230403286, + "grad_norm": 0.20618966221809387, + "learning_rate": 4.8225718488376535e-06, + "loss": 0.3391, "step": 111720 }, { - "epoch": 3.93, - "learning_rate": 5.774184023454287e-06, - "loss": 0.2604, + "epoch": 4.026561430064511, + "grad_norm": 0.23295053839683533, + "learning_rate": 4.8208490656517225e-06, + "loss": 0.3322, "step": 111725 }, { - "epoch": 3.93, - "learning_rate": 5.77236321851567e-06, - "loss": 0.2573, + "epoch": 4.026741629725736, + "grad_norm": 0.19821318984031677, + "learning_rate": 4.81912655740282e-06, + "loss": 0.351, "step": 111730 }, { - "epoch": 3.93, - "learning_rate": 5.7705426632352426e-06, - "loss": 0.2599, + "epoch": 4.026921829386961, + "grad_norm": 0.3166259825229645, + "learning_rate": 4.81740432411441e-06, + "loss": 0.3433, "step": 111735 }, { - "epoch": 3.93, - "learning_rate": 5.768722357636658e-06, - "loss": 0.2361, + "epoch": 4.027102029048185, + "grad_norm": 0.2291676253080368, + "learning_rate": 4.815682365809959e-06, + "loss": 0.3965, "step": 111740 }, { - "epoch": 3.93, - "learning_rate": 5.766902301743534e-06, - "loss": 0.2601, + "epoch": 4.02728222870941, + "grad_norm": 0.2493552565574646, + "learning_rate": 4.81396068251293e-06, + "loss": 0.3637, "step": 111745 }, { - "epoch": 3.93, - "learning_rate": 5.765082495579513e-06, - "loss": 0.2411, + "epoch": 4.027462428370635, + "grad_norm": 0.2596898376941681, + "learning_rate": 4.812239274246777e-06, + "loss": 0.3494, "step": 111750 }, { - "epoch": 3.93, - "learning_rate": 5.76326293916821e-06, - "loss": 0.2592, + "epoch": 4.02764262803186, + "grad_norm": 0.246169313788414, + "learning_rate": 4.810518141034951e-06, + "loss": 0.4282, "step": 111755 }, { - "epoch": 3.93, - "learning_rate": 5.761443632533267e-06, - "loss": 0.2562, + "epoch": 4.027822827693084, + "grad_norm": 0.29572200775146484, + "learning_rate": 4.808797282900912e-06, + "loss": 0.3703, "step": 111760 }, { - "epoch": 3.93, - "learning_rate": 5.759624575698302e-06, - "loss": 0.2676, + "epoch": 4.028003027354308, + "grad_norm": 0.23195022344589233, + "learning_rate": 4.807076699868099e-06, + "loss": 0.3474, "step": 111765 }, { - "epoch": 3.93, - "learning_rate": 5.75780576868693e-06, - "loss": 0.2544, + "epoch": 4.028183227015533, + "grad_norm": 0.21716444194316864, + "learning_rate": 4.805356391959959e-06, + "loss": 0.3792, "step": 111770 }, { - "epoch": 3.93, - "learning_rate": 5.755987211522765e-06, - "loss": 0.2397, + "epoch": 4.028363426676758, + "grad_norm": 0.2253197580575943, + "learning_rate": 4.8036363591999255e-06, + "loss": 0.3733, "step": 111775 }, { - "epoch": 3.93, - "learning_rate": 5.754168904229426e-06, - "loss": 0.2674, + "epoch": 4.028543626337982, + "grad_norm": 0.26673805713653564, + "learning_rate": 4.801916601611433e-06, + "loss": 0.3862, "step": 111780 }, { - "epoch": 3.93, - "learning_rate": 5.7523508468305175e-06, - "loss": 0.2483, + "epoch": 4.028723825999207, + "grad_norm": 0.2358836978673935, + "learning_rate": 4.8001971192179226e-06, + "loss": 0.3647, "step": 111785 }, { - "epoch": 3.93, - "learning_rate": 5.750533039349654e-06, - 
"loss": 0.255, + "epoch": 4.028904025660432, + "grad_norm": 0.2774098813533783, + "learning_rate": 4.79847791204282e-06, + "loss": 0.3863, "step": 111790 }, { - "epoch": 3.93, - "learning_rate": 5.748715481810427e-06, - "loss": 0.2782, + "epoch": 4.0290842253216566, + "grad_norm": 0.287156879901886, + "learning_rate": 4.7967589801095335e-06, + "loss": 0.3558, "step": 111795 }, { - "epoch": 3.93, - "learning_rate": 5.746898174236448e-06, - "loss": 0.2647, + "epoch": 4.029264424982881, + "grad_norm": 0.24171940982341766, + "learning_rate": 4.7950403234414995e-06, + "loss": 0.358, "step": 111800 }, { - "epoch": 3.93, - "learning_rate": 5.745081116651307e-06, - "loss": 0.2554, + "epoch": 4.029444624644106, + "grad_norm": 0.23957575857639313, + "learning_rate": 4.793321942062124e-06, + "loss": 0.3846, "step": 111805 }, { - "epoch": 3.93, - "learning_rate": 5.7432643090786e-06, - "loss": 0.2611, + "epoch": 4.02962482430533, + "grad_norm": 0.28689080476760864, + "learning_rate": 4.791603835994829e-06, + "loss": 0.3987, "step": 111810 }, { - "epoch": 3.93, - "learning_rate": 5.741447751541909e-06, - "loss": 0.2397, + "epoch": 4.029805023966555, + "grad_norm": 0.28590720891952515, + "learning_rate": 4.789886005263028e-06, + "loss": 0.3648, "step": 111815 }, { - "epoch": 3.93, - "learning_rate": 5.739631444064833e-06, - "loss": 0.2665, + "epoch": 4.029985223627779, + "grad_norm": 0.22683300077915192, + "learning_rate": 4.788168449890104e-06, + "loss": 0.3772, "step": 111820 }, { - "epoch": 3.93, - "learning_rate": 5.7378153866709476e-06, - "loss": 0.2722, + "epoch": 4.030165423289004, + "grad_norm": 0.2961829900741577, + "learning_rate": 4.786451169899478e-06, + "loss": 0.3508, "step": 111825 }, { - "epoch": 3.93, - "learning_rate": 5.73599957938383e-06, - "loss": 0.2563, + "epoch": 4.030345622950229, + "grad_norm": 0.2695339322090149, + "learning_rate": 4.784734165314542e-06, + "loss": 0.3903, "step": 111830 }, { - "epoch": 3.93, - "learning_rate": 5.734184022227071e-06, - "loss": 0.2429, + "epoch": 4.0305258226114535, + "grad_norm": 0.1870175302028656, + "learning_rate": 4.783017436158691e-06, + "loss": 0.4212, "step": 111835 }, { - "epoch": 3.93, - "learning_rate": 5.732368715224226e-06, - "loss": 0.2568, + "epoch": 4.030706022272678, + "grad_norm": 0.26787975430488586, + "learning_rate": 4.781300982455309e-06, + "loss": 0.3925, "step": 111840 }, { - "epoch": 3.94, - "learning_rate": 5.730553658398885e-06, - "loss": 0.2663, + "epoch": 4.030886221933903, + "grad_norm": 0.2814364731311798, + "learning_rate": 4.779584804227783e-06, + "loss": 0.348, "step": 111845 }, { - "epoch": 3.94, - "learning_rate": 5.728738851774598e-06, - "loss": 0.2366, + "epoch": 4.031066421595128, + "grad_norm": 0.3020128309726715, + "learning_rate": 4.777868901499505e-06, + "loss": 0.3637, "step": 111850 }, { - "epoch": 3.94, - "learning_rate": 5.7269242953749416e-06, - "loss": 0.2618, + "epoch": 4.0312466212563525, + "grad_norm": 0.22931928932666779, + "learning_rate": 4.776153274293848e-06, + "loss": 0.3362, "step": 111855 }, { - "epoch": 3.94, - "learning_rate": 5.725109989223474e-06, - "loss": 0.2438, + "epoch": 4.031426820917576, + "grad_norm": 0.24356307089328766, + "learning_rate": 4.774437922634187e-06, + "loss": 0.3895, "step": 111860 }, { - "epoch": 3.94, - "learning_rate": 5.72329593334375e-06, - "loss": 0.237, + "epoch": 4.031607020578801, + "grad_norm": 0.2754194438457489, + "learning_rate": 4.772722846543892e-06, + "loss": 0.3886, "step": 111865 }, { - "epoch": 3.94, - "learning_rate": 5.721482127759317e-06, 
- "loss": 0.2486, + "epoch": 4.031787220240026, + "grad_norm": 0.2473660260438919, + "learning_rate": 4.7710080460463356e-06, + "loss": 0.3681, "step": 111870 }, { - "epoch": 3.94, - "learning_rate": 5.719668572493742e-06, - "loss": 0.2308, + "epoch": 4.0319674199012505, + "grad_norm": 0.24248690903186798, + "learning_rate": 4.769293521164869e-06, + "loss": 0.3893, "step": 111875 }, { - "epoch": 3.94, - "learning_rate": 5.717855267570563e-06, - "loss": 0.2412, + "epoch": 4.032147619562475, + "grad_norm": 0.27788907289505005, + "learning_rate": 4.76757927192287e-06, + "loss": 0.3653, "step": 111880 }, { - "epoch": 3.94, - "learning_rate": 5.716042213013326e-06, - "loss": 0.2283, + "epoch": 4.0323278192237, + "grad_norm": 0.22080229222774506, + "learning_rate": 4.765865298343686e-06, + "loss": 0.4033, "step": 111885 }, { - "epoch": 3.94, - "learning_rate": 5.714229408845567e-06, - "loss": 0.2597, + "epoch": 4.032508018884925, + "grad_norm": 0.26164865493774414, + "learning_rate": 4.764151600450667e-06, + "loss": 0.3896, "step": 111890 }, { - "epoch": 3.94, - "learning_rate": 5.712416855090827e-06, - "loss": 0.2631, + "epoch": 4.0326882185461495, + "grad_norm": 0.24852848052978516, + "learning_rate": 4.7624381782671675e-06, + "loss": 0.3699, "step": 111895 }, { - "epoch": 3.94, - "learning_rate": 5.7106045517726525e-06, - "loss": 0.2607, + "epoch": 4.032868418207374, + "grad_norm": 0.18736465275287628, + "learning_rate": 4.760725031816521e-06, + "loss": 0.3585, "step": 111900 }, { - "epoch": 3.94, - "learning_rate": 5.708792498914564e-06, - "loss": 0.2574, + "epoch": 4.033048617868598, + "grad_norm": 0.3258678913116455, + "learning_rate": 4.759012161122092e-06, + "loss": 0.4135, "step": 111905 }, { - "epoch": 3.94, - "learning_rate": 5.706980696540084e-06, - "loss": 0.2504, + "epoch": 4.033228817529823, + "grad_norm": 0.23651060461997986, + "learning_rate": 4.757299566207196e-06, + "loss": 0.3723, "step": 111910 }, { - "epoch": 3.94, - "learning_rate": 5.705169144672753e-06, - "loss": 0.2306, + "epoch": 4.0334090171910475, + "grad_norm": 0.2316112071275711, + "learning_rate": 4.75558724709517e-06, + "loss": 0.3634, "step": 111915 }, { - "epoch": 3.94, - "learning_rate": 5.703357843336085e-06, - "loss": 0.2684, + "epoch": 4.033589216852272, + "grad_norm": 0.22371111810207367, + "learning_rate": 4.753875203809352e-06, + "loss": 0.3619, "step": 111920 }, { - "epoch": 3.94, - "learning_rate": 5.701546792553597e-06, - "loss": 0.2615, + "epoch": 4.033769416513497, + "grad_norm": 0.2516692578792572, + "learning_rate": 4.752163436373061e-06, + "loss": 0.3887, "step": 111925 }, { - "epoch": 3.94, - "learning_rate": 5.6997359923488005e-06, - "loss": 0.2572, + "epoch": 4.033949616174722, + "grad_norm": 0.24128705263137817, + "learning_rate": 4.750451944809634e-06, + "loss": 0.3456, "step": 111930 }, { - "epoch": 3.94, - "learning_rate": 5.6979254427452195e-06, - "loss": 0.2564, + "epoch": 4.034129815835946, + "grad_norm": 0.24484743177890778, + "learning_rate": 4.748740729142373e-06, + "loss": 0.3982, "step": 111935 }, { - "epoch": 3.94, - "learning_rate": 5.696115143766356e-06, - "loss": 0.2539, + "epoch": 4.034310015497171, + "grad_norm": 0.21860742568969727, + "learning_rate": 4.747029789394591e-06, + "loss": 0.3646, "step": 111940 }, { - "epoch": 3.94, - "learning_rate": 5.694305095435709e-06, - "loss": 0.2691, + "epoch": 4.034490215158396, + "grad_norm": 0.24374550580978394, + "learning_rate": 4.7453191255896145e-06, + "loss": 0.3578, "step": 111945 }, { - "epoch": 3.94, - "learning_rate": 
5.692495297776787e-06, - "loss": 0.2637, + "epoch": 4.03467041481962, + "grad_norm": 0.29540616273880005, + "learning_rate": 4.743608737750746e-06, + "loss": 0.3785, "step": 111950 }, { - "epoch": 3.94, - "learning_rate": 5.690685750813096e-06, - "loss": 0.255, + "epoch": 4.0348506144808445, + "grad_norm": 0.19289089739322662, + "learning_rate": 4.7418986259012836e-06, + "loss": 0.3744, "step": 111955 }, { - "epoch": 3.94, - "learning_rate": 5.688876454568126e-06, - "loss": 0.2362, + "epoch": 4.035030814142069, + "grad_norm": 0.2452259063720703, + "learning_rate": 4.740188790064531e-06, + "loss": 0.3566, "step": 111960 }, { - "epoch": 3.94, - "learning_rate": 5.6870674090653604e-06, - "loss": 0.2559, + "epoch": 4.035211013803294, + "grad_norm": 0.2256636768579483, + "learning_rate": 4.738479230263776e-06, + "loss": 0.3578, "step": 111965 }, { - "epoch": 3.94, - "learning_rate": 5.685258614328304e-06, - "loss": 0.2687, + "epoch": 4.035391213464519, + "grad_norm": 0.21084842085838318, + "learning_rate": 4.736769946522326e-06, + "loss": 0.359, "step": 111970 }, { - "epoch": 3.94, - "learning_rate": 5.683450070380436e-06, - "loss": 0.2709, + "epoch": 4.035571413125743, + "grad_norm": 0.22610744833946228, + "learning_rate": 4.735060938863464e-06, + "loss": 0.3715, "step": 111975 }, { - "epoch": 3.94, - "learning_rate": 5.681641777245239e-06, - "loss": 0.2485, + "epoch": 4.035751612786968, + "grad_norm": 0.2904773950576782, + "learning_rate": 4.733352207310473e-06, + "loss": 0.3575, "step": 111980 }, { - "epoch": 3.94, - "learning_rate": 5.679833734946186e-06, - "loss": 0.2666, + "epoch": 4.035931812448193, + "grad_norm": 0.2337644398212433, + "learning_rate": 4.731643751886633e-06, + "loss": 0.3408, "step": 111985 }, { - "epoch": 3.94, - "learning_rate": 5.678025943506765e-06, - "loss": 0.2412, + "epoch": 4.036112012109418, + "grad_norm": 0.2608318030834198, + "learning_rate": 4.729935572615219e-06, + "loss": 0.3671, "step": 111990 }, { - "epoch": 3.94, - "learning_rate": 5.676218402950445e-06, - "loss": 0.2448, + "epoch": 4.0362922117706415, + "grad_norm": 0.2300008088350296, + "learning_rate": 4.728227669519511e-06, + "loss": 0.3985, "step": 111995 }, { - "epoch": 3.94, - "learning_rate": 5.674411113300688e-06, - "loss": 0.2586, + "epoch": 4.036472411431866, + "grad_norm": 0.24639463424682617, + "learning_rate": 4.726520042622784e-06, + "loss": 0.364, "step": 112000 }, { - "epoch": 3.94, - "eval_loss": 0.2502914071083069, - "eval_runtime": 10.543, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 4.036472411431866, + "eval_loss": 0.4291331470012665, + "eval_runtime": 3.5175, + "eval_samples_per_second": 28.429, + "eval_steps_per_second": 7.107, "step": 112000 }, { - "epoch": 3.94, - "learning_rate": 5.67260407458097e-06, - "loss": 0.26, + "epoch": 4.036652611093091, + "grad_norm": 0.2528611123561859, + "learning_rate": 4.724812691948285e-06, + "loss": 0.3803, "step": 112005 }, { - "epoch": 3.94, - "learning_rate": 5.670797286814747e-06, - "loss": 0.2595, + "epoch": 4.036832810754316, + "grad_norm": 0.21292369067668915, + "learning_rate": 4.723105617519294e-06, + "loss": 0.3711, "step": 112010 }, { - "epoch": 3.94, - "learning_rate": 5.668990750025488e-06, - "loss": 0.2258, + "epoch": 4.03701301041554, + "grad_norm": 0.238296777009964, + "learning_rate": 4.72139881935906e-06, + "loss": 0.4014, "step": 112015 }, { - "epoch": 3.94, - "learning_rate": 5.667184464236644e-06, - "loss": 0.2686, + "epoch": 4.037193210076765, + "grad_norm": 0.24008990824222565, + 
"learning_rate": 4.719692297490844e-06, + "loss": 0.3731, "step": 112020 }, { - "epoch": 3.94, - "learning_rate": 5.665378429471663e-06, - "loss": 0.274, + "epoch": 4.03737340973799, + "grad_norm": 0.2941230535507202, + "learning_rate": 4.717986051937895e-06, + "loss": 0.362, "step": 112025 }, { - "epoch": 3.94, - "learning_rate": 5.663572645754009e-06, - "loss": 0.2668, + "epoch": 4.037553609399215, + "grad_norm": 0.2386900782585144, + "learning_rate": 4.716280082723451e-06, + "loss": 0.3563, "step": 112030 }, { - "epoch": 3.94, - "learning_rate": 5.661767113107119e-06, - "loss": 0.2795, + "epoch": 4.037733809060439, + "grad_norm": 0.22380246222019196, + "learning_rate": 4.714574389870768e-06, + "loss": 0.3797, "step": 112035 }, { - "epoch": 3.94, - "learning_rate": 5.659961831554442e-06, - "loss": 0.2584, + "epoch": 4.037914008721663, + "grad_norm": 0.22900426387786865, + "learning_rate": 4.712868973403087e-06, + "loss": 0.3573, "step": 112040 }, { - "epoch": 3.94, - "learning_rate": 5.658156801119404e-06, - "loss": 0.25, + "epoch": 4.038094208382888, + "grad_norm": 0.21817940473556519, + "learning_rate": 4.711163833343635e-06, + "loss": 0.3323, "step": 112045 }, { - "epoch": 3.94, - "learning_rate": 5.656352021825464e-06, - "loss": 0.2491, + "epoch": 4.038274408044113, + "grad_norm": 0.2606887221336365, + "learning_rate": 4.709458969715646e-06, + "loss": 0.3829, "step": 112050 }, { - "epoch": 3.94, - "learning_rate": 5.654547493696036e-06, - "loss": 0.2243, + "epoch": 4.038454607705337, + "grad_norm": 0.17615626752376556, + "learning_rate": 4.7077543825423535e-06, + "loss": 0.3649, "step": 112055 }, { - "epoch": 3.94, - "learning_rate": 5.652743216754566e-06, - "loss": 0.2689, + "epoch": 4.038634807366562, + "grad_norm": 0.3894023299217224, + "learning_rate": 4.706050071846971e-06, + "loss": 0.3597, "step": 112060 }, { - "epoch": 3.94, - "learning_rate": 5.650939191024468e-06, - "loss": 0.2536, + "epoch": 4.038815007027787, + "grad_norm": 0.1984912008047104, + "learning_rate": 4.704346037652735e-06, + "loss": 0.3796, "step": 112065 }, { - "epoch": 3.94, - "learning_rate": 5.649135416529183e-06, - "loss": 0.2666, + "epoch": 4.038995206689012, + "grad_norm": 0.3245207667350769, + "learning_rate": 4.702642279982855e-06, + "loss": 0.4189, "step": 112070 }, { - "epoch": 3.94, - "learning_rate": 5.6473318932921185e-06, - "loss": 0.2499, + "epoch": 4.039175406350236, + "grad_norm": 0.2791898548603058, + "learning_rate": 4.700938798860544e-06, + "loss": 0.3301, "step": 112075 }, { - "epoch": 3.94, - "learning_rate": 5.6455286213366905e-06, - "loss": 0.2671, + "epoch": 4.039355606011461, + "grad_norm": 0.20591488480567932, + "learning_rate": 4.6992355943090095e-06, + "loss": 0.3515, "step": 112080 }, { - "epoch": 3.94, - "learning_rate": 5.643725600686326e-06, - "loss": 0.2577, + "epoch": 4.039535805672686, + "grad_norm": 0.20929506421089172, + "learning_rate": 4.6975326663514535e-06, + "loss": 0.349, "step": 112085 }, { - "epoch": 3.94, - "learning_rate": 5.641922831364424e-06, - "loss": 0.2648, + "epoch": 4.03971600533391, + "grad_norm": 0.27950236201286316, + "learning_rate": 4.695830015011094e-06, + "loss": 0.3617, "step": 112090 }, { - "epoch": 3.94, - "learning_rate": 5.6401203133944005e-06, - "loss": 0.2578, + "epoch": 4.039896204995134, + "grad_norm": 0.21270787715911865, + "learning_rate": 4.6941276403111245e-06, + "loss": 0.3668, "step": 112095 }, { - "epoch": 3.94, - "learning_rate": 5.6383180467996486e-06, - "loss": 0.2655, + "epoch": 4.040076404656359, + "grad_norm": 
0.24949444830417633, + "learning_rate": 4.69242554227472e-06, + "loss": 0.3616, "step": 112100 }, { - "epoch": 3.94, - "learning_rate": 5.636516031603583e-06, - "loss": 0.2529, + "epoch": 4.040256604317584, + "grad_norm": 0.20137085020542145, + "learning_rate": 4.690723720925094e-06, + "loss": 0.347, "step": 112105 }, { - "epoch": 3.94, - "learning_rate": 5.6347142678295966e-06, - "loss": 0.2483, + "epoch": 4.0404368039788086, + "grad_norm": 0.2264707386493683, + "learning_rate": 4.689022176285418e-06, + "loss": 0.4139, "step": 112110 }, { - "epoch": 3.94, - "learning_rate": 5.632912755501077e-06, - "loss": 0.2485, + "epoch": 4.040617003640033, + "grad_norm": 0.25556209683418274, + "learning_rate": 4.687320908378895e-06, + "loss": 0.345, "step": 112115 }, { - "epoch": 3.94, - "learning_rate": 5.63111149464142e-06, - "loss": 0.2457, + "epoch": 4.040797203301258, + "grad_norm": 0.2599429488182068, + "learning_rate": 4.685619917228687e-06, + "loss": 0.3659, "step": 112120 }, { - "epoch": 3.94, - "learning_rate": 5.629310485274022e-06, - "loss": 0.2464, + "epoch": 4.040977402962483, + "grad_norm": 0.19548028707504272, + "learning_rate": 4.6839192028579674e-06, + "loss": 0.3567, "step": 112125 }, { - "epoch": 3.95, - "learning_rate": 5.627509727422265e-06, - "loss": 0.2581, + "epoch": 4.0411576026237075, + "grad_norm": 0.23381386697292328, + "learning_rate": 4.68221876528992e-06, + "loss": 0.3942, "step": 112130 }, { - "epoch": 3.95, - "learning_rate": 5.6257092211095236e-06, - "loss": 0.2541, + "epoch": 4.041337802284931, + "grad_norm": 0.21480177342891693, + "learning_rate": 4.68051860454771e-06, + "loss": 0.3674, "step": 112135 }, { - "epoch": 3.95, - "learning_rate": 5.623908966359176e-06, - "loss": 0.2483, + "epoch": 4.041518001946156, + "grad_norm": 0.20809517800807953, + "learning_rate": 4.678818720654502e-06, + "loss": 0.3405, "step": 112140 }, { - "epoch": 3.95, - "learning_rate": 5.622108963194605e-06, - "loss": 0.259, + "epoch": 4.041698201607381, + "grad_norm": 0.24788372218608856, + "learning_rate": 4.677119113633452e-06, + "loss": 0.4156, "step": 112145 }, { - "epoch": 3.95, - "learning_rate": 5.620309211639179e-06, - "loss": 0.2469, + "epoch": 4.0418784012686055, + "grad_norm": 0.24612867832183838, + "learning_rate": 4.675419783507712e-06, + "loss": 0.3738, "step": 112150 }, { - "epoch": 3.95, - "learning_rate": 5.618509711716269e-06, - "loss": 0.2391, + "epoch": 4.04205860092983, + "grad_norm": 0.2980947494506836, + "learning_rate": 4.673720730300451e-06, + "loss": 0.4167, "step": 112155 }, { - "epoch": 3.95, - "learning_rate": 5.616710463449229e-06, - "loss": 0.2729, + "epoch": 4.042238800591055, + "grad_norm": 0.2769397497177124, + "learning_rate": 4.672021954034811e-06, + "loss": 0.394, "step": 112160 }, { - "epoch": 3.95, - "learning_rate": 5.614911466861436e-06, - "loss": 0.2461, + "epoch": 4.04241900025228, + "grad_norm": 0.24730835855007172, + "learning_rate": 4.670323454733933e-06, + "loss": 0.3496, "step": 112165 }, { - "epoch": 3.95, - "learning_rate": 5.6131127219762355e-06, - "loss": 0.2695, + "epoch": 4.0425991999135045, + "grad_norm": 0.2504551410675049, + "learning_rate": 4.668625232420965e-06, + "loss": 0.3607, "step": 112170 }, { - "epoch": 3.95, - "learning_rate": 5.6113142288169916e-06, - "loss": 0.2568, + "epoch": 4.042779399574729, + "grad_norm": 0.2242858111858368, + "learning_rate": 4.666927287119032e-06, + "loss": 0.3641, "step": 112175 }, { - "epoch": 3.95, - "learning_rate": 5.60951598740706e-06, - "loss": 0.2366, + "epoch": 4.042959599235953, + 
"grad_norm": 0.22944757342338562, + "learning_rate": 4.665229618851289e-06, + "loss": 0.3853, "step": 112180 }, { - "epoch": 3.95, - "learning_rate": 5.607717997769785e-06, - "loss": 0.2724, + "epoch": 4.043139798897178, + "grad_norm": 0.24846546351909637, + "learning_rate": 4.663532227640857e-06, + "loss": 0.3888, "step": 112185 }, { - "epoch": 3.95, - "learning_rate": 5.605920259928512e-06, - "loss": 0.261, + "epoch": 4.0433199985584025, + "grad_norm": 0.2676353454589844, + "learning_rate": 4.661835113510851e-06, + "loss": 0.3324, "step": 112190 }, { - "epoch": 3.95, - "learning_rate": 5.604122773906576e-06, - "loss": 0.2338, + "epoch": 4.043500198219627, + "grad_norm": 0.39060455560684204, + "learning_rate": 4.6601382764844105e-06, + "loss": 0.3952, "step": 112195 }, { - "epoch": 3.95, - "learning_rate": 5.6023255397273295e-06, - "loss": 0.2412, + "epoch": 4.043680397880852, + "grad_norm": 0.2727122902870178, + "learning_rate": 4.6584417165846495e-06, + "loss": 0.3762, "step": 112200 }, { - "epoch": 3.95, - "learning_rate": 5.600528557414103e-06, - "loss": 0.2643, + "epoch": 4.043860597542077, + "grad_norm": 0.19775696098804474, + "learning_rate": 4.6567454338346804e-06, + "loss": 0.3758, "step": 112205 }, { - "epoch": 3.95, - "learning_rate": 5.598731826990231e-06, - "loss": 0.2565, + "epoch": 4.0440407972033015, + "grad_norm": 0.25784486532211304, + "learning_rate": 4.655049428257615e-06, + "loss": 0.3843, "step": 112210 }, { - "epoch": 3.95, - "learning_rate": 5.596935348479029e-06, - "loss": 0.2487, + "epoch": 4.044220996864526, + "grad_norm": 0.2990073263645172, + "learning_rate": 4.6533536998765555e-06, + "loss": 0.3875, "step": 112215 }, { - "epoch": 3.95, - "learning_rate": 5.595139121903844e-06, - "loss": 0.2487, + "epoch": 4.044401196525751, + "grad_norm": 0.24339328706264496, + "learning_rate": 4.651658248714621e-06, + "loss": 0.3816, "step": 112220 }, { - "epoch": 3.95, - "learning_rate": 5.593343147287983e-06, - "loss": 0.2763, + "epoch": 4.044581396186975, + "grad_norm": 0.2621822953224182, + "learning_rate": 4.649963074794902e-06, + "loss": 0.3381, "step": 112225 }, { - "epoch": 3.95, - "learning_rate": 5.591547424654775e-06, - "loss": 0.2291, + "epoch": 4.0447615958481995, + "grad_norm": 0.25351327657699585, + "learning_rate": 4.648268178140497e-06, + "loss": 0.3854, "step": 112230 }, { - "epoch": 3.95, - "learning_rate": 5.5897519540275305e-06, - "loss": 0.2449, + "epoch": 4.044941795509424, + "grad_norm": 0.3043707013130188, + "learning_rate": 4.646573558774497e-06, + "loss": 0.3838, "step": 112235 }, { - "epoch": 3.95, - "learning_rate": 5.587956735429569e-06, - "loss": 0.27, + "epoch": 4.045121995170649, + "grad_norm": 0.24836041033267975, + "learning_rate": 4.644879216719994e-06, + "loss": 0.3588, "step": 112240 }, { - "epoch": 3.95, - "learning_rate": 5.586161768884196e-06, - "loss": 0.2645, + "epoch": 4.045302194831874, + "grad_norm": 0.2699911594390869, + "learning_rate": 4.643185152000062e-06, + "loss": 0.3773, "step": 112245 }, { - "epoch": 3.95, - "learning_rate": 5.584367054414721e-06, - "loss": 0.2603, + "epoch": 4.045482394493098, + "grad_norm": 0.25113409757614136, + "learning_rate": 4.641491364637798e-06, + "loss": 0.3652, "step": 112250 }, { - "epoch": 3.95, - "learning_rate": 5.582572592044436e-06, - "loss": 0.2299, + "epoch": 4.045662594154323, + "grad_norm": 0.24034260213375092, + "learning_rate": 4.6397978546562744e-06, + "loss": 0.3806, "step": 112255 }, { - "epoch": 3.95, - "learning_rate": 5.580778381796656e-06, - "loss": 0.2727, + "epoch": 
4.045842793815548, + "grad_norm": 0.2835821211338043, + "learning_rate": 4.6381046220785595e-06, + "loss": 0.3634, "step": 112260 }, { - "epoch": 3.95, - "learning_rate": 5.578984423694672e-06, - "loss": 0.2459, + "epoch": 4.046022993476773, + "grad_norm": 0.20180295407772064, + "learning_rate": 4.636411666927731e-06, + "loss": 0.3606, "step": 112265 }, { - "epoch": 3.95, - "learning_rate": 5.577190717761774e-06, - "loss": 0.244, + "epoch": 4.0462031931379965, + "grad_norm": 0.24634915590286255, + "learning_rate": 4.634718989226841e-06, + "loss": 0.3853, "step": 112270 }, { - "epoch": 3.95, - "learning_rate": 5.575397264021248e-06, - "loss": 0.2651, + "epoch": 4.046383392799221, + "grad_norm": 0.23739095032215118, + "learning_rate": 4.63302658899897e-06, + "loss": 0.3726, "step": 112275 }, { - "epoch": 3.95, - "learning_rate": 5.57360406249639e-06, - "loss": 0.281, + "epoch": 4.046563592460446, + "grad_norm": 0.32305994629859924, + "learning_rate": 4.631334466267176e-06, + "loss": 0.4008, "step": 112280 }, { - "epoch": 3.95, - "learning_rate": 5.571811113210484e-06, - "loss": 0.2392, + "epoch": 4.046743792121671, + "grad_norm": 0.2640374004840851, + "learning_rate": 4.629642621054492e-06, + "loss": 0.3762, "step": 112285 }, { - "epoch": 3.95, - "learning_rate": 5.570018416186804e-06, - "loss": 0.2478, + "epoch": 4.046923991782895, + "grad_norm": 0.25261178612709045, + "learning_rate": 4.627951053383991e-06, + "loss": 0.3538, "step": 112290 }, { - "epoch": 3.95, - "learning_rate": 5.568225971448634e-06, - "loss": 0.2503, + "epoch": 4.04710419144412, + "grad_norm": 0.25557366013526917, + "learning_rate": 4.626259763278707e-06, + "loss": 0.4039, "step": 112295 }, { - "epoch": 3.95, - "learning_rate": 5.566433779019245e-06, - "loss": 0.2471, + "epoch": 4.047284391105345, + "grad_norm": 0.23059487342834473, + "learning_rate": 4.624568750761699e-06, + "loss": 0.3653, "step": 112300 }, { - "epoch": 3.95, - "learning_rate": 5.564641838921905e-06, - "loss": 0.2399, + "epoch": 4.04746459076657, + "grad_norm": 0.1989499032497406, + "learning_rate": 4.622878015855994e-06, + "loss": 0.3872, "step": 112305 }, { - "epoch": 3.95, - "learning_rate": 5.562850151179877e-06, - "loss": 0.2553, + "epoch": 4.047644790427794, + "grad_norm": 0.2884153127670288, + "learning_rate": 4.621187558584622e-06, + "loss": 0.3838, "step": 112310 }, { - "epoch": 3.95, - "learning_rate": 5.561058715816436e-06, - "loss": 0.2661, + "epoch": 4.047824990089018, + "grad_norm": 0.26490432024002075, + "learning_rate": 4.61949737897063e-06, + "loss": 0.3588, "step": 112315 }, { - "epoch": 3.95, - "learning_rate": 5.5592675328548365e-06, - "loss": 0.2591, + "epoch": 4.048005189750243, + "grad_norm": 0.2709238827228546, + "learning_rate": 4.6178074770370424e-06, + "loss": 0.4018, "step": 112320 }, { - "epoch": 3.95, - "learning_rate": 5.557476602318337e-06, - "loss": 0.2437, + "epoch": 4.048185389411468, + "grad_norm": 0.2417963743209839, + "learning_rate": 4.616117852806881e-06, + "loss": 0.3779, "step": 112325 }, { - "epoch": 3.95, - "learning_rate": 5.5556859242301865e-06, - "loss": 0.2619, + "epoch": 4.048365589072692, + "grad_norm": 0.257938414812088, + "learning_rate": 4.614428506303168e-06, + "loss": 0.3659, "step": 112330 }, { - "epoch": 3.95, - "learning_rate": 5.5538954986136346e-06, - "loss": 0.2565, + "epoch": 4.048545788733917, + "grad_norm": 0.2447778433561325, + "learning_rate": 4.612739437548913e-06, + "loss": 0.363, "step": 112335 }, { - "epoch": 3.95, - "learning_rate": 5.5521053254919435e-06, - "loss": 0.252, + 
"epoch": 4.048725988395142, + "grad_norm": 0.26171186566352844, + "learning_rate": 4.611050646567142e-06, + "loss": 0.4081, "step": 112340 }, { - "epoch": 3.95, - "learning_rate": 5.550315404888348e-06, - "loss": 0.2521, + "epoch": 4.048906188056367, + "grad_norm": 0.27486851811408997, + "learning_rate": 4.60936213338086e-06, + "loss": 0.3324, "step": 112345 }, { - "epoch": 3.95, - "learning_rate": 5.5485257368260855e-06, - "loss": 0.2571, + "epoch": 4.049086387717591, + "grad_norm": 0.24321185052394867, + "learning_rate": 4.6076738980130705e-06, + "loss": 0.3454, "step": 112350 }, { - "epoch": 3.95, - "learning_rate": 5.5467363213283995e-06, - "loss": 0.2457, + "epoch": 4.049266587378816, + "grad_norm": 0.334555059671402, + "learning_rate": 4.605985940486776e-06, + "loss": 0.3937, "step": 112355 }, { - "epoch": 3.95, - "learning_rate": 5.5449471584185234e-06, - "loss": 0.2381, + "epoch": 4.049446787040041, + "grad_norm": 0.30166709423065186, + "learning_rate": 4.604298260824965e-06, + "loss": 0.3657, "step": 112360 }, { - "epoch": 3.95, - "learning_rate": 5.54315824811969e-06, - "loss": 0.2456, + "epoch": 4.049626986701265, + "grad_norm": 0.2711981236934662, + "learning_rate": 4.602610859050652e-06, + "loss": 0.3539, "step": 112365 }, { - "epoch": 3.95, - "learning_rate": 5.541369590455114e-06, - "loss": 0.2348, + "epoch": 4.049807186362489, + "grad_norm": 0.30052125453948975, + "learning_rate": 4.600923735186824e-06, + "loss": 0.3794, "step": 112370 }, { - "epoch": 3.95, - "learning_rate": 5.5395811854480365e-06, - "loss": 0.2713, + "epoch": 4.049987386023714, + "grad_norm": 0.24002404510974884, + "learning_rate": 4.5992368892564446e-06, + "loss": 0.3768, "step": 112375 }, { - "epoch": 3.95, - "learning_rate": 5.537793033121674e-06, - "loss": 0.2595, + "epoch": 4.050167585684939, + "grad_norm": 0.26016783714294434, + "learning_rate": 4.5975503212825206e-06, + "loss": 0.3781, "step": 112380 }, { - "epoch": 3.95, - "learning_rate": 5.536005133499236e-06, - "loss": 0.2677, + "epoch": 4.050347785346164, + "grad_norm": 0.2607904374599457, + "learning_rate": 4.595864031288025e-06, + "loss": 0.3555, "step": 112385 }, { - "epoch": 3.95, - "learning_rate": 5.534217486603946e-06, - "loss": 0.2486, + "epoch": 4.050527985007388, + "grad_norm": 0.2204258143901825, + "learning_rate": 4.594178019295922e-06, + "loss": 0.3897, "step": 112390 }, { - "epoch": 3.95, - "learning_rate": 5.532430092459017e-06, - "loss": 0.2567, + "epoch": 4.050708184668613, + "grad_norm": 0.2713705003261566, + "learning_rate": 4.592492285329208e-06, + "loss": 0.3818, "step": 112395 }, { - "epoch": 3.95, - "learning_rate": 5.530642951087656e-06, - "loss": 0.2534, + "epoch": 4.050888384329838, + "grad_norm": 0.271567165851593, + "learning_rate": 4.590806829410824e-06, + "loss": 0.3609, "step": 112400 }, { - "epoch": 3.95, - "learning_rate": 5.528856062513058e-06, - "loss": 0.2319, + "epoch": 4.0510685839910625, + "grad_norm": 0.24618300795555115, + "learning_rate": 4.589121651563749e-06, + "loss": 0.3876, "step": 112405 }, { - "epoch": 3.95, - "learning_rate": 5.527069426758438e-06, - "loss": 0.2461, + "epoch": 4.051248783652286, + "grad_norm": 0.29348066449165344, + "learning_rate": 4.587436751810942e-06, + "loss": 0.3868, "step": 112410 }, { - "epoch": 3.96, - "learning_rate": 5.525283043846988e-06, - "loss": 0.2407, + "epoch": 4.051428983313511, + "grad_norm": 0.21088933944702148, + "learning_rate": 4.585752130175355e-06, + "loss": 0.3823, "step": 112415 }, { - "epoch": 3.96, - "learning_rate": 5.523496913801904e-06, - 
"loss": 0.2517, + "epoch": 4.051609182974736, + "grad_norm": 0.23839104175567627, + "learning_rate": 4.5840677866799435e-06, + "loss": 0.3536, "step": 112420 }, { - "epoch": 3.96, - "learning_rate": 5.521711036646371e-06, - "loss": 0.2763, + "epoch": 4.051789382635961, + "grad_norm": 0.2764272391796112, + "learning_rate": 4.582383721347658e-06, + "loss": 0.3414, "step": 112425 }, { - "epoch": 3.96, - "learning_rate": 5.51992541240359e-06, - "loss": 0.243, + "epoch": 4.051969582297185, + "grad_norm": 0.2566319406032562, + "learning_rate": 4.580699934201432e-06, + "loss": 0.374, "step": 112430 }, { - "epoch": 3.96, - "learning_rate": 5.518140041096739e-06, - "loss": 0.2605, + "epoch": 4.05214978195841, + "grad_norm": 0.24363850057125092, + "learning_rate": 4.579016425264224e-06, + "loss": 0.3331, "step": 112435 }, { - "epoch": 3.96, - "learning_rate": 5.516354922748996e-06, - "loss": 0.2414, + "epoch": 4.052329981619635, + "grad_norm": 0.25705766677856445, + "learning_rate": 4.577333194558964e-06, + "loss": 0.3527, "step": 112440 }, { - "epoch": 3.96, - "learning_rate": 5.514570057383547e-06, - "loss": 0.2687, + "epoch": 4.0525101812808595, + "grad_norm": 0.23861709237098694, + "learning_rate": 4.575650242108584e-06, + "loss": 0.3601, "step": 112445 }, { - "epoch": 3.96, - "learning_rate": 5.512785445023561e-06, - "loss": 0.2659, + "epoch": 4.052690380942084, + "grad_norm": 0.23494866490364075, + "learning_rate": 4.573967567936016e-06, + "loss": 0.3682, "step": 112450 }, { - "epoch": 3.96, - "learning_rate": 5.511001085692219e-06, - "loss": 0.2549, + "epoch": 4.052870580603308, + "grad_norm": 0.29604148864746094, + "learning_rate": 4.572285172064181e-06, + "loss": 0.3502, "step": 112455 }, { - "epoch": 3.96, - "learning_rate": 5.509216979412685e-06, - "loss": 0.2681, + "epoch": 4.053050780264533, + "grad_norm": 0.2703550457954407, + "learning_rate": 4.570603054516009e-06, + "loss": 0.372, "step": 112460 }, { - "epoch": 3.96, - "learning_rate": 5.507433126208114e-06, - "loss": 0.2681, + "epoch": 4.0532309799257575, + "grad_norm": 0.20241078734397888, + "learning_rate": 4.568921215314423e-06, + "loss": 0.3367, "step": 112465 }, { - "epoch": 3.96, - "learning_rate": 5.5056495261016886e-06, - "loss": 0.2617, + "epoch": 4.053411179586982, + "grad_norm": 0.2732201814651489, + "learning_rate": 4.567239654482313e-06, + "loss": 0.3518, "step": 112470 }, { - "epoch": 3.96, - "learning_rate": 5.5038661791165545e-06, - "loss": 0.2559, + "epoch": 4.053591379248207, + "grad_norm": 0.2432149201631546, + "learning_rate": 4.5655583720426195e-06, + "loss": 0.3639, "step": 112475 }, { - "epoch": 3.96, - "learning_rate": 5.502083085275869e-06, - "loss": 0.2439, + "epoch": 4.053771578909432, + "grad_norm": 0.28690987825393677, + "learning_rate": 4.563877368018227e-06, + "loss": 0.395, "step": 112480 }, { - "epoch": 3.96, - "learning_rate": 5.500300244602779e-06, - "loss": 0.2442, + "epoch": 4.0539517785706565, + "grad_norm": 0.24714258313179016, + "learning_rate": 4.562196642432063e-06, + "loss": 0.3719, "step": 112485 }, { - "epoch": 3.96, - "learning_rate": 5.498517657120447e-06, - "loss": 0.2508, + "epoch": 4.054131978231881, + "grad_norm": 0.22608081996440887, + "learning_rate": 4.560516195307005e-06, + "loss": 0.3674, "step": 112490 }, { - "epoch": 3.96, - "learning_rate": 5.496735322852004e-06, - "loss": 0.2434, + "epoch": 4.054312177893106, + "grad_norm": 0.26828816533088684, + "learning_rate": 4.5588360266659495e-06, + "loss": 0.3886, "step": 112495 }, { - "epoch": 3.96, - "learning_rate": 
5.494953241820608e-06, - "loss": 0.2763, + "epoch": 4.05449237755433, + "grad_norm": 0.24400553107261658, + "learning_rate": 4.557156136531804e-06, + "loss": 0.3551, "step": 112500 }, { - "epoch": 3.96, - "eval_loss": 0.25041884183883667, - "eval_runtime": 10.5467, - "eval_samples_per_second": 9.482, - "eval_steps_per_second": 9.482, + "epoch": 4.05449237755433, + "eval_loss": 0.42917296290397644, + "eval_runtime": 3.5255, + "eval_samples_per_second": 28.365, + "eval_steps_per_second": 7.091, "step": 112500 }, { - "epoch": 3.96, - "learning_rate": 5.49317141404938e-06, - "loss": 0.2521, + "epoch": 4.0546725772155545, + "grad_norm": 0.27448979020118713, + "learning_rate": 4.555476524927449e-06, + "loss": 0.378, "step": 112505 }, { - "epoch": 3.96, - "learning_rate": 5.491389839561476e-06, - "loss": 0.2585, + "epoch": 4.054852776876779, + "grad_norm": 0.24059316515922546, + "learning_rate": 4.553797191875767e-06, + "loss": 0.3708, "step": 112510 }, { - "epoch": 3.96, - "learning_rate": 5.489608518380016e-06, - "loss": 0.2675, + "epoch": 4.055032976538004, + "grad_norm": 0.26339319348335266, + "learning_rate": 4.552118137399639e-06, + "loss": 0.3886, "step": 112515 }, { - "epoch": 3.96, - "learning_rate": 5.487827450528124e-06, - "loss": 0.2587, + "epoch": 4.055213176199229, + "grad_norm": 0.2619602084159851, + "learning_rate": 4.550439361521935e-06, + "loss": 0.3898, "step": 112520 }, { - "epoch": 3.96, - "learning_rate": 5.486046636028941e-06, - "loss": 0.2654, + "epoch": 4.0553933758604535, + "grad_norm": 0.25006812810897827, + "learning_rate": 4.548760864265545e-06, + "loss": 0.3685, "step": 112525 }, { - "epoch": 3.96, - "learning_rate": 5.484266074905584e-06, - "loss": 0.2562, + "epoch": 4.055573575521678, + "grad_norm": 0.2471635937690735, + "learning_rate": 4.547082645653328e-06, + "loss": 0.3887, "step": 112530 }, { - "epoch": 3.96, - "learning_rate": 5.4824857671811695e-06, - "loss": 0.2806, + "epoch": 4.055753775182903, + "grad_norm": 0.2514253258705139, + "learning_rate": 4.545404705708148e-06, + "loss": 0.3463, "step": 112535 }, { - "epoch": 3.96, - "learning_rate": 5.4807057128788054e-06, - "loss": 0.2583, + "epoch": 4.055933974844128, + "grad_norm": 0.21959030628204346, + "learning_rate": 4.543727044452873e-06, + "loss": 0.4068, "step": 112540 }, { - "epoch": 3.96, - "learning_rate": 5.478925912021623e-06, - "loss": 0.253, + "epoch": 4.0561141745053515, + "grad_norm": 0.21514563262462616, + "learning_rate": 4.542049661910344e-06, + "loss": 0.337, "step": 112545 }, { - "epoch": 3.96, - "learning_rate": 5.477146364632724e-06, - "loss": 0.2511, + "epoch": 4.056294374166576, + "grad_norm": 0.1881556361913681, + "learning_rate": 4.540372558103439e-06, + "loss": 0.3604, "step": 112550 }, { - "epoch": 3.96, - "learning_rate": 5.475367070735207e-06, - "loss": 0.25, + "epoch": 4.056474573827801, + "grad_norm": 0.2358875721693039, + "learning_rate": 4.538695733054995e-06, + "loss": 0.3601, "step": 112555 }, { - "epoch": 3.96, - "learning_rate": 5.473588030352178e-06, - "loss": 0.2861, + "epoch": 4.056654773489026, + "grad_norm": 0.2567630410194397, + "learning_rate": 4.537019186787861e-06, + "loss": 0.3783, "step": 112560 }, { - "epoch": 3.96, - "learning_rate": 5.471809243506751e-06, - "loss": 0.2481, + "epoch": 4.05683497315025, + "grad_norm": 0.26848697662353516, + "learning_rate": 4.535342919324878e-06, + "loss": 0.4059, "step": 112565 }, { - "epoch": 3.96, - "learning_rate": 5.470030710222007e-06, - "loss": 0.2609, + "epoch": 4.057015172811475, + "grad_norm": 0.22706463932991028, + 
"learning_rate": 4.533666930688885e-06, + "loss": 0.368, "step": 112570 }, { - "epoch": 3.96, - "learning_rate": 5.468252430521048e-06, - "loss": 0.2488, + "epoch": 4.0571953724727, + "grad_norm": 0.26052287220954895, + "learning_rate": 4.531991220902712e-06, + "loss": 0.3455, "step": 112575 }, { - "epoch": 3.96, - "learning_rate": 5.4664744044269495e-06, - "loss": 0.279, + "epoch": 4.057375572133925, + "grad_norm": 0.2673739194869995, + "learning_rate": 4.530315789989209e-06, + "loss": 0.3275, "step": 112580 }, { - "epoch": 3.96, - "learning_rate": 5.464696631962815e-06, - "loss": 0.2666, + "epoch": 4.057555771795149, + "grad_norm": 0.22586947679519653, + "learning_rate": 4.528640637971177e-06, + "loss": 0.3832, "step": 112585 }, { - "epoch": 3.96, - "learning_rate": 5.462919113151721e-06, - "loss": 0.2647, + "epoch": 4.057735971456374, + "grad_norm": 0.28457507491111755, + "learning_rate": 4.5269657648714605e-06, + "loss": 0.3725, "step": 112590 }, { - "epoch": 3.96, - "learning_rate": 5.461141848016746e-06, - "loss": 0.2778, + "epoch": 4.057916171117598, + "grad_norm": 0.27478164434432983, + "learning_rate": 4.52529117071287e-06, + "loss": 0.3783, "step": 112595 }, { - "epoch": 3.96, - "learning_rate": 5.45936483658096e-06, - "loss": 0.2522, + "epoch": 4.058096370778823, + "grad_norm": 0.22563715279102325, + "learning_rate": 4.523616855518226e-06, + "loss": 0.3952, "step": 112600 }, { - "epoch": 3.96, - "learning_rate": 5.4575880788674534e-06, - "loss": 0.2357, + "epoch": 4.058276570440047, + "grad_norm": 0.26424163579940796, + "learning_rate": 4.5219428193103345e-06, + "loss": 0.3737, "step": 112605 }, { - "epoch": 3.96, - "learning_rate": 5.455811574899278e-06, - "loss": 0.2347, + "epoch": 4.058456770101272, + "grad_norm": 0.2882944643497467, + "learning_rate": 4.5202690621120095e-06, + "loss": 0.3523, "step": 112610 }, { - "epoch": 3.96, - "learning_rate": 5.454035324699517e-06, - "loss": 0.2538, + "epoch": 4.058636969762497, + "grad_norm": 0.2373085916042328, + "learning_rate": 4.518595583946048e-06, + "loss": 0.3757, "step": 112615 }, { - "epoch": 3.96, - "learning_rate": 5.452259328291218e-06, - "loss": 0.2651, + "epoch": 4.058817169423722, + "grad_norm": 0.2473360151052475, + "learning_rate": 4.516922384835259e-06, + "loss": 0.3808, "step": 112620 }, { - "epoch": 3.96, - "learning_rate": 5.450483585697458e-06, - "loss": 0.2506, + "epoch": 4.058997369084946, + "grad_norm": 0.19893379509449005, + "learning_rate": 4.51524946480244e-06, + "loss": 0.3353, "step": 112625 }, { - "epoch": 3.96, - "learning_rate": 5.448708096941282e-06, - "loss": 0.2621, + "epoch": 4.059177568746171, + "grad_norm": 0.3020991086959839, + "learning_rate": 4.51357682387038e-06, + "loss": 0.3551, "step": 112630 }, { - "epoch": 3.96, - "learning_rate": 5.446932862045742e-06, - "loss": 0.2529, + "epoch": 4.059357768407396, + "grad_norm": 0.2574337422847748, + "learning_rate": 4.511904462061873e-06, + "loss": 0.3703, "step": 112635 }, { - "epoch": 3.96, - "learning_rate": 5.445157881033899e-06, - "loss": 0.2539, + "epoch": 4.05953796806862, + "grad_norm": 0.2121739387512207, + "learning_rate": 4.51023237939969e-06, + "loss": 0.3629, "step": 112640 }, { - "epoch": 3.96, - "learning_rate": 5.443383153928794e-06, - "loss": 0.2768, + "epoch": 4.059718167729844, + "grad_norm": 0.2671159505844116, + "learning_rate": 4.508560575906631e-06, + "loss": 0.3642, "step": 112645 }, { - "epoch": 3.96, - "learning_rate": 5.441608680753471e-06, - "loss": 0.2464, + "epoch": 4.059898367391069, + "grad_norm": 0.18342912197113037, + 
"learning_rate": 4.506889051605473e-06, + "loss": 0.3798, "step": 112650 }, { - "epoch": 3.96, - "learning_rate": 5.439834461530963e-06, - "loss": 0.2607, + "epoch": 4.060078567052294, + "grad_norm": 0.22590021789073944, + "learning_rate": 4.505217806518975e-06, + "loss": 0.3921, "step": 112655 }, { - "epoch": 3.96, - "learning_rate": 5.43806049628432e-06, - "loss": 0.2392, + "epoch": 4.060258766713519, + "grad_norm": 0.24707923829555511, + "learning_rate": 4.50354684066992e-06, + "loss": 0.3438, "step": 112660 }, { - "epoch": 3.96, - "learning_rate": 5.436286785036562e-06, - "loss": 0.2353, + "epoch": 4.060438966374743, + "grad_norm": 0.22925209999084473, + "learning_rate": 4.501876154081064e-06, + "loss": 0.3645, "step": 112665 }, { - "epoch": 3.96, - "learning_rate": 5.434513327810733e-06, - "loss": 0.2523, + "epoch": 4.060619166035968, + "grad_norm": 0.21116822957992554, + "learning_rate": 4.500205746775185e-06, + "loss": 0.3326, "step": 112670 }, { - "epoch": 3.96, - "learning_rate": 5.432740124629851e-06, - "loss": 0.2582, + "epoch": 4.060799365697193, + "grad_norm": 0.25304093956947327, + "learning_rate": 4.498535618775037e-06, + "loss": 0.3963, "step": 112675 }, { - "epoch": 3.96, - "learning_rate": 5.430967175516949e-06, - "loss": 0.2597, + "epoch": 4.0609795653584175, + "grad_norm": 0.25455543398857117, + "learning_rate": 4.496865770103362e-06, + "loss": 0.3797, "step": 112680 }, { - "epoch": 3.96, - "learning_rate": 5.42919448049504e-06, - "loss": 0.2584, + "epoch": 4.061159765019641, + "grad_norm": 0.24712957441806793, + "learning_rate": 4.495196200782928e-06, + "loss": 0.3822, "step": 112685 }, { - "epoch": 3.96, - "learning_rate": 5.427422039587144e-06, - "loss": 0.2459, + "epoch": 4.061339964680866, + "grad_norm": 0.21119093894958496, + "learning_rate": 4.493526910836476e-06, + "loss": 0.3993, "step": 112690 }, { - "epoch": 3.96, - "learning_rate": 5.425649852816267e-06, - "loss": 0.2611, + "epoch": 4.061520164342091, + "grad_norm": 0.2761557698249817, + "learning_rate": 4.491857900286747e-06, + "loss": 0.3782, "step": 112695 }, { - "epoch": 3.97, - "learning_rate": 5.423877920205434e-06, - "loss": 0.2471, + "epoch": 4.061700364003316, + "grad_norm": 0.2396336793899536, + "learning_rate": 4.490189169156487e-06, + "loss": 0.3892, "step": 112700 }, { - "epoch": 3.97, - "learning_rate": 5.422106241777644e-06, - "loss": 0.2433, + "epoch": 4.06188056366454, + "grad_norm": 0.3060736656188965, + "learning_rate": 4.488520717468419e-06, + "loss": 0.3525, "step": 112705 }, { - "epoch": 3.97, - "learning_rate": 5.420334817555903e-06, - "loss": 0.2592, + "epoch": 4.062060763325765, + "grad_norm": 0.2690458595752716, + "learning_rate": 4.486852545245296e-06, + "loss": 0.3654, "step": 112710 }, { - "epoch": 3.97, - "learning_rate": 5.418563647563205e-06, - "loss": 0.2581, + "epoch": 4.06224096298699, + "grad_norm": 0.25593844056129456, + "learning_rate": 4.4851846525098305e-06, + "loss": 0.3875, "step": 112715 }, { - "epoch": 3.97, - "learning_rate": 5.416792731822551e-06, - "loss": 0.269, + "epoch": 4.0624211626482145, + "grad_norm": 0.276349812746048, + "learning_rate": 4.483517039284754e-06, + "loss": 0.367, "step": 112720 }, { - "epoch": 3.97, - "learning_rate": 5.415022070356948e-06, - "loss": 0.2512, + "epoch": 4.062601362309439, + "grad_norm": 0.3029731810092926, + "learning_rate": 4.481849705592789e-06, + "loss": 0.3476, "step": 112725 }, { - "epoch": 3.97, - "learning_rate": 5.413251663189373e-06, - "loss": 0.249, + "epoch": 4.062781561970663, + "grad_norm": 0.2233833223581314, + 
"learning_rate": 4.480182651456638e-06, + "loss": 0.3712, "step": 112730 }, { - "epoch": 3.97, - "learning_rate": 5.411481510342811e-06, - "loss": 0.257, + "epoch": 4.062961761631888, + "grad_norm": 0.29494714736938477, + "learning_rate": 4.4785158768990365e-06, + "loss": 0.4097, "step": 112735 }, { - "epoch": 3.97, - "learning_rate": 5.40971161184026e-06, - "loss": 0.2505, + "epoch": 4.063141961293113, + "grad_norm": 0.23293974995613098, + "learning_rate": 4.47684938194268e-06, + "loss": 0.362, "step": 112740 }, { - "epoch": 3.97, - "learning_rate": 5.407941967704694e-06, - "loss": 0.2469, + "epoch": 4.063322160954337, + "grad_norm": 0.2608863115310669, + "learning_rate": 4.475183166610278e-06, + "loss": 0.3822, "step": 112745 }, { - "epoch": 3.97, - "learning_rate": 5.406172577959082e-06, - "loss": 0.2598, + "epoch": 4.063502360615562, + "grad_norm": 0.2746981978416443, + "learning_rate": 4.473517230924532e-06, + "loss": 0.369, "step": 112750 }, { - "epoch": 3.97, - "learning_rate": 5.404403442626413e-06, - "loss": 0.2777, + "epoch": 4.063682560276787, + "grad_norm": 0.24028368294239044, + "learning_rate": 4.47185157490814e-06, + "loss": 0.402, "step": 112755 }, { - "epoch": 3.97, - "learning_rate": 5.4026345617296495e-06, - "loss": 0.2368, + "epoch": 4.0638627599380115, + "grad_norm": 0.2710413634777069, + "learning_rate": 4.47018619858379e-06, + "loss": 0.3931, "step": 112760 }, { - "epoch": 3.97, - "learning_rate": 5.400865935291763e-06, - "loss": 0.2426, + "epoch": 4.064042959599236, + "grad_norm": 0.22267523407936096, + "learning_rate": 4.468521101974188e-06, + "loss": 0.3561, "step": 112765 }, { - "epoch": 3.97, - "learning_rate": 5.399097563335709e-06, - "loss": 0.251, + "epoch": 4.064223159260461, + "grad_norm": 0.22819946706295013, + "learning_rate": 4.466856285102e-06, + "loss": 0.3752, "step": 112770 }, { - "epoch": 3.97, - "learning_rate": 5.397329445884456e-06, - "loss": 0.2764, + "epoch": 4.064403358921685, + "grad_norm": 0.2634032964706421, + "learning_rate": 4.465191747989927e-06, + "loss": 0.3828, "step": 112775 }, { - "epoch": 3.97, - "learning_rate": 5.395561582960968e-06, - "loss": 0.2656, + "epoch": 4.0645835585829095, + "grad_norm": 0.2825946807861328, + "learning_rate": 4.463527490660641e-06, + "loss": 0.3642, "step": 112780 }, { - "epoch": 3.97, - "learning_rate": 5.3937939745881935e-06, - "loss": 0.2502, + "epoch": 4.064763758244134, + "grad_norm": 0.24241934716701508, + "learning_rate": 4.461863513136816e-06, + "loss": 0.3458, "step": 112785 }, { - "epoch": 3.97, - "learning_rate": 5.392026620789075e-06, - "loss": 0.2647, + "epoch": 4.064943957905359, + "grad_norm": 0.27716174721717834, + "learning_rate": 4.460199815441124e-06, + "loss": 0.3882, "step": 112790 }, { - "epoch": 3.97, - "learning_rate": 5.390259521586577e-06, - "loss": 0.2678, + "epoch": 4.065124157566584, + "grad_norm": 0.24334949254989624, + "learning_rate": 4.458536397596233e-06, + "loss": 0.3789, "step": 112795 }, { - "epoch": 3.97, - "learning_rate": 5.388492677003634e-06, - "loss": 0.2412, + "epoch": 4.0653043572278085, + "grad_norm": 0.26515430212020874, + "learning_rate": 4.456873259624802e-06, + "loss": 0.4021, "step": 112800 }, { - "epoch": 3.97, - "learning_rate": 5.38672608706319e-06, - "loss": 0.2462, + "epoch": 4.065484556889033, + "grad_norm": 0.28932276368141174, + "learning_rate": 4.455210401549501e-06, + "loss": 0.3874, "step": 112805 }, { - "epoch": 3.97, - "learning_rate": 5.384959751788176e-06, - "loss": 0.2595, + "epoch": 4.065664756550258, + "grad_norm": 0.19949932396411896, 
+ "learning_rate": 4.4535478233929786e-06, + "loss": 0.3483, "step": 112810 }, { - "epoch": 3.97, - "learning_rate": 5.383193671201539e-06, - "loss": 0.2423, + "epoch": 4.065844956211483, + "grad_norm": 0.2426164299249649, + "learning_rate": 4.45188552517789e-06, + "loss": 0.3642, "step": 112815 }, { - "epoch": 3.97, - "learning_rate": 5.381427845326206e-06, - "loss": 0.2729, + "epoch": 4.0660251558727065, + "grad_norm": 0.24339383840560913, + "learning_rate": 4.450223506926884e-06, + "loss": 0.3921, "step": 112820 }, { - "epoch": 3.97, - "learning_rate": 5.379662274185093e-06, - "loss": 0.2904, + "epoch": 4.066205355533931, + "grad_norm": 0.2848939001560211, + "learning_rate": 4.448561768662599e-06, + "loss": 0.3685, "step": 112825 }, { - "epoch": 3.97, - "learning_rate": 5.377896957801137e-06, - "loss": 0.2528, + "epoch": 4.066385555195156, + "grad_norm": 0.23407739400863647, + "learning_rate": 4.4469003104076865e-06, + "loss": 0.3936, "step": 112830 }, { - "epoch": 3.97, - "learning_rate": 5.376131896197267e-06, - "loss": 0.2373, + "epoch": 4.066565754856381, + "grad_norm": 0.34121012687683105, + "learning_rate": 4.445239132184778e-06, + "loss": 0.3816, "step": 112835 }, { - "epoch": 3.97, - "learning_rate": 5.374367089396387e-06, - "loss": 0.2667, + "epoch": 4.0667459545176055, + "grad_norm": 0.2433071732521057, + "learning_rate": 4.443578234016504e-06, + "loss": 0.3895, "step": 112840 }, { - "epoch": 3.97, - "learning_rate": 5.37260253742142e-06, - "loss": 0.271, + "epoch": 4.06692615417883, + "grad_norm": 0.24688871204853058, + "learning_rate": 4.4419176159255e-06, + "loss": 0.4009, "step": 112845 }, { - "epoch": 3.97, - "learning_rate": 5.370838240295267e-06, - "loss": 0.2607, + "epoch": 4.067106353840055, + "grad_norm": 0.255391389131546, + "learning_rate": 4.440257277934381e-06, + "loss": 0.3783, "step": 112850 }, { - "epoch": 3.97, - "learning_rate": 5.36907419804085e-06, - "loss": 0.2656, + "epoch": 4.06728655350128, + "grad_norm": 0.323646605014801, + "learning_rate": 4.438597220065782e-06, + "loss": 0.3655, "step": 112855 }, { - "epoch": 3.97, - "learning_rate": 5.367310410681064e-06, - "loss": 0.2527, + "epoch": 4.067466753162504, + "grad_norm": 0.3119712769985199, + "learning_rate": 4.436937442342323e-06, + "loss": 0.357, "step": 112860 }, { - "epoch": 3.97, - "learning_rate": 5.365546878238811e-06, - "loss": 0.2566, + "epoch": 4.067646952823729, + "grad_norm": 0.2642781138420105, + "learning_rate": 4.435277944786595e-06, + "loss": 0.3654, "step": 112865 }, { - "epoch": 3.97, - "learning_rate": 5.363783600736997e-06, - "loss": 0.239, + "epoch": 4.067827152484953, + "grad_norm": 0.19354425370693207, + "learning_rate": 4.433618727421232e-06, + "loss": 0.3937, "step": 112870 }, { - "epoch": 3.97, - "learning_rate": 5.362020578198512e-06, - "loss": 0.2389, + "epoch": 4.068007352146178, + "grad_norm": 0.227387472987175, + "learning_rate": 4.431959790268831e-06, + "loss": 0.3485, "step": 112875 }, { - "epoch": 3.97, - "learning_rate": 5.360257810646244e-06, - "loss": 0.2692, + "epoch": 4.068187551807402, + "grad_norm": 0.256632000207901, + "learning_rate": 4.430301133351997e-06, + "loss": 0.3545, "step": 112880 }, { - "epoch": 3.97, - "learning_rate": 5.358495298103089e-06, - "loss": 0.2588, + "epoch": 4.068367751468627, + "grad_norm": 0.21638993918895721, + "learning_rate": 4.428642756693324e-06, + "loss": 0.3527, "step": 112885 }, { - "epoch": 3.97, - "learning_rate": 5.356733040591921e-06, - "loss": 0.2516, + "epoch": 4.068547951129852, + "grad_norm": 0.30779585242271423, + 
"learning_rate": 4.426984660315409e-06, + "loss": 0.3595, "step": 112890 }, { - "epoch": 3.97, - "learning_rate": 5.354971038135639e-06, - "loss": 0.2483, + "epoch": 4.068728150791077, + "grad_norm": 0.2729836702346802, + "learning_rate": 4.425326844240849e-06, + "loss": 0.419, "step": 112895 }, { - "epoch": 3.97, - "learning_rate": 5.353209290757108e-06, - "loss": 0.2648, + "epoch": 4.068908350452301, + "grad_norm": 0.26512590050697327, + "learning_rate": 4.423669308492228e-06, + "loss": 0.3389, "step": 112900 }, { - "epoch": 3.97, - "learning_rate": 5.351447798479203e-06, - "loss": 0.2546, + "epoch": 4.069088550113526, + "grad_norm": 0.27653393149375916, + "learning_rate": 4.422012053092128e-06, + "loss": 0.3723, "step": 112905 }, { - "epoch": 3.97, - "learning_rate": 5.349686561324805e-06, - "loss": 0.2499, + "epoch": 4.069268749774751, + "grad_norm": 0.22129282355308533, + "learning_rate": 4.420355078063129e-06, + "loss": 0.3573, "step": 112910 }, { - "epoch": 3.97, - "learning_rate": 5.347925579316776e-06, - "loss": 0.2448, + "epoch": 4.069448949435975, + "grad_norm": 0.22182343900203705, + "learning_rate": 4.41869838342781e-06, + "loss": 0.3635, "step": 112915 }, { - "epoch": 3.97, - "learning_rate": 5.346164852477984e-06, - "loss": 0.2545, + "epoch": 4.069629149097199, + "grad_norm": 0.24553145468235016, + "learning_rate": 4.417041969208735e-06, + "loss": 0.3854, "step": 112920 }, { - "epoch": 3.97, - "learning_rate": 5.34440438083128e-06, - "loss": 0.2552, + "epoch": 4.069809348758424, + "grad_norm": 0.24355679750442505, + "learning_rate": 4.4153858354284814e-06, + "loss": 0.3762, "step": 112925 }, { - "epoch": 3.97, - "learning_rate": 5.34264416439954e-06, - "loss": 0.2438, + "epoch": 4.069989548419649, + "grad_norm": 0.24160782992839813, + "learning_rate": 4.413729982109613e-06, + "loss": 0.3698, "step": 112930 }, { - "epoch": 3.97, - "learning_rate": 5.340884203205604e-06, - "loss": 0.2661, + "epoch": 4.070169748080874, + "grad_norm": 0.23975780606269836, + "learning_rate": 4.412074409274689e-06, + "loss": 0.3852, "step": 112935 }, { - "epoch": 3.97, - "learning_rate": 5.339124497272338e-06, - "loss": 0.2654, + "epoch": 4.070349947742098, + "grad_norm": 0.19808252155780792, + "learning_rate": 4.4104191169462625e-06, + "loss": 0.341, "step": 112940 }, { - "epoch": 3.97, - "learning_rate": 5.337365046622575e-06, - "loss": 0.2698, + "epoch": 4.070530147403323, + "grad_norm": 0.2225724160671234, + "learning_rate": 4.4087641051468846e-06, + "loss": 0.3836, "step": 112945 }, { - "epoch": 3.97, - "learning_rate": 5.3356058512791765e-06, - "loss": 0.235, + "epoch": 4.070710347064548, + "grad_norm": 0.2074027806520462, + "learning_rate": 4.4071093738991125e-06, + "loss": 0.3487, "step": 112950 }, { - "epoch": 3.97, - "learning_rate": 5.333846911264975e-06, - "loss": 0.248, + "epoch": 4.0708905467257726, + "grad_norm": 0.271662175655365, + "learning_rate": 4.405454923225491e-06, + "loss": 0.3715, "step": 112955 }, { - "epoch": 3.97, - "learning_rate": 5.3320882266028135e-06, - "loss": 0.2477, + "epoch": 4.071070746386996, + "grad_norm": 0.23517721891403198, + "learning_rate": 4.40380075314856e-06, + "loss": 0.3949, "step": 112960 }, { - "epoch": 3.97, - "learning_rate": 5.330329797315517e-06, - "loss": 0.2536, + "epoch": 4.071250946048221, + "grad_norm": 0.24322974681854248, + "learning_rate": 4.402146863690854e-06, + "loss": 0.384, "step": 112965 }, { - "epoch": 3.97, - "learning_rate": 5.3285716234259316e-06, - "loss": 0.2584, + "epoch": 4.071431145709446, + "grad_norm": 
0.2979622781276703, + "learning_rate": 4.400493254874902e-06, + "loss": 0.3898, "step": 112970 }, { - "epoch": 3.97, - "learning_rate": 5.326813704956879e-06, - "loss": 0.2535, + "epoch": 4.071611345370671, + "grad_norm": 0.19745075702667236, + "learning_rate": 4.3988399267232555e-06, + "loss": 0.3579, "step": 112975 }, { - "epoch": 3.97, - "learning_rate": 5.325056041931187e-06, - "loss": 0.2336, + "epoch": 4.071791545031895, + "grad_norm": 0.23466679453849792, + "learning_rate": 4.397186879258419e-06, + "loss": 0.3682, "step": 112980 }, { - "epoch": 3.98, - "learning_rate": 5.323298634371668e-06, - "loss": 0.2672, + "epoch": 4.07197174469312, + "grad_norm": 0.18801245093345642, + "learning_rate": 4.39553411250292e-06, + "loss": 0.3595, "step": 112985 }, { - "epoch": 3.98, - "learning_rate": 5.321541482301157e-06, - "loss": 0.2798, + "epoch": 4.072151944354345, + "grad_norm": 0.23627303540706635, + "learning_rate": 4.393881626479282e-06, + "loss": 0.3774, "step": 112990 }, { - "epoch": 3.98, - "learning_rate": 5.319784585742455e-06, - "loss": 0.2321, + "epoch": 4.0723321440155695, + "grad_norm": 0.28023478388786316, + "learning_rate": 4.3922294212100175e-06, + "loss": 0.3824, "step": 112995 }, { - "epoch": 3.98, - "learning_rate": 5.318027944718379e-06, - "loss": 0.2488, + "epoch": 4.072512343676794, + "grad_norm": 0.2806805372238159, + "learning_rate": 4.390577496717638e-06, + "loss": 0.4054, "step": 113000 }, { - "epoch": 3.98, - "eval_loss": 0.25009918212890625, - "eval_runtime": 10.5513, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 4.072512343676794, + "eval_loss": 0.4291308522224426, + "eval_runtime": 3.5302, + "eval_samples_per_second": 28.327, + "eval_steps_per_second": 7.082, "step": 113000 }, { - "epoch": 3.98, - "learning_rate": 5.316271559251745e-06, - "loss": 0.243, + "epoch": 4.072692543338018, + "grad_norm": 0.30133917927742004, + "learning_rate": 4.3889258530246505e-06, + "loss": 0.3692, "step": 113005 }, { - "epoch": 3.98, - "learning_rate": 5.314515429365355e-06, - "loss": 0.2529, + "epoch": 4.072872742999243, + "grad_norm": 0.22192399203777313, + "learning_rate": 4.387274490153551e-06, + "loss": 0.3635, "step": 113010 }, { - "epoch": 3.98, - "learning_rate": 5.312759555082006e-06, - "loss": 0.2655, + "epoch": 4.073052942660468, + "grad_norm": 0.23747487366199493, + "learning_rate": 4.385623408126852e-06, + "loss": 0.3788, "step": 113015 }, { - "epoch": 3.98, - "learning_rate": 5.311003936424497e-06, - "loss": 0.2353, + "epoch": 4.073233142321692, + "grad_norm": 0.25482213497161865, + "learning_rate": 4.383972606967044e-06, + "loss": 0.3497, "step": 113020 }, { - "epoch": 3.98, - "learning_rate": 5.309248573415629e-06, - "loss": 0.2534, + "epoch": 4.073413341982917, + "grad_norm": 0.2351936399936676, + "learning_rate": 4.382322086696616e-06, + "loss": 0.3412, "step": 113025 }, { - "epoch": 3.98, - "learning_rate": 5.307493466078195e-06, - "loss": 0.2704, + "epoch": 4.073593541644142, + "grad_norm": 0.21274252235889435, + "learning_rate": 4.380671847338061e-06, + "loss": 0.3827, "step": 113030 }, { - "epoch": 3.98, - "learning_rate": 5.305738614434977e-06, - "loss": 0.262, + "epoch": 4.0737737413053665, + "grad_norm": 0.26760604977607727, + "learning_rate": 4.379021888913851e-06, + "loss": 0.4002, "step": 113035 }, { - "epoch": 3.98, - "learning_rate": 5.303984018508759e-06, - "loss": 0.254, + "epoch": 4.073953940966591, + "grad_norm": 0.28649207949638367, + "learning_rate": 4.377372211446482e-06, + "loss": 0.3985, "step": 113040 }, { - 
"epoch": 3.98, - "learning_rate": 5.302229678322338e-06, - "loss": 0.2676, + "epoch": 4.074134140627816, + "grad_norm": 0.23005342483520508, + "learning_rate": 4.37572281495843e-06, + "loss": 0.3485, "step": 113045 }, { - "epoch": 3.98, - "learning_rate": 5.300475593898471e-06, - "loss": 0.2601, + "epoch": 4.07431434028904, + "grad_norm": 0.24847397208213806, + "learning_rate": 4.37407369947215e-06, + "loss": 0.3606, "step": 113050 }, { - "epoch": 3.98, - "learning_rate": 5.298721765259956e-06, - "loss": 0.27, + "epoch": 4.074494539950265, + "grad_norm": 0.2495976686477661, + "learning_rate": 4.37242486501013e-06, + "loss": 0.4039, "step": 113055 }, { - "epoch": 3.98, - "learning_rate": 5.296968192429549e-06, - "loss": 0.2447, + "epoch": 4.074674739611489, + "grad_norm": 0.25559067726135254, + "learning_rate": 4.370776311594826e-06, + "loss": 0.3642, "step": 113060 }, { - "epoch": 3.98, - "learning_rate": 5.295214875430032e-06, - "loss": 0.249, + "epoch": 4.074854939272714, + "grad_norm": 0.22897782921791077, + "learning_rate": 4.369128039248699e-06, + "loss": 0.3976, "step": 113065 }, { - "epoch": 3.98, - "learning_rate": 5.293461814284162e-06, - "loss": 0.25, + "epoch": 4.075035138933939, + "grad_norm": 0.24675941467285156, + "learning_rate": 4.36748004799421e-06, + "loss": 0.4089, "step": 113070 }, { - "epoch": 3.98, - "learning_rate": 5.291709009014706e-06, - "loss": 0.2598, + "epoch": 4.0752153385951635, + "grad_norm": 0.30933111906051636, + "learning_rate": 4.365832337853806e-06, + "loss": 0.3718, "step": 113075 }, { - "epoch": 3.98, - "learning_rate": 5.28995645964441e-06, - "loss": 0.2613, + "epoch": 4.075395538256388, + "grad_norm": 0.2843681871891022, + "learning_rate": 4.364184908849947e-06, + "loss": 0.3674, "step": 113080 }, { - "epoch": 3.98, - "learning_rate": 5.288204166196048e-06, - "loss": 0.2532, + "epoch": 4.075575737917613, + "grad_norm": 0.20400385558605194, + "learning_rate": 4.362537761005073e-06, + "loss": 0.4067, "step": 113085 }, { - "epoch": 3.98, - "learning_rate": 5.286452128692365e-06, - "loss": 0.2574, + "epoch": 4.075755937578838, + "grad_norm": 0.2532506585121155, + "learning_rate": 4.3608908943416264e-06, + "loss": 0.3598, "step": 113090 }, { - "epoch": 3.98, - "learning_rate": 5.284700347156105e-06, - "loss": 0.2614, + "epoch": 4.0759361372400615, + "grad_norm": 0.23435381054878235, + "learning_rate": 4.359244308882043e-06, + "loss": 0.3748, "step": 113095 }, { - "epoch": 3.98, - "learning_rate": 5.28294882161002e-06, - "loss": 0.2657, + "epoch": 4.076116336901286, + "grad_norm": 0.2436995804309845, + "learning_rate": 4.357598004648763e-06, + "loss": 0.3937, "step": 113100 }, { - "epoch": 3.98, - "learning_rate": 5.281197552076847e-06, - "loss": 0.2509, + "epoch": 4.076296536562511, + "grad_norm": 0.26135680079460144, + "learning_rate": 4.355951981664208e-06, + "loss": 0.3677, "step": 113105 }, { - "epoch": 3.98, - "learning_rate": 5.279446538579336e-06, - "loss": 0.2641, + "epoch": 4.076476736223736, + "grad_norm": 0.3116925060749054, + "learning_rate": 4.354306239950814e-06, + "loss": 0.3898, "step": 113110 }, { - "epoch": 3.98, - "learning_rate": 5.2776957811402044e-06, - "loss": 0.2451, + "epoch": 4.0766569358849605, + "grad_norm": 0.25382331013679504, + "learning_rate": 4.352660779531001e-06, + "loss": 0.4214, "step": 113115 }, { - "epoch": 3.98, - "learning_rate": 5.275945279782207e-06, - "loss": 0.259, + "epoch": 4.076837135546185, + "grad_norm": 0.2189731001853943, + "learning_rate": 4.3510156004271886e-06, + "loss": 0.3645, "step": 113120 }, { - 
"epoch": 3.98, - "learning_rate": 5.27419503452806e-06, - "loss": 0.2733, + "epoch": 4.07701733520741, + "grad_norm": 0.26065686345100403, + "learning_rate": 4.3493707026617894e-06, + "loss": 0.3576, "step": 113125 }, { - "epoch": 3.98, - "learning_rate": 5.272445045400489e-06, - "loss": 0.273, + "epoch": 4.077197534868635, + "grad_norm": 0.25834450125694275, + "learning_rate": 4.347726086257212e-06, + "loss": 0.3675, "step": 113130 }, { - "epoch": 3.98, - "learning_rate": 5.270695312422214e-06, - "loss": 0.2471, + "epoch": 4.077377734529859, + "grad_norm": 0.28615930676460266, + "learning_rate": 4.346081751235873e-06, + "loss": 0.3697, "step": 113135 }, { - "epoch": 3.98, - "learning_rate": 5.268945835615962e-06, - "loss": 0.247, + "epoch": 4.077557934191084, + "grad_norm": 0.27379459142684937, + "learning_rate": 4.344437697620174e-06, + "loss": 0.3802, "step": 113140 }, { - "epoch": 3.98, - "learning_rate": 5.267196615004446e-06, - "loss": 0.2455, + "epoch": 4.077738133852308, + "grad_norm": 0.27598029375076294, + "learning_rate": 4.342793925432509e-06, + "loss": 0.3636, "step": 113145 }, { - "epoch": 3.98, - "learning_rate": 5.265447650610381e-06, - "loss": 0.2811, + "epoch": 4.077918333513533, + "grad_norm": 0.2820260226726532, + "learning_rate": 4.341150434695279e-06, + "loss": 0.3939, "step": 113150 }, { - "epoch": 3.98, - "learning_rate": 5.263698942456463e-06, - "loss": 0.255, + "epoch": 4.0780985331747575, + "grad_norm": 0.25555455684661865, + "learning_rate": 4.339507225430867e-06, + "loss": 0.3348, "step": 113155 }, { - "epoch": 3.98, - "learning_rate": 5.261950490565407e-06, - "loss": 0.2532, + "epoch": 4.078278732835982, + "grad_norm": 0.2568057179450989, + "learning_rate": 4.337864297661684e-06, + "loss": 0.3276, "step": 113160 }, { - "epoch": 3.98, - "learning_rate": 5.260202294959923e-06, - "loss": 0.2497, + "epoch": 4.078458932497207, + "grad_norm": 0.2430773675441742, + "learning_rate": 4.336221651410091e-06, + "loss": 0.3626, "step": 113165 }, { - "epoch": 3.98, - "learning_rate": 5.258454355662704e-06, - "loss": 0.2666, + "epoch": 4.078639132158432, + "grad_norm": 0.26786041259765625, + "learning_rate": 4.334579286698473e-06, + "loss": 0.3538, "step": 113170 }, { - "epoch": 3.98, - "learning_rate": 5.256706672696438e-06, - "loss": 0.2644, + "epoch": 4.078819331819656, + "grad_norm": 0.23630788922309875, + "learning_rate": 4.3329372035492145e-06, + "loss": 0.3774, "step": 113175 }, { - "epoch": 3.98, - "learning_rate": 5.254959246083832e-06, - "loss": 0.2605, + "epoch": 4.078999531480881, + "grad_norm": 0.2112296223640442, + "learning_rate": 4.3312954019846894e-06, + "loss": 0.3677, "step": 113180 }, { - "epoch": 3.98, - "learning_rate": 5.253212075847566e-06, - "loss": 0.2268, + "epoch": 4.079179731142106, + "grad_norm": 0.20694564282894135, + "learning_rate": 4.32965388202726e-06, + "loss": 0.3376, "step": 113185 }, { - "epoch": 3.98, - "learning_rate": 5.251465162010327e-06, - "loss": 0.271, + "epoch": 4.07935993080333, + "grad_norm": 0.19612739980220795, + "learning_rate": 4.3280126436992944e-06, + "loss": 0.3649, "step": 113190 }, { - "epoch": 3.98, - "learning_rate": 5.249718504594794e-06, - "loss": 0.2515, + "epoch": 4.079540130464554, + "grad_norm": 0.2163010984659195, + "learning_rate": 4.32637168702315e-06, + "loss": 0.3567, "step": 113195 }, { - "epoch": 3.98, - "learning_rate": 5.247972103623655e-06, - "loss": 0.2726, + "epoch": 4.079720330125779, + "grad_norm": 0.27228453755378723, + "learning_rate": 4.324731012021193e-06, + "loss": 0.3773, "step": 113200 }, 
{ - "epoch": 3.98, - "learning_rate": 5.246225959119583e-06, - "loss": 0.2534, + "epoch": 4.079900529787004, + "grad_norm": 0.28055036067962646, + "learning_rate": 4.323090618715775e-06, + "loss": 0.3608, "step": 113205 }, { - "epoch": 3.98, - "learning_rate": 5.244480071105243e-06, - "loss": 0.258, + "epoch": 4.080080729448229, + "grad_norm": 0.2570863366127014, + "learning_rate": 4.321450507129243e-06, + "loss": 0.3704, "step": 113210 }, { - "epoch": 3.98, - "learning_rate": 5.242734439603306e-06, - "loss": 0.2804, + "epoch": 4.080260929109453, + "grad_norm": 0.262403666973114, + "learning_rate": 4.319810677283945e-06, + "loss": 0.3805, "step": 113215 }, { - "epoch": 3.98, - "learning_rate": 5.240989064636451e-06, - "loss": 0.246, + "epoch": 4.080441128770678, + "grad_norm": 0.31988710165023804, + "learning_rate": 4.318171129202217e-06, + "loss": 0.3807, "step": 113220 }, { - "epoch": 3.98, - "learning_rate": 5.2392439462273315e-06, - "loss": 0.2575, + "epoch": 4.080621328431903, + "grad_norm": 0.2832499146461487, + "learning_rate": 4.316531862906409e-06, + "loss": 0.3679, "step": 113225 }, { - "epoch": 3.98, - "learning_rate": 5.2374990843985975e-06, - "loss": 0.2712, + "epoch": 4.080801528093128, + "grad_norm": 0.3199155330657959, + "learning_rate": 4.314892878418855e-06, + "loss": 0.3587, "step": 113230 }, { - "epoch": 3.98, - "learning_rate": 5.235754479172925e-06, - "loss": 0.2583, + "epoch": 4.080981727754351, + "grad_norm": 0.3142116069793701, + "learning_rate": 4.313254175761869e-06, + "loss": 0.3666, "step": 113235 }, { - "epoch": 3.98, - "learning_rate": 5.23401013057295e-06, - "loss": 0.2454, + "epoch": 4.081161927415576, + "grad_norm": 0.2442038655281067, + "learning_rate": 4.311615754957796e-06, + "loss": 0.3704, "step": 113240 }, { - "epoch": 3.98, - "learning_rate": 5.2322660386213306e-06, - "loss": 0.2527, + "epoch": 4.081342127076801, + "grad_norm": 0.2437644749879837, + "learning_rate": 4.309977616028954e-06, + "loss": 0.3603, "step": 113245 }, { - "epoch": 3.98, - "learning_rate": 5.2305222033407015e-06, - "loss": 0.2681, + "epoch": 4.081522326738026, + "grad_norm": 0.24333834648132324, + "learning_rate": 4.3083397589976535e-06, + "loss": 0.3627, "step": 113250 }, { - "epoch": 3.98, - "learning_rate": 5.2287786247537215e-06, - "loss": 0.2444, + "epoch": 4.08170252639925, + "grad_norm": 0.2943533957004547, + "learning_rate": 4.30670218388623e-06, + "loss": 0.3846, "step": 113255 }, { - "epoch": 3.98, - "learning_rate": 5.227035302883018e-06, - "loss": 0.2593, + "epoch": 4.081882726060475, + "grad_norm": 0.24343356490135193, + "learning_rate": 4.305064890716973e-06, + "loss": 0.3757, "step": 113260 }, { - "epoch": 3.98, - "learning_rate": 5.225292237751225e-06, - "loss": 0.2631, + "epoch": 4.0820629257217, + "grad_norm": 0.22839723527431488, + "learning_rate": 4.3034278795122035e-06, + "loss": 0.3877, "step": 113265 }, { - "epoch": 3.99, - "learning_rate": 5.223549429380983e-06, - "loss": 0.2598, + "epoch": 4.0822431253829246, + "grad_norm": 0.3169880211353302, + "learning_rate": 4.301791150294224e-06, + "loss": 0.4014, "step": 113270 }, { - "epoch": 3.99, - "learning_rate": 5.221806877794922e-06, - "loss": 0.2705, + "epoch": 4.082423325044149, + "grad_norm": 0.24656805396080017, + "learning_rate": 4.300154703085332e-06, + "loss": 0.3673, "step": 113275 }, { - "epoch": 3.99, - "learning_rate": 5.220064583015666e-06, - "loss": 0.2551, + "epoch": 4.082603524705373, + "grad_norm": 0.2953994870185852, + "learning_rate": 4.298518537907826e-06, + "loss": 0.3797, "step": 113280 
}, { - "epoch": 3.99, - "learning_rate": 5.218322545065834e-06, - "loss": 0.2478, + "epoch": 4.082783724366598, + "grad_norm": 0.2698920965194702, + "learning_rate": 4.296882654783993e-06, + "loss": 0.3937, "step": 113285 }, { - "epoch": 3.99, - "learning_rate": 5.216580763968043e-06, - "loss": 0.2449, + "epoch": 4.082963924027823, + "grad_norm": 0.27484050393104553, + "learning_rate": 4.295247053736124e-06, + "loss": 0.3645, "step": 113290 }, { - "epoch": 3.99, - "learning_rate": 5.214839239744917e-06, - "loss": 0.2426, + "epoch": 4.083144123689047, + "grad_norm": 0.22406524419784546, + "learning_rate": 4.293611734786507e-06, + "loss": 0.3469, "step": 113295 }, { - "epoch": 3.99, - "learning_rate": 5.2130979724190644e-06, - "loss": 0.2532, + "epoch": 4.083324323350272, + "grad_norm": 0.22183670103549957, + "learning_rate": 4.291976697957425e-06, + "loss": 0.3374, "step": 113300 }, { - "epoch": 3.99, - "learning_rate": 5.211356962013098e-06, - "loss": 0.2717, + "epoch": 4.083504523011497, + "grad_norm": 0.25425711274147034, + "learning_rate": 4.290341943271151e-06, + "loss": 0.3478, "step": 113305 }, { - "epoch": 3.99, - "learning_rate": 5.209616208549611e-06, - "loss": 0.259, + "epoch": 4.0836847226727215, + "grad_norm": 0.26116448640823364, + "learning_rate": 4.288707470749956e-06, + "loss": 0.3491, "step": 113310 }, { - "epoch": 3.99, - "learning_rate": 5.2078757120512215e-06, - "loss": 0.2383, + "epoch": 4.083864922333946, + "grad_norm": 0.21055485308170319, + "learning_rate": 4.287073280416107e-06, + "loss": 0.3569, "step": 113315 }, { - "epoch": 3.99, - "learning_rate": 5.206135472540519e-06, - "loss": 0.2458, + "epoch": 4.084045121995171, + "grad_norm": 0.24054095149040222, + "learning_rate": 4.28543937229188e-06, + "loss": 0.3747, "step": 113320 }, { - "epoch": 3.99, - "learning_rate": 5.204395490040106e-06, - "loss": 0.2585, + "epoch": 4.084225321656396, + "grad_norm": 0.2076614648103714, + "learning_rate": 4.283805746399533e-06, + "loss": 0.374, "step": 113325 }, { - "epoch": 3.99, - "learning_rate": 5.202655764572567e-06, - "loss": 0.2655, + "epoch": 4.08440552131762, + "grad_norm": 0.2705661356449127, + "learning_rate": 4.2821724027613224e-06, + "loss": 0.3778, "step": 113330 }, { - "epoch": 3.99, - "learning_rate": 5.200916296160502e-06, - "loss": 0.2825, + "epoch": 4.084585720978844, + "grad_norm": 0.20731863379478455, + "learning_rate": 4.280539341399497e-06, + "loss": 0.3833, "step": 113335 }, { - "epoch": 3.99, - "learning_rate": 5.1991770848264894e-06, - "loss": 0.2488, + "epoch": 4.084765920640069, + "grad_norm": 0.2571890950202942, + "learning_rate": 4.27890656233631e-06, + "loss": 0.3689, "step": 113340 }, { - "epoch": 3.99, - "learning_rate": 5.197438130593107e-06, - "loss": 0.2558, + "epoch": 4.084946120301294, + "grad_norm": 0.2876233756542206, + "learning_rate": 4.27727406559402e-06, + "loss": 0.3643, "step": 113345 }, { - "epoch": 3.99, - "learning_rate": 5.195699433482948e-06, - "loss": 0.2687, + "epoch": 4.0851263199625185, + "grad_norm": 0.21950480341911316, + "learning_rate": 4.275641851194853e-06, + "loss": 0.3595, "step": 113350 }, { - "epoch": 3.99, - "learning_rate": 5.193960993518579e-06, - "loss": 0.2517, + "epoch": 4.085306519623743, + "grad_norm": 0.2071562260389328, + "learning_rate": 4.274009919161048e-06, + "loss": 0.3788, "step": 113355 }, { - "epoch": 3.99, - "learning_rate": 5.1922228107225746e-06, - "loss": 0.2554, + "epoch": 4.085486719284968, + "grad_norm": 0.2707284688949585, + "learning_rate": 4.272378269514851e-06, + "loss": 0.3466, "step": 
113360 }, { - "epoch": 3.99, - "learning_rate": 5.190484885117497e-06, - "loss": 0.2562, + "epoch": 4.085666918946193, + "grad_norm": 0.22272071242332458, + "learning_rate": 4.2707469022784804e-06, + "loss": 0.3734, "step": 113365 }, { - "epoch": 3.99, - "learning_rate": 5.188747216725925e-06, - "loss": 0.2593, + "epoch": 4.0858471186074174, + "grad_norm": 0.26984214782714844, + "learning_rate": 4.269115817474181e-06, + "loss": 0.3943, "step": 113370 }, { - "epoch": 3.99, - "learning_rate": 5.187009805570406e-06, - "loss": 0.2627, + "epoch": 4.086027318268641, + "grad_norm": 0.23475557565689087, + "learning_rate": 4.267485015124162e-06, + "loss": 0.4002, "step": 113375 }, { - "epoch": 3.99, - "learning_rate": 5.1852726516735175e-06, - "loss": 0.2581, + "epoch": 4.086207517929866, + "grad_norm": 0.2445220947265625, + "learning_rate": 4.265854495250638e-06, + "loss": 0.3855, "step": 113380 }, { - "epoch": 3.99, - "learning_rate": 5.1835357550577945e-06, - "loss": 0.2622, + "epoch": 4.086387717591091, + "grad_norm": 0.3065510392189026, + "learning_rate": 4.264224257875837e-06, + "loss": 0.3648, "step": 113385 }, { - "epoch": 3.99, - "learning_rate": 5.181799115745809e-06, - "loss": 0.2618, + "epoch": 4.0865679172523155, + "grad_norm": 0.25749513506889343, + "learning_rate": 4.262594303021966e-06, + "loss": 0.3851, "step": 113390 }, { - "epoch": 3.99, - "learning_rate": 5.180062733760102e-06, - "loss": 0.2673, + "epoch": 4.08674811691354, + "grad_norm": 0.307963103055954, + "learning_rate": 4.260964630711234e-06, + "loss": 0.3856, "step": 113395 }, { - "epoch": 3.99, - "learning_rate": 5.178326609123216e-06, - "loss": 0.2467, + "epoch": 4.086928316574765, + "grad_norm": 0.2993742525577545, + "learning_rate": 4.2593352409658445e-06, + "loss": 0.4103, "step": 113400 }, { - "epoch": 3.99, - "learning_rate": 5.176590741857689e-06, - "loss": 0.28, + "epoch": 4.08710851623599, + "grad_norm": 0.28788864612579346, + "learning_rate": 4.25770613380799e-06, + "loss": 0.3724, "step": 113405 }, { - "epoch": 3.99, - "learning_rate": 5.174855131986079e-06, - "loss": 0.2534, + "epoch": 4.087288715897214, + "grad_norm": 0.22574250400066376, + "learning_rate": 4.2560773092598786e-06, + "loss": 0.3732, "step": 113410 }, { - "epoch": 3.99, - "learning_rate": 5.173119779530905e-06, - "loss": 0.2566, + "epoch": 4.087468915558439, + "grad_norm": 0.21905584633350372, + "learning_rate": 4.2544487673436996e-06, + "loss": 0.368, "step": 113415 }, { - "epoch": 3.99, - "learning_rate": 5.171384684514705e-06, - "loss": 0.2594, + "epoch": 4.087649115219663, + "grad_norm": 0.20254629850387573, + "learning_rate": 4.252820508081637e-06, + "loss": 0.3668, "step": 113420 }, { - "epoch": 3.99, - "learning_rate": 5.169649846960001e-06, - "loss": 0.2377, + "epoch": 4.087829314880888, + "grad_norm": 0.24264578521251678, + "learning_rate": 4.251192531495879e-06, + "loss": 0.3633, "step": 113425 }, { - "epoch": 3.99, - "learning_rate": 5.1679152668893255e-06, - "loss": 0.2462, + "epoch": 4.0880095145421125, + "grad_norm": 0.25658249855041504, + "learning_rate": 4.249564837608608e-06, + "loss": 0.3669, "step": 113430 }, { - "epoch": 3.99, - "learning_rate": 5.166180944325208e-06, - "loss": 0.2307, + "epoch": 4.088189714203337, + "grad_norm": 0.27399855852127075, + "learning_rate": 4.247937426441989e-06, + "loss": 0.3615, "step": 113435 }, { - "epoch": 3.99, - "learning_rate": 5.164446879290158e-06, - "loss": 0.2457, + "epoch": 4.088369913864562, + "grad_norm": 0.22151198983192444, + "learning_rate": 4.2463102980182216e-06, + "loss": 
0.3419, "step": 113440 }, { - "epoch": 3.99, - "learning_rate": 5.162713071806688e-06, - "loss": 0.2578, + "epoch": 4.088550113525787, + "grad_norm": 0.23478932678699493, + "learning_rate": 4.244683452359443e-06, + "loss": 0.3594, "step": 113445 }, { - "epoch": 3.99, - "learning_rate": 5.160979521897322e-06, - "loss": 0.2747, + "epoch": 4.088730313187011, + "grad_norm": 0.19280430674552917, + "learning_rate": 4.24305688948784e-06, + "loss": 0.3518, "step": 113450 }, { - "epoch": 3.99, - "learning_rate": 5.159246229584561e-06, - "loss": 0.2453, + "epoch": 4.088910512848236, + "grad_norm": 0.2612258195877075, + "learning_rate": 4.24143060942557e-06, + "loss": 0.3779, "step": 113455 }, { - "epoch": 3.99, - "learning_rate": 5.157513194890906e-06, - "loss": 0.2429, + "epoch": 4.089090712509461, + "grad_norm": 0.24851427972316742, + "learning_rate": 4.239804612194787e-06, + "loss": 0.3803, "step": 113460 }, { - "epoch": 3.99, - "learning_rate": 5.155780417838874e-06, - "loss": 0.2773, + "epoch": 4.089270912170685, + "grad_norm": 0.24412818253040314, + "learning_rate": 4.238178897817646e-06, + "loss": 0.3736, "step": 113465 }, { - "epoch": 3.99, - "learning_rate": 5.154047898450954e-06, - "loss": 0.2238, + "epoch": 4.0894511118319095, + "grad_norm": 0.2470037341117859, + "learning_rate": 4.2365534663163005e-06, + "loss": 0.3824, "step": 113470 }, { - "epoch": 3.99, - "learning_rate": 5.152315636749641e-06, - "loss": 0.2749, + "epoch": 4.089631311493134, + "grad_norm": 0.27560508251190186, + "learning_rate": 4.234928317712888e-06, + "loss": 0.3997, "step": 113475 }, { - "epoch": 3.99, - "learning_rate": 5.150583632757427e-06, - "loss": 0.2671, + "epoch": 4.089811511154359, + "grad_norm": 0.29321983456611633, + "learning_rate": 4.233303452029561e-06, + "loss": 0.3753, "step": 113480 }, { - "epoch": 3.99, - "learning_rate": 5.148851886496809e-06, - "loss": 0.2504, + "epoch": 4.089991710815584, + "grad_norm": 0.26840469241142273, + "learning_rate": 4.231678869288455e-06, + "loss": 0.3921, "step": 113485 }, { - "epoch": 3.99, - "learning_rate": 5.147120397990257e-06, - "loss": 0.2435, + "epoch": 4.090171910476808, + "grad_norm": 0.18927091360092163, + "learning_rate": 4.230054569511705e-06, + "loss": 0.3568, "step": 113490 }, { - "epoch": 3.99, - "learning_rate": 5.145389167260273e-06, - "loss": 0.2397, + "epoch": 4.090352110138033, + "grad_norm": 0.2522876262664795, + "learning_rate": 4.228430552721438e-06, + "loss": 0.351, "step": 113495 }, { - "epoch": 3.99, - "learning_rate": 5.143658194329318e-06, - "loss": 0.2781, + "epoch": 4.090532309799258, + "grad_norm": 0.30679717659950256, + "learning_rate": 4.226806818939777e-06, + "loss": 0.3625, "step": 113500 }, { - "epoch": 3.99, - "eval_loss": 0.2501179575920105, - "eval_runtime": 10.5566, - "eval_samples_per_second": 9.473, - "eval_steps_per_second": 9.473, + "epoch": 4.090532309799258, + "eval_loss": 0.4292847514152527, + "eval_runtime": 3.5606, + "eval_samples_per_second": 28.085, + "eval_steps_per_second": 7.021, "step": 113500 }, { - "epoch": 3.99, - "learning_rate": 5.14192747921988e-06, - "loss": 0.2406, + "epoch": 4.090712509460483, + "grad_norm": 0.28912264108657837, + "learning_rate": 4.225183368188859e-06, + "loss": 0.3747, "step": 113505 }, { - "epoch": 3.99, - "learning_rate": 5.1401970219544296e-06, - "loss": 0.2449, + "epoch": 4.090892709121706, + "grad_norm": 0.2808322012424469, + "learning_rate": 4.223560200490801e-06, + "loss": 0.355, "step": 113510 }, { - "epoch": 3.99, - "learning_rate": 5.138466822555432e-06, - "loss": 0.2581, + 
"epoch": 4.091072908782931, + "grad_norm": 0.27322959899902344, + "learning_rate": 4.221937315867702e-06, + "loss": 0.3809, "step": 113515 }, { - "epoch": 3.99, - "learning_rate": 5.136736881045348e-06, - "loss": 0.2583, + "epoch": 4.091253108444156, + "grad_norm": 0.22654719650745392, + "learning_rate": 4.220314714341689e-06, + "loss": 0.3452, "step": 113520 }, { - "epoch": 3.99, - "learning_rate": 5.13500719744665e-06, - "loss": 0.2618, + "epoch": 4.091433308105381, + "grad_norm": 0.28565314412117004, + "learning_rate": 4.218692395934856e-06, + "loss": 0.3838, "step": 113525 }, { - "epoch": 3.99, - "learning_rate": 5.133277771781794e-06, - "loss": 0.2484, + "epoch": 4.091613507766605, + "grad_norm": 0.24002893269062042, + "learning_rate": 4.217070360669325e-06, + "loss": 0.3882, "step": 113530 }, { - "epoch": 3.99, - "learning_rate": 5.131548604073225e-06, - "loss": 0.2693, + "epoch": 4.09179370742783, + "grad_norm": 0.28343817591667175, + "learning_rate": 4.215448608567193e-06, + "loss": 0.3902, "step": 113535 }, { - "epoch": 3.99, - "learning_rate": 5.12981969434341e-06, - "loss": 0.247, + "epoch": 4.091973907089055, + "grad_norm": 0.2013174593448639, + "learning_rate": 4.213827139650536e-06, + "loss": 0.3459, "step": 113540 }, { - "epoch": 3.99, - "learning_rate": 5.128091042614785e-06, - "loss": 0.2388, + "epoch": 4.09215410675028, + "grad_norm": 0.3252928555011749, + "learning_rate": 4.212205953941467e-06, + "loss": 0.3473, "step": 113545 }, { - "epoch": 4.0, - "learning_rate": 5.1263626489098106e-06, - "loss": 0.2597, + "epoch": 4.092334306411504, + "grad_norm": 0.20931507647037506, + "learning_rate": 4.2105850514620625e-06, + "loss": 0.3574, "step": 113550 }, { - "epoch": 4.0, - "learning_rate": 5.124634513250917e-06, - "loss": 0.2461, + "epoch": 4.092514506072728, + "grad_norm": 0.21451285481452942, + "learning_rate": 4.208964432234422e-06, + "loss": 0.4097, "step": 113555 }, { - "epoch": 4.0, - "learning_rate": 5.12290663566054e-06, - "loss": 0.2549, + "epoch": 4.092694705733953, + "grad_norm": 0.23355019092559814, + "learning_rate": 4.20734409628061e-06, + "loss": 0.348, "step": 113560 }, { - "epoch": 4.0, - "learning_rate": 5.121179016161129e-06, - "loss": 0.2586, + "epoch": 4.092874905395178, + "grad_norm": 0.21140533685684204, + "learning_rate": 4.2057240436227025e-06, + "loss": 0.3602, "step": 113565 }, { - "epoch": 4.0, - "learning_rate": 5.119451654775106e-06, - "loss": 0.2343, + "epoch": 4.093055105056402, + "grad_norm": 0.22797791659832, + "learning_rate": 4.2041042742827855e-06, + "loss": 0.3894, "step": 113570 }, { - "epoch": 4.0, - "learning_rate": 5.117724551524897e-06, - "loss": 0.2777, + "epoch": 4.093235304717627, + "grad_norm": 0.2819134593009949, + "learning_rate": 4.202484788282923e-06, + "loss": 0.3677, "step": 113575 }, { - "epoch": 4.0, - "learning_rate": 5.115997706432937e-06, - "loss": 0.2766, + "epoch": 4.093415504378852, + "grad_norm": 0.27418237924575806, + "learning_rate": 4.200865585645178e-06, + "loss": 0.3588, "step": 113580 }, { - "epoch": 4.0, - "learning_rate": 5.114271119521644e-06, - "loss": 0.2655, + "epoch": 4.0935957040400766, + "grad_norm": 0.258755087852478, + "learning_rate": 4.19924666639161e-06, + "loss": 0.3634, "step": 113585 }, { - "epoch": 4.0, - "learning_rate": 5.1125447908134335e-06, - "loss": 0.2774, + "epoch": 4.093775903701301, + "grad_norm": 0.2556750476360321, + "learning_rate": 4.197628030544276e-06, + "loss": 0.3824, "step": 113590 }, { - "epoch": 4.0, - "learning_rate": 5.110818720330715e-06, - "loss": 0.2562, + "epoch": 
4.093956103362526, + "grad_norm": 0.19411814212799072, + "learning_rate": 4.196009678125237e-06, + "loss": 0.3567, "step": 113595 }, { - "epoch": 4.0, - "learning_rate": 5.109092908095911e-06, - "loss": 0.2677, + "epoch": 4.094136303023751, + "grad_norm": 0.23082207143306732, + "learning_rate": 4.19439160915654e-06, + "loss": 0.3516, "step": 113600 }, { - "epoch": 4.0, - "learning_rate": 5.107367354131432e-06, - "loss": 0.2523, + "epoch": 4.094316502684975, + "grad_norm": 0.26558494567871094, + "learning_rate": 4.192773823660229e-06, + "loss": 0.4062, "step": 113605 }, { - "epoch": 4.0, - "learning_rate": 5.105642058459678e-06, - "loss": 0.2611, + "epoch": 4.094496702346199, + "grad_norm": 0.2534783184528351, + "learning_rate": 4.191156321658343e-06, + "loss": 0.3816, "step": 113610 }, { - "epoch": 4.0, - "learning_rate": 5.103917021103044e-06, - "loss": 0.2413, + "epoch": 4.094676902007424, + "grad_norm": 0.2528476417064667, + "learning_rate": 4.189539103172926e-06, + "loss": 0.355, "step": 113615 }, { - "epoch": 4.0, - "learning_rate": 5.102192242083945e-06, - "loss": 0.2559, + "epoch": 4.094857101668649, + "grad_norm": 0.29152292013168335, + "learning_rate": 4.1879221682260054e-06, + "loss": 0.3748, "step": 113620 }, { - "epoch": 4.0, - "learning_rate": 5.100467721424765e-06, - "loss": 0.2795, + "epoch": 4.0950373013298735, + "grad_norm": 0.22224609553813934, + "learning_rate": 4.186305516839625e-06, + "loss": 0.3615, "step": 113625 }, { - "epoch": 4.0, - "learning_rate": 5.098743459147895e-06, - "loss": 0.2676, + "epoch": 4.095217500991098, + "grad_norm": 0.2683086395263672, + "learning_rate": 4.184689149035792e-06, + "loss": 0.4004, "step": 113630 }, { - "epoch": 4.0, - "learning_rate": 5.09701945527572e-06, - "loss": 0.2559, + "epoch": 4.095397700652323, + "grad_norm": 0.2543949782848358, + "learning_rate": 4.183073064836545e-06, + "loss": 0.3536, "step": 113635 }, { - "epoch": 4.0, - "learning_rate": 5.095295709830638e-06, - "loss": 0.2421, + "epoch": 4.095577900313548, + "grad_norm": 0.28799569606781006, + "learning_rate": 4.181457264263897e-06, + "loss": 0.3799, "step": 113640 }, { - "epoch": 4.0, - "learning_rate": 5.093572222835025e-06, - "loss": 0.2599, + "epoch": 4.0957580999747725, + "grad_norm": 0.21432629227638245, + "learning_rate": 4.179841747339864e-06, + "loss": 0.3454, "step": 113645 }, { - "epoch": 4.0, - "learning_rate": 5.091848994311252e-06, - "loss": 0.253, + "epoch": 4.095938299635996, + "grad_norm": 0.256199449300766, + "learning_rate": 4.178226514086453e-06, + "loss": 0.3815, "step": 113650 }, { - "epoch": 4.0, - "learning_rate": 5.090126024281697e-06, - "loss": 0.2594, + "epoch": 4.096118499297221, + "grad_norm": 0.19477510452270508, + "learning_rate": 4.176611564525679e-06, + "loss": 0.3582, "step": 113655 }, { - "epoch": 4.0, - "learning_rate": 5.088403312768741e-06, - "loss": 0.275, + "epoch": 4.096298698958446, + "grad_norm": 0.2505654990673065, + "learning_rate": 4.1749968986795316e-06, + "loss": 0.3512, "step": 113660 }, { - "epoch": 4.0, - "learning_rate": 5.086680859794748e-06, - "loss": 0.2643, + "epoch": 4.0964788986196705, + "grad_norm": 0.26357826590538025, + "learning_rate": 4.173382516570026e-06, + "loss": 0.361, "step": 113665 }, { - "epoch": 4.0, - "learning_rate": 5.084958665382078e-06, - "loss": 0.2585, + "epoch": 4.096659098280895, + "grad_norm": 0.22375109791755676, + "learning_rate": 4.171768418219152e-06, + "loss": 0.3643, "step": 113670 }, { - "epoch": 4.0, - "learning_rate": 5.08323672955309e-06, - "loss": 0.2616, + "epoch": 
4.09683929794212, + "grad_norm": 0.2819434404373169, + "learning_rate": 4.170154603648901e-06, + "loss": 0.3794, "step": 113675 }, { - "epoch": 4.0, - "learning_rate": 5.081515052330154e-06, - "loss": 0.2442, + "epoch": 4.097019497603345, + "grad_norm": 0.24838387966156006, + "learning_rate": 4.168541072881263e-06, + "loss": 0.3224, "step": 113680 }, { - "epoch": 4.0, - "learning_rate": 5.079793633735619e-06, - "loss": 0.2518, + "epoch": 4.0971996972645695, + "grad_norm": 0.2065441608428955, + "learning_rate": 4.1669278259382104e-06, + "loss": 0.343, "step": 113685 }, { - "epoch": 4.0, - "learning_rate": 5.0780724737918285e-06, - "loss": 0.2479, + "epoch": 4.097379896925794, + "grad_norm": 0.2739092707633972, + "learning_rate": 4.16531486284174e-06, + "loss": 0.4081, "step": 113690 }, { - "epoch": 4.0, - "learning_rate": 5.076351572521146e-06, - "loss": 0.2412, + "epoch": 4.097560096587018, + "grad_norm": 0.2650372385978699, + "learning_rate": 4.1637021836138215e-06, + "loss": 0.3675, "step": 113695 }, { - "epoch": 4.0, - "learning_rate": 5.074630929945906e-06, - "loss": 0.2516, + "epoch": 4.097740296248243, + "grad_norm": 0.23538993299007416, + "learning_rate": 4.162089788276424e-06, + "loss": 0.397, "step": 113700 }, { - "epoch": 4.0, - "learning_rate": 5.0729105460884464e-06, - "loss": 0.2561, + "epoch": 4.0979204959094675, + "grad_norm": 0.2418733686208725, + "learning_rate": 4.160477676851523e-06, + "loss": 0.3946, "step": 113705 }, { - "epoch": 4.0, - "learning_rate": 5.071190420971112e-06, - "loss": 0.2434, + "epoch": 4.098100695570692, + "grad_norm": 0.2708457112312317, + "learning_rate": 4.158865849361071e-06, + "loss": 0.3647, "step": 113710 }, { - "epoch": 4.0, - "learning_rate": 5.06947055461624e-06, - "loss": 0.2354, + "epoch": 4.098280895231917, + "grad_norm": 0.24828830361366272, + "learning_rate": 4.157254305827041e-06, + "loss": 0.3341, "step": 113715 }, { - "epoch": 4.0, - "learning_rate": 5.06775094704616e-06, - "loss": 0.2413, + "epoch": 4.098461094893142, + "grad_norm": 0.27252304553985596, + "learning_rate": 4.1556430462713954e-06, + "loss": 0.3764, "step": 113720 }, { - "epoch": 4.0, - "learning_rate": 5.066031598283197e-06, - "loss": 0.2387, + "epoch": 4.098641294554366, + "grad_norm": 0.21500465273857117, + "learning_rate": 4.154032070716063e-06, + "loss": 0.3764, "step": 113725 }, { - "epoch": 4.0, - "learning_rate": 5.064312508349672e-06, - "loss": 0.2448, + "epoch": 4.098821494215591, + "grad_norm": 0.2509201467037201, + "learning_rate": 4.152421379183013e-06, + "loss": 0.3784, "step": 113730 }, { - "epoch": 4.0, - "learning_rate": 5.062593677267916e-06, - "loss": 0.2661, + "epoch": 4.099001693876816, + "grad_norm": 0.24733483791351318, + "learning_rate": 4.150810971694183e-06, + "loss": 0.3653, "step": 113735 }, { - "epoch": 4.0, - "learning_rate": 5.060875105060242e-06, - "loss": 0.2472, + "epoch": 4.09918189353804, + "grad_norm": 0.22112403810024261, + "learning_rate": 4.1492008482715254e-06, + "loss": 0.3734, "step": 113740 }, { - "epoch": 4.0, - "learning_rate": 5.059156791748965e-06, - "loss": 0.2515, + "epoch": 4.0993620931992645, + "grad_norm": 0.22997091710567474, + "learning_rate": 4.147591008936966e-06, + "loss": 0.3426, "step": 113745 }, { - "epoch": 4.0, - "learning_rate": 5.05743873735639e-06, - "loss": 0.258, + "epoch": 4.099542292860489, + "grad_norm": 0.21955984830856323, + "learning_rate": 4.145981453712436e-06, + "loss": 0.3695, "step": 113750 }, { - "epoch": 4.0, - "learning_rate": 5.055720941904835e-06, - "loss": 0.2551, + "epoch": 
4.099722492521714, + "grad_norm": 0.2155844122171402, + "learning_rate": 4.144372182619874e-06, + "loss": 0.3533, "step": 113755 }, { - "epoch": 4.0, - "learning_rate": 5.0540034054165934e-06, - "loss": 0.2431, + "epoch": 4.099902692182939, + "grad_norm": 0.2931585907936096, + "learning_rate": 4.1430849703355755e-06, + "loss": 0.4126, "step": 113760 }, { - "epoch": 4.0, - "learning_rate": 5.05228612791398e-06, - "loss": 0.2497, + "epoch": 4.100082891844163, + "grad_norm": 0.23086297512054443, + "learning_rate": 4.141476210735803e-06, + "loss": 0.3405, "step": 113765 }, { - "epoch": 4.0, - "learning_rate": 5.050569109419279e-06, - "loss": 0.2363, + "epoch": 4.100263091505388, + "grad_norm": 0.259778767824173, + "learning_rate": 4.139867735329381e-06, + "loss": 0.3706, "step": 113770 }, { - "epoch": 4.0, - "learning_rate": 5.048852349954797e-06, - "loss": 0.2531, + "epoch": 4.100443291166613, + "grad_norm": 0.22859430313110352, + "learning_rate": 4.13825954413822e-06, + "loss": 0.3301, "step": 113775 }, { - "epoch": 4.0, - "learning_rate": 5.04713584954282e-06, - "loss": 0.2519, + "epoch": 4.100623490827838, + "grad_norm": 0.25174832344055176, + "learning_rate": 4.13665163718423e-06, + "loss": 0.3392, "step": 113780 }, { - "epoch": 4.0, - "learning_rate": 5.045419608205632e-06, - "loss": 0.2519, + "epoch": 4.1008036904890615, + "grad_norm": 0.2478947937488556, + "learning_rate": 4.135044014489326e-06, + "loss": 0.3599, "step": 113785 }, { - "epoch": 4.0, - "learning_rate": 5.0437036259655155e-06, - "loss": 0.2466, + "epoch": 4.100983890150286, + "grad_norm": 0.2401927262544632, + "learning_rate": 4.133436676075414e-06, + "loss": 0.3812, "step": 113790 }, { - "epoch": 4.0, - "learning_rate": 5.041987902844761e-06, - "loss": 0.2551, + "epoch": 4.101164089811511, + "grad_norm": 0.2504352331161499, + "learning_rate": 4.131829621964378e-06, + "loss": 0.4121, "step": 113795 }, { - "epoch": 4.0, - "learning_rate": 5.040272438865642e-06, - "loss": 0.2258, + "epoch": 4.101344289472736, + "grad_norm": 0.21674078702926636, + "learning_rate": 4.130222852178129e-06, + "loss": 0.323, "step": 113800 }, { - "epoch": 4.0, - "learning_rate": 5.038557234050423e-06, - "loss": 0.2406, + "epoch": 4.10152448913396, + "grad_norm": 0.2414158135652542, + "learning_rate": 4.128616366738544e-06, + "loss": 0.3697, "step": 113805 }, { - "epoch": 4.0, - "learning_rate": 5.036842288421392e-06, - "loss": 0.2453, + "epoch": 4.101704688795185, + "grad_norm": 0.2412973940372467, + "learning_rate": 4.127010165667533e-06, + "loss": 0.3529, "step": 113810 }, { - "epoch": 4.0, - "learning_rate": 5.0351276020008e-06, - "loss": 0.2551, + "epoch": 4.10188488845641, + "grad_norm": 0.24448582530021667, + "learning_rate": 4.125404248986961e-06, + "loss": 0.3587, "step": 113815 }, { - "epoch": 4.0, - "learning_rate": 5.0334131748109245e-06, - "loss": 0.254, + "epoch": 4.102065088117635, + "grad_norm": 0.2748193144798279, + "learning_rate": 4.12379861671871e-06, + "loss": 0.374, "step": 113820 }, { - "epoch": 4.0, - "learning_rate": 5.0316990068740165e-06, - "loss": 0.2592, + "epoch": 4.102245287778859, + "grad_norm": 0.28702595829963684, + "learning_rate": 4.1221932688846664e-06, + "loss": 0.3794, "step": 113825 }, { - "epoch": 4.0, - "learning_rate": 5.029985098212342e-06, - "loss": 0.2589, + "epoch": 4.102425487440083, + "grad_norm": 0.23407040536403656, + "learning_rate": 4.120588205506698e-06, + "loss": 0.3652, "step": 113830 }, { - "epoch": 4.01, - "learning_rate": 5.028271448848149e-06, - "loss": 0.257, + "epoch": 
4.102605687101308, + "grad_norm": 0.28252798318862915, + "learning_rate": 4.1189834266066735e-06, + "loss": 0.3586, "step": 113835 }, { - "epoch": 4.01, - "learning_rate": 5.026558058803691e-06, - "loss": 0.2377, + "epoch": 4.102785886762533, + "grad_norm": 0.2878764569759369, + "learning_rate": 4.117378932206456e-06, + "loss": 0.4031, "step": 113840 }, { - "epoch": 4.01, - "learning_rate": 5.024844928101208e-06, - "loss": 0.2496, + "epoch": 4.102966086423757, + "grad_norm": 0.21540088951587677, + "learning_rate": 4.1157747223279034e-06, + "loss": 0.3806, "step": 113845 }, { - "epoch": 4.01, - "learning_rate": 5.023132056762953e-06, - "loss": 0.2402, + "epoch": 4.103146286084982, + "grad_norm": 0.24554520845413208, + "learning_rate": 4.114170796992881e-06, + "loss": 0.3937, "step": 113850 }, { - "epoch": 4.01, - "learning_rate": 5.021419444811165e-06, - "loss": 0.2561, + "epoch": 4.103326485746207, + "grad_norm": 0.2406504899263382, + "learning_rate": 4.112567156223237e-06, + "loss": 0.3786, "step": 113855 }, { - "epoch": 4.01, - "learning_rate": 5.01970709226808e-06, - "loss": 0.259, + "epoch": 4.103506685407432, + "grad_norm": 0.25493624806404114, + "learning_rate": 4.110963800040824e-06, + "loss": 0.3673, "step": 113860 }, { - "epoch": 4.01, - "learning_rate": 5.017994999155923e-06, - "loss": 0.2348, + "epoch": 4.103686885068656, + "grad_norm": 0.20379377901554108, + "learning_rate": 4.109360728467485e-06, + "loss": 0.3536, "step": 113865 }, { - "epoch": 4.01, - "learning_rate": 5.016283165496933e-06, - "loss": 0.2411, + "epoch": 4.103867084729881, + "grad_norm": 0.25673457980155945, + "learning_rate": 4.107757941525059e-06, + "loss": 0.3743, "step": 113870 }, { - "epoch": 4.01, - "learning_rate": 5.014571591313341e-06, - "loss": 0.2645, + "epoch": 4.104047284391106, + "grad_norm": 0.2972078025341034, + "learning_rate": 4.1061554392353816e-06, + "loss": 0.3751, "step": 113875 }, { - "epoch": 4.01, - "learning_rate": 5.012860276627368e-06, - "loss": 0.2558, + "epoch": 4.10422748405233, + "grad_norm": 0.2244071513414383, + "learning_rate": 4.104553221620297e-06, + "loss": 0.3879, "step": 113880 }, { - "epoch": 4.01, - "learning_rate": 5.0111492214612254e-06, - "loss": 0.2518, + "epoch": 4.104407683713554, + "grad_norm": 0.20610791444778442, + "learning_rate": 4.1029512887016286e-06, + "loss": 0.3761, "step": 113885 }, { - "epoch": 4.01, - "learning_rate": 5.009438425837143e-06, - "loss": 0.2665, + "epoch": 4.104587883374779, + "grad_norm": 0.23367708921432495, + "learning_rate": 4.101349640501206e-06, + "loss": 0.3863, "step": 113890 }, { - "epoch": 4.01, - "learning_rate": 5.007727889777327e-06, - "loss": 0.2439, + "epoch": 4.104768083036004, + "grad_norm": 0.26050618290901184, + "learning_rate": 4.099748277040846e-06, + "loss": 0.4031, "step": 113895 }, { - "epoch": 4.01, - "learning_rate": 5.006017613303993e-06, - "loss": 0.2468, + "epoch": 4.104948282697229, + "grad_norm": 0.2746144235134125, + "learning_rate": 4.098147198342364e-06, + "loss": 0.3751, "step": 113900 }, { - "epoch": 4.01, - "learning_rate": 5.004307596439334e-06, - "loss": 0.2549, + "epoch": 4.105128482358453, + "grad_norm": 0.22158581018447876, + "learning_rate": 4.09654640442759e-06, + "loss": 0.3932, "step": 113905 }, { - "epoch": 4.01, - "learning_rate": 5.002597839205572e-06, - "loss": 0.2707, + "epoch": 4.105308682019678, + "grad_norm": 0.21861512959003448, + "learning_rate": 4.09494589531832e-06, + "loss": 0.3805, "step": 113910 }, { - "epoch": 4.01, - "learning_rate": 5.000888341624896e-06, - "loss": 0.2407, + 
"epoch": 4.105488881680903, + "grad_norm": 0.2370881885290146, + "learning_rate": 4.0933456710363585e-06, + "loss": 0.3719, "step": 113915 }, { - "epoch": 4.01, - "learning_rate": 4.999179103719501e-06, - "loss": 0.2382, + "epoch": 4.1056690813421275, + "grad_norm": 0.23769058287143707, + "learning_rate": 4.091745731603519e-06, + "loss": 0.3728, "step": 113920 }, { - "epoch": 4.01, - "learning_rate": 4.997470125511588e-06, - "loss": 0.2536, + "epoch": 4.105849281003351, + "grad_norm": 0.19477471709251404, + "learning_rate": 4.090146077041598e-06, + "loss": 0.3491, "step": 113925 }, { - "epoch": 4.01, - "learning_rate": 4.995761407023339e-06, - "loss": 0.2392, + "epoch": 4.106029480664576, + "grad_norm": 0.310822457075119, + "learning_rate": 4.088546707372387e-06, + "loss": 0.3381, "step": 113930 }, { - "epoch": 4.01, - "learning_rate": 4.994052948276948e-06, - "loss": 0.2406, + "epoch": 4.106209680325801, + "grad_norm": 0.25079676508903503, + "learning_rate": 4.086947622617682e-06, + "loss": 0.3781, "step": 113935 }, { - "epoch": 4.01, - "learning_rate": 4.992344749294592e-06, - "loss": 0.2357, + "epoch": 4.1063898799870255, + "grad_norm": 0.2638763189315796, + "learning_rate": 4.085348822799257e-06, + "loss": 0.4023, "step": 113940 }, { - "epoch": 4.01, - "learning_rate": 4.990636810098459e-06, - "loss": 0.2519, + "epoch": 4.10657007964825, + "grad_norm": 0.25837767124176025, + "learning_rate": 4.083750307938911e-06, + "loss": 0.3652, "step": 113945 }, { - "epoch": 4.01, - "learning_rate": 4.98892913071072e-06, - "loss": 0.2271, + "epoch": 4.106750279309475, + "grad_norm": 0.2736017405986786, + "learning_rate": 4.082152078058419e-06, + "loss": 0.3973, "step": 113950 }, { - "epoch": 4.01, - "learning_rate": 4.98722171115355e-06, - "loss": 0.2694, + "epoch": 4.1069304789707, + "grad_norm": 0.2712678611278534, + "learning_rate": 4.080554133179554e-06, + "loss": 0.381, "step": 113955 }, { - "epoch": 4.01, - "learning_rate": 4.985514551449108e-06, - "loss": 0.2582, + "epoch": 4.1071106786319245, + "grad_norm": 0.26710009574890137, + "learning_rate": 4.078956473324091e-06, + "loss": 0.3954, "step": 113960 }, { - "epoch": 4.01, - "learning_rate": 4.9838076516195796e-06, - "loss": 0.2609, + "epoch": 4.107290878293149, + "grad_norm": 0.2664423882961273, + "learning_rate": 4.077359098513789e-06, + "loss": 0.3671, "step": 113965 }, { - "epoch": 4.01, - "learning_rate": 4.982101011687115e-06, - "loss": 0.2533, + "epoch": 4.107471077954373, + "grad_norm": 0.2606585919857025, + "learning_rate": 4.075762008770423e-06, + "loss": 0.4075, "step": 113970 }, { - "epoch": 4.01, - "learning_rate": 4.980394631673871e-06, - "loss": 0.2564, + "epoch": 4.107651277615598, + "grad_norm": 0.26195093989372253, + "learning_rate": 4.074165204115754e-06, + "loss": 0.3666, "step": 113975 }, { - "epoch": 4.01, - "learning_rate": 4.978688511602014e-06, - "loss": 0.2653, + "epoch": 4.1078314772768225, + "grad_norm": 0.23841117322444916, + "learning_rate": 4.072568684571524e-06, + "loss": 0.3521, "step": 113980 }, { - "epoch": 4.01, - "learning_rate": 4.976982651493689e-06, - "loss": 0.2505, + "epoch": 4.108011676938047, + "grad_norm": 0.24487237632274628, + "learning_rate": 4.070972450159497e-06, + "loss": 0.3861, "step": 113985 }, { - "epoch": 4.01, - "learning_rate": 4.9752770513710524e-06, - "loss": 0.2538, + "epoch": 4.108191876599272, + "grad_norm": 0.22291886806488037, + "learning_rate": 4.069376500901414e-06, + "loss": 0.3597, "step": 113990 }, { - "epoch": 4.01, - "learning_rate": 4.9735717112562475e-06, - "loss": 
0.2483, + "epoch": 4.108372076260497, + "grad_norm": 0.2304575890302658, + "learning_rate": 4.06778083681903e-06, + "loss": 0.362, "step": 113995 }, { - "epoch": 4.01, - "learning_rate": 4.971866631171409e-06, - "loss": 0.2482, + "epoch": 4.1085522759217215, + "grad_norm": 0.2351878434419632, + "learning_rate": 4.066185457934083e-06, + "loss": 0.3816, "step": 114000 }, { - "epoch": 4.01, - "eval_loss": 0.25011008977890015, - "eval_runtime": 10.5643, - "eval_samples_per_second": 9.466, - "eval_steps_per_second": 9.466, + "epoch": 4.1085522759217215, + "eval_loss": 0.4291282296180725, + "eval_runtime": 3.5373, + "eval_samples_per_second": 28.27, + "eval_steps_per_second": 7.068, "step": 114000 }, { - "epoch": 4.01, - "learning_rate": 4.9701618111386906e-06, - "loss": 0.2451, + "epoch": 4.108732475582946, + "grad_norm": 0.25444066524505615, + "learning_rate": 4.064590364268298e-06, + "loss": 0.3812, "step": 114005 }, { - "epoch": 4.01, - "learning_rate": 4.968457251180222e-06, - "loss": 0.2401, + "epoch": 4.108912675244171, + "grad_norm": 0.23914211988449097, + "learning_rate": 4.062995555843419e-06, + "loss": 0.3443, "step": 114010 }, { - "epoch": 4.01, - "learning_rate": 4.966752951318132e-06, - "loss": 0.2673, + "epoch": 4.109092874905395, + "grad_norm": 0.25930920243263245, + "learning_rate": 4.061401032681172e-06, + "loss": 0.3715, "step": 114015 }, { - "epoch": 4.01, - "learning_rate": 4.965048911574547e-06, - "loss": 0.2462, + "epoch": 4.1092730745666195, + "grad_norm": 0.37500235438346863, + "learning_rate": 4.059806794803283e-06, + "loss": 0.4553, "step": 114020 }, { - "epoch": 4.01, - "learning_rate": 4.963345131971606e-06, - "loss": 0.2481, + "epoch": 4.109453274227844, + "grad_norm": 0.26621177792549133, + "learning_rate": 4.058212842231474e-06, + "loss": 0.3756, "step": 114025 }, { - "epoch": 4.01, - "learning_rate": 4.961641612531423e-06, - "loss": 0.2547, + "epoch": 4.109633473889069, + "grad_norm": 0.22278466820716858, + "learning_rate": 4.056619174987453e-06, + "loss": 0.3579, "step": 114030 }, { - "epoch": 4.01, - "learning_rate": 4.959938353276111e-06, - "loss": 0.257, + "epoch": 4.109813673550294, + "grad_norm": 0.2507738769054413, + "learning_rate": 4.055025793092945e-06, + "loss": 0.3704, "step": 114035 }, { - "epoch": 4.01, - "learning_rate": 4.958235354227791e-06, - "loss": 0.2402, + "epoch": 4.109993873211518, + "grad_norm": 0.29194319248199463, + "learning_rate": 4.053432696569659e-06, + "loss": 0.3727, "step": 114040 }, { - "epoch": 4.01, - "learning_rate": 4.956532615408588e-06, - "loss": 0.2731, + "epoch": 4.110174072872743, + "grad_norm": 0.22868956625461578, + "learning_rate": 4.0518398854392955e-06, + "loss": 0.3543, "step": 114045 }, { - "epoch": 4.01, - "learning_rate": 4.954830136840597e-06, - "loss": 0.2485, + "epoch": 4.110354272533968, + "grad_norm": 0.24495558440685272, + "learning_rate": 4.050247359723558e-06, + "loss": 0.3589, "step": 114050 }, { - "epoch": 4.01, - "learning_rate": 4.9531279185459186e-06, - "loss": 0.2538, + "epoch": 4.110534472195193, + "grad_norm": 0.22219400107860565, + "learning_rate": 4.0486551194441445e-06, + "loss": 0.3735, "step": 114055 }, { - "epoch": 4.01, - "learning_rate": 4.9514259605466715e-06, - "loss": 0.2468, + "epoch": 4.1107146718564165, + "grad_norm": 0.240044966340065, + "learning_rate": 4.0470631646227416e-06, + "loss": 0.3688, "step": 114060 }, { - "epoch": 4.01, - "learning_rate": 4.949724262864944e-06, - "loss": 0.2409, + "epoch": 4.110894871517641, + "grad_norm": 0.2267971634864807, + "learning_rate": 
4.045471495281053e-06, + "loss": 0.3558, "step": 114065 }, { - "epoch": 4.01, - "learning_rate": 4.948022825522835e-06, - "loss": 0.2396, + "epoch": 4.111075071178866, + "grad_norm": 0.24570490419864655, + "learning_rate": 4.043880111440759e-06, + "loss": 0.3887, "step": 114070 }, { - "epoch": 4.01, - "learning_rate": 4.946321648542429e-06, - "loss": 0.2473, + "epoch": 4.111255270840091, + "grad_norm": 0.24875159561634064, + "learning_rate": 4.04228901312354e-06, + "loss": 0.3644, "step": 114075 }, { - "epoch": 4.01, - "learning_rate": 4.944620731945826e-06, - "loss": 0.2257, + "epoch": 4.111435470501315, + "grad_norm": 0.2332112193107605, + "learning_rate": 4.040698200351076e-06, + "loss": 0.3569, "step": 114080 }, { - "epoch": 4.01, - "learning_rate": 4.942920075755103e-06, - "loss": 0.2723, + "epoch": 4.11161567016254, + "grad_norm": 0.2375115007162094, + "learning_rate": 4.039107673145038e-06, + "loss": 0.3451, "step": 114085 }, { - "epoch": 4.01, - "learning_rate": 4.9412196799923424e-06, - "loss": 0.2209, + "epoch": 4.111795869823765, + "grad_norm": 0.26559481024742126, + "learning_rate": 4.037517431527108e-06, + "loss": 0.3891, "step": 114090 }, { - "epoch": 4.01, - "learning_rate": 4.939519544679622e-06, - "loss": 0.2604, + "epoch": 4.11197606948499, + "grad_norm": 0.26684683561325073, + "learning_rate": 4.0359274755189386e-06, + "loss": 0.3918, "step": 114095 }, { - "epoch": 4.01, - "learning_rate": 4.937819669839028e-06, - "loss": 0.2597, + "epoch": 4.112156269146214, + "grad_norm": 0.288522332906723, + "learning_rate": 4.034337805142193e-06, + "loss": 0.3604, "step": 114100 }, { - "epoch": 4.01, - "learning_rate": 4.936120055492624e-06, - "loss": 0.2312, + "epoch": 4.112336468807438, + "grad_norm": 0.23801618814468384, + "learning_rate": 4.032748420418545e-06, + "loss": 0.3558, "step": 114105 }, { - "epoch": 4.01, - "learning_rate": 4.934420701662479e-06, - "loss": 0.2435, + "epoch": 4.112516668468663, + "grad_norm": 0.26979947090148926, + "learning_rate": 4.031159321369637e-06, + "loss": 0.3942, "step": 114110 }, { - "epoch": 4.01, - "learning_rate": 4.93272160837065e-06, - "loss": 0.2504, + "epoch": 4.112696868129888, + "grad_norm": 0.24331551790237427, + "learning_rate": 4.029570508017125e-06, + "loss": 0.4005, "step": 114115 }, { - "epoch": 4.02, - "learning_rate": 4.931022775639213e-06, - "loss": 0.2324, + "epoch": 4.112877067791112, + "grad_norm": 0.24675403535366058, + "learning_rate": 4.027981980382656e-06, + "loss": 0.3982, "step": 114120 }, { - "epoch": 4.02, - "learning_rate": 4.929324203490221e-06, - "loss": 0.2535, + "epoch": 4.113057267452337, + "grad_norm": 0.18268905580043793, + "learning_rate": 4.026393738487863e-06, + "loss": 0.3782, "step": 114125 }, { - "epoch": 4.02, - "learning_rate": 4.927625891945725e-06, - "loss": 0.2404, + "epoch": 4.113237467113562, + "grad_norm": 0.24003072082996368, + "learning_rate": 4.024805782354402e-06, + "loss": 0.3864, "step": 114130 }, { - "epoch": 4.02, - "learning_rate": 4.925927841027772e-06, - "loss": 0.2518, + "epoch": 4.113417666774787, + "grad_norm": 0.26609840989112854, + "learning_rate": 4.023218112003902e-06, + "loss": 0.3561, "step": 114135 }, { - "epoch": 4.02, - "learning_rate": 4.924230050758425e-06, - "loss": 0.2626, + "epoch": 4.113597866436011, + "grad_norm": 0.29449740052223206, + "learning_rate": 4.021630727457995e-06, + "loss": 0.4234, "step": 114140 }, { - "epoch": 4.02, - "learning_rate": 4.922532521159712e-06, - "loss": 0.267, + "epoch": 4.113778066097236, + "grad_norm": 0.20891326665878296, + 
"learning_rate": 4.020043628738304e-06, + "loss": 0.3911, "step": 114145 }, { - "epoch": 4.02, - "learning_rate": 4.920835252253683e-06, - "loss": 0.2593, + "epoch": 4.113958265758461, + "grad_norm": 0.20913873612880707, + "learning_rate": 4.018456815866453e-06, + "loss": 0.3592, "step": 114150 }, { - "epoch": 4.02, - "learning_rate": 4.9191382440623826e-06, - "loss": 0.2333, + "epoch": 4.114138465419685, + "grad_norm": 0.22680309414863586, + "learning_rate": 4.016870288864072e-06, + "loss": 0.367, "step": 114155 }, { - "epoch": 4.02, - "learning_rate": 4.917441496607836e-06, - "loss": 0.2776, + "epoch": 4.114318665080909, + "grad_norm": 0.2160423845052719, + "learning_rate": 4.015284047752771e-06, + "loss": 0.3595, "step": 114160 }, { - "epoch": 4.02, - "learning_rate": 4.915745009912079e-06, - "loss": 0.2543, + "epoch": 4.114498864742134, + "grad_norm": 0.2502204179763794, + "learning_rate": 4.013698092554163e-06, + "loss": 0.3754, "step": 114165 }, { - "epoch": 4.02, - "learning_rate": 4.91404878399713e-06, - "loss": 0.255, + "epoch": 4.114679064403359, + "grad_norm": 0.2723202109336853, + "learning_rate": 4.012112423289855e-06, + "loss": 0.3667, "step": 114170 }, { - "epoch": 4.02, - "learning_rate": 4.912352818885027e-06, - "loss": 0.2524, + "epoch": 4.114859264064584, + "grad_norm": 0.279547780752182, + "learning_rate": 4.010527039981443e-06, + "loss": 0.3663, "step": 114175 }, { - "epoch": 4.02, - "learning_rate": 4.910657114597786e-06, - "loss": 0.2635, + "epoch": 4.115039463725808, + "grad_norm": 0.2516183853149414, + "learning_rate": 4.008941942650546e-06, + "loss": 0.368, "step": 114180 }, { - "epoch": 4.02, - "learning_rate": 4.908961671157422e-06, - "loss": 0.2681, + "epoch": 4.115219663387033, + "grad_norm": 0.277414470911026, + "learning_rate": 4.007357131318753e-06, + "loss": 0.3524, "step": 114185 }, { - "epoch": 4.02, - "learning_rate": 4.9072664885859435e-06, - "loss": 0.2215, + "epoch": 4.115399863048258, + "grad_norm": 0.237200066447258, + "learning_rate": 4.005772606007646e-06, + "loss": 0.3486, "step": 114190 }, { - "epoch": 4.02, - "learning_rate": 4.905571566905373e-06, - "loss": 0.229, + "epoch": 4.1155800627094825, + "grad_norm": 0.2728269100189209, + "learning_rate": 4.004188366738829e-06, + "loss": 0.3794, "step": 114195 }, { - "epoch": 4.02, - "learning_rate": 4.9038769061377085e-06, - "loss": 0.2318, + "epoch": 4.115760262370706, + "grad_norm": 0.2168535739183426, + "learning_rate": 4.002604413533878e-06, + "loss": 0.3768, "step": 114200 }, { - "epoch": 4.02, - "learning_rate": 4.902182506304964e-06, - "loss": 0.2536, + "epoch": 4.115940462031931, + "grad_norm": 0.22027429938316345, + "learning_rate": 4.00102074641438e-06, + "loss": 0.3745, "step": 114205 }, { - "epoch": 4.02, - "learning_rate": 4.900488367429129e-06, - "loss": 0.2424, + "epoch": 4.116120661693156, + "grad_norm": 0.23675896227359772, + "learning_rate": 3.999437365401906e-06, + "loss": 0.3777, "step": 114210 }, { - "epoch": 4.02, - "learning_rate": 4.898794489532213e-06, - "loss": 0.248, + "epoch": 4.116300861354381, + "grad_norm": 0.24391767382621765, + "learning_rate": 3.997854270518026e-06, + "loss": 0.3857, "step": 114215 }, { - "epoch": 4.02, - "learning_rate": 4.897100872636201e-06, - "loss": 0.2413, + "epoch": 4.116481061015605, + "grad_norm": 0.239417165517807, + "learning_rate": 3.996271461784324e-06, + "loss": 0.3692, "step": 114220 }, { - "epoch": 4.02, - "learning_rate": 4.895407516763087e-06, - "loss": 0.2437, + "epoch": 4.11666126067683, + "grad_norm": 0.2657260596752167, + 
"learning_rate": 3.9946889392223545e-06, + "loss": 0.3556, "step": 114225 }, { - "epoch": 4.02, - "learning_rate": 4.893714421934853e-06, - "loss": 0.2403, + "epoch": 4.116841460338055, + "grad_norm": 0.2049977332353592, + "learning_rate": 3.993106702853683e-06, + "loss": 0.3461, "step": 114230 }, { - "epoch": 4.02, - "learning_rate": 4.89202158817349e-06, - "loss": 0.2628, + "epoch": 4.1170216599992795, + "grad_norm": 0.24377499520778656, + "learning_rate": 3.9915247526998625e-06, + "loss": 0.3724, "step": 114235 }, { - "epoch": 4.02, - "learning_rate": 4.890329015500977e-06, - "loss": 0.2512, + "epoch": 4.117201859660504, + "grad_norm": 0.23424182832241058, + "learning_rate": 3.989943088782453e-06, + "loss": 0.3779, "step": 114240 }, { - "epoch": 4.02, - "learning_rate": 4.888636703939289e-06, - "loss": 0.2466, + "epoch": 4.117382059321728, + "grad_norm": 0.21818037331104279, + "learning_rate": 3.9883617111229955e-06, + "loss": 0.3498, "step": 114245 }, { - "epoch": 4.02, - "learning_rate": 4.886944653510392e-06, - "loss": 0.2495, + "epoch": 4.117562258982953, + "grad_norm": 0.25411292910575867, + "learning_rate": 3.986780619743047e-06, + "loss": 0.3329, "step": 114250 }, { - "epoch": 4.02, - "learning_rate": 4.885252864236264e-06, - "loss": 0.2409, + "epoch": 4.1177424586441775, + "grad_norm": 0.2751554846763611, + "learning_rate": 3.985199814664142e-06, + "loss": 0.3821, "step": 114255 }, { - "epoch": 4.02, - "learning_rate": 4.883561336138878e-06, - "loss": 0.2546, + "epoch": 4.117922658305402, + "grad_norm": 0.20654655992984772, + "learning_rate": 3.9836192959078225e-06, + "loss": 0.3981, "step": 114260 }, { - "epoch": 4.02, - "learning_rate": 4.881870069240191e-06, - "loss": 0.2439, + "epoch": 4.118102857966627, + "grad_norm": 0.21761824190616608, + "learning_rate": 3.982039063495621e-06, + "loss": 0.3587, "step": 114265 }, { - "epoch": 4.02, - "learning_rate": 4.880179063562154e-06, - "loss": 0.2482, + "epoch": 4.118283057627852, + "grad_norm": 0.3250308334827423, + "learning_rate": 3.980459117449065e-06, + "loss": 0.3831, "step": 114270 }, { - "epoch": 4.02, - "learning_rate": 4.878826447113366e-06, - "loss": 0.2398, + "epoch": 4.1184632572890765, + "grad_norm": 0.22311455011367798, + "learning_rate": 3.978879457789686e-06, + "loss": 0.3779, "step": 114275 }, { - "epoch": 4.02, - "learning_rate": 4.8771359116878464e-06, - "loss": 0.2486, + "epoch": 4.118643456950301, + "grad_norm": 0.28070372343063354, + "learning_rate": 3.977300084539013e-06, + "loss": 0.3387, "step": 114280 }, { - "epoch": 4.02, - "learning_rate": 4.875445637544462e-06, - "loss": 0.2419, + "epoch": 4.118823656611526, + "grad_norm": 0.19799430668354034, + "learning_rate": 3.975720997718544e-06, + "loss": 0.3412, "step": 114285 }, { - "epoch": 4.02, - "learning_rate": 4.873755624705145e-06, - "loss": 0.2332, + "epoch": 4.11900385627275, + "grad_norm": 0.24587881565093994, + "learning_rate": 3.974142197349809e-06, + "loss": 0.3441, "step": 114290 }, { - "epoch": 4.02, - "learning_rate": 4.872065873191855e-06, - "loss": 0.2522, + "epoch": 4.1191840559339745, + "grad_norm": 0.26680803298950195, + "learning_rate": 3.972563683454314e-06, + "loss": 0.3738, "step": 114295 }, { - "epoch": 4.02, - "learning_rate": 4.870376383026518e-06, - "loss": 0.2662, + "epoch": 4.119364255595199, + "grad_norm": 0.22729459404945374, + "learning_rate": 3.970985456053578e-06, + "loss": 0.38, "step": 114300 }, { - "epoch": 4.02, - "learning_rate": 4.868687154231083e-06, - "loss": 0.2625, + "epoch": 4.119544455256424, + "grad_norm": 
0.28258028626441956, + "learning_rate": 3.969407515169088e-06, + "loss": 0.3575, "step": 114305 }, { - "epoch": 4.02, - "learning_rate": 4.866998186827482e-06, - "loss": 0.2336, + "epoch": 4.119724654917649, + "grad_norm": 0.2133554369211197, + "learning_rate": 3.967829860822342e-06, + "loss": 0.3429, "step": 114310 }, { - "epoch": 4.02, - "learning_rate": 4.865309480837638e-06, - "loss": 0.2593, + "epoch": 4.1199048545788735, + "grad_norm": 0.2711166441440582, + "learning_rate": 3.966252493034847e-06, + "loss": 0.3196, "step": 114315 }, { - "epoch": 4.02, - "learning_rate": 4.863621036283475e-06, - "loss": 0.2451, + "epoch": 4.120085054240098, + "grad_norm": 0.2520091235637665, + "learning_rate": 3.964675411828092e-06, + "loss": 0.387, "step": 114320 }, { - "epoch": 4.02, - "learning_rate": 4.861932853186929e-06, - "loss": 0.2312, + "epoch": 4.120265253901323, + "grad_norm": 0.2586155831813812, + "learning_rate": 3.963098617223562e-06, + "loss": 0.3943, "step": 114325 }, { - "epoch": 4.02, - "learning_rate": 4.860244931569915e-06, - "loss": 0.2612, + "epoch": 4.120445453562548, + "grad_norm": 0.23721350729465485, + "learning_rate": 3.961522109242741e-06, + "loss": 0.3705, "step": 114330 }, { - "epoch": 4.02, - "learning_rate": 4.858557271454348e-06, - "loss": 0.2539, + "epoch": 4.120625653223772, + "grad_norm": 0.24257147312164307, + "learning_rate": 3.959945887907099e-06, + "loss": 0.3615, "step": 114335 }, { - "epoch": 4.02, - "learning_rate": 4.856869872862132e-06, - "loss": 0.2428, + "epoch": 4.120805852884996, + "grad_norm": 0.2581164240837097, + "learning_rate": 3.9583699532381304e-06, + "loss": 0.3599, "step": 114340 }, { - "epoch": 4.02, - "learning_rate": 4.855182735815189e-06, - "loss": 0.2413, + "epoch": 4.120986052546221, + "grad_norm": 0.2084408849477768, + "learning_rate": 3.956794305257294e-06, + "loss": 0.3728, "step": 114345 }, { - "epoch": 4.02, - "learning_rate": 4.853495860335428e-06, - "loss": 0.2506, + "epoch": 4.121166252207446, + "grad_norm": 0.22756318747997284, + "learning_rate": 3.955218943986064e-06, + "loss": 0.3401, "step": 114350 }, { - "epoch": 4.02, - "learning_rate": 4.851809246444739e-06, - "loss": 0.2711, + "epoch": 4.12134645186867, + "grad_norm": 0.21954157948493958, + "learning_rate": 3.9536438694459e-06, + "loss": 0.394, "step": 114355 }, { - "epoch": 4.02, - "learning_rate": 4.850122894165035e-06, - "loss": 0.2475, + "epoch": 4.121526651529895, + "grad_norm": 0.24439333379268646, + "learning_rate": 3.952069081658258e-06, + "loss": 0.3647, "step": 114360 }, { - "epoch": 4.02, - "learning_rate": 4.848436803518205e-06, - "loss": 0.2499, + "epoch": 4.12170685119112, + "grad_norm": 0.2532067596912384, + "learning_rate": 3.950494580644606e-06, + "loss": 0.3597, "step": 114365 }, { - "epoch": 4.02, - "learning_rate": 4.846750974526146e-06, - "loss": 0.2573, + "epoch": 4.121887050852345, + "grad_norm": 0.23494374752044678, + "learning_rate": 3.9489203664263955e-06, + "loss": 0.3872, "step": 114370 }, { - "epoch": 4.02, - "learning_rate": 4.845065407210733e-06, - "loss": 0.23, + "epoch": 4.122067250513569, + "grad_norm": 0.2250523865222931, + "learning_rate": 3.947346439025057e-06, + "loss": 0.3852, "step": 114375 }, { - "epoch": 4.02, - "learning_rate": 4.843380101593872e-06, - "loss": 0.2409, + "epoch": 4.122247450174794, + "grad_norm": 0.21566912531852722, + "learning_rate": 3.9457727984620524e-06, + "loss": 0.3862, "step": 114380 }, { - "epoch": 4.02, - "learning_rate": 4.8416950576974375e-06, - "loss": 0.2442, + "epoch": 4.122427649836018, + 
"grad_norm": 0.33429858088493347, + "learning_rate": 3.94419944475882e-06, + "loss": 0.3911, "step": 114385 }, { - "epoch": 4.02, - "learning_rate": 4.840010275543308e-06, - "loss": 0.2409, + "epoch": 4.122607849497243, + "grad_norm": 0.28775933384895325, + "learning_rate": 3.942626377936793e-06, + "loss": 0.3773, "step": 114390 }, { - "epoch": 4.02, - "learning_rate": 4.838325755153353e-06, - "loss": 0.2386, + "epoch": 4.122788049158467, + "grad_norm": 0.23396413028240204, + "learning_rate": 3.941053598017402e-06, + "loss": 0.3944, "step": 114395 }, { - "epoch": 4.02, - "learning_rate": 4.836641496549449e-06, - "loss": 0.2406, + "epoch": 4.122968248819692, + "grad_norm": 0.2518860101699829, + "learning_rate": 3.939481105022075e-06, + "loss": 0.3333, "step": 114400 }, { - "epoch": 4.03, - "learning_rate": 4.834957499753473e-06, - "loss": 0.2305, + "epoch": 4.123148448480917, + "grad_norm": 0.2610318064689636, + "learning_rate": 3.937908898972248e-06, + "loss": 0.3537, "step": 114405 }, { - "epoch": 4.03, - "learning_rate": 4.833273764787286e-06, - "loss": 0.2556, + "epoch": 4.123328648142142, + "grad_norm": 0.2574523687362671, + "learning_rate": 3.936336979889332e-06, + "loss": 0.3709, "step": 114410 }, { - "epoch": 4.03, - "learning_rate": 4.831590291672744e-06, - "loss": 0.2356, + "epoch": 4.123508847803366, + "grad_norm": 0.2853260040283203, + "learning_rate": 3.934765347794747e-06, + "loss": 0.372, "step": 114415 }, { - "epoch": 4.03, - "learning_rate": 4.8299070804317145e-06, - "loss": 0.2411, + "epoch": 4.123689047464591, + "grad_norm": 0.22084416449069977, + "learning_rate": 3.933194002709906e-06, + "loss": 0.3494, "step": 114420 }, { - "epoch": 4.03, - "learning_rate": 4.828224131086051e-06, - "loss": 0.2539, + "epoch": 4.123869247125816, + "grad_norm": 0.23949111998081207, + "learning_rate": 3.9316229446562184e-06, + "loss": 0.3449, "step": 114425 }, { - "epoch": 4.03, - "learning_rate": 4.826541443657601e-06, - "loss": 0.2573, + "epoch": 4.12404944678704, + "grad_norm": 0.2440653145313263, + "learning_rate": 3.9300521736550825e-06, + "loss": 0.358, "step": 114430 }, { - "epoch": 4.03, - "learning_rate": 4.824859018168209e-06, - "loss": 0.2431, + "epoch": 4.124229646448264, + "grad_norm": 0.2716491222381592, + "learning_rate": 3.928481689727911e-06, + "loss": 0.3932, "step": 114435 }, { - "epoch": 4.03, - "learning_rate": 4.823176854639735e-06, - "loss": 0.2348, + "epoch": 4.124409846109489, + "grad_norm": 0.26395681500434875, + "learning_rate": 3.926911492896098e-06, + "loss": 0.3841, "step": 114440 }, { - "epoch": 4.03, - "learning_rate": 4.821494953094013e-06, - "loss": 0.2431, + "epoch": 4.124590045770714, + "grad_norm": 0.2175927311182022, + "learning_rate": 3.925341583181039e-06, + "loss": 0.3782, "step": 114445 }, { - "epoch": 4.03, - "learning_rate": 4.819813313552873e-06, - "loss": 0.2549, + "epoch": 4.124770245431939, + "grad_norm": 0.26144078373908997, + "learning_rate": 3.923771960604117e-06, + "loss": 0.3954, "step": 114450 }, { - "epoch": 4.03, - "learning_rate": 4.818131936038159e-06, - "loss": 0.2597, + "epoch": 4.124950445093163, + "grad_norm": 0.24796469509601593, + "learning_rate": 3.922202625186719e-06, + "loss": 0.3914, "step": 114455 }, { - "epoch": 4.03, - "learning_rate": 4.816450820571708e-06, - "loss": 0.2243, + "epoch": 4.125130644754388, + "grad_norm": 0.24708044528961182, + "learning_rate": 3.920633576950234e-06, + "loss": 0.391, "step": 114460 }, { - "epoch": 4.03, - "learning_rate": 4.814769967175339e-06, - "loss": 0.2375, + "epoch": 
4.125310844415613, + "grad_norm": 0.23979832231998444, + "learning_rate": 3.91906481591604e-06, + "loss": 0.3627, "step": 114465 }, { - "epoch": 4.03, - "learning_rate": 4.813089375870878e-06, - "loss": 0.2516, + "epoch": 4.1254910440768375, + "grad_norm": 0.2370292693376541, + "learning_rate": 3.917496342105495e-06, + "loss": 0.3845, "step": 114470 }, { - "epoch": 4.03, - "learning_rate": 4.811409046680149e-06, - "loss": 0.258, + "epoch": 4.125671243738061, + "grad_norm": 0.2842269837856293, + "learning_rate": 3.915928155539986e-06, + "loss": 0.3571, "step": 114475 }, { - "epoch": 4.03, - "learning_rate": 4.809728979624972e-06, - "loss": 0.2288, + "epoch": 4.125851443399286, + "grad_norm": 0.23658975958824158, + "learning_rate": 3.914360256240871e-06, + "loss": 0.357, "step": 114480 }, { - "epoch": 4.03, - "learning_rate": 4.808049174727155e-06, - "loss": 0.2605, + "epoch": 4.126031643060511, + "grad_norm": 0.24424517154693604, + "learning_rate": 3.912792644229524e-06, + "loss": 0.3581, "step": 114485 }, { - "epoch": 4.03, - "learning_rate": 4.806369632008509e-06, - "loss": 0.2512, + "epoch": 4.126211842721736, + "grad_norm": 0.3009839951992035, + "learning_rate": 3.91122531952729e-06, + "loss": 0.3738, "step": 114490 }, { - "epoch": 4.03, - "learning_rate": 4.804690351490851e-06, - "loss": 0.2438, + "epoch": 4.12639204238296, + "grad_norm": 0.1834869086742401, + "learning_rate": 3.9096582821555204e-06, + "loss": 0.3381, "step": 114495 }, { - "epoch": 4.03, - "learning_rate": 4.8030113331959805e-06, - "loss": 0.2472, + "epoch": 4.126572242044185, + "grad_norm": 0.19505488872528076, + "learning_rate": 3.908091532135583e-06, + "loss": 0.3742, "step": 114500 }, { - "epoch": 4.03, - "eval_loss": 0.24984058737754822, - "eval_runtime": 10.5589, - "eval_samples_per_second": 9.471, - "eval_steps_per_second": 9.471, + "epoch": 4.126572242044185, + "eval_loss": 0.42937061190605164, + "eval_runtime": 3.5336, + "eval_samples_per_second": 28.3, + "eval_steps_per_second": 7.075, "step": 114500 }, { - "epoch": 4.03, - "learning_rate": 4.80133257714569e-06, - "loss": 0.2481, + "epoch": 4.12675244170541, + "grad_norm": 0.21231359243392944, + "learning_rate": 3.906525069488812e-06, + "loss": 0.3716, "step": 114505 }, { - "epoch": 4.03, - "learning_rate": 4.79965408336179e-06, - "loss": 0.2397, + "epoch": 4.1269326413666345, + "grad_norm": 0.255136638879776, + "learning_rate": 3.904958894236554e-06, + "loss": 0.3851, "step": 114510 }, { - "epoch": 4.03, - "learning_rate": 4.797975851866063e-06, - "loss": 0.2519, + "epoch": 4.127112841027859, + "grad_norm": 0.22166618704795837, + "learning_rate": 3.903393006400147e-06, + "loss": 0.3521, "step": 114515 }, { - "epoch": 4.03, - "learning_rate": 4.796297882680315e-06, - "loss": 0.229, + "epoch": 4.127293040689083, + "grad_norm": 0.23030775785446167, + "learning_rate": 3.901827406000918e-06, + "loss": 0.348, "step": 114520 }, { - "epoch": 4.03, - "learning_rate": 4.794620175826323e-06, - "loss": 0.2386, + "epoch": 4.127473240350308, + "grad_norm": 0.20609252154827118, + "learning_rate": 3.900262093060214e-06, + "loss": 0.3331, "step": 114525 }, { - "epoch": 4.03, - "learning_rate": 4.792942731325864e-06, - "loss": 0.2488, + "epoch": 4.127653440011533, + "grad_norm": 0.2601858377456665, + "learning_rate": 3.898697067599355e-06, + "loss": 0.3415, "step": 114530 }, { - "epoch": 4.03, - "learning_rate": 4.791265549200735e-06, - "loss": 0.2647, + "epoch": 4.127833639672757, + "grad_norm": 0.20611776411533356, + "learning_rate": 3.897132329639661e-06, + "loss": 0.3781, 
"step": 114535 }, { - "epoch": 4.03, - "learning_rate": 4.789588629472702e-06, - "loss": 0.2424, + "epoch": 4.128013839333982, + "grad_norm": 0.2544640898704529, + "learning_rate": 3.895567879202452e-06, + "loss": 0.3592, "step": 114540 }, { - "epoch": 4.03, - "learning_rate": 4.7879119721635444e-06, - "loss": 0.2584, + "epoch": 4.128194038995207, + "grad_norm": 0.2738896608352661, + "learning_rate": 3.894003716309047e-06, + "loss": 0.3508, "step": 114545 }, { - "epoch": 4.03, - "learning_rate": 4.786235577295023e-06, - "loss": 0.2679, + "epoch": 4.1283742386564315, + "grad_norm": 0.20672179758548737, + "learning_rate": 3.89243984098075e-06, + "loss": 0.3829, "step": 114550 }, { - "epoch": 4.03, - "learning_rate": 4.784559444888917e-06, - "loss": 0.2353, + "epoch": 4.128554438317656, + "grad_norm": 0.31415534019470215, + "learning_rate": 3.890876253238884e-06, + "loss": 0.3825, "step": 114555 }, { - "epoch": 4.03, - "learning_rate": 4.782883574966979e-06, - "loss": 0.2396, + "epoch": 4.128734637978881, + "grad_norm": 0.2770949900150299, + "learning_rate": 3.8893129531047314e-06, + "loss": 0.3637, "step": 114560 }, { - "epoch": 4.03, - "learning_rate": 4.781207967550977e-06, - "loss": 0.2575, + "epoch": 4.128914837640105, + "grad_norm": 0.20876848697662354, + "learning_rate": 3.887749940599608e-06, + "loss": 0.3571, "step": 114565 }, { - "epoch": 4.03, - "learning_rate": 4.779532622662661e-06, - "loss": 0.2447, + "epoch": 4.1290950373013295, + "grad_norm": 0.2529737055301666, + "learning_rate": 3.886187215744805e-06, + "loss": 0.3665, "step": 114570 }, { - "epoch": 4.03, - "learning_rate": 4.777857540323794e-06, - "loss": 0.2481, + "epoch": 4.129275236962554, + "grad_norm": 0.27046170830726624, + "learning_rate": 3.884624778561605e-06, + "loss": 0.368, "step": 114575 }, { - "epoch": 4.03, - "learning_rate": 4.776182720556121e-06, - "loss": 0.2639, + "epoch": 4.129455436623779, + "grad_norm": 0.2750345766544342, + "learning_rate": 3.8830626290713185e-06, + "loss": 0.3764, "step": 114580 }, { - "epoch": 4.03, - "learning_rate": 4.774508163381381e-06, - "loss": 0.2617, + "epoch": 4.129635636285004, + "grad_norm": 0.24169829487800598, + "learning_rate": 3.881500767295201e-06, + "loss": 0.363, "step": 114585 }, { - "epoch": 4.03, - "learning_rate": 4.772833868821328e-06, - "loss": 0.2352, + "epoch": 4.1298158359462285, + "grad_norm": 0.24228541553020477, + "learning_rate": 3.8799391932545555e-06, + "loss": 0.3773, "step": 114590 }, { - "epoch": 4.03, - "learning_rate": 4.771159836897699e-06, - "loss": 0.2407, + "epoch": 4.129996035607453, + "grad_norm": 0.21988023817539215, + "learning_rate": 3.878377906970648e-06, + "loss": 0.3664, "step": 114595 }, { - "epoch": 4.03, - "learning_rate": 4.769486067632226e-06, - "loss": 0.2519, + "epoch": 4.130176235268678, + "grad_norm": 0.2830277979373932, + "learning_rate": 3.876816908464753e-06, + "loss": 0.4126, "step": 114600 }, { - "epoch": 4.03, - "learning_rate": 4.767812561046639e-06, - "loss": 0.2444, + "epoch": 4.130356434929903, + "grad_norm": 0.2855124771595001, + "learning_rate": 3.875256197758137e-06, + "loss": 0.3822, "step": 114605 }, { - "epoch": 4.03, - "learning_rate": 4.76613931716268e-06, - "loss": 0.2578, + "epoch": 4.130536634591127, + "grad_norm": 0.314847856760025, + "learning_rate": 3.873695774872066e-06, + "loss": 0.3807, "step": 114610 }, { - "epoch": 4.03, - "learning_rate": 4.764466336002066e-06, - "loss": 0.2422, + "epoch": 4.130716834252351, + "grad_norm": 0.26585081219673157, + "learning_rate": 3.872135639827795e-06, + "loss": 
0.3741, "step": 114615 }, { - "epoch": 4.03, - "learning_rate": 4.762793617586514e-06, - "loss": 0.2292, + "epoch": 4.130897033913576, + "grad_norm": 0.2731850743293762, + "learning_rate": 3.870575792646591e-06, + "loss": 0.3896, "step": 114620 }, { - "epoch": 4.03, - "learning_rate": 4.761121161937751e-06, - "loss": 0.2377, + "epoch": 4.131077233574801, + "grad_norm": 0.2461189180612564, + "learning_rate": 3.8690162333497006e-06, + "loss": 0.4125, "step": 114625 }, { - "epoch": 4.03, - "learning_rate": 4.759448969077496e-06, - "loss": 0.2634, + "epoch": 4.1312574332360255, + "grad_norm": 0.2130453884601593, + "learning_rate": 3.867456961958374e-06, + "loss": 0.3652, "step": 114630 }, { - "epoch": 4.03, - "learning_rate": 4.757777039027458e-06, - "loss": 0.2383, + "epoch": 4.13143763289725, + "grad_norm": 0.2346867322921753, + "learning_rate": 3.865897978493851e-06, + "loss": 0.3495, "step": 114635 }, { - "epoch": 4.03, - "learning_rate": 4.756105371809347e-06, - "loss": 0.2616, + "epoch": 4.131617832558475, + "grad_norm": 0.23216021060943604, + "learning_rate": 3.864339282977375e-06, + "loss": 0.367, "step": 114640 }, { - "epoch": 4.03, - "learning_rate": 4.75443396744486e-06, - "loss": 0.2624, + "epoch": 4.1317980322197, + "grad_norm": 0.28052276372909546, + "learning_rate": 3.862780875430188e-06, + "loss": 0.3296, "step": 114645 }, { - "epoch": 4.03, - "learning_rate": 4.752762825955711e-06, - "loss": 0.2428, + "epoch": 4.131978231880924, + "grad_norm": 0.26213961839675903, + "learning_rate": 3.861222755873526e-06, + "loss": 0.4356, "step": 114650 }, { - "epoch": 4.03, - "learning_rate": 4.751091947363592e-06, - "loss": 0.2472, + "epoch": 4.132158431542149, + "grad_norm": 0.3032136857509613, + "learning_rate": 3.859664924328599e-06, + "loss": 0.3864, "step": 114655 }, { - "epoch": 4.03, - "learning_rate": 4.749421331690202e-06, - "loss": 0.2706, + "epoch": 4.132338631203373, + "grad_norm": 0.297122061252594, + "learning_rate": 3.85810738081665e-06, + "loss": 0.3805, "step": 114660 }, { - "epoch": 4.03, - "learning_rate": 4.747750978957221e-06, - "loss": 0.2544, + "epoch": 4.132518830864598, + "grad_norm": 0.1998542994260788, + "learning_rate": 3.85655012535889e-06, + "loss": 0.3474, "step": 114665 }, { - "epoch": 4.03, - "learning_rate": 4.746080889186355e-06, - "loss": 0.2538, + "epoch": 4.132699030525822, + "grad_norm": 0.21667265892028809, + "learning_rate": 3.854993157976553e-06, + "loss": 0.3602, "step": 114670 }, { - "epoch": 4.03, - "learning_rate": 4.744411062399276e-06, - "loss": 0.2644, + "epoch": 4.132879230187047, + "grad_norm": 0.2543310523033142, + "learning_rate": 3.853436478690833e-06, + "loss": 0.3556, "step": 114675 }, { - "epoch": 4.03, - "learning_rate": 4.742741498617676e-06, - "loss": 0.2504, + "epoch": 4.133059429848272, + "grad_norm": 0.2114337682723999, + "learning_rate": 3.851880087522941e-06, + "loss": 0.3946, "step": 114680 }, { - "epoch": 4.03, - "learning_rate": 4.74107219786322e-06, - "loss": 0.2516, + "epoch": 4.133239629509497, + "grad_norm": 0.2727842330932617, + "learning_rate": 3.850323984494095e-06, + "loss": 0.3585, "step": 114685 }, { - "epoch": 4.04, - "learning_rate": 4.739403160157596e-06, - "loss": 0.2469, + "epoch": 4.133419829170721, + "grad_norm": 0.27654939889907837, + "learning_rate": 3.848768169625491e-06, + "loss": 0.362, "step": 114690 }, { - "epoch": 4.04, - "learning_rate": 4.737734385522471e-06, - "loss": 0.2598, + "epoch": 4.133600028831946, + "grad_norm": 0.26321300864219666, + "learning_rate": 3.8472126429383246e-06, + "loss": 
0.3651, "step": 114695 }, { - "epoch": 4.04, - "learning_rate": 4.736065873979506e-06, - "loss": 0.2376, + "epoch": 4.133780228493171, + "grad_norm": 0.23769168555736542, + "learning_rate": 3.84565740445379e-06, + "loss": 0.3345, "step": 114700 }, { - "epoch": 4.04, - "learning_rate": 4.734397625550377e-06, - "loss": 0.2479, + "epoch": 4.133960428154395, + "grad_norm": 0.2931036353111267, + "learning_rate": 3.844102454193072e-06, + "loss": 0.3837, "step": 114705 }, { - "epoch": 4.04, - "learning_rate": 4.732729640256739e-06, - "loss": 0.2631, + "epoch": 4.134140627815619, + "grad_norm": 0.24765877425670624, + "learning_rate": 3.84254779217737e-06, + "loss": 0.3692, "step": 114710 }, { - "epoch": 4.04, - "learning_rate": 4.7310619181202495e-06, - "loss": 0.2706, + "epoch": 4.134320827476844, + "grad_norm": 0.23711945116519928, + "learning_rate": 3.840993418427855e-06, + "loss": 0.3612, "step": 114715 }, { - "epoch": 4.04, - "learning_rate": 4.729394459162556e-06, - "loss": 0.2574, + "epoch": 4.134501027138069, + "grad_norm": 0.23325876891613007, + "learning_rate": 3.839439332965711e-06, + "loss": 0.3809, "step": 114720 }, { - "epoch": 4.04, - "learning_rate": 4.727727263405326e-06, - "loss": 0.2489, + "epoch": 4.134681226799294, + "grad_norm": 0.2400001585483551, + "learning_rate": 3.837885535812108e-06, + "loss": 0.3972, "step": 114725 }, { - "epoch": 4.04, - "learning_rate": 4.7260603308701885e-06, - "loss": 0.2596, + "epoch": 4.134861426460518, + "grad_norm": 0.24221131205558777, + "learning_rate": 3.836332026988218e-06, + "loss": 0.3731, "step": 114730 }, { - "epoch": 4.04, - "learning_rate": 4.7243936615788045e-06, - "loss": 0.2392, + "epoch": 4.135041626121743, + "grad_norm": 0.22964808344841003, + "learning_rate": 3.834778806515199e-06, + "loss": 0.3772, "step": 114735 }, { - "epoch": 4.04, - "learning_rate": 4.7227272555528e-06, - "loss": 0.2669, + "epoch": 4.135221825782968, + "grad_norm": 0.2366577684879303, + "learning_rate": 3.833225874414228e-06, + "loss": 0.3839, "step": 114740 }, { - "epoch": 4.04, - "learning_rate": 4.721061112813827e-06, - "loss": 0.2326, + "epoch": 4.1354020254441926, + "grad_norm": 0.281572550535202, + "learning_rate": 3.8316732307064566e-06, + "loss": 0.3524, "step": 114745 }, { - "epoch": 4.04, - "learning_rate": 4.719395233383509e-06, - "loss": 0.2521, + "epoch": 4.135582225105416, + "grad_norm": 0.24455080926418304, + "learning_rate": 3.83012087541304e-06, + "loss": 0.3965, "step": 114750 }, { - "epoch": 4.04, - "learning_rate": 4.717729617283478e-06, - "loss": 0.2751, + "epoch": 4.135762424766641, + "grad_norm": 0.2673299312591553, + "learning_rate": 3.828568808555127e-06, + "loss": 0.3553, "step": 114755 }, { - "epoch": 4.04, - "learning_rate": 4.716064264535356e-06, - "loss": 0.2576, + "epoch": 4.135942624427866, + "grad_norm": 0.29149943590164185, + "learning_rate": 3.827017030153859e-06, + "loss": 0.3808, "step": 114760 }, { - "epoch": 4.04, - "learning_rate": 4.714399175160777e-06, - "loss": 0.2684, + "epoch": 4.136122824089091, + "grad_norm": 0.2999207377433777, + "learning_rate": 3.825465540230397e-06, + "loss": 0.4054, "step": 114765 }, { - "epoch": 4.04, - "learning_rate": 4.712734349181358e-06, - "loss": 0.2575, + "epoch": 4.136303023750315, + "grad_norm": 0.2576257288455963, + "learning_rate": 3.823914338805856e-06, + "loss": 0.3723, "step": 114770 }, { - "epoch": 4.04, - "learning_rate": 4.7110697866187135e-06, - "loss": 0.248, + "epoch": 4.13648322341154, + "grad_norm": 0.2315170168876648, + "learning_rate": 3.822363425901388e-06, + 
"loss": 0.3923, "step": 114775 }, { - "epoch": 4.04, - "learning_rate": 4.709405487494447e-06, - "loss": 0.2375, + "epoch": 4.136663423072765, + "grad_norm": 0.24673189222812653, + "learning_rate": 3.8208128015381176e-06, + "loss": 0.3754, "step": 114780 }, { - "epoch": 4.04, - "learning_rate": 4.707741451830183e-06, - "loss": 0.2677, + "epoch": 4.1368436227339895, + "grad_norm": 0.24493588507175446, + "learning_rate": 3.8192624657371725e-06, + "loss": 0.3749, "step": 114785 }, { - "epoch": 4.04, - "learning_rate": 4.706077679647528e-06, - "loss": 0.2777, + "epoch": 4.137023822395214, + "grad_norm": 0.2173871397972107, + "learning_rate": 3.817712418519676e-06, + "loss": 0.3839, "step": 114790 }, { - "epoch": 4.04, - "learning_rate": 4.704414170968077e-06, - "loss": 0.2592, + "epoch": 4.137204022056438, + "grad_norm": 0.3121466040611267, + "learning_rate": 3.816162659906747e-06, + "loss": 0.3741, "step": 114795 }, { - "epoch": 4.04, - "learning_rate": 4.702750925813432e-06, - "loss": 0.252, + "epoch": 4.137384221717663, + "grad_norm": 0.2459418624639511, + "learning_rate": 3.8146131899194942e-06, + "loss": 0.3915, "step": 114800 }, { - "epoch": 4.04, - "learning_rate": 4.70108794420519e-06, - "loss": 0.2279, + "epoch": 4.137564421378888, + "grad_norm": 0.2291506677865982, + "learning_rate": 3.8130640085790396e-06, + "loss": 0.3682, "step": 114805 }, { - "epoch": 4.04, - "learning_rate": 4.69942522616495e-06, - "loss": 0.2586, + "epoch": 4.137744621040112, + "grad_norm": 0.2138427197933197, + "learning_rate": 3.811515115906489e-06, + "loss": 0.3368, "step": 114810 }, { - "epoch": 4.04, - "learning_rate": 4.697762771714284e-06, - "loss": 0.2462, + "epoch": 4.137924820701337, + "grad_norm": 0.28790679574012756, + "learning_rate": 3.8099665119229382e-06, + "loss": 0.3658, "step": 114815 }, { - "epoch": 4.04, - "learning_rate": 4.696100580874796e-06, - "loss": 0.2519, + "epoch": 4.138105020362562, + "grad_norm": 0.2095593363046646, + "learning_rate": 3.8084181966494966e-06, + "loss": 0.3348, "step": 114820 }, { - "epoch": 4.04, - "learning_rate": 4.694438653668062e-06, - "loss": 0.2567, + "epoch": 4.1382852200237865, + "grad_norm": 0.28139370679855347, + "learning_rate": 3.8068701701072436e-06, + "loss": 0.3597, "step": 114825 }, { - "epoch": 4.04, - "learning_rate": 4.692776990115661e-06, - "loss": 0.2444, + "epoch": 4.138465419685011, + "grad_norm": 0.2298731952905655, + "learning_rate": 3.8053224323172904e-06, + "loss": 0.3687, "step": 114830 }, { - "epoch": 4.04, - "learning_rate": 4.691115590239159e-06, - "loss": 0.2809, + "epoch": 4.138645619346236, + "grad_norm": 0.2664845287799835, + "learning_rate": 3.8037749833007203e-06, + "loss": 0.3159, "step": 114835 }, { - "epoch": 4.04, - "learning_rate": 4.689454454060138e-06, - "loss": 0.2277, + "epoch": 4.13882581900746, + "grad_norm": 0.21755090355873108, + "learning_rate": 3.8022278230785996e-06, + "loss": 0.3509, "step": 114840 }, { - "epoch": 4.04, - "learning_rate": 4.6877935816001725e-06, - "loss": 0.2642, + "epoch": 4.139006018668685, + "grad_norm": 0.29724082350730896, + "learning_rate": 3.8006809516720282e-06, + "loss": 0.3392, "step": 114845 }, { - "epoch": 4.04, - "learning_rate": 4.686132972880819e-06, - "loss": 0.2511, + "epoch": 4.139186218329909, + "grad_norm": 0.23176833987236023, + "learning_rate": 3.79913436910207e-06, + "loss": 0.3852, "step": 114850 }, { - "epoch": 4.04, - "learning_rate": 4.684472627923639e-06, - "loss": 0.236, + "epoch": 4.139366417991134, + "grad_norm": 0.2422746866941452, + "learning_rate": 
3.7975880753898045e-06, + "loss": 0.3726, "step": 114855 }, { - "epoch": 4.04, - "learning_rate": 4.682812546750198e-06, - "loss": 0.2635, + "epoch": 4.139546617652359, + "grad_norm": 0.23337143659591675, + "learning_rate": 3.796042070556302e-06, + "loss": 0.3838, "step": 114860 }, { - "epoch": 4.04, - "learning_rate": 4.681152729382046e-06, - "loss": 0.2551, + "epoch": 4.1397268173135835, + "grad_norm": 0.2680877447128296, + "learning_rate": 3.794496354622612e-06, + "loss": 0.4008, "step": 114865 }, { - "epoch": 4.04, - "learning_rate": 4.6794931758407354e-06, - "loss": 0.2644, + "epoch": 4.139907016974808, + "grad_norm": 0.26528939604759216, + "learning_rate": 3.792950927609809e-06, + "loss": 0.389, "step": 114870 }, { - "epoch": 4.04, - "learning_rate": 4.677833886147809e-06, - "loss": 0.2385, + "epoch": 4.140087216636033, + "grad_norm": 0.2818238139152527, + "learning_rate": 3.7914057895389433e-06, + "loss": 0.4002, "step": 114875 }, { - "epoch": 4.04, - "learning_rate": 4.676174860324822e-06, - "loss": 0.2558, + "epoch": 4.140267416297258, + "grad_norm": 0.2836932837963104, + "learning_rate": 3.789860940431067e-06, + "loss": 0.3686, "step": 114880 }, { - "epoch": 4.04, - "learning_rate": 4.674516098393311e-06, - "loss": 0.2599, + "epoch": 4.140447615958482, + "grad_norm": 0.22321908175945282, + "learning_rate": 3.7883163803072277e-06, + "loss": 0.3532, "step": 114885 }, { - "epoch": 4.04, - "learning_rate": 4.672857600374805e-06, - "loss": 0.2463, + "epoch": 4.140627815619706, + "grad_norm": 0.18690916895866394, + "learning_rate": 3.786772109188466e-06, + "loss": 0.3564, "step": 114890 }, { - "epoch": 4.04, - "learning_rate": 4.671199366290849e-06, - "loss": 0.2445, + "epoch": 4.140808015280931, + "grad_norm": 0.21489687263965607, + "learning_rate": 3.7852281270958357e-06, + "loss": 0.3558, "step": 114895 }, { - "epoch": 4.04, - "learning_rate": 4.66954139616298e-06, - "loss": 0.2477, + "epoch": 4.140988214942156, + "grad_norm": 0.2863061726093292, + "learning_rate": 3.7836844340503637e-06, + "loss": 0.3493, "step": 114900 }, { - "epoch": 4.04, - "learning_rate": 4.6678836900127145e-06, - "loss": 0.2558, + "epoch": 4.1411684146033805, + "grad_norm": 0.22299538552761078, + "learning_rate": 3.782141030073083e-06, + "loss": 0.356, "step": 114905 }, { - "epoch": 4.04, - "learning_rate": 4.666226247861583e-06, - "loss": 0.2577, + "epoch": 4.141348614264605, + "grad_norm": 0.2980153262615204, + "learning_rate": 3.7805979151850217e-06, + "loss": 0.3812, "step": 114910 }, { - "epoch": 4.04, - "learning_rate": 4.6645690697310925e-06, - "loss": 0.2541, + "epoch": 4.14152881392583, + "grad_norm": 0.20422306656837463, + "learning_rate": 3.7790550894072073e-06, + "loss": 0.362, "step": 114915 }, { - "epoch": 4.04, - "learning_rate": 4.662912155642782e-06, - "loss": 0.2404, + "epoch": 4.141709013587055, + "grad_norm": 0.2799051105976105, + "learning_rate": 3.7775125527606534e-06, + "loss": 0.37, "step": 114920 }, { - "epoch": 4.04, - "learning_rate": 4.661255505618153e-06, - "loss": 0.2522, + "epoch": 4.141889213248279, + "grad_norm": 0.2169763743877411, + "learning_rate": 3.775970305266385e-06, + "loss": 0.3863, "step": 114925 }, { - "epoch": 4.04, - "learning_rate": 4.659599119678709e-06, - "loss": 0.2446, + "epoch": 4.142069412909504, + "grad_norm": 0.27121099829673767, + "learning_rate": 3.774428346945416e-06, + "loss": 0.3563, "step": 114930 }, { - "epoch": 4.04, - "learning_rate": 4.657942997845974e-06, - "loss": 0.2606, + "epoch": 4.142249612570728, + "grad_norm": 0.28678664565086365, + 
"learning_rate": 3.772886677818749e-06, + "loss": 0.4166, "step": 114935 }, { - "epoch": 4.04, - "learning_rate": 4.656287140141444e-06, - "loss": 0.248, + "epoch": 4.142429812231953, + "grad_norm": 0.24329231679439545, + "learning_rate": 3.7713452979073913e-06, + "loss": 0.372, "step": 114940 }, { - "epoch": 4.04, - "learning_rate": 4.65463154658661e-06, - "loss": 0.2571, + "epoch": 4.1426100118931775, + "grad_norm": 0.25663426518440247, + "learning_rate": 3.7698042072323386e-06, + "loss": 0.3657, "step": 114945 }, { - "epoch": 4.04, - "learning_rate": 4.652976217202987e-06, - "loss": 0.2576, + "epoch": 4.142790211554402, + "grad_norm": 0.29921120405197144, + "learning_rate": 3.7682634058146065e-06, + "loss": 0.3919, "step": 114950 }, { - "epoch": 4.04, - "learning_rate": 4.651321152012048e-06, - "loss": 0.2447, + "epoch": 4.142970411215627, + "grad_norm": 0.25063279271125793, + "learning_rate": 3.766722893675162e-06, + "loss": 0.3886, "step": 114955 }, { - "epoch": 4.04, - "learning_rate": 4.649666351035304e-06, - "loss": 0.2749, + "epoch": 4.143150610876852, + "grad_norm": 0.24971787631511688, + "learning_rate": 3.76518267083501e-06, + "loss": 0.3546, "step": 114960 }, { - "epoch": 4.04, - "learning_rate": 4.648011814294229e-06, - "loss": 0.2494, + "epoch": 4.143330810538076, + "grad_norm": 0.25325024127960205, + "learning_rate": 3.763642737315137e-06, + "loss": 0.3863, "step": 114965 }, { - "epoch": 4.04, - "learning_rate": 4.646357541810301e-06, - "loss": 0.2668, + "epoch": 4.143511010199301, + "grad_norm": 0.24295130372047424, + "learning_rate": 3.762103093136518e-06, + "loss": 0.3941, "step": 114970 }, { - "epoch": 4.05, - "learning_rate": 4.644703533605013e-06, - "loss": 0.2353, + "epoch": 4.143691209860526, + "grad_norm": 0.19150157272815704, + "learning_rate": 3.7605637383201357e-06, + "loss": 0.3463, "step": 114975 }, { - "epoch": 4.05, - "learning_rate": 4.643049789699835e-06, - "loss": 0.2493, + "epoch": 4.14387140952175, + "grad_norm": 0.24536427855491638, + "learning_rate": 3.759024672886957e-06, + "loss": 0.3587, "step": 114980 }, { - "epoch": 4.05, - "learning_rate": 4.641396310116241e-06, - "loss": 0.2338, + "epoch": 4.144051609182974, + "grad_norm": 0.3299379348754883, + "learning_rate": 3.7574858968579513e-06, + "loss": 0.3832, "step": 114985 }, { - "epoch": 4.05, - "learning_rate": 4.639743094875693e-06, - "loss": 0.2765, + "epoch": 4.144231808844199, + "grad_norm": 0.2293405532836914, + "learning_rate": 3.7559474102540905e-06, + "loss": 0.3422, "step": 114990 }, { - "epoch": 4.05, - "learning_rate": 4.638090143999668e-06, - "loss": 0.2424, + "epoch": 4.144412008505424, + "grad_norm": 0.29431861639022827, + "learning_rate": 3.7544092130963355e-06, + "loss": 0.3949, "step": 114995 }, { - "epoch": 4.05, - "learning_rate": 4.636437457509618e-06, - "loss": 0.2276, + "epoch": 4.144592208166649, + "grad_norm": 0.23299720883369446, + "learning_rate": 3.752871305405642e-06, + "loss": 0.3659, "step": 115000 }, { - "epoch": 4.05, - "eval_loss": 0.24977658689022064, - "eval_runtime": 10.5663, - "eval_samples_per_second": 9.464, - "eval_steps_per_second": 9.464, + "epoch": 4.144592208166649, + "eval_loss": 0.4293991029262543, + "eval_runtime": 3.5316, + "eval_samples_per_second": 28.316, + "eval_steps_per_second": 7.079, "step": 115000 }, { - "epoch": 4.05, - "learning_rate": 4.634785035427014e-06, - "loss": 0.2404, + "epoch": 4.144772407827873, + "grad_norm": 0.23517471551895142, + "learning_rate": 3.7513336872029625e-06, + "loss": 0.381, "step": 115005 }, { - "epoch": 4.05, - 
"learning_rate": 4.6331328777733e-06, - "loss": 0.2405, + "epoch": 4.144952607489098, + "grad_norm": 0.24651336669921875, + "learning_rate": 3.7497963585092415e-06, + "loss": 0.4108, "step": 115010 }, { - "epoch": 4.05, - "learning_rate": 4.6314809845699355e-06, - "loss": 0.2592, + "epoch": 4.145132807150323, + "grad_norm": 0.2341834455728531, + "learning_rate": 3.7482593193454375e-06, + "loss": 0.3443, "step": 115015 }, { - "epoch": 4.05, - "learning_rate": 4.629829355838372e-06, - "loss": 0.2474, + "epoch": 4.145313006811548, + "grad_norm": 0.2706274092197418, + "learning_rate": 3.7467225697324833e-06, + "loss": 0.3762, "step": 115020 }, { - "epoch": 4.05, - "learning_rate": 4.628177991600047e-06, - "loss": 0.2443, + "epoch": 4.145493206472771, + "grad_norm": 0.2560116946697235, + "learning_rate": 3.7451861096913234e-06, + "loss": 0.3493, "step": 115025 }, { - "epoch": 4.05, - "learning_rate": 4.626526891876398e-06, - "loss": 0.2654, + "epoch": 4.145673406133996, + "grad_norm": 0.23122893273830414, + "learning_rate": 3.7436499392428855e-06, + "loss": 0.3732, "step": 115030 }, { - "epoch": 4.05, - "learning_rate": 4.624876056688879e-06, - "loss": 0.2398, + "epoch": 4.145853605795221, + "grad_norm": 0.2816644012928009, + "learning_rate": 3.7421140584080942e-06, + "loss": 0.378, "step": 115035 }, { - "epoch": 4.05, - "learning_rate": 4.623225486058918e-06, - "loss": 0.2607, + "epoch": 4.146033805456446, + "grad_norm": 0.3167472183704376, + "learning_rate": 3.740578467207892e-06, + "loss": 0.4059, "step": 115040 }, { - "epoch": 4.05, - "learning_rate": 4.621575180007937e-06, - "loss": 0.2391, + "epoch": 4.14621400511767, + "grad_norm": 0.22045278549194336, + "learning_rate": 3.7390431656631975e-06, + "loss": 0.3764, "step": 115045 }, { - "epoch": 4.05, - "learning_rate": 4.61992513855738e-06, - "loss": 0.2658, + "epoch": 4.146394204778895, + "grad_norm": 0.25917506217956543, + "learning_rate": 3.737508153794911e-06, + "loss": 0.3721, "step": 115050 }, { - "epoch": 4.05, - "learning_rate": 4.61827536172866e-06, - "loss": 0.2454, + "epoch": 4.14657440444012, + "grad_norm": 0.2379886656999588, + "learning_rate": 3.7359734316239653e-06, + "loss": 0.3326, "step": 115055 }, { - "epoch": 4.05, - "learning_rate": 4.616625849543199e-06, - "loss": 0.271, + "epoch": 4.1467546041013446, + "grad_norm": 0.3082965612411499, + "learning_rate": 3.7344389991712666e-06, + "loss": 0.339, "step": 115060 }, { - "epoch": 4.05, - "learning_rate": 4.61497660202242e-06, - "loss": 0.2539, + "epoch": 4.146934803762569, + "grad_norm": 0.25842636823654175, + "learning_rate": 3.732904856457717e-06, + "loss": 0.3674, "step": 115065 }, { - "epoch": 4.05, - "learning_rate": 4.6133276191877405e-06, - "loss": 0.2412, + "epoch": 4.147115003423793, + "grad_norm": 0.2803257405757904, + "learning_rate": 3.731371003504222e-06, + "loss": 0.3743, "step": 115070 }, { - "epoch": 4.05, - "learning_rate": 4.611678901060565e-06, - "loss": 0.2474, + "epoch": 4.147295203085018, + "grad_norm": 0.1919623762369156, + "learning_rate": 3.7298374403316737e-06, + "loss": 0.3497, "step": 115075 }, { - "epoch": 4.05, - "learning_rate": 4.610030447662303e-06, - "loss": 0.2609, + "epoch": 4.147475402746243, + "grad_norm": 0.25834715366363525, + "learning_rate": 3.7283041669609775e-06, + "loss": 0.3668, "step": 115080 }, { - "epoch": 4.05, - "learning_rate": 4.608382259014355e-06, - "loss": 0.2499, + "epoch": 4.147655602407467, + "grad_norm": 0.2559783458709717, + "learning_rate": 3.7267711834130194e-06, + "loss": 0.3553, "step": 115085 }, { - "epoch": 
4.05, - "learning_rate": 4.606734335138127e-06, - "loss": 0.2305, + "epoch": 4.147835802068692, + "grad_norm": 0.30941301584243774, + "learning_rate": 3.725238489708685e-06, + "loss": 0.3845, "step": 115090 }, { - "epoch": 4.05, - "learning_rate": 4.6050866760550145e-06, - "loss": 0.2512, + "epoch": 4.148016001729917, + "grad_norm": 0.22858378291130066, + "learning_rate": 3.7237060858688584e-06, + "loss": 0.3625, "step": 115095 }, { - "epoch": 4.05, - "learning_rate": 4.6034392817864145e-06, - "loss": 0.2668, + "epoch": 4.1481962013911415, + "grad_norm": 0.2147953361272812, + "learning_rate": 3.722173971914417e-06, + "loss": 0.3643, "step": 115100 }, { - "epoch": 4.05, - "learning_rate": 4.601792152353704e-06, - "loss": 0.2516, + "epoch": 4.148376401052366, + "grad_norm": 0.3191777169704437, + "learning_rate": 3.7206421478662294e-06, + "loss": 0.3437, "step": 115105 }, { - "epoch": 4.05, - "learning_rate": 4.600145287778285e-06, - "loss": 0.248, + "epoch": 4.148556600713591, + "grad_norm": 0.2847544848918915, + "learning_rate": 3.7191106137451776e-06, + "loss": 0.3535, "step": 115110 }, { - "epoch": 4.05, - "learning_rate": 4.598498688081532e-06, - "loss": 0.2313, + "epoch": 4.148736800374815, + "grad_norm": 0.3304956555366516, + "learning_rate": 3.7175793695721244e-06, + "loss": 0.3658, "step": 115115 }, { - "epoch": 4.05, - "learning_rate": 4.596852353284834e-06, - "loss": 0.2647, + "epoch": 4.14891700003604, + "grad_norm": 0.2144591212272644, + "learning_rate": 3.7160484153679313e-06, + "loss": 0.3673, "step": 115120 }, { - "epoch": 4.05, - "learning_rate": 4.595206283409554e-06, - "loss": 0.2435, + "epoch": 4.149097199697264, + "grad_norm": 0.2552328407764435, + "learning_rate": 3.714517751153457e-06, + "loss": 0.3804, "step": 115125 }, { - "epoch": 4.05, - "learning_rate": 4.59356047847708e-06, - "loss": 0.2797, + "epoch": 4.149277399358489, + "grad_norm": 0.19830626249313354, + "learning_rate": 3.7129873769495534e-06, + "loss": 0.3935, "step": 115130 }, { - "epoch": 4.05, - "learning_rate": 4.591914938508776e-06, - "loss": 0.2365, + "epoch": 4.149457599019714, + "grad_norm": 0.22259441018104553, + "learning_rate": 3.7114572927770875e-06, + "loss": 0.3558, "step": 115135 }, { - "epoch": 4.05, - "learning_rate": 4.590269663526006e-06, - "loss": 0.2377, + "epoch": 4.1496377986809385, + "grad_norm": 0.26004859805107117, + "learning_rate": 3.709927498656887e-06, + "loss": 0.3926, "step": 115140 }, { - "epoch": 4.05, - "learning_rate": 4.5886246535501275e-06, - "loss": 0.2622, + "epoch": 4.149817998342163, + "grad_norm": 0.2216162085533142, + "learning_rate": 3.708397994609797e-06, + "loss": 0.3725, "step": 115145 }, { - "epoch": 4.05, - "learning_rate": 4.586979908602515e-06, - "loss": 0.2367, + "epoch": 4.149998198003388, + "grad_norm": 0.23738591372966766, + "learning_rate": 3.7068687806566694e-06, + "loss": 0.3529, "step": 115150 }, { - "epoch": 4.05, - "learning_rate": 4.585335428704512e-06, - "loss": 0.254, + "epoch": 4.150178397664613, + "grad_norm": 0.2530551254749298, + "learning_rate": 3.705339856818324e-06, + "loss": 0.3268, "step": 115155 }, { - "epoch": 4.05, - "learning_rate": 4.583691213877472e-06, - "loss": 0.2522, + "epoch": 4.1503585973258375, + "grad_norm": 0.20809151232242584, + "learning_rate": 3.7038112231156134e-06, + "loss": 0.3447, "step": 115160 }, { - "epoch": 4.05, - "learning_rate": 4.5820472641427495e-06, - "loss": 0.249, + "epoch": 4.150538796987061, + "grad_norm": 0.22564588487148285, + "learning_rate": 3.7022828795693467e-06, + "loss": 0.3774, "step": 
115165 }, { - "epoch": 4.05, - "learning_rate": 4.580403579521683e-06, - "loss": 0.2377, + "epoch": 4.150718996648286, + "grad_norm": 0.2013123780488968, + "learning_rate": 3.700754826200345e-06, + "loss": 0.3752, "step": 115170 }, { - "epoch": 4.05, - "learning_rate": 4.578760160035625e-06, - "loss": 0.2376, + "epoch": 4.150899196309511, + "grad_norm": 0.29589053988456726, + "learning_rate": 3.6992270630294396e-06, + "loss": 0.345, "step": 115175 }, { - "epoch": 4.05, - "learning_rate": 4.577117005705902e-06, - "loss": 0.2611, + "epoch": 4.1510793959707355, + "grad_norm": 0.2514277994632721, + "learning_rate": 3.697699590077444e-06, + "loss": 0.4192, "step": 115180 }, { - "epoch": 4.05, - "learning_rate": 4.575474116553863e-06, - "loss": 0.2428, + "epoch": 4.15125959563196, + "grad_norm": 0.23917201161384583, + "learning_rate": 3.696172407365167e-06, + "loss": 0.3958, "step": 115185 }, { - "epoch": 4.05, - "learning_rate": 4.573831492600831e-06, - "loss": 0.283, + "epoch": 4.151439795293185, + "grad_norm": 0.3023105561733246, + "learning_rate": 3.694645514913417e-06, + "loss": 0.3855, "step": 115190 }, { - "epoch": 4.05, - "learning_rate": 4.5721891338681345e-06, - "loss": 0.2481, + "epoch": 4.15161999495441, + "grad_norm": 0.20139765739440918, + "learning_rate": 3.693118912742988e-06, + "loss": 0.3553, "step": 115195 }, { - "epoch": 4.05, - "learning_rate": 4.570547040377096e-06, - "loss": 0.2675, + "epoch": 4.151800194615634, + "grad_norm": 0.25707682967185974, + "learning_rate": 3.6915926008746964e-06, + "loss": 0.3776, "step": 115200 }, { - "epoch": 4.05, - "learning_rate": 4.568905212149047e-06, - "loss": 0.2363, + "epoch": 4.151980394276859, + "grad_norm": 0.2819069027900696, + "learning_rate": 3.690066579329332e-06, + "loss": 0.4, "step": 115205 }, { - "epoch": 4.05, - "learning_rate": 4.567263649205297e-06, - "loss": 0.2414, + "epoch": 4.152160593938083, + "grad_norm": 0.24341073632240295, + "learning_rate": 3.6885408481276833e-06, + "loss": 0.3162, "step": 115210 }, { - "epoch": 4.05, - "learning_rate": 4.565622351567164e-06, - "loss": 0.2436, + "epoch": 4.152340793599308, + "grad_norm": 0.262385755777359, + "learning_rate": 3.6870154072905396e-06, + "loss": 0.4045, "step": 115215 }, { - "epoch": 4.05, - "learning_rate": 4.56398131925595e-06, - "loss": 0.2647, + "epoch": 4.1525209932605325, + "grad_norm": 0.2288028746843338, + "learning_rate": 3.6854902568386777e-06, + "loss": 0.3662, "step": 115220 }, { - "epoch": 4.05, - "learning_rate": 4.5623405522929735e-06, - "loss": 0.2445, + "epoch": 4.152701192921757, + "grad_norm": 0.3026867210865021, + "learning_rate": 3.6839653967928907e-06, + "loss": 0.3821, "step": 115225 }, { - "epoch": 4.05, - "learning_rate": 4.56070005069954e-06, - "loss": 0.2502, + "epoch": 4.152881392582982, + "grad_norm": 0.2670912742614746, + "learning_rate": 3.6824408271739552e-06, + "loss": 0.3924, "step": 115230 }, { - "epoch": 4.05, - "learning_rate": 4.559059814496947e-06, - "loss": 0.2407, + "epoch": 4.153061592244207, + "grad_norm": 0.2640323340892792, + "learning_rate": 3.6809165480026246e-06, + "loss": 0.3696, "step": 115235 }, { - "epoch": 4.05, - "learning_rate": 4.557419843706484e-06, - "loss": 0.2386, + "epoch": 4.153241791905431, + "grad_norm": 0.272617369890213, + "learning_rate": 3.6793925592996825e-06, + "loss": 0.3597, "step": 115240 }, { - "epoch": 4.05, - "learning_rate": 4.555780138349461e-06, - "loss": 0.2654, + "epoch": 4.153421991566656, + "grad_norm": 0.2459690123796463, + "learning_rate": 3.6778688610858896e-06, + "loss": 0.3587, 
"step": 115245 }, { - "epoch": 4.05, - "learning_rate": 4.55414069844716e-06, - "loss": 0.2552, + "epoch": 4.153602191227881, + "grad_norm": 0.22347575426101685, + "learning_rate": 3.6763454533820075e-06, + "loss": 0.3799, "step": 115250 }, { - "epoch": 4.05, - "learning_rate": 4.552501524020863e-06, - "loss": 0.2426, + "epoch": 4.153782390889105, + "grad_norm": 0.2681044042110443, + "learning_rate": 3.6748223362087886e-06, + "loss": 0.3968, "step": 115255 }, { - "epoch": 4.06, - "learning_rate": 4.550862615091855e-06, - "loss": 0.2663, + "epoch": 4.1539625905503295, + "grad_norm": 0.2700466215610504, + "learning_rate": 3.6732995095869806e-06, + "loss": 0.3832, "step": 115260 }, { - "epoch": 4.06, - "learning_rate": 4.549223971681426e-06, - "loss": 0.2714, + "epoch": 4.154142790211554, + "grad_norm": 0.24011053144931793, + "learning_rate": 3.671776973537344e-06, + "loss": 0.3449, "step": 115265 }, { - "epoch": 4.06, - "learning_rate": 4.547585593810846e-06, - "loss": 0.2595, + "epoch": 4.154322989872779, + "grad_norm": 0.26308053731918335, + "learning_rate": 3.670254728080616e-06, + "loss": 0.3731, "step": 115270 }, { - "epoch": 4.06, - "learning_rate": 4.545947481501384e-06, - "loss": 0.2411, + "epoch": 4.154503189534004, + "grad_norm": 0.19285693764686584, + "learning_rate": 3.6687327732375376e-06, + "loss": 0.3644, "step": 115275 }, { - "epoch": 4.06, - "learning_rate": 4.544309634774313e-06, - "loss": 0.2261, + "epoch": 4.154683389195228, + "grad_norm": 0.327217698097229, + "learning_rate": 3.667211109028848e-06, + "loss": 0.4314, "step": 115280 }, { - "epoch": 4.06, - "learning_rate": 4.542672053650904e-06, - "loss": 0.2438, + "epoch": 4.154863588856453, + "grad_norm": 0.22068528831005096, + "learning_rate": 3.665689735475275e-06, + "loss": 0.3825, "step": 115285 }, { - "epoch": 4.06, - "learning_rate": 4.541034738152419e-06, - "loss": 0.2659, + "epoch": 4.155043788517678, + "grad_norm": 0.23468096554279327, + "learning_rate": 3.664168652597541e-06, + "loss": 0.3646, "step": 115290 }, { - "epoch": 4.06, - "learning_rate": 4.539397688300109e-06, - "loss": 0.2445, + "epoch": 4.155223988178903, + "grad_norm": 0.2511104941368103, + "learning_rate": 3.6626478604163877e-06, + "loss": 0.3713, "step": 115295 }, { - "epoch": 4.06, - "learning_rate": 4.537760904115243e-06, - "loss": 0.2389, + "epoch": 4.155404187840126, + "grad_norm": 0.2715943157672882, + "learning_rate": 3.6611273589525236e-06, + "loss": 0.3386, "step": 115300 }, { - "epoch": 4.06, - "learning_rate": 4.5361243856190645e-06, - "loss": 0.2672, + "epoch": 4.155584387501351, + "grad_norm": 0.2659306526184082, + "learning_rate": 3.659607148226671e-06, + "loss": 0.3665, "step": 115305 }, { - "epoch": 4.06, - "learning_rate": 4.534488132832826e-06, - "loss": 0.2355, + "epoch": 4.155764587162576, + "grad_norm": 0.2604070007801056, + "learning_rate": 3.658087228259538e-06, + "loss": 0.4022, "step": 115310 }, { - "epoch": 4.06, - "learning_rate": 4.532852145777763e-06, - "loss": 0.2406, + "epoch": 4.155944786823801, + "grad_norm": 0.25114676356315613, + "learning_rate": 3.6565675990718277e-06, + "loss": 0.3772, "step": 115315 }, { - "epoch": 4.06, - "learning_rate": 4.531216424475135e-06, - "loss": 0.2558, + "epoch": 4.156124986485025, + "grad_norm": 0.25636595487594604, + "learning_rate": 3.6550482606842572e-06, + "loss": 0.3437, "step": 115320 }, { - "epoch": 4.06, - "learning_rate": 4.529580968946173e-06, - "loss": 0.235, + "epoch": 4.15630518614625, + "grad_norm": 0.23536555469036102, + "learning_rate": 3.653529213117529e-06, + 
"loss": 0.391, "step": 115325 }, { - "epoch": 4.06, - "learning_rate": 4.5279457792121034e-06, - "loss": 0.2618, + "epoch": 4.156485385807475, + "grad_norm": 0.26349982619285583, + "learning_rate": 3.6520104563923213e-06, + "loss": 0.3576, "step": 115330 }, { - "epoch": 4.06, - "learning_rate": 4.526310855294166e-06, - "loss": 0.2475, + "epoch": 4.1566655854687, + "grad_norm": 0.26209887862205505, + "learning_rate": 3.6504919905293422e-06, + "loss": 0.3723, "step": 115335 }, { - "epoch": 4.06, - "learning_rate": 4.524676197213598e-06, - "loss": 0.2574, + "epoch": 4.156845785129924, + "grad_norm": 0.21833176910877228, + "learning_rate": 3.6489738155492696e-06, + "loss": 0.3723, "step": 115340 }, { - "epoch": 4.06, - "learning_rate": 4.523041804991615e-06, - "loss": 0.235, + "epoch": 4.157025984791149, + "grad_norm": 0.29597973823547363, + "learning_rate": 3.6474559314728097e-06, + "loss": 0.3672, "step": 115345 }, { - "epoch": 4.06, - "learning_rate": 4.521407678649439e-06, - "loss": 0.2392, + "epoch": 4.157206184452373, + "grad_norm": 0.24725015461444855, + "learning_rate": 3.6459383383206202e-06, + "loss": 0.3619, "step": 115350 }, { - "epoch": 4.06, - "learning_rate": 4.519773818208284e-06, - "loss": 0.2594, + "epoch": 4.157386384113598, + "grad_norm": 0.29851892590522766, + "learning_rate": 3.6444210361133824e-06, + "loss": 0.3806, "step": 115355 }, { - "epoch": 4.06, - "learning_rate": 4.518140223689371e-06, - "loss": 0.249, + "epoch": 4.157566583774822, + "grad_norm": 0.2327483892440796, + "learning_rate": 3.6429040248717795e-06, + "loss": 0.3437, "step": 115360 }, { - "epoch": 4.06, - "learning_rate": 4.5165068951139115e-06, - "loss": 0.2587, + "epoch": 4.157746783436047, + "grad_norm": 0.23190578818321228, + "learning_rate": 3.6413873046164728e-06, + "loss": 0.3671, "step": 115365 }, { - "epoch": 4.06, - "learning_rate": 4.514873832503111e-06, - "loss": 0.2529, + "epoch": 4.157926983097272, + "grad_norm": 0.21011842787265778, + "learning_rate": 3.6398708753681286e-06, + "loss": 0.3481, "step": 115370 }, { - "epoch": 4.06, - "learning_rate": 4.513241035878168e-06, - "loss": 0.2408, + "epoch": 4.158107182758497, + "grad_norm": 0.3232230544090271, + "learning_rate": 3.638354737147412e-06, + "loss": 0.4116, "step": 115375 }, { - "epoch": 4.06, - "learning_rate": 4.511608505260295e-06, - "loss": 0.2692, + "epoch": 4.158287382419721, + "grad_norm": 0.23676584661006927, + "learning_rate": 3.6368388899749667e-06, + "loss": 0.3751, "step": 115380 }, { - "epoch": 4.06, - "learning_rate": 4.509976240670677e-06, - "loss": 0.2393, + "epoch": 4.158467582080946, + "grad_norm": 0.2382199913263321, + "learning_rate": 3.6353233338714626e-06, + "loss": 0.3765, "step": 115385 }, { - "epoch": 4.06, - "learning_rate": 4.508344242130519e-06, - "loss": 0.2522, + "epoch": 4.15864778174217, + "grad_norm": 0.2705291509628296, + "learning_rate": 3.6338080688575415e-06, + "loss": 0.3765, "step": 115390 }, { - "epoch": 4.06, - "learning_rate": 4.506712509661002e-06, - "loss": 0.2416, + "epoch": 4.158827981403395, + "grad_norm": 0.25305843353271484, + "learning_rate": 3.632293094953848e-06, + "loss": 0.3927, "step": 115395 }, { - "epoch": 4.06, - "learning_rate": 4.505081043283321e-06, - "loss": 0.2572, + "epoch": 4.159008181064619, + "grad_norm": 0.20932044088840485, + "learning_rate": 3.6307784121810267e-06, + "loss": 0.3489, "step": 115400 }, { - "epoch": 4.06, - "learning_rate": 4.503449843018659e-06, - "loss": 0.2597, + "epoch": 4.159188380725844, + "grad_norm": 0.22791607677936554, + "learning_rate": 
3.6292640205597055e-06, + "loss": 0.3658, "step": 115405 }, { - "epoch": 4.06, - "learning_rate": 4.501818908888184e-06, - "loss": 0.2414, + "epoch": 4.159368580387069, + "grad_norm": 0.24676699936389923, + "learning_rate": 3.627749920110529e-06, + "loss": 0.3737, "step": 115410 }, { - "epoch": 4.06, - "learning_rate": 4.500188240913089e-06, - "loss": 0.2503, + "epoch": 4.1595487800482935, + "grad_norm": 0.26706597208976746, + "learning_rate": 3.626236110854128e-06, + "loss": 0.3784, "step": 115415 }, { - "epoch": 4.06, - "learning_rate": 4.498557839114542e-06, - "loss": 0.2564, + "epoch": 4.159728979709518, + "grad_norm": 0.22861981391906738, + "learning_rate": 3.624722592811111e-06, + "loss": 0.3555, "step": 115420 }, { - "epoch": 4.06, - "learning_rate": 4.496927703513709e-06, - "loss": 0.2322, + "epoch": 4.159909179370743, + "grad_norm": 0.24083659052848816, + "learning_rate": 3.623209366002117e-06, + "loss": 0.3431, "step": 115425 }, { - "epoch": 4.06, - "learning_rate": 4.495297834131751e-06, - "loss": 0.2533, + "epoch": 4.160089379031968, + "grad_norm": 0.19510973989963531, + "learning_rate": 3.6216964304477547e-06, + "loss": 0.3485, "step": 115430 }, { - "epoch": 4.06, - "learning_rate": 4.493668230989842e-06, - "loss": 0.2587, + "epoch": 4.1602695786931925, + "grad_norm": 0.27537184953689575, + "learning_rate": 3.620183786168635e-06, + "loss": 0.3949, "step": 115435 }, { - "epoch": 4.06, - "learning_rate": 4.492038894109133e-06, - "loss": 0.2619, + "epoch": 4.160449778354416, + "grad_norm": 0.23948287963867188, + "learning_rate": 3.618671433185383e-06, + "loss": 0.3744, "step": 115440 }, { - "epoch": 4.06, - "learning_rate": 4.490409823510791e-06, - "loss": 0.2575, + "epoch": 4.160629978015641, + "grad_norm": 0.2727479040622711, + "learning_rate": 3.617159371518583e-06, + "loss": 0.3983, "step": 115445 }, { - "epoch": 4.06, - "learning_rate": 4.488781019215951e-06, - "loss": 0.2596, + "epoch": 4.160810177676866, + "grad_norm": 0.2685748040676117, + "learning_rate": 3.6156476011888514e-06, + "loss": 0.3499, "step": 115450 }, { - "epoch": 4.06, - "learning_rate": 4.487152481245782e-06, - "loss": 0.2451, + "epoch": 4.1609903773380905, + "grad_norm": 0.24433422088623047, + "learning_rate": 3.6141361222167826e-06, + "loss": 0.3799, "step": 115455 }, { - "epoch": 4.06, - "learning_rate": 4.4855242096214195e-06, - "loss": 0.2547, + "epoch": 4.161170576999315, + "grad_norm": 0.24008378386497498, + "learning_rate": 3.6126249346229656e-06, + "loss": 0.3337, "step": 115460 }, { - "epoch": 4.06, - "learning_rate": 4.483896204364002e-06, - "loss": 0.248, + "epoch": 4.16135077666054, + "grad_norm": 0.28376707434654236, + "learning_rate": 3.6111140384279955e-06, + "loss": 0.3982, "step": 115465 }, { - "epoch": 4.06, - "learning_rate": 4.482268465494669e-06, - "loss": 0.2582, + "epoch": 4.161530976321765, + "grad_norm": 0.22135163843631744, + "learning_rate": 3.609603433652453e-06, + "loss": 0.3829, "step": 115470 }, { - "epoch": 4.06, - "learning_rate": 4.480640993034563e-06, - "loss": 0.2328, + "epoch": 4.1617111759829895, + "grad_norm": 0.2689472436904907, + "learning_rate": 3.608093120316919e-06, + "loss": 0.366, "step": 115475 }, { - "epoch": 4.06, - "learning_rate": 4.479013787004813e-06, - "loss": 0.2704, + "epoch": 4.161891375644214, + "grad_norm": 0.24480800330638885, + "learning_rate": 3.606583098441982e-06, + "loss": 0.3782, "step": 115480 }, { - "epoch": 4.06, - "learning_rate": 4.477386847426543e-06, - "loss": 0.2622, + "epoch": 4.162071575305438, + "grad_norm": 0.27388742566108704, 
+ "learning_rate": 3.6050733680482063e-06, + "loss": 0.3936, "step": 115485 }, { - "epoch": 4.06, - "learning_rate": 4.475760174320872e-06, - "loss": 0.2616, + "epoch": 4.162251774966663, + "grad_norm": 0.2704862356185913, + "learning_rate": 3.603563929156162e-06, + "loss": 0.3431, "step": 115490 }, { - "epoch": 4.06, - "learning_rate": 4.474133767708938e-06, - "loss": 0.2608, + "epoch": 4.1624319746278875, + "grad_norm": 0.6965771913528442, + "learning_rate": 3.602054781786418e-06, + "loss": 0.4249, "step": 115495 }, { - "epoch": 4.06, - "learning_rate": 4.472507627611844e-06, - "loss": 0.2762, + "epoch": 4.162612174289112, + "grad_norm": 0.32166483998298645, + "learning_rate": 3.600545925959531e-06, + "loss": 0.3667, "step": 115500 }, { - "epoch": 4.06, - "eval_loss": 0.2497061789035797, - "eval_runtime": 10.5326, - "eval_samples_per_second": 9.494, - "eval_steps_per_second": 9.494, + "epoch": 4.162612174289112, + "eval_loss": 0.42964106798171997, + "eval_runtime": 3.5337, + "eval_samples_per_second": 28.299, + "eval_steps_per_second": 7.075, "step": 115500 }, { - "epoch": 4.06, - "learning_rate": 4.470881754050716e-06, - "loss": 0.254, + "epoch": 4.162792373950337, + "grad_norm": 0.19025029242038727, + "learning_rate": 3.599037361696067e-06, + "loss": 0.3388, "step": 115505 }, { - "epoch": 4.06, - "learning_rate": 4.469256147046652e-06, - "loss": 0.2564, + "epoch": 4.162972573611562, + "grad_norm": 0.27333715558052063, + "learning_rate": 3.5975290890165796e-06, + "loss": 0.3742, "step": 115510 }, { - "epoch": 4.06, - "learning_rate": 4.467630806620773e-06, - "loss": 0.2523, + "epoch": 4.163152773272786, + "grad_norm": 0.28083357214927673, + "learning_rate": 3.596021107941605e-06, + "loss": 0.3722, "step": 115515 }, { - "epoch": 4.06, - "learning_rate": 4.4660057327941765e-06, - "loss": 0.2488, + "epoch": 4.163332972934011, + "grad_norm": 0.26710939407348633, + "learning_rate": 3.5945134184917046e-06, + "loss": 0.3906, "step": 115520 }, { - "epoch": 4.06, - "learning_rate": 4.464380925587958e-06, - "loss": 0.2506, + "epoch": 4.163513172595236, + "grad_norm": 0.23718206584453583, + "learning_rate": 3.5930060206874087e-06, + "loss": 0.3612, "step": 115525 }, { - "epoch": 4.06, - "learning_rate": 4.462756385023226e-06, - "loss": 0.2627, + "epoch": 4.16369337225646, + "grad_norm": 0.23605412244796753, + "learning_rate": 3.591498914549274e-06, + "loss": 0.359, "step": 115530 }, { - "epoch": 4.06, - "learning_rate": 4.461132111121066e-06, - "loss": 0.2383, + "epoch": 4.1638735719176845, + "grad_norm": 0.2375323325395584, + "learning_rate": 3.5899921000978137e-06, + "loss": 0.4013, "step": 115535 }, { - "epoch": 4.07, - "learning_rate": 4.459508103902573e-06, - "loss": 0.2519, + "epoch": 4.164053771578909, + "grad_norm": 0.19733993709087372, + "learning_rate": 3.588485577353562e-06, + "loss": 0.3864, "step": 115540 }, { - "epoch": 4.07, - "learning_rate": 4.457884363388823e-06, - "loss": 0.2319, + "epoch": 4.164233971240134, + "grad_norm": 0.28855153918266296, + "learning_rate": 3.5869793463370547e-06, + "loss": 0.3374, "step": 115545 }, { - "epoch": 4.07, - "learning_rate": 4.456260889600913e-06, - "loss": 0.2517, + "epoch": 4.164414170901359, + "grad_norm": 0.24822647869586945, + "learning_rate": 3.585473407068801e-06, + "loss": 0.3587, "step": 115550 }, { - "epoch": 4.07, - "learning_rate": 4.454637682559909e-06, - "loss": 0.2411, + "epoch": 4.164594370562583, + "grad_norm": 0.26548629999160767, + "learning_rate": 3.58396775956934e-06, + "loss": 0.3545, "step": 115555 }, { - "epoch": 4.07, - 
"learning_rate": 4.453014742286901e-06, - "loss": 0.2421, + "epoch": 4.164774570223808, + "grad_norm": 0.20807038247585297, + "learning_rate": 3.582462403859163e-06, + "loss": 0.3993, "step": 115560 }, { - "epoch": 4.07, - "learning_rate": 4.451392068802951e-06, - "loss": 0.2497, + "epoch": 4.164954769885033, + "grad_norm": 0.20517727732658386, + "learning_rate": 3.5809573399587846e-06, + "loss": 0.3603, "step": 115565 }, { - "epoch": 4.07, - "learning_rate": 4.449769662129138e-06, - "loss": 0.2512, + "epoch": 4.165134969546258, + "grad_norm": 0.27609434723854065, + "learning_rate": 3.579452567888722e-06, + "loss": 0.387, "step": 115570 }, { - "epoch": 4.07, - "learning_rate": 4.448147522286525e-06, - "loss": 0.256, + "epoch": 4.1653151692074815, + "grad_norm": 0.24407240748405457, + "learning_rate": 3.577948087669472e-06, + "loss": 0.4059, "step": 115575 }, { - "epoch": 4.07, - "learning_rate": 4.446525649296168e-06, - "loss": 0.2621, + "epoch": 4.165495368868706, + "grad_norm": 0.2198261171579361, + "learning_rate": 3.5764438993215298e-06, + "loss": 0.3422, "step": 115580 }, { - "epoch": 4.07, - "learning_rate": 4.444904043179127e-06, - "loss": 0.2608, + "epoch": 4.165675568529931, + "grad_norm": 0.2635994255542755, + "learning_rate": 3.5749400028653927e-06, + "loss": 0.3966, "step": 115585 }, { - "epoch": 4.07, - "learning_rate": 4.443282703956467e-06, - "loss": 0.2618, + "epoch": 4.165855768191156, + "grad_norm": 0.2824068069458008, + "learning_rate": 3.5734363983215448e-06, + "loss": 0.36, "step": 115590 }, { - "epoch": 4.07, - "learning_rate": 4.4416616316492326e-06, - "loss": 0.2595, + "epoch": 4.16603596785238, + "grad_norm": 0.3001638650894165, + "learning_rate": 3.571933085710483e-06, + "loss": 0.3725, "step": 115595 }, { - "epoch": 4.07, - "learning_rate": 4.440040826278468e-06, - "loss": 0.2439, + "epoch": 4.166216167513605, + "grad_norm": 0.27790218591690063, + "learning_rate": 3.5704300650526823e-06, + "loss": 0.3894, "step": 115600 }, { - "epoch": 4.07, - "learning_rate": 4.438420287865231e-06, - "loss": 0.2538, + "epoch": 4.16639636717483, + "grad_norm": 0.2569793462753296, + "learning_rate": 3.568927336368627e-06, + "loss": 0.3765, "step": 115605 }, { - "epoch": 4.07, - "learning_rate": 4.436800016430548e-06, - "loss": 0.2538, + "epoch": 4.166576566836055, + "grad_norm": 0.25097736716270447, + "learning_rate": 3.567424899678784e-06, + "loss": 0.3612, "step": 115610 }, { - "epoch": 4.07, - "learning_rate": 4.435180011995471e-06, - "loss": 0.2486, + "epoch": 4.166756766497279, + "grad_norm": 0.2786616086959839, + "learning_rate": 3.5659227550036284e-06, + "loss": 0.3388, "step": 115615 }, { - "epoch": 4.07, - "learning_rate": 4.433560274581031e-06, - "loss": 0.2631, + "epoch": 4.166936966158504, + "grad_norm": 0.22307883203029633, + "learning_rate": 3.5644209023636214e-06, + "loss": 0.3804, "step": 115620 }, { - "epoch": 4.07, - "learning_rate": 4.431940804208248e-06, - "loss": 0.227, + "epoch": 4.167117165819728, + "grad_norm": 0.27132895588874817, + "learning_rate": 3.562919341779239e-06, + "loss": 0.3481, "step": 115625 }, { - "epoch": 4.07, - "learning_rate": 4.4303216008981685e-06, - "loss": 0.2532, + "epoch": 4.167297365480953, + "grad_norm": 0.2521323263645172, + "learning_rate": 3.5614180732709194e-06, + "loss": 0.409, "step": 115630 }, { - "epoch": 4.07, - "learning_rate": 4.428702664671805e-06, - "loss": 0.2379, + "epoch": 4.167477565142177, + "grad_norm": 0.2317046821117401, + "learning_rate": 3.5599170968591334e-06, + "loss": 0.3321, "step": 115635 }, { - 
"epoch": 4.07, - "learning_rate": 4.427083995550174e-06, - "loss": 0.2311, + "epoch": 4.167657764803402, + "grad_norm": 0.2945384979248047, + "learning_rate": 3.558416412564325e-06, + "loss": 0.347, "step": 115640 }, { - "epoch": 4.07, - "learning_rate": 4.425465593554304e-06, - "loss": 0.2583, + "epoch": 4.167837964464627, + "grad_norm": 0.26886409521102905, + "learning_rate": 3.5569160204069446e-06, + "loss": 0.391, "step": 115645 }, { - "epoch": 4.07, - "learning_rate": 4.423847458705207e-06, - "loss": 0.2367, + "epoch": 4.168018164125852, + "grad_norm": 0.25826171040534973, + "learning_rate": 3.555415920407429e-06, + "loss": 0.3669, "step": 115650 }, { - "epoch": 4.07, - "learning_rate": 4.422229591023891e-06, - "loss": 0.2599, + "epoch": 4.168198363787076, + "grad_norm": 0.3123614192008972, + "learning_rate": 3.5539161125862226e-06, + "loss": 0.3831, "step": 115655 }, { - "epoch": 4.07, - "learning_rate": 4.420611990531353e-06, - "loss": 0.2391, + "epoch": 4.168378563448301, + "grad_norm": 0.21039628982543945, + "learning_rate": 3.552416596963748e-06, + "loss": 0.3678, "step": 115660 }, { - "epoch": 4.07, - "learning_rate": 4.418994657248607e-06, - "loss": 0.2353, + "epoch": 4.168558763109526, + "grad_norm": 0.21605625748634338, + "learning_rate": 3.550917373560453e-06, + "loss": 0.3595, "step": 115665 }, { - "epoch": 4.07, - "learning_rate": 4.417377591196659e-06, - "loss": 0.2539, + "epoch": 4.16873896277075, + "grad_norm": 0.1955236792564392, + "learning_rate": 3.549418442396757e-06, + "loss": 0.3606, "step": 115670 }, { - "epoch": 4.07, - "learning_rate": 4.4157607923965006e-06, - "loss": 0.2533, + "epoch": 4.168919162431974, + "grad_norm": 0.2682759165763855, + "learning_rate": 3.54791980349308e-06, + "loss": 0.4046, "step": 115675 }, { - "epoch": 4.07, - "learning_rate": 4.414144260869113e-06, - "loss": 0.2263, + "epoch": 4.169099362093199, + "grad_norm": 0.26579999923706055, + "learning_rate": 3.546421456869842e-06, + "loss": 0.3668, "step": 115680 }, { - "epoch": 4.07, - "learning_rate": 4.412527996635504e-06, - "loss": 0.251, + "epoch": 4.169279561754424, + "grad_norm": 0.2342423051595688, + "learning_rate": 3.5449234025474536e-06, + "loss": 0.3764, "step": 115685 }, { - "epoch": 4.07, - "learning_rate": 4.41091199971665e-06, - "loss": 0.2435, + "epoch": 4.169459761415649, + "grad_norm": 0.24451355636119843, + "learning_rate": 3.543425640546333e-06, + "loss": 0.4006, "step": 115690 }, { - "epoch": 4.07, - "learning_rate": 4.409296270133537e-06, - "loss": 0.2552, + "epoch": 4.169639961076873, + "grad_norm": 0.26802754402160645, + "learning_rate": 3.5419281708868933e-06, + "loss": 0.3725, "step": 115695 }, { - "epoch": 4.07, - "learning_rate": 4.407680807907133e-06, - "loss": 0.2533, + "epoch": 4.169820160738098, + "grad_norm": 0.21809948980808258, + "learning_rate": 3.5404309935895134e-06, + "loss": 0.3756, "step": 115700 }, { - "epoch": 4.07, - "learning_rate": 4.40606561305843e-06, - "loss": 0.2538, + "epoch": 4.170000360399323, + "grad_norm": 0.23009651899337769, + "learning_rate": 3.5389341086746158e-06, + "loss": 0.3687, "step": 115705 }, { - "epoch": 4.07, - "learning_rate": 4.404450685608394e-06, - "loss": 0.2393, + "epoch": 4.1701805600605475, + "grad_norm": 0.2777594327926636, + "learning_rate": 3.537437516162578e-06, + "loss": 0.3974, "step": 115710 }, { - "epoch": 4.07, - "learning_rate": 4.402836025577986e-06, - "loss": 0.2787, + "epoch": 4.170360759721771, + "grad_norm": 0.22270822525024414, + "learning_rate": 3.5359412160738044e-06, + "loss": 0.3573, "step": 
115715 }, { - "epoch": 4.07, - "learning_rate": 4.401221632988178e-06, - "loss": 0.2305, + "epoch": 4.170540959382996, + "grad_norm": 0.2586876451969147, + "learning_rate": 3.5344452084286826e-06, + "loss": 0.3584, "step": 115720 }, { - "epoch": 4.07, - "learning_rate": 4.3996075078599405e-06, - "loss": 0.2592, + "epoch": 4.170721159044221, + "grad_norm": 0.2567586600780487, + "learning_rate": 3.5329494932475754e-06, + "loss": 0.3843, "step": 115725 }, { - "epoch": 4.07, - "learning_rate": 4.3979936502142186e-06, - "loss": 0.2396, + "epoch": 4.1709013587054455, + "grad_norm": 0.24362897872924805, + "learning_rate": 3.5314540705508823e-06, + "loss": 0.3739, "step": 115730 }, { - "epoch": 4.07, - "learning_rate": 4.396380060071975e-06, - "loss": 0.2475, + "epoch": 4.17108155836667, + "grad_norm": 0.3008570373058319, + "learning_rate": 3.529958940358971e-06, + "loss": 0.3583, "step": 115735 }, { - "epoch": 4.07, - "learning_rate": 4.3947667374541534e-06, - "loss": 0.2531, + "epoch": 4.171261758027895, + "grad_norm": 0.272904634475708, + "learning_rate": 3.5284641026922137e-06, + "loss": 0.3998, "step": 115740 }, { - "epoch": 4.07, - "learning_rate": 4.3931536823817095e-06, - "loss": 0.2214, + "epoch": 4.17144195768912, + "grad_norm": 0.27428102493286133, + "learning_rate": 3.526969557570975e-06, + "loss": 0.3883, "step": 115745 }, { - "epoch": 4.07, - "learning_rate": 4.391540894875587e-06, - "loss": 0.2422, + "epoch": 4.1716221573503445, + "grad_norm": 0.25134575366973877, + "learning_rate": 3.525475305015613e-06, + "loss": 0.3813, "step": 115750 }, { - "epoch": 4.07, - "learning_rate": 4.389928374956717e-06, - "loss": 0.2586, + "epoch": 4.171802357011569, + "grad_norm": 0.2150566279888153, + "learning_rate": 3.5239813450464983e-06, + "loss": 0.3792, "step": 115755 }, { - "epoch": 4.07, - "learning_rate": 4.38831612264605e-06, - "loss": 0.2234, + "epoch": 4.171982556672793, + "grad_norm": 0.2204793393611908, + "learning_rate": 3.5224876776839806e-06, + "loss": 0.3873, "step": 115760 }, { - "epoch": 4.07, - "learning_rate": 4.386704137964517e-06, - "loss": 0.261, + "epoch": 4.172162756334018, + "grad_norm": 0.2468349039554596, + "learning_rate": 3.5209943029484077e-06, + "loss": 0.4102, "step": 115765 }, { - "epoch": 4.07, - "learning_rate": 4.385092420933037e-06, - "loss": 0.2488, + "epoch": 4.1723429559952425, + "grad_norm": 0.2205950766801834, + "learning_rate": 3.5195012208601303e-06, + "loss": 0.3288, "step": 115770 }, { - "epoch": 4.07, - "learning_rate": 4.383480971572549e-06, - "loss": 0.2711, + "epoch": 4.172523155656467, + "grad_norm": 0.28197482228279114, + "learning_rate": 3.5180084314394845e-06, + "loss": 0.3871, "step": 115775 }, { - "epoch": 4.07, - "learning_rate": 4.381869789903978e-06, - "loss": 0.2592, + "epoch": 4.172703355317692, + "grad_norm": 0.28598877787590027, + "learning_rate": 3.516515934706821e-06, + "loss": 0.379, "step": 115780 }, { - "epoch": 4.07, - "learning_rate": 4.38025887594824e-06, - "loss": 0.2521, + "epoch": 4.172883554978917, + "grad_norm": 0.22734612226486206, + "learning_rate": 3.515023730682468e-06, + "loss": 0.3956, "step": 115785 }, { - "epoch": 4.07, - "learning_rate": 4.378648229726251e-06, - "loss": 0.2311, + "epoch": 4.1730637546401415, + "grad_norm": 0.2860606610774994, + "learning_rate": 3.5135318193867563e-06, + "loss": 0.4039, "step": 115790 }, { - "epoch": 4.07, - "learning_rate": 4.377037851258919e-06, - "loss": 0.2464, + "epoch": 4.173243954301366, + "grad_norm": 0.263313889503479, + "learning_rate": 3.5120402008400167e-06, + "loss": 
0.3714, "step": 115795 }, { - "epoch": 4.07, - "learning_rate": 4.375427740567167e-06, - "loss": 0.2263, + "epoch": 4.173424153962591, + "grad_norm": 0.28966277837753296, + "learning_rate": 3.5105488750625666e-06, + "loss": 0.3525, "step": 115800 }, { - "epoch": 4.07, - "learning_rate": 4.373817897671895e-06, - "loss": 0.2361, + "epoch": 4.173604353623815, + "grad_norm": 0.27305835485458374, + "learning_rate": 3.50905784207472e-06, + "loss": 0.361, "step": 115805 }, { - "epoch": 4.07, - "learning_rate": 4.372208322594001e-06, - "loss": 0.2518, + "epoch": 4.1737845532850395, + "grad_norm": 0.28714773058891296, + "learning_rate": 3.5075671018968136e-06, + "loss": 0.3572, "step": 115810 }, { - "epoch": 4.07, - "learning_rate": 4.370599015354382e-06, - "loss": 0.2433, + "epoch": 4.173964752946264, + "grad_norm": 0.23891617357730865, + "learning_rate": 3.506076654549134e-06, + "loss": 0.3833, "step": 115815 }, { - "epoch": 4.07, - "learning_rate": 4.368989975973947e-06, - "loss": 0.2632, + "epoch": 4.174144952607489, + "grad_norm": 0.2725772261619568, + "learning_rate": 3.504586500052001e-06, + "loss": 0.3409, "step": 115820 }, { - "epoch": 4.08, - "learning_rate": 4.367381204473575e-06, - "loss": 0.2546, + "epoch": 4.174325152268714, + "grad_norm": 0.2952682375907898, + "learning_rate": 3.503096638425718e-06, + "loss": 0.3602, "step": 115825 }, { - "epoch": 4.08, - "learning_rate": 4.365772700874165e-06, - "loss": 0.2687, + "epoch": 4.174505351929938, + "grad_norm": 0.2583956718444824, + "learning_rate": 3.5016070696905794e-06, + "loss": 0.3786, "step": 115830 }, { - "epoch": 4.08, - "learning_rate": 4.364164465196594e-06, - "loss": 0.241, + "epoch": 4.174685551591163, + "grad_norm": 0.2635524868965149, + "learning_rate": 3.5001177938668834e-06, + "loss": 0.3726, "step": 115835 }, { - "epoch": 4.08, - "learning_rate": 4.36255649746175e-06, - "loss": 0.2409, + "epoch": 4.174865751252388, + "grad_norm": 0.3067127764225006, + "learning_rate": 3.4986288109749217e-06, + "loss": 0.41, "step": 115840 }, { - "epoch": 4.08, - "learning_rate": 4.360948797690515e-06, - "loss": 0.2332, + "epoch": 4.175045950913613, + "grad_norm": 0.2670503556728363, + "learning_rate": 3.4971401210349724e-06, + "loss": 0.3842, "step": 115845 }, { - "epoch": 4.08, - "learning_rate": 4.3593413659037544e-06, - "loss": 0.2577, + "epoch": 4.1752261505748365, + "grad_norm": 0.2181471437215805, + "learning_rate": 3.4956517240673307e-06, + "loss": 0.4022, "step": 115850 }, { - "epoch": 4.08, - "learning_rate": 4.357734202122338e-06, - "loss": 0.2678, + "epoch": 4.175406350236061, + "grad_norm": 0.21090255677700043, + "learning_rate": 3.4941636200922695e-06, + "loss": 0.3688, "step": 115855 }, { - "epoch": 4.08, - "learning_rate": 4.356127306367147e-06, - "loss": 0.257, + "epoch": 4.175586549897286, + "grad_norm": 0.2528001368045807, + "learning_rate": 3.4926758091300694e-06, + "loss": 0.3877, "step": 115860 }, { - "epoch": 4.08, - "learning_rate": 4.354520678659035e-06, - "loss": 0.2372, + "epoch": 4.175766749558511, + "grad_norm": 0.2146507054567337, + "learning_rate": 3.4911882912009947e-06, + "loss": 0.392, "step": 115865 }, { - "epoch": 4.08, - "learning_rate": 4.352914319018864e-06, - "loss": 0.2447, + "epoch": 4.175946949219735, + "grad_norm": 0.2687932848930359, + "learning_rate": 3.48970106632531e-06, + "loss": 0.3346, "step": 115870 }, { - "epoch": 4.08, - "learning_rate": 4.351308227467496e-06, - "loss": 0.2486, + "epoch": 4.17612714888096, + "grad_norm": 0.33123210072517395, + "learning_rate": 3.4882141345232903e-06, + 
"loss": 0.3685, "step": 115875 }, { - "epoch": 4.08, - "learning_rate": 4.349702404025777e-06, - "loss": 0.2786, + "epoch": 4.176307348542185, + "grad_norm": 0.2785130739212036, + "learning_rate": 3.486727495815187e-06, + "loss": 0.3529, "step": 115880 }, { - "epoch": 4.08, - "learning_rate": 4.348096848714573e-06, - "loss": 0.2499, + "epoch": 4.17648754820341, + "grad_norm": 0.31324052810668945, + "learning_rate": 3.485241150221258e-06, + "loss": 0.3905, "step": 115885 }, { - "epoch": 4.08, - "learning_rate": 4.346491561554711e-06, - "loss": 0.252, + "epoch": 4.176667747864634, + "grad_norm": 0.23870332539081573, + "learning_rate": 3.483755097761751e-06, + "loss": 0.3257, "step": 115890 }, { - "epoch": 4.08, - "learning_rate": 4.344886542567053e-06, - "loss": 0.264, + "epoch": 4.176847947525859, + "grad_norm": 0.296524316072464, + "learning_rate": 3.4822693384569087e-06, + "loss": 0.3858, "step": 115895 }, { - "epoch": 4.08, - "learning_rate": 4.34328179177243e-06, - "loss": 0.2619, + "epoch": 4.177028147187083, + "grad_norm": 0.22270824015140533, + "learning_rate": 3.4807838723269896e-06, + "loss": 0.3931, "step": 115900 }, { - "epoch": 4.08, - "learning_rate": 4.34167730919168e-06, - "loss": 0.2441, + "epoch": 4.177208346848308, + "grad_norm": 0.25900474190711975, + "learning_rate": 3.479298699392228e-06, + "loss": 0.3714, "step": 115905 }, { - "epoch": 4.08, - "learning_rate": 4.340073094845631e-06, - "loss": 0.2411, + "epoch": 4.177388546509532, + "grad_norm": 0.28385141491889954, + "learning_rate": 3.4778138196728425e-06, + "loss": 0.3734, "step": 115910 }, { - "epoch": 4.08, - "learning_rate": 4.338469148755123e-06, - "loss": 0.2495, + "epoch": 4.177568746170757, + "grad_norm": 0.2589219808578491, + "learning_rate": 3.476329233189085e-06, + "loss": 0.3638, "step": 115915 }, { - "epoch": 4.08, - "learning_rate": 4.336865470940976e-06, - "loss": 0.2487, + "epoch": 4.177748945831982, + "grad_norm": 0.23942036926746368, + "learning_rate": 3.4748449399611747e-06, + "loss": 0.3584, "step": 115920 }, { - "epoch": 4.08, - "learning_rate": 4.335262061424011e-06, - "loss": 0.2509, + "epoch": 4.177929145493207, + "grad_norm": 0.2529667317867279, + "learning_rate": 3.473360940009332e-06, + "loss": 0.3587, "step": 115925 }, { - "epoch": 4.08, - "learning_rate": 4.333658920225048e-06, - "loss": 0.2359, + "epoch": 4.178109345154431, + "grad_norm": 0.22213000059127808, + "learning_rate": 3.471877233353782e-06, + "loss": 0.3889, "step": 115930 }, { - "epoch": 4.08, - "learning_rate": 4.332056047364908e-06, - "loss": 0.263, + "epoch": 4.178289544815656, + "grad_norm": 0.2727386951446533, + "learning_rate": 3.470393820014728e-06, + "loss": 0.4031, "step": 115935 }, { - "epoch": 4.08, - "learning_rate": 4.3304534428643935e-06, - "loss": 0.2865, + "epoch": 4.178469744476881, + "grad_norm": 0.20552094280719757, + "learning_rate": 3.4689107000123982e-06, + "loss": 0.3649, "step": 115940 }, { - "epoch": 4.08, - "learning_rate": 4.328851106744325e-06, - "loss": 0.2596, + "epoch": 4.178649944138105, + "grad_norm": 0.25130918622016907, + "learning_rate": 3.467427873366988e-06, + "loss": 0.3563, "step": 115945 }, { - "epoch": 4.08, - "learning_rate": 4.327249039025497e-06, - "loss": 0.2376, + "epoch": 4.178830143799329, + "grad_norm": 0.2697368562221527, + "learning_rate": 3.4659453400987084e-06, + "loss": 0.388, "step": 115950 }, { - "epoch": 4.08, - "learning_rate": 4.3256472397287224e-06, - "loss": 0.247, + "epoch": 4.179010343460554, + "grad_norm": 0.2934541404247284, + "learning_rate": 
3.4644631002277523e-06, + "loss": 0.3953, "step": 115955 }, { - "epoch": 4.08, - "learning_rate": 4.324045708874793e-06, - "loss": 0.252, + "epoch": 4.179190543121779, + "grad_norm": 0.20682430267333984, + "learning_rate": 3.462981153774311e-06, + "loss": 0.3416, "step": 115960 }, { - "epoch": 4.08, - "learning_rate": 4.322444446484505e-06, - "loss": 0.2502, + "epoch": 4.179370742783004, + "grad_norm": 0.22139784693717957, + "learning_rate": 3.461499500758589e-06, + "loss": 0.3593, "step": 115965 }, { - "epoch": 4.08, - "learning_rate": 4.32084345257864e-06, - "loss": 0.2715, + "epoch": 4.179550942444228, + "grad_norm": 0.26105570793151855, + "learning_rate": 3.4600181412007633e-06, + "loss": 0.3664, "step": 115970 }, { - "epoch": 4.08, - "learning_rate": 4.319242727178005e-06, - "loss": 0.2224, + "epoch": 4.179731142105453, + "grad_norm": 0.25991979241371155, + "learning_rate": 3.458537075121024e-06, + "loss": 0.394, "step": 115975 }, { - "epoch": 4.08, - "learning_rate": 4.317642270303371e-06, - "loss": 0.2457, + "epoch": 4.179911341766678, + "grad_norm": 0.24015338718891144, + "learning_rate": 3.4570563025395435e-06, + "loss": 0.3873, "step": 115980 }, { - "epoch": 4.08, - "learning_rate": 4.316042081975519e-06, - "loss": 0.2356, + "epoch": 4.1800915414279025, + "grad_norm": 0.24884821474552155, + "learning_rate": 3.4555758234765006e-06, + "loss": 0.3393, "step": 115985 }, { - "epoch": 4.08, - "learning_rate": 4.314442162215232e-06, - "loss": 0.2499, + "epoch": 4.180271741089126, + "grad_norm": 0.211832195520401, + "learning_rate": 3.4540956379520623e-06, + "loss": 0.3627, "step": 115990 }, { - "epoch": 4.08, - "learning_rate": 4.312842511043274e-06, - "loss": 0.2539, + "epoch": 4.180451940750351, + "grad_norm": 0.28997382521629333, + "learning_rate": 3.4526157459864093e-06, + "loss": 0.3957, "step": 115995 }, { - "epoch": 4.08, - "learning_rate": 4.311243128480433e-06, - "loss": 0.2663, + "epoch": 4.180632140411576, + "grad_norm": 0.23022478818893433, + "learning_rate": 3.4511361475996843e-06, + "loss": 0.3877, "step": 116000 }, { - "epoch": 4.08, - "eval_loss": 0.24984586238861084, - "eval_runtime": 10.5713, - "eval_samples_per_second": 9.46, - "eval_steps_per_second": 9.46, + "epoch": 4.180632140411576, + "eval_loss": 0.42928194999694824, + "eval_runtime": 3.5376, + "eval_samples_per_second": 28.268, + "eval_steps_per_second": 7.067, "step": 116000 }, { - "epoch": 4.08, - "learning_rate": 4.309644014547457e-06, - "loss": 0.2392, + "epoch": 4.180812340072801, + "grad_norm": 0.23664060235023499, + "learning_rate": 3.449656842812063e-06, + "loss": 0.3887, "step": 116005 }, { - "epoch": 4.08, - "learning_rate": 4.308045169265126e-06, - "loss": 0.2409, + "epoch": 4.180992539734025, + "grad_norm": 0.31014373898506165, + "learning_rate": 3.4481778316436925e-06, + "loss": 0.3901, "step": 116010 }, { - "epoch": 4.08, - "learning_rate": 4.306446592654189e-06, - "loss": 0.2452, + "epoch": 4.18117273939525, + "grad_norm": 0.26111266016960144, + "learning_rate": 3.4466991141147238e-06, + "loss": 0.4023, "step": 116015 }, { - "epoch": 4.08, - "learning_rate": 4.304848284735408e-06, - "loss": 0.2598, + "epoch": 4.181352939056475, + "grad_norm": 0.20176957547664642, + "learning_rate": 3.445220690245318e-06, + "loss": 0.3669, "step": 116020 }, { - "epoch": 4.08, - "learning_rate": 4.3032502455295275e-06, - "loss": 0.2368, + "epoch": 4.1815331387176995, + "grad_norm": 0.23669221997261047, + "learning_rate": 3.4437425600555988e-06, + "loss": 0.3726, "step": 116025 }, { - "epoch": 4.08, - 
"learning_rate": 4.301652475057308e-06, - "loss": 0.246, + "epoch": 4.181713338378924, + "grad_norm": 0.22191861271858215, + "learning_rate": 3.4422647235657105e-06, + "loss": 0.3698, "step": 116030 }, { - "epoch": 4.08, - "learning_rate": 4.3000549733394905e-06, - "loss": 0.2407, + "epoch": 4.181893538040148, + "grad_norm": 0.24171018600463867, + "learning_rate": 3.4407871807957957e-06, + "loss": 0.3699, "step": 116035 }, { - "epoch": 4.08, - "learning_rate": 4.29845774039681e-06, - "loss": 0.2708, + "epoch": 4.182073737701373, + "grad_norm": 0.22796885669231415, + "learning_rate": 3.439309931765983e-06, + "loss": 0.3784, "step": 116040 }, { - "epoch": 4.08, - "learning_rate": 4.29686077625002e-06, - "loss": 0.2366, + "epoch": 4.1822539373625975, + "grad_norm": 0.344339519739151, + "learning_rate": 3.4378329764963995e-06, + "loss": 0.3725, "step": 116045 }, { - "epoch": 4.08, - "learning_rate": 4.295264080919842e-06, - "loss": 0.2562, + "epoch": 4.182434137023822, + "grad_norm": 0.2166433185338974, + "learning_rate": 3.4363563150071636e-06, + "loss": 0.3542, "step": 116050 }, { - "epoch": 4.08, - "learning_rate": 4.293667654427022e-06, - "loss": 0.2507, + "epoch": 4.182614336685047, + "grad_norm": 0.2705228924751282, + "learning_rate": 3.434879947318395e-06, + "loss": 0.3529, "step": 116055 }, { - "epoch": 4.08, - "learning_rate": 4.292071496792282e-06, - "loss": 0.2261, + "epoch": 4.182794536346272, + "grad_norm": 0.2719237506389618, + "learning_rate": 3.4334038734502166e-06, + "loss": 0.3851, "step": 116060 }, { - "epoch": 4.08, - "learning_rate": 4.29047560803634e-06, - "loss": 0.2649, + "epoch": 4.1829747360074965, + "grad_norm": 0.2510291635990143, + "learning_rate": 3.431928093422737e-06, + "loss": 0.3846, "step": 116065 }, { - "epoch": 4.08, - "learning_rate": 4.2888799881799305e-06, - "loss": 0.2358, + "epoch": 4.183154935668721, + "grad_norm": 0.2954877018928528, + "learning_rate": 3.43045260725606e-06, + "loss": 0.3558, "step": 116070 }, { - "epoch": 4.08, - "learning_rate": 4.287284637243763e-06, - "loss": 0.2658, + "epoch": 4.183335135329946, + "grad_norm": 0.2507972717285156, + "learning_rate": 3.4289774149702887e-06, + "loss": 0.3594, "step": 116075 }, { - "epoch": 4.08, - "learning_rate": 4.2856895552485565e-06, - "loss": 0.2595, + "epoch": 4.18351533499117, + "grad_norm": 0.24110138416290283, + "learning_rate": 3.427502516585521e-06, + "loss": 0.3744, "step": 116080 }, { - "epoch": 4.08, - "learning_rate": 4.284094742215014e-06, - "loss": 0.2434, + "epoch": 4.1836955346523945, + "grad_norm": 0.25332164764404297, + "learning_rate": 3.426027912121857e-06, + "loss": 0.4066, "step": 116085 }, { - "epoch": 4.08, - "learning_rate": 4.282500198163855e-06, - "loss": 0.2826, + "epoch": 4.183875734313619, + "grad_norm": 0.22243022918701172, + "learning_rate": 3.424553601599395e-06, + "loss": 0.3625, "step": 116090 }, { - "epoch": 4.08, - "learning_rate": 4.280905923115777e-06, - "loss": 0.2464, + "epoch": 4.184055933974844, + "grad_norm": 0.23225410282611847, + "learning_rate": 3.4230795850381998e-06, + "loss": 0.3428, "step": 116095 }, { - "epoch": 4.08, - "learning_rate": 4.279311917091475e-06, - "loss": 0.2642, + "epoch": 4.184236133636069, + "grad_norm": 0.28310906887054443, + "learning_rate": 3.4216058624583743e-06, + "loss": 0.3719, "step": 116100 }, { - "epoch": 4.08, - "learning_rate": 4.277718180111653e-06, - "loss": 0.2515, + "epoch": 4.1844163332972935, + "grad_norm": 0.2678923010826111, + "learning_rate": 3.4201324338799884e-06, + "loss": 0.3407, "step": 116105 }, { - 
"epoch": 4.09, - "learning_rate": 4.276124712197011e-06, - "loss": 0.2432, + "epoch": 4.184596532958518, + "grad_norm": 0.2929144501686096, + "learning_rate": 3.418659299323124e-06, + "loss": 0.3569, "step": 116110 }, { - "epoch": 4.09, - "learning_rate": 4.2745315133682295e-06, - "loss": 0.2789, + "epoch": 4.184776732619743, + "grad_norm": 0.2045121192932129, + "learning_rate": 3.4171864588078474e-06, + "loss": 0.3586, "step": 116115 }, { - "epoch": 4.09, - "learning_rate": 4.2729385836459915e-06, - "loss": 0.2398, + "epoch": 4.184956932280968, + "grad_norm": 0.21932172775268555, + "learning_rate": 3.415713912354221e-06, + "loss": 0.3525, "step": 116120 }, { - "epoch": 4.09, - "learning_rate": 4.2713459230509955e-06, - "loss": 0.2717, + "epoch": 4.1851371319421915, + "grad_norm": 0.21695736050605774, + "learning_rate": 3.4142416599823175e-06, + "loss": 0.3583, "step": 116125 }, { - "epoch": 4.09, - "learning_rate": 4.269753531603909e-06, - "loss": 0.255, + "epoch": 4.185317331603416, + "grad_norm": 0.2124013900756836, + "learning_rate": 3.4127697017121928e-06, + "loss": 0.3525, "step": 116130 }, { - "epoch": 4.09, - "learning_rate": 4.268161409325413e-06, - "loss": 0.2511, + "epoch": 4.185497531264641, + "grad_norm": 0.2589491307735443, + "learning_rate": 3.4112980375639035e-06, + "loss": 0.3682, "step": 116135 }, { - "epoch": 4.09, - "learning_rate": 4.2665695562361715e-06, - "loss": 0.2584, + "epoch": 4.185677730925866, + "grad_norm": 0.2505156099796295, + "learning_rate": 3.4098266675574974e-06, + "loss": 0.3868, "step": 116140 }, { - "epoch": 4.09, - "learning_rate": 4.264977972356868e-06, - "loss": 0.249, + "epoch": 4.18585793058709, + "grad_norm": 0.2609826326370239, + "learning_rate": 3.4083555917130165e-06, + "loss": 0.3701, "step": 116145 }, { - "epoch": 4.09, - "learning_rate": 4.2633866577081576e-06, - "loss": 0.2773, + "epoch": 4.186038130248315, + "grad_norm": 0.2588742971420288, + "learning_rate": 3.4068848100505176e-06, + "loss": 0.3641, "step": 116150 }, { - "epoch": 4.09, - "learning_rate": 4.261795612310701e-06, - "loss": 0.2374, + "epoch": 4.18621832990954, + "grad_norm": 0.20735150575637817, + "learning_rate": 3.4054143225900335e-06, + "loss": 0.3315, "step": 116155 }, { - "epoch": 4.09, - "learning_rate": 4.260204836185161e-06, - "loss": 0.2514, + "epoch": 4.186398529570765, + "grad_norm": 0.23937851190567017, + "learning_rate": 3.4039441293515966e-06, + "loss": 0.3675, "step": 116160 }, { - "epoch": 4.09, - "learning_rate": 4.2586143293522005e-06, - "loss": 0.2469, + "epoch": 4.186578729231989, + "grad_norm": 0.24689562618732452, + "learning_rate": 3.4024742303552405e-06, + "loss": 0.3567, "step": 116165 }, { - "epoch": 4.09, - "learning_rate": 4.257024091832463e-06, - "loss": 0.2366, + "epoch": 4.186758928893214, + "grad_norm": 0.24008159339427948, + "learning_rate": 3.4010046256209906e-06, + "loss": 0.3822, "step": 116170 }, { - "epoch": 4.09, - "learning_rate": 4.255434123646598e-06, - "loss": 0.2403, + "epoch": 4.186939128554438, + "grad_norm": 0.1905672252178192, + "learning_rate": 3.3995353151688667e-06, + "loss": 0.3537, "step": 116175 }, { - "epoch": 4.09, - "learning_rate": 4.253844424815243e-06, - "loss": 0.2289, + "epoch": 4.187119328215663, + "grad_norm": 0.20124223828315735, + "learning_rate": 3.3980662990188956e-06, + "loss": 0.3373, "step": 116180 }, { - "epoch": 4.09, - "learning_rate": 4.252254995359051e-06, - "loss": 0.2692, + "epoch": 4.187299527876887, + "grad_norm": 0.29924681782722473, + "learning_rate": 3.396597577191091e-06, + "loss": 0.3908, 
"step": 116185 }, { - "epoch": 4.09, - "learning_rate": 4.250665835298656e-06, - "loss": 0.2502, + "epoch": 4.187479727538112, + "grad_norm": 0.2561193108558655, + "learning_rate": 3.3951291497054616e-06, + "loss": 0.364, "step": 116190 }, { - "epoch": 4.09, - "learning_rate": 4.249076944654692e-06, - "loss": 0.246, + "epoch": 4.187659927199337, + "grad_norm": 0.23734471201896667, + "learning_rate": 3.3936610165820144e-06, + "loss": 0.4054, "step": 116195 }, { - "epoch": 4.09, - "learning_rate": 4.247488323447779e-06, - "loss": 0.2582, + "epoch": 4.187840126860562, + "grad_norm": 0.21448448300361633, + "learning_rate": 3.3921931778407467e-06, + "loss": 0.3877, "step": 116200 }, { - "epoch": 4.09, - "learning_rate": 4.245899971698564e-06, - "loss": 0.2559, + "epoch": 4.188020326521786, + "grad_norm": 0.25979727506637573, + "learning_rate": 3.390725633501676e-06, + "loss": 0.3934, "step": 116205 }, { - "epoch": 4.09, - "learning_rate": 4.244311889427652e-06, - "loss": 0.2409, + "epoch": 4.188200526183011, + "grad_norm": 0.25088486075401306, + "learning_rate": 3.3892583835847786e-06, + "loss": 0.3796, "step": 116210 }, { - "epoch": 4.09, - "learning_rate": 4.242724076655671e-06, - "loss": 0.2466, + "epoch": 4.188380725844236, + "grad_norm": 0.24051101505756378, + "learning_rate": 3.387791428110046e-06, + "loss": 0.3435, "step": 116215 }, { - "epoch": 4.09, - "learning_rate": 4.241136533403245e-06, - "loss": 0.2795, + "epoch": 4.18856092550546, + "grad_norm": 0.265553742647171, + "learning_rate": 3.3863247670974764e-06, + "loss": 0.3918, "step": 116220 }, { - "epoch": 4.09, - "learning_rate": 4.2395492596909806e-06, - "loss": 0.2432, + "epoch": 4.188741125166684, + "grad_norm": 0.22547340393066406, + "learning_rate": 3.384858400567045e-06, + "loss": 0.3579, "step": 116225 }, { - "epoch": 4.09, - "learning_rate": 4.237962255539488e-06, - "loss": 0.2446, + "epoch": 4.188921324827909, + "grad_norm": 0.2896125614643097, + "learning_rate": 3.383392328538737e-06, + "loss": 0.3668, "step": 116230 }, { - "epoch": 4.09, - "learning_rate": 4.236375520969366e-06, - "loss": 0.2807, + "epoch": 4.189101524489134, + "grad_norm": 0.24138803780078888, + "learning_rate": 3.3819265510325214e-06, + "loss": 0.3922, "step": 116235 }, { - "epoch": 4.09, - "learning_rate": 4.234789056001231e-06, - "loss": 0.2676, + "epoch": 4.189281724150359, + "grad_norm": 0.29337865114212036, + "learning_rate": 3.3804610680683665e-06, + "loss": 0.4072, "step": 116240 }, { - "epoch": 4.09, - "learning_rate": 4.233202860655675e-06, - "loss": 0.2475, + "epoch": 4.189461923811583, + "grad_norm": 0.29773908853530884, + "learning_rate": 3.3789958796662496e-06, + "loss": 0.3651, "step": 116245 }, { - "epoch": 4.09, - "learning_rate": 4.231616934953292e-06, - "loss": 0.2703, + "epoch": 4.189642123472808, + "grad_norm": 0.25638502836227417, + "learning_rate": 3.3775309858461253e-06, + "loss": 0.3669, "step": 116250 }, { - "epoch": 4.09, - "learning_rate": 4.230031278914673e-06, - "loss": 0.2574, + "epoch": 4.189822323134033, + "grad_norm": 0.23928916454315186, + "learning_rate": 3.376066386627957e-06, + "loss": 0.3569, "step": 116255 }, { - "epoch": 4.09, - "learning_rate": 4.228445892560415e-06, - "loss": 0.2597, + "epoch": 4.1900025227952575, + "grad_norm": 0.24761387705802917, + "learning_rate": 3.3746020820316972e-06, + "loss": 0.4, "step": 116260 }, { - "epoch": 4.09, - "learning_rate": 4.226860775911093e-06, - "loss": 0.2352, + "epoch": 4.190182722456481, + "grad_norm": 0.2518233358860016, + "learning_rate": 3.373138072077292e-06, + 
"loss": 0.3886, "step": 116265 }, { - "epoch": 4.09, - "learning_rate": 4.225592876793025e-06, - "loss": 0.2428, + "epoch": 4.190362922117706, + "grad_norm": 0.2131795734167099, + "learning_rate": 3.3716743567847015e-06, + "loss": 0.3559, "step": 116270 }, { - "epoch": 4.09, - "learning_rate": 4.224008245664465e-06, - "loss": 0.2457, + "epoch": 4.190543121778931, + "grad_norm": 0.27569085359573364, + "learning_rate": 3.370210936173862e-06, + "loss": 0.3568, "step": 116275 }, { - "epoch": 4.09, - "learning_rate": 4.222423884298468e-06, - "loss": 0.2538, + "epoch": 4.190723321440156, + "grad_norm": 0.24571947753429413, + "learning_rate": 3.3687478102647025e-06, + "loss": 0.3836, "step": 116280 }, { - "epoch": 4.09, - "learning_rate": 4.220839792715597e-06, - "loss": 0.2322, + "epoch": 4.19090352110138, + "grad_norm": 0.25763317942619324, + "learning_rate": 3.3672849790771704e-06, + "loss": 0.3327, "step": 116285 }, { - "epoch": 4.09, - "learning_rate": 4.219255970936431e-06, - "loss": 0.2299, + "epoch": 4.191083720762605, + "grad_norm": 0.21428771317005157, + "learning_rate": 3.365822442631192e-06, + "loss": 0.3846, "step": 116290 }, { - "epoch": 4.09, - "learning_rate": 4.217672418981533e-06, - "loss": 0.2667, + "epoch": 4.19126392042383, + "grad_norm": 0.24545690417289734, + "learning_rate": 3.3643602009466897e-06, + "loss": 0.3348, "step": 116295 }, { - "epoch": 4.09, - "learning_rate": 4.216089136871457e-06, - "loss": 0.2434, + "epoch": 4.1914441200850545, + "grad_norm": 0.23609334230422974, + "learning_rate": 3.3628982540436004e-06, + "loss": 0.3521, "step": 116300 }, { - "epoch": 4.09, - "learning_rate": 4.214506124626766e-06, - "loss": 0.2448, + "epoch": 4.191624319746279, + "grad_norm": 0.28154176473617554, + "learning_rate": 3.3614366019418255e-06, + "loss": 0.3723, "step": 116305 }, { - "epoch": 4.09, - "learning_rate": 4.212923382268022e-06, - "loss": 0.2586, + "epoch": 4.191804519407503, + "grad_norm": 0.3257730007171631, + "learning_rate": 3.3599752446612896e-06, + "loss": 0.3708, "step": 116310 }, { - "epoch": 4.09, - "learning_rate": 4.211340909815767e-06, - "loss": 0.2356, + "epoch": 4.191984719068728, + "grad_norm": 0.2922912836074829, + "learning_rate": 3.358514182221903e-06, + "loss": 0.3717, "step": 116315 }, { - "epoch": 4.09, - "learning_rate": 4.209758707290551e-06, - "loss": 0.2419, + "epoch": 4.192164918729953, + "grad_norm": 0.2203517109155655, + "learning_rate": 3.357053414643571e-06, + "loss": 0.3921, "step": 116320 }, { - "epoch": 4.09, - "learning_rate": 4.20817677471291e-06, - "loss": 0.232, + "epoch": 4.192345118391177, + "grad_norm": 0.27214643359184265, + "learning_rate": 3.355592941946195e-06, + "loss": 0.3657, "step": 116325 }, { - "epoch": 4.09, - "learning_rate": 4.206595112103398e-06, - "loss": 0.2493, + "epoch": 4.192525318052402, + "grad_norm": 0.22760187089443207, + "learning_rate": 3.3541327641496754e-06, + "loss": 0.3207, "step": 116330 }, { - "epoch": 4.09, - "learning_rate": 4.205013719482545e-06, - "loss": 0.262, + "epoch": 4.192705517713627, + "grad_norm": 0.2242773473262787, + "learning_rate": 3.3526728812738994e-06, + "loss": 0.3882, "step": 116335 }, { - "epoch": 4.09, - "learning_rate": 4.203432596870885e-06, - "loss": 0.2578, + "epoch": 4.1928857173748515, + "grad_norm": 0.2846883237361908, + "learning_rate": 3.35121329333877e-06, + "loss": 0.3775, "step": 116340 }, { - "epoch": 4.09, - "learning_rate": 4.20185174428894e-06, - "loss": 0.2718, + "epoch": 4.193065917036076, + "grad_norm": 0.2596665918827057, + "learning_rate": 
3.349754000364169e-06, + "loss": 0.3672, "step": 116345 }, { - "epoch": 4.09, - "learning_rate": 4.200271161757249e-06, - "loss": 0.2511, + "epoch": 4.193246116697301, + "grad_norm": 0.2771369516849518, + "learning_rate": 3.3482950023699777e-06, + "loss": 0.3545, "step": 116350 }, { - "epoch": 4.09, - "learning_rate": 4.198690849296325e-06, - "loss": 0.2662, + "epoch": 4.193426316358526, + "grad_norm": 0.2566249966621399, + "learning_rate": 3.346836299376077e-06, + "loss": 0.3985, "step": 116355 }, { - "epoch": 4.09, - "learning_rate": 4.197110806926696e-06, - "loss": 0.2592, + "epoch": 4.1936065160197495, + "grad_norm": 0.24861952662467957, + "learning_rate": 3.345377891402332e-06, + "loss": 0.3828, "step": 116360 }, { - "epoch": 4.09, - "learning_rate": 4.19553103466887e-06, - "loss": 0.2554, + "epoch": 4.193786715680974, + "grad_norm": 0.2329019010066986, + "learning_rate": 3.3439197784686267e-06, + "loss": 0.4036, "step": 116365 }, { - "epoch": 4.09, - "learning_rate": 4.1939515325433665e-06, - "loss": 0.2606, + "epoch": 4.193966915342199, + "grad_norm": 0.20926065742969513, + "learning_rate": 3.342461960594823e-06, + "loss": 0.3728, "step": 116370 }, { - "epoch": 4.09, - "learning_rate": 4.192372300570693e-06, - "loss": 0.2593, + "epoch": 4.194147115003424, + "grad_norm": 0.29187721014022827, + "learning_rate": 3.341004437800779e-06, + "loss": 0.3899, "step": 116375 }, { - "epoch": 4.09, - "learning_rate": 4.190793338771351e-06, - "loss": 0.2644, + "epoch": 4.1943273146646485, + "grad_norm": 0.25245538353919983, + "learning_rate": 3.3395472101063584e-06, + "loss": 0.3589, "step": 116380 }, { - "epoch": 4.09, - "learning_rate": 4.189214647165837e-06, - "loss": 0.2494, + "epoch": 4.194507514325873, + "grad_norm": 0.24126659333705902, + "learning_rate": 3.338090277531408e-06, + "loss": 0.38, "step": 116385 }, { - "epoch": 4.09, - "learning_rate": 4.187636225774663e-06, - "loss": 0.2551, + "epoch": 4.194687713987098, + "grad_norm": 0.31214639544487, + "learning_rate": 3.336633640095793e-06, + "loss": 0.3653, "step": 116390 }, { - "epoch": 4.1, - "learning_rate": 4.186058074618318e-06, - "loss": 0.2457, + "epoch": 4.194867913648323, + "grad_norm": 0.2557271122932434, + "learning_rate": 3.3351772978193472e-06, + "loss": 0.3487, "step": 116395 }, { - "epoch": 4.1, - "learning_rate": 4.184480193717285e-06, - "loss": 0.2555, + "epoch": 4.1950481133095465, + "grad_norm": 0.22002442181110382, + "learning_rate": 3.3337212507219077e-06, + "loss": 0.3418, "step": 116400 }, { - "epoch": 4.1, - "learning_rate": 4.182902583092063e-06, - "loss": 0.2545, + "epoch": 4.195228312970771, + "grad_norm": 0.3052828907966614, + "learning_rate": 3.332265498823331e-06, + "loss": 0.3924, "step": 116405 }, { - "epoch": 4.1, - "learning_rate": 4.181325242763129e-06, - "loss": 0.2385, + "epoch": 4.195408512631996, + "grad_norm": 0.2319956123828888, + "learning_rate": 3.3308100421434397e-06, + "loss": 0.4031, "step": 116410 }, { - "epoch": 4.1, - "learning_rate": 4.179748172750972e-06, - "loss": 0.229, + "epoch": 4.195588712293221, + "grad_norm": 0.2713097631931305, + "learning_rate": 3.3293548807020654e-06, + "loss": 0.3555, "step": 116415 }, { - "epoch": 4.1, - "learning_rate": 4.178171373076056e-06, - "loss": 0.2596, + "epoch": 4.1957689119544455, + "grad_norm": 0.296252578496933, + "learning_rate": 3.3279000145190366e-06, + "loss": 0.3809, "step": 116420 }, { - "epoch": 4.1, - "learning_rate": 4.176594843758869e-06, - "loss": 0.2502, + "epoch": 4.19594911161567, + "grad_norm": 0.29615938663482666, + 
"learning_rate": 3.3264454436141677e-06, + "loss": 0.382, "step": 116425 }, { - "epoch": 4.1, - "learning_rate": 4.175018584819876e-06, - "loss": 0.2371, + "epoch": 4.196129311276895, + "grad_norm": 0.2384418398141861, + "learning_rate": 3.324991168007291e-06, + "loss": 0.3745, "step": 116430 }, { - "epoch": 4.1, - "learning_rate": 4.173442596279542e-06, - "loss": 0.234, + "epoch": 4.19630951093812, + "grad_norm": 0.26839661598205566, + "learning_rate": 3.323537187718212e-06, + "loss": 0.3893, "step": 116435 }, { - "epoch": 4.1, - "learning_rate": 4.171866878158326e-06, - "loss": 0.2441, + "epoch": 4.196489710599344, + "grad_norm": 0.3010355830192566, + "learning_rate": 3.322083502766743e-06, + "loss": 0.3965, "step": 116440 }, { - "epoch": 4.1, - "learning_rate": 4.170291430476697e-06, - "loss": 0.2283, + "epoch": 4.196669910260569, + "grad_norm": 0.25463324785232544, + "learning_rate": 3.3206301131726875e-06, + "loss": 0.3634, "step": 116445 }, { - "epoch": 4.1, - "learning_rate": 4.168716253255106e-06, - "loss": 0.2577, + "epoch": 4.196850109921793, + "grad_norm": 0.2410764843225479, + "learning_rate": 3.3191770189558434e-06, + "loss": 0.3554, "step": 116450 }, { - "epoch": 4.1, - "learning_rate": 4.167141346514006e-06, - "loss": 0.2455, + "epoch": 4.197030309583018, + "grad_norm": 0.315395325422287, + "learning_rate": 3.317724220136023e-06, + "loss": 0.3935, "step": 116455 }, { - "epoch": 4.1, - "learning_rate": 4.165566710273841e-06, - "loss": 0.2543, + "epoch": 4.197210509244242, + "grad_norm": 0.27405762672424316, + "learning_rate": 3.3162717167330103e-06, + "loss": 0.379, "step": 116460 }, { - "epoch": 4.1, - "learning_rate": 4.163992344555062e-06, - "loss": 0.2596, + "epoch": 4.197390708905467, + "grad_norm": 0.21512751281261444, + "learning_rate": 3.314819508766598e-06, + "loss": 0.358, "step": 116465 }, { - "epoch": 4.1, - "learning_rate": 4.162418249378117e-06, - "loss": 0.2453, + "epoch": 4.197570908566692, + "grad_norm": 0.2765870988368988, + "learning_rate": 3.3133675962565723e-06, + "loss": 0.3922, "step": 116470 }, { - "epoch": 4.1, - "learning_rate": 4.160844424763438e-06, - "loss": 0.2464, + "epoch": 4.197751108227917, + "grad_norm": 0.20413513481616974, + "learning_rate": 3.311915979222713e-06, + "loss": 0.3726, "step": 116475 }, { - "epoch": 4.1, - "learning_rate": 4.159270870731455e-06, - "loss": 0.2531, + "epoch": 4.197931307889141, + "grad_norm": 0.28501495718955994, + "learning_rate": 3.3104646576847947e-06, + "loss": 0.3839, "step": 116480 }, { - "epoch": 4.1, - "learning_rate": 4.157697587302612e-06, - "loss": 0.2168, + "epoch": 4.198111507550366, + "grad_norm": 0.2470470815896988, + "learning_rate": 3.309013631662608e-06, + "loss": 0.3825, "step": 116485 }, { - "epoch": 4.1, - "learning_rate": 4.156124574497327e-06, - "loss": 0.2791, + "epoch": 4.198291707211591, + "grad_norm": 0.2344074696302414, + "learning_rate": 3.3075629011759007e-06, + "loss": 0.37, "step": 116490 }, { - "epoch": 4.1, - "learning_rate": 4.154551832336031e-06, - "loss": 0.2586, + "epoch": 4.198471906872815, + "grad_norm": 0.2440483570098877, + "learning_rate": 3.3061124662444513e-06, + "loss": 0.3667, "step": 116495 }, { - "epoch": 4.1, - "learning_rate": 4.152979360839135e-06, - "loss": 0.2233, + "epoch": 4.198652106534039, + "grad_norm": 0.22694821655750275, + "learning_rate": 3.3046623268880224e-06, + "loss": 0.3171, "step": 116500 }, { - "epoch": 4.1, - "eval_loss": 0.2499181181192398, - "eval_runtime": 10.5421, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + 
"epoch": 4.198652106534039, + "eval_loss": 0.42932018637657166, + "eval_runtime": 3.531, + "eval_samples_per_second": 28.32, + "eval_steps_per_second": 7.08, "step": 116500 }, { - "epoch": 4.1, - "learning_rate": 4.151407160027068e-06, - "loss": 0.2409, + "epoch": 4.198832306195264, + "grad_norm": 0.25988444685935974, + "learning_rate": 3.3032124831263668e-06, + "loss": 0.3446, "step": 116505 }, { - "epoch": 4.1, - "learning_rate": 4.149835229920241e-06, - "loss": 0.2562, + "epoch": 4.199012505856489, + "grad_norm": 0.2985076308250427, + "learning_rate": 3.301762934979244e-06, + "loss": 0.3893, "step": 116510 }, { - "epoch": 4.1, - "learning_rate": 4.148263570539057e-06, - "loss": 0.2317, + "epoch": 4.199192705517714, + "grad_norm": 0.24939928948879242, + "learning_rate": 3.3003136824663965e-06, + "loss": 0.3893, "step": 116515 }, { - "epoch": 4.1, - "learning_rate": 4.146692181903933e-06, - "loss": 0.2491, + "epoch": 4.199372905178938, + "grad_norm": 0.24855536222457886, + "learning_rate": 3.29886472560757e-06, + "loss": 0.3757, "step": 116520 }, { - "epoch": 4.1, - "learning_rate": 4.1451210640352625e-06, - "loss": 0.2983, + "epoch": 4.199553104840163, + "grad_norm": 0.23109517991542816, + "learning_rate": 3.297416064422515e-06, + "loss": 0.3638, "step": 116525 }, { - "epoch": 4.1, - "learning_rate": 4.1435502169534595e-06, - "loss": 0.251, + "epoch": 4.199733304501388, + "grad_norm": 0.3385915160179138, + "learning_rate": 3.2959676989309657e-06, + "loss": 0.3723, "step": 116530 }, { - "epoch": 4.1, - "learning_rate": 4.141979640678903e-06, - "loss": 0.2506, + "epoch": 4.1999135041626126, + "grad_norm": 0.24903911352157593, + "learning_rate": 3.2945196291526535e-06, + "loss": 0.3575, "step": 116535 }, { - "epoch": 4.1, - "learning_rate": 4.140409335232004e-06, - "loss": 0.2547, + "epoch": 4.200093703823836, + "grad_norm": 0.28424081206321716, + "learning_rate": 3.29307185510731e-06, + "loss": 0.3789, "step": 116540 }, { - "epoch": 4.1, - "learning_rate": 4.13883930063314e-06, - "loss": 0.242, + "epoch": 4.200273903485061, + "grad_norm": 0.2683737277984619, + "learning_rate": 3.2916243768146533e-06, + "loss": 0.3787, "step": 116545 }, { - "epoch": 4.1, - "learning_rate": 4.137269536902699e-06, - "loss": 0.2716, + "epoch": 4.200454103146286, + "grad_norm": 0.26493752002716064, + "learning_rate": 3.2901771942944167e-06, + "loss": 0.3706, "step": 116550 }, { - "epoch": 4.1, - "learning_rate": 4.135700044061061e-06, - "loss": 0.2277, + "epoch": 4.200634302807511, + "grad_norm": 0.33468732237815857, + "learning_rate": 3.288730307566312e-06, + "loss": 0.3563, "step": 116555 }, { - "epoch": 4.1, - "learning_rate": 4.134130822128612e-06, - "loss": 0.2366, + "epoch": 4.200814502468735, + "grad_norm": 0.26545238494873047, + "learning_rate": 3.2872837166500555e-06, + "loss": 0.3556, "step": 116560 }, { - "epoch": 4.1, - "learning_rate": 4.132561871125723e-06, - "loss": 0.2422, + "epoch": 4.20099470212996, + "grad_norm": 0.2382139265537262, + "learning_rate": 3.2858374215653547e-06, + "loss": 0.3753, "step": 116565 }, { - "epoch": 4.1, - "learning_rate": 4.1309931910727604e-06, - "loss": 0.2761, + "epoch": 4.201174901791185, + "grad_norm": 0.18432195484638214, + "learning_rate": 3.284391422331909e-06, + "loss": 0.3495, "step": 116570 }, { - "epoch": 4.1, - "learning_rate": 4.129424781990104e-06, - "loss": 0.243, + "epoch": 4.2013551014524095, + "grad_norm": 0.2506224513053894, + "learning_rate": 3.282945718969435e-06, + "loss": 0.3927, "step": 116575 }, { - "epoch": 4.1, - "learning_rate": 
4.127856643898109e-06, - "loss": 0.2461, + "epoch": 4.201535301113634, + "grad_norm": 0.270535945892334, + "learning_rate": 3.2815003114976177e-06, + "loss": 0.383, "step": 116580 }, { - "epoch": 4.1, - "learning_rate": 4.126288776817142e-06, - "loss": 0.2552, + "epoch": 4.201715500774858, + "grad_norm": 0.26242825388908386, + "learning_rate": 3.2800551999361497e-06, + "loss": 0.3206, "step": 116585 }, { - "epoch": 4.1, - "learning_rate": 4.1247211807675625e-06, - "loss": 0.2589, + "epoch": 4.201895700436083, + "grad_norm": 0.29855820536613464, + "learning_rate": 3.2786103843047254e-06, + "loss": 0.3921, "step": 116590 }, { - "epoch": 4.1, - "learning_rate": 4.123153855769712e-06, - "loss": 0.2355, + "epoch": 4.202075900097308, + "grad_norm": 0.25370094180107117, + "learning_rate": 3.2771658646230274e-06, + "loss": 0.3744, "step": 116595 }, { - "epoch": 4.1, - "learning_rate": 4.121586801843958e-06, - "loss": 0.267, + "epoch": 4.202256099758532, + "grad_norm": 0.22978107631206512, + "learning_rate": 3.2757216409107506e-06, + "loss": 0.3969, "step": 116600 }, { - "epoch": 4.1, - "learning_rate": 4.12002001901064e-06, - "loss": 0.2511, + "epoch": 4.202436299419757, + "grad_norm": 0.2947256565093994, + "learning_rate": 3.2742777131875514e-06, + "loss": 0.3395, "step": 116605 }, { - "epoch": 4.1, - "learning_rate": 4.118453507290102e-06, - "loss": 0.2624, + "epoch": 4.202616499080982, + "grad_norm": 0.22788827121257782, + "learning_rate": 3.2728340814731082e-06, + "loss": 0.354, "step": 116610 }, { - "epoch": 4.1, - "learning_rate": 4.116887266702676e-06, - "loss": 0.2692, + "epoch": 4.2027966987422065, + "grad_norm": 0.3038329482078552, + "learning_rate": 3.2713907457871006e-06, + "loss": 0.3497, "step": 116615 }, { - "epoch": 4.1, - "learning_rate": 4.115321297268715e-06, - "loss": 0.2561, + "epoch": 4.202976898403431, + "grad_norm": 0.3307643234729767, + "learning_rate": 3.2699477061491873e-06, + "loss": 0.3688, "step": 116620 }, { - "epoch": 4.1, - "learning_rate": 4.113755599008537e-06, - "loss": 0.2559, + "epoch": 4.203157098064656, + "grad_norm": 0.2802697718143463, + "learning_rate": 3.2685049625790276e-06, + "loss": 0.4305, "step": 116625 }, { - "epoch": 4.1, - "learning_rate": 4.112190171942482e-06, - "loss": 0.2299, + "epoch": 4.203337297725881, + "grad_norm": 0.24683642387390137, + "learning_rate": 3.2670625150962806e-06, + "loss": 0.3701, "step": 116630 }, { - "epoch": 4.1, - "learning_rate": 4.110625016090866e-06, - "loss": 0.2546, + "epoch": 4.203517497387105, + "grad_norm": 0.22279055416584015, + "learning_rate": 3.265620363720595e-06, + "loss": 0.3596, "step": 116635 }, { - "epoch": 4.1, - "learning_rate": 4.109060131474026e-06, - "loss": 0.2338, + "epoch": 4.203697697048329, + "grad_norm": 0.21414312720298767, + "learning_rate": 3.2641785084716296e-06, + "loss": 0.4076, "step": 116640 }, { - "epoch": 4.1, - "learning_rate": 4.10749551811227e-06, - "loss": 0.2297, + "epoch": 4.203877896709554, + "grad_norm": 0.2691706717014313, + "learning_rate": 3.2627369493690245e-06, + "loss": 0.3498, "step": 116645 }, { - "epoch": 4.1, - "learning_rate": 4.105931176025912e-06, - "loss": 0.2499, + "epoch": 4.204058096370779, + "grad_norm": 0.24428191781044006, + "learning_rate": 3.261295686432417e-06, + "loss": 0.4001, "step": 116650 }, { - "epoch": 4.1, - "learning_rate": 4.104367105235274e-06, - "loss": 0.268, + "epoch": 4.2042382960320035, + "grad_norm": 0.23344086110591888, + "learning_rate": 3.2598547196814487e-06, + "loss": 0.3345, "step": 116655 }, { - "epoch": 4.1, - 
"learning_rate": 4.102803305760658e-06, - "loss": 0.2344, + "epoch": 4.204418495693228, + "grad_norm": 0.2314455211162567, + "learning_rate": 3.2584140491357523e-06, + "loss": 0.3732, "step": 116660 }, { - "epoch": 4.1, - "learning_rate": 4.101239777622368e-06, - "loss": 0.2388, + "epoch": 4.204598695354453, + "grad_norm": 0.2983637750148773, + "learning_rate": 3.2569736748149477e-06, + "loss": 0.4034, "step": 116665 }, { - "epoch": 4.1, - "learning_rate": 4.099676520840701e-06, - "loss": 0.2223, + "epoch": 4.204778895015678, + "grad_norm": 0.2737831473350525, + "learning_rate": 3.2555335967386804e-06, + "loss": 0.334, "step": 116670 }, { - "epoch": 4.1, - "learning_rate": 4.098113535435968e-06, - "loss": 0.2801, + "epoch": 4.204959094676902, + "grad_norm": 0.24176688492298126, + "learning_rate": 3.2540938149265453e-06, + "loss": 0.3541, "step": 116675 }, { - "epoch": 4.11, - "learning_rate": 4.096550821428455e-06, - "loss": 0.259, + "epoch": 4.205139294338126, + "grad_norm": 0.21122761070728302, + "learning_rate": 3.2526543293981775e-06, + "loss": 0.3499, "step": 116680 }, { - "epoch": 4.11, - "learning_rate": 4.094988378838446e-06, - "loss": 0.2475, + "epoch": 4.205319493999351, + "grad_norm": 0.23312951624393463, + "learning_rate": 3.2512151401731833e-06, + "loss": 0.4094, "step": 116685 }, { - "epoch": 4.11, - "learning_rate": 4.093426207686238e-06, - "loss": 0.233, + "epoch": 4.205499693660576, + "grad_norm": 0.24352633953094482, + "learning_rate": 3.249776247271172e-06, + "loss": 0.3615, "step": 116690 }, { - "epoch": 4.11, - "learning_rate": 4.091864307992119e-06, - "loss": 0.2482, + "epoch": 4.2056798933218005, + "grad_norm": 0.23205436766147614, + "learning_rate": 3.24833765071175e-06, + "loss": 0.3529, "step": 116695 }, { - "epoch": 4.11, - "learning_rate": 4.090302679776361e-06, - "loss": 0.2608, + "epoch": 4.205860092983025, + "grad_norm": 0.23461531102657318, + "learning_rate": 3.246899350514515e-06, + "loss": 0.3595, "step": 116700 }, { - "epoch": 4.11, - "learning_rate": 4.0887413230592444e-06, - "loss": 0.2399, + "epoch": 4.20604029264425, + "grad_norm": 0.26459741592407227, + "learning_rate": 3.245461346699061e-06, + "loss": 0.408, "step": 116705 }, { - "epoch": 4.11, - "learning_rate": 4.087180237861035e-06, - "loss": 0.2476, + "epoch": 4.206220492305475, + "grad_norm": 0.2987162172794342, + "learning_rate": 3.2440236392849877e-06, + "loss": 0.3556, "step": 116710 }, { - "epoch": 4.11, - "learning_rate": 4.0856194242020144e-06, - "loss": 0.2735, + "epoch": 4.206400691966699, + "grad_norm": 0.22682152688503265, + "learning_rate": 3.24258622829188e-06, + "loss": 0.361, "step": 116715 }, { - "epoch": 4.11, - "learning_rate": 4.08405888210244e-06, - "loss": 0.2621, + "epoch": 4.206580891627924, + "grad_norm": 0.2210111767053604, + "learning_rate": 3.241149113739325e-06, + "loss": 0.3509, "step": 116720 }, { - "epoch": 4.11, - "learning_rate": 4.08249861158258e-06, - "loss": 0.244, + "epoch": 4.206761091289148, + "grad_norm": 0.2724922001361847, + "learning_rate": 3.239712295646899e-06, + "loss": 0.3745, "step": 116725 }, { - "epoch": 4.11, - "learning_rate": 4.080938612662682e-06, - "loss": 0.2618, + "epoch": 4.206941290950373, + "grad_norm": 0.25349199771881104, + "learning_rate": 3.238275774034175e-06, + "loss": 0.3653, "step": 116730 }, { - "epoch": 4.11, - "learning_rate": 4.079378885363017e-06, - "loss": 0.2448, + "epoch": 4.2071214906115975, + "grad_norm": 0.29536816477775574, + "learning_rate": 3.2368395489207366e-06, + "loss": 0.383, "step": 116735 }, { - "epoch": 
4.11, - "learning_rate": 4.0778194297038245e-06, - "loss": 0.2634, + "epoch": 4.207301690272822, + "grad_norm": 0.2729000151157379, + "learning_rate": 3.235403620326144e-06, + "loss": 0.354, "step": 116740 }, { - "epoch": 4.11, - "learning_rate": 4.076260245705362e-06, - "loss": 0.2495, + "epoch": 4.207481889934047, + "grad_norm": 0.27360138297080994, + "learning_rate": 3.233967988269965e-06, + "loss": 0.3776, "step": 116745 }, { - "epoch": 4.11, - "learning_rate": 4.074701333387867e-06, - "loss": 0.2487, + "epoch": 4.207662089595272, + "grad_norm": 0.20422430336475372, + "learning_rate": 3.2325326527717582e-06, + "loss": 0.3573, "step": 116750 }, { - "epoch": 4.11, - "learning_rate": 4.073142692771592e-06, - "loss": 0.2425, + "epoch": 4.207842289256496, + "grad_norm": 0.31160596013069153, + "learning_rate": 3.2310976138510724e-06, + "loss": 0.3787, "step": 116755 }, { - "epoch": 4.11, - "learning_rate": 4.071584323876765e-06, - "loss": 0.2397, + "epoch": 4.208022488917721, + "grad_norm": 0.2688230574131012, + "learning_rate": 3.2296628715274723e-06, + "loss": 0.3671, "step": 116760 }, { - "epoch": 4.11, - "learning_rate": 4.0700262267236195e-06, - "loss": 0.267, + "epoch": 4.208202688578946, + "grad_norm": 0.2496272772550583, + "learning_rate": 3.2282284258205093e-06, + "loss": 0.3467, "step": 116765 }, { - "epoch": 4.11, - "learning_rate": 4.068468401332395e-06, - "loss": 0.2278, + "epoch": 4.20838288824017, + "grad_norm": 0.2692166268825531, + "learning_rate": 3.226794276749706e-06, + "loss": 0.3607, "step": 116770 }, { - "epoch": 4.11, - "learning_rate": 4.066910847723315e-06, - "loss": 0.2298, + "epoch": 4.208563087901394, + "grad_norm": 0.27493536472320557, + "learning_rate": 3.2253604243346196e-06, + "loss": 0.3849, "step": 116775 }, { - "epoch": 4.11, - "learning_rate": 4.065353565916599e-06, - "loss": 0.2571, + "epoch": 4.208743287562619, + "grad_norm": 0.25217947363853455, + "learning_rate": 3.2239268685947787e-06, + "loss": 0.3608, "step": 116780 }, { - "epoch": 4.11, - "learning_rate": 4.063796555932467e-06, - "loss": 0.2576, + "epoch": 4.208923487223844, + "grad_norm": 0.24530518054962158, + "learning_rate": 3.222493609549726e-06, + "loss": 0.4019, "step": 116785 }, { - "epoch": 4.11, - "learning_rate": 4.062239817791147e-06, - "loss": 0.2294, + "epoch": 4.209103686885069, + "grad_norm": 0.2516281008720398, + "learning_rate": 3.2210606472189792e-06, + "loss": 0.3362, "step": 116790 }, { - "epoch": 4.11, - "learning_rate": 4.060683351512837e-06, - "loss": 0.2432, + "epoch": 4.209283886546293, + "grad_norm": 0.237876757979393, + "learning_rate": 3.2196279816220587e-06, + "loss": 0.3502, "step": 116795 }, { - "epoch": 4.11, - "learning_rate": 4.0591271571177596e-06, - "loss": 0.2427, + "epoch": 4.209464086207518, + "grad_norm": 0.242570698261261, + "learning_rate": 3.2181956127784933e-06, + "loss": 0.3677, "step": 116800 }, { - "epoch": 4.11, - "learning_rate": 4.057571234626109e-06, - "loss": 0.2379, + "epoch": 4.209644285868743, + "grad_norm": 0.2549905478954315, + "learning_rate": 3.2167635407077982e-06, + "loss": 0.3541, "step": 116805 }, { - "epoch": 4.11, - "learning_rate": 4.056015584058101e-06, - "loss": 0.2645, + "epoch": 4.209824485529968, + "grad_norm": 0.33291956782341003, + "learning_rate": 3.2153317654294803e-06, + "loss": 0.4255, "step": 116810 }, { - "epoch": 4.11, - "learning_rate": 4.054460205433927e-06, - "loss": 0.2201, + "epoch": 4.210004685191191, + "grad_norm": 0.2316872775554657, + "learning_rate": 3.213900286963051e-06, + "loss": 0.3734, "step": 116815 }, 
{ - "epoch": 4.11, - "learning_rate": 4.052905098773785e-06, - "loss": 0.2621, + "epoch": 4.210184884852416, + "grad_norm": 0.24276618659496307, + "learning_rate": 3.212469105328006e-06, + "loss": 0.402, "step": 116820 }, { - "epoch": 4.11, - "learning_rate": 4.0513502640978584e-06, - "loss": 0.2348, + "epoch": 4.210365084513641, + "grad_norm": 0.20943395793437958, + "learning_rate": 3.211038220543858e-06, + "loss": 0.3512, "step": 116825 }, { - "epoch": 4.11, - "learning_rate": 4.049795701426351e-06, - "loss": 0.2692, + "epoch": 4.210545284174866, + "grad_norm": 0.263275682926178, + "learning_rate": 3.2096076326300933e-06, + "loss": 0.3615, "step": 116830 }, { - "epoch": 4.11, - "learning_rate": 4.048241410779438e-06, - "loss": 0.2655, + "epoch": 4.21072548383609, + "grad_norm": 0.234664648771286, + "learning_rate": 3.2081773416062083e-06, + "loss": 0.3926, "step": 116835 }, { - "epoch": 4.11, - "learning_rate": 4.046687392177303e-06, - "loss": 0.2686, + "epoch": 4.210905683497315, + "grad_norm": 0.2711605429649353, + "learning_rate": 3.206747347491687e-06, + "loss": 0.3848, "step": 116840 }, { - "epoch": 4.11, - "learning_rate": 4.045133645640118e-06, - "loss": 0.2371, + "epoch": 4.21108588315854, + "grad_norm": 0.22846084833145142, + "learning_rate": 3.205317650306014e-06, + "loss": 0.3744, "step": 116845 }, { - "epoch": 4.11, - "learning_rate": 4.043580171188061e-06, - "loss": 0.2566, + "epoch": 4.2112660828197646, + "grad_norm": 0.2208365499973297, + "learning_rate": 3.2038882500686625e-06, + "loss": 0.3703, "step": 116850 }, { - "epoch": 4.11, - "learning_rate": 4.042026968841311e-06, - "loss": 0.2576, + "epoch": 4.211446282480989, + "grad_norm": 0.3054751753807068, + "learning_rate": 3.202459146799125e-06, + "loss": 0.3998, "step": 116855 }, { - "epoch": 4.11, - "learning_rate": 4.040474038620032e-06, - "loss": 0.2379, + "epoch": 4.211626482142213, + "grad_norm": 0.23200340569019318, + "learning_rate": 3.201030340516853e-06, + "loss": 0.3711, "step": 116860 }, { - "epoch": 4.11, - "learning_rate": 4.0389213805443775e-06, - "loss": 0.2348, + "epoch": 4.211806681803438, + "grad_norm": 0.29509085416793823, + "learning_rate": 3.199601831241325e-06, + "loss": 0.385, "step": 116865 }, { - "epoch": 4.11, - "learning_rate": 4.037368994634519e-06, - "loss": 0.2544, + "epoch": 4.211986881464663, + "grad_norm": 0.22504891455173492, + "learning_rate": 3.198173618992001e-06, + "loss": 0.3803, "step": 116870 }, { - "epoch": 4.11, - "learning_rate": 4.035816880910612e-06, - "loss": 0.2621, + "epoch": 4.212167081125887, + "grad_norm": 0.2349441796541214, + "learning_rate": 3.196745703788334e-06, + "loss": 0.3666, "step": 116875 }, { - "epoch": 4.11, - "learning_rate": 4.034265039392798e-06, - "loss": 0.2743, + "epoch": 4.212347280787112, + "grad_norm": 0.3012699484825134, + "learning_rate": 3.1953180856497984e-06, + "loss": 0.3774, "step": 116880 }, { - "epoch": 4.11, - "learning_rate": 4.032713470101243e-06, - "loss": 0.2546, + "epoch": 4.212527480448337, + "grad_norm": 0.23085784912109375, + "learning_rate": 3.1938907645958275e-06, + "loss": 0.3879, "step": 116885 }, { - "epoch": 4.11, - "learning_rate": 4.031162173056086e-06, - "loss": 0.2264, + "epoch": 4.2127076801095615, + "grad_norm": 0.20077353715896606, + "learning_rate": 3.192463740645868e-06, + "loss": 0.3781, "step": 116890 }, { - "epoch": 4.11, - "learning_rate": 4.029611148277468e-06, - "loss": 0.2662, + "epoch": 4.212887879770786, + "grad_norm": 0.26119643449783325, + "learning_rate": 3.19103701381937e-06, + "loss": 0.3959, "step": 
116895 }, { - "epoch": 4.11, - "learning_rate": 4.028060395785524e-06, - "loss": 0.2493, + "epoch": 4.213068079432011, + "grad_norm": 0.28771156072616577, + "learning_rate": 3.1896105841357745e-06, + "loss": 0.3555, "step": 116900 }, { - "epoch": 4.11, - "learning_rate": 4.026509915600393e-06, - "loss": 0.2831, + "epoch": 4.213248279093236, + "grad_norm": 0.23709803819656372, + "learning_rate": 3.188184451614509e-06, + "loss": 0.3638, "step": 116905 }, { - "epoch": 4.11, - "learning_rate": 4.024959707742218e-06, - "loss": 0.2659, + "epoch": 4.21342847875446, + "grad_norm": 0.25012511014938354, + "learning_rate": 3.1867586162750095e-06, + "loss": 0.3855, "step": 116910 }, { - "epoch": 4.11, - "learning_rate": 4.023409772231115e-06, - "loss": 0.246, + "epoch": 4.213608678415684, + "grad_norm": 0.24966490268707275, + "learning_rate": 3.185333078136693e-06, + "loss": 0.3769, "step": 116915 }, { - "epoch": 4.11, - "learning_rate": 4.021860109087208e-06, - "loss": 0.2348, + "epoch": 4.213788878076909, + "grad_norm": 0.2194073498249054, + "learning_rate": 3.1839078372189966e-06, + "loss": 0.3601, "step": 116920 }, { - "epoch": 4.11, - "learning_rate": 4.020310718330628e-06, - "loss": 0.2468, + "epoch": 4.213969077738134, + "grad_norm": 0.2655618488788605, + "learning_rate": 3.182482893541333e-06, + "loss": 0.4166, "step": 116925 }, { - "epoch": 4.11, - "learning_rate": 4.018761599981488e-06, - "loss": 0.2415, + "epoch": 4.2141492773993585, + "grad_norm": 0.23362277448177338, + "learning_rate": 3.1810582471231143e-06, + "loss": 0.3466, "step": 116930 }, { - "epoch": 4.11, - "learning_rate": 4.017212754059901e-06, - "loss": 0.2619, + "epoch": 4.214329477060583, + "grad_norm": 0.2612302303314209, + "learning_rate": 3.179633897983755e-06, + "loss": 0.3442, "step": 116935 }, { - "epoch": 4.11, - "learning_rate": 4.015664180585971e-06, - "loss": 0.2512, + "epoch": 4.214509676721808, + "grad_norm": 0.258493572473526, + "learning_rate": 3.1782098461426516e-06, + "loss": 0.3749, "step": 116940 }, { - "epoch": 4.11, - "learning_rate": 4.0141158795798185e-06, - "loss": 0.2675, + "epoch": 4.214689876383033, + "grad_norm": 0.297879159450531, + "learning_rate": 3.1767860916192216e-06, + "loss": 0.395, "step": 116945 }, { - "epoch": 4.11, - "learning_rate": 4.012567851061544e-06, - "loss": 0.243, + "epoch": 4.2148700760442575, + "grad_norm": 0.23272161185741425, + "learning_rate": 3.17536263443286e-06, + "loss": 0.3777, "step": 116950 }, { - "epoch": 4.11, - "learning_rate": 4.011020095051235e-06, - "loss": 0.2618, + "epoch": 4.215050275705481, + "grad_norm": 0.21118395030498505, + "learning_rate": 3.1739394746029465e-06, + "loss": 0.3803, "step": 116955 }, { - "epoch": 4.11, - "learning_rate": 4.009472611569006e-06, - "loss": 0.2237, + "epoch": 4.215230475366706, + "grad_norm": 0.1826326549053192, + "learning_rate": 3.1725166121488875e-06, + "loss": 0.3826, "step": 116960 }, { - "epoch": 4.12, - "learning_rate": 4.007925400634935e-06, - "loss": 0.2513, + "epoch": 4.215410675027931, + "grad_norm": 0.23955535888671875, + "learning_rate": 3.171094047090059e-06, + "loss": 0.3674, "step": 116965 }, { - "epoch": 4.12, - "learning_rate": 4.0063784622691255e-06, - "loss": 0.2646, + "epoch": 4.2155908746891555, + "grad_norm": 0.27745673060417175, + "learning_rate": 3.169671779445857e-06, + "loss": 0.3951, "step": 116970 }, { - "epoch": 4.12, - "learning_rate": 4.004831796491653e-06, - "loss": 0.2411, + "epoch": 4.21577107435038, + "grad_norm": 0.1933916211128235, + "learning_rate": 3.168249809235646e-06, + "loss": 
0.3331, "step": 116975 }, { - "epoch": 4.12, - "learning_rate": 4.003285403322602e-06, - "loss": 0.244, + "epoch": 4.215951274011605, + "grad_norm": 0.30480435490608215, + "learning_rate": 3.1668281364788e-06, + "loss": 0.3533, "step": 116980 }, { - "epoch": 4.12, - "learning_rate": 4.001739282782055e-06, - "loss": 0.2428, + "epoch": 4.21613147367283, + "grad_norm": 0.2264707088470459, + "learning_rate": 3.1654067611946974e-06, + "loss": 0.3712, "step": 116985 }, { - "epoch": 4.12, - "learning_rate": 4.000193434890087e-06, - "loss": 0.2518, + "epoch": 4.216311673334054, + "grad_norm": 0.25249165296554565, + "learning_rate": 3.1639856834027036e-06, + "loss": 0.4058, "step": 116990 }, { - "epoch": 4.12, - "learning_rate": 3.99864785966676e-06, - "loss": 0.2488, + "epoch": 4.216491872995279, + "grad_norm": 0.22679844498634338, + "learning_rate": 3.1625649031221725e-06, + "loss": 0.3795, "step": 116995 }, { - "epoch": 4.12, - "learning_rate": 3.997102557132157e-06, - "loss": 0.25, + "epoch": 4.216672072656503, + "grad_norm": 0.29318711161613464, + "learning_rate": 3.1611444203724695e-06, + "loss": 0.3723, "step": 117000 }, { - "epoch": 4.12, - "eval_loss": 0.24963203072547913, - "eval_runtime": 10.5439, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 4.216672072656503, + "eval_loss": 0.4291633367538452, + "eval_runtime": 3.5298, + "eval_samples_per_second": 28.33, + "eval_steps_per_second": 7.083, "step": 117000 }, { - "epoch": 4.12, - "learning_rate": 3.995557527306337e-06, - "loss": 0.2382, + "epoch": 4.216852272317728, + "grad_norm": 0.26223427057266235, + "learning_rate": 3.1597242351729425e-06, + "loss": 0.3777, "step": 117005 }, { - "epoch": 4.12, - "learning_rate": 3.994012770209354e-06, - "loss": 0.2457, + "epoch": 4.2170324719789525, + "grad_norm": 0.24939653277397156, + "learning_rate": 3.1583043475429453e-06, + "loss": 0.3564, "step": 117010 }, { - "epoch": 4.12, - "learning_rate": 3.992468285861275e-06, - "loss": 0.2589, + "epoch": 4.217212671640177, + "grad_norm": 0.27071085572242737, + "learning_rate": 3.156884757501824e-06, + "loss": 0.3712, "step": 117015 }, { - "epoch": 4.12, - "learning_rate": 3.9909240742821484e-06, - "loss": 0.2411, + "epoch": 4.217392871301402, + "grad_norm": 0.2698659598827362, + "learning_rate": 3.1554654650689215e-06, + "loss": 0.394, "step": 117020 }, { - "epoch": 4.12, - "learning_rate": 3.989380135492032e-06, - "loss": 0.2512, + "epoch": 4.217573070962627, + "grad_norm": 0.28294146060943604, + "learning_rate": 3.154046470263569e-06, + "loss": 0.3858, "step": 117025 }, { - "epoch": 4.12, - "learning_rate": 3.987836469510967e-06, - "loss": 0.2572, + "epoch": 4.217753270623851, + "grad_norm": 0.28658536076545715, + "learning_rate": 3.1526277731051074e-06, + "loss": 0.3811, "step": 117030 }, { - "epoch": 4.12, - "learning_rate": 3.986293076358993e-06, - "loss": 0.2433, + "epoch": 4.217933470285076, + "grad_norm": 0.22426393628120422, + "learning_rate": 3.1512093736128538e-06, + "loss": 0.3315, "step": 117035 }, { - "epoch": 4.12, - "learning_rate": 3.9847499560561605e-06, - "loss": 0.2575, + "epoch": 4.218113669946301, + "grad_norm": 0.22765573859214783, + "learning_rate": 3.149791271806152e-06, + "loss": 0.3562, "step": 117040 }, { - "epoch": 4.12, - "learning_rate": 3.9832071086225e-06, - "loss": 0.2379, + "epoch": 4.218293869607525, + "grad_norm": 0.2529604434967041, + "learning_rate": 3.1483734677043102e-06, + "loss": 0.3777, "step": 117045 }, { - "epoch": 4.12, - "learning_rate": 3.981664534078045e-06, - "loss": 
0.2512, + "epoch": 4.2184740692687495, + "grad_norm": 0.21222354471683502, + "learning_rate": 3.14695596132665e-06, + "loss": 0.3615, "step": 117050 }, { - "epoch": 4.12, - "learning_rate": 3.980122232442817e-06, - "loss": 0.2441, + "epoch": 4.218654268929974, + "grad_norm": 0.28866201639175415, + "learning_rate": 3.145538752692484e-06, + "loss": 0.3926, "step": 117055 }, { - "epoch": 4.12, - "learning_rate": 3.978580203736857e-06, - "loss": 0.2501, + "epoch": 4.218834468591199, + "grad_norm": 0.27018246054649353, + "learning_rate": 3.1441218418211155e-06, + "loss": 0.3999, "step": 117060 }, { - "epoch": 4.12, - "learning_rate": 3.977038447980172e-06, - "loss": 0.2599, + "epoch": 4.219014668252424, + "grad_norm": 0.2012593001127243, + "learning_rate": 3.142705228731868e-06, + "loss": 0.3466, "step": 117065 }, { - "epoch": 4.12, - "learning_rate": 3.975496965192796e-06, - "loss": 0.2381, + "epoch": 4.219194867913648, + "grad_norm": 0.33578363060951233, + "learning_rate": 3.141288913444024e-06, + "loss": 0.3932, "step": 117070 }, { - "epoch": 4.12, - "learning_rate": 3.9739557553947285e-06, - "loss": 0.2545, + "epoch": 4.219375067574873, + "grad_norm": 0.24772273004055023, + "learning_rate": 3.139872895976881e-06, + "loss": 0.3928, "step": 117075 }, { - "epoch": 4.12, - "learning_rate": 3.972414818605996e-06, - "loss": 0.2488, + "epoch": 4.219555267236098, + "grad_norm": 0.2516150176525116, + "learning_rate": 3.1384571763497435e-06, + "loss": 0.3273, "step": 117080 }, { - "epoch": 4.12, - "learning_rate": 3.970874154846599e-06, - "loss": 0.2561, + "epoch": 4.219735466897323, + "grad_norm": 0.3026273250579834, + "learning_rate": 3.1370417545818933e-06, + "loss": 0.388, "step": 117085 }, { - "epoch": 4.12, - "learning_rate": 3.969333764136543e-06, - "loss": 0.2487, + "epoch": 4.219915666558546, + "grad_norm": 0.2576843202114105, + "learning_rate": 3.1356266306926175e-06, + "loss": 0.3755, "step": 117090 }, { - "epoch": 4.12, - "learning_rate": 3.967793646495821e-06, - "loss": 0.2425, + "epoch": 4.220095866219771, + "grad_norm": 0.24807590246200562, + "learning_rate": 3.1342118047011955e-06, + "loss": 0.3744, "step": 117095 }, { - "epoch": 4.12, - "learning_rate": 3.966253801944444e-06, - "loss": 0.2653, + "epoch": 4.220276065880996, + "grad_norm": 0.25404757261276245, + "learning_rate": 3.1327972766268977e-06, + "loss": 0.3697, "step": 117100 }, { - "epoch": 4.12, - "learning_rate": 3.9647142305024e-06, - "loss": 0.2509, + "epoch": 4.220456265542221, + "grad_norm": 0.22809305787086487, + "learning_rate": 3.1313830464890085e-06, + "loss": 0.3502, "step": 117105 }, { - "epoch": 4.12, - "learning_rate": 3.963174932189672e-06, - "loss": 0.241, + "epoch": 4.220636465203445, + "grad_norm": 0.2612314224243164, + "learning_rate": 3.1299691143067937e-06, + "loss": 0.3539, "step": 117110 }, { - "epoch": 4.12, - "learning_rate": 3.9616359070262625e-06, - "loss": 0.2357, + "epoch": 4.22081666486467, + "grad_norm": 0.3267393708229065, + "learning_rate": 3.1285554800995127e-06, + "loss": 0.389, "step": 117115 }, { - "epoch": 4.12, - "learning_rate": 3.9600971550321444e-06, - "loss": 0.2503, + "epoch": 4.220996864525895, + "grad_norm": 0.23879055678844452, + "learning_rate": 3.127142143886433e-06, + "loss": 0.4019, "step": 117120 }, { - "epoch": 4.12, - "learning_rate": 3.958558676227292e-06, - "loss": 0.2639, + "epoch": 4.22117706418712, + "grad_norm": 0.27509593963623047, + "learning_rate": 3.1257291056867976e-06, + "loss": 0.377, "step": 117125 }, { - "epoch": 4.12, - "learning_rate": 
3.9570204706316895e-06, - "loss": 0.2484, + "epoch": 4.221357263848344, + "grad_norm": 0.3244267702102661, + "learning_rate": 3.124316365519875e-06, + "loss": 0.3989, "step": 117130 }, { - "epoch": 4.12, - "learning_rate": 3.955482538265312e-06, - "loss": 0.2749, + "epoch": 4.221537463509568, + "grad_norm": 0.2525143623352051, + "learning_rate": 3.122903923404913e-06, + "loss": 0.3723, "step": 117135 }, { - "epoch": 4.12, - "learning_rate": 3.953944879148128e-06, - "loss": 0.2528, + "epoch": 4.221717663170793, + "grad_norm": 0.2540621757507324, + "learning_rate": 3.1214917793611355e-06, + "loss": 0.3725, "step": 117140 }, { - "epoch": 4.12, - "learning_rate": 3.9524074933000974e-06, - "loss": 0.2484, + "epoch": 4.221897862832018, + "grad_norm": 0.2635304033756256, + "learning_rate": 3.1200799334078044e-06, + "loss": 0.346, "step": 117145 }, { - "epoch": 4.12, - "learning_rate": 3.950870380741181e-06, - "loss": 0.245, + "epoch": 4.222078062493242, + "grad_norm": 0.24621783196926117, + "learning_rate": 3.118668385564144e-06, + "loss": 0.3364, "step": 117150 }, { - "epoch": 4.12, - "learning_rate": 3.949333541491346e-06, - "loss": 0.2538, + "epoch": 4.222258262154467, + "grad_norm": 0.2203269898891449, + "learning_rate": 3.117257135849394e-06, + "loss": 0.3385, "step": 117155 }, { - "epoch": 4.12, - "learning_rate": 3.947796975570542e-06, - "loss": 0.258, + "epoch": 4.222438461815692, + "grad_norm": 0.2397884875535965, + "learning_rate": 3.1158461842827833e-06, + "loss": 0.3664, "step": 117160 }, { - "epoch": 4.12, - "learning_rate": 3.946260682998717e-06, - "loss": 0.2483, + "epoch": 4.222618661476917, + "grad_norm": 0.2857881188392639, + "learning_rate": 3.114435530883522e-06, + "loss": 0.3521, "step": 117165 }, { - "epoch": 4.12, - "learning_rate": 3.94472466379582e-06, - "loss": 0.2473, + "epoch": 4.222798861138141, + "grad_norm": 0.25649380683898926, + "learning_rate": 3.113025175670847e-06, + "loss": 0.3339, "step": 117170 }, { - "epoch": 4.12, - "learning_rate": 3.943188917981802e-06, - "loss": 0.2507, + "epoch": 4.222979060799366, + "grad_norm": 0.30346161127090454, + "learning_rate": 3.111615118663963e-06, + "loss": 0.3665, "step": 117175 }, { - "epoch": 4.12, - "learning_rate": 3.941653445576596e-06, - "loss": 0.2639, + "epoch": 4.223159260460591, + "grad_norm": 0.29739436507225037, + "learning_rate": 3.1102053598820874e-06, + "loss": 0.3754, "step": 117180 }, { - "epoch": 4.12, - "learning_rate": 3.940118246600144e-06, - "loss": 0.2472, + "epoch": 4.223339460121815, + "grad_norm": 0.23481184244155884, + "learning_rate": 3.108795899344424e-06, + "loss": 0.3601, "step": 117185 }, { - "epoch": 4.12, - "learning_rate": 3.938583321072373e-06, - "loss": 0.2471, + "epoch": 4.223519659783039, + "grad_norm": 0.2552500069141388, + "learning_rate": 3.107386737070175e-06, + "loss": 0.4017, "step": 117190 }, { - "epoch": 4.12, - "learning_rate": 3.937048669013224e-06, - "loss": 0.247, + "epoch": 4.223699859444264, + "grad_norm": 0.29149770736694336, + "learning_rate": 3.105977873078547e-06, + "loss": 0.3966, "step": 117195 }, { - "epoch": 4.12, - "learning_rate": 3.935514290442616e-06, - "loss": 0.2579, + "epoch": 4.223880059105489, + "grad_norm": 0.28181055188179016, + "learning_rate": 3.1045693073887327e-06, + "loss": 0.4144, "step": 117200 }, { - "epoch": 4.12, - "learning_rate": 3.933980185380473e-06, - "loss": 0.261, + "epoch": 4.2240602587667135, + "grad_norm": 0.25641298294067383, + "learning_rate": 3.1031610400199225e-06, + "loss": 0.3707, "step": 117205 }, { - "epoch": 4.12, - 
"learning_rate": 3.932446353846708e-06, - "loss": 0.2598, + "epoch": 4.224240458427938, + "grad_norm": 0.20379073917865753, + "learning_rate": 3.1017530709913033e-06, + "loss": 0.3584, "step": 117210 }, { - "epoch": 4.12, - "learning_rate": 3.930912795861249e-06, - "loss": 0.262, + "epoch": 4.224420658089163, + "grad_norm": 0.2511664927005768, + "learning_rate": 3.1003454003220577e-06, + "loss": 0.374, "step": 117215 }, { - "epoch": 4.12, - "learning_rate": 3.929379511444006e-06, - "loss": 0.2617, + "epoch": 4.224600857750388, + "grad_norm": 0.21009521186351776, + "learning_rate": 3.098938028031362e-06, + "loss": 0.3763, "step": 117220 }, { - "epoch": 4.12, - "learning_rate": 3.9278465006148735e-06, - "loss": 0.2531, + "epoch": 4.2247810574116125, + "grad_norm": 0.21885792911052704, + "learning_rate": 3.0975309541384e-06, + "loss": 0.3636, "step": 117225 }, { - "epoch": 4.12, - "learning_rate": 3.926313763393774e-06, - "loss": 0.2529, + "epoch": 4.224961257072836, + "grad_norm": 0.2734248638153076, + "learning_rate": 3.096124178662338e-06, + "loss": 0.3794, "step": 117230 }, { - "epoch": 4.12, - "learning_rate": 3.924781299800598e-06, - "loss": 0.2453, + "epoch": 4.225141456734061, + "grad_norm": 0.24762408435344696, + "learning_rate": 3.094717701622346e-06, + "loss": 0.3325, "step": 117235 }, { - "epoch": 4.12, - "learning_rate": 3.9232491098552535e-06, - "loss": 0.2564, + "epoch": 4.225321656395286, + "grad_norm": 0.3509252071380615, + "learning_rate": 3.0933115230375812e-06, + "loss": 0.3739, "step": 117240 }, { - "epoch": 4.13, - "learning_rate": 3.921717193577621e-06, - "loss": 0.2345, + "epoch": 4.2255018560565105, + "grad_norm": 0.23616372048854828, + "learning_rate": 3.0919056429272037e-06, + "loss": 0.3129, "step": 117245 }, { - "epoch": 4.13, - "learning_rate": 3.920185550987607e-06, - "loss": 0.2375, + "epoch": 4.225682055717735, + "grad_norm": 0.33164361119270325, + "learning_rate": 3.090500061310378e-06, + "loss": 0.3645, "step": 117250 }, { - "epoch": 4.13, - "learning_rate": 3.918654182105092e-06, - "loss": 0.2377, + "epoch": 4.22586225537896, + "grad_norm": 0.2735852003097534, + "learning_rate": 3.0890947782062416e-06, + "loss": 0.3664, "step": 117255 }, { - "epoch": 4.13, - "learning_rate": 3.917123086949959e-06, - "loss": 0.2494, + "epoch": 4.226042455040185, + "grad_norm": 0.22918978333473206, + "learning_rate": 3.087689793633944e-06, + "loss": 0.3726, "step": 117260 }, { - "epoch": 4.13, - "learning_rate": 3.915592265542081e-06, - "loss": 0.2634, + "epoch": 4.2262226547014095, + "grad_norm": 0.2797573506832123, + "learning_rate": 3.0862851076126325e-06, + "loss": 0.3745, "step": 117265 }, { - "epoch": 4.13, - "learning_rate": 3.914061717901352e-06, - "loss": 0.2385, + "epoch": 4.226402854362634, + "grad_norm": 0.26490533351898193, + "learning_rate": 3.084880720161448e-06, + "loss": 0.3739, "step": 117270 }, { - "epoch": 4.13, - "learning_rate": 3.9125314440476316e-06, - "loss": 0.2662, + "epoch": 4.226583054023858, + "grad_norm": 0.2413850724697113, + "learning_rate": 3.0834766312995174e-06, + "loss": 0.3652, "step": 117275 }, { - "epoch": 4.13, - "learning_rate": 3.911001444000795e-06, - "loss": 0.2587, + "epoch": 4.226763253685083, + "grad_norm": 0.2586573660373688, + "learning_rate": 3.082072841045977e-06, + "loss": 0.3765, "step": 117280 }, { - "epoch": 4.13, - "learning_rate": 3.9094717177806995e-06, - "loss": 0.2776, + "epoch": 4.2269434533463075, + "grad_norm": 0.2556487023830414, + "learning_rate": 3.080669349419943e-06, + "loss": 0.4075, "step": 117285 }, { - 
"epoch": 4.13, - "learning_rate": 3.907942265407216e-06, - "loss": 0.2637, + "epoch": 4.227123653007532, + "grad_norm": 0.2673538029193878, + "learning_rate": 3.0792661564405524e-06, + "loss": 0.4263, "step": 117290 }, { - "epoch": 4.13, - "learning_rate": 3.906413086900207e-06, - "loss": 0.2112, + "epoch": 4.227303852668757, + "grad_norm": 0.2932944893836975, + "learning_rate": 3.077863262126915e-06, + "loss": 0.3598, "step": 117295 }, { - "epoch": 4.13, - "learning_rate": 3.904884182279523e-06, - "loss": 0.259, + "epoch": 4.227484052329982, + "grad_norm": 0.22899527847766876, + "learning_rate": 3.076460666498146e-06, + "loss": 0.3761, "step": 117300 }, { - "epoch": 4.13, - "learning_rate": 3.903355551565011e-06, - "loss": 0.2503, + "epoch": 4.227664251991206, + "grad_norm": 0.26272207498550415, + "learning_rate": 3.075058369573358e-06, + "loss": 0.3494, "step": 117305 }, { - "epoch": 4.13, - "learning_rate": 3.9018271947765274e-06, - "loss": 0.2463, + "epoch": 4.227844451652431, + "grad_norm": 0.28200453519821167, + "learning_rate": 3.07365637137165e-06, + "loss": 0.3753, "step": 117310 }, { - "epoch": 4.13, - "learning_rate": 3.900299111933917e-06, - "loss": 0.2535, + "epoch": 4.228024651313656, + "grad_norm": 0.2724056839942932, + "learning_rate": 3.072254671912131e-06, + "loss": 0.3659, "step": 117315 }, { - "epoch": 4.13, - "learning_rate": 3.898771303057014e-06, - "loss": 0.2524, + "epoch": 4.22820485097488, + "grad_norm": 0.19723065197467804, + "learning_rate": 3.0708532712139e-06, + "loss": 0.3401, "step": 117320 }, { - "epoch": 4.13, - "learning_rate": 3.8972437681656535e-06, - "loss": 0.258, + "epoch": 4.2283850506361045, + "grad_norm": 0.23599091172218323, + "learning_rate": 3.0694521692960444e-06, + "loss": 0.4004, "step": 117325 }, { - "epoch": 4.13, - "learning_rate": 3.895716507279684e-06, - "loss": 0.2496, + "epoch": 4.228565250297329, + "grad_norm": 0.2825251817703247, + "learning_rate": 3.06805136617766e-06, + "loss": 0.3491, "step": 117330 }, { - "epoch": 4.13, - "learning_rate": 3.8941895204189256e-06, - "loss": 0.2401, + "epoch": 4.228745449958554, + "grad_norm": 0.2366117537021637, + "learning_rate": 3.066650861877823e-06, + "loss": 0.3718, "step": 117335 }, { - "epoch": 4.13, - "learning_rate": 3.8926628076032e-06, - "loss": 0.2656, + "epoch": 4.228925649619779, + "grad_norm": 0.252418577671051, + "learning_rate": 3.0652506564156275e-06, + "loss": 0.4073, "step": 117340 }, { - "epoch": 4.13, - "learning_rate": 3.891136368852341e-06, - "loss": 0.2565, + "epoch": 4.229105849281003, + "grad_norm": 0.25921630859375, + "learning_rate": 3.063850749810149e-06, + "loss": 0.4102, "step": 117345 }, { - "epoch": 4.13, - "learning_rate": 3.8896102041861695e-06, - "loss": 0.2567, + "epoch": 4.229286048942228, + "grad_norm": 0.3047523498535156, + "learning_rate": 3.062451142080447e-06, + "loss": 0.3875, "step": 117350 }, { - "epoch": 4.13, - "learning_rate": 3.888084313624496e-06, - "loss": 0.2358, + "epoch": 4.229466248603453, + "grad_norm": 0.2350996434688568, + "learning_rate": 3.061051833245607e-06, + "loss": 0.3618, "step": 117355 }, { - "epoch": 4.13, - "learning_rate": 3.886558697187129e-06, - "loss": 0.2781, + "epoch": 4.229646448264678, + "grad_norm": 0.19591708481311798, + "learning_rate": 3.059652823324688e-06, + "loss": 0.3718, "step": 117360 }, { - "epoch": 4.13, - "learning_rate": 3.885033354893888e-06, - "loss": 0.2804, + "epoch": 4.229826647925902, + "grad_norm": 0.26747336983680725, + "learning_rate": 3.058254112336753e-06, + "loss": 0.378, "step": 117365 }, { - 
"epoch": 4.13, - "learning_rate": 3.883508286764576e-06, - "loss": 0.257, + "epoch": 4.230006847587126, + "grad_norm": 0.25700604915618896, + "learning_rate": 3.0568557003008535e-06, + "loss": 0.3677, "step": 117370 }, { - "epoch": 4.13, - "learning_rate": 3.881983492818991e-06, - "loss": 0.2531, + "epoch": 4.230187047248351, + "grad_norm": 0.21863490343093872, + "learning_rate": 3.0554575872360463e-06, + "loss": 0.4036, "step": 117375 }, { - "epoch": 4.13, - "learning_rate": 3.880458973076928e-06, - "loss": 0.2463, + "epoch": 4.230367246909576, + "grad_norm": 0.2407938539981842, + "learning_rate": 3.054059773161383e-06, + "loss": 0.3465, "step": 117380 }, { - "epoch": 4.13, - "learning_rate": 3.878934727558192e-06, - "loss": 0.2458, + "epoch": 4.2305474465708, + "grad_norm": 0.27205365896224976, + "learning_rate": 3.0526622580959063e-06, + "loss": 0.3897, "step": 117385 }, { - "epoch": 4.13, - "learning_rate": 3.877410756282571e-06, - "loss": 0.2487, + "epoch": 4.230727646232025, + "grad_norm": 0.21772317588329315, + "learning_rate": 3.0512650420586595e-06, + "loss": 0.3788, "step": 117390 }, { - "epoch": 4.13, - "learning_rate": 3.875887059269842e-06, - "loss": 0.2371, + "epoch": 4.23090784589325, + "grad_norm": 0.24121488630771637, + "learning_rate": 3.049868125068675e-06, + "loss": 0.3711, "step": 117395 }, { - "epoch": 4.13, - "learning_rate": 3.874363636539808e-06, - "loss": 0.2452, + "epoch": 4.231088045554475, + "grad_norm": 0.22327007353305817, + "learning_rate": 3.04847150714499e-06, + "loss": 0.3705, "step": 117400 }, { - "epoch": 4.13, - "learning_rate": 3.872840488112231e-06, - "loss": 0.2494, + "epoch": 4.231268245215699, + "grad_norm": 0.21851101517677307, + "learning_rate": 3.0470751883066223e-06, + "loss": 0.3382, "step": 117405 }, { - "epoch": 4.13, - "learning_rate": 3.871317614006903e-06, - "loss": 0.2555, + "epoch": 4.231448444876923, + "grad_norm": 0.3136238753795624, + "learning_rate": 3.0456791685726126e-06, + "loss": 0.3754, "step": 117410 }, { - "epoch": 4.13, - "learning_rate": 3.869795014243591e-06, - "loss": 0.2455, + "epoch": 4.231628644538148, + "grad_norm": 0.24977144598960876, + "learning_rate": 3.0442834479619707e-06, + "loss": 0.3548, "step": 117415 }, { - "epoch": 4.13, - "learning_rate": 3.868272688842062e-06, - "loss": 0.2585, + "epoch": 4.231808844199373, + "grad_norm": 0.24291959404945374, + "learning_rate": 3.0428880264937204e-06, + "loss": 0.3734, "step": 117420 }, { - "epoch": 4.13, - "learning_rate": 3.866750637822089e-06, - "loss": 0.2314, + "epoch": 4.231989043860597, + "grad_norm": 0.2255113422870636, + "learning_rate": 3.0414929041868657e-06, + "loss": 0.3476, "step": 117425 }, { - "epoch": 4.13, - "learning_rate": 3.865228861203435e-06, - "loss": 0.2348, + "epoch": 4.232169243521822, + "grad_norm": 0.5030208230018616, + "learning_rate": 3.0400980810604164e-06, + "loss": 0.3713, "step": 117430 }, { - "epoch": 4.13, - "learning_rate": 3.863707359005852e-06, - "loss": 0.2387, + "epoch": 4.232349443183047, + "grad_norm": 0.22084102034568787, + "learning_rate": 3.0387035571333906e-06, + "loss": 0.365, "step": 117435 }, { - "epoch": 4.13, - "learning_rate": 3.862186131249096e-06, - "loss": 0.2661, + "epoch": 4.232529642844272, + "grad_norm": 0.20808632671833038, + "learning_rate": 3.0373093324247676e-06, + "loss": 0.3707, "step": 117440 }, { - "epoch": 4.13, - "learning_rate": 3.860665177952927e-06, - "loss": 0.2644, + "epoch": 4.232709842505496, + "grad_norm": 0.2130783647298813, + "learning_rate": 3.035915406953552e-06, + "loss": 0.3532, "step": 
117445 }, { - "epoch": 4.13, - "learning_rate": 3.859144499137085e-06, - "loss": 0.2466, + "epoch": 4.232890042166721, + "grad_norm": 0.205308198928833, + "learning_rate": 3.0345217807387387e-06, + "loss": 0.35, "step": 117450 }, { - "epoch": 4.13, - "learning_rate": 3.857624094821324e-06, - "loss": 0.2712, + "epoch": 4.233070241827946, + "grad_norm": 0.2538002133369446, + "learning_rate": 3.033128453799308e-06, + "loss": 0.3864, "step": 117455 }, { - "epoch": 4.13, - "learning_rate": 3.856103965025376e-06, - "loss": 0.2314, + "epoch": 4.23325044148917, + "grad_norm": 0.27659669518470764, + "learning_rate": 3.0317354261542613e-06, + "loss": 0.3974, "step": 117460 }, { - "epoch": 4.13, - "learning_rate": 3.854584109768991e-06, - "loss": 0.2546, + "epoch": 4.233430641150394, + "grad_norm": 0.2688102722167969, + "learning_rate": 3.0303426978225607e-06, + "loss": 0.3609, "step": 117465 }, { - "epoch": 4.13, - "learning_rate": 3.853064529071895e-06, - "loss": 0.2517, + "epoch": 4.233610840811619, + "grad_norm": 0.25696924328804016, + "learning_rate": 3.028950268823183e-06, + "loss": 0.3414, "step": 117470 }, { - "epoch": 4.13, - "learning_rate": 3.851545222953812e-06, - "loss": 0.2374, + "epoch": 4.233791040472844, + "grad_norm": 0.19637776911258698, + "learning_rate": 3.0275581391751075e-06, + "loss": 0.3418, "step": 117475 }, { - "epoch": 4.13, - "learning_rate": 3.850026191434483e-06, - "loss": 0.2398, + "epoch": 4.233971240134069, + "grad_norm": 0.25194182991981506, + "learning_rate": 3.026166308897299e-06, + "loss": 0.3505, "step": 117480 }, { - "epoch": 4.13, - "learning_rate": 3.848507434533627e-06, - "loss": 0.2402, + "epoch": 4.234151439795293, + "grad_norm": 0.3067736029624939, + "learning_rate": 3.024774778008721e-06, + "loss": 0.3806, "step": 117485 }, { - "epoch": 4.13, - "learning_rate": 3.846988952270964e-06, - "loss": 0.2551, + "epoch": 4.234331639456518, + "grad_norm": 0.2704439163208008, + "learning_rate": 3.023383546528333e-06, + "loss": 0.4056, "step": 117490 }, { - "epoch": 4.13, - "learning_rate": 3.8454707446661995e-06, - "loss": 0.2419, + "epoch": 4.234511839117743, + "grad_norm": 0.23782427608966827, + "learning_rate": 3.021992614475083e-06, + "loss": 0.3591, "step": 117495 }, { - "epoch": 4.13, - "learning_rate": 3.8439528117390625e-06, - "loss": 0.2405, + "epoch": 4.2346920387789675, + "grad_norm": 0.21595478057861328, + "learning_rate": 3.020601981867932e-06, + "loss": 0.3604, "step": 117500 }, { - "epoch": 4.13, - "eval_loss": 0.2497539073228836, - "eval_runtime": 10.5514, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 4.2346920387789675, + "eval_loss": 0.42911702394485474, + "eval_runtime": 3.5344, + "eval_samples_per_second": 28.293, + "eval_steps_per_second": 7.073, "step": 117500 }, { - "epoch": 4.13, - "learning_rate": 3.842435153509252e-06, - "loss": 0.2553, + "epoch": 4.234872238440191, + "grad_norm": 0.2689270079135895, + "learning_rate": 3.0192116487258252e-06, + "loss": 0.378, "step": 117505 }, { - "epoch": 4.13, - "learning_rate": 3.840917769996483e-06, - "loss": 0.231, + "epoch": 4.235052438101416, + "grad_norm": 0.24561098217964172, + "learning_rate": 3.017821615067706e-06, + "loss": 0.349, "step": 117510 }, { - "epoch": 4.13, - "learning_rate": 3.839400661220446e-06, - "loss": 0.2546, + "epoch": 4.235232637762641, + "grad_norm": 0.2601664364337921, + "learning_rate": 3.01643188091251e-06, + "loss": 0.3717, "step": 117515 }, { - "epoch": 4.13, - "learning_rate": 3.837883827200853e-06, - "loss": 0.2302, + "epoch": 
4.2354128374238655, + "grad_norm": 0.21555201709270477, + "learning_rate": 3.0150424462791706e-06, + "loss": 0.404, "step": 117520 }, { - "epoch": 4.13, - "learning_rate": 3.836367267957391e-06, - "loss": 0.2676, + "epoch": 4.23559303708509, + "grad_norm": 0.30406418442726135, + "learning_rate": 3.0136533111866206e-06, + "loss": 0.3917, "step": 117525 }, { - "epoch": 4.14, - "learning_rate": 3.834850983509752e-06, - "loss": 0.2365, + "epoch": 4.235773236746315, + "grad_norm": 0.21674159169197083, + "learning_rate": 3.012264475653795e-06, + "loss": 0.365, "step": 117530 }, { - "epoch": 4.14, - "learning_rate": 3.833334973877623e-06, - "loss": 0.26, + "epoch": 4.23595343640754, + "grad_norm": 0.27085331082344055, + "learning_rate": 3.0108759396996004e-06, + "loss": 0.3447, "step": 117535 }, { - "epoch": 4.14, - "learning_rate": 3.831819239080692e-06, - "loss": 0.2557, + "epoch": 4.2361336360687645, + "grad_norm": 0.20267651975154877, + "learning_rate": 3.0094877033429667e-06, + "loss": 0.3808, "step": 117540 }, { - "epoch": 4.14, - "learning_rate": 3.830303779138639e-06, - "loss": 0.2491, + "epoch": 4.236313835729989, + "grad_norm": 0.26923617720603943, + "learning_rate": 3.0080997666028095e-06, + "loss": 0.3694, "step": 117545 }, { - "epoch": 4.14, - "learning_rate": 3.828788594071137e-06, - "loss": 0.2355, + "epoch": 4.236494035391213, + "grad_norm": 0.21540358662605286, + "learning_rate": 3.0067121294980323e-06, + "loss": 0.3543, "step": 117550 }, { - "epoch": 4.14, - "learning_rate": 3.827273683897861e-06, - "loss": 0.2416, + "epoch": 4.236674235052438, + "grad_norm": 0.23205356299877167, + "learning_rate": 3.0053247920475452e-06, + "loss": 0.4063, "step": 117555 }, { - "epoch": 4.14, - "learning_rate": 3.825759048638486e-06, - "loss": 0.2717, + "epoch": 4.2368544347136625, + "grad_norm": 0.20041492581367493, + "learning_rate": 3.0039377542702417e-06, + "loss": 0.3609, "step": 117560 }, { - "epoch": 4.14, - "learning_rate": 3.824244688312673e-06, - "loss": 0.2813, + "epoch": 4.237034634374887, + "grad_norm": 0.3079770803451538, + "learning_rate": 3.0025510161850373e-06, + "loss": 0.3538, "step": 117565 }, { - "epoch": 4.14, - "learning_rate": 3.8227306029400834e-06, - "loss": 0.2594, + "epoch": 4.237214834036112, + "grad_norm": 0.28639087080955505, + "learning_rate": 3.0011645778108137e-06, + "loss": 0.3659, "step": 117570 }, { - "epoch": 4.14, - "learning_rate": 3.821216792540388e-06, - "loss": 0.2486, + "epoch": 4.237395033697337, + "grad_norm": 0.2304767519235611, + "learning_rate": 2.9997784391664647e-06, + "loss": 0.4097, "step": 117575 }, { - "epoch": 4.14, - "learning_rate": 3.819703257133236e-06, - "loss": 0.251, + "epoch": 4.2375752333585615, + "grad_norm": 0.2690269947052002, + "learning_rate": 2.9983926002708746e-06, + "loss": 0.3809, "step": 117580 }, { - "epoch": 4.14, - "learning_rate": 3.818189996738278e-06, - "loss": 0.2698, + "epoch": 4.237755433019786, + "grad_norm": 0.26503047347068787, + "learning_rate": 2.9970070611429257e-06, + "loss": 0.3656, "step": 117585 }, { - "epoch": 4.14, - "learning_rate": 3.8166770113751585e-06, - "loss": 0.2715, + "epoch": 4.237935632681011, + "grad_norm": 0.1937418282032013, + "learning_rate": 2.9956218218014893e-06, + "loss": 0.3704, "step": 117590 }, { - "epoch": 4.14, - "learning_rate": 3.815164301063531e-06, - "loss": 0.2503, + "epoch": 4.238115832342235, + "grad_norm": 0.24524284899234772, + "learning_rate": 2.9942368822654527e-06, + "loss": 0.4087, "step": 117595 }, { - "epoch": 4.14, - "learning_rate": 3.8136518658230367e-06, - 
"loss": 0.2364, + "epoch": 4.2382960320034595, + "grad_norm": 0.27891620993614197, + "learning_rate": 2.9928522425536757e-06, + "loss": 0.3533, "step": 117600 }, { - "epoch": 4.14, - "learning_rate": 3.8121397056733093e-06, - "loss": 0.2451, + "epoch": 4.238476231664684, + "grad_norm": 0.2644476890563965, + "learning_rate": 2.991467902685027e-06, + "loss": 0.3914, "step": 117605 }, { - "epoch": 4.14, - "learning_rate": 3.81062782063398e-06, - "loss": 0.2568, + "epoch": 4.238656431325909, + "grad_norm": 0.2886243164539337, + "learning_rate": 2.9900838626783693e-06, + "loss": 0.3735, "step": 117610 }, { - "epoch": 4.14, - "learning_rate": 3.809116210724689e-06, - "loss": 0.2539, + "epoch": 4.238836630987134, + "grad_norm": 0.2373829036951065, + "learning_rate": 2.9887001225525514e-06, + "loss": 0.3564, "step": 117615 }, { - "epoch": 4.14, - "learning_rate": 3.807604875965051e-06, - "loss": 0.2518, + "epoch": 4.239016830648358, + "grad_norm": 0.22608888149261475, + "learning_rate": 2.987316682326438e-06, + "loss": 0.3711, "step": 117620 }, { - "epoch": 4.14, - "learning_rate": 3.806093816374706e-06, - "loss": 0.2304, + "epoch": 4.239197030309583, + "grad_norm": 0.24686965346336365, + "learning_rate": 2.985933542018876e-06, + "loss": 0.3849, "step": 117625 }, { - "epoch": 4.14, - "learning_rate": 3.8045830319732577e-06, - "loss": 0.2379, + "epoch": 4.239377229970808, + "grad_norm": 0.22954131662845612, + "learning_rate": 2.9845507016487e-06, + "loss": 0.3794, "step": 117630 }, { - "epoch": 4.14, - "learning_rate": 3.8030725227803377e-06, - "loss": 0.2314, + "epoch": 4.239557429632033, + "grad_norm": 0.2016417235136032, + "learning_rate": 2.9831681612347616e-06, + "loss": 0.3694, "step": 117635 }, { - "epoch": 4.14, - "learning_rate": 3.8015622888155523e-06, - "loss": 0.2765, + "epoch": 4.239737629293257, + "grad_norm": 0.21057425439357758, + "learning_rate": 2.9817859207958905e-06, + "loss": 0.3543, "step": 117640 }, { - "epoch": 4.14, - "learning_rate": 3.800052330098508e-06, - "loss": 0.2572, + "epoch": 4.239917828954481, + "grad_norm": 0.23975323140621185, + "learning_rate": 2.980403980350932e-06, + "loss": 0.3292, "step": 117645 }, { - "epoch": 4.14, - "learning_rate": 3.798542646648809e-06, - "loss": 0.2456, + "epoch": 4.240098028615706, + "grad_norm": 0.23370075225830078, + "learning_rate": 2.979022339918702e-06, + "loss": 0.3572, "step": 117650 }, { - "epoch": 4.14, - "learning_rate": 3.7970332384860642e-06, - "loss": 0.2484, + "epoch": 4.240278228276931, + "grad_norm": 0.22406595945358276, + "learning_rate": 2.977640999518025e-06, + "loss": 0.3527, "step": 117655 }, { - "epoch": 4.14, - "learning_rate": 3.795524105629872e-06, - "loss": 0.2325, + "epoch": 4.240458427938155, + "grad_norm": 0.27608826756477356, + "learning_rate": 2.976259959167732e-06, + "loss": 0.3598, "step": 117660 }, { - "epoch": 4.14, - "learning_rate": 3.794015248099825e-06, - "loss": 0.249, + "epoch": 4.24063862759938, + "grad_norm": 0.286156564950943, + "learning_rate": 2.9748792188866308e-06, + "loss": 0.3439, "step": 117665 }, { - "epoch": 4.14, - "learning_rate": 3.79250666591551e-06, - "loss": 0.2444, + "epoch": 4.240818827260605, + "grad_norm": 0.3072472810745239, + "learning_rate": 2.973498778693537e-06, + "loss": 0.3777, "step": 117670 }, { - "epoch": 4.14, - "learning_rate": 3.7909983590965176e-06, - "loss": 0.2437, + "epoch": 4.24099902692183, + "grad_norm": 0.2714073956012726, + "learning_rate": 2.9721186386072573e-06, + "loss": 0.3941, "step": 117675 }, { - "epoch": 4.14, - "learning_rate": 
3.78949032766244e-06, - "loss": 0.2645, + "epoch": 4.241179226583054, + "grad_norm": 0.22121752798557281, + "learning_rate": 2.970738798646591e-06, + "loss": 0.354, "step": 117680 }, { - "epoch": 4.14, - "learning_rate": 3.7879825716328475e-06, - "loss": 0.2362, + "epoch": 4.241359426244279, + "grad_norm": 0.2743302285671234, + "learning_rate": 2.9693592588303482e-06, + "loss": 0.3862, "step": 117685 }, { - "epoch": 4.14, - "learning_rate": 3.786475091027328e-06, - "loss": 0.2244, + "epoch": 4.241539625905503, + "grad_norm": 0.2942706048488617, + "learning_rate": 2.9679800191773215e-06, + "loss": 0.388, "step": 117690 }, { - "epoch": 4.14, - "learning_rate": 3.784967885865451e-06, - "loss": 0.2653, + "epoch": 4.241719825566728, + "grad_norm": 0.26952454447746277, + "learning_rate": 2.966601079706302e-06, + "loss": 0.381, "step": 117695 }, { - "epoch": 4.14, - "learning_rate": 3.7834609561667817e-06, - "loss": 0.2634, + "epoch": 4.241900025227952, + "grad_norm": 0.17778491973876953, + "learning_rate": 2.9652224404360747e-06, + "loss": 0.3192, "step": 117700 }, { - "epoch": 4.14, - "learning_rate": 3.7819543019508852e-06, - "loss": 0.2359, + "epoch": 4.242080224889177, + "grad_norm": 0.21804936230182648, + "learning_rate": 2.9638441013854268e-06, + "loss": 0.3632, "step": 117705 }, { - "epoch": 4.14, - "learning_rate": 3.7804479232373345e-06, - "loss": 0.2654, + "epoch": 4.242260424550402, + "grad_norm": 0.25280699133872986, + "learning_rate": 2.9624660625731325e-06, + "loss": 0.3887, "step": 117710 }, { - "epoch": 4.14, - "learning_rate": 3.778941820045681e-06, - "loss": 0.2537, + "epoch": 4.242440624211627, + "grad_norm": 0.2739708721637726, + "learning_rate": 2.96108832401798e-06, + "loss": 0.3653, "step": 117715 }, { - "epoch": 4.14, - "learning_rate": 3.7774359923954866e-06, - "loss": 0.2491, + "epoch": 4.242620823872851, + "grad_norm": 0.19621554017066956, + "learning_rate": 2.959710885738723e-06, + "loss": 0.3652, "step": 117720 }, { - "epoch": 4.14, - "learning_rate": 3.7759304403062914e-06, - "loss": 0.2456, + "epoch": 4.242801023534076, + "grad_norm": 0.25483089685440063, + "learning_rate": 2.9583337477541437e-06, + "loss": 0.3788, "step": 117725 }, { - "epoch": 4.14, - "learning_rate": 3.774425163797654e-06, - "loss": 0.2637, + "epoch": 4.242981223195301, + "grad_norm": 0.25373703241348267, + "learning_rate": 2.9569569100830003e-06, + "loss": 0.3526, "step": 117730 }, { - "epoch": 4.14, - "learning_rate": 3.7729201628891207e-06, - "loss": 0.2687, + "epoch": 4.243161422856525, + "grad_norm": 0.29680660367012024, + "learning_rate": 2.9555803727440436e-06, + "loss": 0.4243, "step": 117735 }, { - "epoch": 4.14, - "learning_rate": 3.7714154376002313e-06, - "loss": 0.2358, + "epoch": 4.243341622517749, + "grad_norm": 0.24780967831611633, + "learning_rate": 2.95420413575605e-06, + "loss": 0.3667, "step": 117740 }, { - "epoch": 4.14, - "learning_rate": 3.7699109879505173e-06, - "loss": 0.268, + "epoch": 4.243521822178974, + "grad_norm": 0.22331710159778595, + "learning_rate": 2.9528281991377443e-06, + "loss": 0.3457, "step": 117745 }, { - "epoch": 4.14, - "learning_rate": 3.7684068139595243e-06, - "loss": 0.2769, + "epoch": 4.243702021840199, + "grad_norm": 0.20010043680667877, + "learning_rate": 2.9514525629078915e-06, + "loss": 0.3773, "step": 117750 }, { - "epoch": 4.14, - "learning_rate": 3.766902915646775e-06, - "loss": 0.2521, + "epoch": 4.243882221501424, + "grad_norm": 0.27777811884880066, + "learning_rate": 2.9500772270852268e-06, + "loss": 0.3659, "step": 117755 }, { - "epoch": 
4.14, - "learning_rate": 3.7653992930317987e-06, - "loss": 0.2455, + "epoch": 4.244062421162648, + "grad_norm": 0.25709882378578186, + "learning_rate": 2.948702191688493e-06, + "loss": 0.385, "step": 117760 }, { - "epoch": 4.14, - "learning_rate": 3.7638959461341133e-06, - "loss": 0.2522, + "epoch": 4.244242620823873, + "grad_norm": 0.3200331926345825, + "learning_rate": 2.9473274567364234e-06, + "loss": 0.3751, "step": 117765 }, { - "epoch": 4.14, - "learning_rate": 3.76239287497325e-06, - "loss": 0.2544, + "epoch": 4.244422820485098, + "grad_norm": 0.1971009075641632, + "learning_rate": 2.9459530222477464e-06, + "loss": 0.3772, "step": 117770 }, { - "epoch": 4.14, - "learning_rate": 3.7608900795687186e-06, - "loss": 0.2571, + "epoch": 4.2446030201463225, + "grad_norm": 0.23797380924224854, + "learning_rate": 2.944578888241184e-06, + "loss": 0.3566, "step": 117775 }, { - "epoch": 4.14, - "learning_rate": 3.759387559940025e-06, - "loss": 0.2311, + "epoch": 4.244783219807546, + "grad_norm": 0.2632618844509125, + "learning_rate": 2.9432050547354706e-06, + "loss": 0.3802, "step": 117780 }, { - "epoch": 4.14, - "learning_rate": 3.7578853161066878e-06, - "loss": 0.2418, + "epoch": 4.244963419468771, + "grad_norm": 0.30606886744499207, + "learning_rate": 2.9418315217493165e-06, + "loss": 0.3788, "step": 117785 }, { - "epoch": 4.14, - "learning_rate": 3.756383348088216e-06, - "loss": 0.2437, + "epoch": 4.245143619129996, + "grad_norm": 0.25161272287368774, + "learning_rate": 2.94045828930144e-06, + "loss": 0.3892, "step": 117790 }, { - "epoch": 4.14, - "learning_rate": 3.754881655904105e-06, - "loss": 0.2508, + "epoch": 4.245323818791221, + "grad_norm": 0.24037761986255646, + "learning_rate": 2.9390853574105455e-06, + "loss": 0.3661, "step": 117795 }, { - "epoch": 4.14, - "learning_rate": 3.7533802395738503e-06, - "loss": 0.2473, + "epoch": 4.245504018452445, + "grad_norm": 0.2599053680896759, + "learning_rate": 2.9377127260953374e-06, + "loss": 0.3867, "step": 117800 }, { - "epoch": 4.14, - "learning_rate": 3.7518790991169563e-06, - "loss": 0.2565, + "epoch": 4.24568421811367, + "grad_norm": 0.2276243269443512, + "learning_rate": 2.936340395374529e-06, + "loss": 0.4001, "step": 117805 }, { - "epoch": 4.14, - "learning_rate": 3.7503782345529094e-06, - "loss": 0.2237, + "epoch": 4.245864417774895, + "grad_norm": 0.3195003271102905, + "learning_rate": 2.934968365266816e-06, + "loss": 0.351, "step": 117810 }, { - "epoch": 4.15, - "learning_rate": 3.7488776459011972e-06, - "loss": 0.2534, + "epoch": 4.2460446174361195, + "grad_norm": 0.24188397824764252, + "learning_rate": 2.933596635790875e-06, + "loss": 0.3847, "step": 117815 }, { - "epoch": 4.15, - "learning_rate": 3.7473773331812982e-06, - "loss": 0.249, + "epoch": 4.246224817097344, + "grad_norm": 0.23399367928504944, + "learning_rate": 2.9322252069654137e-06, + "loss": 0.3921, "step": 117820 }, { - "epoch": 4.15, - "learning_rate": 3.745877296412706e-06, - "loss": 0.2373, + "epoch": 4.246405016758568, + "grad_norm": 0.25146761536598206, + "learning_rate": 2.930854078809106e-06, + "loss": 0.3672, "step": 117825 }, { - "epoch": 4.15, - "learning_rate": 3.7443775356148875e-06, - "loss": 0.2657, + "epoch": 4.246585216419793, + "grad_norm": 0.27147242426872253, + "learning_rate": 2.929483251340648e-06, + "loss": 0.4172, "step": 117830 }, { - "epoch": 4.15, - "learning_rate": 3.7428780508073134e-06, - "loss": 0.287, + "epoch": 4.2467654160810175, + "grad_norm": 0.21517179906368256, + "learning_rate": 2.9281127245787047e-06, + "loss": 0.3483, "step": 
117835 }, { - "epoch": 4.15, - "learning_rate": 3.741378842009466e-06, - "loss": 0.2629, + "epoch": 4.246945615742242, + "grad_norm": 0.1976982206106186, + "learning_rate": 2.9267424985419457e-06, + "loss": 0.3928, "step": 117840 }, { - "epoch": 4.15, - "learning_rate": 3.7398799092407956e-06, - "loss": 0.2411, + "epoch": 4.247125815403467, + "grad_norm": 0.2845209836959839, + "learning_rate": 2.925372573249052e-06, + "loss": 0.3635, "step": 117845 }, { - "epoch": 4.15, - "learning_rate": 3.738381252520781e-06, - "loss": 0.2443, + "epoch": 4.247306015064692, + "grad_norm": 0.31492358446121216, + "learning_rate": 2.924002948718685e-06, + "loss": 0.3828, "step": 117850 }, { - "epoch": 4.15, - "learning_rate": 3.736882871868877e-06, - "loss": 0.2538, + "epoch": 4.2474862147259165, + "grad_norm": 0.3145192265510559, + "learning_rate": 2.922633624969501e-06, + "loss": 0.3752, "step": 117855 }, { - "epoch": 4.15, - "learning_rate": 3.7353847673045255e-06, - "loss": 0.2542, + "epoch": 4.247666414387141, + "grad_norm": 0.26329880952835083, + "learning_rate": 2.9212646020201602e-06, + "loss": 0.3715, "step": 117860 }, { - "epoch": 4.15, - "learning_rate": 3.7338869388471975e-06, - "loss": 0.2587, + "epoch": 4.247846614048366, + "grad_norm": 0.22250141203403473, + "learning_rate": 2.9198958798893115e-06, + "loss": 0.403, "step": 117865 }, { - "epoch": 4.15, - "learning_rate": 3.732389386516333e-06, - "loss": 0.2654, + "epoch": 4.24802681370959, + "grad_norm": 0.25044476985931396, + "learning_rate": 2.9185274585956102e-06, + "loss": 0.368, "step": 117870 }, { - "epoch": 4.15, - "learning_rate": 3.7308921103313742e-06, - "loss": 0.253, + "epoch": 4.2482070133708145, + "grad_norm": 0.22099538147449493, + "learning_rate": 2.9171593381576962e-06, + "loss": 0.3449, "step": 117875 }, { - "epoch": 4.15, - "learning_rate": 3.729395110311759e-06, - "loss": 0.2443, + "epoch": 4.248387213032039, + "grad_norm": 0.26484134793281555, + "learning_rate": 2.915791518594213e-06, + "loss": 0.3742, "step": 117880 }, { - "epoch": 4.15, - "learning_rate": 3.7278983864769386e-06, - "loss": 0.2693, + "epoch": 4.248567412693264, + "grad_norm": 0.2437736839056015, + "learning_rate": 2.914423999923793e-06, + "loss": 0.3779, "step": 117885 }, { - "epoch": 4.15, - "learning_rate": 3.72640193884633e-06, - "loss": 0.2623, + "epoch": 4.248747612354489, + "grad_norm": 0.2136683613061905, + "learning_rate": 2.9130567821650717e-06, + "loss": 0.3602, "step": 117890 }, { - "epoch": 4.15, - "learning_rate": 3.7249057674393797e-06, - "loss": 0.251, + "epoch": 4.2489278120157135, + "grad_norm": 0.2902207374572754, + "learning_rate": 2.9116898653366697e-06, + "loss": 0.3767, "step": 117895 }, { - "epoch": 4.15, - "learning_rate": 3.7234098722755017e-06, - "loss": 0.2495, + "epoch": 4.249108011676938, + "grad_norm": 0.263248473405838, + "learning_rate": 2.9103232494572198e-06, + "loss": 0.3361, "step": 117900 }, { - "epoch": 4.15, - "learning_rate": 3.7219142533741288e-06, - "loss": 0.2352, + "epoch": 4.249288211338163, + "grad_norm": 0.31815701723098755, + "learning_rate": 2.9089569345453425e-06, + "loss": 0.3547, "step": 117905 }, { - "epoch": 4.15, - "learning_rate": 3.7204189107546777e-06, - "loss": 0.2399, + "epoch": 4.249468410999388, + "grad_norm": 0.27344661951065063, + "learning_rate": 2.907590920619649e-06, + "loss": 0.3585, "step": 117910 }, { - "epoch": 4.15, - "learning_rate": 3.718923844436559e-06, - "loss": 0.2328, + "epoch": 4.249648610660612, + "grad_norm": 0.25131598114967346, + "learning_rate": 2.9062252076987512e-06, + 
"loss": 0.3798, "step": 117915 }, { - "epoch": 4.15, - "learning_rate": 3.717429054439195e-06, - "loss": 0.247, + "epoch": 4.249828810321836, + "grad_norm": 0.24895480275154114, + "learning_rate": 2.9048597958012515e-06, + "loss": 0.3788, "step": 117920 }, { - "epoch": 4.15, - "learning_rate": 3.7159345407819874e-06, - "loss": 0.2389, + "epoch": 4.250009009983061, + "grad_norm": 0.2820626497268677, + "learning_rate": 2.9034946849457707e-06, + "loss": 0.3905, "step": 117925 }, { - "epoch": 4.15, - "learning_rate": 3.714440303484343e-06, - "loss": 0.2324, + "epoch": 4.250189209644286, + "grad_norm": 0.24140560626983643, + "learning_rate": 2.9021298751508856e-06, + "loss": 0.3902, "step": 117930 }, { - "epoch": 4.15, - "learning_rate": 3.712946342565657e-06, - "loss": 0.2551, + "epoch": 4.25036940930551, + "grad_norm": 0.19780555367469788, + "learning_rate": 2.9007653664352095e-06, + "loss": 0.3747, "step": 117935 }, { - "epoch": 4.15, - "learning_rate": 3.7114526580453416e-06, - "loss": 0.245, + "epoch": 4.250549608966735, + "grad_norm": 0.23243926465511322, + "learning_rate": 2.899401158817325e-06, + "loss": 0.3512, "step": 117940 }, { - "epoch": 4.15, - "learning_rate": 3.7099592499427764e-06, - "loss": 0.2531, + "epoch": 4.25072980862796, + "grad_norm": 0.19363261759281158, + "learning_rate": 2.898037252315822e-06, + "loss": 0.3274, "step": 117945 }, { - "epoch": 4.15, - "learning_rate": 3.708466118277368e-06, - "loss": 0.2384, + "epoch": 4.250910008289185, + "grad_norm": 0.22952446341514587, + "learning_rate": 2.8966736469492826e-06, + "loss": 0.3882, "step": 117950 }, { - "epoch": 4.15, - "learning_rate": 3.706973263068489e-06, - "loss": 0.2494, + "epoch": 4.251090207950409, + "grad_norm": 0.2463994026184082, + "learning_rate": 2.8953103427362847e-06, + "loss": 0.3941, "step": 117955 }, { - "epoch": 4.15, - "learning_rate": 3.7054806843355328e-06, - "loss": 0.2568, + "epoch": 4.251270407611634, + "grad_norm": 0.2101328819990158, + "learning_rate": 2.893947339695399e-06, + "loss": 0.3343, "step": 117960 }, { - "epoch": 4.15, - "learning_rate": 3.703988382097878e-06, - "loss": 0.2533, + "epoch": 4.251450607272858, + "grad_norm": 0.22500717639923096, + "learning_rate": 2.8925846378452056e-06, + "loss": 0.3845, "step": 117965 }, { - "epoch": 4.15, - "learning_rate": 3.702496356374899e-06, - "loss": 0.2504, + "epoch": 4.251630806934083, + "grad_norm": 0.2579551935195923, + "learning_rate": 2.8912222372042663e-06, + "loss": 0.3704, "step": 117970 }, { - "epoch": 4.15, - "learning_rate": 3.701004607185965e-06, - "loss": 0.2526, + "epoch": 4.251811006595307, + "grad_norm": 0.28678232431411743, + "learning_rate": 2.889860137791145e-06, + "loss": 0.3729, "step": 117975 }, { - "epoch": 4.15, - "learning_rate": 3.699513134550453e-06, - "loss": 0.2547, + "epoch": 4.251991206256532, + "grad_norm": 0.24297846853733063, + "learning_rate": 2.8884983396243988e-06, + "loss": 0.3957, "step": 117980 }, { - "epoch": 4.15, - "learning_rate": 3.698021938487728e-06, - "loss": 0.2365, + "epoch": 4.252171405917757, + "grad_norm": 0.2564467191696167, + "learning_rate": 2.8871368427225736e-06, + "loss": 0.3645, "step": 117985 }, { - "epoch": 4.15, - "learning_rate": 3.696531019017149e-06, - "loss": 0.2438, + "epoch": 4.252351605578982, + "grad_norm": 0.21953606605529785, + "learning_rate": 2.8857756471042357e-06, + "loss": 0.3874, "step": 117990 }, { - "epoch": 4.15, - "learning_rate": 3.6950403761580683e-06, - "loss": 0.2765, + "epoch": 4.252531805240206, + "grad_norm": 0.249432772397995, + "learning_rate": 
2.8844147527879283e-06, + "loss": 0.4071, "step": 117995 }, { - "epoch": 4.15, - "learning_rate": 3.6935500099298524e-06, - "loss": 0.2527, + "epoch": 4.252712004901431, + "grad_norm": 0.2628020942211151, + "learning_rate": 2.8830541597921755e-06, + "loss": 0.3869, "step": 118000 }, { - "epoch": 4.15, - "eval_loss": 0.24940206110477448, - "eval_runtime": 10.5669, - "eval_samples_per_second": 9.464, - "eval_steps_per_second": 9.464, + "epoch": 4.252712004901431, + "eval_loss": 0.42919114232063293, + "eval_runtime": 3.5338, + "eval_samples_per_second": 28.298, + "eval_steps_per_second": 7.075, "step": 118000 }, { - "epoch": 4.15, - "learning_rate": 3.6920599203518453e-06, - "loss": 0.2677, + "epoch": 4.252892204562656, + "grad_norm": 0.2826399505138397, + "learning_rate": 2.881693868135535e-06, + "loss": 0.3887, "step": 118005 }, { - "epoch": 4.15, - "learning_rate": 3.690570107443403e-06, - "loss": 0.2606, + "epoch": 4.25307240422388, + "grad_norm": 0.24618491530418396, + "learning_rate": 2.8803338778365273e-06, + "loss": 0.3729, "step": 118010 }, { - "epoch": 4.15, - "learning_rate": 3.6890805712238576e-06, - "loss": 0.2524, + "epoch": 4.253252603885104, + "grad_norm": 0.2526216208934784, + "learning_rate": 2.8789741889136994e-06, + "loss": 0.4333, "step": 118015 }, { - "epoch": 4.15, - "learning_rate": 3.6875913117125626e-06, - "loss": 0.2526, + "epoch": 4.253432803546329, + "grad_norm": 0.3000968098640442, + "learning_rate": 2.877614801385559e-06, + "loss": 0.3544, "step": 118020 }, { - "epoch": 4.15, - "learning_rate": 3.6861023289288476e-06, - "loss": 0.2534, + "epoch": 4.253613003207554, + "grad_norm": 0.23581233620643616, + "learning_rate": 2.8762557152706286e-06, + "loss": 0.3761, "step": 118025 }, { - "epoch": 4.15, - "learning_rate": 3.6846136228920497e-06, - "loss": 0.2687, + "epoch": 4.253793202868779, + "grad_norm": 0.2421904057264328, + "learning_rate": 2.8748969305874367e-06, + "loss": 0.3642, "step": 118030 }, { - "epoch": 4.15, - "learning_rate": 3.6831251936214894e-06, - "loss": 0.2521, + "epoch": 4.253973402530003, + "grad_norm": 0.20584893226623535, + "learning_rate": 2.87353844735449e-06, + "loss": 0.3414, "step": 118035 }, { - "epoch": 4.15, - "learning_rate": 3.681637041136507e-06, - "loss": 0.2366, + "epoch": 4.254153602191228, + "grad_norm": 0.23724088072776794, + "learning_rate": 2.872180265590299e-06, + "loss": 0.3598, "step": 118040 }, { - "epoch": 4.15, - "learning_rate": 3.6801491654564174e-06, - "loss": 0.2429, + "epoch": 4.254333801852453, + "grad_norm": 0.23216982185840607, + "learning_rate": 2.870822385313368e-06, + "loss": 0.3497, "step": 118045 }, { - "epoch": 4.15, - "learning_rate": 3.678661566600533e-06, - "loss": 0.2454, + "epoch": 4.2545140015136775, + "grad_norm": 0.27174580097198486, + "learning_rate": 2.8694648065421907e-06, + "loss": 0.3824, "step": 118050 }, { - "epoch": 4.15, - "learning_rate": 3.677174244588186e-06, - "loss": 0.2515, + "epoch": 4.254694201174901, + "grad_norm": 0.31109705567359924, + "learning_rate": 2.8681075292952776e-06, + "loss": 0.3833, "step": 118055 }, { - "epoch": 4.15, - "learning_rate": 3.675687199438671e-06, - "loss": 0.2501, + "epoch": 4.254874400836126, + "grad_norm": 0.2299165278673172, + "learning_rate": 2.866750553591116e-06, + "loss": 0.3445, "step": 118060 }, { - "epoch": 4.15, - "learning_rate": 3.6742004311713123e-06, - "loss": 0.2654, + "epoch": 4.255054600497351, + "grad_norm": 0.30145031213760376, + "learning_rate": 2.865393879448189e-06, + "loss": 0.3773, "step": 118065 }, { - "epoch": 4.15, - 
"learning_rate": 3.6727139398054e-06, - "loss": 0.2568, + "epoch": 4.255234800158576, + "grad_norm": 0.22781316936016083, + "learning_rate": 2.8640375068849867e-06, + "loss": 0.3582, "step": 118070 }, { - "epoch": 4.15, - "learning_rate": 3.671227725360249e-06, - "loss": 0.2438, + "epoch": 4.2554149998198, + "grad_norm": 0.27182450890541077, + "learning_rate": 2.8626814359199894e-06, + "loss": 0.3855, "step": 118075 }, { - "epoch": 4.15, - "learning_rate": 3.6697417878551465e-06, - "loss": 0.2377, + "epoch": 4.255595199481025, + "grad_norm": 0.2907634377479553, + "learning_rate": 2.8613256665716653e-06, + "loss": 0.37, "step": 118080 }, { - "epoch": 4.15, - "learning_rate": 3.6682561273093914e-06, - "loss": 0.2439, + "epoch": 4.25577539914225, + "grad_norm": 0.2272307574748993, + "learning_rate": 2.859970198858497e-06, + "loss": 0.3706, "step": 118085 }, { - "epoch": 4.15, - "learning_rate": 3.6667707437422677e-06, - "loss": 0.2687, + "epoch": 4.2559555988034745, + "grad_norm": 0.20825766026973724, + "learning_rate": 2.85861503279895e-06, + "loss": 0.3394, "step": 118090 }, { - "epoch": 4.15, - "learning_rate": 3.6652856371730715e-06, - "loss": 0.2454, + "epoch": 4.256135798464699, + "grad_norm": 0.21996937692165375, + "learning_rate": 2.8572601684114853e-06, + "loss": 0.3639, "step": 118095 }, { - "epoch": 4.16, - "learning_rate": 3.663800807621082e-06, - "loss": 0.2303, + "epoch": 4.256315998125924, + "grad_norm": 0.23290881514549255, + "learning_rate": 2.855905605714565e-06, + "loss": 0.3777, "step": 118100 }, { - "epoch": 4.16, - "learning_rate": 3.6623162551055774e-06, - "loss": 0.2616, + "epoch": 4.256496197787148, + "grad_norm": 0.21647751331329346, + "learning_rate": 2.854551344726636e-06, + "loss": 0.3746, "step": 118105 }, { - "epoch": 4.16, - "learning_rate": 3.660831979645829e-06, - "loss": 0.2408, + "epoch": 4.256676397448373, + "grad_norm": 0.2545035183429718, + "learning_rate": 2.853197385466169e-06, + "loss": 0.3716, "step": 118110 }, { - "epoch": 4.16, - "learning_rate": 3.6593479812611133e-06, - "loss": 0.2452, + "epoch": 4.256856597109597, + "grad_norm": 0.20453687012195587, + "learning_rate": 2.851843727951595e-06, + "loss": 0.3852, "step": 118115 }, { - "epoch": 4.16, - "learning_rate": 3.657864259970706e-06, - "loss": 0.2708, + "epoch": 4.257036796770822, + "grad_norm": 0.24812854826450348, + "learning_rate": 2.8504903722013587e-06, + "loss": 0.3759, "step": 118120 }, { - "epoch": 4.16, - "learning_rate": 3.6563808157938666e-06, - "loss": 0.2449, + "epoch": 4.257216996432047, + "grad_norm": 0.29127225279808044, + "learning_rate": 2.8491373182339048e-06, + "loss": 0.3276, "step": 118125 }, { - "epoch": 4.16, - "learning_rate": 3.6548976487498498e-06, - "loss": 0.2404, + "epoch": 4.2573971960932715, + "grad_norm": 0.2810105085372925, + "learning_rate": 2.8477845660676684e-06, + "loss": 0.4026, "step": 118130 }, { - "epoch": 4.16, - "learning_rate": 3.6534147588579258e-06, - "loss": 0.257, + "epoch": 4.257577395754496, + "grad_norm": 0.29022449254989624, + "learning_rate": 2.846432115721079e-06, + "loss": 0.3805, "step": 118135 }, { - "epoch": 4.16, - "learning_rate": 3.651932146137346e-06, - "loss": 0.2464, + "epoch": 4.257757595415721, + "grad_norm": 0.25384798645973206, + "learning_rate": 2.8450799672125607e-06, + "loss": 0.361, "step": 118140 }, { - "epoch": 4.16, - "learning_rate": 3.6504498106073564e-06, - "loss": 0.2475, + "epoch": 4.257937795076945, + "grad_norm": 0.23675154149532318, + "learning_rate": 2.843728120560535e-06, + "loss": 0.3651, "step": 118145 }, { 
- "epoch": 4.16, - "learning_rate": 3.6489677522871997e-06, - "loss": 0.259, + "epoch": 4.2581179947381695, + "grad_norm": 0.20214100182056427, + "learning_rate": 2.8423765757834287e-06, + "loss": 0.3901, "step": 118150 }, { - "epoch": 4.16, - "learning_rate": 3.6474859711961324e-06, - "loss": 0.2459, + "epoch": 4.258298194399394, + "grad_norm": 0.2030409872531891, + "learning_rate": 2.8410253328996523e-06, + "loss": 0.3818, "step": 118155 }, { - "epoch": 4.16, - "learning_rate": 3.6460044673533865e-06, - "loss": 0.2547, + "epoch": 4.258478394060619, + "grad_norm": 0.2594359815120697, + "learning_rate": 2.839674391927613e-06, + "loss": 0.3604, "step": 118160 }, { - "epoch": 4.16, - "learning_rate": 3.644523240778197e-06, - "loss": 0.2544, + "epoch": 4.258658593721844, + "grad_norm": 0.2291530966758728, + "learning_rate": 2.8383237528857243e-06, + "loss": 0.3353, "step": 118165 }, { - "epoch": 4.16, - "learning_rate": 3.6430422914897984e-06, - "loss": 0.2793, + "epoch": 4.2588387933830685, + "grad_norm": 0.23758092522621155, + "learning_rate": 2.836973415792374e-06, + "loss": 0.3764, "step": 118170 }, { - "epoch": 4.16, - "learning_rate": 3.6415616195074246e-06, - "loss": 0.2845, + "epoch": 4.259018993044293, + "grad_norm": 0.27188360691070557, + "learning_rate": 2.8356233806659777e-06, + "loss": 0.3948, "step": 118175 }, { - "epoch": 4.16, - "learning_rate": 3.6400812248502976e-06, - "loss": 0.2606, + "epoch": 4.259199192705518, + "grad_norm": 0.21040531992912292, + "learning_rate": 2.834273647524921e-06, + "loss": 0.3403, "step": 118180 }, { - "epoch": 4.16, - "learning_rate": 3.638601107537637e-06, - "loss": 0.2469, + "epoch": 4.259379392366743, + "grad_norm": 0.28796496987342834, + "learning_rate": 2.832924216387595e-06, + "loss": 0.4118, "step": 118185 }, { - "epoch": 4.16, - "learning_rate": 3.6371212675886668e-06, - "loss": 0.2576, + "epoch": 4.259559592027967, + "grad_norm": 0.23446249961853027, + "learning_rate": 2.831575087272387e-06, + "loss": 0.3733, "step": 118190 }, { - "epoch": 4.16, - "learning_rate": 3.6356417050225967e-06, - "loss": 0.2511, + "epoch": 4.259739791689191, + "grad_norm": 0.28546759486198425, + "learning_rate": 2.8302262601976687e-06, + "loss": 0.3581, "step": 118195 }, { - "epoch": 4.16, - "learning_rate": 3.6341624198586416e-06, - "loss": 0.264, + "epoch": 4.259919991350416, + "grad_norm": 0.1949034482240677, + "learning_rate": 2.8288777351818287e-06, + "loss": 0.3575, "step": 118200 }, { - "epoch": 4.16, - "learning_rate": 3.6326834121160002e-06, - "loss": 0.2621, + "epoch": 4.260100191011641, + "grad_norm": 0.28568407893180847, + "learning_rate": 2.827529512243246e-06, + "loss": 0.3726, "step": 118205 }, { - "epoch": 4.16, - "learning_rate": 3.6312046818138877e-06, - "loss": 0.2457, + "epoch": 4.2602803906728655, + "grad_norm": 0.24735015630722046, + "learning_rate": 2.8261815914002675e-06, + "loss": 0.376, "step": 118210 }, { - "epoch": 4.16, - "learning_rate": 3.6297262289715028e-06, - "loss": 0.2514, + "epoch": 4.26046059033409, + "grad_norm": 0.18040810525417328, + "learning_rate": 2.8248339726712787e-06, + "loss": 0.3756, "step": 118215 }, { - "epoch": 4.16, - "learning_rate": 3.62824805360803e-06, - "loss": 0.269, + "epoch": 4.260640789995315, + "grad_norm": 0.20753014087677002, + "learning_rate": 2.823486656074634e-06, + "loss": 0.3616, "step": 118220 }, { - "epoch": 4.16, - "learning_rate": 3.626770155742676e-06, - "loss": 0.2475, + "epoch": 4.26082098965654, + "grad_norm": 0.21727077662944794, + "learning_rate": 2.8221396416286905e-06, + "loss": 
0.3591, "step": 118225 }, { - "epoch": 4.16, - "learning_rate": 3.6252925353946284e-06, - "loss": 0.2417, + "epoch": 4.261001189317764, + "grad_norm": 0.24121779203414917, + "learning_rate": 2.820792929351798e-06, + "loss": 0.3535, "step": 118230 }, { - "epoch": 4.16, - "learning_rate": 3.6238151925830716e-06, - "loss": 0.2555, + "epoch": 4.261181388978989, + "grad_norm": 0.22880592942237854, + "learning_rate": 2.819446519262306e-06, + "loss": 0.3734, "step": 118235 }, { - "epoch": 4.16, - "learning_rate": 3.6223381273271876e-06, - "loss": 0.2482, + "epoch": 4.261361588640213, + "grad_norm": 0.26039087772369385, + "learning_rate": 2.8181004113785632e-06, + "loss": 0.3805, "step": 118240 }, { - "epoch": 4.16, - "learning_rate": 3.6208613396461476e-06, - "loss": 0.2454, + "epoch": 4.261541788301438, + "grad_norm": 0.21298900246620178, + "learning_rate": 2.81675460571891e-06, + "loss": 0.3944, "step": 118245 }, { - "epoch": 4.16, - "learning_rate": 3.6193848295591416e-06, - "loss": 0.2589, + "epoch": 4.261721987962662, + "grad_norm": 0.24916015565395355, + "learning_rate": 2.8154091023016805e-06, + "loss": 0.3752, "step": 118250 }, { - "epoch": 4.16, - "learning_rate": 3.6179085970853317e-06, - "loss": 0.2389, + "epoch": 4.261902187623887, + "grad_norm": 0.31941652297973633, + "learning_rate": 2.8140639011452057e-06, + "loss": 0.3874, "step": 118255 }, { - "epoch": 4.16, - "learning_rate": 3.6164326422438886e-06, - "loss": 0.2575, + "epoch": 4.262082387285112, + "grad_norm": 0.2693467140197754, + "learning_rate": 2.8127190022678135e-06, + "loss": 0.3512, "step": 118260 }, { - "epoch": 4.16, - "learning_rate": 3.614956965053967e-06, - "loss": 0.2208, + "epoch": 4.262262586946337, + "grad_norm": 0.28551846742630005, + "learning_rate": 2.8113744056878277e-06, + "loss": 0.3449, "step": 118265 }, { - "epoch": 4.16, - "learning_rate": 3.613481565534743e-06, - "loss": 0.2806, + "epoch": 4.262442786607561, + "grad_norm": 0.24838872253894806, + "learning_rate": 2.8100301114235726e-06, + "loss": 0.354, "step": 118270 }, { - "epoch": 4.16, - "learning_rate": 3.6120064437053625e-06, - "loss": 0.2387, + "epoch": 4.262622986268786, + "grad_norm": 0.20312124490737915, + "learning_rate": 2.8086861194933616e-06, + "loss": 0.3844, "step": 118275 }, { - "epoch": 4.16, - "learning_rate": 3.6105315995849876e-06, - "loss": 0.248, + "epoch": 4.262803185930011, + "grad_norm": 0.29347795248031616, + "learning_rate": 2.807342429915505e-06, + "loss": 0.3986, "step": 118280 }, { - "epoch": 4.16, - "learning_rate": 3.6090570331927586e-06, - "loss": 0.2652, + "epoch": 4.262983385591235, + "grad_norm": 0.23310750722885132, + "learning_rate": 2.8059990427083127e-06, + "loss": 0.3745, "step": 118285 }, { - "epoch": 4.16, - "learning_rate": 3.60758274454783e-06, - "loss": 0.2443, + "epoch": 4.263163585252459, + "grad_norm": 0.2755783200263977, + "learning_rate": 2.8046559578900815e-06, + "loss": 0.3808, "step": 118290 }, { - "epoch": 4.16, - "learning_rate": 3.6061087336693444e-06, - "loss": 0.2452, + "epoch": 4.263343784913684, + "grad_norm": 0.21833378076553345, + "learning_rate": 2.8033131754791246e-06, + "loss": 0.3937, "step": 118295 }, { - "epoch": 4.16, - "learning_rate": 3.6046350005764314e-06, - "loss": 0.2472, + "epoch": 4.263523984574909, + "grad_norm": 0.2588220536708832, + "learning_rate": 2.801970695493722e-06, + "loss": 0.372, "step": 118300 }, { - "epoch": 4.16, - "learning_rate": 3.6031615452882388e-06, - "loss": 0.2673, + "epoch": 4.263704184236134, + "grad_norm": 0.23471491038799286, + "learning_rate": 
2.8006285179521696e-06, + "loss": 0.3778, "step": 118305 }, { - "epoch": 4.16, - "learning_rate": 3.601688367823891e-06, - "loss": 0.2678, + "epoch": 4.263884383897358, + "grad_norm": 0.31038230657577515, + "learning_rate": 2.7992866428727587e-06, + "loss": 0.3946, "step": 118310 }, { - "epoch": 4.16, - "learning_rate": 3.6002154682025167e-06, - "loss": 0.2513, + "epoch": 4.264064583558583, + "grad_norm": 0.2159845381975174, + "learning_rate": 2.7979450702737637e-06, + "loss": 0.3621, "step": 118315 }, { - "epoch": 4.16, - "learning_rate": 3.59874284644324e-06, - "loss": 0.2497, + "epoch": 4.264244783219808, + "grad_norm": 0.1845831722021103, + "learning_rate": 2.7966038001734777e-06, + "loss": 0.3498, "step": 118320 }, { - "epoch": 4.16, - "learning_rate": 3.5972705025651863e-06, - "loss": 0.2727, + "epoch": 4.2644249828810326, + "grad_norm": 0.21577754616737366, + "learning_rate": 2.7952628325901647e-06, + "loss": 0.3733, "step": 118325 }, { - "epoch": 4.16, - "learning_rate": 3.5957984365874637e-06, - "loss": 0.2371, + "epoch": 4.264605182542256, + "grad_norm": 0.234993115067482, + "learning_rate": 2.793922167542087e-06, + "loss": 0.3823, "step": 118330 }, { - "epoch": 4.16, - "learning_rate": 3.5943266485292005e-06, - "loss": 0.2343, + "epoch": 4.264785382203481, + "grad_norm": 0.22045071423053741, + "learning_rate": 2.7925818050475284e-06, + "loss": 0.3653, "step": 118335 }, { - "epoch": 4.16, - "learning_rate": 3.5928551384094926e-06, - "loss": 0.2563, + "epoch": 4.264965581864706, + "grad_norm": 0.2358977198600769, + "learning_rate": 2.7912417451247453e-06, + "loss": 0.3503, "step": 118340 }, { - "epoch": 4.16, - "learning_rate": 3.5913839062474584e-06, - "loss": 0.2377, + "epoch": 4.265145781525931, + "grad_norm": 0.2254144698381424, + "learning_rate": 2.78990198779199e-06, + "loss": 0.3497, "step": 118345 }, { - "epoch": 4.16, - "learning_rate": 3.589912952062194e-06, - "loss": 0.2378, + "epoch": 4.265325981187155, + "grad_norm": 0.2657366693019867, + "learning_rate": 2.7885625330675206e-06, + "loss": 0.3555, "step": 118350 }, { - "epoch": 4.16, - "learning_rate": 3.5884422758728026e-06, - "loss": 0.28, + "epoch": 4.26550618084838, + "grad_norm": 0.21268507838249207, + "learning_rate": 2.7872233809695837e-06, + "loss": 0.366, "step": 118355 }, { - "epoch": 4.16, - "learning_rate": 3.5869718776983694e-06, - "loss": 0.2559, + "epoch": 4.265686380509605, + "grad_norm": 0.2723730504512787, + "learning_rate": 2.7858845315164307e-06, + "loss": 0.3865, "step": 118360 }, { - "epoch": 4.16, - "learning_rate": 3.5855017575580017e-06, - "loss": 0.252, + "epoch": 4.2658665801708295, + "grad_norm": 0.1851958930492401, + "learning_rate": 2.7845459847263e-06, + "loss": 0.3302, "step": 118365 }, { - "epoch": 4.16, - "learning_rate": 3.5840319154707815e-06, - "loss": 0.2521, + "epoch": 4.266046779832054, + "grad_norm": 0.30947184562683105, + "learning_rate": 2.78320774061743e-06, + "loss": 0.3843, "step": 118370 }, { - "epoch": 4.16, - "learning_rate": 3.582562351455793e-06, - "loss": 0.2526, + "epoch": 4.266226979493279, + "grad_norm": 0.2673863172531128, + "learning_rate": 2.7818697992080536e-06, + "loss": 0.3744, "step": 118375 }, { - "epoch": 4.16, - "learning_rate": 3.5810930655321125e-06, - "loss": 0.2539, + "epoch": 4.266407179154503, + "grad_norm": 0.22295500338077545, + "learning_rate": 2.780532160516394e-06, + "loss": 0.4008, "step": 118380 }, { - "epoch": 4.17, - "learning_rate": 3.5796240577188196e-06, - "loss": 0.2334, + "epoch": 4.266587378815728, + "grad_norm": 0.2774406969547272, + 
"learning_rate": 2.779194824560688e-06, + "loss": 0.3602, "step": 118385 }, { - "epoch": 4.17, - "learning_rate": 3.5781553280350016e-06, - "loss": 0.272, + "epoch": 4.266767578476952, + "grad_norm": 0.3125916123390198, + "learning_rate": 2.7778577913591534e-06, + "loss": 0.3976, "step": 118390 }, { - "epoch": 4.17, - "learning_rate": 3.5766868764997103e-06, - "loss": 0.2575, + "epoch": 4.266947778138177, + "grad_norm": 0.1951741874217987, + "learning_rate": 2.7765210609299958e-06, + "loss": 0.3666, "step": 118395 }, { - "epoch": 4.17, - "learning_rate": 3.57521870313203e-06, - "loss": 0.2603, + "epoch": 4.267127977799402, + "grad_norm": 0.2636171877384186, + "learning_rate": 2.775184633291439e-06, + "loss": 0.3816, "step": 118400 }, { - "epoch": 4.17, - "learning_rate": 3.5737508079510125e-06, - "loss": 0.2526, + "epoch": 4.2673081774606265, + "grad_norm": 0.25289151072502136, + "learning_rate": 2.7738485084616904e-06, + "loss": 0.383, "step": 118405 }, { - "epoch": 4.17, - "learning_rate": 3.5722831909757225e-06, - "loss": 0.267, + "epoch": 4.267488377121851, + "grad_norm": 0.23693720996379852, + "learning_rate": 2.7725126864589527e-06, + "loss": 0.3504, "step": 118410 }, { - "epoch": 4.17, - "learning_rate": 3.5708158522252067e-06, - "loss": 0.237, + "epoch": 4.267668576783076, + "grad_norm": 0.31816625595092773, + "learning_rate": 2.7711771673014253e-06, + "loss": 0.3926, "step": 118415 }, { - "epoch": 4.17, - "learning_rate": 3.5693487917185325e-06, - "loss": 0.2494, + "epoch": 4.2678487764443, + "grad_norm": 0.20201784372329712, + "learning_rate": 2.7698419510073014e-06, + "loss": 0.3648, "step": 118420 }, { - "epoch": 4.17, - "learning_rate": 3.567882009474738e-06, - "loss": 0.2606, + "epoch": 4.268028976105525, + "grad_norm": 0.28270724415779114, + "learning_rate": 2.768507037594781e-06, + "loss": 0.3847, "step": 118425 }, { - "epoch": 4.17, - "learning_rate": 3.566415505512874e-06, - "loss": 0.24, + "epoch": 4.268209175766749, + "grad_norm": 0.2311401665210724, + "learning_rate": 2.7671724270820487e-06, + "loss": 0.3913, "step": 118430 }, { - "epoch": 4.17, - "learning_rate": 3.564949279851973e-06, - "loss": 0.2432, + "epoch": 4.268389375427974, + "grad_norm": 0.2863185405731201, + "learning_rate": 2.765838119487288e-06, + "loss": 0.4017, "step": 118435 }, { - "epoch": 4.17, - "learning_rate": 3.5634833325110835e-06, - "loss": 0.2706, + "epoch": 4.268569575089199, + "grad_norm": 0.3267385959625244, + "learning_rate": 2.764504114828678e-06, + "loss": 0.3994, "step": 118440 }, { - "epoch": 4.17, - "learning_rate": 3.5620176635092317e-06, - "loss": 0.2564, + "epoch": 4.2687497747504235, + "grad_norm": 0.350425660610199, + "learning_rate": 2.7631704131243967e-06, + "loss": 0.3746, "step": 118445 }, { - "epoch": 4.17, - "learning_rate": 3.5605522728654584e-06, - "loss": 0.2298, + "epoch": 4.268929974411648, + "grad_norm": 0.2516806125640869, + "learning_rate": 2.761837014392604e-06, + "loss": 0.3326, "step": 118450 }, { - "epoch": 4.17, - "learning_rate": 3.5590871605987756e-06, - "loss": 0.2499, + "epoch": 4.269110174072873, + "grad_norm": 0.27395808696746826, + "learning_rate": 2.7605039186514858e-06, + "loss": 0.3911, "step": 118455 }, { - "epoch": 4.17, - "learning_rate": 3.5576223267282242e-06, - "loss": 0.2324, + "epoch": 4.269290373734098, + "grad_norm": 0.24805009365081787, + "learning_rate": 2.7591711259191938e-06, + "loss": 0.3596, "step": 118460 }, { - "epoch": 4.17, - "learning_rate": 3.5561577712728168e-06, - "loss": 0.2419, + "epoch": 4.269470573395322, + "grad_norm": 
0.23260460793972015, + "learning_rate": 2.7578386362138886e-06, + "loss": 0.3581, "step": 118465 }, { - "epoch": 4.17, - "learning_rate": 3.554693494251568e-06, - "loss": 0.2381, + "epoch": 4.269650773056546, + "grad_norm": 0.24661628901958466, + "learning_rate": 2.756506449553725e-06, + "loss": 0.3633, "step": 118470 }, { - "epoch": 4.17, - "learning_rate": 3.5532294956834854e-06, - "loss": 0.2537, + "epoch": 4.269830972717771, + "grad_norm": 0.25693389773368835, + "learning_rate": 2.7551745659568527e-06, + "loss": 0.3823, "step": 118475 }, { - "epoch": 4.17, - "learning_rate": 3.55176577558759e-06, - "loss": 0.2592, + "epoch": 4.270011172378996, + "grad_norm": 0.2382129281759262, + "learning_rate": 2.7538429854414238e-06, + "loss": 0.3639, "step": 118480 }, { - "epoch": 4.17, - "learning_rate": 3.55030233398288e-06, - "loss": 0.245, + "epoch": 4.2701913720402205, + "grad_norm": 0.2255300134420395, + "learning_rate": 2.7525117080255817e-06, + "loss": 0.3625, "step": 118485 }, { - "epoch": 4.17, - "learning_rate": 3.548839170888352e-06, - "loss": 0.2571, + "epoch": 4.270371571701445, + "grad_norm": 0.2332393378019333, + "learning_rate": 2.7511807337274513e-06, + "loss": 0.3748, "step": 118490 }, { - "epoch": 4.17, - "learning_rate": 3.5473762863230176e-06, - "loss": 0.2119, + "epoch": 4.27055177136267, + "grad_norm": 0.18017563223838806, + "learning_rate": 2.749850062565179e-06, + "loss": 0.4011, "step": 118495 }, { - "epoch": 4.17, - "learning_rate": 3.5459136803058597e-06, - "loss": 0.2487, + "epoch": 4.270731971023895, + "grad_norm": 0.2834778428077698, + "learning_rate": 2.748519694556889e-06, + "loss": 0.3585, "step": 118500 }, { - "epoch": 4.17, - "eval_loss": 0.2493087649345398, - "eval_runtime": 10.5519, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 4.270731971023895, + "eval_loss": 0.42914897203445435, + "eval_runtime": 3.5317, + "eval_samples_per_second": 28.315, + "eval_steps_per_second": 7.079, "step": 118500 }, { - "epoch": 4.17, - "learning_rate": 3.5444513528558766e-06, - "loss": 0.2779, + "epoch": 4.270912170685119, + "grad_norm": 0.2678777575492859, + "learning_rate": 2.747189629720717e-06, + "loss": 0.4119, "step": 118505 }, { - "epoch": 4.17, - "learning_rate": 3.54298930399205e-06, - "loss": 0.2735, + "epoch": 4.271092370346344, + "grad_norm": 0.23227082192897797, + "learning_rate": 2.7458598680747764e-06, + "loss": 0.3705, "step": 118510 }, { - "epoch": 4.17, - "learning_rate": 3.5415275337333714e-06, - "loss": 0.2558, + "epoch": 4.271272570007568, + "grad_norm": 0.26510533690452576, + "learning_rate": 2.7445304096371803e-06, + "loss": 0.3262, "step": 118515 }, { - "epoch": 4.17, - "learning_rate": 3.5400660420988136e-06, - "loss": 0.2297, + "epoch": 4.271452769668793, + "grad_norm": 0.22343812882900238, + "learning_rate": 2.7432012544260533e-06, + "loss": 0.3612, "step": 118520 }, { - "epoch": 4.17, - "learning_rate": 3.538604829107359e-06, - "loss": 0.2475, + "epoch": 4.2716329693300175, + "grad_norm": 0.2528297007083893, + "learning_rate": 2.7418724024595e-06, + "loss": 0.3779, "step": 118525 }, { - "epoch": 4.17, - "learning_rate": 3.5371438947779694e-06, - "loss": 0.2564, + "epoch": 4.271813168991242, + "grad_norm": 0.28484654426574707, + "learning_rate": 2.7405438537556284e-06, + "loss": 0.3164, "step": 118530 }, { - "epoch": 4.17, - "learning_rate": 3.535683239129628e-06, - "loss": 0.2665, + "epoch": 4.271993368652467, + "grad_norm": 0.2651745676994324, + "learning_rate": 2.739215608332535e-06, + "loss": 0.4037, "step": 118535 
}, { - "epoch": 4.17, - "learning_rate": 3.534222862181297e-06, - "loss": 0.2458, + "epoch": 4.272173568313692, + "grad_norm": 0.2041269987821579, + "learning_rate": 2.737887666208314e-06, + "loss": 0.3386, "step": 118540 }, { - "epoch": 4.17, - "learning_rate": 3.532762763951933e-06, - "loss": 0.2273, + "epoch": 4.272353767974916, + "grad_norm": 0.22839802503585815, + "learning_rate": 2.7365600274010694e-06, + "loss": 0.3724, "step": 118545 }, { - "epoch": 4.17, - "learning_rate": 3.531302944460491e-06, - "loss": 0.2514, + "epoch": 4.272533967636141, + "grad_norm": 0.2205089032649994, + "learning_rate": 2.7352326919288824e-06, + "loss": 0.3454, "step": 118550 }, { - "epoch": 4.17, - "learning_rate": 3.5298434037259327e-06, - "loss": 0.2274, + "epoch": 4.272714167297366, + "grad_norm": 0.25789710879325867, + "learning_rate": 2.7339056598098407e-06, + "loss": 0.3494, "step": 118555 }, { - "epoch": 4.17, - "learning_rate": 3.528384141767213e-06, - "loss": 0.2526, + "epoch": 4.27289436695859, + "grad_norm": 0.3210128843784332, + "learning_rate": 2.7325789310620212e-06, + "loss": 0.3931, "step": 118560 }, { - "epoch": 4.17, - "learning_rate": 3.526925158603275e-06, - "loss": 0.2429, + "epoch": 4.273074566619814, + "grad_norm": 0.24505449831485748, + "learning_rate": 2.731252505703502e-06, + "loss": 0.3628, "step": 118565 }, { - "epoch": 4.17, - "learning_rate": 3.5254664542530596e-06, - "loss": 0.2546, + "epoch": 4.273254766281039, + "grad_norm": 0.2696022689342499, + "learning_rate": 2.729926383752357e-06, + "loss": 0.3555, "step": 118570 }, { - "epoch": 4.17, - "learning_rate": 3.524008028735512e-06, - "loss": 0.2668, + "epoch": 4.273434965942264, + "grad_norm": 0.23730942606925964, + "learning_rate": 2.7286005652266604e-06, + "loss": 0.4053, "step": 118575 }, { - "epoch": 4.17, - "learning_rate": 3.5225498820695703e-06, - "loss": 0.2632, + "epoch": 4.273615165603489, + "grad_norm": 0.2668841779232025, + "learning_rate": 2.7272750501444594e-06, + "loss": 0.38, "step": 118580 }, { - "epoch": 4.17, - "learning_rate": 3.5210920142741605e-06, - "loss": 0.2648, + "epoch": 4.273795365264713, + "grad_norm": 0.23594842851161957, + "learning_rate": 2.7259498385238282e-06, + "loss": 0.3971, "step": 118585 }, { - "epoch": 4.17, - "learning_rate": 3.5196344253682124e-06, - "loss": 0.2559, + "epoch": 4.273975564925938, + "grad_norm": 0.2008124589920044, + "learning_rate": 2.724624930382819e-06, + "loss": 0.3583, "step": 118590 }, { - "epoch": 4.17, - "learning_rate": 3.5181771153706605e-06, - "loss": 0.2444, + "epoch": 4.274155764587163, + "grad_norm": 0.21719537675380707, + "learning_rate": 2.723300325739475e-06, + "loss": 0.3669, "step": 118595 }, { - "epoch": 4.17, - "learning_rate": 3.5167200843004233e-06, - "loss": 0.2436, + "epoch": 4.274335964248388, + "grad_norm": 0.25217390060424805, + "learning_rate": 2.7219760246118638e-06, + "loss": 0.3523, "step": 118600 }, { - "epoch": 4.17, - "learning_rate": 3.5152633321764107e-06, - "loss": 0.259, + "epoch": 4.274516163909611, + "grad_norm": 0.19717147946357727, + "learning_rate": 2.720652027018006e-06, + "loss": 0.3957, "step": 118605 }, { - "epoch": 4.17, - "learning_rate": 3.5138068590175435e-06, - "loss": 0.2611, + "epoch": 4.274696363570836, + "grad_norm": 0.2760673761367798, + "learning_rate": 2.7193283329759537e-06, + "loss": 0.3723, "step": 118610 }, { - "epoch": 4.17, - "learning_rate": 3.5123506648427425e-06, - "loss": 0.2542, + "epoch": 4.274876563232061, + "grad_norm": 0.2848314344882965, + "learning_rate": 2.7180049425037407e-06, + "loss": 
0.3912, "step": 118615 }, { - "epoch": 4.17, - "learning_rate": 3.5108947496709065e-06, - "loss": 0.242, + "epoch": 4.275056762893286, + "grad_norm": 0.30649277567863464, + "learning_rate": 2.7166818556193963e-06, + "loss": 0.396, "step": 118620 }, { - "epoch": 4.17, - "learning_rate": 3.509439113520932e-06, - "loss": 0.2552, + "epoch": 4.27523696255451, + "grad_norm": 0.31178387999534607, + "learning_rate": 2.7153590723409478e-06, + "loss": 0.4219, "step": 118625 }, { - "epoch": 4.17, - "learning_rate": 3.507983756411737e-06, - "loss": 0.2484, + "epoch": 4.275417162215735, + "grad_norm": 0.2467115819454193, + "learning_rate": 2.714036592686417e-06, + "loss": 0.3704, "step": 118630 }, { - "epoch": 4.17, - "learning_rate": 3.506528678362206e-06, - "loss": 0.2656, + "epoch": 4.27559736187696, + "grad_norm": 0.2777189314365387, + "learning_rate": 2.7127144166738173e-06, + "loss": 0.3898, "step": 118635 }, { - "epoch": 4.17, - "learning_rate": 3.505073879391238e-06, - "loss": 0.2556, + "epoch": 4.275777561538185, + "grad_norm": 0.23906902968883514, + "learning_rate": 2.711392544321176e-06, + "loss": 0.3649, "step": 118640 }, { - "epoch": 4.17, - "learning_rate": 3.5036193595177123e-06, - "loss": 0.2626, + "epoch": 4.275957761199409, + "grad_norm": 0.19675643742084503, + "learning_rate": 2.710070975646492e-06, + "loss": 0.3649, "step": 118645 }, { - "epoch": 4.17, - "learning_rate": 3.502165118760531e-06, - "loss": 0.2427, + "epoch": 4.276137960860634, + "grad_norm": 0.2644674777984619, + "learning_rate": 2.7087497106677793e-06, + "loss": 0.3944, "step": 118650 }, { - "epoch": 4.17, - "learning_rate": 3.500711157138567e-06, - "loss": 0.2403, + "epoch": 4.276318160521858, + "grad_norm": 0.2813570201396942, + "learning_rate": 2.707428749403035e-06, + "loss": 0.3593, "step": 118655 }, { - "epoch": 4.17, - "learning_rate": 3.499257474670692e-06, - "loss": 0.2379, + "epoch": 4.276498360183083, + "grad_norm": 0.24779802560806274, + "learning_rate": 2.706108091870252e-06, + "loss": 0.3836, "step": 118660 }, { - "epoch": 4.17, - "learning_rate": 3.4978040713757902e-06, - "loss": 0.2363, + "epoch": 4.276678559844307, + "grad_norm": 0.2426891177892685, + "learning_rate": 2.704787738087436e-06, + "loss": 0.3918, "step": 118665 }, { - "epoch": 4.18, - "learning_rate": 3.4963509472727392e-06, - "loss": 0.2586, + "epoch": 4.276858759505532, + "grad_norm": 0.3096999228000641, + "learning_rate": 2.703467688072575e-06, + "loss": 0.3599, "step": 118670 }, { - "epoch": 4.18, - "learning_rate": 3.4948981023804007e-06, - "loss": 0.2377, + "epoch": 4.277038959166757, + "grad_norm": 0.20364230871200562, + "learning_rate": 2.702147941843641e-06, + "loss": 0.3373, "step": 118675 }, { - "epoch": 4.18, - "learning_rate": 3.493445536717635e-06, - "loss": 0.2561, + "epoch": 4.2772191588279815, + "grad_norm": 0.24588048458099365, + "learning_rate": 2.7008284994186284e-06, + "loss": 0.3355, "step": 118680 }, { - "epoch": 4.18, - "learning_rate": 3.4919932503033022e-06, - "loss": 0.2447, + "epoch": 4.277399358489206, + "grad_norm": 0.24682247638702393, + "learning_rate": 2.6995093608155053e-06, + "loss": 0.3409, "step": 118685 }, { - "epoch": 4.18, - "learning_rate": 3.4905412431562675e-06, - "loss": 0.2475, + "epoch": 4.277579558150431, + "grad_norm": 0.28250157833099365, + "learning_rate": 2.6981905260522607e-06, + "loss": 0.3628, "step": 118690 }, { - "epoch": 4.18, - "learning_rate": 3.4890895152953794e-06, - "loss": 0.258, + "epoch": 4.277759757811655, + "grad_norm": 0.25315603613853455, + "learning_rate": 
2.6968719951468464e-06, + "loss": 0.3906, "step": 118695 }, { - "epoch": 4.18, - "learning_rate": 3.4876380667394874e-06, - "loss": 0.2361, + "epoch": 4.27793995747288, + "grad_norm": 0.2779964506626129, + "learning_rate": 2.695553768117226e-06, + "loss": 0.3717, "step": 118700 }, { - "epoch": 4.18, - "learning_rate": 3.486186897507432e-06, - "loss": 0.2671, + "epoch": 4.278120157134104, + "grad_norm": 0.24498099088668823, + "learning_rate": 2.6942358449813745e-06, + "loss": 0.3657, "step": 118705 }, { - "epoch": 4.18, - "learning_rate": 3.484736007618067e-06, - "loss": 0.2538, + "epoch": 4.278300356795329, + "grad_norm": 0.2811969816684723, + "learning_rate": 2.6929182257572432e-06, + "loss": 0.3594, "step": 118710 }, { - "epoch": 4.18, - "learning_rate": 3.4832853970902218e-06, - "loss": 0.2454, + "epoch": 4.278480556456554, + "grad_norm": 0.2293609082698822, + "learning_rate": 2.6916009104627797e-06, + "loss": 0.3518, "step": 118715 }, { - "epoch": 4.18, - "learning_rate": 3.48183506594274e-06, - "loss": 0.2589, + "epoch": 4.2786607561177785, + "grad_norm": 0.3065091669559479, + "learning_rate": 2.6902838991159334e-06, + "loss": 0.3964, "step": 118720 }, { - "epoch": 4.18, - "learning_rate": 3.4803850141944426e-06, - "loss": 0.2587, + "epoch": 4.278840955779003, + "grad_norm": 0.21707165241241455, + "learning_rate": 2.6889671917346483e-06, + "loss": 0.3497, "step": 118725 }, { - "epoch": 4.18, - "learning_rate": 3.4789352418641728e-06, - "loss": 0.2427, + "epoch": 4.279021155440228, + "grad_norm": 0.3095008432865143, + "learning_rate": 2.687650788336868e-06, + "loss": 0.3616, "step": 118730 }, { - "epoch": 4.18, - "learning_rate": 3.4774857489707435e-06, - "loss": 0.2502, + "epoch": 4.279201355101453, + "grad_norm": 0.2820426821708679, + "learning_rate": 2.6863346889405255e-06, + "loss": 0.388, "step": 118735 }, { - "epoch": 4.18, - "learning_rate": 3.4760365355329725e-06, - "loss": 0.2455, + "epoch": 4.2793815547626775, + "grad_norm": 0.24508492648601532, + "learning_rate": 2.685018893563554e-06, + "loss": 0.394, "step": 118740 }, { - "epoch": 4.18, - "learning_rate": 3.4745876015696872e-06, - "loss": 0.2785, + "epoch": 4.279561754423901, + "grad_norm": 0.2219163477420807, + "learning_rate": 2.6837034022238806e-06, + "loss": 0.3613, "step": 118745 }, { - "epoch": 4.18, - "learning_rate": 3.473138947099697e-06, - "loss": 0.2643, + "epoch": 4.279741954085126, + "grad_norm": 0.23274736106395721, + "learning_rate": 2.6823882149394215e-06, + "loss": 0.3737, "step": 118750 }, { - "epoch": 4.18, - "learning_rate": 3.4716905721418114e-06, - "loss": 0.2686, + "epoch": 4.279922153746351, + "grad_norm": 0.27821585536003113, + "learning_rate": 2.6810733317281095e-06, + "loss": 0.3587, "step": 118755 }, { - "epoch": 4.18, - "learning_rate": 3.47024247671483e-06, - "loss": 0.255, + "epoch": 4.2801023534075755, + "grad_norm": 0.2513722777366638, + "learning_rate": 2.67975875260785e-06, + "loss": 0.3816, "step": 118760 }, { - "epoch": 4.18, - "learning_rate": 3.4687946608375655e-06, - "loss": 0.2389, + "epoch": 4.2802825530688, + "grad_norm": 0.26026418805122375, + "learning_rate": 2.6784444775965563e-06, + "loss": 0.3802, "step": 118765 }, { - "epoch": 4.18, - "learning_rate": 3.4673471245288083e-06, - "loss": 0.2675, + "epoch": 4.280462752730025, + "grad_norm": 0.2638653814792633, + "learning_rate": 2.6771305067121363e-06, + "loss": 0.3756, "step": 118770 }, { - "epoch": 4.18, - "learning_rate": 3.465899867807365e-06, - "loss": 0.2451, + "epoch": 4.28064295239125, + "grad_norm": 0.2352520376443863, 
+ "learning_rate": 2.675816839972489e-06, + "loss": 0.4026, "step": 118775 }, { - "epoch": 4.18, - "learning_rate": 3.4644528906920132e-06, - "loss": 0.233, + "epoch": 4.280823152052474, + "grad_norm": 0.3046009838581085, + "learning_rate": 2.6745034773955095e-06, + "loss": 0.3791, "step": 118780 }, { - "epoch": 4.18, - "learning_rate": 3.463006193201554e-06, - "loss": 0.2499, + "epoch": 4.281003351713699, + "grad_norm": 0.21931016445159912, + "learning_rate": 2.6731904189991108e-06, + "loss": 0.3673, "step": 118785 }, { - "epoch": 4.18, - "learning_rate": 3.461559775354767e-06, - "loss": 0.2304, + "epoch": 4.281183551374923, + "grad_norm": 0.21778719127178192, + "learning_rate": 2.671877664801159e-06, + "loss": 0.3712, "step": 118790 }, { - "epoch": 4.18, - "learning_rate": 3.4601136371704314e-06, - "loss": 0.2727, + "epoch": 4.281363751036148, + "grad_norm": 0.22375808656215668, + "learning_rate": 2.6705652148195566e-06, + "loss": 0.3566, "step": 118795 }, { - "epoch": 4.18, - "learning_rate": 3.458667778667318e-06, - "loss": 0.26, + "epoch": 4.2815439506973725, + "grad_norm": 0.30791062116622925, + "learning_rate": 2.6692530690721777e-06, + "loss": 0.3972, "step": 118800 }, { - "epoch": 4.18, - "learning_rate": 3.4572221998642128e-06, - "loss": 0.2551, + "epoch": 4.281724150358597, + "grad_norm": 0.259032279253006, + "learning_rate": 2.667941227576906e-06, + "loss": 0.3812, "step": 118805 }, { - "epoch": 4.18, - "learning_rate": 3.4557769007798807e-06, - "loss": 0.2581, + "epoch": 4.281904350019822, + "grad_norm": 0.2583853006362915, + "learning_rate": 2.66662969035161e-06, + "loss": 0.3902, "step": 118810 }, { - "epoch": 4.18, - "learning_rate": 3.45433188143309e-06, - "loss": 0.2619, + "epoch": 4.282084549681047, + "grad_norm": 0.3107074499130249, + "learning_rate": 2.6653184574141594e-06, + "loss": 0.4077, "step": 118815 }, { - "epoch": 4.18, - "learning_rate": 3.4528871418425922e-06, - "loss": 0.2278, + "epoch": 4.282264749342271, + "grad_norm": 0.2655295729637146, + "learning_rate": 2.6640075287824166e-06, + "loss": 0.3709, "step": 118820 }, { - "epoch": 4.18, - "learning_rate": 3.451442682027156e-06, - "loss": 0.2565, + "epoch": 4.282444949003496, + "grad_norm": 0.2224242240190506, + "learning_rate": 2.6626969044742518e-06, + "loss": 0.3534, "step": 118825 }, { - "epoch": 4.18, - "learning_rate": 3.4499985020055386e-06, - "loss": 0.2589, + "epoch": 4.282625148664721, + "grad_norm": 0.25910016894340515, + "learning_rate": 2.6613865845075164e-06, + "loss": 0.3621, "step": 118830 }, { - "epoch": 4.18, - "learning_rate": 3.4485546017964916e-06, - "loss": 0.2446, + "epoch": 4.282805348325946, + "grad_norm": 0.24160423874855042, + "learning_rate": 2.6600765689000634e-06, + "loss": 0.3737, "step": 118835 }, { - "epoch": 4.18, - "learning_rate": 3.4471109814187552e-06, - "loss": 0.2836, + "epoch": 4.2829855479871695, + "grad_norm": 0.2217995524406433, + "learning_rate": 2.658766857669745e-06, + "loss": 0.3841, "step": 118840 }, { - "epoch": 4.18, - "learning_rate": 3.4456676408910814e-06, - "loss": 0.2464, + "epoch": 4.283165747648394, + "grad_norm": 0.25542962551116943, + "learning_rate": 2.657457450834394e-06, + "loss": 0.4073, "step": 118845 }, { - "epoch": 4.18, - "learning_rate": 3.444224580232211e-06, - "loss": 0.2539, + "epoch": 4.283345947309619, + "grad_norm": 0.26905080676078796, + "learning_rate": 2.656148348411866e-06, + "loss": 0.3513, "step": 118850 }, { - "epoch": 4.18, - "learning_rate": 3.4427817994608702e-06, - "loss": 0.2403, + "epoch": 4.283526146970844, + "grad_norm": 
0.2789619266986847, + "learning_rate": 2.6548395504199963e-06, + "loss": 0.3766, "step": 118855 }, { - "epoch": 4.18, - "learning_rate": 3.441339298595808e-06, - "loss": 0.2511, + "epoch": 4.283706346632068, + "grad_norm": 0.21455584466457367, + "learning_rate": 2.6535310568766013e-06, + "loss": 0.3954, "step": 118860 }, { - "epoch": 4.18, - "learning_rate": 3.439897077655749e-06, - "loss": 0.2489, + "epoch": 4.283886546293293, + "grad_norm": 0.3085833787918091, + "learning_rate": 2.6522228677995252e-06, + "loss": 0.4161, "step": 118865 }, { - "epoch": 4.18, - "learning_rate": 3.438455136659416e-06, - "loss": 0.2584, + "epoch": 4.284066745954518, + "grad_norm": 0.22385522723197937, + "learning_rate": 2.6509149832065817e-06, + "loss": 0.3982, "step": 118870 }, { - "epoch": 4.18, - "learning_rate": 3.437013475625528e-06, - "loss": 0.2549, + "epoch": 4.284246945615743, + "grad_norm": 0.2983032464981079, + "learning_rate": 2.6496074031156004e-06, + "loss": 0.3327, "step": 118875 }, { - "epoch": 4.18, - "learning_rate": 3.435572094572817e-06, - "loss": 0.2346, + "epoch": 4.284427145276966, + "grad_norm": 0.27663928270339966, + "learning_rate": 2.6483001275443896e-06, + "loss": 0.3619, "step": 118880 }, { - "epoch": 4.18, - "learning_rate": 3.434130993519982e-06, - "loss": 0.269, + "epoch": 4.284607344938191, + "grad_norm": 0.28087663650512695, + "learning_rate": 2.6469931565107575e-06, + "loss": 0.3947, "step": 118885 }, { - "epoch": 4.18, - "learning_rate": 3.43269017248575e-06, - "loss": 0.2394, + "epoch": 4.284787544599416, + "grad_norm": 0.27490758895874023, + "learning_rate": 2.6456864900325174e-06, + "loss": 0.3937, "step": 118890 }, { - "epoch": 4.18, - "learning_rate": 3.4312496314888198e-06, - "loss": 0.2548, + "epoch": 4.284967744260641, + "grad_norm": 0.28742140531539917, + "learning_rate": 2.644380128127474e-06, + "loss": 0.3938, "step": 118895 }, { - "epoch": 4.18, - "learning_rate": 3.4298093705479012e-06, - "loss": 0.2422, + "epoch": 4.285147943921865, + "grad_norm": 0.2435084581375122, + "learning_rate": 2.6430740708134226e-06, + "loss": 0.3626, "step": 118900 }, { - "epoch": 4.18, - "learning_rate": 3.428369389681696e-06, - "loss": 0.24, + "epoch": 4.28532814358309, + "grad_norm": 0.27407246828079224, + "learning_rate": 2.6417683181081592e-06, + "loss": 0.3699, "step": 118905 }, { - "epoch": 4.18, - "learning_rate": 3.4269296889088946e-06, - "loss": 0.2584, + "epoch": 4.285508343244315, + "grad_norm": 0.2970322370529175, + "learning_rate": 2.640462870029467e-06, + "loss": 0.3878, "step": 118910 }, { - "epoch": 4.18, - "learning_rate": 3.425490268248191e-06, - "loss": 0.2567, + "epoch": 4.28568854290554, + "grad_norm": 0.27050116658210754, + "learning_rate": 2.6391577265951455e-06, + "loss": 0.4109, "step": 118915 }, { - "epoch": 4.18, - "learning_rate": 3.4240511277182808e-06, - "loss": 0.2364, + "epoch": 4.285868742566764, + "grad_norm": 0.18248991668224335, + "learning_rate": 2.6378528878229697e-06, + "loss": 0.3786, "step": 118920 }, { - "epoch": 4.18, - "learning_rate": 3.422612267337852e-06, - "loss": 0.25, + "epoch": 4.286048942227989, + "grad_norm": 0.24371817708015442, + "learning_rate": 2.6365483537307197e-06, + "loss": 0.3315, "step": 118925 }, { - "epoch": 4.18, - "learning_rate": 3.421173687125573e-06, - "loss": 0.2475, + "epoch": 4.286229141889213, + "grad_norm": 0.2470201700925827, + "learning_rate": 2.6352441243361698e-06, + "loss": 0.3701, "step": 118930 }, { - "epoch": 4.18, - "learning_rate": 3.41973538710014e-06, - "loss": 0.27, + "epoch": 4.286409341550438, 
+ "grad_norm": 0.19732847809791565, + "learning_rate": 2.6339401996570813e-06, + "loss": 0.3634, "step": 118935 }, { - "epoch": 4.18, - "learning_rate": 3.4182973672802153e-06, - "loss": 0.2454, + "epoch": 4.286589541211662, + "grad_norm": 0.21893630921840668, + "learning_rate": 2.632636579711234e-06, + "loss": 0.3664, "step": 118940 }, { - "epoch": 4.18, - "learning_rate": 3.4168596276844823e-06, - "loss": 0.2377, + "epoch": 4.286769740872887, + "grad_norm": 0.26393207907676697, + "learning_rate": 2.6313332645163836e-06, + "loss": 0.3683, "step": 118945 }, { - "epoch": 4.18, - "learning_rate": 3.415709637781853e-06, - "loss": 0.241, + "epoch": 4.286949940534112, + "grad_norm": 0.2583698630332947, + "learning_rate": 2.630030254090285e-06, + "loss": 0.3745, "step": 118950 }, { - "epoch": 4.19, - "learning_rate": 3.4142724026366953e-06, - "loss": 0.2411, + "epoch": 4.287130140195337, + "grad_norm": 0.22545069456100464, + "learning_rate": 2.6287275484506934e-06, + "loss": 0.4017, "step": 118955 }, { - "epoch": 4.19, - "learning_rate": 3.412835447767987e-06, - "loss": 0.2634, + "epoch": 4.287310339856561, + "grad_norm": 0.3334513008594513, + "learning_rate": 2.6274251476153587e-06, + "loss": 0.382, "step": 118960 }, { - "epoch": 4.19, - "learning_rate": 3.4113987731943743e-06, - "loss": 0.2555, + "epoch": 4.287490539517786, + "grad_norm": 0.32256385684013367, + "learning_rate": 2.62612305160202e-06, + "loss": 0.3868, "step": 118965 }, { - "epoch": 4.19, - "learning_rate": 3.409962378934525e-06, - "loss": 0.2322, + "epoch": 4.287670739179011, + "grad_norm": 0.28778645396232605, + "learning_rate": 2.6248212604284313e-06, + "loss": 0.3516, "step": 118970 }, { - "epoch": 4.19, - "learning_rate": 3.408526265007089e-06, - "loss": 0.2418, + "epoch": 4.287850938840235, + "grad_norm": 0.21005696058273315, + "learning_rate": 2.6235197741123122e-06, + "loss": 0.3425, "step": 118975 }, { - "epoch": 4.19, - "learning_rate": 3.407090431430712e-06, - "loss": 0.2518, + "epoch": 4.288031138501459, + "grad_norm": 0.20069657266139984, + "learning_rate": 2.6222185926714076e-06, + "loss": 0.3665, "step": 118980 }, { - "epoch": 4.19, - "learning_rate": 3.4056548782240265e-06, - "loss": 0.2366, + "epoch": 4.288211338162684, + "grad_norm": 0.30215156078338623, + "learning_rate": 2.6209177161234445e-06, + "loss": 0.3804, "step": 118985 }, { - "epoch": 4.19, - "learning_rate": 3.4042196054056896e-06, - "loss": 0.2806, + "epoch": 4.288391537823909, + "grad_norm": 0.22637207806110382, + "learning_rate": 2.6196171444861417e-06, + "loss": 0.3878, "step": 118990 }, { - "epoch": 4.19, - "learning_rate": 3.402784612994328e-06, - "loss": 0.2627, + "epoch": 4.2885717374851335, + "grad_norm": 0.28523603081703186, + "learning_rate": 2.6183168777772244e-06, + "loss": 0.3649, "step": 118995 }, { - "epoch": 4.19, - "learning_rate": 3.401349901008577e-06, - "loss": 0.2632, + "epoch": 4.288751937146358, + "grad_norm": 0.22392310202121735, + "learning_rate": 2.617016916014406e-06, + "loss": 0.3961, "step": 119000 }, { - "epoch": 4.19, - "eval_loss": 0.2492852360010147, - "eval_runtime": 10.5487, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 4.288751937146358, + "eval_loss": 0.4290127456188202, + "eval_runtime": 3.5355, + "eval_samples_per_second": 28.284, + "eval_steps_per_second": 7.071, "step": 119000 }, { - "epoch": 4.19, - "learning_rate": 3.3999154694670548e-06, - "loss": 0.254, + "epoch": 4.288932136807583, + "grad_norm": 0.2049478143453598, + "learning_rate": 2.6157172592153944e-06, + "loss": 
0.3615, "step": 119005 }, { - "epoch": 4.19, - "learning_rate": 3.3984813183884022e-06, - "loss": 0.2624, + "epoch": 4.289112336468808, + "grad_norm": 0.23813143372535706, + "learning_rate": 2.614417907397906e-06, + "loss": 0.3763, "step": 119010 }, { - "epoch": 4.19, - "learning_rate": 3.397047447791235e-06, - "loss": 0.2382, + "epoch": 4.2892925361300325, + "grad_norm": 0.29590684175491333, + "learning_rate": 2.6131188605796384e-06, + "loss": 0.3898, "step": 119015 }, { - "epoch": 4.19, - "learning_rate": 3.3956138576941633e-06, - "loss": 0.258, + "epoch": 4.289472735791256, + "grad_norm": 0.24687814712524414, + "learning_rate": 2.6118201187782937e-06, + "loss": 0.3756, "step": 119020 }, { - "epoch": 4.19, - "learning_rate": 3.3941805481158133e-06, - "loss": 0.2312, + "epoch": 4.289652935452481, + "grad_norm": 0.2702934443950653, + "learning_rate": 2.6105216820115657e-06, + "loss": 0.3699, "step": 119025 }, { - "epoch": 4.19, - "learning_rate": 3.392747519074785e-06, - "loss": 0.245, + "epoch": 4.289833135113706, + "grad_norm": 0.24939516186714172, + "learning_rate": 2.609223550297141e-06, + "loss": 0.3705, "step": 119030 }, { - "epoch": 4.19, - "learning_rate": 3.391314770589696e-06, - "loss": 0.2552, + "epoch": 4.2900133347749305, + "grad_norm": 0.2283768504858017, + "learning_rate": 2.607925723652713e-06, + "loss": 0.3792, "step": 119035 }, { - "epoch": 4.19, - "learning_rate": 3.389882302679137e-06, - "loss": 0.2511, + "epoch": 4.290193534436155, + "grad_norm": 0.24190238118171692, + "learning_rate": 2.6066282020959678e-06, + "loss": 0.3742, "step": 119040 }, { - "epoch": 4.19, - "learning_rate": 3.3884501153617214e-06, - "loss": 0.2263, + "epoch": 4.29037373409738, + "grad_norm": 0.1982034593820572, + "learning_rate": 2.605330985644566e-06, + "loss": 0.3406, "step": 119045 }, { - "epoch": 4.19, - "learning_rate": 3.3870182086560396e-06, - "loss": 0.268, + "epoch": 4.290553933758605, + "grad_norm": 0.2854166030883789, + "learning_rate": 2.6040340743162e-06, + "loss": 0.3875, "step": 119050 }, { - "epoch": 4.19, - "learning_rate": 3.3855865825806797e-06, - "loss": 0.2514, + "epoch": 4.2907341334198295, + "grad_norm": 0.2228785753250122, + "learning_rate": 2.60273746812853e-06, + "loss": 0.3492, "step": 119055 }, { - "epoch": 4.19, - "learning_rate": 3.3841552371542297e-06, - "loss": 0.2475, + "epoch": 4.290914333081054, + "grad_norm": 0.3077186346054077, + "learning_rate": 2.6014411670992305e-06, + "loss": 0.4009, "step": 119060 }, { - "epoch": 4.19, - "learning_rate": 3.382724172395285e-06, - "loss": 0.2654, + "epoch": 4.291094532742278, + "grad_norm": 0.26383304595947266, + "learning_rate": 2.60014517124596e-06, + "loss": 0.3999, "step": 119065 }, { - "epoch": 4.19, - "learning_rate": 3.381293388322421e-06, - "loss": 0.2376, + "epoch": 4.291274732403503, + "grad_norm": 0.2802242934703827, + "learning_rate": 2.5988494805863682e-06, + "loss": 0.3834, "step": 119070 }, { - "epoch": 4.19, - "learning_rate": 3.3798628849542072e-06, - "loss": 0.2617, + "epoch": 4.2914549320647275, + "grad_norm": 0.21582341194152832, + "learning_rate": 2.597554095138122e-06, + "loss": 0.367, "step": 119075 }, { - "epoch": 4.19, - "learning_rate": 3.3784326623092333e-06, - "loss": 0.2533, + "epoch": 4.291635131725952, + "grad_norm": 0.2001202404499054, + "learning_rate": 2.5962590149188616e-06, + "loss": 0.3211, "step": 119080 }, { - "epoch": 4.19, - "learning_rate": 3.3770027204060555e-06, - "loss": 0.2464, + "epoch": 4.291815331387177, + "grad_norm": 0.27572277188301086, + "learning_rate": 
2.594964239946232e-06, + "loss": 0.3591, "step": 119085 }, { - "epoch": 4.19, - "learning_rate": 3.375573059263254e-06, - "loss": 0.2422, + "epoch": 4.291995531048402, + "grad_norm": 0.2560023069381714, + "learning_rate": 2.5936697702378804e-06, + "loss": 0.3773, "step": 119090 }, { - "epoch": 4.19, - "learning_rate": 3.374143678899383e-06, - "loss": 0.2526, + "epoch": 4.292175730709626, + "grad_norm": 0.23489350080490112, + "learning_rate": 2.592375605811434e-06, + "loss": 0.3515, "step": 119095 }, { - "epoch": 4.19, - "learning_rate": 3.372714579333003e-06, - "loss": 0.2593, + "epoch": 4.292355930370851, + "grad_norm": 0.25158292055130005, + "learning_rate": 2.591081746684537e-06, + "loss": 0.3998, "step": 119100 }, { - "epoch": 4.19, - "learning_rate": 3.3712857605826738e-06, - "loss": 0.255, + "epoch": 4.292536130032076, + "grad_norm": 0.23154456913471222, + "learning_rate": 2.5897881928748145e-06, + "loss": 0.3961, "step": 119105 }, { - "epoch": 4.19, - "learning_rate": 3.3698572226669446e-06, - "loss": 0.2451, + "epoch": 4.292716329693301, + "grad_norm": 0.2787790298461914, + "learning_rate": 2.5884949443998857e-06, + "loss": 0.375, "step": 119110 }, { - "epoch": 4.19, - "learning_rate": 3.3684289656043645e-06, - "loss": 0.2499, + "epoch": 4.2928965293545245, + "grad_norm": 0.23555888235569, + "learning_rate": 2.587202001277378e-06, + "loss": 0.3809, "step": 119115 }, { - "epoch": 4.19, - "learning_rate": 3.3670009894134743e-06, - "loss": 0.2308, + "epoch": 4.293076729015749, + "grad_norm": 0.25257548689842224, + "learning_rate": 2.5859093635248964e-06, + "loss": 0.3726, "step": 119120 }, { - "epoch": 4.19, - "learning_rate": 3.3655732941128255e-06, - "loss": 0.2485, + "epoch": 4.293256928676974, + "grad_norm": 0.23325851559638977, + "learning_rate": 2.5846170311600637e-06, + "loss": 0.3535, "step": 119125 }, { - "epoch": 4.19, - "learning_rate": 3.364145879720948e-06, - "loss": 0.2657, + "epoch": 4.293437128338199, + "grad_norm": 0.2508438527584076, + "learning_rate": 2.5833250042004876e-06, + "loss": 0.3674, "step": 119130 }, { - "epoch": 4.19, - "learning_rate": 3.362718746256374e-06, - "loss": 0.2777, + "epoch": 4.293617327999423, + "grad_norm": 0.3049331605434418, + "learning_rate": 2.582033282663765e-06, + "loss": 0.409, "step": 119135 }, { - "epoch": 4.19, - "learning_rate": 3.3612918937376366e-06, - "loss": 0.2472, + "epoch": 4.293797527660648, + "grad_norm": 0.22652621567249298, + "learning_rate": 2.5807418665675017e-06, + "loss": 0.3541, "step": 119140 }, { - "epoch": 4.19, - "learning_rate": 3.35986532218327e-06, - "loss": 0.2415, + "epoch": 4.293977727321873, + "grad_norm": 0.2690187096595764, + "learning_rate": 2.5794507559292885e-06, + "loss": 0.3622, "step": 119145 }, { - "epoch": 4.19, - "learning_rate": 3.3584390316117904e-06, - "loss": 0.2415, + "epoch": 4.294157926983098, + "grad_norm": 0.24629640579223633, + "learning_rate": 2.578159950766712e-06, + "loss": 0.3736, "step": 119150 }, { - "epoch": 4.19, - "learning_rate": 3.3570130220417107e-06, - "loss": 0.2461, + "epoch": 4.2943381266443215, + "grad_norm": 0.2614523470401764, + "learning_rate": 2.576869451097377e-06, + "loss": 0.384, "step": 119155 }, { - "epoch": 4.19, - "learning_rate": 3.3555872934915604e-06, - "loss": 0.2749, + "epoch": 4.294518326305546, + "grad_norm": 0.2685030996799469, + "learning_rate": 2.5755792569388455e-06, + "loss": 0.3787, "step": 119160 }, { - "epoch": 4.19, - "learning_rate": 3.3541618459798447e-06, - "loss": 0.2483, + "epoch": 4.294698525966771, + "grad_norm": 0.23941963911056519, 
+ "learning_rate": 2.5742893683087076e-06, + "loss": 0.4015, "step": 119165 }, { - "epoch": 4.19, - "learning_rate": 3.352736679525073e-06, - "loss": 0.2565, + "epoch": 4.294878725627996, + "grad_norm": 0.2448320984840393, + "learning_rate": 2.5729997852245363e-06, + "loss": 0.4058, "step": 119170 }, { - "epoch": 4.19, - "learning_rate": 3.3513117941457424e-06, - "loss": 0.2477, + "epoch": 4.29505892528922, + "grad_norm": 0.2499290406703949, + "learning_rate": 2.571710507703895e-06, + "loss": 0.3464, "step": 119175 }, { - "epoch": 4.19, - "learning_rate": 3.3498871898603677e-06, - "loss": 0.2491, + "epoch": 4.295239124950445, + "grad_norm": 0.263079971075058, + "learning_rate": 2.5704215357643666e-06, + "loss": 0.3939, "step": 119180 }, { - "epoch": 4.19, - "learning_rate": 3.3484628666874405e-06, - "loss": 0.254, + "epoch": 4.29541932461167, + "grad_norm": 0.22637499868869781, + "learning_rate": 2.5691328694234964e-06, + "loss": 0.393, "step": 119185 }, { - "epoch": 4.19, - "learning_rate": 3.347038824645446e-06, - "loss": 0.2663, + "epoch": 4.295599524272895, + "grad_norm": 0.29773595929145813, + "learning_rate": 2.567844508698844e-06, + "loss": 0.4092, "step": 119190 }, { - "epoch": 4.19, - "learning_rate": 3.3456150637528856e-06, - "loss": 0.2688, + "epoch": 4.295779723934119, + "grad_norm": 0.25655168294906616, + "learning_rate": 2.566556453607974e-06, + "loss": 0.3502, "step": 119195 }, { - "epoch": 4.19, - "learning_rate": 3.3441915840282473e-06, - "loss": 0.274, + "epoch": 4.295959923595344, + "grad_norm": 0.28525644540786743, + "learning_rate": 2.5652687041684247e-06, + "loss": 0.3887, "step": 119200 }, { - "epoch": 4.19, - "learning_rate": 3.342768385490008e-06, - "loss": 0.2508, + "epoch": 4.296140123256568, + "grad_norm": 0.2969033718109131, + "learning_rate": 2.563981260397749e-06, + "loss": 0.3525, "step": 119205 }, { - "epoch": 4.19, - "learning_rate": 3.341345468156651e-06, - "loss": 0.2563, + "epoch": 4.296320322917793, + "grad_norm": 0.20924274623394012, + "learning_rate": 2.562694122313483e-06, + "loss": 0.3388, "step": 119210 }, { - "epoch": 4.19, - "learning_rate": 3.339922832046641e-06, - "loss": 0.2641, + "epoch": 4.296500522579017, + "grad_norm": 0.24764671921730042, + "learning_rate": 2.5614072899331625e-06, + "loss": 0.3648, "step": 119215 }, { - "epoch": 4.19, - "learning_rate": 3.3385004771784663e-06, - "loss": 0.263, + "epoch": 4.296680722240242, + "grad_norm": 0.2710559368133545, + "learning_rate": 2.560120763274329e-06, + "loss": 0.342, "step": 119220 }, { - "epoch": 4.19, - "learning_rate": 3.337078403570587e-06, - "loss": 0.2462, + "epoch": 4.296860921901467, + "grad_norm": 0.3236762285232544, + "learning_rate": 2.5588345423545046e-06, + "loss": 0.3871, "step": 119225 }, { - "epoch": 4.19, - "learning_rate": 3.3356566112414695e-06, - "loss": 0.2361, + "epoch": 4.297041121562692, + "grad_norm": 0.2507827877998352, + "learning_rate": 2.5575486271912143e-06, + "loss": 0.3975, "step": 119230 }, { - "epoch": 4.2, - "learning_rate": 3.334235100209565e-06, - "loss": 0.2614, + "epoch": 4.297221321223916, + "grad_norm": 0.3020266890525818, + "learning_rate": 2.556263017801977e-06, + "loss": 0.3805, "step": 119235 }, { - "epoch": 4.2, - "learning_rate": 3.332813870493348e-06, - "loss": 0.2713, + "epoch": 4.297401520885141, + "grad_norm": 0.2670553922653198, + "learning_rate": 2.55497771420431e-06, + "loss": 0.3863, "step": 119240 }, { - "epoch": 4.2, - "learning_rate": 3.3313929221112564e-06, - "loss": 0.2877, + "epoch": 4.297581720546366, + "grad_norm": 
0.29104623198509216, + "learning_rate": 2.553692716415729e-06, + "loss": 0.3768, "step": 119245 }, { - "epoch": 4.2, - "learning_rate": 3.3299722550817536e-06, - "loss": 0.2484, + "epoch": 4.29776192020759, + "grad_norm": 0.2508760392665863, + "learning_rate": 2.552408024453742e-06, + "loss": 0.3921, "step": 119250 }, { - "epoch": 4.2, - "learning_rate": 3.328551869423274e-06, - "loss": 0.2591, + "epoch": 4.297942119868814, + "grad_norm": 0.27478477358818054, + "learning_rate": 2.5511236383358422e-06, + "loss": 0.3742, "step": 119255 }, { - "epoch": 4.2, - "learning_rate": 3.3271317651542733e-06, - "loss": 0.2664, + "epoch": 4.298122319530039, + "grad_norm": 0.28668543696403503, + "learning_rate": 2.549839558079542e-06, + "loss": 0.374, "step": 119260 }, { - "epoch": 4.2, - "learning_rate": 3.325711942293183e-06, - "loss": 0.2699, + "epoch": 4.298302519191264, + "grad_norm": 0.22271794080734253, + "learning_rate": 2.5485557837023303e-06, + "loss": 0.3471, "step": 119265 }, { - "epoch": 4.2, - "learning_rate": 3.324292400858439e-06, - "loss": 0.2607, + "epoch": 4.298482718852489, + "grad_norm": 0.22481966018676758, + "learning_rate": 2.5472723152216992e-06, + "loss": 0.3503, "step": 119270 }, { - "epoch": 4.2, - "learning_rate": 3.3228731408684654e-06, - "loss": 0.2423, + "epoch": 4.298662918513713, + "grad_norm": 0.22791920602321625, + "learning_rate": 2.545989152655137e-06, + "loss": 0.3945, "step": 119275 }, { - "epoch": 4.2, - "learning_rate": 3.321454162341708e-06, - "loss": 0.2607, + "epoch": 4.298843118174938, + "grad_norm": 0.3118433952331543, + "learning_rate": 2.544706296020119e-06, + "loss": 0.368, "step": 119280 }, { - "epoch": 4.2, - "learning_rate": 3.320035465296578e-06, - "loss": 0.2525, + "epoch": 4.299023317836163, + "grad_norm": 0.2143528163433075, + "learning_rate": 2.5434237453341364e-06, + "loss": 0.3645, "step": 119285 }, { - "epoch": 4.2, - "learning_rate": 3.3186170497514963e-06, - "loss": 0.2772, + "epoch": 4.2992035174973875, + "grad_norm": 0.28790873289108276, + "learning_rate": 2.5421415006146558e-06, + "loss": 0.391, "step": 119290 }, { - "epoch": 4.2, - "learning_rate": 3.3171989157248897e-06, - "loss": 0.2663, + "epoch": 4.299383717158611, + "grad_norm": 0.25662392377853394, + "learning_rate": 2.5408595618791497e-06, + "loss": 0.3487, "step": 119295 }, { - "epoch": 4.2, - "learning_rate": 3.3157810632351597e-06, - "loss": 0.2536, + "epoch": 4.299563916819836, + "grad_norm": 0.26446598768234253, + "learning_rate": 2.5395779291450846e-06, + "loss": 0.3973, "step": 119300 }, { - "epoch": 4.2, - "learning_rate": 3.314363492300726e-06, - "loss": 0.2519, + "epoch": 4.299744116481061, + "grad_norm": 0.24997925758361816, + "learning_rate": 2.538296602429921e-06, + "loss": 0.3521, "step": 119305 }, { - "epoch": 4.2, - "learning_rate": 3.312946202939987e-06, - "loss": 0.2738, + "epoch": 4.2999243161422855, + "grad_norm": 0.22187812626361847, + "learning_rate": 2.537015581751115e-06, + "loss": 0.3569, "step": 119310 }, { - "epoch": 4.2, - "learning_rate": 3.311529195171356e-06, - "loss": 0.2578, + "epoch": 4.30010451580351, + "grad_norm": 0.23413535952568054, + "learning_rate": 2.5357348671261246e-06, + "loss": 0.3708, "step": 119315 }, { - "epoch": 4.2, - "learning_rate": 3.310112469013224e-06, - "loss": 0.2636, + "epoch": 4.300284715464735, + "grad_norm": 0.2146236151456833, + "learning_rate": 2.5344544585723997e-06, + "loss": 0.3627, "step": 119320 }, { - "epoch": 4.2, - "learning_rate": 3.308696024483987e-06, - "loss": 0.2374, + "epoch": 4.30046491512596, + 
"grad_norm": 0.24262899160385132, + "learning_rate": 2.5331743561073816e-06, + "loss": 0.3694, "step": 119325 }, { - "epoch": 4.2, - "learning_rate": 3.3072798616020306e-06, - "loss": 0.2472, + "epoch": 4.3006451147871845, + "grad_norm": 0.21525375545024872, + "learning_rate": 2.5318945597485125e-06, + "loss": 0.359, "step": 119330 }, { - "epoch": 4.2, - "learning_rate": 3.3058639803857568e-06, - "loss": 0.2488, + "epoch": 4.300825314448409, + "grad_norm": 0.25374144315719604, + "learning_rate": 2.5306150695132286e-06, + "loss": 0.3611, "step": 119335 }, { - "epoch": 4.2, - "learning_rate": 3.304448380853542e-06, - "loss": 0.2489, + "epoch": 4.301005514109633, + "grad_norm": 0.2709214389324188, + "learning_rate": 2.529335885418965e-06, + "loss": 0.391, "step": 119340 }, { - "epoch": 4.2, - "learning_rate": 3.3030330630237666e-06, - "loss": 0.2465, + "epoch": 4.301185713770858, + "grad_norm": 0.30983975529670715, + "learning_rate": 2.5280570074831505e-06, + "loss": 0.3709, "step": 119345 }, { - "epoch": 4.2, - "learning_rate": 3.3016180269148018e-06, - "loss": 0.2632, + "epoch": 4.3013659134320825, + "grad_norm": 0.23917073011398315, + "learning_rate": 2.5267784357232123e-06, + "loss": 0.3768, "step": 119350 }, { - "epoch": 4.2, - "learning_rate": 3.3002032725450274e-06, - "loss": 0.2499, + "epoch": 4.301546113093307, + "grad_norm": 0.22569625079631805, + "learning_rate": 2.525500170156564e-06, + "loss": 0.3792, "step": 119355 }, { - "epoch": 4.2, - "learning_rate": 3.2987887999328173e-06, - "loss": 0.2446, + "epoch": 4.301726312754532, + "grad_norm": 0.19801470637321472, + "learning_rate": 2.5242222108006196e-06, + "loss": 0.3409, "step": 119360 }, { - "epoch": 4.2, - "learning_rate": 3.2973746090965347e-06, - "loss": 0.2313, + "epoch": 4.301906512415757, + "grad_norm": 0.3329550623893738, + "learning_rate": 2.5229445576728044e-06, + "loss": 0.339, "step": 119365 }, { - "epoch": 4.2, - "learning_rate": 3.2959607000545346e-06, - "loss": 0.2592, + "epoch": 4.3020867120769815, + "grad_norm": 0.2501632273197174, + "learning_rate": 2.5216672107905153e-06, + "loss": 0.3723, "step": 119370 }, { - "epoch": 4.2, - "learning_rate": 3.2945470728251855e-06, - "loss": 0.2602, + "epoch": 4.302266911738206, + "grad_norm": 0.255783349275589, + "learning_rate": 2.5203901701711525e-06, + "loss": 0.3433, "step": 119375 }, { - "epoch": 4.2, - "learning_rate": 3.2931337274268393e-06, - "loss": 0.2268, + "epoch": 4.302447111399431, + "grad_norm": 0.30260223150253296, + "learning_rate": 2.5191134358321267e-06, + "loss": 0.3702, "step": 119380 }, { - "epoch": 4.2, - "learning_rate": 3.2917206638778453e-06, - "loss": 0.2442, + "epoch": 4.302627311060656, + "grad_norm": 0.33803072571754456, + "learning_rate": 2.5178370077908297e-06, + "loss": 0.374, "step": 119385 }, { - "epoch": 4.2, - "learning_rate": 3.29030788219655e-06, - "loss": 0.2654, + "epoch": 4.3028075107218795, + "grad_norm": 0.20354852080345154, + "learning_rate": 2.5165608860646476e-06, + "loss": 0.3368, "step": 119390 }, { - "epoch": 4.2, - "learning_rate": 3.2888953824013026e-06, - "loss": 0.2611, + "epoch": 4.302987710383104, + "grad_norm": 0.2563473880290985, + "learning_rate": 2.515285070670972e-06, + "loss": 0.4105, "step": 119395 }, { - "epoch": 4.2, - "learning_rate": 3.2874831645104383e-06, - "loss": 0.2603, + "epoch": 4.303167910044329, + "grad_norm": 0.2289261668920517, + "learning_rate": 2.5140095616271816e-06, + "loss": 0.3794, "step": 119400 }, { - "epoch": 4.2, - "learning_rate": 3.286071228542295e-06, - "loss": 0.2546, + "epoch": 
4.303348109705554, + "grad_norm": 0.1935594230890274, + "learning_rate": 2.5127343589506607e-06, + "loss": 0.3595, "step": 119405 }, { - "epoch": 4.2, - "learning_rate": 3.2846595745152033e-06, - "loss": 0.2498, + "epoch": 4.303528309366778, + "grad_norm": 0.2854008078575134, + "learning_rate": 2.51145946265878e-06, + "loss": 0.399, "step": 119410 }, { - "epoch": 4.2, - "learning_rate": 3.283248202447503e-06, - "loss": 0.277, + "epoch": 4.303708509028003, + "grad_norm": 0.22041873633861542, + "learning_rate": 2.510184872768909e-06, + "loss": 0.3513, "step": 119415 }, { - "epoch": 4.2, - "learning_rate": 3.2818371123575136e-06, - "loss": 0.2537, + "epoch": 4.303888708689228, + "grad_norm": 0.2398824393749237, + "learning_rate": 2.508910589298416e-06, + "loss": 0.3586, "step": 119420 }, { - "epoch": 4.2, - "learning_rate": 3.280426304263548e-06, - "loss": 0.2423, + "epoch": 4.304068908350453, + "grad_norm": 0.2502867877483368, + "learning_rate": 2.5076366122646577e-06, + "loss": 0.3853, "step": 119425 }, { - "epoch": 4.2, - "learning_rate": 3.2790157781839414e-06, - "loss": 0.2577, + "epoch": 4.3042491080116765, + "grad_norm": 0.29783013463020325, + "learning_rate": 2.5063629416850006e-06, + "loss": 0.3828, "step": 119430 }, { - "epoch": 4.2, - "learning_rate": 3.2776055341370014e-06, - "loss": 0.2648, + "epoch": 4.304429307672901, + "grad_norm": 0.22847159206867218, + "learning_rate": 2.505089577576797e-06, + "loss": 0.3618, "step": 119435 }, { - "epoch": 4.2, - "learning_rate": 3.2761955721410357e-06, - "loss": 0.225, + "epoch": 4.304609507334126, + "grad_norm": 0.2972005009651184, + "learning_rate": 2.503816519957383e-06, + "loss": 0.3382, "step": 119440 }, { - "epoch": 4.2, - "learning_rate": 3.2747858922143493e-06, - "loss": 0.2659, + "epoch": 4.304789706995351, + "grad_norm": 0.26005420088768005, + "learning_rate": 2.5025437688441206e-06, + "loss": 0.3663, "step": 119445 }, { - "epoch": 4.2, - "learning_rate": 3.273376494375255e-06, - "loss": 0.28, + "epoch": 4.304969906656575, + "grad_norm": 0.23355317115783691, + "learning_rate": 2.5012713242543422e-06, + "loss": 0.3487, "step": 119450 }, { - "epoch": 4.2, - "learning_rate": 3.271967378642049e-06, - "loss": 0.2796, + "epoch": 4.3051501063178, + "grad_norm": 0.23067983984947205, + "learning_rate": 2.4999991862053818e-06, + "loss": 0.3627, "step": 119455 }, { - "epoch": 4.2, - "learning_rate": 3.270558545033023e-06, - "loss": 0.2658, + "epoch": 4.305330305979025, + "grad_norm": 0.28559330105781555, + "learning_rate": 2.498727354714586e-06, + "loss": 0.3992, "step": 119460 }, { - "epoch": 4.2, - "learning_rate": 3.2691499935664784e-06, - "loss": 0.2435, + "epoch": 4.30551050564025, + "grad_norm": 0.20493915677070618, + "learning_rate": 2.497455829799261e-06, + "loss": 0.3717, "step": 119465 }, { - "epoch": 4.2, - "learning_rate": 3.267741724260695e-06, - "loss": 0.2611, + "epoch": 4.305690705301474, + "grad_norm": 0.2629398703575134, + "learning_rate": 2.496184611476754e-06, + "loss": 0.3634, "step": 119470 }, { - "epoch": 4.2, - "learning_rate": 3.266333737133967e-06, - "loss": 0.2511, + "epoch": 4.305870904962699, + "grad_norm": 0.2835977077484131, + "learning_rate": 2.49491369976437e-06, + "loss": 0.3968, "step": 119475 }, { - "epoch": 4.2, - "learning_rate": 3.264926032204574e-06, - "loss": 0.2607, + "epoch": 4.306051104623923, + "grad_norm": 0.29381829500198364, + "learning_rate": 2.4936430946794322e-06, + "loss": 0.39, "step": 119480 }, { - "epoch": 4.2, - "learning_rate": 3.2635186094907872e-06, - "loss": 0.2444, + "epoch": 
4.306231304285148, + "grad_norm": 0.21818934381008148, + "learning_rate": 2.492372796239251e-06, + "loss": 0.3773, "step": 119485 }, { - "epoch": 4.2, - "learning_rate": 3.262111469010892e-06, - "loss": 0.2625, + "epoch": 4.306411503946372, + "grad_norm": 0.3064677119255066, + "learning_rate": 2.491102804461129e-06, + "loss": 0.3716, "step": 119490 }, { - "epoch": 4.2, - "learning_rate": 3.2607046107831522e-06, - "loss": 0.2604, + "epoch": 4.306591703607597, + "grad_norm": 0.20513369143009186, + "learning_rate": 2.4898331193623724e-06, + "loss": 0.3567, "step": 119495 }, { - "epoch": 4.2, - "learning_rate": 3.259298034825836e-06, - "loss": 0.2672, + "epoch": 4.306771903268822, + "grad_norm": 0.28164172172546387, + "learning_rate": 2.488563740960284e-06, + "loss": 0.3665, "step": 119500 }, { - "epoch": 4.2, - "eval_loss": 0.24933363497257233, - "eval_runtime": 10.5337, - "eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 4.306771903268822, + "eval_loss": 0.4290541708469391, + "eval_runtime": 3.5337, + "eval_samples_per_second": 28.299, + "eval_steps_per_second": 7.075, "step": 119500 }, { - "epoch": 4.2, - "learning_rate": 3.257891741157204e-06, - "loss": 0.2517, + "epoch": 4.306952102930047, + "grad_norm": 0.24295520782470703, + "learning_rate": 2.4872946692721554e-06, + "loss": 0.346, "step": 119505 }, { - "epoch": 4.2, - "learning_rate": 3.2564857297955247e-06, - "loss": 0.265, + "epoch": 4.307132302591271, + "grad_norm": 0.29755252599716187, + "learning_rate": 2.4860259043152813e-06, + "loss": 0.3668, "step": 119510 }, { - "epoch": 4.2, - "learning_rate": 3.255080000759042e-06, - "loss": 0.2615, + "epoch": 4.307312502252496, + "grad_norm": 0.24635331332683563, + "learning_rate": 2.4847574461069446e-06, + "loss": 0.3949, "step": 119515 }, { - "epoch": 4.21, - "learning_rate": 3.253674554066019e-06, - "loss": 0.2522, + "epoch": 4.307492701913721, + "grad_norm": 0.2744773328304291, + "learning_rate": 2.483489294664423e-06, + "loss": 0.4012, "step": 119520 }, { - "epoch": 4.21, - "learning_rate": 3.252269389734697e-06, - "loss": 0.2523, + "epoch": 4.307672901574945, + "grad_norm": 0.26855769753456116, + "learning_rate": 2.482221450005004e-06, + "loss": 0.37, "step": 119525 }, { - "epoch": 4.21, - "learning_rate": 3.25086450778333e-06, - "loss": 0.2653, + "epoch": 4.307853101236169, + "grad_norm": 0.2621409595012665, + "learning_rate": 2.4809539121459613e-06, + "loss": 0.3522, "step": 119530 }, { - "epoch": 4.21, - "learning_rate": 3.249459908230154e-06, - "loss": 0.2465, + "epoch": 4.308033300897394, + "grad_norm": 0.2507140040397644, + "learning_rate": 2.4796866811045595e-06, + "loss": 0.3455, "step": 119535 }, { - "epoch": 4.21, - "learning_rate": 3.2480555910934017e-06, - "loss": 0.2678, + "epoch": 4.308213500558619, + "grad_norm": 0.26044005155563354, + "learning_rate": 2.478419756898065e-06, + "loss": 0.3529, "step": 119540 }, { - "epoch": 4.21, - "learning_rate": 3.2466515563913167e-06, - "loss": 0.2462, + "epoch": 4.308393700219844, + "grad_norm": 0.2704107165336609, + "learning_rate": 2.4771531395437394e-06, + "loss": 0.3842, "step": 119545 }, { - "epoch": 4.21, - "learning_rate": 3.2452478041421263e-06, - "loss": 0.2608, + "epoch": 4.308573899881068, + "grad_norm": 0.23864923417568207, + "learning_rate": 2.4758868290588487e-06, + "loss": 0.371, "step": 119550 }, { - "epoch": 4.21, - "learning_rate": 3.243844334364057e-06, - "loss": 0.247, + "epoch": 4.308754099542293, + "grad_norm": 0.3333123028278351, + "learning_rate": 2.4746208254606386e-06, + "loss": 
0.3874, "step": 119555 }, { - "epoch": 4.21, - "learning_rate": 3.242441147075323e-06, - "loss": 0.2504, + "epoch": 4.308934299203518, + "grad_norm": 0.236053004860878, + "learning_rate": 2.47335512876635e-06, + "loss": 0.3794, "step": 119560 }, { - "epoch": 4.21, - "learning_rate": 3.241038242294159e-06, - "loss": 0.2384, + "epoch": 4.3091144988647425, + "grad_norm": 0.24085481464862823, + "learning_rate": 2.4720897389932413e-06, + "loss": 0.3613, "step": 119565 }, { - "epoch": 4.21, - "learning_rate": 3.239635620038775e-06, - "loss": 0.2452, + "epoch": 4.309294698525966, + "grad_norm": 0.2732737064361572, + "learning_rate": 2.470824656158549e-06, + "loss": 0.383, "step": 119570 }, { - "epoch": 4.21, - "learning_rate": 3.238233280327374e-06, - "loss": 0.2408, + "epoch": 4.309474898187191, + "grad_norm": 0.23329199850559235, + "learning_rate": 2.4695598802795096e-06, + "loss": 0.3829, "step": 119575 }, { - "epoch": 4.21, - "learning_rate": 3.2368312231781746e-06, - "loss": 0.2511, + "epoch": 4.309655097848416, + "grad_norm": 0.22309060394763947, + "learning_rate": 2.4682954113733557e-06, + "loss": 0.369, "step": 119580 }, { - "epoch": 4.21, - "learning_rate": 3.235429448609384e-06, - "loss": 0.2409, + "epoch": 4.309835297509641, + "grad_norm": 0.24578943848609924, + "learning_rate": 2.4670312494573076e-06, + "loss": 0.4047, "step": 119585 }, { - "epoch": 4.21, - "learning_rate": 3.234027956639199e-06, - "loss": 0.2468, + "epoch": 4.310015497170865, + "grad_norm": 0.304373562335968, + "learning_rate": 2.4657673945486045e-06, + "loss": 0.3447, "step": 119590 }, { - "epoch": 4.21, - "learning_rate": 3.2326267472858164e-06, - "loss": 0.2512, + "epoch": 4.31019569683209, + "grad_norm": 0.32049259543418884, + "learning_rate": 2.4645038466644538e-06, + "loss": 0.4057, "step": 119595 }, { - "epoch": 4.21, - "learning_rate": 3.231225820567424e-06, - "loss": 0.2381, + "epoch": 4.310375896493315, + "grad_norm": 0.22794209420681, + "learning_rate": 2.4632406058220758e-06, + "loss": 0.3567, "step": 119600 }, { - "epoch": 4.21, - "learning_rate": 3.2298251765022243e-06, - "loss": 0.2532, + "epoch": 4.3105560961545395, + "grad_norm": 0.21800605952739716, + "learning_rate": 2.4619776720386846e-06, + "loss": 0.3863, "step": 119605 }, { - "epoch": 4.21, - "learning_rate": 3.2284248151083967e-06, - "loss": 0.2436, + "epoch": 4.310736295815764, + "grad_norm": 0.23417772352695465, + "learning_rate": 2.460715045331477e-06, + "loss": 0.3833, "step": 119610 }, { - "epoch": 4.21, - "learning_rate": 3.2270247364041246e-06, - "loss": 0.2452, + "epoch": 4.310916495476988, + "grad_norm": 0.26621463894844055, + "learning_rate": 2.4594527257176676e-06, + "loss": 0.3761, "step": 119615 }, { - "epoch": 4.21, - "learning_rate": 3.2256249404075846e-06, - "loss": 0.2474, + "epoch": 4.311096695138213, + "grad_norm": 0.23449282348155975, + "learning_rate": 2.4581907132144506e-06, + "loss": 0.35, "step": 119620 }, { - "epoch": 4.21, - "learning_rate": 3.2242254271369593e-06, - "loss": 0.2411, + "epoch": 4.3112768947994375, + "grad_norm": 0.2845596671104431, + "learning_rate": 2.4569290078390205e-06, + "loss": 0.3927, "step": 119625 }, { - "epoch": 4.21, - "learning_rate": 3.2228261966104097e-06, - "loss": 0.2612, + "epoch": 4.311457094460662, + "grad_norm": 0.24761004745960236, + "learning_rate": 2.4556676096085696e-06, + "loss": 0.3448, "step": 119630 }, { - "epoch": 4.21, - "learning_rate": 3.2214272488461096e-06, - "loss": 0.2593, + "epoch": 4.311637294121887, + "grad_norm": 0.30386558175086975, + "learning_rate": 
2.4544065185402837e-06, + "loss": 0.3739, "step": 119635 }, { - "epoch": 4.21, - "learning_rate": 3.2200285838622305e-06, - "loss": 0.2482, + "epoch": 4.311817493783112, + "grad_norm": 0.32800036668777466, + "learning_rate": 2.4531457346513382e-06, + "loss": 0.3819, "step": 119640 }, { - "epoch": 4.21, - "learning_rate": 3.2186302016769256e-06, - "loss": 0.2464, + "epoch": 4.3119976934443365, + "grad_norm": 0.24963368475437164, + "learning_rate": 2.4518852579589247e-06, + "loss": 0.3603, "step": 119645 }, { - "epoch": 4.21, - "learning_rate": 3.217232102308354e-06, - "loss": 0.253, + "epoch": 4.312177893105561, + "grad_norm": 0.24996010959148407, + "learning_rate": 2.450625088480202e-06, + "loss": 0.3518, "step": 119650 }, { - "epoch": 4.21, - "learning_rate": 3.2158342857746632e-06, - "loss": 0.2417, + "epoch": 4.312358092766786, + "grad_norm": 0.22332359850406647, + "learning_rate": 2.44936522623235e-06, + "loss": 0.4102, "step": 119655 }, { - "epoch": 4.21, - "learning_rate": 3.2144367520940135e-06, - "loss": 0.2467, + "epoch": 4.312538292428011, + "grad_norm": 0.20659637451171875, + "learning_rate": 2.448105671232531e-06, + "loss": 0.3485, "step": 119660 }, { - "epoch": 4.21, - "learning_rate": 3.2130395012845455e-06, - "loss": 0.2488, + "epoch": 4.3127184920892345, + "grad_norm": 0.24972330033779144, + "learning_rate": 2.4468464234979063e-06, + "loss": 0.3618, "step": 119665 }, { - "epoch": 4.21, - "learning_rate": 3.2116425333644007e-06, - "loss": 0.2447, + "epoch": 4.312898691750459, + "grad_norm": 0.2645004987716675, + "learning_rate": 2.4455874830456334e-06, + "loss": 0.3474, "step": 119670 }, { - "epoch": 4.21, - "learning_rate": 3.210245848351712e-06, - "loss": 0.2906, + "epoch": 4.313078891411684, + "grad_norm": 0.20380602777004242, + "learning_rate": 2.444328849892863e-06, + "loss": 0.3755, "step": 119675 }, { - "epoch": 4.21, - "learning_rate": 3.208849446264628e-06, - "loss": 0.2576, + "epoch": 4.313259091072909, + "grad_norm": 0.23130224645137787, + "learning_rate": 2.4430705240567425e-06, + "loss": 0.3706, "step": 119680 }, { - "epoch": 4.21, - "learning_rate": 3.207453327121268e-06, - "loss": 0.2508, + "epoch": 4.3134392907341335, + "grad_norm": 0.23018963634967804, + "learning_rate": 2.441812505554422e-06, + "loss": 0.3971, "step": 119685 }, { - "epoch": 4.21, - "learning_rate": 3.2060574909397677e-06, - "loss": 0.2648, + "epoch": 4.313619490395358, + "grad_norm": 0.26062634587287903, + "learning_rate": 2.4405547944030383e-06, + "loss": 0.3614, "step": 119690 }, { - "epoch": 4.21, - "learning_rate": 3.204661937738246e-06, - "loss": 0.2435, + "epoch": 4.313799690056583, + "grad_norm": 0.2006555050611496, + "learning_rate": 2.4392973906197297e-06, + "loss": 0.3731, "step": 119695 }, { - "epoch": 4.21, - "learning_rate": 3.2032666675348266e-06, - "loss": 0.2678, + "epoch": 4.313979889717808, + "grad_norm": 0.2953641712665558, + "learning_rate": 2.4380402942216245e-06, + "loss": 0.3982, "step": 119700 }, { - "epoch": 4.21, - "learning_rate": 3.201871680347629e-06, - "loss": 0.2452, + "epoch": 4.3141600893790315, + "grad_norm": 0.23595255613327026, + "learning_rate": 2.436783505225848e-06, + "loss": 0.3646, "step": 119705 }, { - "epoch": 4.21, - "learning_rate": 3.2004769761947604e-06, - "loss": 0.2479, + "epoch": 4.314340289040256, + "grad_norm": 0.265407919883728, + "learning_rate": 2.4355270236495338e-06, + "loss": 0.3473, "step": 119710 }, { - "epoch": 4.21, - "learning_rate": 3.1990825550943264e-06, - "loss": 0.2468, + "epoch": 4.314520488701481, + "grad_norm": 
0.19755345582962036, + "learning_rate": 2.434270849509793e-06, + "loss": 0.4052, "step": 119715 }, { - "epoch": 4.21, - "learning_rate": 3.197688417064443e-06, - "loss": 0.2438, + "epoch": 4.314700688362706, + "grad_norm": 0.24743589758872986, + "learning_rate": 2.433014982823745e-06, + "loss": 0.3699, "step": 119720 }, { - "epoch": 4.21, - "learning_rate": 3.1962945621232087e-06, - "loss": 0.229, + "epoch": 4.31488088802393, + "grad_norm": 0.38127291202545166, + "learning_rate": 2.431759423608496e-06, + "loss": 0.3905, "step": 119725 }, { - "epoch": 4.21, - "learning_rate": 3.1949009902887185e-06, - "loss": 0.248, + "epoch": 4.315061087685155, + "grad_norm": 0.24170427024364471, + "learning_rate": 2.430504171881154e-06, + "loss": 0.382, "step": 119730 }, { - "epoch": 4.21, - "learning_rate": 3.1935077015790655e-06, - "loss": 0.2514, + "epoch": 4.31524128734638, + "grad_norm": 0.2505660951137543, + "learning_rate": 2.4292492276588306e-06, + "loss": 0.3779, "step": 119735 }, { - "epoch": 4.21, - "learning_rate": 3.1921146960123437e-06, - "loss": 0.2503, + "epoch": 4.315421487007605, + "grad_norm": 0.2789454162120819, + "learning_rate": 2.4279945909586125e-06, + "loss": 0.3524, "step": 119740 }, { - "epoch": 4.21, - "learning_rate": 3.1907219736066448e-06, - "loss": 0.242, + "epoch": 4.315601686668829, + "grad_norm": 0.2678326964378357, + "learning_rate": 2.4267402617975944e-06, + "loss": 0.3817, "step": 119745 }, { - "epoch": 4.21, - "learning_rate": 3.1893295343800424e-06, - "loss": 0.2572, + "epoch": 4.315781886330054, + "grad_norm": 0.2486988753080368, + "learning_rate": 2.4254862401928728e-06, + "loss": 0.3714, "step": 119750 }, { - "epoch": 4.21, - "learning_rate": 3.187937378350628e-06, - "loss": 0.2471, + "epoch": 4.315962085991278, + "grad_norm": 0.27803459763526917, + "learning_rate": 2.4242325261615267e-06, + "loss": 0.37, "step": 119755 }, { - "epoch": 4.21, - "learning_rate": 3.186545505536473e-06, - "loss": 0.2571, + "epoch": 4.316142285652503, + "grad_norm": 0.22846895456314087, + "learning_rate": 2.4229791197206502e-06, + "loss": 0.3555, "step": 119760 }, { - "epoch": 4.21, - "learning_rate": 3.185153915955649e-06, - "loss": 0.262, + "epoch": 4.316322485313727, + "grad_norm": 0.2598210573196411, + "learning_rate": 2.4217260208873077e-06, + "loss": 0.3441, "step": 119765 }, { - "epoch": 4.21, - "learning_rate": 3.1837626096262175e-06, - "loss": 0.2488, + "epoch": 4.316502684974952, + "grad_norm": 0.25162217020988464, + "learning_rate": 2.4204732296785743e-06, + "loss": 0.3937, "step": 119770 }, { - "epoch": 4.21, - "learning_rate": 3.18237158656626e-06, - "loss": 0.2683, + "epoch": 4.316682884636177, + "grad_norm": 0.25310778617858887, + "learning_rate": 2.419220746111525e-06, + "loss": 0.3502, "step": 119775 }, { - "epoch": 4.21, - "learning_rate": 3.180980846793827e-06, - "loss": 0.2407, + "epoch": 4.316863084297402, + "grad_norm": 0.2426055669784546, + "learning_rate": 2.417968570203219e-06, + "loss": 0.365, "step": 119780 }, { - "epoch": 4.21, - "learning_rate": 3.179590390326978e-06, - "loss": 0.2662, + "epoch": 4.317043283958626, + "grad_norm": Infinity, + "learning_rate": 2.416967051002339e-06, + "loss": 0.3506, "step": 119785 }, { - "epoch": 4.21, - "learning_rate": 3.178200217183763e-06, - "loss": 0.2616, + "epoch": 4.317223483619851, + "grad_norm": 0.2902944087982178, + "learning_rate": 2.415715428922766e-06, + "loss": 0.3559, "step": 119790 }, { - "epoch": 4.21, - "learning_rate": 3.1768103273822377e-06, - "loss": 0.2381, + "epoch": 4.317403683281076, + "grad_norm": 
0.2499387264251709, + "learning_rate": 2.4144641145497e-06, + "loss": 0.3648, "step": 119795 }, { - "epoch": 4.21, - "learning_rate": 3.17542072094045e-06, - "loss": 0.2518, + "epoch": 4.3175838829423, + "grad_norm": 0.25404685735702515, + "learning_rate": 2.4132131079001807e-06, + "loss": 0.3391, "step": 119800 }, { - "epoch": 4.22, - "learning_rate": 3.174031397876445e-06, - "loss": 0.238, + "epoch": 4.317764082603524, + "grad_norm": 0.26094871759414673, + "learning_rate": 2.411962408991264e-06, + "loss": 0.3919, "step": 119805 }, { - "epoch": 4.22, - "learning_rate": 3.1726423582082524e-06, - "loss": 0.2465, + "epoch": 4.317944282264749, + "grad_norm": 0.2552245855331421, + "learning_rate": 2.4107120178399945e-06, + "loss": 0.3532, "step": 119810 }, { - "epoch": 4.22, - "learning_rate": 3.171253601953919e-06, - "loss": 0.2735, + "epoch": 4.318124481925974, + "grad_norm": 0.24921070039272308, + "learning_rate": 2.409461934463389e-06, + "loss": 0.3427, "step": 119815 }, { - "epoch": 4.22, - "learning_rate": 3.1698651291314717e-06, - "loss": 0.2575, + "epoch": 4.318304681587199, + "grad_norm": 0.23261171579360962, + "learning_rate": 2.408212158878495e-06, + "loss": 0.3322, "step": 119820 }, { - "epoch": 4.22, - "learning_rate": 3.168476939758941e-06, - "loss": 0.2336, + "epoch": 4.318484881248423, + "grad_norm": 0.23947398364543915, + "learning_rate": 2.4069626911023325e-06, + "loss": 0.3588, "step": 119825 }, { - "epoch": 4.22, - "learning_rate": 3.1670890338543425e-06, - "loss": 0.2427, + "epoch": 4.318665080909648, + "grad_norm": 0.2614345848560333, + "learning_rate": 2.40571353115194e-06, + "loss": 0.3382, "step": 119830 }, { - "epoch": 4.22, - "learning_rate": 3.1657014114357093e-06, - "loss": 0.2528, + "epoch": 4.318845280570873, + "grad_norm": 0.30958092212677, + "learning_rate": 2.4044646790443188e-06, + "loss": 0.3832, "step": 119835 }, { - "epoch": 4.22, - "learning_rate": 3.1643140725210546e-06, - "loss": 0.241, + "epoch": 4.3190254802320975, + "grad_norm": 0.26829853653907776, + "learning_rate": 2.4032161347964875e-06, + "loss": 0.3675, "step": 119840 }, { - "epoch": 4.22, - "learning_rate": 3.1629270171283864e-06, - "loss": 0.2535, + "epoch": 4.319205679893322, + "grad_norm": 0.24423977732658386, + "learning_rate": 2.4019678984254696e-06, + "loss": 0.3462, "step": 119845 }, { - "epoch": 4.22, - "learning_rate": 3.161540245275721e-06, - "loss": 0.2415, + "epoch": 4.319385879554546, + "grad_norm": 0.3319460153579712, + "learning_rate": 2.400719969948259e-06, + "loss": 0.389, "step": 119850 }, { - "epoch": 4.22, - "learning_rate": 3.1601537569810655e-06, - "loss": 0.2592, + "epoch": 4.319566079215771, + "grad_norm": 0.26566433906555176, + "learning_rate": 2.399472349381868e-06, + "loss": 0.3806, "step": 119855 }, { - "epoch": 4.22, - "learning_rate": 3.1587675522624233e-06, - "loss": 0.2557, + "epoch": 4.319746278876996, + "grad_norm": 0.32363948225975037, + "learning_rate": 2.3982250367432874e-06, + "loss": 0.3628, "step": 119860 }, { - "epoch": 4.22, - "learning_rate": 3.1573816311377847e-06, - "loss": 0.2379, + "epoch": 4.31992647853822, + "grad_norm": 0.2810172736644745, + "learning_rate": 2.396978032049513e-06, + "loss": 0.3626, "step": 119865 }, { - "epoch": 4.22, - "learning_rate": 3.155995993625155e-06, - "loss": 0.2406, + "epoch": 4.320106678199445, + "grad_norm": 0.262736439704895, + "learning_rate": 2.3957313353175387e-06, + "loss": 0.3693, "step": 119870 }, { - "epoch": 4.22, - "learning_rate": 3.154610639742522e-06, - "loss": 0.2616, + "epoch": 4.32028687786067, + 
"grad_norm": 0.2636507749557495, + "learning_rate": 2.3944849465643488e-06, + "loss": 0.3753, "step": 119875 }, { - "epoch": 4.22, - "learning_rate": 3.1532255695078725e-06, - "loss": 0.2408, + "epoch": 4.3204670775218945, + "grad_norm": 0.26590320467948914, + "learning_rate": 2.3932388658069264e-06, + "loss": 0.3929, "step": 119880 }, { - "epoch": 4.22, - "learning_rate": 3.151840782939186e-06, - "loss": 0.2526, + "epoch": 4.320647277183119, + "grad_norm": 0.232346773147583, + "learning_rate": 2.3919930930622476e-06, + "loss": 0.3505, "step": 119885 }, { - "epoch": 4.22, - "learning_rate": 3.150456280054456e-06, - "loss": 0.2352, + "epoch": 4.320827476844343, + "grad_norm": 0.23552198708057404, + "learning_rate": 2.390747628347284e-06, + "loss": 0.3903, "step": 119890 }, { - "epoch": 4.22, - "learning_rate": 3.1490720608716493e-06, - "loss": 0.2672, + "epoch": 4.321007676505568, + "grad_norm": 0.2921289801597595, + "learning_rate": 2.389502471679003e-06, + "loss": 0.3556, "step": 119895 }, { - "epoch": 4.22, - "learning_rate": 3.1476881254087374e-06, - "loss": 0.2511, + "epoch": 4.321187876166793, + "grad_norm": 0.2172670215368271, + "learning_rate": 2.3882576230743832e-06, + "loss": 0.3701, "step": 119900 }, { - "epoch": 4.22, - "learning_rate": 3.1463044736837007e-06, - "loss": 0.2399, + "epoch": 4.321368075828017, + "grad_norm": 0.23092396557331085, + "learning_rate": 2.3870130825503656e-06, + "loss": 0.3806, "step": 119905 }, { - "epoch": 4.22, - "learning_rate": 3.1449211057144905e-06, - "loss": 0.248, + "epoch": 4.321548275489242, + "grad_norm": 0.2696508467197418, + "learning_rate": 2.3857688501239234e-06, + "loss": 0.375, "step": 119910 }, { - "epoch": 4.22, - "learning_rate": 3.1435380215190853e-06, - "loss": 0.2494, + "epoch": 4.321728475150467, + "grad_norm": 0.30084672570228577, + "learning_rate": 2.3845249258120012e-06, + "loss": 0.3592, "step": 119915 }, { - "epoch": 4.22, - "learning_rate": 3.142155221115431e-06, - "loss": 0.2433, + "epoch": 4.3219086748116915, + "grad_norm": 0.23035989701747894, + "learning_rate": 2.383281309631544e-06, + "loss": 0.4045, "step": 119920 }, { - "epoch": 4.22, - "learning_rate": 3.140772704521483e-06, - "loss": 0.2539, + "epoch": 4.322088874472916, + "grad_norm": 0.2685616910457611, + "learning_rate": 2.382038001599507e-06, + "loss": 0.3809, "step": 119925 }, { - "epoch": 4.22, - "learning_rate": 3.139390471755202e-06, - "loss": 0.251, + "epoch": 4.322269074134141, + "grad_norm": 0.2256428748369217, + "learning_rate": 2.3807950017328217e-06, + "loss": 0.3882, "step": 119930 }, { - "epoch": 4.22, - "learning_rate": 3.138008522834526e-06, - "loss": 0.2298, + "epoch": 4.322449273795366, + "grad_norm": 0.2647817134857178, + "learning_rate": 2.379552310048419e-06, + "loss": 0.3421, "step": 119935 }, { - "epoch": 4.22, - "learning_rate": 3.1366268577774057e-06, - "loss": 0.2414, + "epoch": 4.3226294734565895, + "grad_norm": 0.25552016496658325, + "learning_rate": 2.378309926563241e-06, + "loss": 0.3568, "step": 119940 }, { - "epoch": 4.22, - "learning_rate": 3.1352454766017698e-06, - "loss": 0.2607, + "epoch": 4.322809673117814, + "grad_norm": 0.23187574744224548, + "learning_rate": 2.3770678512942097e-06, + "loss": 0.379, "step": 119945 }, { - "epoch": 4.22, - "learning_rate": 3.133864379325566e-06, - "loss": 0.232, + "epoch": 4.322989872779039, + "grad_norm": 0.2112554907798767, + "learning_rate": 2.375826084258251e-06, + "loss": 0.3505, "step": 119950 }, { - "epoch": 4.22, - "learning_rate": 3.132483565966718e-06, - "loss": 0.2561, + "epoch": 
4.323170072440264, + "grad_norm": 0.30000877380371094, + "learning_rate": 2.3745846254722816e-06, + "loss": 0.3723, "step": 119955 }, { - "epoch": 4.22, - "learning_rate": 3.1311030365431675e-06, - "loss": 0.2499, + "epoch": 4.3233502721014885, + "grad_norm": 0.20632152259349823, + "learning_rate": 2.3733434749532104e-06, + "loss": 0.3575, "step": 119960 }, { - "epoch": 4.22, - "learning_rate": 3.129722791072823e-06, - "loss": 0.2691, + "epoch": 4.323530471762713, + "grad_norm": 0.2675947844982147, + "learning_rate": 2.3721026327179595e-06, + "loss": 0.4144, "step": 119965 }, { - "epoch": 4.22, - "learning_rate": 3.1283428295736243e-06, - "loss": 0.2471, + "epoch": 4.323710671423938, + "grad_norm": 0.2205601930618286, + "learning_rate": 2.37086209878343e-06, + "loss": 0.3685, "step": 119970 }, { - "epoch": 4.22, - "learning_rate": 3.126963152063478e-06, - "loss": 0.2565, + "epoch": 4.323890871085163, + "grad_norm": 0.24030627310276031, + "learning_rate": 2.369621873166522e-06, + "loss": 0.3793, "step": 119975 }, { - "epoch": 4.22, - "learning_rate": 3.1255837585602936e-06, - "loss": 0.2482, + "epoch": 4.324071070746387, + "grad_norm": 0.2809261679649353, + "learning_rate": 2.368381955884136e-06, + "loss": 0.3722, "step": 119980 }, { - "epoch": 4.22, - "learning_rate": 3.1242046490819935e-06, - "loss": 0.2418, + "epoch": 4.324251270407611, + "grad_norm": 0.2488822638988495, + "learning_rate": 2.367142346953158e-06, + "loss": 0.3601, "step": 119985 }, { - "epoch": 4.22, - "learning_rate": 3.122825823646483e-06, - "loss": 0.2433, + "epoch": 4.324431470068836, + "grad_norm": 0.23414598405361176, + "learning_rate": 2.3659030463904884e-06, + "loss": 0.3784, "step": 119990 }, { - "epoch": 4.22, - "learning_rate": 3.1214472822716584e-06, - "loss": 0.2518, + "epoch": 4.324611669730061, + "grad_norm": 0.27667903900146484, + "learning_rate": 2.3646640542130117e-06, + "loss": 0.3638, "step": 119995 }, { - "epoch": 4.22, - "learning_rate": 3.1200690249754193e-06, - "loss": 0.2385, + "epoch": 4.3247918693912855, + "grad_norm": 0.2562429904937744, + "learning_rate": 2.3634253704375967e-06, + "loss": 0.3664, "step": 120000 }, { - "epoch": 4.22, - "eval_loss": 0.2491784542798996, - "eval_runtime": 10.5428, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 4.3247918693912855, + "eval_loss": 0.4289790987968445, + "eval_runtime": 3.5378, + "eval_samples_per_second": 28.266, + "eval_steps_per_second": 7.067, "step": 120000 }, { - "epoch": 4.22, - "learning_rate": 3.1186910517756684e-06, - "loss": 0.2656, + "epoch": 4.32497206905251, + "grad_norm": 0.27249255776405334, + "learning_rate": 2.362186995081134e-06, + "loss": 0.3678, "step": 120005 }, { - "epoch": 4.22, - "learning_rate": 3.117313362690294e-06, - "loss": 0.2626, + "epoch": 4.325152268713735, + "grad_norm": 0.29627206921577454, + "learning_rate": 2.3609489281604837e-06, + "loss": 0.3616, "step": 120010 }, { - "epoch": 4.22, - "learning_rate": 3.1159359577371787e-06, - "loss": 0.2739, + "epoch": 4.32533246837496, + "grad_norm": 0.27116361260414124, + "learning_rate": 2.3597111696925327e-06, + "loss": 0.3541, "step": 120015 }, { - "epoch": 4.22, - "learning_rate": 3.1145588369342117e-06, - "loss": 0.2564, + "epoch": 4.325512668036184, + "grad_norm": 0.26802965998649597, + "learning_rate": 2.3584737196941266e-06, + "loss": 0.3786, "step": 120020 }, { - "epoch": 4.22, - "learning_rate": 3.1131820002992835e-06, - "loss": 0.2396, + "epoch": 4.325692867697409, + "grad_norm": 0.22563250362873077, + "learning_rate": 
2.3572365781821286e-06, + "loss": 0.4056, "step": 120025 }, { - "epoch": 4.22, - "learning_rate": 3.1118054478502605e-06, - "loss": 0.2415, + "epoch": 4.325873067358633, + "grad_norm": 0.205821692943573, + "learning_rate": 2.355999745173404e-06, + "loss": 0.3754, "step": 120030 }, { - "epoch": 4.22, - "learning_rate": 3.1104291796050204e-06, - "loss": 0.2329, + "epoch": 4.326053267019858, + "grad_norm": 0.24381496012210846, + "learning_rate": 2.3547632206847998e-06, + "loss": 0.3969, "step": 120035 }, { - "epoch": 4.22, - "learning_rate": 3.109053195581424e-06, - "loss": 0.2654, + "epoch": 4.326233466681082, + "grad_norm": 0.2163795679807663, + "learning_rate": 2.353527004733161e-06, + "loss": 0.3793, "step": 120040 }, { - "epoch": 4.22, - "learning_rate": 3.1076774957973537e-06, - "loss": 0.2385, + "epoch": 4.326413666342307, + "grad_norm": 0.257968544960022, + "learning_rate": 2.3522910973353324e-06, + "loss": 0.3811, "step": 120045 }, { - "epoch": 4.22, - "learning_rate": 3.1063020802706623e-06, - "loss": 0.2725, + "epoch": 4.326593866003532, + "grad_norm": 0.229737788438797, + "learning_rate": 2.3510554985081477e-06, + "loss": 0.3886, "step": 120050 }, { - "epoch": 4.22, - "learning_rate": 3.1049269490192102e-06, - "loss": 0.2649, + "epoch": 4.326774065664757, + "grad_norm": 0.3103449046611786, + "learning_rate": 2.349820208268455e-06, + "loss": 0.3495, "step": 120055 }, { - "epoch": 4.22, - "learning_rate": 3.1035521020608444e-06, - "loss": 0.2447, + "epoch": 4.326954265325981, + "grad_norm": 0.2852785289287567, + "learning_rate": 2.348585226633074e-06, + "loss": 0.3366, "step": 120060 }, { - "epoch": 4.22, - "learning_rate": 3.1021775394134314e-06, - "loss": 0.2564, + "epoch": 4.327134464987206, + "grad_norm": 0.26147526502609253, + "learning_rate": 2.3473505536188324e-06, + "loss": 0.327, "step": 120065 }, { - "epoch": 4.22, - "learning_rate": 3.100803261094809e-06, - "loss": 0.252, + "epoch": 4.327314664648431, + "grad_norm": 0.24618807435035706, + "learning_rate": 2.346116189242556e-06, + "loss": 0.3706, "step": 120070 }, { - "epoch": 4.22, - "learning_rate": 3.0994292671228247e-06, - "loss": 0.2462, + "epoch": 4.327494864309655, + "grad_norm": 0.3258048892021179, + "learning_rate": 2.344882133521059e-06, + "loss": 0.3921, "step": 120075 }, { - "epoch": 4.22, - "learning_rate": 3.098055557515317e-06, - "loss": 0.2526, + "epoch": 4.327675063970879, + "grad_norm": 0.2521205544471741, + "learning_rate": 2.3436483864711535e-06, + "loss": 0.3588, "step": 120080 }, { - "epoch": 4.22, - "learning_rate": 3.096682132290127e-06, - "loss": 0.2599, + "epoch": 4.327855263632104, + "grad_norm": 0.23046831786632538, + "learning_rate": 2.342414948109656e-06, + "loss": 0.3982, "step": 120085 }, { - "epoch": 4.23, - "learning_rate": 3.0953089914650875e-06, - "loss": 0.2409, + "epoch": 4.328035463293329, + "grad_norm": 0.260557621717453, + "learning_rate": 2.3411818184533698e-06, + "loss": 0.3612, "step": 120090 }, { - "epoch": 4.23, - "learning_rate": 3.093936135058015e-06, - "loss": 0.2726, + "epoch": 4.328215662954554, + "grad_norm": 0.20361223816871643, + "learning_rate": 2.3399489975190923e-06, + "loss": 0.3768, "step": 120095 }, { - "epoch": 4.23, - "learning_rate": 3.0925635630867534e-06, - "loss": 0.2622, + "epoch": 4.328395862615778, + "grad_norm": 0.24504904448986053, + "learning_rate": 2.3387164853236247e-06, + "loss": 0.3511, "step": 120100 }, { - "epoch": 4.23, - "learning_rate": 3.0911912755691163e-06, - "loss": 0.237, + "epoch": 4.328576062277003, + "grad_norm": 0.22165676951408386, + 
"learning_rate": 2.3374842818837507e-06, + "loss": 0.3731, "step": 120105 }, { - "epoch": 4.23, - "learning_rate": 3.089819272522923e-06, - "loss": 0.2613, + "epoch": 4.328756261938228, + "grad_norm": 0.29873546957969666, + "learning_rate": 2.3362523872162756e-06, + "loss": 0.3618, "step": 120110 }, { - "epoch": 4.23, - "learning_rate": 3.0884475539659813e-06, - "loss": 0.2473, + "epoch": 4.328936461599453, + "grad_norm": 0.22169798612594604, + "learning_rate": 2.33502080133797e-06, + "loss": 0.37, "step": 120115 }, { - "epoch": 4.23, - "learning_rate": 3.08707611991611e-06, - "loss": 0.2497, + "epoch": 4.329116661260677, + "grad_norm": 0.19561375677585602, + "learning_rate": 2.3337895242656144e-06, + "loss": 0.3759, "step": 120120 }, { - "epoch": 4.23, - "learning_rate": 3.0857049703911124e-06, - "loss": 0.2457, + "epoch": 4.329296860921901, + "grad_norm": 0.3174193799495697, + "learning_rate": 2.33255855601599e-06, + "loss": 0.3906, "step": 120125 }, { - "epoch": 4.23, - "learning_rate": 3.0843341054087982e-06, - "loss": 0.2464, + "epoch": 4.329477060583126, + "grad_norm": 0.2505806088447571, + "learning_rate": 2.33132789660587e-06, + "loss": 0.4405, "step": 120130 }, { - "epoch": 4.23, - "learning_rate": 3.0829635249869566e-06, - "loss": 0.2472, + "epoch": 4.329657260244351, + "grad_norm": 0.2282123565673828, + "learning_rate": 2.330097546052015e-06, + "loss": 0.3469, "step": 120135 }, { - "epoch": 4.23, - "learning_rate": 3.0815932291433952e-06, - "loss": 0.256, + "epoch": 4.329837459905575, + "grad_norm": 0.2437511831521988, + "learning_rate": 2.3288675043711933e-06, + "loss": 0.3587, "step": 120140 }, { - "epoch": 4.23, - "learning_rate": 3.080223217895903e-06, - "loss": 0.2195, + "epoch": 4.3300176595668, + "grad_norm": 0.20224423706531525, + "learning_rate": 2.3276377715801603e-06, + "loss": 0.3455, "step": 120145 }, { - "epoch": 4.23, - "learning_rate": 3.0788534912622654e-06, - "loss": 0.2621, + "epoch": 4.330197859228025, + "grad_norm": 0.23955215513706207, + "learning_rate": 2.3264083476956743e-06, + "loss": 0.3537, "step": 120150 }, { - "epoch": 4.23, - "learning_rate": 3.0774840492602625e-06, - "loss": 0.2544, + "epoch": 4.3303780588892495, + "grad_norm": 0.23328891396522522, + "learning_rate": 2.3251792327344867e-06, + "loss": 0.3624, "step": 120155 }, { - "epoch": 4.23, - "learning_rate": 3.076114891907686e-06, - "loss": 0.2692, + "epoch": 4.330558258550474, + "grad_norm": 0.20525789260864258, + "learning_rate": 2.323950426713342e-06, + "loss": 0.3493, "step": 120160 }, { - "epoch": 4.23, - "learning_rate": 3.0747460192223103e-06, - "loss": 0.2575, + "epoch": 4.330738458211698, + "grad_norm": 0.252485066652298, + "learning_rate": 2.3227219296489827e-06, + "loss": 0.3537, "step": 120165 }, { - "epoch": 4.23, - "learning_rate": 3.0733774312219104e-06, - "loss": 0.265, + "epoch": 4.330918657872923, + "grad_norm": 0.22659099102020264, + "learning_rate": 2.3214937415581424e-06, + "loss": 0.3759, "step": 120170 }, { - "epoch": 4.23, - "learning_rate": 3.072009127924247e-06, - "loss": 0.2286, + "epoch": 4.331098857534148, + "grad_norm": 0.24077731370925903, + "learning_rate": 2.320265862457563e-06, + "loss": 0.3789, "step": 120175 }, { - "epoch": 4.23, - "learning_rate": 3.0706411093470944e-06, - "loss": 0.2429, + "epoch": 4.331279057195372, + "grad_norm": 0.2883579432964325, + "learning_rate": 2.319038292363973e-06, + "loss": 0.3513, "step": 120180 }, { - "epoch": 4.23, - "learning_rate": 3.0692733755082194e-06, - "loss": 0.2679, + "epoch": 4.331459256856597, + "grad_norm": 
0.25911572575569153, + "learning_rate": 2.31781103129409e-06, + "loss": 0.3691, "step": 120185 }, { - "epoch": 4.23, - "learning_rate": 3.0679059264253795e-06, - "loss": 0.2519, + "epoch": 4.331639456517822, + "grad_norm": 0.2714954912662506, + "learning_rate": 2.3165840792646414e-06, + "loss": 0.3817, "step": 120190 }, { - "epoch": 4.23, - "learning_rate": 3.0665387621163192e-06, - "loss": 0.2533, + "epoch": 4.3318196561790465, + "grad_norm": 0.2899724841117859, + "learning_rate": 2.3153574362923395e-06, + "loss": 0.3776, "step": 120195 }, { - "epoch": 4.23, - "learning_rate": 3.065171882598805e-06, - "loss": 0.259, + "epoch": 4.331999855840271, + "grad_norm": 0.2563213109970093, + "learning_rate": 2.31413110239391e-06, + "loss": 0.4069, "step": 120200 }, { - "epoch": 4.23, - "learning_rate": 3.063805287890578e-06, - "loss": 0.2365, + "epoch": 4.332180055501496, + "grad_norm": 0.261004239320755, + "learning_rate": 2.3129050775860473e-06, + "loss": 0.3644, "step": 120205 }, { - "epoch": 4.23, - "learning_rate": 3.062438978009377e-06, - "loss": 0.2513, + "epoch": 4.332360255162721, + "grad_norm": 0.2524876296520233, + "learning_rate": 2.3116793618854576e-06, + "loss": 0.3632, "step": 120210 }, { - "epoch": 4.23, - "learning_rate": 3.061072952972954e-06, - "loss": 0.2644, + "epoch": 4.332540454823945, + "grad_norm": 0.24342192709445953, + "learning_rate": 2.3104539553088474e-06, + "loss": 0.3615, "step": 120215 }, { - "epoch": 4.23, - "learning_rate": 3.05970721279904e-06, - "loss": 0.2378, + "epoch": 4.332720654485169, + "grad_norm": 0.27565255761146545, + "learning_rate": 2.3092288578729083e-06, + "loss": 0.3985, "step": 120220 }, { - "epoch": 4.23, - "learning_rate": 3.05834175750537e-06, - "loss": 0.2433, + "epoch": 4.332900854146394, + "grad_norm": 0.22608241438865662, + "learning_rate": 2.308004069594333e-06, + "loss": 0.3917, "step": 120225 }, { - "epoch": 4.23, - "learning_rate": 3.0569765871096666e-06, - "loss": 0.2603, + "epoch": 4.333081053807619, + "grad_norm": 0.2556675374507904, + "learning_rate": 2.3067795904898103e-06, + "loss": 0.3498, "step": 120230 }, { - "epoch": 4.23, - "learning_rate": 3.0556117016296597e-06, - "loss": 0.2841, + "epoch": 4.3332612534688435, + "grad_norm": 0.2671878933906555, + "learning_rate": 2.3055554205760168e-06, + "loss": 0.3587, "step": 120235 }, { - "epoch": 4.23, - "learning_rate": 3.05424710108308e-06, - "loss": 0.2475, + "epoch": 4.333441453130068, + "grad_norm": 0.232125386595726, + "learning_rate": 2.3043315598696406e-06, + "loss": 0.3345, "step": 120240 }, { - "epoch": 4.23, - "learning_rate": 3.0528827854876374e-06, - "loss": 0.2428, + "epoch": 4.333621652791293, + "grad_norm": 0.268929123878479, + "learning_rate": 2.3031080083873556e-06, + "loss": 0.4026, "step": 120245 }, { - "epoch": 4.23, - "learning_rate": 3.0515187548610437e-06, - "loss": 0.2155, + "epoch": 4.333801852452518, + "grad_norm": 0.2586337625980377, + "learning_rate": 2.3018847661458285e-06, + "loss": 0.3407, "step": 120250 }, { - "epoch": 4.23, - "learning_rate": 3.0501550092210203e-06, - "loss": 0.2604, + "epoch": 4.333982052113742, + "grad_norm": 0.20876720547676086, + "learning_rate": 2.300661833161724e-06, + "loss": 0.3956, "step": 120255 }, { - "epoch": 4.23, - "learning_rate": 3.048791548585267e-06, - "loss": 0.2346, + "epoch": 4.334162251774966, + "grad_norm": 0.26193374395370483, + "learning_rate": 2.299439209451712e-06, + "loss": 0.3588, "step": 120260 }, { - "epoch": 4.23, - "learning_rate": 3.047428372971492e-06, - "loss": 0.2381, + "epoch": 4.334342451436191, 
+ "grad_norm": 0.29305386543273926, + "learning_rate": 2.2982168950324374e-06, + "loss": 0.3434, "step": 120265 }, { - "epoch": 4.23, - "learning_rate": 3.0460654823973837e-06, - "loss": 0.2488, + "epoch": 4.334522651097416, + "grad_norm": 0.1981567144393921, + "learning_rate": 2.29699488992057e-06, + "loss": 0.3667, "step": 120270 }, { - "epoch": 4.23, - "learning_rate": 3.0447028768806535e-06, - "loss": 0.2498, + "epoch": 4.3347028507586405, + "grad_norm": 0.2516523599624634, + "learning_rate": 2.29577319413275e-06, + "loss": 0.3551, "step": 120275 }, { - "epoch": 4.23, - "learning_rate": 3.0433405564389866e-06, - "loss": 0.2293, + "epoch": 4.334883050419865, + "grad_norm": 0.2072906494140625, + "learning_rate": 2.2945518076856243e-06, + "loss": 0.3863, "step": 120280 }, { - "epoch": 4.23, - "learning_rate": 3.041978521090069e-06, - "loss": 0.251, + "epoch": 4.33506325008109, + "grad_norm": 0.2831047475337982, + "learning_rate": 2.2933307305958356e-06, + "loss": 0.344, "step": 120285 }, { - "epoch": 4.23, - "learning_rate": 3.040616770851587e-06, - "loss": 0.2576, + "epoch": 4.335243449742315, + "grad_norm": 0.2764025330543518, + "learning_rate": 2.2921099628800126e-06, + "loss": 0.4017, "step": 120290 }, { - "epoch": 4.23, - "learning_rate": 3.0392553057412316e-06, - "loss": 0.2502, + "epoch": 4.335423649403539, + "grad_norm": 0.2790067195892334, + "learning_rate": 2.290889504554808e-06, + "loss": 0.3866, "step": 120295 }, { - "epoch": 4.23, - "learning_rate": 3.0378941257766745e-06, - "loss": 0.2585, + "epoch": 4.335603849064764, + "grad_norm": 0.20841774344444275, + "learning_rate": 2.2896693556368284e-06, + "loss": 0.3428, "step": 120300 }, { - "epoch": 4.23, - "learning_rate": 3.0365332309755856e-06, - "loss": 0.2348, + "epoch": 4.335784048725988, + "grad_norm": 0.2399793118238449, + "learning_rate": 2.2884495161427043e-06, + "loss": 0.3393, "step": 120305 }, { - "epoch": 4.23, - "learning_rate": 3.0351726213556364e-06, - "loss": 0.2497, + "epoch": 4.335964248387213, + "grad_norm": 0.2118580937385559, + "learning_rate": 2.287229986089065e-06, + "loss": 0.3936, "step": 120310 }, { - "epoch": 4.23, - "learning_rate": 3.033812296934499e-06, - "loss": 0.2333, + "epoch": 4.3361444480484375, + "grad_norm": 0.23589551448822021, + "learning_rate": 2.286010765492516e-06, + "loss": 0.376, "step": 120315 }, { - "epoch": 4.23, - "learning_rate": 3.0324522577298312e-06, - "loss": 0.2393, + "epoch": 4.336324647709662, + "grad_norm": 0.240196093916893, + "learning_rate": 2.284791854369675e-06, + "loss": 0.3936, "step": 120320 }, { - "epoch": 4.23, - "learning_rate": 3.031092503759289e-06, - "loss": 0.2626, + "epoch": 4.336504847370887, + "grad_norm": 0.3579064607620239, + "learning_rate": 2.2835732527371478e-06, + "loss": 0.3894, "step": 120325 }, { - "epoch": 4.23, - "learning_rate": 3.0297330350405382e-06, - "loss": 0.226, + "epoch": 4.336685047032112, + "grad_norm": 0.21416036784648895, + "learning_rate": 2.2823549606115324e-06, + "loss": 0.3483, "step": 120330 }, { - "epoch": 4.23, - "learning_rate": 3.0283738515912262e-06, - "loss": 0.2447, + "epoch": 4.336865246693336, + "grad_norm": 0.2611549496650696, + "learning_rate": 2.281136978009435e-06, + "loss": 0.4018, "step": 120335 }, { - "epoch": 4.23, - "learning_rate": 3.027014953428994e-06, - "loss": 0.25, + "epoch": 4.337045446354561, + "grad_norm": 0.2718494236469269, + "learning_rate": 2.2799193049474505e-06, + "loss": 0.373, "step": 120340 }, { - "epoch": 4.23, - "learning_rate": 3.025656340571495e-06, - "loss": 0.2542, + "epoch": 
4.337225646015786, + "grad_norm": 0.2340647578239441, + "learning_rate": 2.2787019414421627e-06, + "loss": 0.364, "step": 120345 }, { - "epoch": 4.23, - "learning_rate": 3.024298013036364e-06, - "loss": 0.2484, + "epoch": 4.33740584567701, + "grad_norm": 0.1981993019580841, + "learning_rate": 2.2774848875101647e-06, + "loss": 0.3663, "step": 120350 }, { - "epoch": 4.23, - "learning_rate": 3.022939970841246e-06, - "loss": 0.2477, + "epoch": 4.337586045338234, + "grad_norm": 0.32858866453170776, + "learning_rate": 2.276268143168031e-06, + "loss": 0.3422, "step": 120355 }, { - "epoch": 4.23, - "learning_rate": 3.021582214003768e-06, - "loss": 0.2303, + "epoch": 4.337766244999459, + "grad_norm": 0.24654696881771088, + "learning_rate": 2.275051708432349e-06, + "loss": 0.3722, "step": 120360 }, { - "epoch": 4.23, - "learning_rate": 3.0202247425415553e-06, - "loss": 0.2471, + "epoch": 4.337946444660684, + "grad_norm": 0.20134994387626648, + "learning_rate": 2.2738355833196916e-06, + "loss": 0.3416, "step": 120365 }, { - "epoch": 4.23, - "learning_rate": 3.0188675564722463e-06, - "loss": 0.2564, + "epoch": 4.338126644321909, + "grad_norm": 0.23259273171424866, + "learning_rate": 2.2726197678466178e-06, + "loss": 0.3483, "step": 120370 }, { - "epoch": 4.24, - "learning_rate": 3.0175106558134546e-06, - "loss": 0.2469, + "epoch": 4.338306843983133, + "grad_norm": 0.25836434960365295, + "learning_rate": 2.271404262029703e-06, + "loss": 0.3657, "step": 120375 }, { - "epoch": 4.24, - "learning_rate": 3.0161540405828e-06, - "loss": 0.257, + "epoch": 4.338487043644358, + "grad_norm": 0.2423553764820099, + "learning_rate": 2.2701890658854975e-06, + "loss": 0.3692, "step": 120380 }, { - "epoch": 4.24, - "learning_rate": 3.0147977107978926e-06, - "loss": 0.2453, + "epoch": 4.338667243305583, + "grad_norm": 0.29900985956192017, + "learning_rate": 2.268974179430572e-06, + "loss": 0.3787, "step": 120385 }, { - "epoch": 4.24, - "learning_rate": 3.0134416664763522e-06, - "loss": 0.2795, + "epoch": 4.338847442966808, + "grad_norm": 0.2342345416545868, + "learning_rate": 2.267759602681477e-06, + "loss": 0.4216, "step": 120390 }, { - "epoch": 4.24, - "learning_rate": 3.0120859076357755e-06, - "loss": 0.2609, + "epoch": 4.339027642628032, + "grad_norm": Infinity, + "learning_rate": 2.266788164281511e-06, + "loss": 0.399, "step": 120395 }, { - "epoch": 4.24, - "learning_rate": 3.010730434293782e-06, - "loss": 0.2433, + "epoch": 4.339207842289256, + "grad_norm": 0.22955337166786194, + "learning_rate": 2.265574145044594e-06, + "loss": 0.389, "step": 120400 }, { - "epoch": 4.24, - "learning_rate": 3.009375246467955e-06, - "loss": 0.26, + "epoch": 4.339388041950481, + "grad_norm": 0.25099387764930725, + "learning_rate": 2.264360435559831e-06, + "loss": 0.3777, "step": 120405 }, { - "epoch": 4.24, - "learning_rate": 3.008020344175902e-06, - "loss": 0.2411, + "epoch": 4.339568241611706, + "grad_norm": 0.23291358351707458, + "learning_rate": 2.2631470358437545e-06, + "loss": 0.3862, "step": 120410 }, { - "epoch": 4.24, - "learning_rate": 3.006665727435212e-06, - "loss": 0.2562, + "epoch": 4.33974844127293, + "grad_norm": 0.2527669370174408, + "learning_rate": 2.2619339459128975e-06, + "loss": 0.3817, "step": 120415 }, { - "epoch": 4.24, - "learning_rate": 3.0053113962634744e-06, - "loss": 0.2415, + "epoch": 4.339928640934155, + "grad_norm": 0.28029200434684753, + "learning_rate": 2.2607211657837886e-06, + "loss": 0.34, "step": 120420 }, { - "epoch": 4.24, - "learning_rate": 3.003957350678269e-06, - "loss": 0.2475, + "epoch": 
4.34010884059538, + "grad_norm": 0.2483205646276474, + "learning_rate": 2.2595086954729476e-06, + "loss": 0.3732, "step": 120425 }, { - "epoch": 4.24, - "learning_rate": 3.002603590697184e-06, - "loss": 0.256, + "epoch": 4.340289040256605, + "grad_norm": 0.26079532504081726, + "learning_rate": 2.2582965349969035e-06, + "loss": 0.4245, "step": 120430 }, { - "epoch": 4.24, - "learning_rate": 3.0012501163377974e-06, - "loss": 0.2479, + "epoch": 4.340469239917829, + "grad_norm": 0.3403790593147278, + "learning_rate": 2.257084684372163e-06, + "loss": 0.347, "step": 120435 }, { - "epoch": 4.24, - "learning_rate": 2.999896927617679e-06, - "loss": 0.2468, + "epoch": 4.340649439579053, + "grad_norm": 0.2807793915271759, + "learning_rate": 2.255873143615242e-06, + "loss": 0.3802, "step": 120440 }, { - "epoch": 4.24, - "learning_rate": 2.998544024554395e-06, - "loss": 0.242, + "epoch": 4.340829639240278, + "grad_norm": 0.2575584948062897, + "learning_rate": 2.2546619127426477e-06, + "loss": 0.3579, "step": 120445 }, { - "epoch": 4.24, - "learning_rate": 2.9971914071655227e-06, - "loss": 0.2531, + "epoch": 4.341009838901503, + "grad_norm": 0.2251080423593521, + "learning_rate": 2.2534509917708753e-06, + "loss": 0.3715, "step": 120450 }, { - "epoch": 4.24, - "learning_rate": 2.9958390754686148e-06, - "loss": 0.2293, + "epoch": 4.341190038562727, + "grad_norm": 0.2660796046257019, + "learning_rate": 2.2522403807164333e-06, + "loss": 0.3834, "step": 120455 }, { - "epoch": 4.24, - "learning_rate": 2.9944870294812326e-06, - "loss": 0.2333, + "epoch": 4.341370238223952, + "grad_norm": 0.277935653924942, + "learning_rate": 2.251030079595817e-06, + "loss": 0.3487, "step": 120460 }, { - "epoch": 4.24, - "learning_rate": 2.9931352692209423e-06, - "loss": 0.2495, + "epoch": 4.341550437885177, + "grad_norm": 0.24934552609920502, + "learning_rate": 2.249820088425503e-06, + "loss": 0.3702, "step": 120465 }, { - "epoch": 4.24, - "learning_rate": 2.9917837947052885e-06, - "loss": 0.2532, + "epoch": 4.3417306375464015, + "grad_norm": 0.22956299781799316, + "learning_rate": 2.2486104072219905e-06, + "loss": 0.3344, "step": 120470 }, { - "epoch": 4.24, - "learning_rate": 2.990432605951815e-06, - "loss": 0.2596, + "epoch": 4.341910837207626, + "grad_norm": 0.2705623507499695, + "learning_rate": 2.247401036001753e-06, + "loss": 0.3638, "step": 120475 }, { - "epoch": 4.24, - "learning_rate": 2.9890817029780666e-06, - "loss": 0.2376, + "epoch": 4.342091036868851, + "grad_norm": 0.29147830605506897, + "learning_rate": 2.2461919747812753e-06, + "loss": 0.3837, "step": 120480 }, { - "epoch": 4.24, - "learning_rate": 2.98773108580159e-06, - "loss": 0.2323, + "epoch": 4.342271236530076, + "grad_norm": 0.23217852413654327, + "learning_rate": 2.244983223577024e-06, + "loss": 0.3905, "step": 120485 }, { - "epoch": 4.24, - "learning_rate": 2.9863807544399186e-06, - "loss": 0.2409, + "epoch": 4.3424514361913, + "grad_norm": 0.22803989052772522, + "learning_rate": 2.2437747824054694e-06, + "loss": 0.3589, "step": 120490 }, { - "epoch": 4.24, - "learning_rate": 2.9850307089105856e-06, - "loss": 0.24, + "epoch": 4.342631635852524, + "grad_norm": 0.22938886284828186, + "learning_rate": 2.242566651283079e-06, + "loss": 0.3986, "step": 120495 }, { - "epoch": 4.24, - "learning_rate": 2.9836809492311156e-06, - "loss": 0.2306, + "epoch": 4.342811835513749, + "grad_norm": 0.2516026198863983, + "learning_rate": 2.2413588302263143e-06, + "loss": 0.3489, "step": 120500 }, { - "epoch": 4.24, - "eval_loss": 0.24912191927433014, - "eval_runtime": 
10.5509, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 4.342811835513749, + "eval_loss": 0.4290280044078827, + "eval_runtime": 3.5287, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 7.085, "step": 120500 }, { - "epoch": 4.24, - "learning_rate": 2.982331475419045e-06, - "loss": 0.2357, + "epoch": 4.342992035174974, + "grad_norm": 0.2596825361251831, + "learning_rate": 2.2401513192516267e-06, + "loss": 0.3906, "step": 120505 }, { - "epoch": 4.24, - "learning_rate": 2.980982287491885e-06, - "loss": 0.2507, + "epoch": 4.3431722348361985, + "grad_norm": 0.26794061064720154, + "learning_rate": 2.2389441183754745e-06, + "loss": 0.4, "step": 120510 }, { - "epoch": 4.24, - "learning_rate": 2.979633385467165e-06, - "loss": 0.2242, + "epoch": 4.343352434497423, + "grad_norm": 0.2583240568637848, + "learning_rate": 2.2377372276143003e-06, + "loss": 0.3953, "step": 120515 }, { - "epoch": 4.24, - "learning_rate": 2.9782847693623857e-06, - "loss": 0.2441, + "epoch": 4.343532634158648, + "grad_norm": 0.27646011114120483, + "learning_rate": 2.236530646984544e-06, + "loss": 0.3915, "step": 120520 }, { - "epoch": 4.24, - "learning_rate": 2.9769364391950717e-06, - "loss": 0.2661, + "epoch": 4.343712833819873, + "grad_norm": 0.2590472400188446, + "learning_rate": 2.235324376502654e-06, + "loss": 0.334, "step": 120525 }, { - "epoch": 4.24, - "learning_rate": 2.9755883949827255e-06, - "loss": 0.2413, + "epoch": 4.3438930334810975, + "grad_norm": 0.2593589723110199, + "learning_rate": 2.234118416185066e-06, + "loss": 0.408, "step": 120530 }, { - "epoch": 4.24, - "learning_rate": 2.9742406367428477e-06, - "loss": 0.25, + "epoch": 4.344073233142321, + "grad_norm": 0.2586502134799957, + "learning_rate": 2.2329127660482036e-06, + "loss": 0.3995, "step": 120535 }, { - "epoch": 4.24, - "learning_rate": 2.9728931644929348e-06, - "loss": 0.2511, + "epoch": 4.344253432803546, + "grad_norm": Infinity, + "learning_rate": 2.2319484692798763e-06, + "loss": 0.3567, "step": 120540 }, { - "epoch": 4.24, - "learning_rate": 2.971545978250495e-06, - "loss": 0.2275, + "epoch": 4.344433632464771, + "grad_norm": 0.2285531461238861, + "learning_rate": 2.2307433775097196e-06, + "loss": 0.3395, "step": 120545 }, { - "epoch": 4.24, - "learning_rate": 2.970199078033012e-06, - "loss": 0.2579, + "epoch": 4.3446138321259955, + "grad_norm": 0.28931424021720886, + "learning_rate": 2.229538595966277e-06, + "loss": 0.3916, "step": 120550 }, { - "epoch": 4.24, - "learning_rate": 2.9688524638579712e-06, - "loss": 0.232, + "epoch": 4.34479403178722, + "grad_norm": 0.19079504907131195, + "learning_rate": 2.2283341246659623e-06, + "loss": 0.3671, "step": 120555 }, { - "epoch": 4.24, - "learning_rate": 2.9675061357428647e-06, - "loss": 0.2694, + "epoch": 4.344974231448445, + "grad_norm": 0.33871787786483765, + "learning_rate": 2.2271299636251795e-06, + "loss": 0.3929, "step": 120560 }, { - "epoch": 4.24, - "learning_rate": 2.96616009370517e-06, - "loss": 0.2594, + "epoch": 4.34515443110967, + "grad_norm": 0.2659500241279602, + "learning_rate": 2.225926112860349e-06, + "loss": 0.3407, "step": 120565 }, { - "epoch": 4.24, - "learning_rate": 2.964814337762367e-06, - "loss": 0.2464, + "epoch": 4.345334630770894, + "grad_norm": 0.2704176902770996, + "learning_rate": 2.2247225723878663e-06, + "loss": 0.3852, "step": 120570 }, { - "epoch": 4.24, - "learning_rate": 2.9634688679319263e-06, - "loss": 0.2456, + "epoch": 4.345514830432119, + "grad_norm": 0.27689260244369507, + "learning_rate": 
2.2235193422241283e-06, + "loss": 0.3471, "step": 120575 }, { - "epoch": 4.24, - "learning_rate": 2.9621236842313217e-06, - "loss": 0.2684, + "epoch": 4.345695030093343, + "grad_norm": 0.21617692708969116, + "learning_rate": 2.2223164223855287e-06, + "loss": 0.36, "step": 120580 }, { - "epoch": 4.24, - "learning_rate": 2.9607787866780202e-06, - "loss": 0.2601, + "epoch": 4.345875229754568, + "grad_norm": 0.32375189661979675, + "learning_rate": 2.221113812888459e-06, + "loss": 0.3506, "step": 120585 }, { - "epoch": 4.24, - "learning_rate": 2.95943417528948e-06, - "loss": 0.2679, + "epoch": 4.3460554294157925, + "grad_norm": 0.26374006271362305, + "learning_rate": 2.2199115137492983e-06, + "loss": 0.3691, "step": 120590 }, { - "epoch": 4.24, - "learning_rate": 2.9580898500831565e-06, - "loss": 0.2576, + "epoch": 4.346235629077017, + "grad_norm": 0.2784136235713959, + "learning_rate": 2.2187095249844385e-06, + "loss": 0.3435, "step": 120595 }, { - "epoch": 4.24, - "learning_rate": 2.9567458110765166e-06, - "loss": 0.2452, + "epoch": 4.346415828738242, + "grad_norm": 0.23929151892662048, + "learning_rate": 2.2175078466102477e-06, + "loss": 0.3381, "step": 120600 }, { - "epoch": 4.24, - "learning_rate": 2.9554020582870046e-06, - "loss": 0.2327, + "epoch": 4.346596028399467, + "grad_norm": 0.29352447390556335, + "learning_rate": 2.216306478643104e-06, + "loss": 0.3898, "step": 120605 }, { - "epoch": 4.24, - "learning_rate": 2.9540585917320673e-06, - "loss": 0.2696, + "epoch": 4.346776228060691, + "grad_norm": 0.2777535021305084, + "learning_rate": 2.2151054210993754e-06, + "loss": 0.3625, "step": 120610 }, { - "epoch": 4.24, - "learning_rate": 2.9527154114291495e-06, - "loss": 0.2686, + "epoch": 4.346956427721916, + "grad_norm": 0.2904403507709503, + "learning_rate": 2.213904673995418e-06, + "loss": 0.3525, "step": 120615 }, { - "epoch": 4.24, - "learning_rate": 2.9513725173956897e-06, - "loss": 0.2216, + "epoch": 4.347136627383141, + "grad_norm": 0.31510671973228455, + "learning_rate": 2.2127042373476016e-06, + "loss": 0.3433, "step": 120620 }, { - "epoch": 4.24, - "learning_rate": 2.9500299096491323e-06, - "loss": 0.2457, + "epoch": 4.347316827044365, + "grad_norm": 0.26004233956336975, + "learning_rate": 2.211504111172283e-06, + "loss": 0.3828, "step": 120625 }, { - "epoch": 4.24, - "learning_rate": 2.9486875882069027e-06, - "loss": 0.2603, + "epoch": 4.3474970267055895, + "grad_norm": 0.2599365711212158, + "learning_rate": 2.210304295485799e-06, + "loss": 0.4023, "step": 120630 }, { - "epoch": 4.24, - "learning_rate": 2.947345553086431e-06, - "loss": 0.2277, + "epoch": 4.347677226366814, + "grad_norm": 0.2279340624809265, + "learning_rate": 2.2091047903045142e-06, + "loss": 0.3625, "step": 120635 }, { - "epoch": 4.24, - "learning_rate": 2.9460038043051475e-06, - "loss": 0.265, + "epoch": 4.347857426028039, + "grad_norm": 0.2609095871448517, + "learning_rate": 2.2079055956447574e-06, + "loss": 0.3503, "step": 120640 }, { - "epoch": 4.24, - "learning_rate": 2.944662341880472e-06, - "loss": 0.2402, + "epoch": 4.348037625689264, + "grad_norm": 0.2672995328903198, + "learning_rate": 2.206706711522882e-06, + "loss": 0.3367, "step": 120645 }, { - "epoch": 4.24, - "learning_rate": 2.943321165829818e-06, - "loss": 0.2363, + "epoch": 4.348217825350488, + "grad_norm": 0.2778962552547455, + "learning_rate": 2.205508137955212e-06, + "loss": 0.3601, "step": 120650 }, { - "epoch": 4.24, - "learning_rate": 2.9419802761706e-06, - "loss": 0.2539, + "epoch": 4.348398025011713, + "grad_norm": 
0.2293660044670105, + "learning_rate": 2.204309874958077e-06, + "loss": 0.4096, "step": 120655 }, { - "epoch": 4.25, - "learning_rate": 2.9406396729202367e-06, - "loss": 0.248, + "epoch": 4.348578224672938, + "grad_norm": 0.22069476544857025, + "learning_rate": 2.2031119225478096e-06, + "loss": 0.3899, "step": 120660 }, { - "epoch": 4.25, - "learning_rate": 2.9392993560961287e-06, - "loss": 0.2392, + "epoch": 4.348758424334163, + "grad_norm": 0.22815968096256256, + "learning_rate": 2.201914280740727e-06, + "loss": 0.3738, "step": 120665 }, { - "epoch": 4.25, - "learning_rate": 2.9379593257156753e-06, - "loss": 0.2712, + "epoch": 4.348938623995387, + "grad_norm": 0.28191712498664856, + "learning_rate": 2.2007169495531527e-06, + "loss": 0.3879, "step": 120670 }, { - "epoch": 4.25, - "learning_rate": 2.9366195817962793e-06, - "loss": 0.2678, + "epoch": 4.349118823656611, + "grad_norm": 0.2501116991043091, + "learning_rate": 2.1995199290013926e-06, + "loss": 0.3743, "step": 120675 }, { - "epoch": 4.25, - "learning_rate": 2.935280124355344e-06, - "loss": 0.2323, + "epoch": 4.349299023317836, + "grad_norm": 0.3013772666454315, + "learning_rate": 2.1983232191017556e-06, + "loss": 0.3554, "step": 120680 }, { - "epoch": 4.25, - "learning_rate": 2.933940953410255e-06, - "loss": 0.2584, + "epoch": 4.349479222979061, + "grad_norm": 0.2710776627063751, + "learning_rate": 2.1971268198705543e-06, + "loss": 0.3778, "step": 120685 }, { - "epoch": 4.25, - "learning_rate": 2.9326020689783963e-06, - "loss": 0.247, + "epoch": 4.349659422640285, + "grad_norm": 0.28486597537994385, + "learning_rate": 2.195930731324086e-06, + "loss": 0.3988, "step": 120690 }, { - "epoch": 4.25, - "learning_rate": 2.9312634710771613e-06, - "loss": 0.2425, + "epoch": 4.34983962230151, + "grad_norm": 0.27696672081947327, + "learning_rate": 2.1947349534786443e-06, + "loss": 0.3684, "step": 120695 }, { - "epoch": 4.25, - "learning_rate": 2.9299251597239256e-06, - "loss": 0.2444, + "epoch": 4.350019821962735, + "grad_norm": 0.2695504426956177, + "learning_rate": 2.1935394863505265e-06, + "loss": 0.3804, "step": 120700 }, { - "epoch": 4.25, - "learning_rate": 2.928587134936067e-06, - "loss": 0.2666, + "epoch": 4.35020002162396, + "grad_norm": 0.2949710786342621, + "learning_rate": 2.1923443299560113e-06, + "loss": 0.3809, "step": 120705 }, { - "epoch": 4.25, - "learning_rate": 2.927249396730955e-06, - "loss": 0.2563, + "epoch": 4.350380221285184, + "grad_norm": 0.29531538486480713, + "learning_rate": 2.1911494843113943e-06, + "loss": 0.38, "step": 120710 }, { - "epoch": 4.25, - "learning_rate": 2.925911945125967e-06, - "loss": 0.2358, + "epoch": 4.350560420946408, + "grad_norm": 0.2712008059024811, + "learning_rate": 2.1899549494329513e-06, + "loss": 0.358, "step": 120715 }, { - "epoch": 4.25, - "learning_rate": 2.9245747801384644e-06, - "loss": 0.2709, + "epoch": 4.350740620607633, + "grad_norm": 0.3000184893608093, + "learning_rate": 2.1887607253369525e-06, + "loss": 0.3792, "step": 120720 }, { - "epoch": 4.25, - "learning_rate": 2.923237901785808e-06, - "loss": 0.2481, + "epoch": 4.350920820268858, + "grad_norm": 0.28387993574142456, + "learning_rate": 2.187566812039676e-06, + "loss": 0.3642, "step": 120725 }, { - "epoch": 4.25, - "learning_rate": 2.9219013100853536e-06, - "loss": 0.2693, + "epoch": 4.351101019930082, + "grad_norm": 0.21037955582141876, + "learning_rate": 2.1863732095573825e-06, + "loss": 0.3565, "step": 120730 }, { - "epoch": 4.25, - "learning_rate": 2.920565005054471e-06, - "loss": 0.2394, + "epoch": 
4.351281219591307, + "grad_norm": 0.2745780646800995, + "learning_rate": 2.1851799179063326e-06, + "loss": 0.3818, "step": 120735 }, { - "epoch": 4.25, - "learning_rate": 2.9192289867104983e-06, - "loss": 0.2328, + "epoch": 4.351461419252532, + "grad_norm": 0.25862976908683777, + "learning_rate": 2.1839869371028e-06, + "loss": 0.3547, "step": 120740 }, { - "epoch": 4.25, - "learning_rate": 2.9178932550707834e-06, - "loss": 0.2582, + "epoch": 4.351641618913757, + "grad_norm": 0.2394440770149231, + "learning_rate": 2.1827942671630184e-06, + "loss": 0.3666, "step": 120745 }, { - "epoch": 4.25, - "learning_rate": 2.9165578101526703e-06, - "loss": 0.2498, + "epoch": 4.351821818574981, + "grad_norm": 0.26800236105918884, + "learning_rate": 2.181601908103251e-06, + "loss": 0.3695, "step": 120750 }, { - "epoch": 4.25, - "learning_rate": 2.9152226519735036e-06, - "loss": 0.2736, + "epoch": 4.352002018236206, + "grad_norm": 0.24717827141284943, + "learning_rate": 2.1804098599397418e-06, + "loss": 0.3604, "step": 120755 }, { - "epoch": 4.25, - "learning_rate": 2.913887780550617e-06, - "loss": 0.2409, + "epoch": 4.352182217897431, + "grad_norm": 0.24239477515220642, + "learning_rate": 2.179218122688728e-06, + "loss": 0.3599, "step": 120760 }, { - "epoch": 4.25, - "learning_rate": 2.9125531959013434e-06, - "loss": 0.243, + "epoch": 4.352362417558655, + "grad_norm": 0.28817063570022583, + "learning_rate": 2.17802669636645e-06, + "loss": 0.4087, "step": 120765 }, { - "epoch": 4.25, - "learning_rate": 2.911218898043003e-06, - "loss": 0.2517, + "epoch": 4.352542617219879, + "grad_norm": 0.25521016120910645, + "learning_rate": 2.1768355809891396e-06, + "loss": 0.3246, "step": 120770 }, { - "epoch": 4.25, - "learning_rate": 2.9098848869929334e-06, - "loss": 0.2564, + "epoch": 4.352722816881104, + "grad_norm": 0.23344235122203827, + "learning_rate": 2.1756447765730214e-06, + "loss": 0.3409, "step": 120775 }, { - "epoch": 4.25, - "learning_rate": 2.908551162768447e-06, - "loss": 0.2724, + "epoch": 4.352903016542329, + "grad_norm": 0.27485767006874084, + "learning_rate": 2.174454283134328e-06, + "loss": 0.3866, "step": 120780 }, { - "epoch": 4.25, - "learning_rate": 2.907217725386871e-06, - "loss": 0.2543, + "epoch": 4.3530832162035535, + "grad_norm": 0.24092566967010498, + "learning_rate": 2.1732641006892763e-06, + "loss": 0.3861, "step": 120785 }, { - "epoch": 4.25, - "learning_rate": 2.9058845748655056e-06, - "loss": 0.2508, + "epoch": 4.353263415864778, + "grad_norm": 0.23562981188297272, + "learning_rate": 2.1720742292540814e-06, + "loss": 0.3637, "step": 120790 }, { - "epoch": 4.25, - "learning_rate": 2.9045517112216757e-06, - "loss": 0.2684, + "epoch": 4.353443615526003, + "grad_norm": 0.30748188495635986, + "learning_rate": 2.1708846688449553e-06, + "loss": 0.3486, "step": 120795 }, { - "epoch": 4.25, - "learning_rate": 2.903219134472679e-06, - "loss": 0.2584, + "epoch": 4.353623815187228, + "grad_norm": 0.263886421918869, + "learning_rate": 2.1696954194781016e-06, + "loss": 0.3717, "step": 120800 }, { - "epoch": 4.25, - "learning_rate": 2.901886844635812e-06, - "loss": 0.2492, + "epoch": 4.3538040148484525, + "grad_norm": 0.22052530944347382, + "learning_rate": 2.1685064811697325e-06, + "loss": 0.3637, "step": 120805 }, { - "epoch": 4.25, - "learning_rate": 2.9005548417283897e-06, - "loss": 0.2458, + "epoch": 4.353984214509676, + "grad_norm": 0.2120153307914734, + "learning_rate": 2.1673178539360488e-06, + "loss": 0.3608, "step": 120810 }, { - "epoch": 4.25, - "learning_rate": 2.8992231257676974e-06, - 
"loss": 0.2482, + "epoch": 4.354164414170901, + "grad_norm": 0.26720255613327026, + "learning_rate": 2.1661295377932267e-06, + "loss": 0.3854, "step": 120815 }, { - "epoch": 4.25, - "learning_rate": 2.8978916967710297e-06, - "loss": 0.2641, + "epoch": 4.354344613832126, + "grad_norm": 0.22795835137367249, + "learning_rate": 2.164941532757475e-06, + "loss": 0.3628, "step": 120820 }, { - "epoch": 4.25, - "learning_rate": 2.8965605547556646e-06, - "loss": 0.2626, + "epoch": 4.3545248134933505, + "grad_norm": 0.33458447456359863, + "learning_rate": 2.1637538388449674e-06, + "loss": 0.3986, "step": 120825 }, { - "epoch": 4.25, - "learning_rate": 2.895229699738902e-06, - "loss": 0.26, + "epoch": 4.354705013154575, + "grad_norm": 0.2997223436832428, + "learning_rate": 2.1625664560719043e-06, + "loss": 0.3833, "step": 120830 }, { - "epoch": 4.25, - "learning_rate": 2.8938991317380065e-06, - "loss": 0.2568, + "epoch": 4.3548852128158, + "grad_norm": 0.2829420864582062, + "learning_rate": 2.1613793844544454e-06, + "loss": 0.3839, "step": 120835 }, { - "epoch": 4.25, - "learning_rate": 2.8925688507702692e-06, - "loss": 0.2586, + "epoch": 4.355065412477025, + "grad_norm": 0.2701405882835388, + "learning_rate": 2.160192624008767e-06, + "loss": 0.3709, "step": 120840 }, { - "epoch": 4.25, - "learning_rate": 2.8912388568529485e-06, - "loss": 0.2451, + "epoch": 4.3552456121382495, + "grad_norm": 0.3344108462333679, + "learning_rate": 2.1590061747510465e-06, + "loss": 0.3555, "step": 120845 }, { - "epoch": 4.25, - "learning_rate": 2.889909150003328e-06, - "loss": 0.2543, + "epoch": 4.355425811799474, + "grad_norm": 0.20211008191108704, + "learning_rate": 2.1578200366974386e-06, + "loss": 0.4124, "step": 120850 }, { - "epoch": 4.25, - "learning_rate": 2.8885797302386665e-06, - "loss": 0.2514, + "epoch": 4.355606011460699, + "grad_norm": 0.25280535221099854, + "learning_rate": 2.1566342098641224e-06, + "loss": 0.3437, "step": 120855 }, { - "epoch": 4.25, - "learning_rate": 2.8872505975762244e-06, - "loss": 0.2537, + "epoch": 4.355786211121923, + "grad_norm": 0.29722049832344055, + "learning_rate": 2.1554486942672367e-06, + "loss": 0.37, "step": 120860 }, { - "epoch": 4.25, - "learning_rate": 2.885921752033255e-06, - "loss": 0.248, + "epoch": 4.3559664107831475, + "grad_norm": 0.22865262627601624, + "learning_rate": 2.1542634899229337e-06, + "loss": 0.3594, "step": 120865 }, { - "epoch": 4.25, - "learning_rate": 2.8845931936270247e-06, - "loss": 0.2462, + "epoch": 4.356146610444372, + "grad_norm": 0.2604643702507019, + "learning_rate": 2.1530785968473744e-06, + "loss": 0.3679, "step": 120870 }, { - "epoch": 4.25, - "learning_rate": 2.883264922374776e-06, - "loss": 0.2396, + "epoch": 4.356326810105597, + "grad_norm": 0.23937147855758667, + "learning_rate": 2.1518940150566937e-06, + "loss": 0.3803, "step": 120875 }, { - "epoch": 4.25, - "learning_rate": 2.8819369382937555e-06, - "loss": 0.2467, + "epoch": 4.356507009766822, + "grad_norm": 0.2754795253276825, + "learning_rate": 2.1507097445670337e-06, + "loss": 0.3809, "step": 120880 }, { - "epoch": 4.25, - "learning_rate": 2.880609241401205e-06, - "loss": 0.2532, + "epoch": 4.356687209428046, + "grad_norm": 0.2818678617477417, + "learning_rate": 2.149525785394532e-06, + "loss": 0.3701, "step": 120885 }, { - "epoch": 4.25, - "learning_rate": 2.8792818317143698e-06, - "loss": 0.253, + "epoch": 4.356867409089271, + "grad_norm": 0.28676289319992065, + "learning_rate": 2.148342137555312e-06, + "loss": 0.3622, "step": 120890 }, { - "epoch": 4.25, - "learning_rate": 
2.877954709250477e-06, - "loss": 0.2502, + "epoch": 4.357047608750496, + "grad_norm": 0.23003120720386505, + "learning_rate": 2.1471588010655103e-06, + "loss": 0.3463, "step": 120895 }, { - "epoch": 4.25, - "learning_rate": 2.876627874026769e-06, - "loss": 0.2448, + "epoch": 4.35722780841172, + "grad_norm": 0.2534288465976715, + "learning_rate": 2.145975775941245e-06, + "loss": 0.359, "step": 120900 }, { - "epoch": 4.25, - "learning_rate": 2.875301326060462e-06, - "loss": 0.2594, + "epoch": 4.3574080080729445, + "grad_norm": 0.22617748379707336, + "learning_rate": 2.144793062198636e-06, + "loss": 0.3883, "step": 120905 }, { - "epoch": 4.25, - "learning_rate": 2.873975065368792e-06, - "loss": 0.2519, + "epoch": 4.357588207734169, + "grad_norm": 0.23090511560440063, + "learning_rate": 2.143610659853795e-06, + "loss": 0.3662, "step": 120910 }, { - "epoch": 4.25, - "learning_rate": 2.872649091968976e-06, - "loss": 0.2296, + "epoch": 4.357768407395394, + "grad_norm": 0.26390355825424194, + "learning_rate": 2.142428568922836e-06, + "loss": 0.4108, "step": 120915 }, { - "epoch": 4.25, - "learning_rate": 2.8713234058782207e-06, - "loss": 0.2255, + "epoch": 4.357948607056619, + "grad_norm": 0.2305891066789627, + "learning_rate": 2.141246789421855e-06, + "loss": 0.3628, "step": 120920 }, { - "epoch": 4.25, - "learning_rate": 2.8699980071137554e-06, - "loss": 0.26, + "epoch": 4.358128806717843, + "grad_norm": 0.29624882340431213, + "learning_rate": 2.1400653213669715e-06, + "loss": 0.4234, "step": 120925 }, { - "epoch": 4.25, - "learning_rate": 2.8686728956927813e-06, - "loss": 0.2547, + "epoch": 4.358309006379068, + "grad_norm": 0.24561457335948944, + "learning_rate": 2.138884164774263e-06, + "loss": 0.3441, "step": 120930 }, { - "epoch": 4.25, - "learning_rate": 2.867348071632503e-06, - "loss": 0.2509, + "epoch": 4.358489206040293, + "grad_norm": 0.30524396896362305, + "learning_rate": 2.1377033196598367e-06, + "loss": 0.3792, "step": 120935 }, { - "epoch": 4.26, - "learning_rate": 2.8660235349501213e-06, - "loss": 0.2581, + "epoch": 4.358669405701518, + "grad_norm": 0.22376736998558044, + "learning_rate": 2.1365227860397767e-06, + "loss": 0.3578, "step": 120940 }, { - "epoch": 4.26, - "learning_rate": 2.864699285662839e-06, - "loss": 0.2587, + "epoch": 4.358849605362742, + "grad_norm": 0.2799444794654846, + "learning_rate": 2.135342563930165e-06, + "loss": 0.396, "step": 120945 }, { - "epoch": 4.26, - "learning_rate": 2.863375323787848e-06, - "loss": 0.2342, + "epoch": 4.359029805023966, + "grad_norm": 0.26537925004959106, + "learning_rate": 2.1341626533470855e-06, + "loss": 0.397, "step": 120950 }, { - "epoch": 4.26, - "learning_rate": 2.8620516493423427e-06, - "loss": 0.2549, + "epoch": 4.359210004685191, + "grad_norm": 0.30358922481536865, + "learning_rate": 2.1329830543066144e-06, + "loss": 0.3729, "step": 120955 }, { - "epoch": 4.26, - "learning_rate": 2.8607282623435013e-06, - "loss": 0.2453, + "epoch": 4.359390204346416, + "grad_norm": 0.25468435883522034, + "learning_rate": 2.1318037668248165e-06, + "loss": 0.39, "step": 120960 }, { - "epoch": 4.26, - "learning_rate": 2.859405162808518e-06, - "loss": 0.2539, + "epoch": 4.35957040400764, + "grad_norm": 0.32225528359413147, + "learning_rate": 2.1306247909177677e-06, + "loss": 0.3589, "step": 120965 }, { - "epoch": 4.26, - "learning_rate": 2.858082350754568e-06, - "loss": 0.2283, + "epoch": 4.359750603668865, + "grad_norm": 0.28411632776260376, + "learning_rate": 2.1294461266015307e-06, + "loss": 0.3462, "step": 120970 }, { - "epoch": 4.26, - 
"learning_rate": 2.8567598261988267e-06, - "loss": 0.2295, + "epoch": 4.35993080333009, + "grad_norm": 0.23393802344799042, + "learning_rate": 2.128267773892162e-06, + "loss": 0.3878, "step": 120975 }, { - "epoch": 4.26, - "learning_rate": 2.8554375891584634e-06, - "loss": 0.2649, + "epoch": 4.360111002991315, + "grad_norm": 0.27324995398521423, + "learning_rate": 2.127089732805715e-06, + "loss": 0.4043, "step": 120980 }, { - "epoch": 4.26, - "learning_rate": 2.854115639650651e-06, - "loss": 0.2505, + "epoch": 4.360291202652539, + "grad_norm": 0.24664054811000824, + "learning_rate": 2.125912003358238e-06, + "loss": 0.3737, "step": 120985 }, { - "epoch": 4.26, - "learning_rate": 2.852793977692553e-06, - "loss": 0.2589, + "epoch": 4.360471402313763, + "grad_norm": 0.22538726031780243, + "learning_rate": 2.124734585565788e-06, + "loss": 0.3723, "step": 120990 }, { - "epoch": 4.26, - "learning_rate": 2.851472603301325e-06, - "loss": 0.2311, + "epoch": 4.360651601974988, + "grad_norm": 0.27115336060523987, + "learning_rate": 2.1235574794444046e-06, + "loss": 0.3447, "step": 120995 }, { - "epoch": 4.26, - "learning_rate": 2.8501515164941346e-06, - "loss": 0.2492, + "epoch": 4.360831801636213, + "grad_norm": 0.19281277060508728, + "learning_rate": 2.1223806850101114e-06, + "loss": 0.3592, "step": 121000 }, { - "epoch": 4.26, - "eval_loss": 0.2490115761756897, - "eval_runtime": 10.5538, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, + "epoch": 4.360831801636213, + "eval_loss": 0.42894721031188965, + "eval_runtime": 3.5365, + "eval_samples_per_second": 28.276, + "eval_steps_per_second": 7.069, "step": 121000 }, { - "epoch": 4.26, - "learning_rate": 2.848830717288123e-06, - "loss": 0.247, + "epoch": 4.361012001297437, + "grad_norm": 0.24042321741580963, + "learning_rate": 2.1212042022789562e-06, + "loss": 0.3635, "step": 121005 }, { - "epoch": 4.26, - "learning_rate": 2.8475102057004542e-06, - "loss": 0.2259, + "epoch": 4.361192200958662, + "grad_norm": 0.2706429660320282, + "learning_rate": 2.12002803126696e-06, + "loss": 0.3986, "step": 121010 }, { - "epoch": 4.26, - "learning_rate": 2.8461899817482625e-06, - "loss": 0.2403, + "epoch": 4.361372400619887, + "grad_norm": 0.21886184811592102, + "learning_rate": 2.1188521719901574e-06, + "loss": 0.328, "step": 121015 }, { - "epoch": 4.26, - "learning_rate": 2.8448700454486917e-06, - "loss": 0.2519, + "epoch": 4.361552600281112, + "grad_norm": 0.23916944861412048, + "learning_rate": 2.117676624464568e-06, + "loss": 0.3908, "step": 121020 }, { - "epoch": 4.26, - "learning_rate": 2.8435503968188863e-06, - "loss": 0.2418, + "epoch": 4.361732799942336, + "grad_norm": 0.24652530252933502, + "learning_rate": 2.1165013887061963e-06, + "loss": 0.3795, "step": 121025 }, { - "epoch": 4.26, - "learning_rate": 2.842231035875978e-06, - "loss": 0.2397, + "epoch": 4.361912999603561, + "grad_norm": 0.20936556160449982, + "learning_rate": 2.1153264647310654e-06, + "loss": 0.3649, "step": 121030 }, { - "epoch": 4.26, - "learning_rate": 2.8409119626370913e-06, - "loss": 0.2589, + "epoch": 4.362093199264786, + "grad_norm": 0.2558532953262329, + "learning_rate": 2.1141518525551766e-06, + "loss": 0.3569, "step": 121035 }, { - "epoch": 4.26, - "learning_rate": 2.839593177119365e-06, - "loss": 0.2376, + "epoch": 4.36227339892601, + "grad_norm": 0.2516135573387146, + "learning_rate": 2.1129775521945476e-06, + "loss": 0.3817, "step": 121040 }, { - "epoch": 4.26, - "learning_rate": 2.8382746793399166e-06, - "loss": 0.259, + "epoch": 4.362453598587234, + 
"grad_norm": 0.22439998388290405, + "learning_rate": 2.111803563665166e-06, + "loss": 0.3627, "step": 121045 }, { - "epoch": 4.26, - "learning_rate": 2.8369564693158684e-06, - "loss": 0.2618, + "epoch": 4.362633798248459, + "grad_norm": 0.2665523886680603, + "learning_rate": 2.110629886983023e-06, + "loss": 0.3555, "step": 121050 }, { - "epoch": 4.26, - "learning_rate": 2.835638547064329e-06, - "loss": 0.2712, + "epoch": 4.362813997909684, + "grad_norm": 0.19356732070446014, + "learning_rate": 2.109456522164119e-06, + "loss": 0.3745, "step": 121055 }, { - "epoch": 4.26, - "learning_rate": 2.8343209126024147e-06, - "loss": 0.2358, + "epoch": 4.362994197570909, + "grad_norm": 0.3014737069606781, + "learning_rate": 2.1082834692244424e-06, + "loss": 0.3489, "step": 121060 }, { - "epoch": 4.26, - "learning_rate": 2.83300356594724e-06, - "loss": 0.2489, + "epoch": 4.363174397232133, + "grad_norm": 0.21742402017116547, + "learning_rate": 2.1071107281799675e-06, + "loss": 0.3669, "step": 121065 }, { - "epoch": 4.26, - "learning_rate": 2.8316865071159078e-06, - "loss": 0.2154, + "epoch": 4.363354596893358, + "grad_norm": 0.23088960349559784, + "learning_rate": 2.105938299046678e-06, + "loss": 0.3678, "step": 121070 }, { - "epoch": 4.26, - "learning_rate": 2.83036973612551e-06, - "loss": 0.2405, + "epoch": 4.363534796554583, + "grad_norm": 0.2994691729545593, + "learning_rate": 2.1047661818405423e-06, + "loss": 0.4163, "step": 121075 }, { - "epoch": 4.26, - "learning_rate": 2.829053252993155e-06, - "loss": 0.2571, + "epoch": 4.3637149962158075, + "grad_norm": 0.22919879853725433, + "learning_rate": 2.103594376577542e-06, + "loss": 0.3619, "step": 121080 }, { - "epoch": 4.26, - "learning_rate": 2.8277370577359354e-06, - "loss": 0.2402, + "epoch": 4.363895195877031, + "grad_norm": 0.21188896894454956, + "learning_rate": 2.1024228832736332e-06, + "loss": 0.3756, "step": 121085 }, { - "epoch": 4.26, - "learning_rate": 2.8264211503709335e-06, - "loss": 0.2652, + "epoch": 4.364075395538256, + "grad_norm": 0.18373993039131165, + "learning_rate": 2.1012517019447786e-06, + "loss": 0.3687, "step": 121090 }, { - "epoch": 4.26, - "learning_rate": 2.8251055309152396e-06, - "loss": 0.2483, + "epoch": 4.364255595199481, + "grad_norm": 0.27971792221069336, + "learning_rate": 2.100080832606935e-06, + "loss": 0.4032, "step": 121095 }, { - "epoch": 4.26, - "learning_rate": 2.8237901993859396e-06, - "loss": 0.2549, + "epoch": 4.3644357948607055, + "grad_norm": 0.24079079926013947, + "learning_rate": 2.0989102752760587e-06, + "loss": 0.3679, "step": 121100 }, { - "epoch": 4.26, - "learning_rate": 2.8224751558001084e-06, - "loss": 0.2425, + "epoch": 4.36461599452193, + "grad_norm": 0.21578475832939148, + "learning_rate": 2.0977400299680927e-06, + "loss": 0.377, "step": 121105 }, { - "epoch": 4.26, - "learning_rate": 2.821160400174819e-06, - "loss": 0.2424, + "epoch": 4.364796194183155, + "grad_norm": 0.24278196692466736, + "learning_rate": 2.096570096698991e-06, + "loss": 0.3817, "step": 121110 }, { - "epoch": 4.26, - "learning_rate": 2.819845932527143e-06, - "loss": 0.2504, + "epoch": 4.36497639384438, + "grad_norm": 0.2502589523792267, + "learning_rate": 2.0954004754846795e-06, + "loss": 0.3749, "step": 121115 }, { - "epoch": 4.26, - "learning_rate": 2.818531752874157e-06, - "loss": 0.2361, + "epoch": 4.3651565935056045, + "grad_norm": 0.24439960718154907, + "learning_rate": 2.0942311663411095e-06, + "loss": 0.3807, "step": 121120 }, { - "epoch": 4.26, - "learning_rate": 2.817217861232918e-06, - "loss": 0.2291, + 
"epoch": 4.365336793166829, + "grad_norm": 0.2953266501426697, + "learning_rate": 2.093062169284202e-06, + "loss": 0.3785, "step": 121125 }, { - "epoch": 4.26, - "learning_rate": 2.815904257620483e-06, - "loss": 0.2484, + "epoch": 4.365516992828054, + "grad_norm": 0.21381710469722748, + "learning_rate": 2.091893484329882e-06, + "loss": 0.3554, "step": 121130 }, { - "epoch": 4.26, - "learning_rate": 2.8145909420539103e-06, - "loss": 0.2387, + "epoch": 4.365697192489278, + "grad_norm": 0.2584659457206726, + "learning_rate": 2.0907251114940907e-06, + "loss": 0.3765, "step": 121135 }, { - "epoch": 4.26, - "learning_rate": 2.813277914550258e-06, - "loss": 0.2293, + "epoch": 4.3658773921505025, + "grad_norm": 0.205842986702919, + "learning_rate": 2.089557050792726e-06, + "loss": 0.364, "step": 121140 }, { - "epoch": 4.26, - "learning_rate": 2.8119651751265687e-06, - "loss": 0.2186, + "epoch": 4.366057591811727, + "grad_norm": 0.22627507150173187, + "learning_rate": 2.088389302241708e-06, + "loss": 0.3628, "step": 121145 }, { - "epoch": 4.26, - "learning_rate": 2.8106527237998835e-06, - "loss": 0.2495, + "epoch": 4.366237791472952, + "grad_norm": 0.22469063103199005, + "learning_rate": 2.087221865856956e-06, + "loss": 0.328, "step": 121150 }, { - "epoch": 4.26, - "learning_rate": 2.8093405605872565e-06, - "loss": 0.2678, + "epoch": 4.366417991134177, + "grad_norm": 0.24385787546634674, + "learning_rate": 2.0860547416543697e-06, + "loss": 0.3852, "step": 121155 }, { - "epoch": 4.26, - "learning_rate": 2.8080286855057174e-06, - "loss": 0.2364, + "epoch": 4.3665981907954015, + "grad_norm": 0.2274542599916458, + "learning_rate": 2.084887929649848e-06, + "loss": 0.3781, "step": 121160 }, { - "epoch": 4.26, - "learning_rate": 2.806717098572295e-06, - "loss": 0.2825, + "epoch": 4.366778390456626, + "grad_norm": 0.31590333580970764, + "learning_rate": 2.0837214298592948e-06, + "loss": 0.3794, "step": 121165 }, { - "epoch": 4.26, - "learning_rate": 2.8054057998040255e-06, - "loss": 0.2593, + "epoch": 4.366958590117851, + "grad_norm": 0.24526174366474152, + "learning_rate": 2.082555242298595e-06, + "loss": 0.3573, "step": 121170 }, { - "epoch": 4.26, - "learning_rate": 2.8040947892179424e-06, - "loss": 0.2576, + "epoch": 4.367138789779075, + "grad_norm": 0.3067219853401184, + "learning_rate": 2.0813893669836495e-06, + "loss": 0.388, "step": 121175 }, { - "epoch": 4.26, - "learning_rate": 2.802784066831057e-06, - "loss": 0.2451, + "epoch": 4.3673189894402995, + "grad_norm": 0.21108821034431458, + "learning_rate": 2.080223803930334e-06, + "loss": 0.3336, "step": 121180 }, { - "epoch": 4.26, - "learning_rate": 2.801473632660395e-06, - "loss": 0.2642, + "epoch": 4.367499189101524, + "grad_norm": 0.312544584274292, + "learning_rate": 2.0790585531545336e-06, + "loss": 0.3726, "step": 121185 }, { - "epoch": 4.26, - "learning_rate": 2.800163486722962e-06, - "loss": 0.2485, + "epoch": 4.367679388762749, + "grad_norm": 0.2698698937892914, + "learning_rate": 2.077893614672122e-06, + "loss": 0.3531, "step": 121190 }, { - "epoch": 4.26, - "learning_rate": 2.79885362903578e-06, - "loss": 0.2436, + "epoch": 4.367859588423974, + "grad_norm": 0.2712307274341583, + "learning_rate": 2.076728988498966e-06, + "loss": 0.352, "step": 121195 }, { - "epoch": 4.26, - "learning_rate": 2.7975440596158533e-06, - "loss": 0.2518, + "epoch": 4.368039788085198, + "grad_norm": 0.2807067036628723, + "learning_rate": 2.0755646746509453e-06, + "loss": 0.3675, "step": 121200 }, { - "epoch": 4.26, - "learning_rate": 2.796234778480186e-06, - 
"loss": 0.2564, + "epoch": 4.368219987746423, + "grad_norm": 0.23370806872844696, + "learning_rate": 2.0744006731439225e-06, + "loss": 0.3685, "step": 121205 }, { - "epoch": 4.26, - "learning_rate": 2.7949257856457715e-06, - "loss": 0.281, + "epoch": 4.368400187407648, + "grad_norm": 0.30535852909088135, + "learning_rate": 2.0732369839937428e-06, + "loss": 0.3524, "step": 121210 }, { - "epoch": 4.26, - "learning_rate": 2.793617081129618e-06, - "loss": 0.2407, + "epoch": 4.368580387068873, + "grad_norm": 0.21354763209819794, + "learning_rate": 2.072073607216274e-06, + "loss": 0.3562, "step": 121215 }, { - "epoch": 4.26, - "learning_rate": 2.792308664948706e-06, - "loss": 0.2698, + "epoch": 4.368760586730097, + "grad_norm": 0.23636168241500854, + "learning_rate": 2.0709105428273647e-06, + "loss": 0.3307, "step": 121220 }, { - "epoch": 4.27, - "learning_rate": 2.7910005371200337e-06, - "loss": 0.2251, + "epoch": 4.368940786391321, + "grad_norm": 0.22024032473564148, + "learning_rate": 2.0697477908428602e-06, + "loss": 0.3996, "step": 121225 }, { - "epoch": 4.27, - "learning_rate": 2.7896926976605813e-06, - "loss": 0.2399, + "epoch": 4.369120986052546, + "grad_norm": 0.19595114886760712, + "learning_rate": 2.0685853512786013e-06, + "loss": 0.3299, "step": 121230 }, { - "epoch": 4.27, - "learning_rate": 2.7883851465873356e-06, - "loss": 0.2554, + "epoch": 4.369301185713771, + "grad_norm": 0.28025031089782715, + "learning_rate": 2.0674232241504223e-06, + "loss": 0.4299, "step": 121235 }, { - "epoch": 4.27, - "learning_rate": 2.7870778839172718e-06, - "loss": 0.2535, + "epoch": 4.369481385374995, + "grad_norm": 0.2547728419303894, + "learning_rate": 2.0662614094741683e-06, + "loss": 0.3679, "step": 121240 }, { - "epoch": 4.27, - "learning_rate": 2.7857709096673623e-06, - "loss": 0.261, + "epoch": 4.36966158503622, + "grad_norm": 0.22356577217578888, + "learning_rate": 2.0650999072656606e-06, + "loss": 0.3665, "step": 121245 }, { - "epoch": 4.27, - "learning_rate": 2.7844642238545717e-06, - "loss": 0.2599, + "epoch": 4.369841784697445, + "grad_norm": 0.2514247000217438, + "learning_rate": 2.063938717540728e-06, + "loss": 0.3732, "step": 121250 }, { - "epoch": 4.27, - "learning_rate": 2.78315782649588e-06, - "loss": 0.2503, + "epoch": 4.37002198435867, + "grad_norm": 0.2653522193431854, + "learning_rate": 2.0627778403151886e-06, + "loss": 0.364, "step": 121255 }, { - "epoch": 4.27, - "learning_rate": 2.7818517176082388e-06, - "loss": 0.2507, + "epoch": 4.370202184019894, + "grad_norm": 0.28731030225753784, + "learning_rate": 2.0616172756048573e-06, + "loss": 0.3676, "step": 121260 }, { - "epoch": 4.27, - "learning_rate": 2.7805458972086084e-06, - "loss": 0.2684, + "epoch": 4.370382383681119, + "grad_norm": 0.2413557469844818, + "learning_rate": 2.060457023425552e-06, + "loss": 0.3525, "step": 121265 }, { - "epoch": 4.27, - "learning_rate": 2.779240365313951e-06, - "loss": 0.246, + "epoch": 4.370562583342343, + "grad_norm": 0.27309590578079224, + "learning_rate": 2.0592970837930825e-06, + "loss": 0.3655, "step": 121270 }, { - "epoch": 4.27, - "learning_rate": 2.777935121941208e-06, - "loss": 0.2402, + "epoch": 4.370742783003568, + "grad_norm": 0.2248447686433792, + "learning_rate": 2.0581374567232443e-06, + "loss": 0.3994, "step": 121275 }, { - "epoch": 4.27, - "learning_rate": 2.7766301671073386e-06, - "loss": 0.2546, + "epoch": 4.370922982664792, + "grad_norm": 0.1932300329208374, + "learning_rate": 2.056978142231844e-06, + "loss": 0.3831, "step": 121280 }, { - "epoch": 4.27, - "learning_rate": 
2.7753255008292763e-06, - "loss": 0.2616, + "epoch": 4.371103182326017, + "grad_norm": 0.22570841014385223, + "learning_rate": 2.055819140334675e-06, + "loss": 0.3929, "step": 121285 }, { - "epoch": 4.27, - "learning_rate": 2.774021123123968e-06, - "loss": 0.2429, + "epoch": 4.371283381987242, + "grad_norm": 0.2672460973262787, + "learning_rate": 2.0546604510475213e-06, + "loss": 0.3756, "step": 121290 }, { - "epoch": 4.27, - "learning_rate": 2.7727170340083487e-06, - "loss": 0.2257, + "epoch": 4.371463581648467, + "grad_norm": 0.2674674689769745, + "learning_rate": 2.0535020743861822e-06, + "loss": 0.3614, "step": 121295 }, { - "epoch": 4.27, - "learning_rate": 2.771413233499351e-06, - "loss": 0.2453, + "epoch": 4.371643781309691, + "grad_norm": 0.2942127585411072, + "learning_rate": 2.0523440103664358e-06, + "loss": 0.3904, "step": 121300 }, { - "epoch": 4.27, - "learning_rate": 2.7701097216139006e-06, - "loss": 0.2324, + "epoch": 4.371823980970916, + "grad_norm": 0.210496187210083, + "learning_rate": 2.0511862590040566e-06, + "loss": 0.3949, "step": 121305 }, { - "epoch": 4.27, - "learning_rate": 2.7688064983689253e-06, - "loss": 0.247, + "epoch": 4.372004180632141, + "grad_norm": 0.2888723909854889, + "learning_rate": 2.0500288203148254e-06, + "loss": 0.3525, "step": 121310 }, { - "epoch": 4.27, - "learning_rate": 2.767503563781351e-06, - "loss": 0.2538, + "epoch": 4.372184380293365, + "grad_norm": 0.2629977762699127, + "learning_rate": 2.0488716943145025e-06, + "loss": 0.3809, "step": 121315 }, { - "epoch": 4.27, - "learning_rate": 2.766200917868089e-06, - "loss": 0.2301, + "epoch": 4.372364579954589, + "grad_norm": 0.2915158271789551, + "learning_rate": 2.047714881018867e-06, + "loss": 0.3755, "step": 121320 }, { - "epoch": 4.27, - "learning_rate": 2.764898560646051e-06, - "loss": 0.2277, + "epoch": 4.372544779615814, + "grad_norm": 0.2206728309392929, + "learning_rate": 2.0465583804436668e-06, + "loss": 0.3814, "step": 121325 }, { - "epoch": 4.27, - "learning_rate": 2.763596492132159e-06, - "loss": 0.2596, + "epoch": 4.372724979277039, + "grad_norm": 0.27409932017326355, + "learning_rate": 2.0454021926046645e-06, + "loss": 0.3721, "step": 121330 }, { - "epoch": 4.27, - "learning_rate": 2.762294712343305e-06, - "loss": 0.2412, + "epoch": 4.372905178938264, + "grad_norm": 0.2713205814361572, + "learning_rate": 2.0442463175176147e-06, + "loss": 0.3677, "step": 121335 }, { - "epoch": 4.27, - "learning_rate": 2.760993221296404e-06, - "loss": 0.2399, + "epoch": 4.373085378599488, + "grad_norm": 0.2275678962469101, + "learning_rate": 2.0430907551982654e-06, + "loss": 0.365, "step": 121340 }, { - "epoch": 4.27, - "learning_rate": 2.7596920190083445e-06, - "loss": 0.2316, + "epoch": 4.373265578260713, + "grad_norm": 0.2382095456123352, + "learning_rate": 2.0419355056623597e-06, + "loss": 0.3718, "step": 121345 }, { - "epoch": 4.27, - "learning_rate": 2.758391105496033e-06, - "loss": 0.2384, + "epoch": 4.373445777921938, + "grad_norm": 0.207700714468956, + "learning_rate": 2.0407805689256374e-06, + "loss": 0.3636, "step": 121350 }, { - "epoch": 4.27, - "learning_rate": 2.757090480776353e-06, - "loss": 0.2528, + "epoch": 4.3736259775831625, + "grad_norm": 0.2512191832065582, + "learning_rate": 2.0396259450038315e-06, + "loss": 0.3407, "step": 121355 }, { - "epoch": 4.27, - "learning_rate": 2.7557901448661964e-06, - "loss": 0.2692, + "epoch": 4.373806177244386, + "grad_norm": 0.24360640347003937, + "learning_rate": 2.038471633912678e-06, + "loss": 0.3854, "step": 121360 }, { - "epoch": 4.27, - 
"learning_rate": 2.7544900977824416e-06, - "loss": 0.251, + "epoch": 4.373986376905611, + "grad_norm": 0.2918420433998108, + "learning_rate": 2.0373176356679067e-06, + "loss": 0.3846, "step": 121365 }, { - "epoch": 4.27, - "learning_rate": 2.753190339541978e-06, - "loss": 0.2458, + "epoch": 4.374166576566836, + "grad_norm": 0.21557246148586273, + "learning_rate": 2.0361639502852358e-06, + "loss": 0.3759, "step": 121370 }, { - "epoch": 4.27, - "learning_rate": 2.751890870161675e-06, - "loss": 0.2533, + "epoch": 4.374346776228061, + "grad_norm": 0.25849926471710205, + "learning_rate": 2.0350105777803856e-06, + "loss": 0.3576, "step": 121375 }, { - "epoch": 4.27, - "learning_rate": 2.7505916896583993e-06, - "loss": 0.2551, + "epoch": 4.374526975889285, + "grad_norm": 0.19965314865112305, + "learning_rate": 2.033857518169066e-06, + "loss": 0.3659, "step": 121380 }, { - "epoch": 4.27, - "learning_rate": 2.7492927980490356e-06, - "loss": 0.2596, + "epoch": 4.37470717555051, + "grad_norm": 0.2662714421749115, + "learning_rate": 2.032704771466995e-06, + "loss": 0.3814, "step": 121385 }, { - "epoch": 4.27, - "learning_rate": 2.7479941953504366e-06, - "loss": 0.2748, + "epoch": 4.374887375211735, + "grad_norm": 0.2764718532562256, + "learning_rate": 2.0315523376898765e-06, + "loss": 0.3941, "step": 121390 }, { - "epoch": 4.27, - "learning_rate": 2.7466958815794723e-06, - "loss": 0.2447, + "epoch": 4.3750675748729595, + "grad_norm": 0.2914084494113922, + "learning_rate": 2.0304002168534065e-06, + "loss": 0.3853, "step": 121395 }, { - "epoch": 4.27, - "learning_rate": 2.745397856752993e-06, - "loss": 0.2598, + "epoch": 4.375247774534184, + "grad_norm": 0.29716944694519043, + "learning_rate": 2.029248408973289e-06, + "loss": 0.3849, "step": 121400 }, { - "epoch": 4.27, - "learning_rate": 2.7441001208878575e-06, - "loss": 0.2256, + "epoch": 4.375427974195409, + "grad_norm": 0.21052998304367065, + "learning_rate": 2.028096914065214e-06, + "loss": 0.3677, "step": 121405 }, { - "epoch": 4.27, - "learning_rate": 2.742802674000919e-06, - "loss": 0.2501, + "epoch": 4.375608173856633, + "grad_norm": 0.20469604432582855, + "learning_rate": 2.026945732144872e-06, + "loss": 0.3207, "step": 121410 }, { - "epoch": 4.27, - "learning_rate": 2.7415055161090166e-06, - "loss": 0.2437, + "epoch": 4.3757883735178575, + "grad_norm": 0.24874727427959442, + "learning_rate": 2.025794863227945e-06, + "loss": 0.3688, "step": 121415 }, { - "epoch": 4.27, - "learning_rate": 2.740208647228992e-06, - "loss": 0.247, + "epoch": 4.375968573179082, + "grad_norm": 0.2315155416727066, + "learning_rate": 2.024644307330112e-06, + "loss": 0.3849, "step": 121420 }, { - "epoch": 4.27, - "learning_rate": 2.738912067377694e-06, - "loss": 0.2348, + "epoch": 4.376148772840307, + "grad_norm": 0.27341365814208984, + "learning_rate": 2.0234940644670576e-06, + "loss": 0.3887, "step": 121425 }, { - "epoch": 4.27, - "learning_rate": 2.737615776571953e-06, - "loss": 0.2418, + "epoch": 4.376328972501532, + "grad_norm": 0.2433520257472992, + "learning_rate": 2.0223441346544468e-06, + "loss": 0.3943, "step": 121430 }, { - "epoch": 4.27, - "learning_rate": 2.7363197748285942e-06, - "loss": 0.2489, + "epoch": 4.3765091721627565, + "grad_norm": 0.24576754868030548, + "learning_rate": 2.0211945179079483e-06, + "loss": 0.371, "step": 121435 }, { - "epoch": 4.27, - "learning_rate": 2.735024062164454e-06, - "loss": 0.2592, + "epoch": 4.376689371823981, + "grad_norm": 0.19979894161224365, + "learning_rate": 2.020045214243227e-06, + "loss": 0.3891, "step": 121440 
}, { - "epoch": 4.27, - "learning_rate": 2.7337286385963503e-06, - "loss": 0.2512, + "epoch": 4.376869571485206, + "grad_norm": 0.27705785632133484, + "learning_rate": 2.0188962236759342e-06, + "loss": 0.3707, "step": 121445 }, { - "epoch": 4.27, - "learning_rate": 2.7324335041411105e-06, - "loss": 0.2565, + "epoch": 4.37704977114643, + "grad_norm": 0.26884958148002625, + "learning_rate": 2.0177475462217376e-06, + "loss": 0.3791, "step": 121450 }, { - "epoch": 4.27, - "learning_rate": 2.7311386588155464e-06, - "loss": 0.2642, + "epoch": 4.3772299708076545, + "grad_norm": 0.26522353291511536, + "learning_rate": 2.0165991818962784e-06, + "loss": 0.3957, "step": 121455 }, { - "epoch": 4.27, - "learning_rate": 2.7298441026364667e-06, - "loss": 0.2381, + "epoch": 4.377410170468879, + "grad_norm": 0.2895340323448181, + "learning_rate": 2.0154511307152066e-06, + "loss": 0.3571, "step": 121460 }, { - "epoch": 4.27, - "learning_rate": 2.728549835620689e-06, - "loss": 0.2478, + "epoch": 4.377590370130104, + "grad_norm": 0.23953476548194885, + "learning_rate": 2.014303392694164e-06, + "loss": 0.361, "step": 121465 }, { - "epoch": 4.27, - "learning_rate": 2.727255857785016e-06, - "loss": 0.255, + "epoch": 4.377770569791329, + "grad_norm": 0.26138606667518616, + "learning_rate": 2.01315596784879e-06, + "loss": 0.3653, "step": 121470 }, { - "epoch": 4.27, - "learning_rate": 2.7259621691462454e-06, - "loss": 0.2477, + "epoch": 4.3779507694525535, + "grad_norm": 0.2306900918483734, + "learning_rate": 2.0120088561947082e-06, + "loss": 0.3363, "step": 121475 }, { - "epoch": 4.27, - "learning_rate": 2.7246687697211728e-06, - "loss": 0.2686, + "epoch": 4.378130969113778, + "grad_norm": 0.24305300414562225, + "learning_rate": 2.010862057747559e-06, + "loss": 0.394, "step": 121480 }, { - "epoch": 4.27, - "learning_rate": 2.7233756595266003e-06, - "loss": 0.2574, + "epoch": 4.378311168775003, + "grad_norm": 0.2668958604335785, + "learning_rate": 2.0097155725229634e-06, + "loss": 0.3817, "step": 121485 }, { - "epoch": 4.27, - "learning_rate": 2.722082838579315e-06, - "loss": 0.2669, + "epoch": 4.378491368436228, + "grad_norm": 0.23452064394950867, + "learning_rate": 2.0085694005365446e-06, + "loss": 0.361, "step": 121490 }, { - "epoch": 4.27, - "learning_rate": 2.7207903068960982e-06, - "loss": 0.2532, + "epoch": 4.378671568097452, + "grad_norm": 0.2706201672554016, + "learning_rate": 2.007423541803913e-06, + "loss": 0.3506, "step": 121495 }, { - "epoch": 4.27, - "learning_rate": 2.7194980644937335e-06, - "loss": 0.2605, + "epoch": 4.378851767758676, + "grad_norm": 0.2533268332481384, + "learning_rate": 2.006277996340683e-06, + "loss": 0.3454, "step": 121500 }, { - "epoch": 4.27, - "eval_loss": 0.2489774525165558, - "eval_runtime": 10.5356, - "eval_samples_per_second": 9.492, - "eval_steps_per_second": 9.492, + "epoch": 4.378851767758676, + "eval_loss": 0.42894676327705383, + "eval_runtime": 3.5453, + "eval_samples_per_second": 28.206, + "eval_steps_per_second": 7.052, "step": 121500 }, { - "epoch": 4.27, - "learning_rate": 2.71820611138901e-06, - "loss": 0.2496, + "epoch": 4.379031967419901, + "grad_norm": 0.23799777030944824, + "learning_rate": 2.0051327641624706e-06, + "loss": 0.3548, "step": 121505 }, { - "epoch": 4.28, - "learning_rate": 2.716914447598698e-06, - "loss": 0.2578, + "epoch": 4.379212167081126, + "grad_norm": 0.302617609500885, + "learning_rate": 2.0039878452848687e-06, + "loss": 0.3609, "step": 121510 }, { - "epoch": 4.28, - "learning_rate": 2.715623073139559e-06, - "loss": 0.2563, + "epoch": 
4.37939236674235, + "grad_norm": 0.21825525164604187, + "learning_rate": 2.0028432397234758e-06, + "loss": 0.3464, "step": 121515 }, { - "epoch": 4.28, - "learning_rate": 2.7143319880283735e-06, - "loss": 0.2564, + "epoch": 4.379572566403575, + "grad_norm": 0.26662659645080566, + "learning_rate": 2.0016989474938934e-06, + "loss": 0.3305, "step": 121520 }, { - "epoch": 4.28, - "learning_rate": 2.713041192281901e-06, - "loss": 0.2646, + "epoch": 4.3797527660648, + "grad_norm": 0.31008052825927734, + "learning_rate": 2.000554968611712e-06, + "loss": 0.3879, "step": 121525 }, { - "epoch": 4.28, - "learning_rate": 2.7117506859169028e-06, - "loss": 0.254, + "epoch": 4.379932965726025, + "grad_norm": 0.2644076943397522, + "learning_rate": 1.9994113030925188e-06, + "loss": 0.3359, "step": 121530 }, { - "epoch": 4.28, - "learning_rate": 2.710460468950127e-06, - "loss": 0.2599, + "epoch": 4.380113165387249, + "grad_norm": 0.25222399830818176, + "learning_rate": 1.9982679509518905e-06, + "loss": 0.3419, "step": 121535 }, { - "epoch": 4.28, - "learning_rate": 2.709170541398337e-06, - "loss": 0.2422, + "epoch": 4.380293365048474, + "grad_norm": 0.2816098928451538, + "learning_rate": 1.997124912205403e-06, + "loss": 0.3983, "step": 121540 }, { - "epoch": 4.28, - "learning_rate": 2.707880903278276e-06, - "loss": 0.2631, + "epoch": 4.380473564709698, + "grad_norm": 0.5892481803894043, + "learning_rate": 1.9959821868686416e-06, + "loss": 0.3839, "step": 121545 }, { - "epoch": 4.28, - "learning_rate": 2.7065915546066883e-06, - "loss": 0.2415, + "epoch": 4.380653764370923, + "grad_norm": 0.2820974886417389, + "learning_rate": 1.994839774957169e-06, + "loss": 0.3707, "step": 121550 }, { - "epoch": 4.28, - "learning_rate": 2.7053024954003163e-06, - "loss": 0.2481, + "epoch": 4.380833964032147, + "grad_norm": 0.24034863710403442, + "learning_rate": 1.9936976764865502e-06, + "loss": 0.3653, "step": 121555 }, { - "epoch": 4.28, - "learning_rate": 2.7040137256759022e-06, - "loss": 0.2766, + "epoch": 4.381014163693372, + "grad_norm": 0.2928817570209503, + "learning_rate": 1.9925558914723475e-06, + "loss": 0.3839, "step": 121560 }, { - "epoch": 4.28, - "learning_rate": 2.7027252454501772e-06, - "loss": 0.2354, + "epoch": 4.381194363354597, + "grad_norm": 0.2788287401199341, + "learning_rate": 1.99141441993011e-06, + "loss": 0.3397, "step": 121565 }, { - "epoch": 4.28, - "learning_rate": 2.701437054739872e-06, - "loss": 0.2468, + "epoch": 4.381374563015822, + "grad_norm": 0.2363235503435135, + "learning_rate": 1.9902732618753998e-06, + "loss": 0.3395, "step": 121570 }, { - "epoch": 4.28, - "learning_rate": 2.700149153561704e-06, - "loss": 0.2653, + "epoch": 4.381554762677046, + "grad_norm": 0.2198476344347, + "learning_rate": 1.989132417323769e-06, + "loss": 0.3649, "step": 121575 }, { - "epoch": 4.28, - "learning_rate": 2.69886154193241e-06, - "loss": 0.2803, + "epoch": 4.381734962338271, + "grad_norm": 0.2580127716064453, + "learning_rate": 1.987991886290744e-06, + "loss": 0.3786, "step": 121580 }, { - "epoch": 4.28, - "learning_rate": 2.697574219868701e-06, - "loss": 0.256, + "epoch": 4.381915161999496, + "grad_norm": 0.225581094622612, + "learning_rate": 1.986851668791878e-06, + "loss": 0.3706, "step": 121585 }, { - "epoch": 4.28, - "learning_rate": 2.696287187387292e-06, - "loss": 0.2505, + "epoch": 4.38209536166072, + "grad_norm": 0.22968202829360962, + "learning_rate": 1.9857117648427044e-06, + "loss": 0.3682, "step": 121590 }, { - "epoch": 4.28, - "learning_rate": 2.6950004445048917e-06, - "loss": 0.2512, + 
"epoch": 4.382275561321944, + "grad_norm": 0.26144370436668396, + "learning_rate": 1.984572174458746e-06, + "loss": 0.3679, "step": 121595 }, { - "epoch": 4.28, - "learning_rate": 2.6937139912382142e-06, - "loss": 0.255, + "epoch": 4.382455760983169, + "grad_norm": 0.2481224089860916, + "learning_rate": 1.983432897655546e-06, + "loss": 0.3604, "step": 121600 }, { - "epoch": 4.28, - "learning_rate": 2.692427827603958e-06, - "loss": 0.2581, + "epoch": 4.382635960644394, + "grad_norm": 0.24234378337860107, + "learning_rate": 1.9822939344486063e-06, + "loss": 0.389, "step": 121605 }, { - "epoch": 4.28, - "learning_rate": 2.6911419536188228e-06, - "loss": 0.2419, + "epoch": 4.382816160305619, + "grad_norm": 0.27252396941185, + "learning_rate": 1.9811552848534587e-06, + "loss": 0.3839, "step": 121610 }, { - "epoch": 4.28, - "learning_rate": 2.689856369299512e-06, - "loss": 0.2618, + "epoch": 4.382996359966843, + "grad_norm": 0.30419376492500305, + "learning_rate": 1.980016948885616e-06, + "loss": 0.3901, "step": 121615 }, { - "epoch": 4.28, - "learning_rate": 2.688571074662716e-06, - "loss": 0.2373, + "epoch": 4.383176559628068, + "grad_norm": 0.3380153477191925, + "learning_rate": 1.9788789265605844e-06, + "loss": 0.3617, "step": 121620 }, { - "epoch": 4.28, - "learning_rate": 2.68728606972512e-06, - "loss": 0.2525, + "epoch": 4.383356759289293, + "grad_norm": 0.23734219372272491, + "learning_rate": 1.9777412178938722e-06, + "loss": 0.3463, "step": 121625 }, { - "epoch": 4.28, - "learning_rate": 2.6860013545034034e-06, - "loss": 0.2449, + "epoch": 4.3835369589505175, + "grad_norm": 0.2698742747306824, + "learning_rate": 1.976603822900974e-06, + "loss": 0.3741, "step": 121630 }, { - "epoch": 4.28, - "learning_rate": 2.6847169290142583e-06, - "loss": 0.2477, + "epoch": 4.383717158611741, + "grad_norm": 0.2732264995574951, + "learning_rate": 1.9754667415973953e-06, + "loss": 0.3122, "step": 121635 }, { - "epoch": 4.28, - "learning_rate": 2.6834327932743598e-06, - "loss": 0.2501, + "epoch": 4.383897358272966, + "grad_norm": 0.2620375156402588, + "learning_rate": 1.974329973998623e-06, + "loss": 0.3475, "step": 121640 }, { - "epoch": 4.28, - "learning_rate": 2.682148947300378e-06, - "loss": 0.2325, + "epoch": 4.384077557934191, + "grad_norm": 0.26369425654411316, + "learning_rate": 1.973193520120148e-06, + "loss": 0.3816, "step": 121645 }, { - "epoch": 4.28, - "learning_rate": 2.6808653911089775e-06, - "loss": 0.2497, + "epoch": 4.384257757595416, + "grad_norm": 0.26098334789276123, + "learning_rate": 1.9720573799774542e-06, + "loss": 0.3592, "step": 121650 }, { - "epoch": 4.28, - "learning_rate": 2.6795821247168394e-06, - "loss": 0.2622, + "epoch": 4.38443795725664, + "grad_norm": 0.20638631284236908, + "learning_rate": 1.970921553586019e-06, + "loss": 0.3724, "step": 121655 }, { - "epoch": 4.28, - "learning_rate": 2.678299148140609e-06, - "loss": 0.2621, + "epoch": 4.384618156917865, + "grad_norm": 0.24286890029907227, + "learning_rate": 1.9697860409613133e-06, + "loss": 0.3677, "step": 121660 }, { - "epoch": 4.28, - "learning_rate": 2.677016461396961e-06, - "loss": 0.2589, + "epoch": 4.38479835657909, + "grad_norm": 0.2833465039730072, + "learning_rate": 1.9686508421188214e-06, + "loss": 0.4158, "step": 121665 }, { - "epoch": 4.28, - "learning_rate": 2.6757340645025387e-06, - "loss": 0.2481, + "epoch": 4.3849785562403145, + "grad_norm": 0.2680220901966095, + "learning_rate": 1.967515957074001e-06, + "loss": 0.3728, "step": 121670 }, { - "epoch": 4.28, - "learning_rate": 2.674451957474e-06, - 
"loss": 0.2638, + "epoch": 4.385158755901539, + "grad_norm": 0.26703187823295593, + "learning_rate": 1.9663813858423167e-06, + "loss": 0.3861, "step": 121675 }, { - "epoch": 4.28, - "learning_rate": 2.673170140327991e-06, - "loss": 0.268, + "epoch": 4.385338955562764, + "grad_norm": 0.2260323315858841, + "learning_rate": 1.9652471284392265e-06, + "loss": 0.3477, "step": 121680 }, { - "epoch": 4.28, - "learning_rate": 2.671888613081153e-06, - "loss": 0.2838, + "epoch": 4.385519155223988, + "grad_norm": 0.23601694405078888, + "learning_rate": 1.9641131848801785e-06, + "loss": 0.4099, "step": 121685 }, { - "epoch": 4.28, - "learning_rate": 2.6706073757501227e-06, - "loss": 0.2576, + "epoch": 4.385699354885213, + "grad_norm": 0.26613062620162964, + "learning_rate": 1.9629795551806414e-06, + "loss": 0.3624, "step": 121690 }, { - "epoch": 4.28, - "learning_rate": 2.669326428351543e-06, - "loss": 0.2445, + "epoch": 4.385879554546437, + "grad_norm": 0.23017358779907227, + "learning_rate": 1.961846239356038e-06, + "loss": 0.3831, "step": 121695 }, { - "epoch": 4.28, - "learning_rate": 2.6680457709020447e-06, - "loss": 0.2423, + "epoch": 4.386059754207662, + "grad_norm": 0.3070394992828369, + "learning_rate": 1.9607132374218184e-06, + "loss": 0.349, "step": 121700 }, { - "epoch": 4.28, - "learning_rate": 2.666765403418256e-06, - "loss": 0.2709, + "epoch": 4.386239953868887, + "grad_norm": 0.22875505685806274, + "learning_rate": 1.959580549393425e-06, + "loss": 0.3717, "step": 121705 }, { - "epoch": 4.28, - "learning_rate": 2.6654853259167943e-06, - "loss": 0.2508, + "epoch": 4.3864201535301115, + "grad_norm": 0.3101567327976227, + "learning_rate": 1.9584481752862816e-06, + "loss": 0.3494, "step": 121710 }, { - "epoch": 4.28, - "learning_rate": 2.6642055384142857e-06, - "loss": 0.2431, + "epoch": 4.386600353191336, + "grad_norm": 0.2821701467037201, + "learning_rate": 1.9573161151158288e-06, + "loss": 0.3666, "step": 121715 }, { - "epoch": 4.28, - "learning_rate": 2.662926040927352e-06, - "loss": 0.2804, + "epoch": 4.386780552852561, + "grad_norm": 0.2166757434606552, + "learning_rate": 1.9561843688974764e-06, + "loss": 0.4189, "step": 121720 }, { - "epoch": 4.28, - "learning_rate": 2.661646833472606e-06, - "loss": 0.2608, + "epoch": 4.386960752513785, + "grad_norm": 0.24579738080501556, + "learning_rate": 1.955052936646648e-06, + "loss": 0.3535, "step": 121725 }, { - "epoch": 4.28, - "learning_rate": 2.660367916066647e-06, - "loss": 0.2494, + "epoch": 4.3871409521750095, + "grad_norm": 0.2603980600833893, + "learning_rate": 1.953921818378768e-06, + "loss": 0.3511, "step": 121730 }, { - "epoch": 4.28, - "learning_rate": 2.6590892887260963e-06, - "loss": 0.232, + "epoch": 4.387321151836234, + "grad_norm": 0.23183012008666992, + "learning_rate": 1.952791014109237e-06, + "loss": 0.3724, "step": 121735 }, { - "epoch": 4.28, - "learning_rate": 2.657810951467546e-06, - "loss": 0.2577, + "epoch": 4.387501351497459, + "grad_norm": 0.1970612108707428, + "learning_rate": 1.9516605238534686e-06, + "loss": 0.3725, "step": 121740 }, { - "epoch": 4.28, - "learning_rate": 2.6565329043075905e-06, - "loss": 0.2576, + "epoch": 4.387681551158684, + "grad_norm": 0.283931702375412, + "learning_rate": 1.950530347626864e-06, + "loss": 0.371, "step": 121745 }, { - "epoch": 4.28, - "learning_rate": 2.6552551472628363e-06, - "loss": 0.253, + "epoch": 4.3878617508199085, + "grad_norm": 0.32697275280952454, + "learning_rate": 1.9494004854448163e-06, + "loss": 0.3709, "step": 121750 }, { - "epoch": 4.28, - "learning_rate": 
2.65397768034987e-06, - "loss": 0.2486, + "epoch": 4.388041950481133, + "grad_norm": 0.2545589506626129, + "learning_rate": 1.948270937322727e-06, + "loss": 0.3777, "step": 121755 }, { - "epoch": 4.28, - "learning_rate": 2.652700503585276e-06, - "loss": 0.2248, + "epoch": 4.388222150142358, + "grad_norm": 0.2229977399110794, + "learning_rate": 1.947141703275987e-06, + "loss": 0.3423, "step": 121760 }, { - "epoch": 4.28, - "learning_rate": 2.651423616985632e-06, - "loss": 0.2486, + "epoch": 4.388402349803583, + "grad_norm": 0.2144192010164261, + "learning_rate": 1.9460127833199756e-06, + "loss": 0.3872, "step": 121765 }, { - "epoch": 4.28, - "learning_rate": 2.6501470205675283e-06, - "loss": 0.2499, + "epoch": 4.388582549464807, + "grad_norm": 0.25160834193229675, + "learning_rate": 1.9448841774700744e-06, + "loss": 0.4037, "step": 121770 }, { - "epoch": 4.28, - "learning_rate": 2.6488707143475343e-06, - "loss": 0.2508, + "epoch": 4.388762749126031, + "grad_norm": 0.2952007055282593, + "learning_rate": 1.943755885741666e-06, + "loss": 0.3825, "step": 121775 }, { - "epoch": 4.28, - "learning_rate": 2.647594698342229e-06, - "loss": 0.2659, + "epoch": 4.388942948787256, + "grad_norm": 0.25669974088668823, + "learning_rate": 1.942627908150113e-06, + "loss": 0.3635, "step": 121780 }, { - "epoch": 4.28, - "learning_rate": 2.6463189725681683e-06, - "loss": 0.2229, + "epoch": 4.389123148448481, + "grad_norm": 0.28366559743881226, + "learning_rate": 1.9415002447108004e-06, + "loss": 0.3789, "step": 121785 }, { - "epoch": 4.28, - "learning_rate": 2.64504353704193e-06, - "loss": 0.2552, + "epoch": 4.3893033481097055, + "grad_norm": 0.23855380713939667, + "learning_rate": 1.9403728954390745e-06, + "loss": 0.4061, "step": 121790 }, { - "epoch": 4.29, - "learning_rate": 2.643768391780069e-06, - "loss": 0.2331, + "epoch": 4.38948354777093, + "grad_norm": 0.2798866033554077, + "learning_rate": 1.939245860350308e-06, + "loss": 0.3814, "step": 121795 }, { - "epoch": 4.29, - "learning_rate": 2.6424935367991425e-06, - "loss": 0.2623, + "epoch": 4.389663747432155, + "grad_norm": 0.2672715485095978, + "learning_rate": 1.9381191394598514e-06, + "loss": 0.3589, "step": 121800 }, { - "epoch": 4.29, - "learning_rate": 2.6412189721157e-06, - "loss": 0.2548, + "epoch": 4.38984394709338, + "grad_norm": 0.22907915711402893, + "learning_rate": 1.936992732783058e-06, + "loss": 0.3436, "step": 121805 }, { - "epoch": 4.29, - "learning_rate": 2.6399446977462968e-06, - "loss": 0.2427, + "epoch": 4.390024146754604, + "grad_norm": 0.3034953474998474, + "learning_rate": 1.935866640335271e-06, + "loss": 0.3956, "step": 121810 }, { - "epoch": 4.29, - "learning_rate": 2.638670713707475e-06, - "loss": 0.2621, + "epoch": 4.390204346415829, + "grad_norm": 0.21499215066432953, + "learning_rate": 1.934740862131837e-06, + "loss": 0.3986, "step": 121815 }, { - "epoch": 4.29, - "learning_rate": 2.637397020015775e-06, - "loss": 0.2476, + "epoch": 4.390384546077053, + "grad_norm": 0.2739037275314331, + "learning_rate": 1.9336153981880906e-06, + "loss": 0.3885, "step": 121820 }, { - "epoch": 4.29, - "learning_rate": 2.6361236166877414e-06, - "loss": 0.2595, + "epoch": 4.390564745738278, + "grad_norm": 0.21741308271884918, + "learning_rate": 1.9324902485193696e-06, + "loss": 0.3895, "step": 121825 }, { - "epoch": 4.29, - "learning_rate": 2.6348505037398995e-06, - "loss": 0.2507, + "epoch": 4.390744945399502, + "grad_norm": 0.24937617778778076, + "learning_rate": 1.9313654131410035e-06, + "loss": 0.3741, "step": 121830 }, { - "epoch": 4.29, - 
"learning_rate": 2.633577681188787e-06, - "loss": 0.2412, + "epoch": 4.390925145060727, + "grad_norm": 0.2421903759241104, + "learning_rate": 1.930240892068319e-06, + "loss": 0.3458, "step": 121835 }, { - "epoch": 4.29, - "learning_rate": 2.6323051490509314e-06, - "loss": 0.2364, + "epoch": 4.391105344721952, + "grad_norm": 0.3107787072658539, + "learning_rate": 1.929116685316634e-06, + "loss": 0.3806, "step": 121840 }, { - "epoch": 4.29, - "learning_rate": 2.6310329073428454e-06, - "loss": 0.2592, + "epoch": 4.391285544383177, + "grad_norm": 0.23685042560100555, + "learning_rate": 1.9279927929012648e-06, + "loss": 0.3384, "step": 121845 }, { - "epoch": 4.29, - "learning_rate": 2.629760956081062e-06, - "loss": 0.2489, + "epoch": 4.391465744044401, + "grad_norm": 0.27006104588508606, + "learning_rate": 1.926869214837532e-06, + "loss": 0.3696, "step": 121850 }, { - "epoch": 4.29, - "learning_rate": 2.628489295282091e-06, - "loss": 0.2743, + "epoch": 4.391645943705626, + "grad_norm": 0.23979541659355164, + "learning_rate": 1.9257459511407366e-06, + "loss": 0.3836, "step": 121855 }, { - "epoch": 4.29, - "learning_rate": 2.6272179249624355e-06, - "loss": 0.2614, + "epoch": 4.391826143366851, + "grad_norm": 0.2638680040836334, + "learning_rate": 1.9246230018261867e-06, + "loss": 0.3689, "step": 121860 }, { - "epoch": 4.29, - "learning_rate": 2.625946845138616e-06, - "loss": 0.2532, + "epoch": 4.392006343028076, + "grad_norm": 0.23276658356189728, + "learning_rate": 1.923500366909181e-06, + "loss": 0.3862, "step": 121865 }, { - "epoch": 4.29, - "learning_rate": 2.624676055827133e-06, - "loss": 0.2724, + "epoch": 4.392186542689299, + "grad_norm": 0.3164710998535156, + "learning_rate": 1.9223780464050103e-06, + "loss": 0.3862, "step": 121870 }, { - "epoch": 4.29, - "learning_rate": 2.623405557044481e-06, - "loss": 0.2745, + "epoch": 4.392366742350524, + "grad_norm": 0.2991268038749695, + "learning_rate": 1.9212560403289752e-06, + "loss": 0.4063, "step": 121875 }, { - "epoch": 4.29, - "learning_rate": 2.622135348807164e-06, - "loss": 0.2399, + "epoch": 4.392546942011749, + "grad_norm": 0.2558140456676483, + "learning_rate": 1.9201343486963646e-06, + "loss": 0.3915, "step": 121880 }, { - "epoch": 4.29, - "learning_rate": 2.6208654311316694e-06, - "loss": 0.2491, + "epoch": 4.392727141672974, + "grad_norm": 0.2522118389606476, + "learning_rate": 1.9190129715224435e-06, + "loss": 0.3721, "step": 121885 }, { - "epoch": 4.29, - "learning_rate": 2.619595804034494e-06, - "loss": 0.2312, + "epoch": 4.392907341334198, + "grad_norm": 0.22641310095787048, + "learning_rate": 1.9178919088225086e-06, + "loss": 0.3486, "step": 121890 }, { - "epoch": 4.29, - "learning_rate": 2.6183264675321145e-06, - "loss": 0.2636, + "epoch": 4.393087540995423, + "grad_norm": 0.19269070029258728, + "learning_rate": 1.9167711606118216e-06, + "loss": 0.3699, "step": 121895 }, { - "epoch": 4.29, - "learning_rate": 2.617057421641014e-06, - "loss": 0.2386, + "epoch": 4.393267740656648, + "grad_norm": 0.24431343376636505, + "learning_rate": 1.9156507269056683e-06, + "loss": 0.3885, "step": 121900 }, { - "epoch": 4.29, - "learning_rate": 2.615788666377675e-06, - "loss": 0.2337, + "epoch": 4.393447940317873, + "grad_norm": 0.25327545404434204, + "learning_rate": 1.9145306077192976e-06, + "loss": 0.382, "step": 121905 }, { - "epoch": 4.29, - "learning_rate": 2.614520201758569e-06, - "loss": 0.2488, + "epoch": 4.393628139979096, + "grad_norm": 0.24400508403778076, + "learning_rate": 1.9134108030679774e-06, + "loss": 0.3645, "step": 121910 
}, { - "epoch": 4.29, - "learning_rate": 2.6132520278001672e-06, - "loss": 0.2382, + "epoch": 4.393808339640321, + "grad_norm": 0.23288530111312866, + "learning_rate": 1.9122913129669657e-06, + "loss": 0.3578, "step": 121915 }, { - "epoch": 4.29, - "learning_rate": 2.611984144518928e-06, - "loss": 0.2425, + "epoch": 4.393988539301546, + "grad_norm": 0.2210901528596878, + "learning_rate": 1.911172137431519e-06, + "loss": 0.3833, "step": 121920 }, { - "epoch": 4.29, - "learning_rate": 2.610716551931325e-06, - "loss": 0.2491, + "epoch": 4.394168738962771, + "grad_norm": 0.2453107088804245, + "learning_rate": 1.910053276476878e-06, + "loss": 0.3617, "step": 121925 }, { - "epoch": 4.29, - "learning_rate": 2.609449250053814e-06, - "loss": 0.2611, + "epoch": 4.394348938623995, + "grad_norm": 0.2022264003753662, + "learning_rate": 1.9089347301182945e-06, + "loss": 0.3762, "step": 121930 }, { - "epoch": 4.29, - "learning_rate": 2.6081822389028427e-06, - "loss": 0.2375, + "epoch": 4.39452913828522, + "grad_norm": 0.23200011253356934, + "learning_rate": 1.9078164983709984e-06, + "loss": 0.3415, "step": 121935 }, { - "epoch": 4.29, - "learning_rate": 2.6069155184948677e-06, - "loss": 0.2616, + "epoch": 4.394709337946445, + "grad_norm": 0.24369412660598755, + "learning_rate": 1.9066985812502353e-06, + "loss": 0.3742, "step": 121940 }, { - "epoch": 4.29, - "learning_rate": 2.605649088846343e-06, - "loss": 0.2383, + "epoch": 4.3948895376076695, + "grad_norm": 0.27759307622909546, + "learning_rate": 1.9055809787712348e-06, + "loss": 0.3759, "step": 121945 }, { - "epoch": 4.29, - "learning_rate": 2.604382949973705e-06, - "loss": 0.2471, + "epoch": 4.395069737268894, + "grad_norm": 0.28923094272613525, + "learning_rate": 1.9044636909492208e-06, + "loss": 0.3658, "step": 121950 }, { - "epoch": 4.29, - "learning_rate": 2.603117101893396e-06, - "loss": 0.2555, + "epoch": 4.395249936930119, + "grad_norm": 0.3017236888408661, + "learning_rate": 1.9033467177994174e-06, + "loss": 0.3987, "step": 121955 }, { - "epoch": 4.29, - "learning_rate": 2.601851544621847e-06, - "loss": 0.2565, + "epoch": 4.395430136591343, + "grad_norm": 0.31043195724487305, + "learning_rate": 1.902230059337043e-06, + "loss": 0.3689, "step": 121960 }, { - "epoch": 4.29, - "learning_rate": 2.6005862781754986e-06, - "loss": 0.242, + "epoch": 4.395610336252568, + "grad_norm": 0.25115767121315, + "learning_rate": 1.9011137155773101e-06, + "loss": 0.3732, "step": 121965 }, { - "epoch": 4.29, - "learning_rate": 2.599321302570776e-06, - "loss": 0.2707, + "epoch": 4.395790535913792, + "grad_norm": 0.23707129061222076, + "learning_rate": 1.8999976865354374e-06, + "loss": 0.37, "step": 121970 }, { - "epoch": 4.29, - "learning_rate": 2.598056617824099e-06, - "loss": 0.2544, + "epoch": 4.395970735575017, + "grad_norm": 0.2299506962299347, + "learning_rate": 1.8988819722266154e-06, + "loss": 0.3943, "step": 121975 }, { - "epoch": 4.29, - "learning_rate": 2.5967922239518966e-06, - "loss": 0.2463, + "epoch": 4.396150935236242, + "grad_norm": 0.2858457565307617, + "learning_rate": 1.8977665726660598e-06, + "loss": 0.3703, "step": 121980 }, { - "epoch": 4.29, - "learning_rate": 2.595528120970586e-06, - "loss": 0.2752, + "epoch": 4.3963311348974665, + "grad_norm": 0.2686900198459625, + "learning_rate": 1.8966514878689612e-06, + "loss": 0.3801, "step": 121985 }, { - "epoch": 4.29, - "learning_rate": 2.5942643088965712e-06, - "loss": 0.2545, + "epoch": 4.396511334558691, + "grad_norm": 0.25534695386886597, + "learning_rate": 1.8955367178505073e-06, + "loss": 
0.3953, "step": 121990 }, { - "epoch": 4.29, - "learning_rate": 2.593000787746269e-06, - "loss": 0.2346, + "epoch": 4.396691534219916, + "grad_norm": 0.23531198501586914, + "learning_rate": 1.8944222626259028e-06, + "loss": 0.3857, "step": 121995 }, { - "epoch": 4.29, - "learning_rate": 2.591737557536089e-06, - "loss": 0.2512, + "epoch": 4.39687173388114, + "grad_norm": 0.23012003302574158, + "learning_rate": 1.8933081222103189e-06, + "loss": 0.4072, "step": 122000 }, { - "epoch": 4.29, - "eval_loss": 0.24890007078647614, - "eval_runtime": 10.5474, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 4.39687173388114, + "eval_loss": 0.42878466844558716, + "eval_runtime": 3.5304, + "eval_samples_per_second": 28.325, + "eval_steps_per_second": 7.081, "step": 122000 }, { - "epoch": 4.29, - "learning_rate": 2.5904746182824292e-06, - "loss": 0.2254, + "epoch": 4.397051933542365, + "grad_norm": 0.2736528515815735, + "learning_rate": 1.8921942966189322e-06, + "loss": 0.4175, "step": 122005 }, { - "epoch": 4.29, - "learning_rate": 2.589211970001687e-06, - "loss": 0.2532, + "epoch": 4.397232133203589, + "grad_norm": 0.31495052576065063, + "learning_rate": 1.8910807858669277e-06, + "loss": 0.326, "step": 122010 }, { - "epoch": 4.29, - "learning_rate": 2.5879496127102556e-06, - "loss": 0.2646, + "epoch": 4.397412332864814, + "grad_norm": 0.27840882539749146, + "learning_rate": 1.889967589969474e-06, + "loss": 0.3971, "step": 122015 }, { - "epoch": 4.29, - "learning_rate": 2.586687546424532e-06, - "loss": 0.248, + "epoch": 4.397592532526039, + "grad_norm": 0.21493427455425262, + "learning_rate": 1.8888547089417368e-06, + "loss": 0.3331, "step": 122020 }, { - "epoch": 4.29, - "learning_rate": 2.585425771160899e-06, - "loss": 0.2387, + "epoch": 4.3977727321872635, + "grad_norm": 0.259475976228714, + "learning_rate": 1.8877421427988818e-06, + "loss": 0.3773, "step": 122025 }, { - "epoch": 4.29, - "learning_rate": 2.584164286935742e-06, - "loss": 0.2512, + "epoch": 4.397952931848488, + "grad_norm": 0.25516223907470703, + "learning_rate": 1.8866298915560604e-06, + "loss": 0.3608, "step": 122030 }, { - "epoch": 4.29, - "learning_rate": 2.582903093765432e-06, - "loss": 0.2743, + "epoch": 4.398133131509713, + "grad_norm": 0.25606194138526917, + "learning_rate": 1.8855179552284357e-06, + "loss": 0.3888, "step": 122035 }, { - "epoch": 4.29, - "learning_rate": 2.5816421916663585e-06, - "loss": 0.2504, + "epoch": 4.398313331170938, + "grad_norm": 0.260857492685318, + "learning_rate": 1.8844063338311542e-06, + "loss": 0.3505, "step": 122040 }, { - "epoch": 4.29, - "learning_rate": 2.5803815806548835e-06, - "loss": 0.2552, + "epoch": 4.398493530832162, + "grad_norm": 0.22685441374778748, + "learning_rate": 1.8832950273793586e-06, + "loss": 0.3711, "step": 122045 }, { - "epoch": 4.29, - "learning_rate": 2.5791212607473804e-06, - "loss": 0.2443, + "epoch": 4.398673730493386, + "grad_norm": 0.24304035305976868, + "learning_rate": 1.882184035888196e-06, + "loss": 0.3749, "step": 122050 }, { - "epoch": 4.29, - "learning_rate": 2.57786123196021e-06, - "loss": 0.2674, + "epoch": 4.398853930154611, + "grad_norm": 0.31736481189727783, + "learning_rate": 1.8810733593727952e-06, + "loss": 0.352, "step": 122055 }, { - "epoch": 4.29, - "learning_rate": 2.5766014943097385e-06, - "loss": 0.2529, + "epoch": 4.399034129815836, + "grad_norm": 0.190985307097435, + "learning_rate": 1.8799629978482975e-06, + "loss": 0.3598, "step": 122060 }, { - "epoch": 4.29, - "learning_rate": 2.575342047812318e-06, - 
"loss": 0.2512, + "epoch": 4.3992143294770605, + "grad_norm": 0.26111340522766113, + "learning_rate": 1.878852951329832e-06, + "loss": 0.3519, "step": 122065 }, { - "epoch": 4.29, - "learning_rate": 2.574082892484303e-06, - "loss": 0.2352, + "epoch": 4.399394529138285, + "grad_norm": 0.20273645222187042, + "learning_rate": 1.877743219832509e-06, + "loss": 0.3546, "step": 122070 }, { - "epoch": 4.29, - "learning_rate": 2.5728240283420383e-06, - "loss": 0.2629, + "epoch": 4.39957472879951, + "grad_norm": 0.21357549726963043, + "learning_rate": 1.8766338033714636e-06, + "loss": 0.3868, "step": 122075 }, { - "epoch": 4.3, - "learning_rate": 2.571565455401878e-06, - "loss": 0.2581, + "epoch": 4.399754928460735, + "grad_norm": 0.2500903010368347, + "learning_rate": 1.8755247019618034e-06, + "loss": 0.3884, "step": 122080 }, { - "epoch": 4.3, - "learning_rate": 2.570307173680159e-06, - "loss": 0.274, + "epoch": 4.399935128121959, + "grad_norm": 0.23067040741443634, + "learning_rate": 1.8744159156186492e-06, + "loss": 0.3731, "step": 122085 }, { - "epoch": 4.3, - "learning_rate": 2.5690491831932147e-06, - "loss": 0.2403, + "epoch": 4.400115327783184, + "grad_norm": 0.23591923713684082, + "learning_rate": 1.8733074443570947e-06, + "loss": 0.3813, "step": 122090 }, { - "epoch": 4.3, - "learning_rate": 2.567791483957388e-06, - "loss": 0.2349, + "epoch": 4.400295527444408, + "grad_norm": 0.251615434885025, + "learning_rate": 1.8721992881922473e-06, + "loss": 0.3614, "step": 122095 }, { - "epoch": 4.3, - "learning_rate": 2.5665340759890024e-06, - "loss": 0.2499, + "epoch": 4.400475727105633, + "grad_norm": 0.2689308524131775, + "learning_rate": 1.8710914471392088e-06, + "loss": 0.4076, "step": 122100 }, { - "epoch": 4.3, - "learning_rate": 2.5652769593043912e-06, - "loss": 0.261, + "epoch": 4.4006559267668575, + "grad_norm": 0.2680433392524719, + "learning_rate": 1.8699839212130727e-06, + "loss": 0.3672, "step": 122105 }, { - "epoch": 4.3, - "learning_rate": 2.5640201339198697e-06, - "loss": 0.2513, + "epoch": 4.400836126428082, + "grad_norm": 0.2669503092765808, + "learning_rate": 1.8688767104289267e-06, + "loss": 0.3589, "step": 122110 }, { - "epoch": 4.3, - "learning_rate": 2.5627635998517666e-06, - "loss": 0.2698, + "epoch": 4.401016326089307, + "grad_norm": 0.23472002148628235, + "learning_rate": 1.8677698148018591e-06, + "loss": 0.3815, "step": 122115 }, { - "epoch": 4.3, - "learning_rate": 2.561507357116391e-06, - "loss": 0.2702, + "epoch": 4.401196525750532, + "grad_norm": 0.2634426951408386, + "learning_rate": 1.8666632343469437e-06, + "loss": 0.3885, "step": 122120 }, { - "epoch": 4.3, - "learning_rate": 2.560251405730055e-06, - "loss": 0.242, + "epoch": 4.401376725411756, + "grad_norm": 0.22323672473430634, + "learning_rate": 1.8655569690792686e-06, + "loss": 0.3725, "step": 122125 }, { - "epoch": 4.3, - "learning_rate": 2.5589957457090586e-06, - "loss": 0.2438, + "epoch": 4.401556925072981, + "grad_norm": 0.1853046715259552, + "learning_rate": 1.8644510190138992e-06, + "loss": 0.3773, "step": 122130 }, { - "epoch": 4.3, - "learning_rate": 2.55774037706972e-06, - "loss": 0.2389, + "epoch": 4.401737124734206, + "grad_norm": 0.260140061378479, + "learning_rate": 1.8633453841659042e-06, + "loss": 0.351, "step": 122135 }, { - "epoch": 4.3, - "learning_rate": 2.5564852998283344e-06, - "loss": 0.2274, + "epoch": 4.401917324395431, + "grad_norm": 0.26455867290496826, + "learning_rate": 1.8622400645503525e-06, + "loss": 0.3379, "step": 122140 }, { - "epoch": 4.3, - "learning_rate": 
2.5552305140011944e-06, - "loss": 0.2233, + "epoch": 4.402097524056654, + "grad_norm": 0.33009883761405945, + "learning_rate": 1.861135060182298e-06, + "loss": 0.3934, "step": 122145 }, { - "epoch": 4.3, - "learning_rate": 2.5539760196045896e-06, - "loss": 0.2624, + "epoch": 4.402277723717879, + "grad_norm": 0.2440415918827057, + "learning_rate": 1.860030371076793e-06, + "loss": 0.3411, "step": 122150 }, { - "epoch": 4.3, - "learning_rate": 2.5527218166548154e-06, - "loss": 0.2347, + "epoch": 4.402457923379104, + "grad_norm": 0.2990182340145111, + "learning_rate": 1.8589259972489003e-06, + "loss": 0.4079, "step": 122155 }, { - "epoch": 4.3, - "learning_rate": 2.551467905168159e-06, - "loss": 0.2507, + "epoch": 4.402638123040329, + "grad_norm": 0.2691243588924408, + "learning_rate": 1.8578219387136608e-06, + "loss": 0.379, "step": 122160 }, { - "epoch": 4.3, - "learning_rate": 2.550214285160896e-06, - "loss": 0.2562, + "epoch": 4.402818322701553, + "grad_norm": 0.23940818011760712, + "learning_rate": 1.8567181954861152e-06, + "loss": 0.3714, "step": 122165 }, { - "epoch": 4.3, - "learning_rate": 2.5489609566493015e-06, - "loss": 0.2428, + "epoch": 4.402998522362778, + "grad_norm": 0.23924623429775238, + "learning_rate": 1.8556147675813068e-06, + "loss": 0.354, "step": 122170 }, { - "epoch": 4.3, - "learning_rate": 2.5477079196496584e-06, - "loss": 0.282, + "epoch": 4.403178722024003, + "grad_norm": 0.2857334017753601, + "learning_rate": 1.85451165501426e-06, + "loss": 0.398, "step": 122175 }, { - "epoch": 4.3, - "learning_rate": 2.5464551741782334e-06, - "loss": 0.2664, + "epoch": 4.403358921685228, + "grad_norm": 0.26370975375175476, + "learning_rate": 1.8534088578000181e-06, + "loss": 0.3762, "step": 122180 }, { - "epoch": 4.3, - "learning_rate": 2.5452027202512883e-06, - "loss": 0.2606, + "epoch": 4.403539121346451, + "grad_norm": 0.24754032492637634, + "learning_rate": 1.8523063759535969e-06, + "loss": 0.3867, "step": 122185 }, { - "epoch": 4.3, - "learning_rate": 2.543950557885083e-06, - "loss": 0.2485, + "epoch": 4.403719321007676, + "grad_norm": 0.24139243364334106, + "learning_rate": 1.851204209490015e-06, + "loss": 0.3443, "step": 122190 }, { - "epoch": 4.3, - "learning_rate": 2.542698687095885e-06, - "loss": 0.2526, + "epoch": 4.403899520668901, + "grad_norm": 0.2864011526107788, + "learning_rate": 1.8501023584242965e-06, + "loss": 0.3501, "step": 122195 }, { - "epoch": 4.3, - "learning_rate": 2.541447107899947e-06, - "loss": 0.2412, + "epoch": 4.404079720330126, + "grad_norm": 0.2613162398338318, + "learning_rate": 1.8490008227714545e-06, + "loss": 0.3541, "step": 122200 }, { - "epoch": 4.3, - "learning_rate": 2.5401958203135097e-06, - "loss": 0.2406, + "epoch": 4.40425991999135, + "grad_norm": 0.2530914843082428, + "learning_rate": 1.8478996025464933e-06, + "loss": 0.3385, "step": 122205 }, { - "epoch": 4.3, - "learning_rate": 2.5389448243528345e-06, - "loss": 0.2505, + "epoch": 4.404440119652575, + "grad_norm": 0.2663853168487549, + "learning_rate": 1.8467986977644153e-06, + "loss": 0.3808, "step": 122210 }, { - "epoch": 4.3, - "learning_rate": 2.5376941200341538e-06, - "loss": 0.261, + "epoch": 4.4046203193138, + "grad_norm": 0.26814600825309753, + "learning_rate": 1.8456981084402192e-06, + "loss": 0.3838, "step": 122215 }, { - "epoch": 4.3, - "learning_rate": 2.5364437073737146e-06, - "loss": 0.2548, + "epoch": 4.404800518975025, + "grad_norm": 0.2669377326965332, + "learning_rate": 1.8445978345889097e-06, + "loss": 0.381, "step": 122220 }, { - "epoch": 4.3, - 
"learning_rate": 2.5351935863877464e-06, - "loss": 0.241, + "epoch": 4.404980718636249, + "grad_norm": 0.2761158049106598, + "learning_rate": 1.8434978762254695e-06, + "loss": 0.3659, "step": 122225 }, { - "epoch": 4.3, - "learning_rate": 2.533943757092491e-06, - "loss": 0.2481, + "epoch": 4.405160918297474, + "grad_norm": 0.24499890208244324, + "learning_rate": 1.8423982333648893e-06, + "loss": 0.3993, "step": 122230 }, { - "epoch": 4.3, - "learning_rate": 2.532694219504167e-06, - "loss": 0.2549, + "epoch": 4.405341117958698, + "grad_norm": 0.24173754453659058, + "learning_rate": 1.8412989060221485e-06, + "loss": 0.3923, "step": 122235 }, { - "epoch": 4.3, - "learning_rate": 2.531444973639005e-06, - "loss": 0.2555, + "epoch": 4.405521317619923, + "grad_norm": 0.2643168568611145, + "learning_rate": 1.8401998942122218e-06, + "loss": 0.3681, "step": 122240 }, { - "epoch": 4.3, - "learning_rate": 2.5301960195132147e-06, - "loss": 0.2465, + "epoch": 4.405701517281147, + "grad_norm": 0.24129536747932434, + "learning_rate": 1.8391011979500939e-06, + "loss": 0.3466, "step": 122245 }, { - "epoch": 4.3, - "learning_rate": 2.52894735714303e-06, - "loss": 0.2605, + "epoch": 4.405881716942372, + "grad_norm": 0.2655704915523529, + "learning_rate": 1.8380028172507308e-06, + "loss": 0.3822, "step": 122250 }, { - "epoch": 4.3, - "learning_rate": 2.5276989865446522e-06, - "loss": 0.2623, + "epoch": 4.406061916603597, + "grad_norm": 0.2480999380350113, + "learning_rate": 1.8369047521290872e-06, + "loss": 0.3705, "step": 122255 }, { - "epoch": 4.3, - "learning_rate": 2.5264509077342875e-06, - "loss": 0.2271, + "epoch": 4.4062421162648215, + "grad_norm": 0.3369860053062439, + "learning_rate": 1.8358070026001345e-06, + "loss": 0.3736, "step": 122260 }, { - "epoch": 4.3, - "learning_rate": 2.5252031207281534e-06, - "loss": 0.2602, + "epoch": 4.406422315926046, + "grad_norm": 0.3019421100616455, + "learning_rate": 1.8347095686788247e-06, + "loss": 0.3587, "step": 122265 }, { - "epoch": 4.3, - "learning_rate": 2.52395562554244e-06, - "loss": 0.261, + "epoch": 4.406602515587271, + "grad_norm": 0.26029887795448303, + "learning_rate": 1.8336124503801179e-06, + "loss": 0.3561, "step": 122270 }, { - "epoch": 4.3, - "learning_rate": 2.5227084221933567e-06, - "loss": 0.2442, + "epoch": 4.406782715248496, + "grad_norm": 0.31131497025489807, + "learning_rate": 1.8325156477189547e-06, + "loss": 0.3614, "step": 122275 }, { - "epoch": 4.3, - "learning_rate": 2.52146151069709e-06, - "loss": 0.2489, + "epoch": 4.40696291490972, + "grad_norm": 0.24049854278564453, + "learning_rate": 1.8314191607102738e-06, + "loss": 0.3702, "step": 122280 }, { - "epoch": 4.3, - "learning_rate": 2.5202148910698276e-06, - "loss": 0.2632, + "epoch": 4.407143114570944, + "grad_norm": 0.23440882563591003, + "learning_rate": 1.8303229893690266e-06, + "loss": 0.3774, "step": 122285 }, { - "epoch": 4.3, - "learning_rate": 2.5189685633277644e-06, - "loss": 0.2268, + "epoch": 4.407323314232169, + "grad_norm": 0.22003592550754547, + "learning_rate": 1.829227133710143e-06, + "loss": 0.3919, "step": 122290 }, { - "epoch": 4.3, - "learning_rate": 2.5177225274870792e-06, - "loss": 0.2557, + "epoch": 4.407503513893394, + "grad_norm": 0.1988513171672821, + "learning_rate": 1.8281315937485527e-06, + "loss": 0.3903, "step": 122295 }, { - "epoch": 4.3, - "learning_rate": 2.516476783563951e-06, - "loss": 0.2423, + "epoch": 4.4076837135546185, + "grad_norm": 0.19561129808425903, + "learning_rate": 1.8270363694991855e-06, + "loss": 0.3364, "step": 122300 }, { - 
"epoch": 4.3, - "learning_rate": 2.5152313315745496e-06, - "loss": 0.2396, + "epoch": 4.407863913215843, + "grad_norm": 0.2583327889442444, + "learning_rate": 1.825941460976957e-06, + "loss": 0.3895, "step": 122305 }, { - "epoch": 4.3, - "learning_rate": 2.5139861715350545e-06, - "loss": 0.2371, + "epoch": 4.408044112877068, + "grad_norm": 0.2395971715450287, + "learning_rate": 1.8248468681967916e-06, + "loss": 0.3592, "step": 122310 }, { - "epoch": 4.3, - "learning_rate": 2.512741303461627e-06, - "loss": 0.2349, + "epoch": 4.408224312538293, + "grad_norm": 0.25403663516044617, + "learning_rate": 1.8237525911736025e-06, + "loss": 0.3655, "step": 122315 }, { - "epoch": 4.3, - "learning_rate": 2.511496727370438e-06, - "loss": 0.2572, + "epoch": 4.4084045121995175, + "grad_norm": 0.27949368953704834, + "learning_rate": 1.8226586299222943e-06, + "loss": 0.3404, "step": 122320 }, { - "epoch": 4.3, - "learning_rate": 2.5102524432776386e-06, - "loss": 0.2773, + "epoch": 4.408584711860741, + "grad_norm": 0.2646028399467468, + "learning_rate": 1.821564984457777e-06, + "loss": 0.3935, "step": 122325 }, { - "epoch": 4.3, - "learning_rate": 2.509008451199396e-06, - "loss": 0.2501, + "epoch": 4.408764911521966, + "grad_norm": 0.32724955677986145, + "learning_rate": 1.8204716547949502e-06, + "loss": 0.3568, "step": 122330 }, { - "epoch": 4.3, - "learning_rate": 2.5077647511518557e-06, - "loss": 0.2553, + "epoch": 4.408945111183191, + "grad_norm": 0.26123833656311035, + "learning_rate": 1.8193786409487046e-06, + "loss": 0.3589, "step": 122335 }, { - "epoch": 4.3, - "learning_rate": 2.5065213431511637e-06, - "loss": 0.238, + "epoch": 4.4091253108444155, + "grad_norm": 0.2541702389717102, + "learning_rate": 1.8182859429339422e-06, + "loss": 0.3606, "step": 122340 }, { - "epoch": 4.3, - "learning_rate": 2.50527822721347e-06, - "loss": 0.2413, + "epoch": 4.40930551050564, + "grad_norm": 0.25994282960891724, + "learning_rate": 1.8171935607655427e-06, + "loss": 0.3641, "step": 122345 }, { - "epoch": 4.3, - "learning_rate": 2.504035403354915e-06, - "loss": 0.2534, + "epoch": 4.409485710166865, + "grad_norm": 0.31410351395606995, + "learning_rate": 1.816101494458397e-06, + "loss": 0.3891, "step": 122350 }, { - "epoch": 4.3, - "learning_rate": 2.502792871591636e-06, - "loss": 0.2371, + "epoch": 4.40966590982809, + "grad_norm": 0.23764289915561676, + "learning_rate": 1.8150097440273767e-06, + "loss": 0.407, "step": 122355 }, { - "epoch": 4.3, - "learning_rate": 2.5015506319397615e-06, - "loss": 0.2468, + "epoch": 4.409846109489314, + "grad_norm": 0.24512486159801483, + "learning_rate": 1.8139183094873558e-06, + "loss": 0.3627, "step": 122360 }, { - "epoch": 4.31, - "learning_rate": 2.5003086844154283e-06, - "loss": 0.2515, + "epoch": 4.410026309150539, + "grad_norm": 0.2727164328098297, + "learning_rate": 1.812827190853217e-06, + "loss": 0.3738, "step": 122365 }, { - "epoch": 4.31, - "learning_rate": 2.49906702903476e-06, - "loss": 0.2508, + "epoch": 4.410206508811763, + "grad_norm": 0.2821141183376312, + "learning_rate": 1.8117363881398174e-06, + "loss": 0.3829, "step": 122370 }, { - "epoch": 4.31, - "learning_rate": 2.4978256658138715e-06, - "loss": 0.2537, + "epoch": 4.410386708472988, + "grad_norm": 0.22460602223873138, + "learning_rate": 1.8106459013620126e-06, + "loss": 0.3681, "step": 122375 }, { - "epoch": 4.31, - "learning_rate": 2.496584594768889e-06, - "loss": 0.2531, + "epoch": 4.4105669081342125, + "grad_norm": 0.24663448333740234, + "learning_rate": 1.8095557305346734e-06, + "loss": 0.3807, "step": 
122380 }, { - "epoch": 4.31, - "learning_rate": 2.4953438159159267e-06, - "loss": 0.244, + "epoch": 4.410747107795437, + "grad_norm": 0.32760992646217346, + "learning_rate": 1.8084658756726463e-06, + "loss": 0.3769, "step": 122385 }, { - "epoch": 4.31, - "learning_rate": 2.494103329271097e-06, - "loss": 0.2435, + "epoch": 4.410927307456662, + "grad_norm": 0.26017165184020996, + "learning_rate": 1.8073763367907836e-06, + "loss": 0.4191, "step": 122390 }, { - "epoch": 4.31, - "learning_rate": 2.492863134850501e-06, - "loss": 0.2725, + "epoch": 4.411107507117887, + "grad_norm": 0.2591916620731354, + "learning_rate": 1.806287113903929e-06, + "loss": 0.3955, "step": 122395 }, { - "epoch": 4.31, - "learning_rate": 2.4916232326702428e-06, - "loss": 0.2411, + "epoch": 4.411287706779111, + "grad_norm": 0.2518169581890106, + "learning_rate": 1.8051982070269147e-06, + "loss": 0.3571, "step": 122400 }, { - "epoch": 4.31, - "learning_rate": 2.4903836227464258e-06, - "loss": 0.2417, + "epoch": 4.411467906440336, + "grad_norm": 0.3138507604598999, + "learning_rate": 1.8041096161745902e-06, + "loss": 0.3739, "step": 122405 }, { - "epoch": 4.31, - "learning_rate": 2.489144305095145e-06, - "loss": 0.2568, + "epoch": 4.411648106101561, + "grad_norm": 0.22780956327915192, + "learning_rate": 1.8030213413617798e-06, + "loss": 0.3652, "step": 122410 }, { - "epoch": 4.31, - "learning_rate": 2.487905279732489e-06, - "loss": 0.2252, + "epoch": 4.411828305762786, + "grad_norm": 0.2590252161026001, + "learning_rate": 1.801933382603313e-06, + "loss": 0.3928, "step": 122415 }, { - "epoch": 4.31, - "learning_rate": 2.4866665466745404e-06, - "loss": 0.2488, + "epoch": 4.4120085054240095, + "grad_norm": 0.2547787129878998, + "learning_rate": 1.8008457399140144e-06, + "loss": 0.3554, "step": 122420 }, { - "epoch": 4.31, - "learning_rate": 2.4854281059373984e-06, - "loss": 0.2501, + "epoch": 4.412188705085234, + "grad_norm": 0.29435890913009644, + "learning_rate": 1.7997584133086943e-06, + "loss": 0.4092, "step": 122425 }, { - "epoch": 4.31, - "learning_rate": 2.484189957537128e-06, - "loss": 0.2605, + "epoch": 4.412368904746459, + "grad_norm": 0.23160386085510254, + "learning_rate": 1.7986714028021794e-06, + "loss": 0.3793, "step": 122430 }, { - "epoch": 4.31, - "learning_rate": 2.4829521014898103e-06, - "loss": 0.2593, + "epoch": 4.412549104407684, + "grad_norm": 0.1971481442451477, + "learning_rate": 1.7975847084092806e-06, + "loss": 0.3858, "step": 122435 }, { - "epoch": 4.31, - "learning_rate": 2.4817145378115274e-06, - "loss": 0.2526, + "epoch": 4.412729304068908, + "grad_norm": 0.29055434465408325, + "learning_rate": 1.7964983301447886e-06, + "loss": 0.4172, "step": 122440 }, { - "epoch": 4.31, - "learning_rate": 2.480477266518341e-06, - "loss": 0.249, + "epoch": 4.412909503730133, + "grad_norm": 0.2608969509601593, + "learning_rate": 1.7954122680235164e-06, + "loss": 0.3736, "step": 122445 }, { - "epoch": 4.31, - "learning_rate": 2.4792402876263165e-06, - "loss": 0.2751, + "epoch": 4.413089703391358, + "grad_norm": 0.2528577148914337, + "learning_rate": 1.794326522060258e-06, + "loss": 0.3502, "step": 122450 }, { - "epoch": 4.31, - "learning_rate": 2.47800360115151e-06, - "loss": 0.2493, + "epoch": 4.413269903052583, + "grad_norm": 0.21361927688121796, + "learning_rate": 1.7932410922698122e-06, + "loss": 0.3278, "step": 122455 }, { - "epoch": 4.31, - "learning_rate": 2.4767672071099897e-06, - "loss": 0.2384, + "epoch": 4.4134501027138064, + "grad_norm": 0.33277052640914917, + "learning_rate": 1.7921559786669679e-06, 
+ "loss": 0.3626, "step": 122460 }, { - "epoch": 4.31, - "learning_rate": 2.475531105517803e-06, - "loss": 0.257, + "epoch": 4.413630302375031, + "grad_norm": 0.21200527250766754, + "learning_rate": 1.7910711812664931e-06, + "loss": 0.3842, "step": 122465 }, { - "epoch": 4.31, - "learning_rate": 2.474295296391002e-06, - "loss": 0.2513, + "epoch": 4.413810502036256, + "grad_norm": 0.2637445628643036, + "learning_rate": 1.7899867000831878e-06, + "loss": 0.3866, "step": 122470 }, { - "epoch": 4.31, - "learning_rate": 2.473059779745626e-06, - "loss": 0.2359, + "epoch": 4.413990701697481, + "grad_norm": 0.3103804588317871, + "learning_rate": 1.78890253513182e-06, + "loss": 0.3754, "step": 122475 }, { - "epoch": 4.31, - "learning_rate": 2.4718245555977266e-06, - "loss": 0.2462, + "epoch": 4.414170901358705, + "grad_norm": 0.25578418374061584, + "learning_rate": 1.7878186864271618e-06, + "loss": 0.3487, "step": 122480 }, { - "epoch": 4.31, - "learning_rate": 2.4705896239633347e-06, - "loss": 0.2384, + "epoch": 4.41435110101993, + "grad_norm": 0.23292893171310425, + "learning_rate": 1.786735153983976e-06, + "loss": 0.3688, "step": 122485 }, { - "epoch": 4.31, - "learning_rate": 2.469354984858496e-06, - "loss": 0.2534, + "epoch": 4.414531300681155, + "grad_norm": 0.23511111736297607, + "learning_rate": 1.7856519378170289e-06, + "loss": 0.3837, "step": 122490 }, { - "epoch": 4.31, - "learning_rate": 2.468120638299226e-06, - "loss": 0.2218, + "epoch": 4.41471150034238, + "grad_norm": 0.3069644272327423, + "learning_rate": 1.7845690379410835e-06, + "loss": 0.3729, "step": 122495 }, { - "epoch": 4.31, - "learning_rate": 2.466886584301567e-06, - "loss": 0.2444, + "epoch": 4.414891700003604, + "grad_norm": 0.26386865973472595, + "learning_rate": 1.783486454370889e-06, + "loss": 0.379, "step": 122500 }, { - "epoch": 4.31, - "eval_loss": 0.2489146739244461, - "eval_runtime": 10.5432, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 4.414891700003604, + "eval_loss": 0.42894604802131653, + "eval_runtime": 3.5255, + "eval_samples_per_second": 28.365, + "eval_steps_per_second": 7.091, "step": 122500 }, { - "epoch": 4.31, - "learning_rate": 2.465652822881537e-06, - "loss": 0.2727, + "epoch": 4.415071899664829, + "grad_norm": 0.2563088834285736, + "learning_rate": 1.7824041871211976e-06, + "loss": 0.3939, "step": 122505 }, { - "epoch": 4.31, - "learning_rate": 2.464419354055153e-06, - "loss": 0.262, + "epoch": 4.415252099326053, + "grad_norm": 0.2134285569190979, + "learning_rate": 1.781322236206756e-06, + "loss": 0.3831, "step": 122510 }, { - "epoch": 4.31, - "learning_rate": 2.463186177838428e-06, - "loss": 0.2561, + "epoch": 4.415432298987278, + "grad_norm": 0.21774479746818542, + "learning_rate": 1.7802406016423022e-06, + "loss": 0.3738, "step": 122515 }, { - "epoch": 4.31, - "learning_rate": 2.461953294247382e-06, - "loss": 0.2547, + "epoch": 4.415612498648502, + "grad_norm": 0.19313018023967743, + "learning_rate": 1.779159283442569e-06, + "loss": 0.34, "step": 122520 }, { - "epoch": 4.31, - "learning_rate": 2.4607207032980195e-06, - "loss": 0.2372, + "epoch": 4.415792698309727, + "grad_norm": 0.24598465859889984, + "learning_rate": 1.7780782816223001e-06, + "loss": 0.3575, "step": 122525 }, { - "epoch": 4.31, - "learning_rate": 2.4594884050063465e-06, - "loss": 0.244, + "epoch": 4.415972897970952, + "grad_norm": 0.2550036311149597, + "learning_rate": 1.77699759619622e-06, + "loss": 0.381, "step": 122530 }, { - "epoch": 4.31, - "learning_rate": 2.4582563993883563e-06, - 
"loss": 0.265, + "epoch": 4.416153097632177, + "grad_norm": 0.3060535192489624, + "learning_rate": 1.7759172271790498e-06, + "loss": 0.4224, "step": 122535 }, { - "epoch": 4.31, - "learning_rate": 2.4570246864600493e-06, - "loss": 0.2328, + "epoch": 4.416333297293401, + "grad_norm": 0.3150867223739624, + "learning_rate": 1.7748371745855114e-06, + "loss": 0.3928, "step": 122540 }, { - "epoch": 4.31, - "learning_rate": 2.4557932662374273e-06, - "loss": 0.2509, + "epoch": 4.416513496954626, + "grad_norm": 0.24567945301532745, + "learning_rate": 1.7737574384303152e-06, + "loss": 0.3674, "step": 122545 }, { - "epoch": 4.31, - "learning_rate": 2.4545621387364658e-06, - "loss": 0.2652, + "epoch": 4.416693696615851, + "grad_norm": 0.19101324677467346, + "learning_rate": 1.7726780187281854e-06, + "loss": 0.3462, "step": 122550 }, { - "epoch": 4.31, - "learning_rate": 2.453331303973164e-06, - "loss": 0.2345, + "epoch": 4.416873896277075, + "grad_norm": 0.3299705982208252, + "learning_rate": 1.771598915493819e-06, + "loss": 0.3893, "step": 122555 }, { - "epoch": 4.31, - "learning_rate": 2.4521007619634965e-06, - "loss": 0.2505, + "epoch": 4.417054095938299, + "grad_norm": 0.29562848806381226, + "learning_rate": 1.7705201287419121e-06, + "loss": 0.3921, "step": 122560 }, { - "epoch": 4.31, - "learning_rate": 2.4508705127234384e-06, - "loss": 0.243, + "epoch": 4.417234295599524, + "grad_norm": 0.2895338535308838, + "learning_rate": 1.769441658487178e-06, + "loss": 0.4102, "step": 122565 }, { - "epoch": 4.31, - "learning_rate": 2.4496405562689617e-06, - "loss": 0.2588, + "epoch": 4.417414495260749, + "grad_norm": 0.2296198308467865, + "learning_rate": 1.7683635047442998e-06, + "loss": 0.3691, "step": 122570 }, { - "epoch": 4.31, - "learning_rate": 2.4484108926160486e-06, - "loss": 0.2478, + "epoch": 4.4175946949219735, + "grad_norm": 0.24252089858055115, + "learning_rate": 1.7672856675279791e-06, + "loss": 0.4019, "step": 122575 }, { - "epoch": 4.31, - "learning_rate": 2.447181521780656e-06, - "loss": 0.2422, + "epoch": 4.417774894583198, + "grad_norm": 0.24175925552845, + "learning_rate": 1.7662081468528879e-06, + "loss": 0.3703, "step": 122580 }, { - "epoch": 4.31, - "learning_rate": 2.4459524437787485e-06, - "loss": 0.249, + "epoch": 4.417955094244423, + "grad_norm": 0.22977383434772491, + "learning_rate": 1.7651309427337087e-06, + "loss": 0.4019, "step": 122585 }, { - "epoch": 4.31, - "learning_rate": 2.44472365862628e-06, - "loss": 0.2487, + "epoch": 4.418135293905648, + "grad_norm": 0.28015124797821045, + "learning_rate": 1.764054055185127e-06, + "loss": 0.3954, "step": 122590 }, { - "epoch": 4.31, - "learning_rate": 2.443495166339213e-06, - "loss": 0.2348, + "epoch": 4.4183154935668725, + "grad_norm": 0.231642484664917, + "learning_rate": 1.7629774842218089e-06, + "loss": 0.3531, "step": 122595 }, { - "epoch": 4.31, - "learning_rate": 2.4422669669334986e-06, - "loss": 0.2635, + "epoch": 4.418495693228096, + "grad_norm": 0.24805359542369843, + "learning_rate": 1.7619012298584259e-06, + "loss": 0.4105, "step": 122600 }, { - "epoch": 4.31, - "learning_rate": 2.4410390604250795e-06, - "loss": 0.2381, + "epoch": 4.418675892889321, + "grad_norm": 0.29648274183273315, + "learning_rate": 1.7608252921096357e-06, + "loss": 0.3828, "step": 122605 }, { - "epoch": 4.31, - "learning_rate": 2.4398114468298985e-06, - "loss": 0.2357, + "epoch": 4.418856092550546, + "grad_norm": 0.28801605105400085, + "learning_rate": 1.7597496709901017e-06, + "loss": 0.3365, "step": 122610 }, { - "epoch": 4.31, - "learning_rate": 
2.4385841261639035e-06, - "loss": 0.2647, + "epoch": 4.4190362922117705, + "grad_norm": 0.25377213954925537, + "learning_rate": 1.7586743665144816e-06, + "loss": 0.3936, "step": 122615 }, { - "epoch": 4.31, - "learning_rate": 2.4373570984430242e-06, - "loss": 0.2329, + "epoch": 4.419216491872995, + "grad_norm": 0.2505911588668823, + "learning_rate": 1.7575993786974221e-06, + "loss": 0.3729, "step": 122620 }, { - "epoch": 4.31, - "learning_rate": 2.436130363683195e-06, - "loss": 0.2431, + "epoch": 4.41939669153422, + "grad_norm": 0.2358839064836502, + "learning_rate": 1.7565247075535724e-06, + "loss": 0.3607, "step": 122625 }, { - "epoch": 4.31, - "learning_rate": 2.434903921900336e-06, - "loss": 0.2418, + "epoch": 4.419576891195445, + "grad_norm": 0.2155429720878601, + "learning_rate": 1.7554503530975735e-06, + "loss": 0.3934, "step": 122630 }, { - "epoch": 4.31, - "learning_rate": 2.4336777731103847e-06, - "loss": 0.2545, + "epoch": 4.4197570908566695, + "grad_norm": 0.21443898975849152, + "learning_rate": 1.7543763153440585e-06, + "loss": 0.3833, "step": 122635 }, { - "epoch": 4.31, - "learning_rate": 2.4324519173292565e-06, - "loss": 0.2493, + "epoch": 4.419937290517894, + "grad_norm": 0.2692784368991852, + "learning_rate": 1.7533025943076708e-06, + "loss": 0.3753, "step": 122640 }, { - "epoch": 4.31, - "learning_rate": 2.4312263545728605e-06, - "loss": 0.2496, + "epoch": 4.420117490179118, + "grad_norm": 0.1909954696893692, + "learning_rate": 1.752229190003038e-06, + "loss": 0.3936, "step": 122645 }, { - "epoch": 4.32, - "learning_rate": 2.4300010848571225e-06, - "loss": 0.2447, + "epoch": 4.420297689840343, + "grad_norm": 0.26652899384498596, + "learning_rate": 1.751156102444776e-06, + "loss": 0.3609, "step": 122650 }, { - "epoch": 4.32, - "learning_rate": 2.4287761081979445e-06, - "loss": 0.2414, + "epoch": 4.4204778895015675, + "grad_norm": 0.220448300242424, + "learning_rate": 1.7500833316475118e-06, + "loss": 0.3454, "step": 122655 }, { - "epoch": 4.32, - "learning_rate": 2.4275514246112353e-06, - "loss": 0.2578, + "epoch": 4.420658089162792, + "grad_norm": 0.2539389431476593, + "learning_rate": 1.7490108776258617e-06, + "loss": 0.3466, "step": 122660 }, { - "epoch": 4.32, - "learning_rate": 2.426327034112891e-06, - "loss": 0.266, + "epoch": 4.420838288824017, + "grad_norm": 0.2393910139799118, + "learning_rate": 1.7479387403944363e-06, + "loss": 0.345, "step": 122665 }, { - "epoch": 4.32, - "learning_rate": 2.425102936718818e-06, - "loss": 0.261, + "epoch": 4.421018488485242, + "grad_norm": 0.27179139852523804, + "learning_rate": 1.7468669199678462e-06, + "loss": 0.3895, "step": 122670 }, { - "epoch": 4.32, - "learning_rate": 2.423879132444906e-06, - "loss": 0.2403, + "epoch": 4.421198688146466, + "grad_norm": 0.2731141149997711, + "learning_rate": 1.7457954163606848e-06, + "loss": 0.3681, "step": 122675 }, { - "epoch": 4.32, - "learning_rate": 2.422655621307046e-06, - "loss": 0.2579, + "epoch": 4.421378887807691, + "grad_norm": 0.23041914403438568, + "learning_rate": 1.744724229587566e-06, + "loss": 0.3366, "step": 122680 }, { - "epoch": 4.32, - "learning_rate": 2.4214324033211167e-06, - "loss": 0.2365, + "epoch": 4.421559087468916, + "grad_norm": 0.2509845793247223, + "learning_rate": 1.743653359663075e-06, + "loss": 0.3587, "step": 122685 }, { - "epoch": 4.32, - "learning_rate": 2.420209478503016e-06, - "loss": 0.2446, + "epoch": 4.421739287130141, + "grad_norm": 0.250013530254364, + "learning_rate": 1.7425828066018056e-06, + "loss": 0.3569, "step": 122690 }, { - "epoch": 
4.32, - "learning_rate": 2.4189868468686124e-06, - "loss": 0.2614, + "epoch": 4.4219194867913645, + "grad_norm": 0.27683958411216736, + "learning_rate": 1.7415125704183411e-06, + "loss": 0.3784, "step": 122695 }, { - "epoch": 4.32, - "learning_rate": 2.4177645084337787e-06, - "loss": 0.2611, + "epoch": 4.422099686452589, + "grad_norm": 0.2434561401605606, + "learning_rate": 1.7404426511272664e-06, + "loss": 0.3768, "step": 122700 }, { - "epoch": 4.32, - "learning_rate": 2.416542463214394e-06, - "loss": 0.2676, + "epoch": 4.422279886113814, + "grad_norm": 0.30197739601135254, + "learning_rate": 1.7393730487431536e-06, + "loss": 0.3838, "step": 122705 }, { - "epoch": 4.32, - "learning_rate": 2.415320711226321e-06, - "loss": 0.251, + "epoch": 4.422460085775039, + "grad_norm": 0.28687089681625366, + "learning_rate": 1.7383037632805826e-06, + "loss": 0.3882, "step": 122710 }, { - "epoch": 4.32, - "learning_rate": 2.414099252485427e-06, - "loss": 0.2726, + "epoch": 4.422640285436263, + "grad_norm": 0.21703818440437317, + "learning_rate": 1.7372347947541194e-06, + "loss": 0.3486, "step": 122715 }, { - "epoch": 4.32, - "learning_rate": 2.4128780870075695e-06, - "loss": 0.2336, + "epoch": 4.422820485097488, + "grad_norm": 0.27013471722602844, + "learning_rate": 1.7361661431783304e-06, + "loss": 0.3774, "step": 122720 }, { - "epoch": 4.32, - "learning_rate": 2.411657214808602e-06, - "loss": 0.2599, + "epoch": 4.423000684758713, + "grad_norm": 0.25899776816368103, + "learning_rate": 1.7350978085677704e-06, + "loss": 0.3916, "step": 122725 }, { - "epoch": 4.32, - "learning_rate": 2.410436635904381e-06, - "loss": 0.2276, + "epoch": 4.423180884419938, + "grad_norm": 0.24092505872249603, + "learning_rate": 1.7340297909369973e-06, + "loss": 0.3897, "step": 122730 }, { - "epoch": 4.32, - "learning_rate": 2.4092163503107556e-06, - "loss": 0.2533, + "epoch": 4.4233610840811615, + "grad_norm": 0.25030139088630676, + "learning_rate": 1.7329620903005661e-06, + "loss": 0.3741, "step": 122735 }, { - "epoch": 4.32, - "learning_rate": 2.4079963580435656e-06, - "loss": 0.2604, + "epoch": 4.423541283742386, + "grad_norm": 0.218690425157547, + "learning_rate": 1.7318947066730263e-06, + "loss": 0.3597, "step": 122740 }, { - "epoch": 4.32, - "learning_rate": 2.406776659118651e-06, - "loss": 0.258, + "epoch": 4.423721483403611, + "grad_norm": 0.2734704613685608, + "learning_rate": 1.7308276400689077e-06, + "loss": 0.3735, "step": 122745 }, { - "epoch": 4.32, - "learning_rate": 2.4055572535518574e-06, - "loss": 0.2369, + "epoch": 4.423901683064836, + "grad_norm": 0.37903356552124023, + "learning_rate": 1.7297608905027602e-06, + "loss": 0.3653, "step": 122750 }, { - "epoch": 4.32, - "learning_rate": 2.4043381413590083e-06, - "loss": 0.2541, + "epoch": 4.42408188272606, + "grad_norm": 0.29031357169151306, + "learning_rate": 1.7286944579891134e-06, + "loss": 0.411, "step": 122755 }, { - "epoch": 4.32, - "learning_rate": 2.403119322555941e-06, - "loss": 0.2411, + "epoch": 4.424262082387285, + "grad_norm": 0.24027031660079956, + "learning_rate": 1.7276283425425088e-06, + "loss": 0.3777, "step": 122760 }, { - "epoch": 4.32, - "learning_rate": 2.401900797158471e-06, - "loss": 0.2334, + "epoch": 4.42444228204851, + "grad_norm": 0.26298728585243225, + "learning_rate": 1.726562544177454e-06, + "loss": 0.3635, "step": 122765 }, { - "epoch": 4.32, - "learning_rate": 2.4006825651824326e-06, - "loss": 0.2497, + "epoch": 4.424622481709735, + "grad_norm": 0.24697257578372955, + "learning_rate": 1.7254970629084765e-06, + "loss": 0.3843, 
"step": 122770 }, { - "epoch": 4.32, - "learning_rate": 2.399464626643638e-06, - "loss": 0.2425, + "epoch": 4.424802681370959, + "grad_norm": 0.26722240447998047, + "learning_rate": 1.7244318987501007e-06, + "loss": 0.3959, "step": 122775 }, { - "epoch": 4.32, - "learning_rate": 2.3982469815578972e-06, - "loss": 0.2526, + "epoch": 4.424982881032184, + "grad_norm": 0.23795725405216217, + "learning_rate": 1.7233670517168344e-06, + "loss": 0.3746, "step": 122780 }, { - "epoch": 4.32, - "learning_rate": 2.3970296299410277e-06, - "loss": 0.2486, + "epoch": 4.425163080693408, + "grad_norm": 0.31174471974372864, + "learning_rate": 1.7223025218231826e-06, + "loss": 0.3546, "step": 122785 }, { - "epoch": 4.32, - "learning_rate": 2.395812571808831e-06, - "loss": 0.2415, + "epoch": 4.425343280354633, + "grad_norm": 0.2947745621204376, + "learning_rate": 1.7212383090836558e-06, + "loss": 0.3637, "step": 122790 }, { - "epoch": 4.32, - "learning_rate": 2.3945958071771134e-06, - "loss": 0.2286, + "epoch": 4.425523480015857, + "grad_norm": 0.24382677674293518, + "learning_rate": 1.7201744135127428e-06, + "loss": 0.3861, "step": 122795 }, { - "epoch": 4.32, - "learning_rate": 2.393379336061666e-06, - "loss": 0.2465, + "epoch": 4.425703679677082, + "grad_norm": 0.2894967794418335, + "learning_rate": 1.7191108351249513e-06, + "loss": 0.3632, "step": 122800 }, { - "epoch": 4.32, - "learning_rate": 2.3921631584782942e-06, - "loss": 0.2524, + "epoch": 4.425883879338307, + "grad_norm": 0.23039253056049347, + "learning_rate": 1.7180475739347695e-06, + "loss": 0.3493, "step": 122805 }, { - "epoch": 4.32, - "learning_rate": 2.390947274442787e-06, - "loss": 0.2628, + "epoch": 4.426064078999532, + "grad_norm": 0.295876681804657, + "learning_rate": 1.7169846299566806e-06, + "loss": 0.3588, "step": 122810 }, { - "epoch": 4.32, - "learning_rate": 2.389731683970922e-06, - "loss": 0.2426, + "epoch": 4.426244278660756, + "grad_norm": 0.23762252926826477, + "learning_rate": 1.715922003205167e-06, + "loss": 0.3371, "step": 122815 }, { - "epoch": 4.32, - "learning_rate": 2.3885163870784907e-06, - "loss": 0.2562, + "epoch": 4.426424478321981, + "grad_norm": 0.22657039761543274, + "learning_rate": 1.7148596936947064e-06, + "loss": 0.3865, "step": 122820 }, { - "epoch": 4.32, - "learning_rate": 2.387301383781279e-06, - "loss": 0.2338, + "epoch": 4.426604677983206, + "grad_norm": 0.22004631161689758, + "learning_rate": 1.713797701439776e-06, + "loss": 0.3896, "step": 122825 }, { - "epoch": 4.32, - "learning_rate": 2.3860866740950562e-06, - "loss": 0.237, + "epoch": 4.42678487764443, + "grad_norm": 0.2557975947856903, + "learning_rate": 1.7127360264548475e-06, + "loss": 0.3903, "step": 122830 }, { - "epoch": 4.32, - "learning_rate": 2.384872258035592e-06, - "loss": 0.2568, + "epoch": 4.426965077305654, + "grad_norm": 0.2727845311164856, + "learning_rate": 1.711674668754376e-06, + "loss": 0.3312, "step": 122835 }, { - "epoch": 4.32, - "learning_rate": 2.383658135618655e-06, - "loss": 0.2528, + "epoch": 4.427145276966879, + "grad_norm": 0.24959102272987366, + "learning_rate": 1.7106136283528336e-06, + "loss": 0.3425, "step": 122840 }, { - "epoch": 4.32, - "learning_rate": 2.3824443068600182e-06, - "loss": 0.2568, + "epoch": 4.427325476628104, + "grad_norm": 0.25791552662849426, + "learning_rate": 1.709552905264669e-06, + "loss": 0.3923, "step": 122845 }, { - "epoch": 4.32, - "learning_rate": 2.3812307717754333e-06, - "loss": 0.2535, + "epoch": 4.427505676289329, + "grad_norm": 0.22956585884094238, + "learning_rate": 
1.7084924995043355e-06, + "loss": 0.3631, "step": 122850 }, { - "epoch": 4.32, - "learning_rate": 2.3800175303806625e-06, - "loss": 0.2579, + "epoch": 4.427685875950553, + "grad_norm": 0.25182798504829407, + "learning_rate": 1.707432411086285e-06, + "loss": 0.3912, "step": 122855 }, { - "epoch": 4.32, - "learning_rate": 2.3788045826914545e-06, - "loss": 0.2386, + "epoch": 4.427866075611778, + "grad_norm": 0.2180362045764923, + "learning_rate": 1.7063726400249531e-06, + "loss": 0.3683, "step": 122860 }, { - "epoch": 4.32, - "learning_rate": 2.3775919287235633e-06, - "loss": 0.2235, + "epoch": 4.428046275273003, + "grad_norm": 0.24969041347503662, + "learning_rate": 1.7053131863347893e-06, + "loss": 0.3792, "step": 122865 }, { - "epoch": 4.32, - "learning_rate": 2.3763795684927286e-06, - "loss": 0.2408, + "epoch": 4.4282264749342275, + "grad_norm": 0.3251616060733795, + "learning_rate": 1.704254050030224e-06, + "loss": 0.3662, "step": 122870 }, { - "epoch": 4.32, - "learning_rate": 2.3751675020146996e-06, - "loss": 0.261, + "epoch": 4.428406674595452, + "grad_norm": 0.2780360281467438, + "learning_rate": 1.7031952311256845e-06, + "loss": 0.4135, "step": 122875 }, { - "epoch": 4.32, - "learning_rate": 2.373955729305205e-06, - "loss": 0.2728, + "epoch": 4.428586874256676, + "grad_norm": 0.23237118124961853, + "learning_rate": 1.7021367296356005e-06, + "loss": 0.3865, "step": 122880 }, { - "epoch": 4.32, - "learning_rate": 2.372744250379988e-06, - "loss": 0.2683, + "epoch": 4.428767073917901, + "grad_norm": 0.21650679409503937, + "learning_rate": 1.7010785455743943e-06, + "loss": 0.3883, "step": 122885 }, { - "epoch": 4.32, - "learning_rate": 2.3715330652547774e-06, - "loss": 0.2324, + "epoch": 4.4289472735791255, + "grad_norm": 0.2289179116487503, + "learning_rate": 1.7000206789564765e-06, + "loss": 0.3903, "step": 122890 }, { - "epoch": 4.32, - "learning_rate": 2.370322173945291e-06, - "loss": 0.2572, + "epoch": 4.42912747324035, + "grad_norm": 0.27513277530670166, + "learning_rate": 1.6989631297962717e-06, + "loss": 0.3564, "step": 122895 }, { - "epoch": 4.32, - "learning_rate": 2.369111576467262e-06, - "loss": 0.2803, + "epoch": 4.429307672901575, + "grad_norm": 0.2146390825510025, + "learning_rate": 1.6979058981081819e-06, + "loss": 0.3701, "step": 122900 }, { - "epoch": 4.32, - "learning_rate": 2.3679012728364063e-06, - "loss": 0.256, + "epoch": 4.4294878725628, + "grad_norm": 0.36598315834999084, + "learning_rate": 1.6968489839066125e-06, + "loss": 0.3704, "step": 122905 }, { - "epoch": 4.32, - "learning_rate": 2.366691263068438e-06, - "loss": 0.2598, + "epoch": 4.4296680722240245, + "grad_norm": 0.2601289749145508, + "learning_rate": 1.6957923872059634e-06, + "loss": 0.3677, "step": 122910 }, { - "epoch": 4.32, - "learning_rate": 2.3654815471790604e-06, - "loss": 0.2567, + "epoch": 4.429848271885249, + "grad_norm": 0.25093069672584534, + "learning_rate": 1.6947361080206282e-06, + "loss": 0.3744, "step": 122915 }, { - "epoch": 4.32, - "learning_rate": 2.3642721251839946e-06, - "loss": 0.2443, + "epoch": 4.430028471546473, + "grad_norm": 0.3232559859752655, + "learning_rate": 1.6936801463650066e-06, + "loss": 0.3804, "step": 122920 }, { - "epoch": 4.32, - "learning_rate": 2.3630629970989332e-06, - "loss": 0.2458, + "epoch": 4.430208671207698, + "grad_norm": 0.26259225606918335, + "learning_rate": 1.6926245022534842e-06, + "loss": 0.3743, "step": 122925 }, { - "epoch": 4.33, - "learning_rate": 2.3618541629395833e-06, - "loss": 0.2581, + "epoch": 4.4303888708689225, + "grad_norm": 
0.25600606203079224, + "learning_rate": 1.6915691757004303e-06, + "loss": 0.3987, "step": 122930 }, { - "epoch": 4.33, - "learning_rate": 2.3606456227216357e-06, - "loss": 0.2582, + "epoch": 4.430569070530147, + "grad_norm": 0.2362159788608551, + "learning_rate": 1.690514166720239e-06, + "loss": 0.3972, "step": 122935 }, { - "epoch": 4.33, - "learning_rate": 2.359437376460788e-06, - "loss": 0.2519, + "epoch": 4.430749270191372, + "grad_norm": 0.21137534081935883, + "learning_rate": 1.6894594753272763e-06, + "loss": 0.3447, "step": 122940 }, { - "epoch": 4.33, - "learning_rate": 2.3582294241727284e-06, - "loss": 0.2416, + "epoch": 4.430929469852597, + "grad_norm": 0.23811127245426178, + "learning_rate": 1.6884051015359226e-06, + "loss": 0.3834, "step": 122945 }, { - "epoch": 4.33, - "learning_rate": 2.3570217658731352e-06, - "loss": 0.254, + "epoch": 4.4311096695138215, + "grad_norm": 0.22823196649551392, + "learning_rate": 1.687351045360533e-06, + "loss": 0.3802, "step": 122950 }, { - "epoch": 4.33, - "learning_rate": 2.3558144015776858e-06, - "loss": 0.2403, + "epoch": 4.431289869175046, + "grad_norm": 0.23941099643707275, + "learning_rate": 1.6862973068154653e-06, + "loss": 0.3415, "step": 122955 }, { - "epoch": 4.33, - "learning_rate": 2.354607331302072e-06, - "loss": 0.2813, + "epoch": 4.431470068836271, + "grad_norm": 0.26783496141433716, + "learning_rate": 1.6852438859150887e-06, + "loss": 0.3696, "step": 122960 }, { - "epoch": 4.33, - "learning_rate": 2.3534005550619547e-06, - "loss": 0.2667, + "epoch": 4.431650268497496, + "grad_norm": 0.31337860226631165, + "learning_rate": 1.6841907826737503e-06, + "loss": 0.3926, "step": 122965 }, { - "epoch": 4.33, - "learning_rate": 2.3521940728730095e-06, - "loss": 0.2722, + "epoch": 4.4318304681587195, + "grad_norm": 0.2646377384662628, + "learning_rate": 1.6831379971057992e-06, + "loss": 0.3866, "step": 122970 }, { - "epoch": 4.33, - "learning_rate": 2.350987884750891e-06, - "loss": 0.263, + "epoch": 4.432010667819944, + "grad_norm": 0.27343621850013733, + "learning_rate": 1.6820855292255772e-06, + "loss": 0.3935, "step": 122975 }, { - "epoch": 4.33, - "learning_rate": 2.3497819907112722e-06, - "loss": 0.2437, + "epoch": 4.432190867481169, + "grad_norm": 0.3007572591304779, + "learning_rate": 1.6810333790474226e-06, + "loss": 0.3962, "step": 122980 }, { - "epoch": 4.33, - "learning_rate": 2.3485763907698076e-06, - "loss": 0.2371, + "epoch": 4.432371067142394, + "grad_norm": 0.24796468019485474, + "learning_rate": 1.6799815465856767e-06, + "loss": 0.365, "step": 122985 }, { - "epoch": 4.33, - "learning_rate": 2.3473710849421542e-06, - "loss": 0.2622, + "epoch": 4.432551266803618, + "grad_norm": 0.2737863063812256, + "learning_rate": 1.6789300318546697e-06, + "loss": 0.4177, "step": 122990 }, { - "epoch": 4.33, - "learning_rate": 2.3461660732439524e-06, - "loss": 0.2443, + "epoch": 4.432731466464843, + "grad_norm": 0.298483282327652, + "learning_rate": 1.6778788348687235e-06, + "loss": 0.3822, "step": 122995 }, { - "epoch": 4.33, - "learning_rate": 2.3449613556908613e-06, - "loss": 0.2414, + "epoch": 4.432911666126068, + "grad_norm": 0.23559249937534332, + "learning_rate": 1.6768279556421628e-06, + "loss": 0.3506, "step": 123000 }, { - "epoch": 4.33, - "eval_loss": 0.24904577434062958, - "eval_runtime": 10.5244, - "eval_samples_per_second": 9.502, - "eval_steps_per_second": 9.502, + "epoch": 4.432911666126068, + "eval_loss": 0.42897456884384155, + "eval_runtime": 3.5635, + "eval_samples_per_second": 28.062, + "eval_steps_per_second": 7.016, 
"step": 123000 }, { - "epoch": 4.33, - "learning_rate": 2.343756932298513e-06, - "loss": 0.2657, + "epoch": 4.433091865787293, + "grad_norm": 0.2772116959095001, + "learning_rate": 1.6757773941893068e-06, + "loss": 0.4011, "step": 123005 }, { - "epoch": 4.33, - "learning_rate": 2.3425528030825538e-06, - "loss": 0.2416, + "epoch": 4.4332720654485165, + "grad_norm": 0.2690448760986328, + "learning_rate": 1.6747271505244634e-06, + "loss": 0.3637, "step": 123010 }, { - "epoch": 4.33, - "learning_rate": 2.341348968058607e-06, - "loss": 0.2707, + "epoch": 4.433452265109741, + "grad_norm": 0.23633429408073425, + "learning_rate": 1.6736772246619543e-06, + "loss": 0.3459, "step": 123015 }, { - "epoch": 4.33, - "learning_rate": 2.3401454272423183e-06, - "loss": 0.2352, + "epoch": 4.433632464770966, + "grad_norm": 0.2238181084394455, + "learning_rate": 1.6726276166160681e-06, + "loss": 0.3827, "step": 123020 }, { - "epoch": 4.33, - "learning_rate": 2.338942180649306e-06, - "loss": 0.2549, + "epoch": 4.433812664432191, + "grad_norm": 0.24402472376823425, + "learning_rate": 1.6715783264011188e-06, + "loss": 0.388, "step": 123025 }, { - "epoch": 4.33, - "learning_rate": 2.337739228295191e-06, - "loss": 0.276, + "epoch": 4.433992864093415, + "grad_norm": 0.2225305289030075, + "learning_rate": 1.6705293540313998e-06, + "loss": 0.3813, "step": 123030 }, { - "epoch": 4.33, - "learning_rate": 2.3365365701955993e-06, - "loss": 0.2473, + "epoch": 4.43417306375464, + "grad_norm": 0.2935526967048645, + "learning_rate": 1.6694806995211976e-06, + "loss": 0.3678, "step": 123035 }, { - "epoch": 4.33, - "learning_rate": 2.3353342063661466e-06, - "loss": 0.2493, + "epoch": 4.434353263415865, + "grad_norm": 0.24194492399692535, + "learning_rate": 1.6684323628848113e-06, + "loss": 0.3804, "step": 123040 }, { - "epoch": 4.33, - "learning_rate": 2.3341321368224422e-06, - "loss": 0.2441, + "epoch": 4.43453346307709, + "grad_norm": 0.22049802541732788, + "learning_rate": 1.6673843441365077e-06, + "loss": 0.3625, "step": 123045 }, { - "epoch": 4.33, - "learning_rate": 2.3329303615800906e-06, - "loss": 0.2545, + "epoch": 4.434713662738314, + "grad_norm": 0.28984788060188293, + "learning_rate": 1.6663366432905809e-06, + "loss": 0.3471, "step": 123050 }, { - "epoch": 4.33, - "learning_rate": 2.331728880654707e-06, - "loss": 0.2505, + "epoch": 4.434893862399539, + "grad_norm": 0.2572941780090332, + "learning_rate": 1.6652892603612968e-06, + "loss": 0.378, "step": 123055 }, { - "epoch": 4.33, - "learning_rate": 2.330527694061882e-06, - "loss": 0.2575, + "epoch": 4.435074062060763, + "grad_norm": 0.2342897206544876, + "learning_rate": 1.6642421953629305e-06, + "loss": 0.3621, "step": 123060 }, { - "epoch": 4.33, - "learning_rate": 2.3293268018172165e-06, - "loss": 0.2356, + "epoch": 4.435254261721988, + "grad_norm": 0.2670467793941498, + "learning_rate": 1.6631954483097483e-06, + "loss": 0.3689, "step": 123065 }, { - "epoch": 4.33, - "learning_rate": 2.3281262039362984e-06, - "loss": 0.2797, + "epoch": 4.435434461383212, + "grad_norm": 0.29698285460472107, + "learning_rate": 1.6621490192160082e-06, + "loss": 0.3722, "step": 123070 }, { - "epoch": 4.33, - "learning_rate": 2.326925900434723e-06, - "loss": 0.254, + "epoch": 4.435614661044437, + "grad_norm": 0.21422305703163147, + "learning_rate": 1.6611029080959655e-06, + "loss": 0.356, "step": 123075 }, { - "epoch": 4.33, - "learning_rate": 2.3257258913280733e-06, - "loss": 0.2655, + "epoch": 4.435794860705662, + "grad_norm": 0.2958966791629791, + "learning_rate": 
1.660057114963881e-06, + "loss": 0.3852, "step": 123080 }, { - "epoch": 4.33, - "learning_rate": 2.3245261766319247e-06, - "loss": 0.2621, + "epoch": 4.435975060366887, + "grad_norm": 0.20923343300819397, + "learning_rate": 1.659011639833999e-06, + "loss": 0.3784, "step": 123085 }, { - "epoch": 4.33, - "learning_rate": 2.32332675636186e-06, - "loss": 0.2525, + "epoch": 4.436155260028111, + "grad_norm": 0.2286258041858673, + "learning_rate": 1.657966482720566e-06, + "loss": 0.3705, "step": 123090 }, { - "epoch": 4.33, - "learning_rate": 2.3221276305334603e-06, - "loss": 0.2358, + "epoch": 4.436335459689336, + "grad_norm": 0.25596699118614197, + "learning_rate": 1.656921643637821e-06, + "loss": 0.3921, "step": 123095 }, { - "epoch": 4.33, - "learning_rate": 2.3209287991622865e-06, - "loss": 0.2428, + "epoch": 4.436515659350561, + "grad_norm": 0.2769662141799927, + "learning_rate": 1.6558771225999942e-06, + "loss": 0.3694, "step": 123100 }, { - "epoch": 4.33, - "learning_rate": 2.3197302622639027e-06, - "loss": 0.2508, + "epoch": 4.436695859011785, + "grad_norm": 0.2575506865978241, + "learning_rate": 1.6548329196213242e-06, + "loss": 0.3762, "step": 123105 }, { - "epoch": 4.33, - "learning_rate": 2.318532019853875e-06, - "loss": 0.2375, + "epoch": 4.436876058673009, + "grad_norm": 0.2907906174659729, + "learning_rate": 1.6537890347160412e-06, + "loss": 0.3744, "step": 123110 }, { - "epoch": 4.33, - "learning_rate": 2.3173340719477632e-06, - "loss": 0.2348, + "epoch": 4.437056258334234, + "grad_norm": 0.2527605891227722, + "learning_rate": 1.652745467898356e-06, + "loss": 0.3641, "step": 123115 }, { - "epoch": 4.33, - "learning_rate": 2.316136418561121e-06, - "loss": 0.242, + "epoch": 4.437236457995459, + "grad_norm": 0.31895288825035095, + "learning_rate": 1.6517022191824965e-06, + "loss": 0.3943, "step": 123120 }, { - "epoch": 4.33, - "learning_rate": 2.3149390597094976e-06, - "loss": 0.2382, + "epoch": 4.437416657656684, + "grad_norm": 0.23418550193309784, + "learning_rate": 1.6506592885826704e-06, + "loss": 0.3661, "step": 123125 }, { - "epoch": 4.33, - "learning_rate": 2.313741995408436e-06, - "loss": 0.2548, + "epoch": 4.437596857317908, + "grad_norm": 0.2372836172580719, + "learning_rate": 1.649616676113097e-06, + "loss": 0.3527, "step": 123130 }, { - "epoch": 4.33, - "learning_rate": 2.3125452256734877e-06, - "loss": 0.2716, + "epoch": 4.437777056979133, + "grad_norm": 0.33892858028411865, + "learning_rate": 1.6485743817879735e-06, + "loss": 0.3835, "step": 123135 }, { - "epoch": 4.33, - "learning_rate": 2.3113487505201843e-06, - "loss": 0.2397, + "epoch": 4.437957256640358, + "grad_norm": 0.32817450165748596, + "learning_rate": 1.6475324056214992e-06, + "loss": 0.3921, "step": 123140 }, { - "epoch": 4.33, - "learning_rate": 2.3101525699640693e-06, - "loss": 0.263, + "epoch": 4.4381374563015825, + "grad_norm": 0.30701467394828796, + "learning_rate": 1.6466990537720912e-06, + "loss": 0.3915, "step": 123145 }, { - "epoch": 4.33, - "learning_rate": 2.308956684020666e-06, - "loss": 0.2503, + "epoch": 4.438317655962807, + "grad_norm": 0.21967414021492004, + "learning_rate": 1.6456576503269705e-06, + "loss": 0.361, "step": 123150 }, { - "epoch": 4.33, - "learning_rate": 2.30776109270551e-06, - "loss": 0.2225, + "epoch": 4.438497855624031, + "grad_norm": 0.23973840475082397, + "learning_rate": 1.6446165650802398e-06, + "loss": 0.3769, "step": 123155 }, { - "epoch": 4.33, - "learning_rate": 2.306565796034124e-06, - "loss": 0.257, + "epoch": 4.438678055285256, + "grad_norm": 
0.26962411403656006, + "learning_rate": 1.6435757980460876e-06, + "loss": 0.35, "step": 123160 }, { - "epoch": 4.33, - "learning_rate": 2.3053707940220188e-06, - "loss": 0.2497, + "epoch": 4.438858254946481, + "grad_norm": 0.21383236348628998, + "learning_rate": 1.642535349238694e-06, + "loss": 0.3269, "step": 123165 }, { - "epoch": 4.33, - "learning_rate": 2.30417608668472e-06, - "loss": 0.2441, + "epoch": 4.439038454607705, + "grad_norm": 0.23623572289943695, + "learning_rate": 1.6414952186722287e-06, + "loss": 0.3909, "step": 123170 }, { - "epoch": 4.33, - "learning_rate": 2.302981674037741e-06, - "loss": 0.2544, + "epoch": 4.43921865426893, + "grad_norm": 0.24700896441936493, + "learning_rate": 1.6404554063608746e-06, + "loss": 0.3601, "step": 123175 }, { - "epoch": 4.33, - "learning_rate": 2.301787556096585e-06, - "loss": 0.2273, + "epoch": 4.439398853930155, + "grad_norm": 0.24526774883270264, + "learning_rate": 1.6394159123187953e-06, + "loss": 0.3701, "step": 123180 }, { - "epoch": 4.33, - "learning_rate": 2.300593732876757e-06, - "loss": 0.2501, + "epoch": 4.4395790535913795, + "grad_norm": 0.20414763689041138, + "learning_rate": 1.6383767365601433e-06, + "loss": 0.3538, "step": 123185 }, { - "epoch": 4.33, - "learning_rate": 2.2994002043937617e-06, - "loss": 0.2558, + "epoch": 4.439759253252604, + "grad_norm": 0.19671322405338287, + "learning_rate": 1.6373378790990906e-06, + "loss": 0.3585, "step": 123190 }, { - "epoch": 4.33, - "learning_rate": 2.2982069706630916e-06, - "loss": 0.2451, + "epoch": 4.439939452913828, + "grad_norm": 0.2665334939956665, + "learning_rate": 1.6362993399497816e-06, + "loss": 0.3915, "step": 123195 }, { - "epoch": 4.33, - "learning_rate": 2.2970140317002455e-06, - "loss": 0.2257, + "epoch": 4.440119652575053, + "grad_norm": 0.21728205680847168, + "learning_rate": 1.6352611191263745e-06, + "loss": 0.3661, "step": 123200 }, { - "epoch": 4.33, - "learning_rate": 2.295821387520708e-06, - "loss": 0.2497, + "epoch": 4.4402998522362775, + "grad_norm": 0.23190873861312866, + "learning_rate": 1.634223216643016e-06, + "loss": 0.3862, "step": 123205 }, { - "epoch": 4.33, - "learning_rate": 2.2946290381399698e-06, - "loss": 0.2344, + "epoch": 4.440480051897502, + "grad_norm": 0.28229013085365295, + "learning_rate": 1.633185632513834e-06, + "loss": 0.364, "step": 123210 }, { - "epoch": 4.34, - "learning_rate": 2.2934369835735132e-06, - "loss": 0.2731, + "epoch": 4.440660251558727, + "grad_norm": 0.2951110303401947, + "learning_rate": 1.632148366752978e-06, + "loss": 0.4201, "step": 123215 }, { - "epoch": 4.34, - "learning_rate": 2.2922452238368115e-06, - "loss": 0.2455, + "epoch": 4.440840451219952, + "grad_norm": 0.2822306156158447, + "learning_rate": 1.6311114193745757e-06, + "loss": 0.3883, "step": 123220 }, { - "epoch": 4.34, - "learning_rate": 2.2910537589453355e-06, - "loss": 0.2615, + "epoch": 4.4410206508811765, + "grad_norm": 0.280408650636673, + "learning_rate": 1.6300747903927576e-06, + "loss": 0.4214, "step": 123225 }, { - "epoch": 4.34, - "learning_rate": 2.2898625889145653e-06, - "loss": 0.2281, + "epoch": 4.441200850542401, + "grad_norm": 0.23964573442935944, + "learning_rate": 1.6290384798216428e-06, + "loss": 0.3636, "step": 123230 }, { - "epoch": 4.34, - "learning_rate": 2.2886717137599626e-06, - "loss": 0.2764, + "epoch": 4.441381050203626, + "grad_norm": 0.2101919949054718, + "learning_rate": 1.6280024876753535e-06, + "loss": 0.3383, "step": 123235 }, { - "epoch": 4.34, - "learning_rate": 2.287481133496991e-06, - "loss": 0.2407, + "epoch": 
4.441561249864851, + "grad_norm": 0.24973724782466888, + "learning_rate": 1.626966813968006e-06, + "loss": 0.3767, "step": 123240 }, { - "epoch": 4.34, - "learning_rate": 2.2862908481411037e-06, - "loss": 0.2603, + "epoch": 4.4417414495260745, + "grad_norm": 0.23754936456680298, + "learning_rate": 1.6259314587137114e-06, + "loss": 0.3969, "step": 123245 }, { - "epoch": 4.34, - "learning_rate": 2.285100857707764e-06, - "loss": 0.2479, + "epoch": 4.441921649187299, + "grad_norm": 0.28546053171157837, + "learning_rate": 1.6248964219265777e-06, + "loss": 0.3638, "step": 123250 }, { - "epoch": 4.34, - "learning_rate": 2.2839111622124184e-06, - "loss": 0.2535, + "epoch": 4.442101848848524, + "grad_norm": 0.2416713684797287, + "learning_rate": 1.623861703620702e-06, + "loss": 0.3669, "step": 123255 }, { - "epoch": 4.34, - "learning_rate": 2.2827217616705145e-06, - "loss": 0.2414, + "epoch": 4.442282048509749, + "grad_norm": 0.26729458570480347, + "learning_rate": 1.6228273038101815e-06, + "loss": 0.3851, "step": 123260 }, { - "epoch": 4.34, - "learning_rate": 2.2815326560975015e-06, - "loss": 0.2627, + "epoch": 4.4424622481709735, + "grad_norm": 0.24362029135227203, + "learning_rate": 1.6217932225091187e-06, + "loss": 0.405, "step": 123265 }, { - "epoch": 4.34, - "learning_rate": 2.280343845508817e-06, - "loss": 0.2377, + "epoch": 4.442642447832198, + "grad_norm": 0.24386849999427795, + "learning_rate": 1.620759459731594e-06, + "loss": 0.4102, "step": 123270 }, { - "epoch": 4.34, - "learning_rate": 2.2791553299198936e-06, - "loss": 0.2502, + "epoch": 4.442822647493423, + "grad_norm": 0.27166756987571716, + "learning_rate": 1.619726015491696e-06, + "loss": 0.3791, "step": 123275 }, { - "epoch": 4.34, - "learning_rate": 2.2779671093461574e-06, - "loss": 0.2412, + "epoch": 4.443002847154648, + "grad_norm": 0.2460469901561737, + "learning_rate": 1.6186928898035082e-06, + "loss": 0.3231, "step": 123280 }, { - "epoch": 4.34, - "learning_rate": 2.2767791838030515e-06, - "loss": 0.2599, + "epoch": 4.443183046815872, + "grad_norm": 0.21668177843093872, + "learning_rate": 1.6176600826810994e-06, + "loss": 0.3385, "step": 123285 }, { - "epoch": 4.34, - "learning_rate": 2.275591553305992e-06, - "loss": 0.2541, + "epoch": 4.443363246477096, + "grad_norm": 0.27097898721694946, + "learning_rate": 1.6166275941385422e-06, + "loss": 0.3826, "step": 123290 }, { - "epoch": 4.34, - "learning_rate": 2.274404217870402e-06, - "loss": 0.2362, + "epoch": 4.443543446138321, + "grad_norm": 0.2606200575828552, + "learning_rate": 1.615595424189914e-06, + "loss": 0.3223, "step": 123295 }, { - "epoch": 4.34, - "learning_rate": 2.2732171775116884e-06, - "loss": 0.2717, + "epoch": 4.443723645799546, + "grad_norm": 0.26379239559173584, + "learning_rate": 1.6145635728492619e-06, + "loss": 0.3753, "step": 123300 }, { - "epoch": 4.34, - "learning_rate": 2.2720304322452786e-06, - "loss": 0.2824, + "epoch": 4.44390384546077, + "grad_norm": 0.2528885006904602, + "learning_rate": 1.6135320401306608e-06, + "loss": 0.3818, "step": 123305 }, { - "epoch": 4.34, - "learning_rate": 2.2708439820865706e-06, - "loss": 0.2555, + "epoch": 4.444084045121995, + "grad_norm": 0.22059816122055054, + "learning_rate": 1.612500826048155e-06, + "loss": 0.3668, "step": 123310 }, { - "epoch": 4.34, - "learning_rate": 2.2696578270509804e-06, - "loss": 0.261, + "epoch": 4.44426424478322, + "grad_norm": 0.285345196723938, + "learning_rate": 1.6114699306157944e-06, + "loss": 0.356, "step": 123315 }, { - "epoch": 4.34, - "learning_rate": 2.2684719671538983e-06, - 
"loss": 0.2478, + "epoch": 4.444444444444445, + "grad_norm": 0.29682835936546326, + "learning_rate": 1.610439353847637e-06, + "loss": 0.3959, "step": 123320 }, { - "epoch": 4.34, - "learning_rate": 2.2672864024107287e-06, - "loss": 0.2405, + "epoch": 4.444624644105669, + "grad_norm": 0.2404378205537796, + "learning_rate": 1.6094090957577079e-06, + "loss": 0.372, "step": 123325 }, { - "epoch": 4.34, - "learning_rate": 2.2661011328368674e-06, - "loss": 0.2556, + "epoch": 4.444804843766894, + "grad_norm": 0.31720393896102905, + "learning_rate": 1.6083791563600514e-06, + "loss": 0.3816, "step": 123330 }, { - "epoch": 4.34, - "learning_rate": 2.2649161584476998e-06, - "loss": 0.2402, + "epoch": 4.444985043428118, + "grad_norm": 0.19714432954788208, + "learning_rate": 1.6073495356687008e-06, + "loss": 0.3454, "step": 123335 }, { - "epoch": 4.34, - "learning_rate": 2.263731479258607e-06, - "loss": 0.2424, + "epoch": 4.445165243089343, + "grad_norm": 0.2751675546169281, + "learning_rate": 1.6063202336976862e-06, + "loss": 0.4002, "step": 123340 }, { - "epoch": 4.34, - "learning_rate": 2.2625470952849836e-06, - "loss": 0.258, + "epoch": 4.445345442750567, + "grad_norm": 0.2857431471347809, + "learning_rate": 1.6052912504610273e-06, + "loss": 0.4002, "step": 123345 }, { - "epoch": 4.34, - "learning_rate": 2.2613630065421997e-06, - "loss": 0.2495, + "epoch": 4.445525642411792, + "grad_norm": 0.2135014832019806, + "learning_rate": 1.6042625859727461e-06, + "loss": 0.3727, "step": 123350 }, { - "epoch": 4.34, - "learning_rate": 2.2601792130456295e-06, - "loss": 0.2442, + "epoch": 4.445705842073017, + "grad_norm": 0.2780386507511139, + "learning_rate": 1.6032342402468536e-06, + "loss": 0.3768, "step": 123355 }, { - "epoch": 4.34, - "learning_rate": 2.2589957148106493e-06, - "loss": 0.2352, + "epoch": 4.445886041734242, + "grad_norm": 0.24021285772323608, + "learning_rate": 1.6022062132973692e-06, + "loss": 0.4057, "step": 123360 }, { - "epoch": 4.34, - "learning_rate": 2.2578125118526166e-06, - "loss": 0.2491, + "epoch": 4.446066241395466, + "grad_norm": 0.2460976094007492, + "learning_rate": 1.6011785051382956e-06, + "loss": 0.3774, "step": 123365 }, { - "epoch": 4.34, - "learning_rate": 2.256629604186908e-06, - "loss": 0.2595, + "epoch": 4.446246441056691, + "grad_norm": 0.25038421154022217, + "learning_rate": 1.6001511157836352e-06, + "loss": 0.3843, "step": 123370 }, { - "epoch": 4.34, - "learning_rate": 2.255446991828869e-06, - "loss": 0.236, + "epoch": 4.446426640717916, + "grad_norm": 0.3057589828968048, + "learning_rate": 1.5991240452473854e-06, + "loss": 0.3501, "step": 123375 }, { - "epoch": 4.34, - "learning_rate": 2.2542646747938682e-06, - "loss": 0.2478, + "epoch": 4.44660684037914, + "grad_norm": 0.2682446241378784, + "learning_rate": 1.5980972935435351e-06, + "loss": 0.3708, "step": 123380 }, { - "epoch": 4.34, - "learning_rate": 2.2530826530972493e-06, - "loss": 0.244, + "epoch": 4.446787040040364, + "grad_norm": 0.2691839337348938, + "learning_rate": 1.5970708606860813e-06, + "loss": 0.3683, "step": 123385 }, { - "epoch": 4.34, - "learning_rate": 2.2519009267543605e-06, - "loss": 0.2586, + "epoch": 4.446967239701589, + "grad_norm": 0.26984599232673645, + "learning_rate": 1.5960447466890128e-06, + "loss": 0.4096, "step": 123390 }, { - "epoch": 4.34, - "learning_rate": 2.2507194957805455e-06, - "loss": 0.2511, + "epoch": 4.447147439362814, + "grad_norm": 0.2700972557067871, + "learning_rate": 1.5950189515662934e-06, + "loss": 0.3566, "step": 123395 }, { - "epoch": 4.34, - "learning_rate": 
2.2495383601911474e-06, - "loss": 0.2539, + "epoch": 4.447327639024039, + "grad_norm": 0.2507593333721161, + "learning_rate": 1.5939934753319146e-06, + "loss": 0.4151, "step": 123400 }, { - "epoch": 4.34, - "learning_rate": 2.248357520001501e-06, - "loss": 0.2443, + "epoch": 4.447507838685263, + "grad_norm": 0.2393806129693985, + "learning_rate": 1.5929683179998434e-06, + "loss": 0.367, "step": 123405 }, { - "epoch": 4.34, - "learning_rate": 2.247176975226939e-06, - "loss": 0.2526, + "epoch": 4.447688038346488, + "grad_norm": 0.2460200935602188, + "learning_rate": 1.591943479584046e-06, + "loss": 0.4182, "step": 123410 }, { - "epoch": 4.34, - "learning_rate": 2.245996725882782e-06, - "loss": 0.2532, + "epoch": 4.447868238007713, + "grad_norm": 0.24296921491622925, + "learning_rate": 1.5909189600984865e-06, + "loss": 0.3385, "step": 123415 }, { - "epoch": 4.34, - "learning_rate": 2.244816771984365e-06, - "loss": 0.2536, + "epoch": 4.4480484376689375, + "grad_norm": 0.24117806553840637, + "learning_rate": 1.58989475955712e-06, + "loss": 0.3475, "step": 123420 }, { - "epoch": 4.34, - "learning_rate": 2.243637113547009e-06, - "loss": 0.2413, + "epoch": 4.448228637330162, + "grad_norm": 0.21672047674655914, + "learning_rate": 1.588870877973911e-06, + "loss": 0.3675, "step": 123425 }, { - "epoch": 4.34, - "learning_rate": 2.24245775058603e-06, - "loss": 0.2432, + "epoch": 4.448408836991386, + "grad_norm": 0.29379647970199585, + "learning_rate": 1.5878473153628004e-06, + "loss": 0.3689, "step": 123430 }, { - "epoch": 4.34, - "learning_rate": 2.241278683116732e-06, - "loss": 0.2602, + "epoch": 4.448589036652611, + "grad_norm": 0.2538691461086273, + "learning_rate": 1.5868240717377413e-06, + "loss": 0.387, "step": 123435 }, { - "epoch": 4.34, - "learning_rate": 2.2400999111544392e-06, - "loss": 0.2423, + "epoch": 4.448769236313836, + "grad_norm": 0.22580280900001526, + "learning_rate": 1.58580114711267e-06, + "loss": 0.3792, "step": 123440 }, { - "epoch": 4.34, - "learning_rate": 2.238921434714447e-06, - "loss": 0.2605, + "epoch": 4.44894943597506, + "grad_norm": 0.24915678799152374, + "learning_rate": 1.584778541501522e-06, + "loss": 0.3549, "step": 123445 }, { - "epoch": 4.34, - "learning_rate": 2.237743253812061e-06, - "loss": 0.2329, + "epoch": 4.449129635636285, + "grad_norm": 0.28229835629463196, + "learning_rate": 1.583756254918231e-06, + "loss": 0.3472, "step": 123450 }, { - "epoch": 4.34, - "learning_rate": 2.2365653684625732e-06, - "loss": 0.2502, + "epoch": 4.44930983529751, + "grad_norm": 0.2428547888994217, + "learning_rate": 1.5827342873767332e-06, + "loss": 0.3503, "step": 123455 }, { - "epoch": 4.34, - "learning_rate": 2.2353877786812887e-06, - "loss": 0.2332, + "epoch": 4.4494900349587345, + "grad_norm": 0.3034095764160156, + "learning_rate": 1.5817126388909448e-06, + "loss": 0.3411, "step": 123460 }, { - "epoch": 4.34, - "learning_rate": 2.23421048448349e-06, - "loss": 0.2649, + "epoch": 4.449670234619959, + "grad_norm": 0.3237626254558563, + "learning_rate": 1.5806913094747883e-06, + "loss": 0.3616, "step": 123465 }, { - "epoch": 4.34, - "learning_rate": 2.2330334858844615e-06, - "loss": 0.2413, + "epoch": 4.449850434281183, + "grad_norm": 0.26890459656715393, + "learning_rate": 1.5796702991421802e-06, + "loss": 0.407, "step": 123470 }, { - "epoch": 4.34, - "learning_rate": 2.2318567828994858e-06, - "loss": 0.2451, + "epoch": 4.450030633942408, + "grad_norm": 0.2624853849411011, + "learning_rate": 1.5786496079070262e-06, + "loss": 0.3613, "step": 123475 }, { - "epoch": 4.34, - 
"learning_rate": 2.2306803755438533e-06, - "loss": 0.2572, + "epoch": 4.450210833603633, + "grad_norm": 0.20167163014411926, + "learning_rate": 1.5776292357832374e-06, + "loss": 0.3715, "step": 123480 }, { - "epoch": 4.34, - "learning_rate": 2.2295042638328294e-06, - "loss": 0.2657, + "epoch": 4.450391033264857, + "grad_norm": 0.33518025279045105, + "learning_rate": 1.576609182784719e-06, + "loss": 0.3528, "step": 123485 }, { - "epoch": 4.34, - "learning_rate": 2.22832844778168e-06, - "loss": 0.2541, + "epoch": 4.450571232926082, + "grad_norm": 0.2814764082431793, + "learning_rate": 1.575589448925363e-06, + "loss": 0.3977, "step": 123490 }, { - "epoch": 4.34, - "learning_rate": 2.2271529274056873e-06, - "loss": 0.2277, + "epoch": 4.450751432587307, + "grad_norm": 0.2996358871459961, + "learning_rate": 1.5745700342190667e-06, + "loss": 0.3737, "step": 123495 }, { - "epoch": 4.35, - "learning_rate": 2.2259777027201027e-06, - "loss": 0.2265, + "epoch": 4.4509316322485315, + "grad_norm": 0.2564932107925415, + "learning_rate": 1.5735509386797132e-06, + "loss": 0.3465, "step": 123500 }, { - "epoch": 4.35, - "eval_loss": 0.24895407259464264, - "eval_runtime": 10.5376, - "eval_samples_per_second": 9.49, - "eval_steps_per_second": 9.49, + "epoch": 4.4509316322485315, + "eval_loss": 0.4289168119430542, + "eval_runtime": 3.5469, + "eval_samples_per_second": 28.194, + "eval_steps_per_second": 7.048, "step": 123500 }, { - "epoch": 4.35, - "learning_rate": 2.2248027737401896e-06, - "loss": 0.2476, + "epoch": 4.451111831909756, + "grad_norm": 0.24683259427547455, + "learning_rate": 1.5725321623212025e-06, + "loss": 0.3773, "step": 123505 }, { - "epoch": 4.35, - "learning_rate": 2.2236281404811988e-06, - "loss": 0.256, + "epoch": 4.451292031570981, + "grad_norm": 0.22746077179908752, + "learning_rate": 1.5715137051574013e-06, + "loss": 0.3871, "step": 123510 }, { - "epoch": 4.35, - "learning_rate": 2.222453802958391e-06, - "loss": 0.2702, + "epoch": 4.451472231232206, + "grad_norm": 0.304383784532547, + "learning_rate": 1.5704955672021848e-06, + "loss": 0.3756, "step": 123515 }, { - "epoch": 4.35, - "learning_rate": 2.221279761187009e-06, - "loss": 0.2348, + "epoch": 4.4516524308934295, + "grad_norm": 0.2046549767255783, + "learning_rate": 1.5694777484694334e-06, + "loss": 0.3864, "step": 123520 }, { - "epoch": 4.35, - "learning_rate": 2.220106015182294e-06, - "loss": 0.2549, + "epoch": 4.451832630554654, + "grad_norm": 0.2234576940536499, + "learning_rate": 1.5684602489730137e-06, + "loss": 0.3441, "step": 123525 }, { - "epoch": 4.35, - "learning_rate": 2.218932564959489e-06, - "loss": 0.2425, + "epoch": 4.452012830215879, + "grad_norm": 0.2881526052951813, + "learning_rate": 1.5674430687267843e-06, + "loss": 0.3734, "step": 123530 }, { - "epoch": 4.35, - "learning_rate": 2.2177594105338346e-06, - "loss": 0.2566, + "epoch": 4.452193029877104, + "grad_norm": 0.22432564198970795, + "learning_rate": 1.5664262077446091e-06, + "loss": 0.3581, "step": 123535 }, { - "epoch": 4.35, - "learning_rate": 2.216586551920563e-06, - "loss": 0.2586, + "epoch": 4.4523732295383285, + "grad_norm": 0.2805180549621582, + "learning_rate": 1.565409666040335e-06, + "loss": 0.3709, "step": 123540 }, { - "epoch": 4.35, - "learning_rate": 2.21564847802509e-06, - "loss": 0.2426, + "epoch": 4.452553429199553, + "grad_norm": 0.22590774297714233, + "learning_rate": 1.5643934436278207e-06, + "loss": 0.3828, "step": 123545 }, { - "epoch": 4.35, - "learning_rate": 2.2144761519124757e-06, - "loss": 0.2442, + "epoch": 4.452733628860778, + 
"grad_norm": 0.3044675588607788, + "learning_rate": 1.5633775405209079e-06, + "loss": 0.3755, "step": 123550 }, { - "epoch": 4.35, - "learning_rate": 2.2133041216548717e-06, - "loss": 0.2617, + "epoch": 4.452913828522003, + "grad_norm": 0.22538445889949799, + "learning_rate": 1.562361956733438e-06, + "loss": 0.3691, "step": 123555 }, { - "epoch": 4.35, - "learning_rate": 2.2121323872674932e-06, - "loss": 0.2533, + "epoch": 4.453094028183227, + "grad_norm": 0.21017488837242126, + "learning_rate": 1.5613466922792502e-06, + "loss": 0.3595, "step": 123560 }, { - "epoch": 4.35, - "learning_rate": 2.210960948765556e-06, - "loss": 0.2569, + "epoch": 4.453274227844451, + "grad_norm": 0.24644504487514496, + "learning_rate": 1.5603317471721723e-06, + "loss": 0.3807, "step": 123565 }, { - "epoch": 4.35, - "learning_rate": 2.209789806164278e-06, - "loss": 0.2602, + "epoch": 4.453454427505676, + "grad_norm": 0.289897084236145, + "learning_rate": 1.5593171214260377e-06, + "loss": 0.3825, "step": 123570 }, { - "epoch": 4.35, - "learning_rate": 2.208618959478859e-06, - "loss": 0.2582, + "epoch": 4.453634627166901, + "grad_norm": 0.26970839500427246, + "learning_rate": 1.558302815054677e-06, + "loss": 0.371, "step": 123575 }, { - "epoch": 4.35, - "learning_rate": 2.2074484087244996e-06, - "loss": 0.2369, + "epoch": 4.4538148268281255, + "grad_norm": 0.2419775426387787, + "learning_rate": 1.55728882807189e-06, + "loss": 0.3658, "step": 123580 }, { - "epoch": 4.35, - "learning_rate": 2.206278153916405e-06, - "loss": 0.2599, + "epoch": 4.45399502648935, + "grad_norm": 0.25539085268974304, + "learning_rate": 1.5562751604915105e-06, + "loss": 0.3781, "step": 123585 }, { - "epoch": 4.35, - "learning_rate": 2.205108195069769e-06, - "loss": 0.2393, + "epoch": 4.454175226150575, + "grad_norm": 0.2259790003299713, + "learning_rate": 1.5552618123273438e-06, + "loss": 0.3886, "step": 123590 }, { - "epoch": 4.35, - "learning_rate": 2.2039385321997784e-06, - "loss": 0.2642, + "epoch": 4.4543554258118, + "grad_norm": 0.2829951047897339, + "learning_rate": 1.554248783593193e-06, + "loss": 0.3758, "step": 123595 }, { - "epoch": 4.35, - "learning_rate": 2.2027691653216194e-06, - "loss": 0.2589, + "epoch": 4.454535625473024, + "grad_norm": 0.2783590257167816, + "learning_rate": 1.5532360743028662e-06, + "loss": 0.386, "step": 123600 }, { - "epoch": 4.35, - "learning_rate": 2.2016000944504842e-06, - "loss": 0.268, + "epoch": 4.454715825134249, + "grad_norm": 0.27192917466163635, + "learning_rate": 1.5522236844701527e-06, + "loss": 0.3568, "step": 123605 }, { - "epoch": 4.35, - "learning_rate": 2.2004313196015454e-06, - "loss": 0.2664, + "epoch": 4.454896024795473, + "grad_norm": 0.21945591270923615, + "learning_rate": 1.551211614108855e-06, + "loss": 0.3553, "step": 123610 }, { - "epoch": 4.35, - "learning_rate": 2.1992628407899757e-06, - "loss": 0.2459, + "epoch": 4.455076224456698, + "grad_norm": 0.2255585640668869, + "learning_rate": 1.5501998632327624e-06, + "loss": 0.3524, "step": 123615 }, { - "epoch": 4.35, - "learning_rate": 2.1980946580309524e-06, - "loss": 0.2592, + "epoch": 4.455256424117922, + "grad_norm": 0.22020921111106873, + "learning_rate": 1.5491884318556526e-06, + "loss": 0.3737, "step": 123620 }, { - "epoch": 4.35, - "learning_rate": 2.1969267713396495e-06, - "loss": 0.256, + "epoch": 4.455436623779147, + "grad_norm": 0.3251590132713318, + "learning_rate": 1.548177319991312e-06, + "loss": 0.3473, "step": 123625 }, { - "epoch": 4.35, - "learning_rate": 2.1957591807312215e-06, - "loss": 0.2237, + "epoch": 
4.455616823440372, + "grad_norm": 0.3003102242946625, + "learning_rate": 1.5471665276535151e-06, + "loss": 0.3673, "step": 123630 }, { - "epoch": 4.35, - "learning_rate": 2.1945918862208343e-06, - "loss": 0.2536, + "epoch": 4.455797023101597, + "grad_norm": 0.22849665582180023, + "learning_rate": 1.5461560548560322e-06, + "loss": 0.3556, "step": 123635 }, { - "epoch": 4.35, - "learning_rate": 2.1934248878236364e-06, - "loss": 0.2541, + "epoch": 4.455977222762821, + "grad_norm": 0.23016008734703064, + "learning_rate": 1.5451459016126325e-06, + "loss": 0.4272, "step": 123640 }, { - "epoch": 4.35, - "learning_rate": 2.192258185554791e-06, - "loss": 0.2483, + "epoch": 4.456157422424046, + "grad_norm": 0.25868502259254456, + "learning_rate": 1.5441360679370798e-06, + "loss": 0.3825, "step": 123645 }, { - "epoch": 4.35, - "learning_rate": 2.191091779429441e-06, - "loss": 0.2494, + "epoch": 4.456337622085271, + "grad_norm": 0.2868441641330719, + "learning_rate": 1.5431265538431327e-06, + "loss": 0.3849, "step": 123650 }, { - "epoch": 4.35, - "learning_rate": 2.189925669462736e-06, - "loss": 0.2473, + "epoch": 4.456517821746495, + "grad_norm": 0.2647748589515686, + "learning_rate": 1.5421173593445442e-06, + "loss": 0.3783, "step": 123655 }, { - "epoch": 4.35, - "learning_rate": 2.188759855669806e-06, - "loss": 0.251, + "epoch": 4.456698021407719, + "grad_norm": 0.26466822624206543, + "learning_rate": 1.5411084844550617e-06, + "loss": 0.3664, "step": 123660 }, { - "epoch": 4.35, - "learning_rate": 2.1875943380658014e-06, - "loss": 0.2533, + "epoch": 4.456878221068944, + "grad_norm": 0.2780085504055023, + "learning_rate": 1.5400999291884377e-06, + "loss": 0.3951, "step": 123665 }, { - "epoch": 4.35, - "learning_rate": 2.186429116665847e-06, - "loss": 0.2504, + "epoch": 4.457058420730169, + "grad_norm": 0.26032882928848267, + "learning_rate": 1.5390916935584116e-06, + "loss": 0.4208, "step": 123670 }, { - "epoch": 4.35, - "learning_rate": 2.1852641914850757e-06, - "loss": 0.2285, + "epoch": 4.457238620391394, + "grad_norm": 0.23003022372722626, + "learning_rate": 1.5380837775787194e-06, + "loss": 0.3772, "step": 123675 }, { - "epoch": 4.35, - "learning_rate": 2.1840995625386163e-06, - "loss": 0.26, + "epoch": 4.457418820052618, + "grad_norm": 0.2510555684566498, + "learning_rate": 1.5370761812630918e-06, + "loss": 0.3706, "step": 123680 }, { - "epoch": 4.35, - "learning_rate": 2.1829352298415875e-06, - "loss": 0.2388, + "epoch": 4.457599019713843, + "grad_norm": 0.2845185697078705, + "learning_rate": 1.5360689046252542e-06, + "loss": 0.3997, "step": 123685 }, { - "epoch": 4.35, - "learning_rate": 2.18177119340911e-06, - "loss": 0.2301, + "epoch": 4.457779219375068, + "grad_norm": 0.2645367980003357, + "learning_rate": 1.5350619476789425e-06, + "loss": 0.3494, "step": 123690 }, { - "epoch": 4.35, - "learning_rate": 2.180607453256289e-06, - "loss": 0.2317, + "epoch": 4.457959419036293, + "grad_norm": 0.32513341307640076, + "learning_rate": 1.5340553104378652e-06, + "loss": 0.3991, "step": 123695 }, { - "epoch": 4.35, - "learning_rate": 2.1794440093982488e-06, - "loss": 0.2423, + "epoch": 4.458139618697517, + "grad_norm": 0.2185758650302887, + "learning_rate": 1.5330489929157394e-06, + "loss": 0.3575, "step": 123700 }, { - "epoch": 4.35, - "learning_rate": 2.1782808618500877e-06, - "loss": 0.2753, + "epoch": 4.458319818358741, + "grad_norm": 0.24072350561618805, + "learning_rate": 1.5320429951262788e-06, + "loss": 0.3673, "step": 123705 }, { - "epoch": 4.35, - "learning_rate": 2.177118010626908e-06, - 
"loss": 0.2499, + "epoch": 4.458500018019966, + "grad_norm": 0.2706233263015747, + "learning_rate": 1.5310373170831922e-06, + "loss": 0.4193, "step": 123710 }, { - "epoch": 4.35, - "learning_rate": 2.1759554557438083e-06, - "loss": 0.266, + "epoch": 4.458680217681191, + "grad_norm": 0.21717756986618042, + "learning_rate": 1.5300319588001766e-06, + "loss": 0.3553, "step": 123715 }, { - "epoch": 4.35, - "learning_rate": 2.174793197215888e-06, - "loss": 0.2148, + "epoch": 4.458860417342415, + "grad_norm": 0.22631758451461792, + "learning_rate": 1.5290269202909297e-06, + "loss": 0.3442, "step": 123720 }, { - "epoch": 4.35, - "learning_rate": 2.173631235058232e-06, - "loss": 0.2433, + "epoch": 4.45904061700364, + "grad_norm": 0.2610069215297699, + "learning_rate": 1.528022201569146e-06, + "loss": 0.3812, "step": 123725 }, { - "epoch": 4.35, - "learning_rate": 2.172469569285934e-06, - "loss": 0.2612, + "epoch": 4.459220816664865, + "grad_norm": 0.3348616361618042, + "learning_rate": 1.5270178026485172e-06, + "loss": 0.4102, "step": 123730 }, { - "epoch": 4.35, - "learning_rate": 2.171308199914071e-06, - "loss": 0.2477, + "epoch": 4.4594010163260895, + "grad_norm": 0.27878302335739136, + "learning_rate": 1.5260137235427268e-06, + "loss": 0.3641, "step": 123735 }, { - "epoch": 4.35, - "learning_rate": 2.170147126957733e-06, - "loss": 0.2531, + "epoch": 4.459581215987314, + "grad_norm": 0.2503393292427063, + "learning_rate": 1.5250099642654558e-06, + "loss": 0.3192, "step": 123740 }, { - "epoch": 4.35, - "learning_rate": 2.168986350431987e-06, - "loss": 0.2647, + "epoch": 4.459761415648538, + "grad_norm": 0.2925935983657837, + "learning_rate": 1.524006524830379e-06, + "loss": 0.3556, "step": 123745 }, { - "epoch": 4.35, - "learning_rate": 2.167825870351908e-06, - "loss": 0.2689, + "epoch": 4.459941615309763, + "grad_norm": 0.25310271978378296, + "learning_rate": 1.5230034052511637e-06, + "loss": 0.3719, "step": 123750 }, { - "epoch": 4.35, - "learning_rate": 2.166665686732558e-06, - "loss": 0.2332, + "epoch": 4.460121814970988, + "grad_norm": 0.28005677461624146, + "learning_rate": 1.5220006055414848e-06, + "loss": 0.3673, "step": 123755 }, { - "epoch": 4.35, - "learning_rate": 2.1655057995890126e-06, - "loss": 0.244, + "epoch": 4.460302014632212, + "grad_norm": 0.26591956615448, + "learning_rate": 1.520998125715009e-06, + "loss": 0.3491, "step": 123760 }, { - "epoch": 4.35, - "learning_rate": 2.1643462089363243e-06, - "loss": 0.2512, + "epoch": 4.460482214293437, + "grad_norm": 0.24851107597351074, + "learning_rate": 1.5199959657853784e-06, + "loss": 0.3389, "step": 123765 }, { - "epoch": 4.35, - "learning_rate": 2.16318691478955e-06, - "loss": 0.2412, + "epoch": 4.460662413954662, + "grad_norm": 0.24951453506946564, + "learning_rate": 1.5189941257662598e-06, + "loss": 0.3878, "step": 123770 }, { - "epoch": 4.35, - "learning_rate": 2.162027917163742e-06, - "loss": 0.2264, + "epoch": 4.4608426136158865, + "grad_norm": 0.26914355158805847, + "learning_rate": 1.5179926056713006e-06, + "loss": 0.3913, "step": 123775 }, { - "epoch": 4.35, - "learning_rate": 2.1608692160739465e-06, - "loss": 0.2461, + "epoch": 4.461022813277111, + "grad_norm": 0.2922750413417816, + "learning_rate": 1.5169914055141427e-06, + "loss": 0.3501, "step": 123780 }, { - "epoch": 4.36, - "learning_rate": 2.1597108115352204e-06, - "loss": 0.2477, + "epoch": 4.461203012938336, + "grad_norm": 0.22999052703380585, + "learning_rate": 1.5159905253084388e-06, + "loss": 0.3749, "step": 123785 }, { - "epoch": 4.36, - "learning_rate": 
2.1585527035625913e-06, - "loss": 0.2551, + "epoch": 4.461383212599561, + "grad_norm": 0.3160970211029053, + "learning_rate": 1.514989965067809e-06, + "loss": 0.3529, "step": 123790 }, { - "epoch": 4.36, - "learning_rate": 2.157394892171108e-06, - "loss": 0.2482, + "epoch": 4.461563412260785, + "grad_norm": 0.21242515742778778, + "learning_rate": 1.5139897248058977e-06, + "loss": 0.3838, "step": 123795 }, { - "epoch": 4.36, - "learning_rate": 2.156237377375797e-06, - "loss": 0.2461, + "epoch": 4.461743611922009, + "grad_norm": 0.24923621118068695, + "learning_rate": 1.5129898045363273e-06, + "loss": 0.3617, "step": 123800 }, { - "epoch": 4.36, - "learning_rate": 2.1550801591916912e-06, - "loss": 0.2364, + "epoch": 4.461923811583234, + "grad_norm": 0.25680771470069885, + "learning_rate": 1.5119902042727257e-06, + "loss": 0.3805, "step": 123805 }, { - "epoch": 4.36, - "learning_rate": 2.1539232376338065e-06, - "loss": 0.2472, + "epoch": 4.462104011244459, + "grad_norm": 0.29142534732818604, + "learning_rate": 1.51099092402871e-06, + "loss": 0.3772, "step": 123810 }, { - "epoch": 4.36, - "learning_rate": 2.1527666127171802e-06, - "loss": 0.2429, + "epoch": 4.4622842109056835, + "grad_norm": 0.212846577167511, + "learning_rate": 1.5099919638178972e-06, + "loss": 0.3341, "step": 123815 }, { - "epoch": 4.36, - "learning_rate": 2.1516102844568204e-06, - "loss": 0.2437, + "epoch": 4.462464410566908, + "grad_norm": 0.22237548232078552, + "learning_rate": 1.50899332365389e-06, + "loss": 0.354, "step": 123820 }, { - "epoch": 4.36, - "learning_rate": 2.150454252867745e-06, - "loss": 0.2514, + "epoch": 4.462644610228133, + "grad_norm": 0.2658112645149231, + "learning_rate": 1.5079950035503054e-06, + "loss": 0.3755, "step": 123825 }, { - "epoch": 4.36, - "learning_rate": 2.149298517964957e-06, - "loss": 0.2665, + "epoch": 4.462824809889358, + "grad_norm": 0.2840268611907959, + "learning_rate": 1.506997003520741e-06, + "loss": 0.3935, "step": 123830 }, { - "epoch": 4.36, - "learning_rate": 2.1481430797634735e-06, - "loss": 0.2393, + "epoch": 4.463005009550582, + "grad_norm": 0.2549954354763031, + "learning_rate": 1.5059993235787912e-06, + "loss": 0.3839, "step": 123835 }, { - "epoch": 4.36, - "learning_rate": 2.146987938278286e-06, - "loss": 0.2507, + "epoch": 4.463185209211806, + "grad_norm": 0.26900264620780945, + "learning_rate": 1.5050019637380565e-06, + "loss": 0.3575, "step": 123840 }, { - "epoch": 4.36, - "learning_rate": 2.145833093524405e-06, - "loss": 0.2336, + "epoch": 4.463365408873031, + "grad_norm": 0.28676745295524597, + "learning_rate": 1.5040049240121145e-06, + "loss": 0.3925, "step": 123845 }, { - "epoch": 4.36, - "learning_rate": 2.144678545516815e-06, - "loss": 0.2427, + "epoch": 4.463545608534256, + "grad_norm": 0.24285760521888733, + "learning_rate": 1.5030082044145605e-06, + "loss": 0.3623, "step": 123850 }, { - "epoch": 4.36, - "learning_rate": 2.1435242942705187e-06, - "loss": 0.255, + "epoch": 4.4637258081954805, + "grad_norm": 0.2883954346179962, + "learning_rate": 1.5020118049589722e-06, + "loss": 0.3763, "step": 123855 }, { - "epoch": 4.36, - "learning_rate": 2.1423703398004954e-06, - "loss": 0.243, + "epoch": 4.463906007856705, + "grad_norm": 0.31590425968170166, + "learning_rate": 1.5010157256589218e-06, + "loss": 0.3716, "step": 123860 }, { - "epoch": 4.36, - "learning_rate": 2.141216682121727e-06, - "loss": 0.2506, + "epoch": 4.46408620751793, + "grad_norm": 0.21175405383110046, + "learning_rate": 1.5000199665279824e-06, + "loss": 0.3788, "step": 123865 }, { - "epoch": 
4.36, - "learning_rate": 2.1400633212491944e-06, - "loss": 0.2229, + "epoch": 4.464266407179155, + "grad_norm": 0.23191265761852264, + "learning_rate": 1.4990245275797177e-06, + "loss": 0.3739, "step": 123870 }, { - "epoch": 4.36, - "learning_rate": 2.1389102571978764e-06, - "loss": 0.266, + "epoch": 4.464446606840379, + "grad_norm": 0.23299415409564972, + "learning_rate": 1.498029408827703e-06, + "loss": 0.3612, "step": 123875 }, { - "epoch": 4.36, - "learning_rate": 2.137757489982742e-06, - "loss": 0.2635, + "epoch": 4.464626806501604, + "grad_norm": 0.28248894214630127, + "learning_rate": 1.4970346102854831e-06, + "loss": 0.3685, "step": 123880 }, { - "epoch": 4.36, - "learning_rate": 2.1366050196187564e-06, - "loss": 0.2646, + "epoch": 4.464807006162829, + "grad_norm": 0.27142465114593506, + "learning_rate": 1.4960401319666113e-06, + "loss": 0.3475, "step": 123885 }, { - "epoch": 4.36, - "learning_rate": 2.1354528461208944e-06, - "loss": 0.2281, + "epoch": 4.464987205824053, + "grad_norm": 0.2543891966342926, + "learning_rate": 1.4950459738846455e-06, + "loss": 0.3551, "step": 123890 }, { - "epoch": 4.36, - "learning_rate": 2.134300969504102e-06, - "loss": 0.2389, + "epoch": 4.4651674054852775, + "grad_norm": 0.22357742488384247, + "learning_rate": 1.4940521360531257e-06, + "loss": 0.396, "step": 123895 }, { - "epoch": 4.36, - "learning_rate": 2.1331493897833452e-06, - "loss": 0.2276, + "epoch": 4.465347605146502, + "grad_norm": 0.3259972333908081, + "learning_rate": 1.4930586184856016e-06, + "loss": 0.3484, "step": 123900 }, { - "epoch": 4.36, - "learning_rate": 2.1319981069735706e-06, - "loss": 0.246, + "epoch": 4.465527804807727, + "grad_norm": 0.23107150197029114, + "learning_rate": 1.4920654211955987e-06, + "loss": 0.3455, "step": 123905 }, { - "epoch": 4.36, - "learning_rate": 2.1308471210897352e-06, - "loss": 0.2725, + "epoch": 4.465708004468952, + "grad_norm": 0.2234567403793335, + "learning_rate": 1.4910725441966505e-06, + "loss": 0.3741, "step": 123910 }, { - "epoch": 4.36, - "learning_rate": 2.12969643214678e-06, - "loss": 0.2385, + "epoch": 4.465888204130176, + "grad_norm": 0.23561283946037292, + "learning_rate": 1.4900799875022908e-06, + "loss": 0.3616, "step": 123915 }, { - "epoch": 4.36, - "learning_rate": 2.12854604015964e-06, - "loss": 0.2329, + "epoch": 4.466068403791401, + "grad_norm": 0.2155967354774475, + "learning_rate": 1.4890877511260393e-06, + "loss": 0.3914, "step": 123920 }, { - "epoch": 4.36, - "learning_rate": 2.1273959451432567e-06, - "loss": 0.25, + "epoch": 4.466248603452626, + "grad_norm": 0.22571438550949097, + "learning_rate": 1.4880958350814156e-06, + "loss": 0.3525, "step": 123925 }, { - "epoch": 4.36, - "learning_rate": 2.126246147112565e-06, - "loss": 0.2414, + "epoch": 4.46642880311385, + "grad_norm": 0.2629721462726593, + "learning_rate": 1.487104239381934e-06, + "loss": 0.4074, "step": 123930 }, { - "epoch": 4.36, - "learning_rate": 2.1250966460824943e-06, - "loss": 0.255, + "epoch": 4.466609002775074, + "grad_norm": 0.2295321524143219, + "learning_rate": 1.4861129640411004e-06, + "loss": 0.3878, "step": 123935 }, { - "epoch": 4.36, - "learning_rate": 2.123947442067964e-06, - "loss": 0.2632, + "epoch": 4.466789202436299, + "grad_norm": 0.2172783464193344, + "learning_rate": 1.4851220090724288e-06, + "loss": 0.3812, "step": 123940 }, { - "epoch": 4.36, - "learning_rate": 2.122798535083906e-06, - "loss": 0.2888, + "epoch": 4.466969402097524, + "grad_norm": 0.2786194980144501, + "learning_rate": 1.484131374489417e-06, + "loss": 0.3771, "step": 
123945 }, { - "epoch": 4.36, - "learning_rate": 2.121649925145225e-06, - "loss": 0.24, + "epoch": 4.467149601758749, + "grad_norm": 0.25498166680336, + "learning_rate": 1.4831410603055651e-06, + "loss": 0.3795, "step": 123950 }, { - "epoch": 4.36, - "learning_rate": 2.120501612266848e-06, - "loss": 0.2417, + "epoch": 4.467329801419973, + "grad_norm": 0.24183687567710876, + "learning_rate": 1.4821510665343595e-06, + "loss": 0.4133, "step": 123955 }, { - "epoch": 4.36, - "learning_rate": 2.11935359646368e-06, - "loss": 0.2631, + "epoch": 4.467510001081198, + "grad_norm": 0.2495322972536087, + "learning_rate": 1.4811613931892953e-06, + "loss": 0.3759, "step": 123960 }, { - "epoch": 4.36, - "learning_rate": 2.118205877750623e-06, - "loss": 0.2194, + "epoch": 4.467690200742423, + "grad_norm": 0.2298755794763565, + "learning_rate": 1.4801720402838503e-06, + "loss": 0.3509, "step": 123965 }, { - "epoch": 4.36, - "learning_rate": 2.117058456142587e-06, - "loss": 0.2435, + "epoch": 4.467870400403648, + "grad_norm": 0.2513538897037506, + "learning_rate": 1.4791830078315139e-06, + "loss": 0.3569, "step": 123970 }, { - "epoch": 4.36, - "learning_rate": 2.115911331654466e-06, - "loss": 0.2673, + "epoch": 4.468050600064872, + "grad_norm": 0.22999824583530426, + "learning_rate": 1.4781942958457474e-06, + "loss": 0.3452, "step": 123975 }, { - "epoch": 4.36, - "learning_rate": 2.1147645043011533e-06, - "loss": 0.247, + "epoch": 4.468230799726096, + "grad_norm": 0.29147133231163025, + "learning_rate": 1.4772059043400317e-06, + "loss": 0.3672, "step": 123980 }, { - "epoch": 4.36, - "learning_rate": 2.11361797409754e-06, - "loss": 0.2425, + "epoch": 4.468410999387321, + "grad_norm": 0.2764071226119995, + "learning_rate": 1.4762178333278337e-06, + "loss": 0.365, "step": 123985 }, { - "epoch": 4.36, - "learning_rate": 2.1124717410585164e-06, - "loss": 0.2498, + "epoch": 4.468591199048546, + "grad_norm": 0.22821201384067535, + "learning_rate": 1.475230082822615e-06, + "loss": 0.3937, "step": 123990 }, { - "epoch": 4.36, - "learning_rate": 2.111325805198966e-06, - "loss": 0.2905, + "epoch": 4.46877139870977, + "grad_norm": 0.27766650915145874, + "learning_rate": 1.474242652837829e-06, + "loss": 0.3654, "step": 123995 }, { - "epoch": 4.36, - "learning_rate": 2.1101801665337595e-06, - "loss": 0.254, + "epoch": 4.468951598370995, + "grad_norm": 0.21973931789398193, + "learning_rate": 1.4732555433869339e-06, + "loss": 0.3532, "step": 124000 }, { - "epoch": 4.36, - "eval_loss": 0.24889464676380157, - "eval_runtime": 10.5436, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 4.468951598370995, + "eval_loss": 0.42887455224990845, + "eval_runtime": 3.5305, + "eval_samples_per_second": 28.324, + "eval_steps_per_second": 7.081, "step": 124000 }, { - "epoch": 4.36, - "learning_rate": 2.1090348250777797e-06, - "loss": 0.2245, + "epoch": 4.46913179803222, + "grad_norm": 0.18677954375743866, + "learning_rate": 1.4722687544833719e-06, + "loss": 0.3637, "step": 124005 }, { - "epoch": 4.36, - "learning_rate": 2.107889780845904e-06, - "loss": 0.2225, + "epoch": 4.469311997693445, + "grad_norm": 0.22494982182979584, + "learning_rate": 1.4712822861405962e-06, + "loss": 0.3444, "step": 124010 }, { - "epoch": 4.36, - "learning_rate": 2.1067450338529887e-06, - "loss": 0.2623, + "epoch": 4.469492197354669, + "grad_norm": 0.2011212706565857, + "learning_rate": 1.470296138372046e-06, + "loss": 0.3575, "step": 124015 }, { - "epoch": 4.36, - "learning_rate": 2.1056005841139007e-06, - "loss": 0.2521, + "epoch": 
4.469672397015893, + "grad_norm": 0.2682434320449829, + "learning_rate": 1.4693103111911549e-06, + "loss": 0.3655, "step": 124020 }, { - "epoch": 4.36, - "learning_rate": 2.104456431643503e-06, - "loss": 0.2633, + "epoch": 4.469852596677118, + "grad_norm": 0.2674734890460968, + "learning_rate": 1.468324804611354e-06, + "loss": 0.3717, "step": 124025 }, { - "epoch": 4.36, - "learning_rate": 2.103312576456651e-06, - "loss": 0.2417, + "epoch": 4.470032796338343, + "grad_norm": 0.24927879869937897, + "learning_rate": 1.4673396186460687e-06, + "loss": 0.3839, "step": 124030 }, { - "epoch": 4.36, - "learning_rate": 2.1021690185681952e-06, - "loss": 0.2539, + "epoch": 4.470212995999567, + "grad_norm": 0.2811104953289032, + "learning_rate": 1.4663547533087297e-06, + "loss": 0.347, "step": 124035 }, { - "epoch": 4.36, - "learning_rate": 2.1010257579929806e-06, - "loss": 0.2413, + "epoch": 4.470393195660792, + "grad_norm": 0.28087154030799866, + "learning_rate": 1.4653702086127541e-06, + "loss": 0.4091, "step": 124040 }, { - "epoch": 4.36, - "learning_rate": 2.099882794745861e-06, - "loss": 0.2429, + "epoch": 4.470573395322017, + "grad_norm": 0.2959977686405182, + "learning_rate": 1.464385984571545e-06, + "loss": 0.3983, "step": 124045 }, { - "epoch": 4.36, - "learning_rate": 2.0987401288416697e-06, - "loss": 0.2523, + "epoch": 4.4707535949832415, + "grad_norm": 0.3171277344226837, + "learning_rate": 1.4634020811985255e-06, + "loss": 0.36, "step": 124050 }, { - "epoch": 4.36, - "learning_rate": 2.0975977602952418e-06, - "loss": 0.2338, + "epoch": 4.470933794644466, + "grad_norm": 0.2086503803730011, + "learning_rate": 1.46241849850709e-06, + "loss": 0.3771, "step": 124055 }, { - "epoch": 4.36, - "learning_rate": 2.0964556891214127e-06, - "loss": 0.2462, + "epoch": 4.471113994305691, + "grad_norm": 0.23770023882389069, + "learning_rate": 1.46143523651065e-06, + "loss": 0.3546, "step": 124060 }, { - "epoch": 4.36, - "learning_rate": 2.0953139153350175e-06, - "loss": 0.2491, + "epoch": 4.471294193966916, + "grad_norm": 0.319051057100296, + "learning_rate": 1.4604522952226001e-06, + "loss": 0.3848, "step": 124065 }, { - "epoch": 4.37, - "learning_rate": 2.0941724389508783e-06, - "loss": 0.237, + "epoch": 4.47147439362814, + "grad_norm": 0.1936783492565155, + "learning_rate": 1.4594696746563246e-06, + "loss": 0.3537, "step": 124070 }, { - "epoch": 4.37, - "learning_rate": 2.0930312599838132e-06, - "loss": 0.2343, + "epoch": 4.471654593289364, + "grad_norm": 0.25065305829048157, + "learning_rate": 1.4584873748252208e-06, + "loss": 0.3907, "step": 124075 }, { - "epoch": 4.37, - "learning_rate": 2.0918903784486355e-06, - "loss": 0.2433, + "epoch": 4.471834792950589, + "grad_norm": 0.23407846689224243, + "learning_rate": 1.4575053957426643e-06, + "loss": 0.3819, "step": 124080 }, { - "epoch": 4.37, - "learning_rate": 2.0907497943601696e-06, - "loss": 0.2395, + "epoch": 4.472014992611814, + "grad_norm": 0.2705335319042206, + "learning_rate": 1.456523737422047e-06, + "loss": 0.3762, "step": 124085 }, { - "epoch": 4.37, - "learning_rate": 2.0896095077332202e-06, - "loss": 0.2481, + "epoch": 4.4721951922730385, + "grad_norm": 0.2572081685066223, + "learning_rate": 1.4555423998767304e-06, + "loss": 0.3961, "step": 124090 }, { - "epoch": 4.37, - "learning_rate": 2.088469518582592e-06, - "loss": 0.2496, + "epoch": 4.472375391934263, + "grad_norm": 0.2333773970603943, + "learning_rate": 1.45456138312009e-06, + "loss": 0.3601, "step": 124095 }, { - "epoch": 4.37, - "learning_rate": 2.0873298269230816e-06, - "loss": 
0.2363, + "epoch": 4.472555591595488, + "grad_norm": 0.2367962896823883, + "learning_rate": 1.4535806871654927e-06, + "loss": 0.3246, "step": 124100 }, { - "epoch": 4.37, - "learning_rate": 2.0861904327695e-06, - "loss": 0.2617, + "epoch": 4.472735791256713, + "grad_norm": 0.23657187819480896, + "learning_rate": 1.4526003120263005e-06, + "loss": 0.387, "step": 124105 }, { - "epoch": 4.37, - "learning_rate": 2.0850513361366313e-06, - "loss": 0.2712, + "epoch": 4.4729159909179375, + "grad_norm": 0.2925667464733124, + "learning_rate": 1.451620257715869e-06, + "loss": 0.3759, "step": 124110 }, { - "epoch": 4.37, - "learning_rate": 2.0839125370392732e-06, - "loss": 0.2404, + "epoch": 4.473096190579161, + "grad_norm": 0.30918559432029724, + "learning_rate": 1.450640524247554e-06, + "loss": 0.3738, "step": 124115 }, { - "epoch": 4.37, - "learning_rate": 2.082774035492202e-06, - "loss": 0.2523, + "epoch": 4.473276390240386, + "grad_norm": 0.32785508036613464, + "learning_rate": 1.449661111634698e-06, + "loss": 0.3538, "step": 124120 }, { - "epoch": 4.37, - "learning_rate": 2.0816358315102147e-06, - "loss": 0.2495, + "epoch": 4.473456589901611, + "grad_norm": 0.25581833720207214, + "learning_rate": 1.4486820198906542e-06, + "loss": 0.3863, "step": 124125 }, { - "epoch": 4.37, - "learning_rate": 2.0804979251080795e-06, - "loss": 0.2443, + "epoch": 4.4736367895628355, + "grad_norm": 0.2755228877067566, + "learning_rate": 1.447703249028759e-06, + "loss": 0.3925, "step": 124130 }, { - "epoch": 4.37, - "learning_rate": 2.0793603163005713e-06, - "loss": 0.263, + "epoch": 4.47381698922406, + "grad_norm": 0.29217204451560974, + "learning_rate": 1.4467247990623462e-06, + "loss": 0.3763, "step": 124135 }, { - "epoch": 4.37, - "learning_rate": 2.078223005102467e-06, - "loss": 0.26, + "epoch": 4.473997188885285, + "grad_norm": 0.22582022845745087, + "learning_rate": 1.4457466700047496e-06, + "loss": 0.3484, "step": 124140 }, { - "epoch": 4.37, - "learning_rate": 2.077085991528532e-06, - "loss": 0.251, + "epoch": 4.47417738854651, + "grad_norm": 0.2606388330459595, + "learning_rate": 1.4447688618692916e-06, + "loss": 0.3651, "step": 124145 }, { - "epoch": 4.37, - "learning_rate": 2.0759492755935304e-06, - "loss": 0.2627, + "epoch": 4.474357588207734, + "grad_norm": 0.24026912450790405, + "learning_rate": 1.4437913746692955e-06, + "loss": 0.3714, "step": 124150 }, { - "epoch": 4.37, - "learning_rate": 2.0748128573122138e-06, - "loss": 0.2459, + "epoch": 4.474537787868959, + "grad_norm": 0.28807055950164795, + "learning_rate": 1.4428142084180862e-06, + "loss": 0.359, "step": 124155 }, { - "epoch": 4.37, - "learning_rate": 2.073676736699348e-06, - "loss": 0.2482, + "epoch": 4.474717987530184, + "grad_norm": 0.23027734458446503, + "learning_rate": 1.4418373631289671e-06, + "loss": 0.3887, "step": 124160 }, { - "epoch": 4.37, - "learning_rate": 2.0725409137696774e-06, - "loss": 0.2548, + "epoch": 4.474898187191408, + "grad_norm": 0.2151884138584137, + "learning_rate": 1.4408608388152583e-06, + "loss": 0.3844, "step": 124165 }, { - "epoch": 4.37, - "learning_rate": 2.0714053885379585e-06, - "loss": 0.2491, + "epoch": 4.4750783868526325, + "grad_norm": 0.30616864562034607, + "learning_rate": 1.4398846354902574e-06, + "loss": 0.376, "step": 124170 }, { - "epoch": 4.37, - "learning_rate": 2.0702701610189223e-06, - "loss": 0.2515, + "epoch": 4.475258586513857, + "grad_norm": 0.28647398948669434, + "learning_rate": 1.4389087531672674e-06, + "loss": 0.3926, "step": 124175 }, { - "epoch": 4.37, - "learning_rate": 
2.069135231227323e-06, - "loss": 0.2509, + "epoch": 4.475438786175082, + "grad_norm": 0.24233390390872955, + "learning_rate": 1.4379331918595835e-06, + "loss": 0.3548, "step": 124180 }, { - "epoch": 4.37, - "learning_rate": 2.068000599177891e-06, - "loss": 0.2418, + "epoch": 4.475618985836307, + "grad_norm": 0.2877272963523865, + "learning_rate": 1.4369579515804975e-06, + "loss": 0.3599, "step": 124185 }, { - "epoch": 4.37, - "learning_rate": 2.0668662648853587e-06, - "loss": 0.2332, + "epoch": 4.475799185497531, + "grad_norm": 0.2793196737766266, + "learning_rate": 1.4359830323432937e-06, + "loss": 0.3681, "step": 124190 }, { - "epoch": 4.37, - "learning_rate": 2.0657322283644503e-06, - "loss": 0.2485, + "epoch": 4.475979385158756, + "grad_norm": 0.21865501999855042, + "learning_rate": 1.435008434161264e-06, + "loss": 0.3772, "step": 124195 }, { - "epoch": 4.37, - "learning_rate": 2.0645984896298986e-06, - "loss": 0.2504, + "epoch": 4.476159584819981, + "grad_norm": 0.24669122695922852, + "learning_rate": 1.434034157047684e-06, + "loss": 0.3853, "step": 124200 }, { - "epoch": 4.37, - "learning_rate": 2.0634650486964225e-06, - "loss": 0.2497, + "epoch": 4.476339784481205, + "grad_norm": 0.23632431030273438, + "learning_rate": 1.4330602010158235e-06, + "loss": 0.3896, "step": 124205 }, { - "epoch": 4.37, - "learning_rate": 2.062331905578735e-06, - "loss": 0.2594, + "epoch": 4.4765199841424295, + "grad_norm": 0.2615738809108734, + "learning_rate": 1.4320865660789551e-06, + "loss": 0.3571, "step": 124210 }, { - "epoch": 4.37, - "learning_rate": 2.0611990602915464e-06, - "loss": 0.2553, + "epoch": 4.476700183803654, + "grad_norm": 0.23187601566314697, + "learning_rate": 1.4311132522503407e-06, + "loss": 0.3799, "step": 124215 }, { - "epoch": 4.37, - "learning_rate": 2.0600665128495707e-06, - "loss": 0.2484, + "epoch": 4.476880383464879, + "grad_norm": 0.23954427242279053, + "learning_rate": 1.4301402595432472e-06, + "loss": 0.3563, "step": 124220 }, { - "epoch": 4.37, - "learning_rate": 2.0589342632675173e-06, - "loss": 0.2325, + "epoch": 4.477060583126104, + "grad_norm": 0.2520751953125, + "learning_rate": 1.429167587970931e-06, + "loss": 0.3862, "step": 124225 }, { - "epoch": 4.37, - "learning_rate": 2.0578023115600866e-06, - "loss": 0.2434, + "epoch": 4.477240782787328, + "grad_norm": 0.2765583097934723, + "learning_rate": 1.4281952375466451e-06, + "loss": 0.3983, "step": 124230 }, { - "epoch": 4.37, - "learning_rate": 2.056670657741966e-06, - "loss": 0.2457, + "epoch": 4.477420982448553, + "grad_norm": 0.2432951033115387, + "learning_rate": 1.4272232082836317e-06, + "loss": 0.3751, "step": 124235 }, { - "epoch": 4.37, - "learning_rate": 2.0555393018278635e-06, - "loss": 0.2459, + "epoch": 4.477601182109778, + "grad_norm": 0.2171791046857834, + "learning_rate": 1.4262515001951389e-06, + "loss": 0.3707, "step": 124240 }, { - "epoch": 4.37, - "learning_rate": 2.054408243832459e-06, - "loss": 0.2463, + "epoch": 4.477781381771003, + "grad_norm": 0.2439403533935547, + "learning_rate": 1.4252801132944055e-06, + "loss": 0.3821, "step": 124245 }, { - "epoch": 4.37, - "learning_rate": 2.053277483770444e-06, - "loss": 0.2368, + "epoch": 4.477961581432227, + "grad_norm": 0.2420462816953659, + "learning_rate": 1.4243090475946713e-06, + "loss": 0.3772, "step": 124250 }, { - "epoch": 4.37, - "learning_rate": 2.0521470216564946e-06, - "loss": 0.2472, + "epoch": 4.478141781093451, + "grad_norm": 0.28287315368652344, + "learning_rate": 1.4233383031091535e-06, + "loss": 0.3906, "step": 124255 }, { - "epoch": 
4.37, - "learning_rate": 2.0510168575052945e-06, - "loss": 0.2458, + "epoch": 4.478321980754676, + "grad_norm": 0.22504858672618866, + "learning_rate": 1.4223678798510915e-06, + "loss": 0.3988, "step": 124260 }, { - "epoch": 4.37, - "learning_rate": 2.0498869913315206e-06, - "loss": 0.2398, + "epoch": 4.478502180415901, + "grad_norm": 0.2430412620306015, + "learning_rate": 1.4213977778336995e-06, + "loss": 0.3859, "step": 124265 }, { - "epoch": 4.37, - "learning_rate": 2.048757423149833e-06, - "loss": 0.2281, + "epoch": 4.478682380077125, + "grad_norm": 0.31164422631263733, + "learning_rate": 1.4204279970702034e-06, + "loss": 0.4001, "step": 124270 }, { - "epoch": 4.37, - "learning_rate": 2.0476281529749092e-06, - "loss": 0.268, + "epoch": 4.47886257973835, + "grad_norm": 0.3133993446826935, + "learning_rate": 1.4194585375738061e-06, + "loss": 0.3915, "step": 124275 }, { - "epoch": 4.37, - "learning_rate": 2.046499180821401e-06, - "loss": 0.259, + "epoch": 4.479042779399575, + "grad_norm": 0.27546271681785583, + "learning_rate": 1.4184893993577198e-06, + "loss": 0.3829, "step": 124280 }, { - "epoch": 4.37, - "learning_rate": 2.045370506703981e-06, - "loss": 0.2406, + "epoch": 4.4792229790608, + "grad_norm": 0.32316166162490845, + "learning_rate": 1.417520582435153e-06, + "loss": 0.3139, "step": 124285 }, { - "epoch": 4.37, - "learning_rate": 2.0442421306372945e-06, - "loss": 0.258, + "epoch": 4.479403178722024, + "grad_norm": 0.23698268830776215, + "learning_rate": 1.4165520868193034e-06, + "loss": 0.3568, "step": 124290 }, { - "epoch": 4.37, - "learning_rate": 2.043114052635997e-06, - "loss": 0.2724, + "epoch": 4.479583378383249, + "grad_norm": 0.26185452938079834, + "learning_rate": 1.4155839125233634e-06, + "loss": 0.3757, "step": 124295 }, { - "epoch": 4.37, - "learning_rate": 2.0419862727147356e-06, - "loss": 0.2524, + "epoch": 4.479763578044473, + "grad_norm": 0.24788565933704376, + "learning_rate": 1.414616059560528e-06, + "loss": 0.3441, "step": 124300 }, { - "epoch": 4.37, - "learning_rate": 2.0408587908881505e-06, - "loss": 0.2328, + "epoch": 4.479943777705698, + "grad_norm": 0.3030286729335785, + "learning_rate": 1.4136485279439783e-06, + "loss": 0.3465, "step": 124305 }, { - "epoch": 4.37, - "learning_rate": 2.0397316071708806e-06, - "loss": 0.2231, + "epoch": 4.480123977366922, + "grad_norm": 0.2569609582424164, + "learning_rate": 1.4126813176869036e-06, + "loss": 0.3192, "step": 124310 }, { - "epoch": 4.37, - "learning_rate": 2.0386047215775692e-06, - "loss": 0.2343, + "epoch": 4.480304177028147, + "grad_norm": 0.22701065242290497, + "learning_rate": 1.4117144288024797e-06, + "loss": 0.3387, "step": 124315 }, { - "epoch": 4.37, - "learning_rate": 2.0374781341228444e-06, - "loss": 0.2617, + "epoch": 4.480484376689372, + "grad_norm": 0.23131400346755981, + "learning_rate": 1.4107478613038821e-06, + "loss": 0.3596, "step": 124320 }, { - "epoch": 4.37, - "learning_rate": 2.036351844821327e-06, - "loss": 0.2502, + "epoch": 4.480664576350597, + "grad_norm": 0.260778546333313, + "learning_rate": 1.4097816152042753e-06, + "loss": 0.3689, "step": 124325 }, { - "epoch": 4.37, - "learning_rate": 2.0352258536876528e-06, - "loss": 0.2625, + "epoch": 4.480844776011821, + "grad_norm": 0.23251162469387054, + "learning_rate": 1.4088156905168293e-06, + "loss": 0.3959, "step": 124330 }, { - "epoch": 4.37, - "learning_rate": 2.03410016073643e-06, - "loss": 0.2404, + "epoch": 4.481024975673046, + "grad_norm": 0.25117242336273193, + "learning_rate": 1.4078500872546973e-06, + "loss": 0.3879, 
"step": 124335 }, { - "epoch": 4.37, - "learning_rate": 2.0329747659822873e-06, - "loss": 0.2491, + "epoch": 4.481205175334271, + "grad_norm": 0.2599792778491974, + "learning_rate": 1.4068848054310469e-06, + "loss": 0.3692, "step": 124340 }, { - "epoch": 4.37, - "learning_rate": 2.031849669439831e-06, - "loss": 0.24, + "epoch": 4.481385374995495, + "grad_norm": 0.28005966544151306, + "learning_rate": 1.4059198450590143e-06, + "loss": 0.3937, "step": 124345 }, { - "epoch": 4.37, - "learning_rate": 2.030724871123668e-06, - "loss": 0.2722, + "epoch": 4.481565574656719, + "grad_norm": 0.24943847954273224, + "learning_rate": 1.4049552061517619e-06, + "loss": 0.344, "step": 124350 }, { - "epoch": 4.38, - "learning_rate": 2.029600371048407e-06, - "loss": 0.2395, + "epoch": 4.481745774317944, + "grad_norm": 0.2192724198102951, + "learning_rate": 1.4039908887224285e-06, + "loss": 0.3625, "step": 124355 }, { - "epoch": 4.38, - "learning_rate": 2.0284761692286493e-06, - "loss": 0.2606, + "epoch": 4.481925973979169, + "grad_norm": 0.2770889401435852, + "learning_rate": 1.4030268927841427e-06, + "loss": 0.3775, "step": 124360 }, { - "epoch": 4.38, - "learning_rate": 2.027352265678986e-06, - "loss": 0.2531, + "epoch": 4.4821061736403935, + "grad_norm": 0.22299063205718994, + "learning_rate": 1.4020632183500583e-06, + "loss": 0.3846, "step": 124365 }, { - "epoch": 4.38, - "learning_rate": 2.0262286604140144e-06, - "loss": 0.2336, + "epoch": 4.482286373301618, + "grad_norm": 0.2386179119348526, + "learning_rate": 1.4010998654332892e-06, + "loss": 0.3591, "step": 124370 }, { - "epoch": 4.38, - "learning_rate": 2.0251053534483254e-06, - "loss": 0.2459, + "epoch": 4.482466572962843, + "grad_norm": 0.24074918031692505, + "learning_rate": 1.4001368340469611e-06, + "loss": 0.3799, "step": 124375 }, { - "epoch": 4.38, - "learning_rate": 2.0239823447964963e-06, - "loss": 0.2519, + "epoch": 4.482646772624068, + "grad_norm": 0.2530936896800995, + "learning_rate": 1.3991741242042027e-06, + "loss": 0.3429, "step": 124380 }, { - "epoch": 4.38, - "learning_rate": 2.0228596344731205e-06, - "loss": 0.2721, + "epoch": 4.4828269722852925, + "grad_norm": 0.28871747851371765, + "learning_rate": 1.3982117359181286e-06, + "loss": 0.3948, "step": 124385 }, { - "epoch": 4.38, - "learning_rate": 2.021737222492767e-06, - "loss": 0.2573, + "epoch": 4.483007171946516, + "grad_norm": 0.2461950182914734, + "learning_rate": 1.3972496692018499e-06, + "loss": 0.357, "step": 124390 }, { - "epoch": 4.38, - "learning_rate": 2.0206151088700137e-06, - "loss": 0.2838, + "epoch": 4.483187371607741, + "grad_norm": 0.21096403896808624, + "learning_rate": 1.3962879240684734e-06, + "loss": 0.3569, "step": 124395 }, { - "epoch": 4.38, - "learning_rate": 2.019493293619432e-06, - "loss": 0.2519, + "epoch": 4.483367571268966, + "grad_norm": 0.3317560851573944, + "learning_rate": 1.3953265005311021e-06, + "loss": 0.3861, "step": 124400 }, { - "epoch": 4.38, - "learning_rate": 2.018371776755579e-06, - "loss": 0.2379, + "epoch": 4.4835477709301905, + "grad_norm": 0.2790171504020691, + "learning_rate": 1.39436539860284e-06, + "loss": 0.3131, "step": 124405 }, { - "epoch": 4.38, - "learning_rate": 2.0172505582930295e-06, - "loss": 0.2365, + "epoch": 4.483727970591415, + "grad_norm": 0.21916787326335907, + "learning_rate": 1.3934046182967814e-06, + "loss": 0.3983, "step": 124410 }, { - "epoch": 4.38, - "learning_rate": 2.0161296382463335e-06, - "loss": 0.2537, + "epoch": 4.48390817025264, + "grad_norm": 0.25883591175079346, + "learning_rate": 
1.3924441596260107e-06, + "loss": 0.3775, "step": 124415 }, { - "epoch": 4.38, - "learning_rate": 2.0150090166300484e-06, - "loss": 0.2313, + "epoch": 4.484088369913865, + "grad_norm": 0.18844369053840637, + "learning_rate": 1.3914840226036202e-06, + "loss": 0.4003, "step": 124420 }, { - "epoch": 4.38, - "learning_rate": 2.013888693458718e-06, - "loss": 0.2522, + "epoch": 4.4842685695750895, + "grad_norm": 0.23868656158447266, + "learning_rate": 1.3905242072426854e-06, + "loss": 0.3791, "step": 124425 }, { - "epoch": 4.38, - "learning_rate": 2.0127686687469004e-06, - "loss": 0.2342, + "epoch": 4.484448769236314, + "grad_norm": 0.27947574853897095, + "learning_rate": 1.3895647135562906e-06, + "loss": 0.3521, "step": 124430 }, { - "epoch": 4.38, - "learning_rate": 2.011648942509131e-06, - "loss": 0.2482, + "epoch": 4.484628968897539, + "grad_norm": 0.23540154099464417, + "learning_rate": 1.3886055415575084e-06, + "loss": 0.355, "step": 124435 }, { - "epoch": 4.38, - "learning_rate": 2.010529514759946e-06, - "loss": 0.2342, + "epoch": 4.484809168558763, + "grad_norm": 0.23615595698356628, + "learning_rate": 1.3876466912593976e-06, + "loss": 0.3757, "step": 124440 }, { - "epoch": 4.38, - "learning_rate": 2.0094103855138852e-06, - "loss": 0.23, + "epoch": 4.4849893682199875, + "grad_norm": 0.22659413516521454, + "learning_rate": 1.3866881626750345e-06, + "loss": 0.3436, "step": 124445 }, { - "epoch": 4.38, - "learning_rate": 2.0082915547854824e-06, - "loss": 0.2378, + "epoch": 4.485169567881212, + "grad_norm": 0.24826356768608093, + "learning_rate": 1.385729955817469e-06, + "loss": 0.3645, "step": 124450 }, { - "epoch": 4.38, - "learning_rate": 2.007173022589262e-06, - "loss": 0.2624, + "epoch": 4.485349767542437, + "grad_norm": 0.24432316422462463, + "learning_rate": 1.3847720706997663e-06, + "loss": 0.3564, "step": 124455 }, { - "epoch": 4.38, - "learning_rate": 2.0060547889397484e-06, - "loss": 0.2536, + "epoch": 4.485529967203662, + "grad_norm": 0.21962685883045197, + "learning_rate": 1.3838145073349712e-06, + "loss": 0.3265, "step": 124460 }, { - "epoch": 4.38, - "learning_rate": 2.004936853851455e-06, - "loss": 0.2516, + "epoch": 4.485710166864886, + "grad_norm": 0.28142571449279785, + "learning_rate": 1.382857265736126e-06, + "loss": 0.3877, "step": 124465 }, { - "epoch": 4.38, - "learning_rate": 2.0038192173389063e-06, - "loss": 0.2552, + "epoch": 4.485890366526111, + "grad_norm": 0.20428895950317383, + "learning_rate": 1.3819003459162816e-06, + "loss": 0.3767, "step": 124470 }, { - "epoch": 4.38, - "learning_rate": 2.0027018794166102e-06, - "loss": 0.2291, + "epoch": 4.486070566187336, + "grad_norm": 0.27324962615966797, + "learning_rate": 1.3809437478884719e-06, + "loss": 0.3281, "step": 124475 }, { - "epoch": 4.38, - "learning_rate": 2.0015848400990716e-06, - "loss": 0.257, + "epoch": 4.48625076584856, + "grad_norm": 0.2558141350746155, + "learning_rate": 1.3799874716657307e-06, + "loss": 0.384, "step": 124480 }, { - "epoch": 4.38, - "learning_rate": 2.000468099400796e-06, - "loss": 0.2471, + "epoch": 4.4864309655097845, + "grad_norm": 0.24941542744636536, + "learning_rate": 1.3790315172610896e-06, + "loss": 0.3501, "step": 124485 }, { - "epoch": 4.38, - "learning_rate": 1.9993516573362904e-06, - "loss": 0.2406, + "epoch": 4.486611165171009, + "grad_norm": 0.2563149333000183, + "learning_rate": 1.3780758846875658e-06, + "loss": 0.3841, "step": 124490 }, { - "epoch": 4.38, - "learning_rate": 1.998235513920038e-06, - "loss": 0.2725, + "epoch": 4.486791364832234, + "grad_norm": 
0.2376670092344284, + "learning_rate": 1.3771205739581877e-06, + "loss": 0.3725, "step": 124495 }, { - "epoch": 4.38, - "learning_rate": 1.9971196691665415e-06, - "loss": 0.245, + "epoch": 4.486971564493459, + "grad_norm": 0.30792880058288574, + "learning_rate": 1.37616558508597e-06, + "loss": 0.3779, "step": 124500 }, { - "epoch": 4.38, - "eval_loss": 0.24900418519973755, - "eval_runtime": 10.5316, - "eval_samples_per_second": 9.495, - "eval_steps_per_second": 9.495, + "epoch": 4.486971564493459, + "eval_loss": 0.42892351746559143, + "eval_runtime": 3.5425, + "eval_samples_per_second": 28.229, + "eval_steps_per_second": 7.057, "step": 124500 }, { - "epoch": 4.38, - "learning_rate": 1.9960041230902893e-06, - "loss": 0.2467, + "epoch": 4.487151764154683, + "grad_norm": 0.28269314765930176, + "learning_rate": 1.3752109180839217e-06, + "loss": 0.4113, "step": 124505 }, { - "epoch": 4.38, - "learning_rate": 1.9948888757057637e-06, - "loss": 0.2583, + "epoch": 4.487331963815908, + "grad_norm": 0.27605292201042175, + "learning_rate": 1.3742565729650491e-06, + "loss": 0.3593, "step": 124510 }, { - "epoch": 4.38, - "learning_rate": 1.9937739270274454e-06, - "loss": 0.2436, + "epoch": 4.487512163477133, + "grad_norm": 0.2970362901687622, + "learning_rate": 1.3733025497423586e-06, + "loss": 0.3946, "step": 124515 }, { - "epoch": 4.38, - "learning_rate": 1.9926592770698057e-06, - "loss": 0.2666, + "epoch": 4.487692363138358, + "grad_norm": 0.285276859998703, + "learning_rate": 1.3723488484288421e-06, + "loss": 0.3863, "step": 124520 }, { - "epoch": 4.38, - "learning_rate": 1.9915449258473274e-06, - "loss": 0.2608, + "epoch": 4.487872562799582, + "grad_norm": 0.2632303535938263, + "learning_rate": 1.3713954690375035e-06, + "loss": 0.4355, "step": 124525 }, { - "epoch": 4.38, - "learning_rate": 1.990430873374477e-06, - "loss": 0.2429, + "epoch": 4.488052762460806, + "grad_norm": 0.3081340193748474, + "learning_rate": 1.370442411581324e-06, + "loss": 0.3526, "step": 124530 }, { - "epoch": 4.38, - "learning_rate": 1.989317119665715e-06, - "loss": 0.2524, + "epoch": 4.488232962122031, + "grad_norm": 0.2594117820262909, + "learning_rate": 1.3694896760732933e-06, + "loss": 0.3852, "step": 124535 }, { - "epoch": 4.38, - "learning_rate": 1.9882036647355025e-06, - "loss": 0.2692, + "epoch": 4.488413161783256, + "grad_norm": 0.30716049671173096, + "learning_rate": 1.3685372625263899e-06, + "loss": 0.3624, "step": 124540 }, { - "epoch": 4.38, - "learning_rate": 1.987090508598305e-06, - "loss": 0.2596, + "epoch": 4.48859336144448, + "grad_norm": 0.23581121861934662, + "learning_rate": 1.367585170953589e-06, + "loss": 0.3477, "step": 124545 }, { - "epoch": 4.38, - "learning_rate": 1.9859776512685667e-06, - "loss": 0.2396, + "epoch": 4.488773561105705, + "grad_norm": 0.24266508221626282, + "learning_rate": 1.3666334013678729e-06, + "loss": 0.4177, "step": 124550 }, { - "epoch": 4.38, - "learning_rate": 1.984865092760746e-06, - "loss": 0.2326, + "epoch": 4.48895376076693, + "grad_norm": 0.22927452623844147, + "learning_rate": 1.365681953782194e-06, + "loss": 0.3946, "step": 124555 }, { - "epoch": 4.38, - "learning_rate": 1.983752833089278e-06, - "loss": 0.2454, + "epoch": 4.489133960428155, + "grad_norm": 0.24199864268302917, + "learning_rate": 1.3647308282095206e-06, + "loss": 0.3845, "step": 124560 }, { - "epoch": 4.38, - "learning_rate": 1.9826408722686178e-06, - "loss": 0.2596, + "epoch": 4.489314160089379, + "grad_norm": 0.2787548303604126, + "learning_rate": 1.3637800246628197e-06, + "loss": 0.385, "step": 
124565 }, { - "epoch": 4.38, - "learning_rate": 1.9815292103131988e-06, - "loss": 0.273, + "epoch": 4.489494359750604, + "grad_norm": 0.2417118102312088, + "learning_rate": 1.3628295431550365e-06, + "loss": 0.3755, "step": 124570 }, { - "epoch": 4.38, - "learning_rate": 1.9804178472374506e-06, - "loss": 0.2503, + "epoch": 4.489674559411828, + "grad_norm": 0.2551795542240143, + "learning_rate": 1.3618793836991273e-06, + "loss": 0.3499, "step": 124575 }, { - "epoch": 4.38, - "learning_rate": 1.979306783055801e-06, - "loss": 0.248, + "epoch": 4.489854759073053, + "grad_norm": 0.3174431324005127, + "learning_rate": 1.3609295463080318e-06, + "loss": 0.3807, "step": 124580 }, { - "epoch": 4.38, - "learning_rate": 1.978196017782688e-06, - "loss": 0.2314, + "epoch": 4.490034958734277, + "grad_norm": 0.23123787343502045, + "learning_rate": 1.3599800309946925e-06, + "loss": 0.3648, "step": 124585 }, { - "epoch": 4.38, - "learning_rate": 1.9770855514325284e-06, - "loss": 0.2342, + "epoch": 4.490215158395502, + "grad_norm": 0.27223360538482666, + "learning_rate": 1.3590308377720546e-06, + "loss": 0.3421, "step": 124590 }, { - "epoch": 4.38, - "learning_rate": 1.9759753840197374e-06, - "loss": 0.2402, + "epoch": 4.490395358056727, + "grad_norm": 0.25846990942955017, + "learning_rate": 1.3580819666530408e-06, + "loss": 0.3496, "step": 124595 }, { - "epoch": 4.38, - "learning_rate": 1.97486551555873e-06, - "loss": 0.2625, + "epoch": 4.490575557717952, + "grad_norm": 0.25922471284866333, + "learning_rate": 1.3571334176505856e-06, + "loss": 0.3783, "step": 124600 }, { - "epoch": 4.38, - "learning_rate": 1.973755946063918e-06, - "loss": 0.2473, + "epoch": 4.490755757379176, + "grad_norm": 0.2560346722602844, + "learning_rate": 1.3561851907776064e-06, + "loss": 0.3274, "step": 124605 }, { - "epoch": 4.38, - "learning_rate": 1.9726466755497163e-06, - "loss": 0.2347, + "epoch": 4.490935957040401, + "grad_norm": 0.25044140219688416, + "learning_rate": 1.355237286047026e-06, + "loss": 0.377, "step": 124610 }, { - "epoch": 4.38, - "learning_rate": 1.9715377040305182e-06, - "loss": 0.2508, + "epoch": 4.491116156701626, + "grad_norm": 0.25878745317459106, + "learning_rate": 1.3542897034717644e-06, + "loss": 0.3426, "step": 124615 }, { - "epoch": 4.38, - "learning_rate": 1.970429031520729e-06, - "loss": 0.2427, + "epoch": 4.49129635636285, + "grad_norm": 0.23946064710617065, + "learning_rate": 1.3533424430647285e-06, + "loss": 0.3811, "step": 124620 }, { - "epoch": 4.38, - "learning_rate": 1.969320658034743e-06, - "loss": 0.2439, + "epoch": 4.491476556024074, + "grad_norm": 0.282932311296463, + "learning_rate": 1.3523955048388188e-06, + "loss": 0.3618, "step": 124625 }, { - "epoch": 4.38, - "learning_rate": 1.9682125835869507e-06, - "loss": 0.249, + "epoch": 4.491656755685299, + "grad_norm": 0.2500767707824707, + "learning_rate": 1.3514488888069443e-06, + "loss": 0.3679, "step": 124630 }, { - "epoch": 4.39, - "learning_rate": 1.967104808191736e-06, - "loss": 0.2222, + "epoch": 4.491836955346524, + "grad_norm": 0.2235366553068161, + "learning_rate": 1.3505025949819978e-06, + "loss": 0.3695, "step": 124635 }, { - "epoch": 4.39, - "learning_rate": 1.9659973318634896e-06, - "loss": 0.2549, + "epoch": 4.492017155007749, + "grad_norm": 0.31541529297828674, + "learning_rate": 1.3495566233768742e-06, + "loss": 0.4089, "step": 124640 }, { - "epoch": 4.39, - "learning_rate": 1.964890154616586e-06, - "loss": 0.2388, + "epoch": 4.492197354668973, + "grad_norm": 0.3026379942893982, + "learning_rate": 1.3486109740044688e-06, + 
"loss": 0.3709, "step": 124645 }, { - "epoch": 4.39, - "learning_rate": 1.9637832764654057e-06, - "loss": 0.27, + "epoch": 4.492377554330198, + "grad_norm": 0.26099175214767456, + "learning_rate": 1.347665646877655e-06, + "loss": 0.3478, "step": 124650 }, { - "epoch": 4.39, - "learning_rate": 1.962676697424312e-06, - "loss": 0.2355, + "epoch": 4.492557753991423, + "grad_norm": 0.2050563395023346, + "learning_rate": 1.3467206420093191e-06, + "loss": 0.367, "step": 124655 }, { - "epoch": 4.39, - "learning_rate": 1.96157041750768e-06, - "loss": 0.2612, + "epoch": 4.4927379536526475, + "grad_norm": 0.26844698190689087, + "learning_rate": 1.3457759594123348e-06, + "loss": 0.3862, "step": 124660 }, { - "epoch": 4.39, - "learning_rate": 1.960464436729878e-06, - "loss": 0.2444, + "epoch": 4.492918153313871, + "grad_norm": 0.2691669762134552, + "learning_rate": 1.3448315990995746e-06, + "loss": 0.3652, "step": 124665 }, { - "epoch": 4.39, - "learning_rate": 1.959358755105262e-06, - "loss": 0.2326, + "epoch": 4.493098352975096, + "grad_norm": 0.26715192198753357, + "learning_rate": 1.3438875610839035e-06, + "loss": 0.3557, "step": 124670 }, { - "epoch": 4.39, - "learning_rate": 1.9582533726481833e-06, - "loss": 0.2414, + "epoch": 4.493278552636321, + "grad_norm": 0.27473947405815125, + "learning_rate": 1.3429438453781807e-06, + "loss": 0.3646, "step": 124675 }, { - "epoch": 4.39, - "learning_rate": 1.9571482893730007e-06, - "loss": 0.2675, + "epoch": 4.4934587522975455, + "grad_norm": 0.272559255361557, + "learning_rate": 1.3420004519952706e-06, + "loss": 0.3924, "step": 124680 }, { - "epoch": 4.39, - "learning_rate": 1.9560435052940667e-06, - "loss": 0.2569, + "epoch": 4.49363895195877, + "grad_norm": 0.2625700831413269, + "learning_rate": 1.3410573809480242e-06, + "loss": 0.4042, "step": 124685 }, { - "epoch": 4.39, - "learning_rate": 1.9549390204257194e-06, - "loss": 0.2463, + "epoch": 4.493819151619995, + "grad_norm": 0.2726500332355499, + "learning_rate": 1.3401146322492924e-06, + "loss": 0.3691, "step": 124690 }, { - "epoch": 4.39, - "learning_rate": 1.953834834782295e-06, - "loss": 0.2412, + "epoch": 4.49399935128122, + "grad_norm": 0.23851507902145386, + "learning_rate": 1.339172205911915e-06, + "loss": 0.3906, "step": 124695 }, { - "epoch": 4.39, - "learning_rate": 1.952730948378145e-06, - "loss": 0.2457, + "epoch": 4.4941795509424445, + "grad_norm": 0.212006077170372, + "learning_rate": 1.338230101948737e-06, + "loss": 0.3953, "step": 124700 }, { - "epoch": 4.39, - "learning_rate": 1.951627361227593e-06, - "loss": 0.2618, + "epoch": 4.494359750603669, + "grad_norm": 0.24577181041240692, + "learning_rate": 1.33728832037259e-06, + "loss": 0.354, "step": 124705 }, { - "epoch": 4.39, - "learning_rate": 1.950524073344967e-06, - "loss": 0.2526, + "epoch": 4.494539950264894, + "grad_norm": 0.24666406214237213, + "learning_rate": 1.336346861196311e-06, + "loss": 0.3328, "step": 124710 }, { - "epoch": 4.39, - "learning_rate": 1.949421084744599e-06, - "loss": 0.2455, + "epoch": 4.494720149926118, + "grad_norm": 0.22719623148441315, + "learning_rate": 1.3354057244327256e-06, + "loss": 0.3629, "step": 124715 }, { - "epoch": 4.39, - "learning_rate": 1.948318395440804e-06, - "loss": 0.2467, + "epoch": 4.4949003495873425, + "grad_norm": 0.24336789548397064, + "learning_rate": 1.3344649100946543e-06, + "loss": 0.3702, "step": 124720 }, { - "epoch": 4.39, - "learning_rate": 1.9472160054479077e-06, - "loss": 0.2602, + "epoch": 4.495080549248567, + "grad_norm": 0.2589198350906372, + "learning_rate": 
1.3335244181949174e-06, + "loss": 0.3501, "step": 124725 }, { - "epoch": 4.39, - "learning_rate": 1.946113914780212e-06, - "loss": 0.2352, + "epoch": 4.495260748909792, + "grad_norm": 0.2021062821149826, + "learning_rate": 1.3325842487463241e-06, + "loss": 0.3659, "step": 124730 }, { - "epoch": 4.39, - "learning_rate": 1.945012123452042e-06, - "loss": 0.2524, + "epoch": 4.495440948571017, + "grad_norm": 0.22475126385688782, + "learning_rate": 1.3316444017616975e-06, + "loss": 0.3618, "step": 124735 }, { - "epoch": 4.39, - "learning_rate": 1.943910631477694e-06, - "loss": 0.245, + "epoch": 4.4956211482322415, + "grad_norm": 0.29188576340675354, + "learning_rate": 1.3307048772538272e-06, + "loss": 0.3786, "step": 124740 }, { - "epoch": 4.39, - "learning_rate": 1.9428094388714707e-06, - "loss": 0.2418, + "epoch": 4.495801347893466, + "grad_norm": 0.2823890149593353, + "learning_rate": 1.3297656752355197e-06, + "loss": 0.3519, "step": 124745 }, { - "epoch": 4.39, - "learning_rate": 1.9417085456476693e-06, - "loss": 0.2375, + "epoch": 4.495981547554691, + "grad_norm": 0.2329128384590149, + "learning_rate": 1.3288267957195733e-06, + "loss": 0.3333, "step": 124750 }, { - "epoch": 4.39, - "learning_rate": 1.940607951820592e-06, - "loss": 0.2728, + "epoch": 4.496161747215915, + "grad_norm": 0.3408520817756653, + "learning_rate": 1.3278882387187775e-06, + "loss": 0.3682, "step": 124755 }, { - "epoch": 4.39, - "learning_rate": 1.9395076574045213e-06, - "loss": 0.2674, + "epoch": 4.4963419468771395, + "grad_norm": 0.2713524103164673, + "learning_rate": 1.3269500042459276e-06, + "loss": 0.3608, "step": 124760 }, { - "epoch": 4.39, - "learning_rate": 1.938407662413741e-06, - "loss": 0.2358, + "epoch": 4.496522146538364, + "grad_norm": 0.2413674145936966, + "learning_rate": 1.3260120923137943e-06, + "loss": 0.3557, "step": 124765 }, { - "epoch": 4.39, - "learning_rate": 1.937307966862545e-06, - "loss": 0.2655, + "epoch": 4.496702346199589, + "grad_norm": 0.29502227902412415, + "learning_rate": 1.325074502935164e-06, + "loss": 0.3778, "step": 124770 }, { - "epoch": 4.39, - "learning_rate": 1.936208570765202e-06, - "loss": 0.2538, + "epoch": 4.496882545860814, + "grad_norm": 0.21458546817302704, + "learning_rate": 1.3241372361228104e-06, + "loss": 0.3857, "step": 124775 }, { - "epoch": 4.39, - "learning_rate": 1.935109474135996e-06, - "loss": 0.2464, + "epoch": 4.497062745522038, + "grad_norm": 0.241383358836174, + "learning_rate": 1.3232002918895036e-06, + "loss": 0.4047, "step": 124780 }, { - "epoch": 4.39, - "learning_rate": 1.9340106769891924e-06, - "loss": 0.2625, + "epoch": 4.497242945183263, + "grad_norm": 0.2607963979244232, + "learning_rate": 1.3222636702480084e-06, + "loss": 0.3352, "step": 124785 }, { - "epoch": 4.39, - "learning_rate": 1.9329121793390554e-06, - "loss": 0.2676, + "epoch": 4.497423144844488, + "grad_norm": 0.24018988013267517, + "learning_rate": 1.3213273712110895e-06, + "loss": 0.3648, "step": 124790 }, { - "epoch": 4.39, - "learning_rate": 1.9318139811998537e-06, - "loss": 0.2386, + "epoch": 4.497603344505713, + "grad_norm": 0.22829695045948029, + "learning_rate": 1.3203913947914953e-06, + "loss": 0.373, "step": 124795 }, { - "epoch": 4.39, - "learning_rate": 1.9307160825858483e-06, - "loss": 0.2545, + "epoch": 4.497783544166937, + "grad_norm": 0.22519542276859283, + "learning_rate": 1.3194557410019875e-06, + "loss": 0.3512, "step": 124800 }, { - "epoch": 4.39, - "learning_rate": 1.929618483511289e-06, - "loss": 0.2529, + "epoch": 4.497963743828161, + "grad_norm": 
0.2739558219909668, + "learning_rate": 1.3185204098553089e-06, + "loss": 0.3571, "step": 124805 }, { - "epoch": 4.39, - "learning_rate": 1.9285211839904256e-06, - "loss": 0.2464, + "epoch": 4.498143943489386, + "grad_norm": 0.29086029529571533, + "learning_rate": 1.3175854013642074e-06, + "loss": 0.3761, "step": 124810 }, { - "epoch": 4.39, - "learning_rate": 1.9274241840375155e-06, - "loss": 0.2499, + "epoch": 4.498324143150611, + "grad_norm": 0.2614690959453583, + "learning_rate": 1.3166507155414176e-06, + "loss": 0.3772, "step": 124815 }, { - "epoch": 4.39, - "learning_rate": 1.9263274836667927e-06, - "loss": 0.2586, + "epoch": 4.498504342811835, + "grad_norm": 0.2470005601644516, + "learning_rate": 1.315716352399679e-06, + "loss": 0.3845, "step": 124820 }, { - "epoch": 4.39, - "learning_rate": 1.9252310828925035e-06, - "loss": 0.2359, + "epoch": 4.49868454247306, + "grad_norm": 0.22588050365447998, + "learning_rate": 1.3147823119517122e-06, + "loss": 0.3627, "step": 124825 }, { - "epoch": 4.39, - "learning_rate": 1.9241349817288812e-06, - "loss": 0.2371, + "epoch": 4.498864742134285, + "grad_norm": 0.187801331281662, + "learning_rate": 1.3138485942102625e-06, + "loss": 0.3807, "step": 124830 }, { - "epoch": 4.39, - "learning_rate": 1.9230391801901588e-06, - "loss": 0.2598, + "epoch": 4.49904494179551, + "grad_norm": 0.23127955198287964, + "learning_rate": 1.3129151991880307e-06, + "loss": 0.3522, "step": 124835 }, { - "epoch": 4.39, - "learning_rate": 1.9219436782905666e-06, - "loss": 0.2648, + "epoch": 4.499225141456734, + "grad_norm": 0.20928794145584106, + "learning_rate": 1.3119821268977455e-06, + "loss": 0.3553, "step": 124840 }, { - "epoch": 4.39, - "learning_rate": 1.9208484760443235e-06, - "loss": 0.2486, + "epoch": 4.499405341117959, + "grad_norm": 0.2646942138671875, + "learning_rate": 1.3110493773521193e-06, + "loss": 0.3659, "step": 124845 }, { - "epoch": 4.39, - "learning_rate": 1.9197535734656545e-06, - "loss": 0.2699, + "epoch": 4.499585540779183, + "grad_norm": 0.2235756665468216, + "learning_rate": 1.3101169505638582e-06, + "loss": 0.3924, "step": 124850 }, { - "epoch": 4.39, - "learning_rate": 1.9186589705687762e-06, - "loss": 0.2401, + "epoch": 4.499765740440408, + "grad_norm": 0.2302202582359314, + "learning_rate": 1.3091848465456692e-06, + "loss": 0.3704, "step": 124855 }, { - "epoch": 4.39, - "learning_rate": 1.917564667367902e-06, - "loss": 0.2449, + "epoch": 4.499945940101632, + "grad_norm": 0.2802170515060425, + "learning_rate": 1.3082530653102444e-06, + "loss": 0.3699, "step": 124860 }, { - "epoch": 4.39, - "learning_rate": 1.9164706638772316e-06, - "loss": 0.236, + "epoch": 4.500126139762857, + "grad_norm": 0.2077890932559967, + "learning_rate": 1.3073216068702876e-06, + "loss": 0.3576, "step": 124865 }, { - "epoch": 4.39, - "learning_rate": 1.9153769601109844e-06, - "loss": 0.26, + "epoch": 4.500306339424082, + "grad_norm": 0.2789739966392517, + "learning_rate": 1.3063904712384889e-06, + "loss": 0.3412, "step": 124870 }, { - "epoch": 4.39, - "learning_rate": 1.9142835560833515e-06, - "loss": 0.2426, + "epoch": 4.500486539085307, + "grad_norm": 0.2757323086261749, + "learning_rate": 1.3054596584275325e-06, + "loss": 0.3565, "step": 124875 }, { - "epoch": 4.39, - "learning_rate": 1.9131904518085276e-06, - "loss": 0.2414, + "epoch": 4.500666738746531, + "grad_norm": 0.18960040807724, + "learning_rate": 1.3045291684500999e-06, + "loss": 0.3404, "step": 124880 }, { - "epoch": 4.39, - "learning_rate": 1.912097647300715e-06, - "loss": 0.2513, + "epoch": 
4.500846938407756, + "grad_norm": 0.23652799427509308, + "learning_rate": 1.3035990013188698e-06, + "loss": 0.3883, "step": 124885 }, { - "epoch": 4.39, - "learning_rate": 1.911005142574099e-06, - "loss": 0.2624, + "epoch": 4.501027138068981, + "grad_norm": 0.2553980350494385, + "learning_rate": 1.3026691570465128e-06, + "loss": 0.3724, "step": 124890 }, { - "epoch": 4.39, - "learning_rate": 1.9099129376428658e-06, - "loss": 0.2521, + "epoch": 4.5012073377302055, + "grad_norm": 0.28166186809539795, + "learning_rate": 1.3017396356457045e-06, + "loss": 0.3743, "step": 124895 }, { - "epoch": 4.39, - "learning_rate": 1.908821032521196e-06, - "loss": 0.2464, + "epoch": 4.501387537391429, + "grad_norm": 0.25217875838279724, + "learning_rate": 1.3008104371291047e-06, + "loss": 0.3456, "step": 124900 }, { - "epoch": 4.39, - "learning_rate": 1.907729427223265e-06, - "loss": 0.2497, + "epoch": 4.501567737052654, + "grad_norm": 0.22290779650211334, + "learning_rate": 1.299881561509375e-06, + "loss": 0.3797, "step": 124905 }, { - "epoch": 4.39, - "learning_rate": 1.9066381217632516e-06, - "loss": 0.2499, + "epoch": 4.501747936713879, + "grad_norm": 0.25568974018096924, + "learning_rate": 1.2989530087991696e-06, + "loss": 0.3846, "step": 124910 }, { - "epoch": 4.39, - "learning_rate": 1.9055471161553262e-06, - "loss": 0.2384, + "epoch": 4.501928136375104, + "grad_norm": 0.23674674332141876, + "learning_rate": 1.2980247790111367e-06, + "loss": 0.4019, "step": 124915 }, { - "epoch": 4.4, - "learning_rate": 1.9044564104136492e-06, - "loss": 0.2524, + "epoch": 4.502108336036328, + "grad_norm": 0.22385911643505096, + "learning_rate": 1.2970968721579324e-06, + "loss": 0.3634, "step": 124920 }, { - "epoch": 4.4, - "learning_rate": 1.9033660045523788e-06, - "loss": 0.2348, + "epoch": 4.502288535697553, + "grad_norm": 0.3423471748828888, + "learning_rate": 1.296169288252197e-06, + "loss": 0.3883, "step": 124925 }, { - "epoch": 4.4, - "learning_rate": 1.902275898585687e-06, - "loss": 0.2543, + "epoch": 4.502468735358778, + "grad_norm": 0.29338136315345764, + "learning_rate": 1.295242027306559e-06, + "loss": 0.3775, "step": 124930 }, { - "epoch": 4.4, - "learning_rate": 1.9011860925277153e-06, - "loss": 0.2533, + "epoch": 4.5026489350200025, + "grad_norm": 0.17532098293304443, + "learning_rate": 1.2943150893336614e-06, + "loss": 0.343, "step": 124935 }, { - "epoch": 4.4, - "learning_rate": 1.9000965863926245e-06, - "loss": 0.231, + "epoch": 4.502829134681226, + "grad_norm": 0.27964937686920166, + "learning_rate": 1.29338847434613e-06, + "loss": 0.3547, "step": 124940 }, { - "epoch": 4.4, - "learning_rate": 1.8990073801945508e-06, - "loss": 0.2458, + "epoch": 4.503009334342451, + "grad_norm": 0.2344919592142105, + "learning_rate": 1.2924621823565964e-06, + "loss": 0.3394, "step": 124945 }, { - "epoch": 4.4, - "learning_rate": 1.8979184739476463e-06, - "loss": 0.2199, + "epoch": 4.503189534003676, + "grad_norm": 0.2620304226875305, + "learning_rate": 1.2915362133776725e-06, + "loss": 0.3778, "step": 124950 }, { - "epoch": 4.4, - "learning_rate": 1.8968298676660472e-06, - "loss": 0.2734, + "epoch": 4.503369733664901, + "grad_norm": 0.26436105370521545, + "learning_rate": 1.2906105674219737e-06, + "loss": 0.3555, "step": 124955 }, { - "epoch": 4.4, - "learning_rate": 1.8957415613638813e-06, - "loss": 0.2508, + "epoch": 4.503549933326125, + "grad_norm": 0.25804218649864197, + "learning_rate": 1.2896852445021173e-06, + "loss": 0.361, "step": 124960 }, { - "epoch": 4.4, - "learning_rate": 1.8946535550552897e-06, - 
"loss": 0.2559, + "epoch": 4.50373013298735, + "grad_norm": 0.2341192215681076, + "learning_rate": 1.2887602446307128e-06, + "loss": 0.3672, "step": 124965 }, { - "epoch": 4.4, - "learning_rate": 1.8935658487543944e-06, - "loss": 0.2305, + "epoch": 4.503910332648575, + "grad_norm": 0.26045674085617065, + "learning_rate": 1.2878355678203558e-06, + "loss": 0.3609, "step": 124970 }, { - "epoch": 4.4, - "learning_rate": 1.8924784424753179e-06, - "loss": 0.2515, + "epoch": 4.5040905323097995, + "grad_norm": 0.2202635258436203, + "learning_rate": 1.2869112140836525e-06, + "loss": 0.38, "step": 124975 }, { - "epoch": 4.4, - "learning_rate": 1.891391336232176e-06, - "loss": 0.2453, + "epoch": 4.504270731971024, + "grad_norm": 0.2518618404865265, + "learning_rate": 1.285987183433185e-06, + "loss": 0.3294, "step": 124980 }, { - "epoch": 4.4, - "learning_rate": 1.8903045300390942e-06, - "loss": 0.244, + "epoch": 4.504450931632249, + "grad_norm": 0.22585217654705048, + "learning_rate": 1.2850634758815567e-06, + "loss": 0.3649, "step": 124985 }, { - "epoch": 4.4, - "learning_rate": 1.8892180239101747e-06, - "loss": 0.2466, + "epoch": 4.504631131293473, + "grad_norm": 0.24542665481567383, + "learning_rate": 1.2841400914413465e-06, + "loss": 0.3599, "step": 124990 }, { - "epoch": 4.4, - "learning_rate": 1.8881318178595314e-06, - "loss": 0.2572, + "epoch": 4.5048113309546975, + "grad_norm": 0.23829962313175201, + "learning_rate": 1.2832170301251362e-06, + "loss": 0.379, "step": 124995 }, { - "epoch": 4.4, - "learning_rate": 1.8870459119012612e-06, - "loss": 0.2675, + "epoch": 4.504991530615922, + "grad_norm": 0.30551934242248535, + "learning_rate": 1.2822942919455045e-06, + "loss": 0.4066, "step": 125000 }, { - "epoch": 4.4, - "eval_loss": 0.24892328679561615, - "eval_runtime": 10.538, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 4.504991530615922, + "eval_loss": 0.42903777956962585, + "eval_runtime": 3.5368, + "eval_samples_per_second": 28.274, + "eval_steps_per_second": 7.068, "step": 125000 }, { - "epoch": 4.4, - "learning_rate": 1.8859603060494695e-06, - "loss": 0.2666, + "epoch": 4.505171730277147, + "grad_norm": 0.2084054797887802, + "learning_rate": 1.281371876915019e-06, + "loss": 0.351, "step": 125005 }, { - "epoch": 4.4, - "learning_rate": 1.8848750003182537e-06, - "loss": 0.2405, + "epoch": 4.505351929938372, + "grad_norm": 0.20273062586784363, + "learning_rate": 1.280449785046245e-06, + "loss": 0.366, "step": 125010 }, { - "epoch": 4.4, - "learning_rate": 1.883789994721702e-06, - "loss": 0.2561, + "epoch": 4.5055321295995965, + "grad_norm": 0.2866862714290619, + "learning_rate": 1.2795280163517581e-06, + "loss": 0.413, "step": 125015 }, { - "epoch": 4.4, - "learning_rate": 1.8827052892738979e-06, - "loss": 0.2539, + "epoch": 4.505712329260821, + "grad_norm": 0.2754046618938446, + "learning_rate": 1.2786065708441042e-06, + "loss": 0.3871, "step": 125020 }, { - "epoch": 4.4, - "learning_rate": 1.8816208839889382e-06, - "loss": 0.2762, + "epoch": 4.505892528922046, + "grad_norm": 0.2317970097064972, + "learning_rate": 1.2776854485358453e-06, + "loss": 0.3907, "step": 125025 }, { - "epoch": 4.4, - "learning_rate": 1.8805367788808925e-06, - "loss": 0.2596, + "epoch": 4.50607272858327, + "grad_norm": 0.19607658684253693, + "learning_rate": 1.2767646494395296e-06, + "loss": 0.3727, "step": 125030 }, { - "epoch": 4.4, - "learning_rate": 1.879452973963841e-06, - "loss": 0.2361, + "epoch": 4.5062529282444945, + "grad_norm": 0.18511025607585907, + "learning_rate": 
1.2758441735677028e-06, + "loss": 0.3539, "step": 125035 }, { - "epoch": 4.4, - "learning_rate": 1.8783694692518506e-06, - "loss": 0.2608, + "epoch": 4.506433127905719, + "grad_norm": 0.27356868982315063, + "learning_rate": 1.2749240209329049e-06, + "loss": 0.3884, "step": 125040 }, { - "epoch": 4.4, - "learning_rate": 1.8772862647589957e-06, - "loss": 0.2547, + "epoch": 4.506613327566944, + "grad_norm": 0.22257938981056213, + "learning_rate": 1.274004191547673e-06, + "loss": 0.3783, "step": 125045 }, { - "epoch": 4.4, - "learning_rate": 1.8762033604993457e-06, - "loss": 0.2269, + "epoch": 4.506793527228169, + "grad_norm": 0.28302866220474243, + "learning_rate": 1.2730846854245416e-06, + "loss": 0.3706, "step": 125050 }, { - "epoch": 4.4, - "learning_rate": 1.875120756486956e-06, - "loss": 0.2755, + "epoch": 4.5069737268893935, + "grad_norm": 0.33327221870422363, + "learning_rate": 1.2721655025760365e-06, + "loss": 0.387, "step": 125055 }, { - "epoch": 4.4, - "learning_rate": 1.8740384527358767e-06, - "loss": 0.2498, + "epoch": 4.507153926550618, + "grad_norm": 0.24522875249385834, + "learning_rate": 1.2712466430146868e-06, + "loss": 0.3623, "step": 125060 }, { - "epoch": 4.4, - "learning_rate": 1.8729564492601742e-06, - "loss": 0.2621, + "epoch": 4.507334126211843, + "grad_norm": 0.2788931131362915, + "learning_rate": 1.270328106753005e-06, + "loss": 0.3606, "step": 125065 }, { - "epoch": 4.4, - "learning_rate": 1.87187474607389e-06, - "loss": 0.249, + "epoch": 4.507514325873068, + "grad_norm": 0.23487606644630432, + "learning_rate": 1.2694098938035082e-06, + "loss": 0.3898, "step": 125070 }, { - "epoch": 4.4, - "learning_rate": 1.8707933431910657e-06, - "loss": 0.2657, + "epoch": 4.507694525534292, + "grad_norm": 0.24076257646083832, + "learning_rate": 1.2684920041787034e-06, + "loss": 0.3831, "step": 125075 }, { - "epoch": 4.4, - "learning_rate": 1.869712240625751e-06, - "loss": 0.251, + "epoch": 4.507874725195517, + "grad_norm": 0.22647157311439514, + "learning_rate": 1.2675744378911058e-06, + "loss": 0.3621, "step": 125080 }, { - "epoch": 4.4, - "learning_rate": 1.8686314383919794e-06, - "loss": 0.2563, + "epoch": 4.508054924856741, + "grad_norm": 0.28119003772735596, + "learning_rate": 1.2666571949532109e-06, + "loss": 0.3778, "step": 125085 }, { - "epoch": 4.4, - "learning_rate": 1.867550936503784e-06, - "loss": 0.2529, + "epoch": 4.508235124517966, + "grad_norm": 0.20872727036476135, + "learning_rate": 1.2657402753775165e-06, + "loss": 0.3657, "step": 125090 }, { - "epoch": 4.4, - "learning_rate": 1.8664707349751926e-06, - "loss": 0.2425, + "epoch": 4.50841532417919, + "grad_norm": 0.2806181311607361, + "learning_rate": 1.2648236791765161e-06, + "loss": 0.3691, "step": 125095 }, { - "epoch": 4.4, - "learning_rate": 1.8653908338202325e-06, - "loss": 0.2547, + "epoch": 4.508595523840415, + "grad_norm": 0.2301296591758728, + "learning_rate": 1.2639074063626939e-06, + "loss": 0.376, "step": 125100 }, { - "epoch": 4.4, - "learning_rate": 1.8643112330529288e-06, - "loss": 0.2334, + "epoch": 4.50877572350164, + "grad_norm": 0.31790241599082947, + "learning_rate": 1.2629914569485396e-06, + "loss": 0.3857, "step": 125105 }, { - "epoch": 4.4, - "learning_rate": 1.8632319326872982e-06, - "loss": 0.2442, + "epoch": 4.508955923162865, + "grad_norm": 0.24608851969242096, + "learning_rate": 1.262075830946538e-06, + "loss": 0.3611, "step": 125110 }, { - "epoch": 4.4, - "learning_rate": 1.8621529327373488e-06, - "loss": 0.2678, + "epoch": 4.509136122824089, + "grad_norm": 0.2477652132511139, + 
"learning_rate": 1.2611605283691485e-06, + "loss": 0.3455, "step": 125115 }, { - "epoch": 4.4, - "learning_rate": 1.8610742332171e-06, - "loss": 0.2398, + "epoch": 4.509316322485314, + "grad_norm": 0.2697392702102661, + "learning_rate": 1.2602455492288556e-06, + "loss": 0.3734, "step": 125120 }, { - "epoch": 4.4, - "learning_rate": 1.8599958341405515e-06, - "loss": 0.2348, + "epoch": 4.509496522146538, + "grad_norm": 0.29126474261283875, + "learning_rate": 1.259330893538116e-06, + "loss": 0.3937, "step": 125125 }, { - "epoch": 4.4, - "learning_rate": 1.8589177355217092e-06, - "loss": 0.2419, + "epoch": 4.509676721807763, + "grad_norm": 0.281927227973938, + "learning_rate": 1.2584165613094029e-06, + "loss": 0.3374, "step": 125130 }, { - "epoch": 4.4, - "learning_rate": 1.8578399373745643e-06, - "loss": 0.2521, + "epoch": 4.509856921468987, + "grad_norm": 0.30632874369621277, + "learning_rate": 1.2575025525551648e-06, + "loss": 0.3651, "step": 125135 }, { - "epoch": 4.4, - "learning_rate": 1.8567624397131195e-06, - "loss": 0.2476, + "epoch": 4.510037121130212, + "grad_norm": 0.2316470891237259, + "learning_rate": 1.2565888672878556e-06, + "loss": 0.3384, "step": 125140 }, { - "epoch": 4.4, - "learning_rate": 1.8556852425513637e-06, - "loss": 0.2725, + "epoch": 4.510217320791437, + "grad_norm": 0.2622465193271637, + "learning_rate": 1.2556755055199292e-06, + "loss": 0.3726, "step": 125145 }, { - "epoch": 4.4, - "learning_rate": 1.8546083459032775e-06, - "loss": 0.2639, + "epoch": 4.510397520452662, + "grad_norm": 0.25914567708969116, + "learning_rate": 1.254762467263826e-06, + "loss": 0.3963, "step": 125150 }, { - "epoch": 4.4, - "learning_rate": 1.8535317497828552e-06, - "loss": 0.2557, + "epoch": 4.510577720113886, + "grad_norm": 0.26706215739250183, + "learning_rate": 1.2538497525319882e-06, + "loss": 0.3942, "step": 125155 }, { - "epoch": 4.4, - "learning_rate": 1.8524554542040634e-06, - "loss": 0.2451, + "epoch": 4.510757919775111, + "grad_norm": 0.23364363610744476, + "learning_rate": 1.2529373613368506e-06, + "loss": 0.3678, "step": 125160 }, { - "epoch": 4.4, - "learning_rate": 1.8513794591808853e-06, - "loss": 0.244, + "epoch": 4.510938119436336, + "grad_norm": 0.28146228194236755, + "learning_rate": 1.2520252936908394e-06, + "loss": 0.3441, "step": 125165 }, { - "epoch": 4.4, - "learning_rate": 1.8503037647272902e-06, - "loss": 0.2563, + "epoch": 4.511118319097561, + "grad_norm": 0.25890275835990906, + "learning_rate": 1.251113549606389e-06, + "loss": 0.3795, "step": 125170 }, { - "epoch": 4.4, - "learning_rate": 1.8492283708572422e-06, - "loss": 0.2534, + "epoch": 4.511298518758784, + "grad_norm": 0.24945032596588135, + "learning_rate": 1.2502021290959177e-06, + "loss": 0.3772, "step": 125175 }, { - "epoch": 4.4, - "learning_rate": 1.8481532775847077e-06, - "loss": 0.2703, + "epoch": 4.511478718420009, + "grad_norm": 0.2131379246711731, + "learning_rate": 1.2492910321718453e-06, + "loss": 0.3562, "step": 125180 }, { - "epoch": 4.4, - "learning_rate": 1.8470784849236479e-06, - "loss": 0.2378, + "epoch": 4.511658918081234, + "grad_norm": 0.32095032930374146, + "learning_rate": 1.2483802588465848e-06, + "loss": 0.3529, "step": 125185 }, { - "epoch": 4.4, - "learning_rate": 1.8460039928880125e-06, - "loss": 0.2739, + "epoch": 4.511839117742459, + "grad_norm": 0.2094382345676422, + "learning_rate": 1.2474698091325454e-06, + "loss": 0.3687, "step": 125190 }, { - "epoch": 4.4, - "learning_rate": 1.8449298014917572e-06, - "loss": 0.2582, + "epoch": 4.512019317403683, + "grad_norm": 
0.279956191778183, + "learning_rate": 1.2465596830421255e-06, + "loss": 0.426, "step": 125195 }, { - "epoch": 4.4, - "learning_rate": 1.843855910748829e-06, - "loss": 0.2524, + "epoch": 4.512199517064908, + "grad_norm": 0.2812948226928711, + "learning_rate": 1.2456498805877404e-06, + "loss": 0.339, "step": 125200 }, { - "epoch": 4.41, - "learning_rate": 1.8427823206731699e-06, - "loss": 0.2527, + "epoch": 4.512379716726133, + "grad_norm": 0.29968753457069397, + "learning_rate": 1.2447404017817687e-06, + "loss": 0.3693, "step": 125205 }, { - "epoch": 4.41, - "learning_rate": 1.841709031278724e-06, - "loss": 0.2494, + "epoch": 4.5125599163873575, + "grad_norm": 0.2712326943874359, + "learning_rate": 1.243831246636612e-06, + "loss": 0.3906, "step": 125210 }, { - "epoch": 4.41, - "learning_rate": 1.840636042579419e-06, - "loss": 0.2502, + "epoch": 4.512740116048581, + "grad_norm": 0.24692299962043762, + "learning_rate": 1.2429224151646574e-06, + "loss": 0.3267, "step": 125215 }, { - "epoch": 4.41, - "learning_rate": 1.8395633545891965e-06, - "loss": 0.2505, + "epoch": 4.512920315709806, + "grad_norm": 0.25159400701522827, + "learning_rate": 1.242013907378281e-06, + "loss": 0.3634, "step": 125220 }, { - "epoch": 4.41, - "learning_rate": 1.8384909673219818e-06, - "loss": 0.2543, + "epoch": 4.513100515371031, + "grad_norm": 0.2895723581314087, + "learning_rate": 1.2411057232898733e-06, + "loss": 0.3826, "step": 125225 }, { - "epoch": 4.41, - "learning_rate": 1.8374188807916942e-06, - "loss": 0.2378, + "epoch": 4.513280715032256, + "grad_norm": 0.23947159945964813, + "learning_rate": 1.2401978629117934e-06, + "loss": 0.3526, "step": 125230 }, { - "epoch": 4.41, - "learning_rate": 1.8363470950122614e-06, - "loss": 0.2551, + "epoch": 4.51346091469348, + "grad_norm": 0.2706504166126251, + "learning_rate": 1.2392903262564176e-06, + "loss": 0.3864, "step": 125235 }, { - "epoch": 4.41, - "learning_rate": 1.835275609997597e-06, - "loss": 0.2633, + "epoch": 4.513641114354705, + "grad_norm": 0.2251925766468048, + "learning_rate": 1.238383113336114e-06, + "loss": 0.3673, "step": 125240 }, { - "epoch": 4.41, - "learning_rate": 1.8342044257616097e-06, - "loss": 0.2501, + "epoch": 4.51382131401593, + "grad_norm": 0.2925741374492645, + "learning_rate": 1.2374762241632393e-06, + "loss": 0.3739, "step": 125245 }, { - "epoch": 4.41, - "learning_rate": 1.8331335423182106e-06, - "loss": 0.2508, + "epoch": 4.5140015136771545, + "grad_norm": 0.24656470119953156, + "learning_rate": 1.2365696587501502e-06, + "loss": 0.3843, "step": 125250 }, { - "epoch": 4.41, - "learning_rate": 1.8320629596813078e-06, - "loss": 0.2496, + "epoch": 4.514181713338379, + "grad_norm": 0.2367469221353531, + "learning_rate": 1.235663417109198e-06, + "loss": 0.3662, "step": 125255 }, { - "epoch": 4.41, - "learning_rate": 1.8309926778647928e-06, - "loss": 0.2322, + "epoch": 4.514361912999604, + "grad_norm": 0.23144228756427765, + "learning_rate": 1.2347574992527283e-06, + "loss": 0.3484, "step": 125260 }, { - "epoch": 4.41, - "learning_rate": 1.8299226968825745e-06, - "loss": 0.2623, + "epoch": 4.514542112660828, + "grad_norm": 0.3216107189655304, + "learning_rate": 1.2338519051930925e-06, + "loss": 0.3521, "step": 125265 }, { - "epoch": 4.41, - "learning_rate": 1.8288530167485385e-06, - "loss": 0.2539, + "epoch": 4.514722312322053, + "grad_norm": 0.3389444649219513, + "learning_rate": 1.2329466349426194e-06, + "loss": 0.4033, "step": 125270 }, { - "epoch": 4.41, - "learning_rate": 1.8277836374765767e-06, - "loss": 0.2352, + "epoch": 
4.514902511983277, + "grad_norm": 0.21677029132843018, + "learning_rate": 1.2320416885136493e-06, + "loss": 0.3743, "step": 125275 }, { - "epoch": 4.41, - "learning_rate": 1.826714559080575e-06, - "loss": 0.2707, + "epoch": 4.515082711644502, + "grad_norm": 0.2557178735733032, + "learning_rate": 1.2311370659185084e-06, + "loss": 0.3911, "step": 125280 }, { - "epoch": 4.41, - "learning_rate": 1.8256457815744117e-06, - "loss": 0.2852, + "epoch": 4.515262911305727, + "grad_norm": 0.2284223735332489, + "learning_rate": 1.2302327671695201e-06, + "loss": 0.3583, "step": 125285 }, { - "epoch": 4.41, - "learning_rate": 1.8245773049719639e-06, - "loss": 0.2284, + "epoch": 4.5154431109669515, + "grad_norm": 0.23775717616081238, + "learning_rate": 1.2293287922790108e-06, + "loss": 0.3642, "step": 125290 }, { - "epoch": 4.41, - "learning_rate": 1.823509129287107e-06, - "loss": 0.2353, + "epoch": 4.515623310628176, + "grad_norm": 0.24203971028327942, + "learning_rate": 1.2284251412593011e-06, + "loss": 0.3632, "step": 125295 }, { - "epoch": 4.41, - "learning_rate": 1.8224412545337132e-06, - "loss": 0.2682, + "epoch": 4.515803510289401, + "grad_norm": 0.30471011996269226, + "learning_rate": 1.2275218141226868e-06, + "loss": 0.3797, "step": 125300 }, { - "epoch": 4.41, - "learning_rate": 1.8213736807256382e-06, - "loss": 0.262, + "epoch": 4.515983709950625, + "grad_norm": 0.2442302256822586, + "learning_rate": 1.2266188108814886e-06, + "loss": 0.3587, "step": 125305 }, { - "epoch": 4.41, - "learning_rate": 1.8203064078767568e-06, - "loss": 0.284, + "epoch": 4.5161639096118495, + "grad_norm": 0.2527487874031067, + "learning_rate": 1.2257161315480048e-06, + "loss": 0.3862, "step": 125310 }, { - "epoch": 4.41, - "learning_rate": 1.8192394360009218e-06, - "loss": 0.239, + "epoch": 4.516344109273074, + "grad_norm": 0.2215377241373062, + "learning_rate": 1.2248137761345424e-06, + "loss": 0.3901, "step": 125315 }, { - "epoch": 4.41, - "learning_rate": 1.8181727651119807e-06, - "loss": 0.245, + "epoch": 4.516524308934299, + "grad_norm": 0.23120597004890442, + "learning_rate": 1.2239117446533833e-06, + "loss": 0.4032, "step": 125320 }, { - "epoch": 4.41, - "learning_rate": 1.8171063952237888e-06, - "loss": 0.2323, + "epoch": 4.516704508595524, + "grad_norm": 0.2772935926914215, + "learning_rate": 1.2230100371168229e-06, + "loss": 0.3507, "step": 125325 }, { - "epoch": 4.41, - "learning_rate": 1.8160403263501963e-06, - "loss": 0.2541, + "epoch": 4.5168847082567485, + "grad_norm": 0.31849685311317444, + "learning_rate": 1.222108653537149e-06, + "loss": 0.3966, "step": 125330 }, { - "epoch": 4.41, - "learning_rate": 1.8149745585050421e-06, - "loss": 0.2466, + "epoch": 4.517064907917973, + "grad_norm": 0.23283176124095917, + "learning_rate": 1.2212075939266432e-06, + "loss": 0.3609, "step": 125335 }, { - "epoch": 4.41, - "learning_rate": 1.8139090917021623e-06, - "loss": 0.2559, + "epoch": 4.517245107579198, + "grad_norm": 0.268390953540802, + "learning_rate": 1.220306858297579e-06, + "loss": 0.3574, "step": 125340 }, { - "epoch": 4.41, - "learning_rate": 1.8128439259553875e-06, - "loss": 0.2354, + "epoch": 4.517425307240423, + "grad_norm": 0.315220445394516, + "learning_rate": 1.21940644666223e-06, + "loss": 0.3818, "step": 125345 }, { - "epoch": 4.41, - "learning_rate": 1.8117790612785596e-06, - "loss": 0.2612, + "epoch": 4.517605506901647, + "grad_norm": 0.243333101272583, + "learning_rate": 1.2185063590328616e-06, + "loss": 0.381, "step": 125350 }, { - "epoch": 4.41, - "learning_rate": 1.8107144976855006e-06, - 
"loss": 0.251, + "epoch": 4.517785706562872, + "grad_norm": 0.22509504854679108, + "learning_rate": 1.2176065954217414e-06, + "loss": 0.3696, "step": 125355 }, { - "epoch": 4.41, - "learning_rate": 1.8096502351900274e-06, - "loss": 0.2372, + "epoch": 4.517965906224096, + "grad_norm": 0.27468258142471313, + "learning_rate": 1.2167071558411263e-06, + "loss": 0.3988, "step": 125360 }, { - "epoch": 4.41, - "learning_rate": 1.8085862738059622e-06, - "loss": 0.274, + "epoch": 4.518146105885321, + "grad_norm": 0.3038482367992401, + "learning_rate": 1.2158080403032734e-06, + "loss": 0.3707, "step": 125365 }, { - "epoch": 4.41, - "learning_rate": 1.8075226135471247e-06, - "loss": 0.2536, + "epoch": 4.5183263055465455, + "grad_norm": 0.23414720594882965, + "learning_rate": 1.2149092488204311e-06, + "loss": 0.339, "step": 125370 }, { - "epoch": 4.41, - "learning_rate": 1.8064592544273146e-06, - "loss": 0.2435, + "epoch": 4.51850650520777, + "grad_norm": 0.2685023546218872, + "learning_rate": 1.214010781404845e-06, + "loss": 0.3801, "step": 125375 }, { - "epoch": 4.41, - "learning_rate": 1.8053961964603517e-06, - "loss": 0.2335, + "epoch": 4.518686704868995, + "grad_norm": 0.25739866495132446, + "learning_rate": 1.2131126380687558e-06, + "loss": 0.3869, "step": 125380 }, { - "epoch": 4.41, - "learning_rate": 1.8043334396600304e-06, - "loss": 0.2588, + "epoch": 4.51886690453022, + "grad_norm": 0.2300902158021927, + "learning_rate": 1.2122148188244004e-06, + "loss": 0.3804, "step": 125385 }, { - "epoch": 4.41, - "learning_rate": 1.8032709840401563e-06, - "loss": 0.2283, + "epoch": 4.519047104191444, + "grad_norm": 0.2580128610134125, + "learning_rate": 1.2113173236840164e-06, + "loss": 0.3291, "step": 125390 }, { - "epoch": 4.41, - "learning_rate": 1.8022088296145212e-06, - "loss": 0.2606, + "epoch": 4.519227303852669, + "grad_norm": 0.27533698081970215, + "learning_rate": 1.2104201526598274e-06, + "loss": 0.3588, "step": 125395 }, { - "epoch": 4.41, - "learning_rate": 1.801146976396914e-06, - "loss": 0.2641, + "epoch": 4.519407503513893, + "grad_norm": 0.2444005161523819, + "learning_rate": 1.209523305764057e-06, + "loss": 0.364, "step": 125400 }, { - "epoch": 4.41, - "learning_rate": 1.8000854244011239e-06, - "loss": 0.2468, + "epoch": 4.519587703175118, + "grad_norm": 0.25520777702331543, + "learning_rate": 1.208626783008923e-06, + "loss": 0.3582, "step": 125405 }, { - "epoch": 4.41, - "learning_rate": 1.7990241736409369e-06, - "loss": 0.244, + "epoch": 4.519767902836342, + "grad_norm": 0.2750135064125061, + "learning_rate": 1.2077305844066494e-06, + "loss": 0.3841, "step": 125410 }, { - "epoch": 4.41, - "learning_rate": 1.7979632241301337e-06, - "loss": 0.2569, + "epoch": 4.519948102497567, + "grad_norm": 0.24095915257930756, + "learning_rate": 1.206834709969437e-06, + "loss": 0.3664, "step": 125415 }, { - "epoch": 4.41, - "learning_rate": 1.7969025758824864e-06, - "loss": 0.2388, + "epoch": 4.520128302158792, + "grad_norm": 0.2322002798318863, + "learning_rate": 1.2059391597094905e-06, + "loss": 0.3576, "step": 125420 }, { - "epoch": 4.41, - "learning_rate": 1.795842228911762e-06, - "loss": 0.2548, + "epoch": 4.520308501820017, + "grad_norm": 0.2865905463695526, + "learning_rate": 1.2050439336390217e-06, + "loss": 0.3656, "step": 125425 }, { - "epoch": 4.41, - "learning_rate": 1.7947821832317357e-06, - "loss": 0.243, + "epoch": 4.520488701481241, + "grad_norm": 0.26659440994262695, + "learning_rate": 1.2041490317702214e-06, + "loss": 0.4096, "step": 125430 }, { - "epoch": 4.41, - "learning_rate": 
1.793722438856174e-06, - "loss": 0.257, + "epoch": 4.520668901142466, + "grad_norm": 0.29634758830070496, + "learning_rate": 1.2032544541152851e-06, + "loss": 0.3838, "step": 125435 }, { - "epoch": 4.41, - "learning_rate": 1.7926629957988272e-06, - "loss": 0.244, + "epoch": 4.520849100803691, + "grad_norm": 0.2613474428653717, + "learning_rate": 1.2023602006863976e-06, + "loss": 0.3772, "step": 125440 }, { - "epoch": 4.41, - "learning_rate": 1.7916038540734648e-06, - "loss": 0.2558, + "epoch": 4.521029300464916, + "grad_norm": 0.2532717287540436, + "learning_rate": 1.2014662714957409e-06, + "loss": 0.3852, "step": 125445 }, { - "epoch": 4.41, - "learning_rate": 1.7905450136938284e-06, - "loss": 0.2988, + "epoch": 4.521209500126139, + "grad_norm": 0.22666624188423157, + "learning_rate": 1.2005726665555051e-06, + "loss": 0.3694, "step": 125450 }, { - "epoch": 4.41, - "learning_rate": 1.7894864746736684e-06, - "loss": 0.274, + "epoch": 4.521389699787364, + "grad_norm": 0.22881825268268585, + "learning_rate": 1.1996793858778554e-06, + "loss": 0.2936, "step": 125455 }, { - "epoch": 4.41, - "learning_rate": 1.7884282370267291e-06, - "loss": 0.2545, + "epoch": 4.521569899448589, + "grad_norm": 0.24387423694133759, + "learning_rate": 1.1987864294749685e-06, + "loss": 0.3588, "step": 125460 }, { - "epoch": 4.41, - "learning_rate": 1.7873703007667553e-06, - "loss": 0.2523, + "epoch": 4.521750099109814, + "grad_norm": 0.2651515305042267, + "learning_rate": 1.1978937973590092e-06, + "loss": 0.3621, "step": 125465 }, { - "epoch": 4.41, - "learning_rate": 1.7863126659074802e-06, - "loss": 0.2514, + "epoch": 4.521930298771038, + "grad_norm": 0.21903403103351593, + "learning_rate": 1.197001489542135e-06, + "loss": 0.3943, "step": 125470 }, { - "epoch": 4.41, - "learning_rate": 1.7852553324626376e-06, - "loss": 0.2401, + "epoch": 4.522110498432263, + "grad_norm": 0.2606528103351593, + "learning_rate": 1.196109506036508e-06, + "loss": 0.353, "step": 125475 }, { - "epoch": 4.41, - "learning_rate": 1.7841983004459495e-06, - "loss": 0.2538, + "epoch": 4.522290698093488, + "grad_norm": 0.22265635430812836, + "learning_rate": 1.1952178468542852e-06, + "loss": 0.3417, "step": 125480 }, { - "epoch": 4.41, - "learning_rate": 1.7831415698711467e-06, - "loss": 0.2336, + "epoch": 4.522470897754713, + "grad_norm": 0.24026912450790405, + "learning_rate": 1.1943265120076042e-06, + "loss": 0.3699, "step": 125485 }, { - "epoch": 4.42, - "learning_rate": 1.7820851407519572e-06, - "loss": 0.273, + "epoch": 4.522651097415936, + "grad_norm": 0.23182328045368195, + "learning_rate": 1.193435501508619e-06, + "loss": 0.3662, "step": 125490 }, { - "epoch": 4.42, - "learning_rate": 1.7810290131020868e-06, - "loss": 0.2575, + "epoch": 4.522831297077161, + "grad_norm": 0.26761478185653687, + "learning_rate": 1.1925448153694619e-06, + "loss": 0.3798, "step": 125495 }, { - "epoch": 4.42, - "learning_rate": 1.779973186935252e-06, - "loss": 0.2638, + "epoch": 4.523011496738386, + "grad_norm": 0.23140046000480652, + "learning_rate": 1.1916544536022783e-06, + "loss": 0.3844, "step": 125500 }, { - "epoch": 4.42, - "eval_loss": 0.24885064363479614, - "eval_runtime": 10.5403, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 4.523011496738386, + "eval_loss": 0.428926557302475, + "eval_runtime": 3.5228, + "eval_samples_per_second": 28.386, + "eval_steps_per_second": 7.097, "step": 125500 }, { - "epoch": 4.42, - "learning_rate": 1.7789176622651642e-06, - "loss": 0.2569, + "epoch": 4.523191696399611, + "grad_norm": 
0.2863670885562897, + "learning_rate": 1.1907644162191922e-06, + "loss": 0.3947, "step": 125505 }, { - "epoch": 4.42, - "learning_rate": 1.7778624391055293e-06, - "loss": 0.2524, + "epoch": 4.523371896060835, + "grad_norm": 0.1922617256641388, + "learning_rate": 1.189874703232327e-06, + "loss": 0.3523, "step": 125510 }, { - "epoch": 4.42, - "learning_rate": 1.776807517470047e-06, - "loss": 0.2501, + "epoch": 4.52355209572206, + "grad_norm": 0.26853513717651367, + "learning_rate": 1.1889853146538148e-06, + "loss": 0.3827, "step": 125515 }, { - "epoch": 4.42, - "learning_rate": 1.7757528973724096e-06, - "loss": 0.2501, + "epoch": 4.523732295383285, + "grad_norm": 0.31399276852607727, + "learning_rate": 1.1880962504957655e-06, + "loss": 0.39, "step": 125520 }, { - "epoch": 4.42, - "learning_rate": 1.7746985788263199e-06, - "loss": 0.2399, + "epoch": 4.5239124950445095, + "grad_norm": 0.2765243649482727, + "learning_rate": 1.187207510770294e-06, + "loss": 0.3471, "step": 125525 }, { - "epoch": 4.42, - "learning_rate": 1.773644561845461e-06, - "loss": 0.2688, + "epoch": 4.524092694705734, + "grad_norm": 0.2347140908241272, + "learning_rate": 1.1863190954895104e-06, + "loss": 0.3601, "step": 125530 }, { - "epoch": 4.42, - "learning_rate": 1.7725908464435198e-06, - "loss": 0.262, + "epoch": 4.524272894366959, + "grad_norm": 0.25988760590553284, + "learning_rate": 1.1854310046655158e-06, + "loss": 0.3578, "step": 125535 }, { - "epoch": 4.42, - "learning_rate": 1.7715374326341767e-06, - "loss": 0.2531, + "epoch": 4.524453094028183, + "grad_norm": 0.26233744621276855, + "learning_rate": 1.184543238310415e-06, + "loss": 0.3576, "step": 125540 }, { - "epoch": 4.42, - "learning_rate": 1.7704843204311179e-06, - "loss": 0.2601, + "epoch": 4.524633293689408, + "grad_norm": 0.24501554667949677, + "learning_rate": 1.1836557964363032e-06, + "loss": 0.3731, "step": 125545 }, { - "epoch": 4.42, - "learning_rate": 1.7694315098480102e-06, - "loss": 0.2438, + "epoch": 4.524813493350632, + "grad_norm": 0.20214052498340607, + "learning_rate": 1.1827686790552711e-06, + "loss": 0.3533, "step": 125550 }, { - "epoch": 4.42, - "learning_rate": 1.7683790008985206e-06, - "loss": 0.2567, + "epoch": 4.524993693011857, + "grad_norm": 0.27101585268974304, + "learning_rate": 1.1818818861794007e-06, + "loss": 0.3721, "step": 125555 }, { - "epoch": 4.42, - "learning_rate": 1.7675372109243276e-06, - "loss": 0.2601, + "epoch": 4.525173892673082, + "grad_norm": 0.22587630152702332, + "learning_rate": 1.1809954178207821e-06, + "loss": 0.3643, "step": 125560 }, { - "epoch": 4.42, - "learning_rate": 1.7664852449497975e-06, - "loss": 0.2437, + "epoch": 4.5253540923343065, + "grad_norm": 0.293062686920166, + "learning_rate": 1.1801092739914838e-06, + "loss": 0.3687, "step": 125565 }, { - "epoch": 4.42, - "learning_rate": 1.7654335806471439e-06, - "loss": 0.2388, + "epoch": 4.525534291995531, + "grad_norm": 0.3036471903324127, + "learning_rate": 1.1792234547035903e-06, + "loss": 0.3325, "step": 125570 }, { - "epoch": 4.42, - "learning_rate": 1.7643822180300196e-06, - "loss": 0.2325, + "epoch": 4.525714491656756, + "grad_norm": 0.2614469826221466, + "learning_rate": 1.178337959969164e-06, + "loss": 0.3833, "step": 125575 }, { - "epoch": 4.42, - "learning_rate": 1.763331157112086e-06, - "loss": 0.2332, + "epoch": 4.52589469131798, + "grad_norm": 0.26248598098754883, + "learning_rate": 1.1774527898002707e-06, + "loss": 0.3678, "step": 125580 }, { - "epoch": 4.42, - "learning_rate": 1.762280397906982e-06, - "loss": 0.2463, + "epoch": 
4.526074890979205, + "grad_norm": 0.22794273495674133, + "learning_rate": 1.1765679442089728e-06, + "loss": 0.3575, "step": 125585 }, { - "epoch": 4.42, - "learning_rate": 1.7612299404283494e-06, - "loss": 0.2564, + "epoch": 4.526255090640429, + "grad_norm": 0.2476157546043396, + "learning_rate": 1.1756834232073189e-06, + "loss": 0.3442, "step": 125590 }, { - "epoch": 4.42, - "learning_rate": 1.760179784689836e-06, - "loss": 0.2581, + "epoch": 4.526435290301654, + "grad_norm": 0.236716166138649, + "learning_rate": 1.1747992268073743e-06, + "loss": 0.3679, "step": 125595 }, { - "epoch": 4.42, - "learning_rate": 1.7591299307050723e-06, - "loss": 0.266, + "epoch": 4.526615489962879, + "grad_norm": 0.295484334230423, + "learning_rate": 1.173915355021174e-06, + "loss": 0.3566, "step": 125600 }, { - "epoch": 4.42, - "learning_rate": 1.7580803784876893e-06, - "loss": 0.2848, + "epoch": 4.5267956896241035, + "grad_norm": 0.237171933054924, + "learning_rate": 1.1730318078607584e-06, + "loss": 0.3829, "step": 125605 }, { - "epoch": 4.42, - "learning_rate": 1.757031128051312e-06, - "loss": 0.2355, + "epoch": 4.526975889285328, + "grad_norm": 0.2715757191181183, + "learning_rate": 1.1721485853381787e-06, + "loss": 0.3776, "step": 125610 }, { - "epoch": 4.42, - "learning_rate": 1.755982179409571e-06, - "loss": 0.2384, + "epoch": 4.527156088946553, + "grad_norm": 0.21538300812244415, + "learning_rate": 1.1712656874654586e-06, + "loss": 0.3446, "step": 125615 }, { - "epoch": 4.42, - "learning_rate": 1.7549335325760834e-06, - "loss": 0.2755, + "epoch": 4.527336288607778, + "grad_norm": 0.21776896715164185, + "learning_rate": 1.1703831142546306e-06, + "loss": 0.3613, "step": 125620 }, { - "epoch": 4.42, - "learning_rate": 1.7538851875644606e-06, - "loss": 0.2615, + "epoch": 4.527516488269002, + "grad_norm": 0.22091469168663025, + "learning_rate": 1.1695008657177182e-06, + "loss": 0.3677, "step": 125625 }, { - "epoch": 4.42, - "learning_rate": 1.7528371443883195e-06, - "loss": 0.2423, + "epoch": 4.527696687930227, + "grad_norm": 0.24481423199176788, + "learning_rate": 1.1686189418667393e-06, + "loss": 0.3669, "step": 125630 }, { - "epoch": 4.42, - "learning_rate": 1.751789403061274e-06, - "loss": 0.2409, + "epoch": 4.527876887591451, + "grad_norm": 0.24965307116508484, + "learning_rate": 1.1677373427137178e-06, + "loss": 0.366, "step": 125635 }, { - "epoch": 4.42, - "learning_rate": 1.750741963596919e-06, - "loss": 0.2515, + "epoch": 4.528057087252676, + "grad_norm": 0.2324383556842804, + "learning_rate": 1.1668560682706608e-06, + "loss": 0.3827, "step": 125640 }, { - "epoch": 4.42, - "learning_rate": 1.7496948260088547e-06, - "loss": 0.2494, + "epoch": 4.5282372869139005, + "grad_norm": 0.2629086971282959, + "learning_rate": 1.1659751185495754e-06, + "loss": 0.3757, "step": 125645 }, { - "epoch": 4.42, - "learning_rate": 1.748647990310684e-06, - "loss": 0.2517, + "epoch": 4.528417486575125, + "grad_norm": 0.2776727080345154, + "learning_rate": 1.1650944935624658e-06, + "loss": 0.3613, "step": 125650 }, { - "epoch": 4.42, - "learning_rate": 1.747601456515996e-06, - "loss": 0.2673, + "epoch": 4.52859768623635, + "grad_norm": 0.28128179907798767, + "learning_rate": 1.1642141933213253e-06, + "loss": 0.3698, "step": 125655 }, { - "epoch": 4.42, - "learning_rate": 1.746555224638377e-06, - "loss": 0.2398, + "epoch": 4.528777885897575, + "grad_norm": 0.2046898454427719, + "learning_rate": 1.163334217838155e-06, + "loss": 0.3678, "step": 125660 }, { - "epoch": 4.42, - "learning_rate": 1.7455092946914136e-06, - 
"loss": 0.2578, + "epoch": 4.528958085558799, + "grad_norm": 0.21193328499794006, + "learning_rate": 1.1624545671249405e-06, + "loss": 0.3739, "step": 125665 }, { - "epoch": 4.42, - "learning_rate": 1.7444636666886894e-06, - "loss": 0.2711, + "epoch": 4.529138285220024, + "grad_norm": 0.30449825525283813, + "learning_rate": 1.161575241193666e-06, + "loss": 0.3799, "step": 125670 }, { - "epoch": 4.42, - "learning_rate": 1.7434183406437793e-06, - "loss": 0.2458, + "epoch": 4.529318484881248, + "grad_norm": 0.28137627243995667, + "learning_rate": 1.1606962400563166e-06, + "loss": 0.3485, "step": 125675 }, { - "epoch": 4.42, - "learning_rate": 1.7423733165702478e-06, - "loss": 0.2469, + "epoch": 4.529498684542473, + "grad_norm": 0.26191192865371704, + "learning_rate": 1.1598175637248605e-06, + "loss": 0.3948, "step": 125680 }, { - "epoch": 4.42, - "learning_rate": 1.7413285944816754e-06, - "loss": 0.2754, + "epoch": 4.5296788842036975, + "grad_norm": 0.21634413301944733, + "learning_rate": 1.1589392122112769e-06, + "loss": 0.3728, "step": 125685 }, { - "epoch": 4.42, - "learning_rate": 1.7402841743916266e-06, - "loss": 0.2554, + "epoch": 4.529859083864922, + "grad_norm": 0.2029135525226593, + "learning_rate": 1.1580611855275341e-06, + "loss": 0.3492, "step": 125690 }, { - "epoch": 4.42, - "learning_rate": 1.7392400563136568e-06, - "loss": 0.2446, + "epoch": 4.530039283526147, + "grad_norm": 0.2269791066646576, + "learning_rate": 1.1571834836855866e-06, + "loss": 0.3489, "step": 125695 }, { - "epoch": 4.42, - "learning_rate": 1.7381962402613277e-06, - "loss": 0.2441, + "epoch": 4.530219483187372, + "grad_norm": 0.2346084862947464, + "learning_rate": 1.1563061066974024e-06, + "loss": 0.3775, "step": 125700 }, { - "epoch": 4.42, - "learning_rate": 1.737152726248184e-06, - "loss": 0.2483, + "epoch": 4.530399682848596, + "grad_norm": 0.21212127804756165, + "learning_rate": 1.1554290545749273e-06, + "loss": 0.3526, "step": 125705 }, { - "epoch": 4.42, - "learning_rate": 1.736109514287787e-06, - "loss": 0.2573, + "epoch": 4.530579882509821, + "grad_norm": 0.22559018433094025, + "learning_rate": 1.154552327330119e-06, + "loss": 0.3672, "step": 125710 }, { - "epoch": 4.42, - "learning_rate": 1.7350666043936786e-06, - "loss": 0.2383, + "epoch": 4.530760082171046, + "grad_norm": 0.24675384163856506, + "learning_rate": 1.1536759249749173e-06, + "loss": 0.3749, "step": 125715 }, { - "epoch": 4.42, - "learning_rate": 1.7340239965793954e-06, - "loss": 0.2672, + "epoch": 4.530940281832271, + "grad_norm": 0.20101101696491241, + "learning_rate": 1.1527998475212604e-06, + "loss": 0.3611, "step": 125720 }, { - "epoch": 4.42, - "learning_rate": 1.7329816908584734e-06, - "loss": 0.2704, + "epoch": 4.5311204814934944, + "grad_norm": 0.24855537712574005, + "learning_rate": 1.151924094981091e-06, + "loss": 0.4015, "step": 125725 }, { - "epoch": 4.42, - "learning_rate": 1.731939687244455e-06, - "loss": 0.2382, + "epoch": 4.531300681154719, + "grad_norm": 0.2826426029205322, + "learning_rate": 1.151048667366339e-06, + "loss": 0.3815, "step": 125730 }, { - "epoch": 4.42, - "learning_rate": 1.7308979857508623e-06, - "loss": 0.2316, + "epoch": 4.531480880815944, + "grad_norm": 0.2761218547821045, + "learning_rate": 1.1501735646889332e-06, + "loss": 0.3988, "step": 125735 }, { - "epoch": 4.42, - "learning_rate": 1.729856586391229e-06, - "loss": 0.2461, + "epoch": 4.531661080477169, + "grad_norm": 0.2532573938369751, + "learning_rate": 1.1492987869607952e-06, + "loss": 0.3876, "step": 125740 }, { - "epoch": 4.42, - 
"learning_rate": 1.728815489179067e-06, - "loss": 0.2689, + "epoch": 4.531841280138393, + "grad_norm": 0.2559954524040222, + "learning_rate": 1.1484243341938427e-06, + "loss": 0.367, "step": 125745 }, { - "epoch": 4.42, - "learning_rate": 1.7277746941279038e-06, - "loss": 0.2622, + "epoch": 4.532021479799618, + "grad_norm": 0.22612114250659943, + "learning_rate": 1.147550206399986e-06, + "loss": 0.3544, "step": 125750 }, { - "epoch": 4.42, - "learning_rate": 1.7267342012512484e-06, - "loss": 0.2526, + "epoch": 4.532201679460843, + "grad_norm": 0.24155735969543457, + "learning_rate": 1.146676403591146e-06, + "loss": 0.4076, "step": 125755 }, { - "epoch": 4.42, - "learning_rate": 1.725694010562609e-06, - "loss": 0.2388, + "epoch": 4.532381879122068, + "grad_norm": 0.3148764967918396, + "learning_rate": 1.1458029257792185e-06, + "loss": 0.3787, "step": 125760 }, { - "epoch": 4.42, - "learning_rate": 1.7246541220754975e-06, - "loss": 0.2314, + "epoch": 4.532562078783291, + "grad_norm": 0.2415306717157364, + "learning_rate": 1.144929772976111e-06, + "loss": 0.3981, "step": 125765 }, { - "epoch": 4.42, - "learning_rate": 1.723614535803414e-06, - "loss": 0.2311, + "epoch": 4.532742278444516, + "grad_norm": 0.2210559993982315, + "learning_rate": 1.1440569451937166e-06, + "loss": 0.3602, "step": 125770 }, { - "epoch": 4.43, - "learning_rate": 1.7225752517598558e-06, - "loss": 0.2388, + "epoch": 4.532922478105741, + "grad_norm": 0.22382661700248718, + "learning_rate": 1.143184442443923e-06, + "loss": 0.337, "step": 125775 }, { - "epoch": 4.43, - "learning_rate": 1.7215362699583154e-06, - "loss": 0.2332, + "epoch": 4.533102677766966, + "grad_norm": 0.24328698217868805, + "learning_rate": 1.1423122647386291e-06, + "loss": 0.3835, "step": 125780 }, { - "epoch": 4.43, - "learning_rate": 1.7204975904122872e-06, - "loss": 0.2405, + "epoch": 4.53328287742819, + "grad_norm": 0.2541979253292084, + "learning_rate": 1.1414404120897087e-06, + "loss": 0.38, "step": 125785 }, { - "epoch": 4.43, - "learning_rate": 1.719459213135252e-06, - "loss": 0.2476, + "epoch": 4.533463077089415, + "grad_norm": 0.24431033432483673, + "learning_rate": 1.1405688845090383e-06, + "loss": 0.3388, "step": 125790 }, { - "epoch": 4.43, - "learning_rate": 1.7184211381407023e-06, - "loss": 0.2752, + "epoch": 4.53364327675064, + "grad_norm": 0.24632686376571655, + "learning_rate": 1.139697682008503e-06, + "loss": 0.351, "step": 125795 }, { - "epoch": 4.43, - "learning_rate": 1.7173833654421046e-06, - "loss": 0.2647, + "epoch": 4.533823476411865, + "grad_norm": 0.24334456026554108, + "learning_rate": 1.1388268045999655e-06, + "loss": 0.342, "step": 125800 }, { - "epoch": 4.43, - "learning_rate": 1.7163458950529454e-06, - "loss": 0.2338, + "epoch": 4.534003676073089, + "grad_norm": 0.22440917789936066, + "learning_rate": 1.137956252295297e-06, + "loss": 0.3438, "step": 125805 }, { - "epoch": 4.43, - "learning_rate": 1.715308726986692e-06, - "loss": 0.228, + "epoch": 4.534183875734314, + "grad_norm": 0.25877583026885986, + "learning_rate": 1.1370860251063515e-06, + "loss": 0.3857, "step": 125810 }, { - "epoch": 4.43, - "learning_rate": 1.7142718612568053e-06, - "loss": 0.2639, + "epoch": 4.534364075395538, + "grad_norm": 0.2521132826805115, + "learning_rate": 1.1362161230449863e-06, + "loss": 0.3869, "step": 125815 }, { - "epoch": 4.43, - "learning_rate": 1.713235297876753e-06, - "loss": 0.2404, + "epoch": 4.534544275056763, + "grad_norm": 0.23100094497203827, + "learning_rate": 1.1353465461230616e-06, + "loss": 0.3642, "step": 125820 }, { 
- "epoch": 4.43, - "learning_rate": 1.7121990368599961e-06, - "loss": 0.2597, + "epoch": 4.534724474717987, + "grad_norm": 0.2277524769306183, + "learning_rate": 1.134477294352418e-06, + "loss": 0.3368, "step": 125825 }, { - "epoch": 4.43, - "learning_rate": 1.711163078219985e-06, - "loss": 0.2799, + "epoch": 4.534904674379212, + "grad_norm": 0.2838972210884094, + "learning_rate": 1.1336083677448984e-06, + "loss": 0.4055, "step": 125830 }, { - "epoch": 4.43, - "learning_rate": 1.7101274219701756e-06, - "loss": 0.2636, + "epoch": 4.535084874040437, + "grad_norm": 0.26135292649269104, + "learning_rate": 1.132739766312349e-06, + "loss": 0.3795, "step": 125835 }, { - "epoch": 4.43, - "learning_rate": 1.7090920681240102e-06, - "loss": 0.2583, + "epoch": 4.5352650737016615, + "grad_norm": 0.24618162214756012, + "learning_rate": 1.131871490066591e-06, + "loss": 0.3496, "step": 125840 }, { - "epoch": 4.43, - "learning_rate": 1.7080570166949333e-06, - "loss": 0.2649, + "epoch": 4.535445273362886, + "grad_norm": 0.28488776087760925, + "learning_rate": 1.1310035390194706e-06, + "loss": 0.3829, "step": 125845 }, { - "epoch": 4.43, - "learning_rate": 1.7070222676963897e-06, - "loss": 0.2289, + "epoch": 4.535625473024111, + "grad_norm": 0.28462615609169006, + "learning_rate": 1.1301359131828032e-06, + "loss": 0.3896, "step": 125850 }, { - "epoch": 4.43, - "learning_rate": 1.7059878211418107e-06, - "loss": 0.2291, + "epoch": 4.535805672685335, + "grad_norm": 0.295928955078125, + "learning_rate": 1.1292686125684127e-06, + "loss": 0.4013, "step": 125855 }, { - "epoch": 4.43, - "learning_rate": 1.7049536770446295e-06, - "loss": 0.2542, + "epoch": 4.53598587234656, + "grad_norm": 0.27669599652290344, + "learning_rate": 1.1284016371881173e-06, + "loss": 0.3411, "step": 125860 }, { - "epoch": 4.43, - "learning_rate": 1.7039198354182718e-06, - "loss": 0.2395, + "epoch": 4.536166072007784, + "grad_norm": 0.31112879514694214, + "learning_rate": 1.1275349870537243e-06, + "loss": 0.3581, "step": 125865 }, { - "epoch": 4.43, - "learning_rate": 1.7028862962761654e-06, - "loss": 0.2525, + "epoch": 4.536346271669009, + "grad_norm": 0.30249273777008057, + "learning_rate": 1.1266686621770466e-06, + "loss": 0.3908, "step": 125870 }, { - "epoch": 4.43, - "learning_rate": 1.7018530596317195e-06, - "loss": 0.2491, + "epoch": 4.536526471330234, + "grad_norm": 0.260932058095932, + "learning_rate": 1.1258026625698914e-06, + "loss": 0.3768, "step": 125875 }, { - "epoch": 4.43, - "learning_rate": 1.7008201254983647e-06, - "loss": 0.2712, + "epoch": 4.5367066709914585, + "grad_norm": 0.2824000418186188, + "learning_rate": 1.1249369882440464e-06, + "loss": 0.3298, "step": 125880 }, { - "epoch": 4.43, - "learning_rate": 1.6997874938895043e-06, - "loss": 0.2482, + "epoch": 4.536886870652683, + "grad_norm": 0.2413741648197174, + "learning_rate": 1.124071639211316e-06, + "loss": 0.3432, "step": 125885 }, { - "epoch": 4.43, - "learning_rate": 1.6987551648185495e-06, - "loss": 0.2483, + "epoch": 4.537067070313908, + "grad_norm": 0.2007882446050644, + "learning_rate": 1.1232066154834852e-06, + "loss": 0.3411, "step": 125890 }, { - "epoch": 4.43, - "learning_rate": 1.6977231382988956e-06, - "loss": 0.2378, + "epoch": 4.537247269975133, + "grad_norm": 0.2404080480337143, + "learning_rate": 1.122341917072342e-06, + "loss": 0.3577, "step": 125895 }, { - "epoch": 4.43, - "learning_rate": 1.6966914143439566e-06, - "loss": 0.2375, + "epoch": 4.5374274696363575, + "grad_norm": 0.2568254768848419, + "learning_rate": 1.1214775439896686e-06, + 
"loss": 0.3915, "step": 125900 }, { - "epoch": 4.43, - "learning_rate": 1.695659992967119e-06, - "loss": 0.2365, + "epoch": 4.537607669297582, + "grad_norm": 0.26964160799980164, + "learning_rate": 1.1206134962472387e-06, + "loss": 0.3875, "step": 125905 }, { - "epoch": 4.43, - "learning_rate": 1.6946288741817806e-06, - "loss": 0.2516, + "epoch": 4.537787868958806, + "grad_norm": 0.26435330510139465, + "learning_rate": 1.1197497738568264e-06, + "loss": 0.3631, "step": 125910 }, { - "epoch": 4.43, - "learning_rate": 1.6935980580013221e-06, - "loss": 0.2581, + "epoch": 4.537968068620031, + "grad_norm": 0.27454784512519836, + "learning_rate": 1.1188863768302027e-06, + "loss": 0.3484, "step": 125915 }, { - "epoch": 4.43, - "learning_rate": 1.6925675444391386e-06, - "loss": 0.262, + "epoch": 4.5381482682812555, + "grad_norm": 0.2890207767486572, + "learning_rate": 1.1180233051791279e-06, + "loss": 0.3799, "step": 125920 }, { - "epoch": 4.43, - "learning_rate": 1.6915373335086054e-06, - "loss": 0.2466, + "epoch": 4.53832846794248, + "grad_norm": 0.27648934721946716, + "learning_rate": 1.1171605589153616e-06, + "loss": 0.3902, "step": 125925 }, { - "epoch": 4.43, - "learning_rate": 1.6905074252231007e-06, - "loss": 0.2644, + "epoch": 4.538508667603705, + "grad_norm": 0.2612933814525604, + "learning_rate": 1.1162981380506587e-06, + "loss": 0.3866, "step": 125930 }, { - "epoch": 4.43, - "learning_rate": 1.6894778195959887e-06, - "loss": 0.2811, + "epoch": 4.53868886726493, + "grad_norm": 0.23902583122253418, + "learning_rate": 1.1154360425967652e-06, + "loss": 0.3764, "step": 125935 }, { - "epoch": 4.43, - "learning_rate": 1.6884485166406504e-06, - "loss": 0.2388, + "epoch": 4.538869066926154, + "grad_norm": 0.2665193974971771, + "learning_rate": 1.1145742725654357e-06, + "loss": 0.3214, "step": 125940 }, { - "epoch": 4.43, - "learning_rate": 1.6874195163704475e-06, - "loss": 0.2606, + "epoch": 4.539049266587379, + "grad_norm": 0.3075253665447235, + "learning_rate": 1.1137128279684078e-06, + "loss": 0.3819, "step": 125945 }, { - "epoch": 4.43, - "learning_rate": 1.6863908187987305e-06, - "loss": 0.2765, + "epoch": 4.539229466248603, + "grad_norm": 0.25425055623054504, + "learning_rate": 1.1128517088174195e-06, + "loss": 0.3686, "step": 125950 }, { - "epoch": 4.43, - "learning_rate": 1.6853624239388716e-06, - "loss": 0.2499, + "epoch": 4.539409665909828, + "grad_norm": 0.2914336025714874, + "learning_rate": 1.1119909151242003e-06, + "loss": 0.4104, "step": 125955 }, { - "epoch": 4.43, - "learning_rate": 1.6843343318042104e-06, - "loss": 0.2536, + "epoch": 4.5395898655710525, + "grad_norm": 0.23320336639881134, + "learning_rate": 1.1111304469004769e-06, + "loss": 0.3615, "step": 125960 }, { - "epoch": 4.43, - "learning_rate": 1.6833065424081086e-06, - "loss": 0.2379, + "epoch": 4.539770065232277, + "grad_norm": 0.2035553902387619, + "learning_rate": 1.1102703041579787e-06, + "loss": 0.3544, "step": 125965 }, { - "epoch": 4.43, - "learning_rate": 1.6822790557638995e-06, - "loss": 0.2696, + "epoch": 4.539950264893502, + "grad_norm": 0.27693793177604675, + "learning_rate": 1.1094104869084242e-06, + "loss": 0.3813, "step": 125970 }, { - "epoch": 4.43, - "learning_rate": 1.681251871884934e-06, - "loss": 0.2442, + "epoch": 4.540130464554727, + "grad_norm": 0.2863492965698242, + "learning_rate": 1.1085509951635236e-06, + "loss": 0.3601, "step": 125975 }, { - "epoch": 4.43, - "learning_rate": 1.6802249907845457e-06, - "loss": 0.2633, + "epoch": 4.540310664215951, + "grad_norm": 0.22666563093662262, + 
"learning_rate": 1.1076918289349924e-06, + "loss": 0.3503, "step": 125980 }, { - "epoch": 4.43, - "learning_rate": 1.6791984124760657e-06, - "loss": 0.2424, + "epoch": 4.540490863877176, + "grad_norm": 0.25621458888053894, + "learning_rate": 1.1068329882345296e-06, + "loss": 0.3974, "step": 125985 }, { - "epoch": 4.43, - "learning_rate": 1.6781721369728248e-06, - "loss": 0.2456, + "epoch": 4.540671063538401, + "grad_norm": 0.23761415481567383, + "learning_rate": 1.105974473073848e-06, + "loss": 0.3689, "step": 125990 }, { - "epoch": 4.43, - "learning_rate": 1.6771461642881515e-06, - "loss": 0.2539, + "epoch": 4.540851263199626, + "grad_norm": 0.27705660462379456, + "learning_rate": 1.1051162834646356e-06, + "loss": 0.4032, "step": 125995 }, { - "epoch": 4.43, - "learning_rate": 1.6761204944353626e-06, - "loss": 0.2602, + "epoch": 4.5410314628608495, + "grad_norm": 0.25998204946517944, + "learning_rate": 1.1042584194185857e-06, + "loss": 0.3646, "step": 126000 }, { - "epoch": 4.43, - "eval_loss": 0.24884916841983795, - "eval_runtime": 10.5407, - "eval_samples_per_second": 9.487, - "eval_steps_per_second": 9.487, + "epoch": 4.5410314628608495, + "eval_loss": 0.42872315645217896, + "eval_runtime": 3.5299, + "eval_samples_per_second": 28.33, + "eval_steps_per_second": 7.082, "step": 126000 }, { - "epoch": 4.43, - "learning_rate": 1.675095127427781e-06, - "loss": 0.236, + "epoch": 4.541211662522074, + "grad_norm": 0.2178216576576233, + "learning_rate": 1.1034008809473916e-06, + "loss": 0.365, "step": 126005 }, { - "epoch": 4.43, - "learning_rate": 1.6740700632787127e-06, - "loss": 0.2544, + "epoch": 4.541391862183299, + "grad_norm": 0.2195737063884735, + "learning_rate": 1.1025436680627332e-06, + "loss": 0.3845, "step": 126010 }, { - "epoch": 4.43, - "learning_rate": 1.6730453020014718e-06, - "loss": 0.2377, + "epoch": 4.541572061844524, + "grad_norm": 0.24314382672309875, + "learning_rate": 1.10168678077629e-06, + "loss": 0.3472, "step": 126015 }, { - "epoch": 4.43, - "learning_rate": 1.6720208436093705e-06, - "loss": 0.2608, + "epoch": 4.541752261505748, + "grad_norm": 0.2001175880432129, + "learning_rate": 1.1008302190997383e-06, + "loss": 0.3606, "step": 126020 }, { - "epoch": 4.43, - "learning_rate": 1.670996688115703e-06, - "loss": 0.2792, + "epoch": 4.541932461166973, + "grad_norm": 0.24921679496765137, + "learning_rate": 1.0999739830447441e-06, + "loss": 0.3683, "step": 126025 }, { - "epoch": 4.43, - "learning_rate": 1.6699728355337645e-06, - "loss": 0.2465, + "epoch": 4.542112660828198, + "grad_norm": 0.2515079379081726, + "learning_rate": 1.0991180726229789e-06, + "loss": 0.3579, "step": 126030 }, { - "epoch": 4.43, - "learning_rate": 1.6689492858768584e-06, - "loss": 0.2511, + "epoch": 4.542292860489423, + "grad_norm": 0.2457331418991089, + "learning_rate": 1.0982624878461051e-06, + "loss": 0.3869, "step": 126035 }, { - "epoch": 4.43, - "learning_rate": 1.6679260391582713e-06, - "loss": 0.2508, + "epoch": 4.5424730601506464, + "grad_norm": 0.22210632264614105, + "learning_rate": 1.0974072287257775e-06, + "loss": 0.3974, "step": 126040 }, { - "epoch": 4.43, - "learning_rate": 1.6669030953912869e-06, - "loss": 0.2346, + "epoch": 4.542653259811871, + "grad_norm": 0.256584107875824, + "learning_rate": 1.0965522952736478e-06, + "loss": 0.3527, "step": 126045 }, { - "epoch": 4.43, - "learning_rate": 1.665880454589186e-06, - "loss": 0.2555, + "epoch": 4.542833459473096, + "grad_norm": 0.25087007880210876, + "learning_rate": 1.0956976875013598e-06, + "loss": 0.3566, "step": 126050 }, { - 
"epoch": 4.43, - "learning_rate": 1.6648581167652498e-06, - "loss": 0.2365, + "epoch": 4.543013659134321, + "grad_norm": 0.2532568871974945, + "learning_rate": 1.0948434054205704e-06, + "loss": 0.3707, "step": 126055 }, { - "epoch": 4.44, - "learning_rate": 1.6638360819327569e-06, - "loss": 0.2425, + "epoch": 4.543193858795545, + "grad_norm": 0.22819988429546356, + "learning_rate": 1.093989449042912e-06, + "loss": 0.3395, "step": 126060 }, { - "epoch": 4.44, - "learning_rate": 1.6628143501049687e-06, - "loss": 0.2536, + "epoch": 4.54337405845677, + "grad_norm": 0.26745355129241943, + "learning_rate": 1.0931358183800117e-06, + "loss": 0.3549, "step": 126065 }, { - "epoch": 4.44, - "learning_rate": 1.661792921295155e-06, - "loss": 0.2817, + "epoch": 4.543554258117995, + "grad_norm": 0.23124882578849792, + "learning_rate": 1.0922825134435127e-06, + "loss": 0.3598, "step": 126070 }, { - "epoch": 4.44, - "learning_rate": 1.660771795516583e-06, - "loss": 0.2533, + "epoch": 4.54373445777922, + "grad_norm": 0.32220420241355896, + "learning_rate": 1.0914295342450337e-06, + "loss": 0.361, "step": 126075 }, { - "epoch": 4.44, - "learning_rate": 1.659750972782509e-06, - "loss": 0.2689, + "epoch": 4.543914657440444, + "grad_norm": 0.24584202468395233, + "learning_rate": 1.0905768807961987e-06, + "loss": 0.3809, "step": 126080 }, { - "epoch": 4.44, - "learning_rate": 1.6587304531061831e-06, - "loss": 0.2694, + "epoch": 4.544094857101669, + "grad_norm": 0.24640391767024994, + "learning_rate": 1.0897245531086287e-06, + "loss": 0.369, "step": 126085 }, { - "epoch": 4.44, - "learning_rate": 1.6577102365008644e-06, - "loss": 0.2329, + "epoch": 4.544275056762894, + "grad_norm": 0.2793191373348236, + "learning_rate": 1.0888725511939258e-06, + "loss": 0.3747, "step": 126090 }, { - "epoch": 4.44, - "learning_rate": 1.6566903229797954e-06, - "loss": 0.2602, + "epoch": 4.544455256424118, + "grad_norm": 0.2635978162288666, + "learning_rate": 1.0880208750637083e-06, + "loss": 0.3755, "step": 126095 }, { - "epoch": 4.44, - "learning_rate": 1.6556707125562177e-06, - "loss": 0.2298, + "epoch": 4.544635456085342, + "grad_norm": 0.2227722406387329, + "learning_rate": 1.0871695247295782e-06, + "loss": 0.3653, "step": 126100 }, { - "epoch": 4.44, - "learning_rate": 1.654651405243368e-06, - "loss": 0.2428, + "epoch": 4.544815655746567, + "grad_norm": 0.23662735521793365, + "learning_rate": 1.0863185002031317e-06, + "loss": 0.3871, "step": 126105 }, { - "epoch": 4.44, - "learning_rate": 1.653632401054489e-06, - "loss": 0.2488, + "epoch": 4.544995855407792, + "grad_norm": 0.25815528631210327, + "learning_rate": 1.0854678014959679e-06, + "loss": 0.3697, "step": 126110 }, { - "epoch": 4.44, - "learning_rate": 1.6526137000028058e-06, - "loss": 0.2422, + "epoch": 4.545176055069017, + "grad_norm": 0.2185879647731781, + "learning_rate": 1.084617428619672e-06, + "loss": 0.3678, "step": 126115 }, { - "epoch": 4.44, - "learning_rate": 1.6515953021015441e-06, - "loss": 0.2439, + "epoch": 4.545356254730241, + "grad_norm": 0.2667752504348755, + "learning_rate": 1.0837673815858345e-06, + "loss": 0.3512, "step": 126120 }, { - "epoch": 4.44, - "learning_rate": 1.6505772073639291e-06, - "loss": 0.2258, + "epoch": 4.545536454391466, + "grad_norm": 0.20545603334903717, + "learning_rate": 1.0829176604060353e-06, + "loss": 0.373, "step": 126125 }, { - "epoch": 4.44, - "learning_rate": 1.6495594158031868e-06, - "loss": 0.2332, + "epoch": 4.545716654052691, + "grad_norm": 0.2735787332057953, + "learning_rate": 1.0820682650918569e-06, + "loss": 
0.4048, "step": 126130 }, { - "epoch": 4.44, - "learning_rate": 1.6485419274325259e-06, - "loss": 0.2456, + "epoch": 4.545896853713915, + "grad_norm": 0.27663978934288025, + "learning_rate": 1.0812191956548644e-06, + "loss": 0.4058, "step": 126135 }, { - "epoch": 4.44, - "learning_rate": 1.6475247422651608e-06, - "loss": 0.2648, + "epoch": 4.546077053375139, + "grad_norm": 0.33949437737464905, + "learning_rate": 1.0803704521066298e-06, + "loss": 0.367, "step": 126140 }, { - "epoch": 4.44, - "learning_rate": 1.646507860314292e-06, - "loss": 0.2597, + "epoch": 4.546257253036364, + "grad_norm": 0.28756633400917053, + "learning_rate": 1.0795220344587131e-06, + "loss": 0.3924, "step": 126145 }, { - "epoch": 4.44, - "learning_rate": 1.6454912815931312e-06, - "loss": 0.2382, + "epoch": 4.546437452697589, + "grad_norm": 0.2777945101261139, + "learning_rate": 1.0786739427226827e-06, + "loss": 0.3827, "step": 126150 }, { - "epoch": 4.44, - "learning_rate": 1.6444750061148762e-06, - "loss": 0.277, + "epoch": 4.5466176523588135, + "grad_norm": 0.2093537449836731, + "learning_rate": 1.0778261769100905e-06, + "loss": 0.3825, "step": 126155 }, { - "epoch": 4.44, - "learning_rate": 1.6434590338927191e-06, - "loss": 0.2687, + "epoch": 4.546797852020038, + "grad_norm": 0.2346353679895401, + "learning_rate": 1.0769787370324802e-06, + "loss": 0.3685, "step": 126160 }, { - "epoch": 4.44, - "learning_rate": 1.6424433649398524e-06, - "loss": 0.2389, + "epoch": 4.546978051681263, + "grad_norm": 0.23039868474006653, + "learning_rate": 1.0761316231014061e-06, + "loss": 0.3617, "step": 126165 }, { - "epoch": 4.44, - "learning_rate": 1.6414279992694682e-06, - "loss": 0.2455, + "epoch": 4.547158251342488, + "grad_norm": 0.23052442073822021, + "learning_rate": 1.0752848351284011e-06, + "loss": 0.3888, "step": 126170 }, { - "epoch": 4.44, - "learning_rate": 1.6404129368947445e-06, - "loss": 0.2309, + "epoch": 4.5473384510037125, + "grad_norm": 0.26234328746795654, + "learning_rate": 1.0744383731250168e-06, + "loss": 0.3682, "step": 126175 }, { - "epoch": 4.44, - "learning_rate": 1.6393981778288658e-06, - "loss": 0.2248, + "epoch": 4.547518650664937, + "grad_norm": 0.25200456380844116, + "learning_rate": 1.0735922371027746e-06, + "loss": 0.3748, "step": 126180 }, { - "epoch": 4.44, - "learning_rate": 1.6383837220850045e-06, - "loss": 0.2573, + "epoch": 4.547698850326161, + "grad_norm": 0.31448420882225037, + "learning_rate": 1.0727464270732013e-06, + "loss": 0.3791, "step": 126185 }, { - "epoch": 4.44, - "learning_rate": 1.6373695696763391e-06, - "loss": 0.2841, + "epoch": 4.547879049987386, + "grad_norm": 0.22912736237049103, + "learning_rate": 1.0719009430478294e-06, + "loss": 0.3655, "step": 126190 }, { - "epoch": 4.44, - "learning_rate": 1.636355720616034e-06, - "loss": 0.2451, + "epoch": 4.5480592496486105, + "grad_norm": 0.23380719125270844, + "learning_rate": 1.0710557850381747e-06, + "loss": 0.3199, "step": 126195 }, { - "epoch": 4.44, - "learning_rate": 1.6353421749172453e-06, - "loss": 0.2418, + "epoch": 4.548239449309835, + "grad_norm": 0.23022237420082092, + "learning_rate": 1.0702109530557502e-06, + "loss": 0.3819, "step": 126200 }, { - "epoch": 4.44, - "learning_rate": 1.6343289325931489e-06, - "loss": 0.2396, + "epoch": 4.54841964897106, + "grad_norm": 0.2493658810853958, + "learning_rate": 1.069366447112069e-06, + "loss": 0.4007, "step": 126205 }, { - "epoch": 4.44, - "learning_rate": 1.6333159936568893e-06, - "loss": 0.2439, + "epoch": 4.548599848632285, + "grad_norm": 0.21301326155662537, + 
"learning_rate": 1.0685222672186357e-06, + "loss": 0.3805, "step": 126210 }, { - "epoch": 4.44, - "learning_rate": 1.6323033581216228e-06, - "loss": 0.2512, + "epoch": 4.5487800482935095, + "grad_norm": 0.299532949924469, + "learning_rate": 1.067678413386955e-06, + "loss": 0.3399, "step": 126215 }, { - "epoch": 4.44, - "learning_rate": 1.6312910260004949e-06, - "loss": 0.2396, + "epoch": 4.548960247954734, + "grad_norm": 0.21883751451969147, + "learning_rate": 1.0668348856285231e-06, + "loss": 0.3422, "step": 126220 }, { - "epoch": 4.44, - "learning_rate": 1.6302789973066556e-06, - "loss": 0.2277, + "epoch": 4.549140447615958, + "grad_norm": 0.2648196518421173, + "learning_rate": 1.0659916839548313e-06, + "loss": 0.3592, "step": 126225 }, { - "epoch": 4.44, - "learning_rate": 1.6292672720532365e-06, - "loss": 0.2511, + "epoch": 4.549320647277183, + "grad_norm": 0.20908887684345245, + "learning_rate": 1.0651488083773697e-06, + "loss": 0.3723, "step": 126230 }, { - "epoch": 4.44, - "learning_rate": 1.6282558502533879e-06, - "loss": 0.2438, + "epoch": 4.5495008469384075, + "grad_norm": 0.25661706924438477, + "learning_rate": 1.0643062589076186e-06, + "loss": 0.3885, "step": 126235 }, { - "epoch": 4.44, - "learning_rate": 1.6272447319202272e-06, - "loss": 0.248, + "epoch": 4.549681046599632, + "grad_norm": 0.2967999279499054, + "learning_rate": 1.0634640355570658e-06, + "loss": 0.3833, "step": 126240 }, { - "epoch": 4.44, - "learning_rate": 1.6262339170668966e-06, - "loss": 0.245, + "epoch": 4.549861246260857, + "grad_norm": 0.3139897286891937, + "learning_rate": 1.062622138337177e-06, + "loss": 0.3911, "step": 126245 }, { - "epoch": 4.44, - "learning_rate": 1.6252234057065136e-06, - "loss": 0.2582, + "epoch": 4.550041445922082, + "grad_norm": 0.2221304029226303, + "learning_rate": 1.0617805672594295e-06, + "loss": 0.369, "step": 126250 }, { - "epoch": 4.44, - "learning_rate": 1.6242131978522007e-06, - "loss": 0.2563, + "epoch": 4.550221645583306, + "grad_norm": 0.2807883322238922, + "learning_rate": 1.0609393223352887e-06, + "loss": 0.372, "step": 126255 }, { - "epoch": 4.44, - "learning_rate": 1.62320329351707e-06, - "loss": 0.2532, + "epoch": 4.550401845244531, + "grad_norm": 0.2690640389919281, + "learning_rate": 1.0600984035762124e-06, + "loss": 0.3985, "step": 126260 }, { - "epoch": 4.44, - "learning_rate": 1.6221936927142412e-06, - "loss": 0.2486, + "epoch": 4.550582044905756, + "grad_norm": 0.2200150489807129, + "learning_rate": 1.059257810993658e-06, + "loss": 0.3896, "step": 126265 }, { - "epoch": 4.44, - "learning_rate": 1.6211843954568206e-06, - "loss": 0.258, + "epoch": 4.550762244566981, + "grad_norm": 0.21496446430683136, + "learning_rate": 1.0584175445990884e-06, + "loss": 0.3742, "step": 126270 }, { - "epoch": 4.44, - "learning_rate": 1.6201754017579147e-06, - "loss": 0.2594, + "epoch": 4.5509424442282045, + "grad_norm": 0.27852797508239746, + "learning_rate": 1.0575776044039366e-06, + "loss": 0.3906, "step": 126275 }, { - "epoch": 4.44, - "learning_rate": 1.6191667116306208e-06, - "loss": 0.2616, + "epoch": 4.551122643889429, + "grad_norm": 0.27898815274238586, + "learning_rate": 1.0567379904196567e-06, + "loss": 0.3217, "step": 126280 }, { - "epoch": 4.44, - "learning_rate": 1.6181583250880372e-06, - "loss": 0.2547, + "epoch": 4.551302843550654, + "grad_norm": 0.25052666664123535, + "learning_rate": 1.0558987026576873e-06, + "loss": 0.3821, "step": 126285 }, { - "epoch": 4.44, - "learning_rate": 1.6171502421432615e-06, - "loss": 0.2448, + "epoch": 4.551483043211879, + 
"grad_norm": 0.306427538394928, + "learning_rate": 1.0550597411294633e-06, + "loss": 0.3712, "step": 126290 }, { - "epoch": 4.44, - "learning_rate": 1.6161424628093808e-06, - "loss": 0.2552, + "epoch": 4.551663242873103, + "grad_norm": 0.30519047379493713, + "learning_rate": 1.0542211058464146e-06, + "loss": 0.416, "step": 126295 }, { - "epoch": 4.44, - "learning_rate": 1.615134987099476e-06, - "loss": 0.2536, + "epoch": 4.551843442534328, + "grad_norm": 0.2536962032318115, + "learning_rate": 1.0533827968199656e-06, + "loss": 0.3969, "step": 126300 }, { - "epoch": 4.44, - "learning_rate": 1.6141278150266364e-06, - "loss": 0.2477, + "epoch": 4.552023642195553, + "grad_norm": 0.27985987067222595, + "learning_rate": 1.0525448140615374e-06, + "loss": 0.3781, "step": 126305 }, { - "epoch": 4.44, - "learning_rate": 1.6131209466039354e-06, - "loss": 0.2782, + "epoch": 4.552203841856778, + "grad_norm": 0.2177259922027588, + "learning_rate": 1.0517071575825544e-06, + "loss": 0.3797, "step": 126310 }, { - "epoch": 4.44, - "learning_rate": 1.6121143818444429e-06, - "loss": 0.2656, + "epoch": 4.5523840415180015, + "grad_norm": 0.25428178906440735, + "learning_rate": 1.0508698273944212e-06, + "loss": 0.3856, "step": 126315 }, { - "epoch": 4.44, - "learning_rate": 1.6111081207612344e-06, - "loss": 0.2611, + "epoch": 4.552564241179226, + "grad_norm": 0.235152468085289, + "learning_rate": 1.0500328235085538e-06, + "loss": 0.36, "step": 126320 }, { - "epoch": 4.44, - "learning_rate": 1.6101021633673747e-06, - "loss": 0.2602, + "epoch": 4.552744440840451, + "grad_norm": 0.22013433277606964, + "learning_rate": 1.0491961459363515e-06, + "loss": 0.3802, "step": 126325 }, { - "epoch": 4.44, - "learning_rate": 1.6090965096759225e-06, - "loss": 0.2545, + "epoch": 4.552924640501676, + "grad_norm": 0.277230829000473, + "learning_rate": 1.0483597946892104e-06, + "loss": 0.3904, "step": 126330 }, { - "epoch": 4.44, - "learning_rate": 1.6080911596999316e-06, - "loss": 0.2484, + "epoch": 4.5531048401629, + "grad_norm": 0.25709518790245056, + "learning_rate": 1.0475237697785328e-06, + "loss": 0.3708, "step": 126335 }, { - "epoch": 4.44, - "learning_rate": 1.6070861134524661e-06, - "loss": 0.2543, + "epoch": 4.553285039824125, + "grad_norm": 0.2280193567276001, + "learning_rate": 1.0466880712157095e-06, + "loss": 0.3783, "step": 126340 }, { - "epoch": 4.45, - "learning_rate": 1.6060813709465688e-06, - "loss": 0.2743, + "epoch": 4.55346523948535, + "grad_norm": 0.2616412937641144, + "learning_rate": 1.0458526990121175e-06, + "loss": 0.4052, "step": 126345 }, { - "epoch": 4.45, - "learning_rate": 1.6050769321952903e-06, - "loss": 0.2608, + "epoch": 4.553645439146575, + "grad_norm": 0.33539026975631714, + "learning_rate": 1.0450176531791478e-06, + "loss": 0.3937, "step": 126350 }, { - "epoch": 4.45, - "learning_rate": 1.604072797211667e-06, - "loss": 0.2474, + "epoch": 4.553825638807799, + "grad_norm": 0.21199902892112732, + "learning_rate": 1.0441829337281744e-06, + "loss": 0.3597, "step": 126355 }, { - "epoch": 4.45, - "learning_rate": 1.6030689660087416e-06, - "loss": 0.2777, + "epoch": 4.554005838469024, + "grad_norm": 0.24767298996448517, + "learning_rate": 1.0433485406705718e-06, + "loss": 0.3845, "step": 126360 }, { - "epoch": 4.45, - "learning_rate": 1.6020654385995482e-06, - "loss": 0.2635, + "epoch": 4.554186038130249, + "grad_norm": 0.2962040305137634, + "learning_rate": 1.0425144740177085e-06, + "loss": 0.3648, "step": 126365 }, { - "epoch": 4.45, - "learning_rate": 1.6010622149971178e-06, - "loss": 0.2895, + 
"epoch": 4.554366237791473, + "grad_norm": 0.2016834169626236, + "learning_rate": 1.041680733780942e-06, + "loss": 0.3578, "step": 126370 }, { - "epoch": 4.45, - "learning_rate": 1.600059295214465e-06, - "loss": 0.2526, + "epoch": 4.554546437452697, + "grad_norm": 0.21861585974693298, + "learning_rate": 1.040847319971641e-06, + "loss": 0.3705, "step": 126375 }, { - "epoch": 4.45, - "learning_rate": 1.5990566792646293e-06, - "loss": 0.2474, + "epoch": 4.554726637113922, + "grad_norm": 0.3241893947124481, + "learning_rate": 1.040014232601158e-06, + "loss": 0.3971, "step": 126380 }, { - "epoch": 4.45, - "learning_rate": 1.5980543671606173e-06, - "loss": 0.2581, + "epoch": 4.554906836775147, + "grad_norm": 0.24871119856834412, + "learning_rate": 1.0391814716808391e-06, + "loss": 0.3911, "step": 126385 }, { - "epoch": 4.45, - "learning_rate": 1.5970523589154462e-06, - "loss": 0.2419, + "epoch": 4.555087036436372, + "grad_norm": 0.20087426900863647, + "learning_rate": 1.0383490372220361e-06, + "loss": 0.3938, "step": 126390 }, { - "epoch": 4.45, - "learning_rate": 1.5960506545421305e-06, - "loss": 0.2387, + "epoch": 4.555267236097596, + "grad_norm": 0.2455914169549942, + "learning_rate": 1.0375169292360847e-06, + "loss": 0.3256, "step": 126395 }, { - "epoch": 4.45, - "learning_rate": 1.5950492540536681e-06, - "loss": 0.2547, + "epoch": 4.555447435758821, + "grad_norm": 0.2423105537891388, + "learning_rate": 1.0366851477343286e-06, + "loss": 0.3616, "step": 126400 }, { - "epoch": 4.45, - "learning_rate": 1.5940481574630712e-06, - "loss": 0.2519, + "epoch": 4.555627635420046, + "grad_norm": 0.2773745059967041, + "learning_rate": 1.0358536927280977e-06, + "loss": 0.3839, "step": 126405 }, { - "epoch": 4.45, - "learning_rate": 1.5930473647833315e-06, - "loss": 0.2622, + "epoch": 4.55580783508127, + "grad_norm": 0.2075750231742859, + "learning_rate": 1.0350225642287215e-06, + "loss": 0.3333, "step": 126410 }, { - "epoch": 4.45, - "learning_rate": 1.5920468760274449e-06, - "loss": 0.2612, + "epoch": 4.555988034742494, + "grad_norm": 0.28897416591644287, + "learning_rate": 1.0341917622475216e-06, + "loss": 0.3451, "step": 126415 }, { - "epoch": 4.45, - "learning_rate": 1.591046691208406e-06, - "loss": 0.2612, + "epoch": 4.556168234403719, + "grad_norm": 0.2481168508529663, + "learning_rate": 1.0333612867958197e-06, + "loss": 0.3703, "step": 126420 }, { - "epoch": 4.45, - "learning_rate": 1.5900468103391992e-06, - "loss": 0.2397, + "epoch": 4.556348434064944, + "grad_norm": 0.23715002834796906, + "learning_rate": 1.032531137884926e-06, + "loss": 0.3816, "step": 126425 }, { - "epoch": 4.45, - "learning_rate": 1.5890472334328028e-06, - "loss": 0.2415, + "epoch": 4.556528633726169, + "grad_norm": 0.24655555188655853, + "learning_rate": 1.0317013155261595e-06, + "loss": 0.3749, "step": 126430 }, { - "epoch": 4.45, - "learning_rate": 1.5880479605022037e-06, - "loss": 0.2416, + "epoch": 4.556708833387393, + "grad_norm": 0.20687216520309448, + "learning_rate": 1.030871819730822e-06, + "loss": 0.3603, "step": 126435 }, { - "epoch": 4.45, - "learning_rate": 1.587048991560372e-06, - "loss": 0.2499, + "epoch": 4.556889033048618, + "grad_norm": 0.2607882022857666, + "learning_rate": 1.0300426505102156e-06, + "loss": 0.3703, "step": 126440 }, { - "epoch": 4.45, - "learning_rate": 1.586050326620281e-06, - "loss": 0.2477, + "epoch": 4.557069232709843, + "grad_norm": 0.2784349322319031, + "learning_rate": 1.0292138078756396e-06, + "loss": 0.3719, "step": 126445 }, { - "epoch": 4.45, - "learning_rate": 
1.585051965694892e-06, - "loss": 0.2689, + "epoch": 4.5572494323710675, + "grad_norm": 0.261934757232666, + "learning_rate": 1.0283852918383768e-06, + "loss": 0.3879, "step": 126450 }, { - "epoch": 4.45, - "learning_rate": 1.5840539087971701e-06, - "loss": 0.2361, + "epoch": 4.557429632032292, + "grad_norm": 0.27113890647888184, + "learning_rate": 1.0275571024097348e-06, + "loss": 0.3513, "step": 126455 }, { - "epoch": 4.45, - "learning_rate": 1.5830561559400825e-06, - "loss": 0.2429, + "epoch": 4.557609831693516, + "grad_norm": 0.2984657287597656, + "learning_rate": 1.0267292396009764e-06, + "loss": 0.3412, "step": 126460 }, { - "epoch": 4.45, - "learning_rate": 1.5820587071365772e-06, - "loss": 0.2768, + "epoch": 4.557790031354741, + "grad_norm": 0.2706381380558014, + "learning_rate": 1.0259017034233932e-06, + "loss": 0.3714, "step": 126465 }, { - "epoch": 4.45, - "learning_rate": 1.581061562399605e-06, - "loss": 0.234, + "epoch": 4.5579702310159655, + "grad_norm": 0.27217623591423035, + "learning_rate": 1.025074493888259e-06, + "loss": 0.3975, "step": 126470 }, { - "epoch": 4.45, - "learning_rate": 1.5800647217421166e-06, - "loss": 0.275, + "epoch": 4.55815043067719, + "grad_norm": 0.22170761227607727, + "learning_rate": 1.0242476110068426e-06, + "loss": 0.384, "step": 126475 }, { - "epoch": 4.45, - "learning_rate": 1.5790681851770544e-06, - "loss": 0.2451, + "epoch": 4.558330630338415, + "grad_norm": 0.2368282973766327, + "learning_rate": 1.0234210547904132e-06, + "loss": 0.3971, "step": 126480 }, { - "epoch": 4.45, - "learning_rate": 1.5780719527173555e-06, - "loss": 0.2236, + "epoch": 4.55851082999964, + "grad_norm": 0.22124750912189484, + "learning_rate": 1.0225948252502283e-06, + "loss": 0.3928, "step": 126485 }, { - "epoch": 4.45, - "learning_rate": 1.5770760243759536e-06, - "loss": 0.2471, + "epoch": 4.5586910296608645, + "grad_norm": 0.27596476674079895, + "learning_rate": 1.0217689223975425e-06, + "loss": 0.3498, "step": 126490 }, { - "epoch": 4.45, - "learning_rate": 1.5760804001657858e-06, - "loss": 0.2583, + "epoch": 4.558871229322089, + "grad_norm": 0.2869780659675598, + "learning_rate": 1.0209433462436164e-06, + "loss": 0.3905, "step": 126495 }, { - "epoch": 4.45, - "learning_rate": 1.575085080099778e-06, - "loss": 0.2628, + "epoch": 4.559051428983313, + "grad_norm": 0.2928743362426758, + "learning_rate": 1.0201180967996937e-06, + "loss": 0.3722, "step": 126500 }, { - "epoch": 4.45, - "eval_loss": 0.2488119751214981, - "eval_runtime": 10.5384, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 4.559051428983313, + "eval_loss": 0.42879581451416016, + "eval_runtime": 3.5284, + "eval_samples_per_second": 28.342, + "eval_steps_per_second": 7.085, "step": 126500 }, { - "epoch": 4.45, - "learning_rate": 1.5740900641908501e-06, - "loss": 0.2759, + "epoch": 4.559231628644538, + "grad_norm": 0.2935590147972107, + "learning_rate": 1.0192931740770213e-06, + "loss": 0.388, "step": 126505 }, { - "epoch": 4.45, - "learning_rate": 1.5730953524519227e-06, - "loss": 0.2509, + "epoch": 4.5594118283057625, + "grad_norm": 0.22944243252277374, + "learning_rate": 1.018468578086837e-06, + "loss": 0.3727, "step": 126510 }, { - "epoch": 4.45, - "learning_rate": 1.5721009448959184e-06, - "loss": 0.242, + "epoch": 4.559592027966987, + "grad_norm": 0.25734102725982666, + "learning_rate": 1.0176443088403708e-06, + "loss": 0.3495, "step": 126515 }, { - "epoch": 4.45, - "learning_rate": 1.571106841535744e-06, - "loss": 0.2429, + "epoch": 4.559772227628212, + "grad_norm": 
0.2919704020023346, + "learning_rate": 1.0168203663488612e-06, + "loss": 0.3703, "step": 126520 }, { - "epoch": 4.45, - "learning_rate": 1.5701130423843053e-06, - "loss": 0.2498, + "epoch": 4.559952427289437, + "grad_norm": 0.29848772287368774, + "learning_rate": 1.0159967506235297e-06, + "loss": 0.3914, "step": 126525 }, { - "epoch": 4.45, - "learning_rate": 1.5691195474545062e-06, - "loss": 0.2651, + "epoch": 4.5601326269506615, + "grad_norm": 0.28151485323905945, + "learning_rate": 1.0151734616756003e-06, + "loss": 0.356, "step": 126530 }, { - "epoch": 4.45, - "learning_rate": 1.5681263567592502e-06, - "loss": 0.2381, + "epoch": 4.560312826611886, + "grad_norm": 0.387460857629776, + "learning_rate": 1.0143504995162895e-06, + "loss": 0.3724, "step": 126535 }, { - "epoch": 4.45, - "learning_rate": 1.5671334703114327e-06, - "loss": 0.2313, + "epoch": 4.560493026273111, + "grad_norm": 0.221677765250206, + "learning_rate": 1.0135278641568046e-06, + "loss": 0.3535, "step": 126540 }, { - "epoch": 4.45, - "learning_rate": 1.5661408881239404e-06, - "loss": 0.2446, + "epoch": 4.560673225934336, + "grad_norm": 0.3218268156051636, + "learning_rate": 1.012705555608362e-06, + "loss": 0.3796, "step": 126545 }, { - "epoch": 4.45, - "learning_rate": 1.5651486102096686e-06, - "loss": 0.2733, + "epoch": 4.5608534255955595, + "grad_norm": 0.2881567180156708, + "learning_rate": 1.0118835738821664e-06, + "loss": 0.3839, "step": 126550 }, { - "epoch": 4.45, - "learning_rate": 1.5641566365814958e-06, - "loss": 0.262, + "epoch": 4.561033625256784, + "grad_norm": 0.25800442695617676, + "learning_rate": 1.0110619189894061e-06, + "loss": 0.3872, "step": 126555 }, { - "epoch": 4.45, - "learning_rate": 1.5631649672523036e-06, - "loss": 0.2356, + "epoch": 4.561213824918009, + "grad_norm": 0.2594134509563446, + "learning_rate": 1.010240590941286e-06, + "loss": 0.3691, "step": 126560 }, { - "epoch": 4.45, - "learning_rate": 1.5621736022349676e-06, - "loss": 0.2432, + "epoch": 4.561394024579234, + "grad_norm": 0.22755463421344757, + "learning_rate": 1.0094195897489944e-06, + "loss": 0.3766, "step": 126565 }, { - "epoch": 4.45, - "learning_rate": 1.5611825415423637e-06, - "loss": 0.269, + "epoch": 4.561574224240458, + "grad_norm": 0.21717192232608795, + "learning_rate": 1.008598915423714e-06, + "loss": 0.3823, "step": 126570 }, { - "epoch": 4.45, - "learning_rate": 1.5601917851873593e-06, - "loss": 0.2697, + "epoch": 4.561754423901683, + "grad_norm": 0.21237117052078247, + "learning_rate": 1.0077785679766304e-06, + "loss": 0.3484, "step": 126575 }, { - "epoch": 4.45, - "learning_rate": 1.5592013331828193e-06, - "loss": 0.2502, + "epoch": 4.561934623562908, + "grad_norm": 0.24171051383018494, + "learning_rate": 1.0069585474189126e-06, + "loss": 0.4059, "step": 126580 }, { - "epoch": 4.45, - "learning_rate": 1.5582111855415943e-06, - "loss": 0.23, + "epoch": 4.562114823224133, + "grad_norm": 0.24933667480945587, + "learning_rate": 1.0061388537617456e-06, + "loss": 0.3449, "step": 126585 }, { - "epoch": 4.45, - "learning_rate": 1.5572213422765548e-06, - "loss": 0.2646, + "epoch": 4.5622950228853565, + "grad_norm": 0.27278992533683777, + "learning_rate": 1.0053194870162901e-06, + "loss": 0.3666, "step": 126590 }, { - "epoch": 4.45, - "learning_rate": 1.5562318034005457e-06, - "loss": 0.2634, + "epoch": 4.562475222546581, + "grad_norm": 0.21958884596824646, + "learning_rate": 1.0045004471937125e-06, + "loss": 0.3687, "step": 126595 }, { - "epoch": 4.45, - "learning_rate": 1.5552425689264156e-06, - "loss": 0.2501, + "epoch": 
4.562655422207806, + "grad_norm": 0.2822725474834442, + "learning_rate": 1.0036817343051674e-06, + "loss": 0.3441, "step": 126600 }, { - "epoch": 4.45, - "learning_rate": 1.5542536388670065e-06, - "loss": 0.2284, + "epoch": 4.562835621869031, + "grad_norm": 0.2642861604690552, + "learning_rate": 1.0028633483618154e-06, + "loss": 0.3919, "step": 126605 }, { - "epoch": 4.45, - "learning_rate": 1.5532650132351667e-06, - "loss": 0.2591, + "epoch": 4.563015821530255, + "grad_norm": 0.2586185336112976, + "learning_rate": 1.0020452893748006e-06, + "loss": 0.3887, "step": 126610 }, { - "epoch": 4.45, - "learning_rate": 1.5522766920437275e-06, - "loss": 0.2355, + "epoch": 4.56319602119148, + "grad_norm": 0.2781605124473572, + "learning_rate": 1.0012275573552748e-06, + "loss": 0.3715, "step": 126615 }, { - "epoch": 4.45, - "learning_rate": 1.5512886753055233e-06, - "loss": 0.2518, + "epoch": 4.563376220852705, + "grad_norm": 0.27998635172843933, + "learning_rate": 1.0004101523143794e-06, + "loss": 0.383, "step": 126620 }, { - "epoch": 4.46, - "learning_rate": 1.5503009630333799e-06, - "loss": 0.2802, + "epoch": 4.56355642051393, + "grad_norm": 0.29203617572784424, + "learning_rate": 9.99593074263247e-07, + "loss": 0.3644, "step": 126625 }, { - "epoch": 4.46, - "learning_rate": 1.5493135552401284e-06, - "loss": 0.2514, + "epoch": 4.563736620175154, + "grad_norm": 0.2884252965450287, + "learning_rate": 9.987763232130132e-07, + "loss": 0.4022, "step": 126630 }, { - "epoch": 4.46, - "learning_rate": 1.5483264519385864e-06, - "loss": 0.2342, + "epoch": 4.563916819836379, + "grad_norm": 0.26518526673316956, + "learning_rate": 9.979598991748023e-07, + "loss": 0.3545, "step": 126635 }, { - "epoch": 4.46, - "learning_rate": 1.547339653141569e-06, - "loss": 0.2499, + "epoch": 4.564097019497604, + "grad_norm": 0.28502172231674194, + "learning_rate": 9.971438021597473e-07, + "loss": 0.3894, "step": 126640 }, { - "epoch": 4.46, - "learning_rate": 1.5463531588618878e-06, - "loss": 0.2665, + "epoch": 4.564277219158828, + "grad_norm": 0.2587156593799591, + "learning_rate": 9.96328032178953e-07, + "loss": 0.3615, "step": 126645 }, { - "epoch": 4.46, - "learning_rate": 1.545366969112355e-06, - "loss": 0.2529, + "epoch": 4.564457418820052, + "grad_norm": 0.2818557024002075, + "learning_rate": 9.955125892435468e-07, + "loss": 0.3558, "step": 126650 }, { - "epoch": 4.46, - "learning_rate": 1.544381083905777e-06, - "loss": 0.2425, + "epoch": 4.564637618481277, + "grad_norm": 0.2464769035577774, + "learning_rate": 9.946974733646335e-07, + "loss": 0.3825, "step": 126655 }, { - "epoch": 4.46, - "learning_rate": 1.5433955032549546e-06, - "loss": 0.248, + "epoch": 4.564817818142502, + "grad_norm": 0.21170103549957275, + "learning_rate": 9.938826845533183e-07, + "loss": 0.3545, "step": 126660 }, { - "epoch": 4.46, - "learning_rate": 1.5424102271726776e-06, - "loss": 0.2423, + "epoch": 4.564998017803727, + "grad_norm": 0.22021377086639404, + "learning_rate": 9.930682228207089e-07, + "loss": 0.3835, "step": 126665 }, { - "epoch": 4.46, - "learning_rate": 1.541425255671744e-06, - "loss": 0.2585, + "epoch": 4.565178217464951, + "grad_norm": 0.24116267263889313, + "learning_rate": 9.922540881778936e-07, + "loss": 0.3767, "step": 126670 }, { - "epoch": 4.46, - "learning_rate": 1.5404405887649493e-06, - "loss": 0.2577, + "epoch": 4.565358417126176, + "grad_norm": 0.20193111896514893, + "learning_rate": 9.914402806359635e-07, + "loss": 0.3498, "step": 126675 }, { - "epoch": 4.46, - "learning_rate": 1.5394562264650665e-06, - "loss": 
0.2412, + "epoch": 4.565538616787401, + "grad_norm": 0.3153676986694336, + "learning_rate": 9.906268002060155e-07, + "loss": 0.3707, "step": 126680 }, { - "epoch": 4.46, - "learning_rate": 1.5384721687848908e-06, - "loss": 0.2255, + "epoch": 4.565718816448625, + "grad_norm": 0.21089020371437073, + "learning_rate": 9.898136468991293e-07, + "loss": 0.362, "step": 126685 }, { - "epoch": 4.46, - "learning_rate": 1.53748841573719e-06, - "loss": 0.2443, + "epoch": 4.565899016109849, + "grad_norm": 0.22067415714263916, + "learning_rate": 9.890008207263852e-07, + "loss": 0.3148, "step": 126690 }, { - "epoch": 4.46, - "learning_rate": 1.5365049673347397e-06, - "loss": 0.2799, + "epoch": 4.566079215771074, + "grad_norm": 0.27838194370269775, + "learning_rate": 9.881883216988547e-07, + "loss": 0.3914, "step": 126695 }, { - "epoch": 4.46, - "learning_rate": 1.5355218235903051e-06, - "loss": 0.2415, + "epoch": 4.566259415432299, + "grad_norm": 0.2392108291387558, + "learning_rate": 9.87376149827607e-07, + "loss": 0.3841, "step": 126700 }, { - "epoch": 4.46, - "learning_rate": 1.534538984516662e-06, - "loss": 0.2554, + "epoch": 4.566439615093524, + "grad_norm": 0.2691670358181, + "learning_rate": 9.865643051237134e-07, + "loss": 0.3909, "step": 126705 }, { - "epoch": 4.46, - "learning_rate": 1.5335564501265637e-06, - "loss": 0.2415, + "epoch": 4.566619814754748, + "grad_norm": 0.29195481538772583, + "learning_rate": 9.857527875982292e-07, + "loss": 0.4007, "step": 126710 }, { - "epoch": 4.46, - "learning_rate": 1.5325742204327725e-06, - "loss": 0.2521, + "epoch": 4.566800014415973, + "grad_norm": 0.2075537145137787, + "learning_rate": 9.849415972622178e-07, + "loss": 0.3993, "step": 126715 }, { - "epoch": 4.46, - "learning_rate": 1.5315922954480338e-06, - "loss": 0.2632, + "epoch": 4.566980214077198, + "grad_norm": 0.24173662066459656, + "learning_rate": 9.841307341267258e-07, + "loss": 0.3896, "step": 126720 }, { - "epoch": 4.46, - "learning_rate": 1.5306106751851013e-06, - "loss": 0.2467, + "epoch": 4.5671604137384225, + "grad_norm": 0.21936601400375366, + "learning_rate": 9.833201982028e-07, + "loss": 0.369, "step": 126725 }, { - "epoch": 4.46, - "learning_rate": 1.5296293596567284e-06, - "loss": 0.2427, + "epoch": 4.567340613399647, + "grad_norm": 0.2615318298339844, + "learning_rate": 9.8250998950149e-07, + "loss": 0.3511, "step": 126730 }, { - "epoch": 4.46, - "learning_rate": 1.5286483488756499e-06, - "loss": 0.2531, + "epoch": 4.567520813060871, + "grad_norm": 0.23326557874679565, + "learning_rate": 9.81700108033834e-07, + "loss": 0.347, "step": 126735 }, { - "epoch": 4.46, - "learning_rate": 1.5276676428546022e-06, - "loss": 0.2361, + "epoch": 4.567701012722096, + "grad_norm": 0.26399147510528564, + "learning_rate": 9.808905538108593e-07, + "loss": 0.3568, "step": 126740 }, { - "epoch": 4.46, - "learning_rate": 1.5266872416063226e-06, - "loss": 0.2521, + "epoch": 4.567881212383321, + "grad_norm": 0.23725526034832, + "learning_rate": 9.800813268436016e-07, + "loss": 0.4013, "step": 126745 }, { - "epoch": 4.46, - "learning_rate": 1.525707145143543e-06, - "loss": 0.2528, + "epoch": 4.568061412044545, + "grad_norm": 0.23820684850215912, + "learning_rate": 9.792724271430881e-07, + "loss": 0.3677, "step": 126750 }, { - "epoch": 4.46, - "learning_rate": 1.524727353478983e-06, - "loss": 0.2141, + "epoch": 4.56824161170577, + "grad_norm": 0.23823130130767822, + "learning_rate": 9.78463854720335e-07, + "loss": 0.3627, "step": 126755 }, { - "epoch": 4.46, - "learning_rate": 1.5237478666253635e-06, - 
"loss": 0.2405, + "epoch": 4.568421811366995, + "grad_norm": 0.2619285583496094, + "learning_rate": 9.776556095863615e-07, + "loss": 0.3665, "step": 126760 }, { - "epoch": 4.46, - "learning_rate": 1.522768684595413e-06, - "loss": 0.2489, + "epoch": 4.5686020110282195, + "grad_norm": 0.2296546995639801, + "learning_rate": 9.768476917521752e-07, + "loss": 0.3725, "step": 126765 }, { - "epoch": 4.46, - "learning_rate": 1.5217898074018356e-06, - "loss": 0.2644, + "epoch": 4.568782210689444, + "grad_norm": 0.24734491109848022, + "learning_rate": 9.76040101228795e-07, + "loss": 0.3575, "step": 126770 }, { - "epoch": 4.46, - "learning_rate": 1.5208112350573428e-06, - "loss": 0.2701, + "epoch": 4.568962410350668, + "grad_norm": 0.24005775153636932, + "learning_rate": 9.752328380272125e-07, + "loss": 0.3752, "step": 126775 }, { - "epoch": 4.46, - "learning_rate": 1.519832967574647e-06, - "loss": 0.2505, + "epoch": 4.569142610011893, + "grad_norm": 0.27801743149757385, + "learning_rate": 9.74425902158435e-07, + "loss": 0.4004, "step": 126780 }, { - "epoch": 4.46, - "learning_rate": 1.5188550049664434e-06, - "loss": 0.2438, + "epoch": 4.5693228096731175, + "grad_norm": 0.23164783418178558, + "learning_rate": 9.736192936334516e-07, + "loss": 0.3666, "step": 126785 }, { - "epoch": 4.46, - "learning_rate": 1.5178773472454333e-06, - "loss": 0.2178, + "epoch": 4.569503009334342, + "grad_norm": 0.2679736912250519, + "learning_rate": 9.72813012463253e-07, + "loss": 0.4026, "step": 126790 }, { - "epoch": 4.46, - "learning_rate": 1.5168999944243089e-06, - "loss": 0.2501, + "epoch": 4.569683208995567, + "grad_norm": 0.21138164401054382, + "learning_rate": 9.72007058658822e-07, + "loss": 0.4035, "step": 126795 }, { - "epoch": 4.46, - "learning_rate": 1.515922946515766e-06, - "loss": 0.2513, + "epoch": 4.569863408656792, + "grad_norm": 0.27566009759902954, + "learning_rate": 9.712014322311475e-07, + "loss": 0.3793, "step": 126800 }, { - "epoch": 4.46, - "learning_rate": 1.5149462035324857e-06, - "loss": 0.2651, + "epoch": 4.5700436083180165, + "grad_norm": 0.20621134340763092, + "learning_rate": 9.703961331912009e-07, + "loss": 0.367, "step": 126805 }, { - "epoch": 4.46, - "learning_rate": 1.5139697654871527e-06, - "loss": 0.2726, + "epoch": 4.570223807979241, + "grad_norm": 0.2810385227203369, + "learning_rate": 9.695911615499542e-07, + "loss": 0.3406, "step": 126810 }, { - "epoch": 4.46, - "learning_rate": 1.5129936323924426e-06, - "loss": 0.2419, + "epoch": 4.570404007640466, + "grad_norm": 0.24304364621639252, + "learning_rate": 9.687865173183762e-07, + "loss": 0.3867, "step": 126815 }, { - "epoch": 4.46, - "learning_rate": 1.5120178042610317e-06, - "loss": 0.2407, + "epoch": 4.570584207301691, + "grad_norm": 0.2781292200088501, + "learning_rate": 9.679822005074251e-07, + "loss": 0.4125, "step": 126820 }, { - "epoch": 4.46, - "learning_rate": 1.511042281105593e-06, - "loss": 0.2472, + "epoch": 4.5707644069629145, + "grad_norm": 0.2525552809238434, + "learning_rate": 9.671782111280698e-07, + "loss": 0.3521, "step": 126825 }, { - "epoch": 4.46, - "learning_rate": 1.5100670629387858e-06, - "loss": 0.2563, + "epoch": 4.570944606624139, + "grad_norm": 0.19801495969295502, + "learning_rate": 9.66374549191254e-07, + "loss": 0.3962, "step": 126830 }, { - "epoch": 4.46, - "learning_rate": 1.5090921497732808e-06, - "loss": 0.2501, + "epoch": 4.571124806285364, + "grad_norm": 0.26284393668174744, + "learning_rate": 9.65571214707936e-07, + "loss": 0.3921, "step": 126835 }, { - "epoch": 4.46, - "learning_rate": 
1.5081175416217313e-06, - "loss": 0.2744, + "epoch": 4.571305005946589, + "grad_norm": 0.24263347685337067, + "learning_rate": 9.647682076890541e-07, + "loss": 0.3704, "step": 126840 }, { - "epoch": 4.46, - "learning_rate": 1.507143238496797e-06, - "loss": 0.2663, + "epoch": 4.5714852056078135, + "grad_norm": 0.2582213580608368, + "learning_rate": 9.639655281455496e-07, + "loss": 0.3755, "step": 126845 }, { - "epoch": 4.46, - "learning_rate": 1.5061692404111232e-06, - "loss": 0.2494, + "epoch": 4.571665405269038, + "grad_norm": 0.2347324639558792, + "learning_rate": 9.631631760883692e-07, + "loss": 0.3776, "step": 126850 }, { - "epoch": 4.46, - "learning_rate": 1.5051955473773556e-06, - "loss": 0.2423, + "epoch": 4.571845604930263, + "grad_norm": 0.26129037141799927, + "learning_rate": 9.62361151528432e-07, + "loss": 0.386, "step": 126855 }, { - "epoch": 4.46, - "learning_rate": 1.5042221594081447e-06, - "loss": 0.2425, + "epoch": 4.572025804591488, + "grad_norm": 0.3019455373287201, + "learning_rate": 9.615594544766681e-07, + "loss": 0.3658, "step": 126860 }, { - "epoch": 4.46, - "learning_rate": 1.503249076516125e-06, - "loss": 0.2373, + "epoch": 4.5722060042527115, + "grad_norm": 0.24731415510177612, + "learning_rate": 9.60758084944005e-07, + "loss": 0.3629, "step": 126865 }, { - "epoch": 4.46, - "learning_rate": 1.502276298713931e-06, - "loss": 0.2575, + "epoch": 4.572386203913936, + "grad_norm": 0.2329123318195343, + "learning_rate": 9.599570429413591e-07, + "loss": 0.3584, "step": 126870 }, { - "epoch": 4.46, - "learning_rate": 1.5013038260141887e-06, - "loss": 0.239, + "epoch": 4.572566403575161, + "grad_norm": 0.270871102809906, + "learning_rate": 9.591563284796435e-07, + "loss": 0.3416, "step": 126875 }, { - "epoch": 4.46, - "learning_rate": 1.5003316584295352e-06, - "loss": 0.2341, + "epoch": 4.572746603236386, + "grad_norm": 0.2525206208229065, + "learning_rate": 9.58355941569769e-07, + "loss": 0.3663, "step": 126880 }, { - "epoch": 4.46, - "learning_rate": 1.499359795972588e-06, - "loss": 0.2364, + "epoch": 4.57292680289761, + "grad_norm": 0.25082695484161377, + "learning_rate": 9.575558822226354e-07, + "loss": 0.3574, "step": 126885 }, { - "epoch": 4.46, - "learning_rate": 1.4983882386559622e-06, - "loss": 0.2315, + "epoch": 4.573107002558835, + "grad_norm": 0.25247716903686523, + "learning_rate": 9.567561504491535e-07, + "loss": 0.3674, "step": 126890 }, { - "epoch": 4.46, - "learning_rate": 1.4974169864922782e-06, - "loss": 0.2696, + "epoch": 4.57328720222006, + "grad_norm": 0.3109354078769684, + "learning_rate": 9.559567462602143e-07, + "loss": 0.3811, "step": 126895 }, { - "epoch": 4.46, - "learning_rate": 1.4964460394941482e-06, - "loss": 0.2497, + "epoch": 4.573467401881285, + "grad_norm": 0.26343855261802673, + "learning_rate": 9.551576696667092e-07, + "loss": 0.3876, "step": 126900 }, { - "epoch": 4.46, - "learning_rate": 1.4954753976741786e-06, - "loss": 0.2626, + "epoch": 4.573647601542509, + "grad_norm": 0.28081122040748596, + "learning_rate": 9.545186442679322e-07, + "loss": 0.3957, "step": 126905 }, { - "epoch": 4.47, - "learning_rate": 1.494505061044965e-06, - "loss": 0.2441, + "epoch": 4.573827801203734, + "grad_norm": 0.2132662832736969, + "learning_rate": 9.537201573736415e-07, + "loss": 0.373, "step": 126910 }, { - "epoch": 4.47, - "learning_rate": 1.493535029619117e-06, - "loss": 0.2493, + "epoch": 4.574008000864959, + "grad_norm": 0.3238392770290375, + "learning_rate": 9.52921998105255e-07, + "loss": 0.3838, "step": 126915 }, { - "epoch": 4.47, - 
"learning_rate": 1.4925653034092267e-06, - "loss": 0.2521, + "epoch": 4.574188200526183, + "grad_norm": 0.30945807695388794, + "learning_rate": 9.521241664736558e-07, + "loss": 0.3599, "step": 126920 }, { - "epoch": 4.47, - "learning_rate": 1.491595882427882e-06, - "loss": 0.2596, + "epoch": 4.574368400187407, + "grad_norm": 0.28242048621177673, + "learning_rate": 9.513266624897072e-07, + "loss": 0.3748, "step": 126925 }, { - "epoch": 4.47, - "learning_rate": 1.4906267666876695e-06, - "loss": 0.2547, + "epoch": 4.574548599848632, + "grad_norm": 0.24541223049163818, + "learning_rate": 9.505294861642727e-07, + "loss": 0.386, "step": 126930 }, { - "epoch": 4.47, - "learning_rate": 1.4896579562011793e-06, - "loss": 0.256, + "epoch": 4.574728799509857, + "grad_norm": 0.23810042440891266, + "learning_rate": 9.497326375082216e-07, + "loss": 0.3933, "step": 126935 }, { - "epoch": 4.47, - "learning_rate": 1.4886894509809845e-06, - "loss": 0.2824, + "epoch": 4.574908999171082, + "grad_norm": 0.2471175640821457, + "learning_rate": 9.489361165324062e-07, + "loss": 0.3648, "step": 126940 }, { - "epoch": 4.47, - "learning_rate": 1.4877212510396615e-06, - "loss": 0.2579, + "epoch": 4.575089198832306, + "grad_norm": 0.2977747321128845, + "learning_rate": 9.481399232476817e-07, + "loss": 0.4237, "step": 126945 }, { - "epoch": 4.47, - "learning_rate": 1.4867533563897807e-06, - "loss": 0.259, + "epoch": 4.575269398493531, + "grad_norm": 0.2620290517807007, + "learning_rate": 9.473440576648923e-07, + "loss": 0.3927, "step": 126950 }, { - "epoch": 4.47, - "learning_rate": 1.4857857670439152e-06, - "loss": 0.2363, + "epoch": 4.575449598154756, + "grad_norm": 0.2539403736591339, + "learning_rate": 9.46548519794882e-07, + "loss": 0.3549, "step": 126955 }, { - "epoch": 4.47, - "learning_rate": 1.4848184830146272e-06, - "loss": 0.2565, + "epoch": 4.57562979781598, + "grad_norm": 0.2643360197544098, + "learning_rate": 9.457533096484922e-07, + "loss": 0.3877, "step": 126960 }, { - "epoch": 4.47, - "learning_rate": 1.4838515043144734e-06, - "loss": 0.2329, + "epoch": 4.575809997477204, + "grad_norm": 0.22208766639232635, + "learning_rate": 9.449584272365585e-07, + "loss": 0.3733, "step": 126965 }, { - "epoch": 4.47, - "learning_rate": 1.482884830956005e-06, - "loss": 0.254, + "epoch": 4.575990197138429, + "grad_norm": 0.22070464491844177, + "learning_rate": 9.441638725699059e-07, + "loss": 0.3158, "step": 126970 }, { - "epoch": 4.47, - "learning_rate": 1.481918462951784e-06, - "loss": 0.2786, + "epoch": 4.576170396799654, + "grad_norm": 0.2639147937297821, + "learning_rate": 9.433696456593671e-07, + "loss": 0.3675, "step": 126975 }, { - "epoch": 4.47, - "learning_rate": 1.4809524003143532e-06, - "loss": 0.2525, + "epoch": 4.576350596460879, + "grad_norm": 0.2987234592437744, + "learning_rate": 9.425757465157531e-07, + "loss": 0.4199, "step": 126980 }, { - "epoch": 4.47, - "learning_rate": 1.4799866430562554e-06, - "loss": 0.2582, + "epoch": 4.576530796122103, + "grad_norm": 0.24704287946224213, + "learning_rate": 9.417821751498912e-07, + "loss": 0.3624, "step": 126985 }, { - "epoch": 4.47, - "learning_rate": 1.4790211911900277e-06, - "loss": 0.2658, + "epoch": 4.576710995783328, + "grad_norm": 0.23351232707500458, + "learning_rate": 9.409889315725895e-07, + "loss": 0.3265, "step": 126990 }, { - "epoch": 4.47, - "learning_rate": 1.47805604472821e-06, - "loss": 0.2376, + "epoch": 4.576891195444553, + "grad_norm": 0.23723867535591125, + "learning_rate": 9.40196015794656e-07, + "loss": 0.4, "step": 126995 }, { - "epoch": 
4.47, - "learning_rate": 1.4770912036833313e-06, - "loss": 0.2564, + "epoch": 4.5770713951057775, + "grad_norm": 0.24983817338943481, + "learning_rate": 9.394034278268931e-07, + "loss": 0.3816, "step": 127000 }, { - "epoch": 4.47, - "eval_loss": 0.2487480491399765, - "eval_runtime": 10.5483, - "eval_samples_per_second": 9.48, - "eval_steps_per_second": 9.48, + "epoch": 4.5770713951057775, + "eval_loss": 0.42867863178253174, + "eval_runtime": 3.5177, + "eval_samples_per_second": 28.428, + "eval_steps_per_second": 7.107, "step": 127000 }, { - "epoch": 4.47, - "learning_rate": 1.476126668067923e-06, - "loss": 0.2605, + "epoch": 4.577251594767002, + "grad_norm": 0.2775065302848816, + "learning_rate": 9.386111676801007e-07, + "loss": 0.414, "step": 127005 }, { - "epoch": 4.47, - "learning_rate": 1.475162437894506e-06, - "loss": 0.2506, + "epoch": 4.577431794428226, + "grad_norm": 0.31633272767066956, + "learning_rate": 9.3781923536507e-07, + "loss": 0.3846, "step": 127010 }, { - "epoch": 4.47, - "learning_rate": 1.4741985131756037e-06, - "loss": 0.2621, + "epoch": 4.577611994089451, + "grad_norm": 0.2062579095363617, + "learning_rate": 9.37027630892598e-07, + "loss": 0.3812, "step": 127015 }, { - "epoch": 4.47, - "learning_rate": 1.4732348939237307e-06, - "loss": 0.2506, + "epoch": 4.577792193750676, + "grad_norm": 0.3232049345970154, + "learning_rate": 9.36236354273462e-07, + "loss": 0.3796, "step": 127020 }, { - "epoch": 4.47, - "learning_rate": 1.4722715801513937e-06, - "loss": 0.2394, + "epoch": 4.5779723934119, + "grad_norm": 0.24797019362449646, + "learning_rate": 9.35445405518448e-07, + "loss": 0.384, "step": 127025 }, { - "epoch": 4.47, - "learning_rate": 1.4713085718711078e-06, - "loss": 0.2506, + "epoch": 4.578152593073125, + "grad_norm": 0.23074999451637268, + "learning_rate": 9.346547846383308e-07, + "loss": 0.3794, "step": 127030 }, { - "epoch": 4.47, - "learning_rate": 1.4703458690953742e-06, - "loss": 0.2659, + "epoch": 4.57833279273435, + "grad_norm": 0.2560942769050598, + "learning_rate": 9.338644916438849e-07, + "loss": 0.3497, "step": 127035 }, { - "epoch": 4.47, - "learning_rate": 1.469383471836694e-06, - "loss": 0.2494, + "epoch": 4.5785129923955745, + "grad_norm": 0.20177873969078064, + "learning_rate": 9.33074526545874e-07, + "loss": 0.3644, "step": 127040 }, { - "epoch": 4.47, - "learning_rate": 1.468421380107557e-06, - "loss": 0.238, + "epoch": 4.578693192056799, + "grad_norm": 0.2853398025035858, + "learning_rate": 9.322848893550645e-07, + "loss": 0.3395, "step": 127045 }, { - "epoch": 4.47, - "learning_rate": 1.4674595939204643e-06, - "loss": 0.2531, + "epoch": 4.578873391718023, + "grad_norm": 0.26799461245536804, + "learning_rate": 9.314955800822117e-07, + "loss": 0.3263, "step": 127050 }, { - "epoch": 4.47, - "learning_rate": 1.466498113287898e-06, - "loss": 0.2549, + "epoch": 4.579053591379248, + "grad_norm": 0.2538415491580963, + "learning_rate": 9.307065987380736e-07, + "loss": 0.356, "step": 127055 }, { - "epoch": 4.47, - "learning_rate": 1.4655369382223478e-06, - "loss": 0.2312, + "epoch": 4.579233791040473, + "grad_norm": 0.2968897819519043, + "learning_rate": 9.299179453333973e-07, + "loss": 0.3384, "step": 127060 }, { - "epoch": 4.47, - "learning_rate": 1.4645760687362842e-06, - "loss": 0.2283, + "epoch": 4.579413990701697, + "grad_norm": 0.2750302255153656, + "learning_rate": 9.291296198789296e-07, + "loss": 0.3623, "step": 127065 }, { - "epoch": 4.47, - "learning_rate": 1.4636155048421974e-06, - "loss": 0.2538, + "epoch": 4.579594190362922, + "grad_norm": 
0.2515812814235687, + "learning_rate": 9.283416223854119e-07, + "loss": 0.3995, "step": 127070 }, { - "epoch": 4.47, - "learning_rate": 1.4626552465525495e-06, - "loss": 0.2794, + "epoch": 4.579774390024147, + "grad_norm": 0.2370171695947647, + "learning_rate": 9.275539528635746e-07, + "loss": 0.3846, "step": 127075 }, { - "epoch": 4.47, - "learning_rate": 1.461695293879814e-06, - "loss": 0.274, + "epoch": 4.5799545896853715, + "grad_norm": 0.2444002330303192, + "learning_rate": 9.267666113241563e-07, + "loss": 0.3358, "step": 127080 }, { - "epoch": 4.47, - "learning_rate": 1.4607356468364475e-06, - "loss": 0.2674, + "epoch": 4.580134789346596, + "grad_norm": 0.25110915303230286, + "learning_rate": 9.259795977778846e-07, + "loss": 0.3625, "step": 127085 }, { - "epoch": 4.47, - "learning_rate": 1.4597763054349179e-06, - "loss": 0.2542, + "epoch": 4.580314989007821, + "grad_norm": 0.2334173619747162, + "learning_rate": 9.251929122354785e-07, + "loss": 0.3718, "step": 127090 }, { - "epoch": 4.47, - "learning_rate": 1.4588172696876817e-06, - "loss": 0.261, + "epoch": 4.580495188669046, + "grad_norm": 0.3335031270980835, + "learning_rate": 9.244065547076574e-07, + "loss": 0.3461, "step": 127095 }, { - "epoch": 4.47, - "learning_rate": 1.4578585396071904e-06, - "loss": 0.2515, + "epoch": 4.58067538833027, + "grad_norm": 0.286377489566803, + "learning_rate": 9.23620525205135e-07, + "loss": 0.3706, "step": 127100 }, { - "epoch": 4.47, - "learning_rate": 1.4569001152058837e-06, - "loss": 0.2512, + "epoch": 4.580855587991494, + "grad_norm": 0.28314265608787537, + "learning_rate": 9.228348237386248e-07, + "loss": 0.3618, "step": 127105 }, { - "epoch": 4.47, - "learning_rate": 1.4559419964962156e-06, - "loss": 0.272, + "epoch": 4.581035787652719, + "grad_norm": 0.29264768958091736, + "learning_rate": 9.220494503188265e-07, + "loss": 0.3632, "step": 127110 }, { - "epoch": 4.47, - "learning_rate": 1.4549841834906292e-06, - "loss": 0.235, + "epoch": 4.581215987313944, + "grad_norm": 0.22267203032970428, + "learning_rate": 9.212644049564401e-07, + "loss": 0.3707, "step": 127115 }, { - "epoch": 4.47, - "learning_rate": 1.4540266762015531e-06, - "loss": 0.2546, + "epoch": 4.5813961869751685, + "grad_norm": 0.26017507910728455, + "learning_rate": 9.204796876621679e-07, + "loss": 0.3522, "step": 127120 }, { - "epoch": 4.47, - "learning_rate": 1.453069474641422e-06, - "loss": 0.239, + "epoch": 4.581576386636393, + "grad_norm": 0.297837495803833, + "learning_rate": 9.196952984466961e-07, + "loss": 0.3786, "step": 127125 }, { - "epoch": 4.47, - "learning_rate": 1.4521125788226703e-06, - "loss": 0.2469, + "epoch": 4.581756586297618, + "grad_norm": 0.23576004803180695, + "learning_rate": 9.189112373207188e-07, + "loss": 0.3523, "step": 127130 }, { - "epoch": 4.47, - "learning_rate": 1.4511559887577158e-06, - "loss": 0.2428, + "epoch": 4.581936785958843, + "grad_norm": 0.22233489155769348, + "learning_rate": 9.181275042949078e-07, + "loss": 0.3784, "step": 127135 }, { - "epoch": 4.47, - "learning_rate": 1.4501997044589793e-06, - "loss": 0.2392, + "epoch": 4.582116985620067, + "grad_norm": 0.19266854226589203, + "learning_rate": 9.173440993799492e-07, + "loss": 0.3736, "step": 127140 }, { - "epoch": 4.47, - "learning_rate": 1.449243725938884e-06, - "loss": 0.2742, + "epoch": 4.582297185281291, + "grad_norm": 0.2523898482322693, + "learning_rate": 9.165610225865152e-07, + "loss": 0.3829, "step": 127145 }, { - "epoch": 4.47, - "learning_rate": 1.4482880532098369e-06, - "loss": 0.2383, + "epoch": 4.582477384942516, + 
"grad_norm": 0.26051437854766846, + "learning_rate": 9.157782739252718e-07, + "loss": 0.3785, "step": 127150 }, { - "epoch": 4.47, - "learning_rate": 1.4473326862842474e-06, - "loss": 0.2642, + "epoch": 4.582657584603741, + "grad_norm": 0.3132672607898712, + "learning_rate": 9.149958534068915e-07, + "loss": 0.349, "step": 127155 }, { - "epoch": 4.47, - "learning_rate": 1.4463776251745191e-06, - "loss": 0.2464, + "epoch": 4.5828377842649655, + "grad_norm": 0.24046078324317932, + "learning_rate": 9.142137610420265e-07, + "loss": 0.3656, "step": 127160 }, { - "epoch": 4.47, - "learning_rate": 1.4454228698930567e-06, - "loss": 0.2648, + "epoch": 4.58301798392619, + "grad_norm": 0.2993525564670563, + "learning_rate": 9.134319968413325e-07, + "loss": 0.3774, "step": 127165 }, { - "epoch": 4.47, - "learning_rate": 1.4444684204522551e-06, - "loss": 0.2388, + "epoch": 4.583198183587415, + "grad_norm": 0.24848264455795288, + "learning_rate": 9.1265056081547e-07, + "loss": 0.4103, "step": 127170 }, { - "epoch": 4.47, - "learning_rate": 1.4435142768645106e-06, - "loss": 0.2547, + "epoch": 4.58337838324864, + "grad_norm": 0.24963483214378357, + "learning_rate": 9.118694529750782e-07, + "loss": 0.3906, "step": 127175 }, { - "epoch": 4.47, - "learning_rate": 1.4425604391422043e-06, - "loss": 0.265, + "epoch": 4.583558582909864, + "grad_norm": 0.3283641040325165, + "learning_rate": 9.11088673330801e-07, + "loss": 0.4036, "step": 127180 }, { - "epoch": 4.47, - "learning_rate": 1.4416069072977266e-06, - "loss": 0.2466, + "epoch": 4.583738782571089, + "grad_norm": 0.24759909510612488, + "learning_rate": 9.103082218932774e-07, + "loss": 0.3573, "step": 127185 }, { - "epoch": 4.47, - "learning_rate": 1.4406536813434624e-06, - "loss": 0.2674, + "epoch": 4.583918982232314, + "grad_norm": 0.2345927208662033, + "learning_rate": 9.09528098673143e-07, + "loss": 0.3574, "step": 127190 }, { - "epoch": 4.48, - "learning_rate": 1.4397007612917817e-06, - "loss": 0.2396, + "epoch": 4.584099181893538, + "grad_norm": 0.2747328579425812, + "learning_rate": 9.087483036810174e-07, + "loss": 0.3511, "step": 127195 }, { - "epoch": 4.48, - "learning_rate": 1.4387481471550557e-06, - "loss": 0.2615, + "epoch": 4.5842793815547624, + "grad_norm": 0.2693836987018585, + "learning_rate": 9.07968836927539e-07, + "loss": 0.3638, "step": 127200 }, { - "epoch": 4.48, - "learning_rate": 1.4377958389456604e-06, - "loss": 0.2407, + "epoch": 4.584459581215987, + "grad_norm": 0.29041051864624023, + "learning_rate": 9.071896984233163e-07, + "loss": 0.4105, "step": 127205 }, { - "epoch": 4.48, - "learning_rate": 1.4368438366759606e-06, - "loss": 0.2464, + "epoch": 4.584639780877212, + "grad_norm": 0.23431363701820374, + "learning_rate": 9.064108881789712e-07, + "loss": 0.3591, "step": 127210 }, { - "epoch": 4.48, - "learning_rate": 1.435892140358308e-06, - "loss": 0.2432, + "epoch": 4.584819980538437, + "grad_norm": 0.21707414090633392, + "learning_rate": 9.056324062051147e-07, + "loss": 0.3548, "step": 127215 }, { - "epoch": 4.48, - "learning_rate": 1.4349407500050731e-06, - "loss": 0.2628, + "epoch": 4.585000180199661, + "grad_norm": 0.2566896378993988, + "learning_rate": 9.048542525123493e-07, + "loss": 0.3801, "step": 127220 }, { - "epoch": 4.48, - "learning_rate": 1.4339896656285989e-06, - "loss": 0.2422, + "epoch": 4.585180379860886, + "grad_norm": 0.22780990600585938, + "learning_rate": 9.040764271112806e-07, + "loss": 0.3584, "step": 127225 }, { - "epoch": 4.48, - "learning_rate": 1.4330388872412392e-06, - "loss": 0.2442, + "epoch": 
4.585360579522111, + "grad_norm": 0.28133270144462585, + "learning_rate": 9.032989300125055e-07, + "loss": 0.3961, "step": 127230 }, { - "epoch": 4.48, - "learning_rate": 1.4320884148553427e-06, - "loss": 0.2485, + "epoch": 4.585540779183335, + "grad_norm": 0.24993495643138885, + "learning_rate": 9.025217612266157e-07, + "loss": 0.3699, "step": 127235 }, { - "epoch": 4.48, - "learning_rate": 1.431138248483238e-06, - "loss": 0.2313, + "epoch": 4.585720978844559, + "grad_norm": 0.20704279839992523, + "learning_rate": 9.017449207642026e-07, + "loss": 0.3575, "step": 127240 }, { - "epoch": 4.48, - "learning_rate": 1.4301883881372768e-06, - "loss": 0.2543, + "epoch": 4.585901178505784, + "grad_norm": 0.22936898469924927, + "learning_rate": 9.009684086358494e-07, + "loss": 0.3541, "step": 127245 }, { - "epoch": 4.48, - "learning_rate": 1.4292388338297852e-06, - "loss": 0.2537, + "epoch": 4.586081378167009, + "grad_norm": 0.26734253764152527, + "learning_rate": 9.001922248521366e-07, + "loss": 0.399, "step": 127250 }, { - "epoch": 4.48, - "learning_rate": 1.428289585573092e-06, - "loss": 0.2665, + "epoch": 4.586261577828234, + "grad_norm": 0.26072803139686584, + "learning_rate": 8.994163694236391e-07, + "loss": 0.3603, "step": 127255 }, { - "epoch": 4.48, - "learning_rate": 1.4273406433795267e-06, - "loss": 0.223, + "epoch": 4.586441777489458, + "grad_norm": 0.21624024212360382, + "learning_rate": 8.98640842360926e-07, + "loss": 0.3603, "step": 127260 }, { - "epoch": 4.48, - "learning_rate": 1.4263920072614068e-06, - "loss": 0.2692, + "epoch": 4.586621977150683, + "grad_norm": 0.2746710479259491, + "learning_rate": 8.97865643674567e-07, + "loss": 0.3696, "step": 127265 }, { - "epoch": 4.48, - "learning_rate": 1.4254436772310503e-06, - "loss": 0.2491, + "epoch": 4.586802176811908, + "grad_norm": 0.2964709997177124, + "learning_rate": 8.970907733751199e-07, + "loss": 0.3768, "step": 127270 }, { - "epoch": 4.48, - "learning_rate": 1.4244956533007726e-06, - "loss": 0.2294, + "epoch": 4.586982376473133, + "grad_norm": 0.2429097592830658, + "learning_rate": 8.96316231473146e-07, + "loss": 0.3656, "step": 127275 }, { - "epoch": 4.48, - "learning_rate": 1.42354793548288e-06, - "loss": 0.2604, + "epoch": 4.587162576134357, + "grad_norm": 0.22786717116832733, + "learning_rate": 8.955420179791979e-07, + "loss": 0.3619, "step": 127280 }, { - "epoch": 4.48, - "learning_rate": 1.4226005237896827e-06, - "loss": 0.2476, + "epoch": 4.587342775795581, + "grad_norm": 0.30810287594795227, + "learning_rate": 8.947681329038198e-07, + "loss": 0.3812, "step": 127285 }, { - "epoch": 4.48, - "learning_rate": 1.421653418233479e-06, - "loss": 0.254, + "epoch": 4.587522975456806, + "grad_norm": 0.2238711714744568, + "learning_rate": 8.939945762575619e-07, + "loss": 0.3629, "step": 127290 }, { - "epoch": 4.48, - "learning_rate": 1.4207066188265645e-06, - "loss": 0.256, + "epoch": 4.587703175118031, + "grad_norm": 0.22540000081062317, + "learning_rate": 8.932213480509627e-07, + "loss": 0.3756, "step": 127295 }, { - "epoch": 4.48, - "learning_rate": 1.4197601255812376e-06, - "loss": 0.2693, + "epoch": 4.587883374779255, + "grad_norm": 0.2444496601819992, + "learning_rate": 8.9244844829455e-07, + "loss": 0.3786, "step": 127300 }, { - "epoch": 4.48, - "learning_rate": 1.418813938509786e-06, - "loss": 0.2401, + "epoch": 4.58806357444048, + "grad_norm": 0.2592555582523346, + "learning_rate": 8.916758769988626e-07, + "loss": 0.3718, "step": 127305 }, { - "epoch": 4.48, - "learning_rate": 1.4178680576244967e-06, - "loss": 0.234, + 
"epoch": 4.588243774101705, + "grad_norm": 0.26862582564353943, + "learning_rate": 8.909036341744226e-07, + "loss": 0.373, "step": 127310 }, { - "epoch": 4.48, - "learning_rate": 1.4169224829376438e-06, - "loss": 0.2692, + "epoch": 4.5884239737629295, + "grad_norm": 0.24260741472244263, + "learning_rate": 8.901317198317577e-07, + "loss": 0.3717, "step": 127315 }, { - "epoch": 4.48, - "learning_rate": 1.4159772144615142e-06, - "loss": 0.2418, + "epoch": 4.588604173424154, + "grad_norm": 0.24818958342075348, + "learning_rate": 8.893601339813762e-07, + "loss": 0.3762, "step": 127320 }, { - "epoch": 4.48, - "learning_rate": 1.4150322522083787e-06, - "loss": 0.227, + "epoch": 4.588784373085378, + "grad_norm": 0.2344391644001007, + "learning_rate": 8.885888766337947e-07, + "loss": 0.39, "step": 127325 }, { - "epoch": 4.48, - "learning_rate": 1.4140875961905026e-06, - "loss": 0.2393, + "epoch": 4.588964572746603, + "grad_norm": 0.28583937883377075, + "learning_rate": 8.878179477995214e-07, + "loss": 0.3784, "step": 127330 }, { - "epoch": 4.48, - "learning_rate": 1.4131432464201565e-06, - "loss": 0.2483, + "epoch": 4.589144772407828, + "grad_norm": 0.24101144075393677, + "learning_rate": 8.870473474890617e-07, + "loss": 0.3615, "step": 127335 }, { - "epoch": 4.48, - "learning_rate": 1.4121992029096031e-06, - "loss": 0.2745, + "epoch": 4.589324972069052, + "grad_norm": 0.2540043890476227, + "learning_rate": 8.86277075712913e-07, + "loss": 0.3619, "step": 127340 }, { - "epoch": 4.48, - "learning_rate": 1.4112554656710992e-06, - "loss": 0.2644, + "epoch": 4.589505171730277, + "grad_norm": 0.2533954381942749, + "learning_rate": 8.855071324815723e-07, + "loss": 0.3504, "step": 127345 }, { - "epoch": 4.48, - "learning_rate": 1.4103120347168958e-06, - "loss": 0.2589, + "epoch": 4.589685371391502, + "grad_norm": 0.20439958572387695, + "learning_rate": 8.847375178055228e-07, + "loss": 0.3585, "step": 127350 }, { - "epoch": 4.48, - "learning_rate": 1.4093689100592416e-06, - "loss": 0.2414, + "epoch": 4.5898655710527265, + "grad_norm": 0.22959500551223755, + "learning_rate": 8.839682316952619e-07, + "loss": 0.3976, "step": 127355 }, { - "epoch": 4.48, - "learning_rate": 1.4084260917103881e-06, - "loss": 0.2681, + "epoch": 4.590045770713951, + "grad_norm": 0.22936657071113586, + "learning_rate": 8.831992741612616e-07, + "loss": 0.3839, "step": 127360 }, { - "epoch": 4.48, - "learning_rate": 1.4074835796825753e-06, - "loss": 0.2525, + "epoch": 4.590225970375176, + "grad_norm": 0.2100255936384201, + "learning_rate": 8.824306452140025e-07, + "loss": 0.3651, "step": 127365 }, { - "epoch": 4.48, - "learning_rate": 1.406541373988038e-06, - "loss": 0.2467, + "epoch": 4.590406170036401, + "grad_norm": 0.26398009061813354, + "learning_rate": 8.816623448639594e-07, + "loss": 0.3644, "step": 127370 }, { - "epoch": 4.48, - "learning_rate": 1.4055994746390132e-06, - "loss": 0.2416, + "epoch": 4.5905863696976255, + "grad_norm": 0.2545202374458313, + "learning_rate": 8.808943731215935e-07, + "loss": 0.3634, "step": 127375 }, { - "epoch": 4.48, - "learning_rate": 1.4046578816477306e-06, - "loss": 0.2252, + "epoch": 4.590766569358849, + "grad_norm": 0.2735772430896759, + "learning_rate": 8.801267299973715e-07, + "loss": 0.3916, "step": 127380 }, { - "epoch": 4.48, - "learning_rate": 1.4037165950264136e-06, - "loss": 0.2356, + "epoch": 4.590946769020074, + "grad_norm": 0.2763000428676605, + "learning_rate": 8.793594155017598e-07, + "loss": 0.3839, "step": 127385 }, { - "epoch": 4.48, - "learning_rate": 1.4027756147872856e-06, 
- "loss": 0.2436, + "epoch": 4.591126968681299, + "grad_norm": 0.26428860425949097, + "learning_rate": 8.785924296451975e-07, + "loss": 0.381, "step": 127390 }, { - "epoch": 4.48, - "learning_rate": 1.4018349409425674e-06, - "loss": 0.2723, + "epoch": 4.5913071683425235, + "grad_norm": 0.2190413475036621, + "learning_rate": 8.778257724381483e-07, + "loss": 0.357, "step": 127395 }, { - "epoch": 4.48, - "learning_rate": 1.4008945735044716e-06, - "loss": 0.2374, + "epoch": 4.591487368003748, + "grad_norm": 0.2818199694156647, + "learning_rate": 8.770594438910512e-07, + "loss": 0.4062, "step": 127400 }, { - "epoch": 4.48, - "learning_rate": 1.3999545124852077e-06, - "loss": 0.2445, + "epoch": 4.591667567664973, + "grad_norm": 0.25005903840065, + "learning_rate": 8.762934440143478e-07, + "loss": 0.3673, "step": 127405 }, { - "epoch": 4.48, - "learning_rate": 1.399014757896977e-06, - "loss": 0.2511, + "epoch": 4.591847767326198, + "grad_norm": 0.2070559412240982, + "learning_rate": 8.755277728184796e-07, + "loss": 0.3492, "step": 127410 }, { - "epoch": 4.48, - "learning_rate": 1.3980753097519894e-06, - "loss": 0.2439, + "epoch": 4.592027966987422, + "grad_norm": 0.23622746765613556, + "learning_rate": 8.747624303138746e-07, + "loss": 0.3909, "step": 127415 }, { - "epoch": 4.48, - "learning_rate": 1.3971361680624406e-06, - "loss": 0.2561, + "epoch": 4.592208166648646, + "grad_norm": 0.32990074157714844, + "learning_rate": 8.739974165109549e-07, + "loss": 0.3831, "step": 127420 }, { - "epoch": 4.48, - "learning_rate": 1.3961973328405236e-06, - "loss": 0.2491, + "epoch": 4.592388366309871, + "grad_norm": 0.21549347043037415, + "learning_rate": 8.732327314201539e-07, + "loss": 0.3512, "step": 127425 }, { - "epoch": 4.48, - "learning_rate": 1.3952588040984232e-06, - "loss": 0.2633, + "epoch": 4.592568565971096, + "grad_norm": 0.23620551824569702, + "learning_rate": 8.724683750518853e-07, + "loss": 0.3524, "step": 127430 }, { - "epoch": 4.48, - "learning_rate": 1.394320581848338e-06, - "loss": 0.2349, + "epoch": 4.5927487656323205, + "grad_norm": 0.2451809197664261, + "learning_rate": 8.717043474165659e-07, + "loss": 0.3549, "step": 127435 }, { - "epoch": 4.48, - "learning_rate": 1.3933826661024386e-06, - "loss": 0.2662, + "epoch": 4.592928965293545, + "grad_norm": 0.2065710574388504, + "learning_rate": 8.709406485246013e-07, + "loss": 0.3637, "step": 127440 }, { - "epoch": 4.48, - "learning_rate": 1.3924450568729098e-06, - "loss": 0.2524, + "epoch": 4.59310916495477, + "grad_norm": 0.2208905667066574, + "learning_rate": 8.701772783863943e-07, + "loss": 0.3542, "step": 127445 }, { - "epoch": 4.48, - "learning_rate": 1.3915077541719224e-06, - "loss": 0.2417, + "epoch": 4.593289364615995, + "grad_norm": 0.2570354640483856, + "learning_rate": 8.69414237012356e-07, + "loss": 0.3631, "step": 127450 }, { - "epoch": 4.48, - "learning_rate": 1.39057075801165e-06, - "loss": 0.2401, + "epoch": 4.593469564277219, + "grad_norm": 0.2893807888031006, + "learning_rate": 8.686515244128752e-07, + "loss": 0.3994, "step": 127455 }, { - "epoch": 4.48, - "learning_rate": 1.389634068404258e-06, - "loss": 0.2373, + "epoch": 4.593649763938444, + "grad_norm": 0.2452847957611084, + "learning_rate": 8.678891405983464e-07, + "loss": 0.367, "step": 127460 }, { - "epoch": 4.48, - "learning_rate": 1.3886976853619087e-06, - "loss": 0.2342, + "epoch": 4.593829963599669, + "grad_norm": 0.28028956055641174, + "learning_rate": 8.671270855791558e-07, + "loss": 0.388, "step": 127465 }, { - "epoch": 4.48, - "learning_rate": 
1.3877616088967565e-06, - "loss": 0.2402, + "epoch": 4.594010163260893, + "grad_norm": 0.2985093295574188, + "learning_rate": 8.66365359365684e-07, + "loss": 0.4046, "step": 127470 }, { - "epoch": 4.48, - "learning_rate": 1.386825839020961e-06, - "loss": 0.2766, + "epoch": 4.5941903629221175, + "grad_norm": 0.2665720283985138, + "learning_rate": 8.656039619683143e-07, + "loss": 0.3886, "step": 127475 }, { - "epoch": 4.49, - "learning_rate": 1.3858903757466706e-06, - "loss": 0.2329, + "epoch": 4.594370562583342, + "grad_norm": 0.21940386295318604, + "learning_rate": 8.64842893397419e-07, + "loss": 0.3497, "step": 127480 }, { - "epoch": 4.49, - "learning_rate": 1.3849552190860259e-06, - "loss": 0.2594, + "epoch": 4.594550762244567, + "grad_norm": 0.21801802515983582, + "learning_rate": 8.640821536633647e-07, + "loss": 0.3911, "step": 127485 }, { - "epoch": 4.49, - "learning_rate": 1.384020369051181e-06, - "loss": 0.2463, + "epoch": 4.594730961905792, + "grad_norm": 0.22866007685661316, + "learning_rate": 8.633217427765184e-07, + "loss": 0.3706, "step": 127490 }, { - "epoch": 4.49, - "learning_rate": 1.3830858256542623e-06, - "loss": 0.2369, + "epoch": 4.594911161567016, + "grad_norm": 0.21963122487068176, + "learning_rate": 8.625616607472381e-07, + "loss": 0.3706, "step": 127495 }, { - "epoch": 4.49, - "learning_rate": 1.3821515889074154e-06, - "loss": 0.2386, + "epoch": 4.595091361228241, + "grad_norm": 0.22869311273097992, + "learning_rate": 8.618019075858852e-07, + "loss": 0.3886, "step": 127500 }, { - "epoch": 4.49, - "eval_loss": 0.2487904578447342, - "eval_runtime": 10.5517, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, + "epoch": 4.595091361228241, + "eval_loss": 0.42893698811531067, + "eval_runtime": 3.5318, + "eval_samples_per_second": 28.314, + "eval_steps_per_second": 7.078, "step": 127500 }, { - "epoch": 4.49, - "learning_rate": 1.3812176588227615e-06, - "loss": 0.2426, + "epoch": 4.595271560889466, + "grad_norm": 0.26364636421203613, + "learning_rate": 8.610424833028097e-07, + "loss": 0.3719, "step": 127505 }, { - "epoch": 4.49, - "learning_rate": 1.3802840354124353e-06, - "loss": 0.2382, + "epoch": 4.59545176055069, + "grad_norm": 0.2287474125623703, + "learning_rate": 8.602833879083505e-07, + "loss": 0.382, "step": 127510 }, { - "epoch": 4.49, - "learning_rate": 1.3793507186885546e-06, - "loss": 0.2469, + "epoch": 4.5956319602119144, + "grad_norm": 0.2660192549228668, + "learning_rate": 8.595246214128605e-07, + "loss": 0.3737, "step": 127515 }, { - "epoch": 4.49, - "learning_rate": 1.3784177086632377e-06, - "loss": 0.2207, + "epoch": 4.595812159873139, + "grad_norm": 0.2630321681499481, + "learning_rate": 8.587661838266731e-07, + "loss": 0.3626, "step": 127520 }, { - "epoch": 4.49, - "learning_rate": 1.3774850053485944e-06, - "loss": 0.2492, + "epoch": 4.595992359534364, + "grad_norm": 0.21891890466213226, + "learning_rate": 8.580080751601244e-07, + "loss": 0.3754, "step": 127525 }, { - "epoch": 4.49, - "learning_rate": 1.3765526087567455e-06, - "loss": 0.2256, + "epoch": 4.596172559195589, + "grad_norm": 0.2554580271244049, + "learning_rate": 8.572502954235422e-07, + "loss": 0.4112, "step": 127530 }, { - "epoch": 4.49, - "learning_rate": 1.3756205188997922e-06, - "loss": 0.2693, + "epoch": 4.596352758856813, + "grad_norm": 0.2666684091091156, + "learning_rate": 8.564928446272463e-07, + "loss": 0.3948, "step": 127535 }, { - "epoch": 4.49, - "learning_rate": 1.3746887357898392e-06, - "loss": 0.2465, + "epoch": 4.596532958518038, + "grad_norm": 
0.22191660106182098, + "learning_rate": 8.557357227815615e-07, + "loss": 0.3486, "step": 127540 }, { - "epoch": 4.49, - "learning_rate": 1.3737572594389765e-06, - "loss": 0.254, + "epoch": 4.596713158179263, + "grad_norm": 0.2158440500497818, + "learning_rate": 8.549789298968075e-07, + "loss": 0.3562, "step": 127545 }, { - "epoch": 4.49, - "learning_rate": 1.3728260898593082e-06, - "loss": 0.2624, + "epoch": 4.596893357840488, + "grad_norm": 0.2409515678882599, + "learning_rate": 8.542224659832871e-07, + "loss": 0.3832, "step": 127550 }, { - "epoch": 4.49, - "learning_rate": 1.371895227062925e-06, - "loss": 0.2333, + "epoch": 4.597073557501712, + "grad_norm": 0.3206073045730591, + "learning_rate": 8.534663310513141e-07, + "loss": 0.3694, "step": 127555 }, { - "epoch": 4.49, - "learning_rate": 1.3709646710619113e-06, - "loss": 0.2297, + "epoch": 4.597253757162936, + "grad_norm": 0.2606680393218994, + "learning_rate": 8.527105251111833e-07, + "loss": 0.3742, "step": 127560 }, { - "epoch": 4.49, - "learning_rate": 1.3700344218683437e-06, - "loss": 0.2366, + "epoch": 4.597433956824161, + "grad_norm": 0.3032326102256775, + "learning_rate": 8.519550481731975e-07, + "loss": 0.3228, "step": 127565 }, { - "epoch": 4.49, - "learning_rate": 1.3691044794943125e-06, - "loss": 0.2414, + "epoch": 4.597614156485386, + "grad_norm": 0.303232342004776, + "learning_rate": 8.511999002476512e-07, + "loss": 0.396, "step": 127570 }, { - "epoch": 4.49, - "learning_rate": 1.368174843951886e-06, - "loss": 0.2431, + "epoch": 4.59779435614661, + "grad_norm": 0.2779444754123688, + "learning_rate": 8.504450813448306e-07, + "loss": 0.3345, "step": 127575 }, { - "epoch": 4.49, - "learning_rate": 1.367245515253135e-06, - "loss": 0.2397, + "epoch": 4.597974555807835, + "grad_norm": 0.2656551003456116, + "learning_rate": 8.496905914750192e-07, + "loss": 0.3711, "step": 127580 }, { - "epoch": 4.49, - "learning_rate": 1.3663164934101219e-06, - "loss": 0.2398, + "epoch": 4.59815475546906, + "grad_norm": 0.2177533507347107, + "learning_rate": 8.489364306484976e-07, + "loss": 0.3436, "step": 127585 }, { - "epoch": 4.49, - "learning_rate": 1.3653877784349178e-06, - "loss": 0.2439, + "epoch": 4.598334955130285, + "grad_norm": 0.25983306765556335, + "learning_rate": 8.481825988755382e-07, + "loss": 0.3416, "step": 127590 }, { - "epoch": 4.49, - "learning_rate": 1.364459370339577e-06, - "loss": 0.2407, + "epoch": 4.598515154791509, + "grad_norm": 0.23284339904785156, + "learning_rate": 8.474290961664244e-07, + "loss": 0.333, "step": 127595 }, { - "epoch": 4.49, - "learning_rate": 1.3635312691361508e-06, - "loss": 0.2473, + "epoch": 4.598695354452733, + "grad_norm": 0.2582654356956482, + "learning_rate": 8.466759225314063e-07, + "loss": 0.33, "step": 127600 }, { - "epoch": 4.49, - "learning_rate": 1.3626034748366962e-06, - "loss": 0.2755, + "epoch": 4.598875554113958, + "grad_norm": 0.31043708324432373, + "learning_rate": 8.459230779807508e-07, + "loss": 0.3663, "step": 127605 }, { - "epoch": 4.49, - "learning_rate": 1.3616759874532592e-06, - "loss": 0.258, + "epoch": 4.599055753775183, + "grad_norm": 0.25590458512306213, + "learning_rate": 8.451705625247191e-07, + "loss": 0.3797, "step": 127610 }, { - "epoch": 4.49, - "learning_rate": 1.360748806997883e-06, - "loss": 0.2314, + "epoch": 4.599235953436407, + "grad_norm": 0.2824951410293579, + "learning_rate": 8.444183761735641e-07, + "loss": 0.385, "step": 127615 }, { - "epoch": 4.49, - "learning_rate": 1.3598219334826022e-06, - "loss": 0.238, + "epoch": 4.599416153097632, + 
"grad_norm": 0.3160955309867859, + "learning_rate": 8.436665189375303e-07, + "loss": 0.351, "step": 127620 }, { - "epoch": 4.49, - "learning_rate": 1.3588953669194548e-06, - "loss": 0.2529, + "epoch": 4.599596352758857, + "grad_norm": 0.29997727274894714, + "learning_rate": 8.429149908268652e-07, + "loss": 0.3823, "step": 127625 }, { - "epoch": 4.49, - "learning_rate": 1.3579691073204726e-06, - "loss": 0.2615, + "epoch": 4.5997765524200815, + "grad_norm": 0.2773001194000244, + "learning_rate": 8.421637918518022e-07, + "loss": 0.3737, "step": 127630 }, { - "epoch": 4.49, - "learning_rate": 1.3570431546976792e-06, - "loss": 0.244, + "epoch": 4.599956752081306, + "grad_norm": 0.2861831486225128, + "learning_rate": 8.41412922022583e-07, + "loss": 0.3375, "step": 127635 }, { - "epoch": 4.49, - "learning_rate": 1.3561175090630984e-06, - "loss": 0.2943, + "epoch": 4.600136951742531, + "grad_norm": 0.27983224391937256, + "learning_rate": 8.406623813494358e-07, + "loss": 0.365, "step": 127640 }, { - "epoch": 4.49, - "learning_rate": 1.3551921704287513e-06, - "loss": 0.237, + "epoch": 4.600317151403756, + "grad_norm": 0.26674070954322815, + "learning_rate": 8.399121698425855e-07, + "loss": 0.3933, "step": 127645 }, { - "epoch": 4.49, - "learning_rate": 1.354267138806653e-06, - "loss": 0.2492, + "epoch": 4.6004973510649805, + "grad_norm": 0.23168079555034637, + "learning_rate": 8.391622875122545e-07, + "loss": 0.3665, "step": 127650 }, { - "epoch": 4.49, - "learning_rate": 1.353342414208808e-06, - "loss": 0.2577, + "epoch": 4.600677550726204, + "grad_norm": 0.2457718551158905, + "learning_rate": 8.384127343686599e-07, + "loss": 0.3455, "step": 127655 }, { - "epoch": 4.49, - "learning_rate": 1.3524179966472317e-06, - "loss": 0.2467, + "epoch": 4.600857750387429, + "grad_norm": 0.25445955991744995, + "learning_rate": 8.376635104220126e-07, + "loss": 0.3738, "step": 127660 }, { - "epoch": 4.49, - "learning_rate": 1.3514938861339199e-06, - "loss": 0.2468, + "epoch": 4.601037950048654, + "grad_norm": 0.23274029791355133, + "learning_rate": 8.369146156825269e-07, + "loss": 0.3727, "step": 127665 }, { - "epoch": 4.49, - "learning_rate": 1.350570082680877e-06, - "loss": 0.236, + "epoch": 4.6012181497098785, + "grad_norm": 0.2078734189271927, + "learning_rate": 8.361660501603946e-07, + "loss": 0.3685, "step": 127670 }, { - "epoch": 4.49, - "learning_rate": 1.349646586300099e-06, - "loss": 0.251, + "epoch": 4.601398349371103, + "grad_norm": 0.24284294247627258, + "learning_rate": 8.354178138658269e-07, + "loss": 0.3768, "step": 127675 }, { - "epoch": 4.49, - "learning_rate": 1.3487233970035679e-06, - "loss": 0.2442, + "epoch": 4.601578549032328, + "grad_norm": 0.3138650059700012, + "learning_rate": 8.346699068090074e-07, + "loss": 0.3789, "step": 127680 }, { - "epoch": 4.49, - "learning_rate": 1.3478005148032824e-06, - "loss": 0.2409, + "epoch": 4.601758748693553, + "grad_norm": 0.23819488286972046, + "learning_rate": 8.339223290001363e-07, + "loss": 0.3685, "step": 127685 }, { - "epoch": 4.49, - "learning_rate": 1.3468779397112191e-06, - "loss": 0.2736, + "epoch": 4.6019389483547775, + "grad_norm": 0.2773071825504303, + "learning_rate": 8.331750804493971e-07, + "loss": 0.373, "step": 127690 }, { - "epoch": 4.49, - "learning_rate": 1.3459556717393574e-06, - "loss": 0.2542, + "epoch": 4.602119148016001, + "grad_norm": 0.2507835924625397, + "learning_rate": 8.324281611669621e-07, + "loss": 0.3704, "step": 127695 }, { - "epoch": 4.49, - "learning_rate": 1.345033710899668e-06, - "loss": 0.2343, + "epoch": 
4.602299347677226, + "grad_norm": 0.1873769313097, + "learning_rate": 8.316815711630205e-07, + "loss": 0.3327, "step": 127700 }, { - "epoch": 4.49, - "learning_rate": 1.3441120572041305e-06, - "loss": 0.244, + "epoch": 4.602479547338451, + "grad_norm": 0.23078793287277222, + "learning_rate": 8.309353104477335e-07, + "loss": 0.3727, "step": 127705 }, { - "epoch": 4.49, - "learning_rate": 1.3431907106647073e-06, - "loss": 0.2564, + "epoch": 4.6026597469996755, + "grad_norm": 0.23477856814861298, + "learning_rate": 8.301893790312765e-07, + "loss": 0.3795, "step": 127710 }, { - "epoch": 4.49, - "learning_rate": 1.3422696712933642e-06, - "loss": 0.242, + "epoch": 4.6028399466609, + "grad_norm": 0.27823832631111145, + "learning_rate": 8.294437769238106e-07, + "loss": 0.3449, "step": 127715 }, { - "epoch": 4.49, - "learning_rate": 1.3413489391020551e-06, - "loss": 0.2257, + "epoch": 4.603020146322125, + "grad_norm": 0.27423784136772156, + "learning_rate": 8.286985041354889e-07, + "loss": 0.3604, "step": 127720 }, { - "epoch": 4.49, - "learning_rate": 1.34042851410274e-06, - "loss": 0.269, + "epoch": 4.60320034598335, + "grad_norm": 0.2629181146621704, + "learning_rate": 8.279535606764755e-07, + "loss": 0.3842, "step": 127725 }, { - "epoch": 4.49, - "learning_rate": 1.3395083963073707e-06, - "loss": 0.2503, + "epoch": 4.603380545644574, + "grad_norm": 0.25341519713401794, + "learning_rate": 8.272089465569121e-07, + "loss": 0.3454, "step": 127730 }, { - "epoch": 4.49, - "learning_rate": 1.3385885857278874e-06, - "loss": 0.2621, + "epoch": 4.603560745305799, + "grad_norm": 0.2602289617061615, + "learning_rate": 8.264646617869493e-07, + "loss": 0.3771, "step": 127735 }, { - "epoch": 4.49, - "learning_rate": 1.3376690823762417e-06, - "loss": 0.2473, + "epoch": 4.603740944967024, + "grad_norm": 0.2383667379617691, + "learning_rate": 8.25720706376723e-07, + "loss": 0.3977, "step": 127740 }, { - "epoch": 4.49, - "learning_rate": 1.3367498862643685e-06, - "loss": 0.2326, + "epoch": 4.603921144628248, + "grad_norm": 0.2735196650028229, + "learning_rate": 8.249770803363727e-07, + "loss": 0.3894, "step": 127745 }, { - "epoch": 4.49, - "learning_rate": 1.3358309974042027e-06, - "loss": 0.2687, + "epoch": 4.6041013442894725, + "grad_norm": 0.2749096751213074, + "learning_rate": 8.242337836760261e-07, + "loss": 0.3741, "step": 127750 }, { - "epoch": 4.49, - "learning_rate": 1.3349124158076736e-06, - "loss": 0.2496, + "epoch": 4.604281543950697, + "grad_norm": 0.22528064250946045, + "learning_rate": 8.234908164058169e-07, + "loss": 0.3538, "step": 127755 }, { - "epoch": 4.49, - "learning_rate": 1.3339941414867162e-06, - "loss": 0.2329, + "epoch": 4.604461743611922, + "grad_norm": 0.22581934928894043, + "learning_rate": 8.227481785358648e-07, + "loss": 0.3696, "step": 127760 }, { - "epoch": 4.5, - "learning_rate": 1.333076174453246e-06, - "loss": 0.2658, + "epoch": 4.604641943273147, + "grad_norm": 0.31339600682258606, + "learning_rate": 8.220058700762894e-07, + "loss": 0.4024, "step": 127765 }, { - "epoch": 4.5, - "learning_rate": 1.3321585147191812e-06, - "loss": 0.2538, + "epoch": 4.604822142934371, + "grad_norm": 0.2499316930770874, + "learning_rate": 8.212638910372023e-07, + "loss": 0.386, "step": 127770 }, { - "epoch": 4.5, - "learning_rate": 1.33124116229644e-06, - "loss": 0.2431, + "epoch": 4.605002342595596, + "grad_norm": 0.27650460600852966, + "learning_rate": 8.205222414287089e-07, + "loss": 0.378, "step": 127775 }, { - "epoch": 4.5, - "learning_rate": 1.3303241171969406e-06, - "loss": 0.2353, + 
"epoch": 4.605182542256821, + "grad_norm": 0.2796940505504608, + "learning_rate": 8.197809212609236e-07, + "loss": 0.3827, "step": 127780 }, { - "epoch": 4.5, - "learning_rate": 1.3294073794325818e-06, - "loss": 0.2328, + "epoch": 4.605362741918045, + "grad_norm": 0.24295328557491302, + "learning_rate": 8.190399305439412e-07, + "loss": 0.3327, "step": 127785 }, { - "epoch": 4.5, - "learning_rate": 1.3284909490152681e-06, - "loss": 0.2453, + "epoch": 4.6055429415792695, + "grad_norm": 0.2779107689857483, + "learning_rate": 8.182992692878561e-07, + "loss": 0.3532, "step": 127790 }, { - "epoch": 4.5, - "learning_rate": 1.3275748259568955e-06, - "loss": 0.2366, + "epoch": 4.605723141240494, + "grad_norm": 0.22981327772140503, + "learning_rate": 8.175589375027631e-07, + "loss": 0.3642, "step": 127795 }, { - "epoch": 4.5, - "learning_rate": 1.3266590102693655e-06, - "loss": 0.254, + "epoch": 4.605903340901719, + "grad_norm": 0.2713334262371063, + "learning_rate": 8.168189351987488e-07, + "loss": 0.3618, "step": 127800 }, { - "epoch": 4.5, - "learning_rate": 1.3257435019645687e-06, - "loss": 0.2509, + "epoch": 4.606083540562944, + "grad_norm": 0.2509160339832306, + "learning_rate": 8.160792623858909e-07, + "loss": 0.3942, "step": 127805 }, { - "epoch": 4.5, - "learning_rate": 1.3248283010543927e-06, - "loss": 0.2444, + "epoch": 4.606263740224168, + "grad_norm": 0.3234541416168213, + "learning_rate": 8.153399190742761e-07, + "loss": 0.3853, "step": 127810 }, { - "epoch": 4.5, - "learning_rate": 1.3240963616583557e-06, - "loss": 0.2366, + "epoch": 4.606443939885393, + "grad_norm": 0.35334426164627075, + "learning_rate": 8.146009052739656e-07, + "loss": 0.4016, "step": 127815 }, { - "epoch": 4.5, - "learning_rate": 1.3231817140884339e-06, - "loss": 0.2728, + "epoch": 4.606624139546618, + "grad_norm": 0.3184077739715576, + "learning_rate": 8.138622209950403e-07, + "loss": 0.3873, "step": 127820 }, { - "epoch": 4.5, - "learning_rate": 1.3222673739463926e-06, - "loss": 0.2542, + "epoch": 4.606804339207843, + "grad_norm": 0.20487551391124725, + "learning_rate": 8.131238662475588e-07, + "loss": 0.329, "step": 127825 }, { - "epoch": 4.5, - "learning_rate": 1.3213533412441054e-06, - "loss": 0.2533, + "epoch": 4.606984538869067, + "grad_norm": 0.24536176025867462, + "learning_rate": 8.123858410415825e-07, + "loss": 0.3856, "step": 127830 }, { - "epoch": 4.5, - "learning_rate": 1.3204396159934406e-06, - "loss": 0.2539, + "epoch": 4.607164738530291, + "grad_norm": 0.27180904150009155, + "learning_rate": 8.116481453871672e-07, + "loss": 0.3782, "step": 127835 }, { - "epoch": 4.5, - "learning_rate": 1.319526198206264e-06, - "loss": 0.251, + "epoch": 4.607344938191516, + "grad_norm": 0.27243977785110474, + "learning_rate": 8.109107792943577e-07, + "loss": 0.3721, "step": 127840 }, { - "epoch": 4.5, - "learning_rate": 1.3186130878944296e-06, - "loss": 0.2461, + "epoch": 4.607525137852741, + "grad_norm": 0.2593880295753479, + "learning_rate": 8.101737427732098e-07, + "loss": 0.3698, "step": 127845 }, { - "epoch": 4.5, - "learning_rate": 1.3177002850698006e-06, - "loss": 0.2506, + "epoch": 4.607705337513965, + "grad_norm": 0.2483491450548172, + "learning_rate": 8.0943703583376e-07, + "loss": 0.3611, "step": 127850 }, { - "epoch": 4.5, - "learning_rate": 1.3167877897442227e-06, - "loss": 0.2423, + "epoch": 4.60788553717519, + "grad_norm": 0.28499743342399597, + "learning_rate": 8.087006584860501e-07, + "loss": 0.3617, "step": 127855 }, { - "epoch": 4.5, - "learning_rate": 1.315875601929553e-06, - "loss": 0.2454, + 
"epoch": 4.608065736836415, + "grad_norm": 0.29936861991882324, + "learning_rate": 8.079646107401085e-07, + "loss": 0.3898, "step": 127860 }, { - "epoch": 4.5, - "learning_rate": 1.3149637216376243e-06, - "loss": 0.256, + "epoch": 4.60824593649764, + "grad_norm": 0.25267350673675537, + "learning_rate": 8.072288926059601e-07, + "loss": 0.3722, "step": 127865 }, { - "epoch": 4.5, - "learning_rate": 1.3140521488802876e-06, - "loss": 0.2749, + "epoch": 4.608426136158864, + "grad_norm": 0.23155082762241364, + "learning_rate": 8.064935040936417e-07, + "loss": 0.3845, "step": 127870 }, { - "epoch": 4.5, - "learning_rate": 1.3131408836693726e-06, - "loss": 0.2542, + "epoch": 4.608606335820088, + "grad_norm": 0.21807222068309784, + "learning_rate": 8.057584452131644e-07, + "loss": 0.3625, "step": 127875 }, { - "epoch": 4.5, - "learning_rate": 1.3122299260167143e-06, - "loss": 0.2321, + "epoch": 4.608786535481313, + "grad_norm": 0.21520547568798065, + "learning_rate": 8.0502371597454e-07, + "loss": 0.388, "step": 127880 }, { - "epoch": 4.5, - "learning_rate": 1.3113192759341363e-06, - "loss": 0.2417, + "epoch": 4.608966735142538, + "grad_norm": 0.3115110993385315, + "learning_rate": 8.042893163877852e-07, + "loss": 0.3558, "step": 127885 }, { - "epoch": 4.5, - "learning_rate": 1.3104089334334686e-06, - "loss": 0.2423, + "epoch": 4.609146934803762, + "grad_norm": 0.2501071095466614, + "learning_rate": 8.035552464629059e-07, + "loss": 0.3805, "step": 127890 }, { - "epoch": 4.5, - "learning_rate": 1.3094988985265287e-06, - "loss": 0.2395, + "epoch": 4.609327134464987, + "grad_norm": 0.27980107069015503, + "learning_rate": 8.028215062098998e-07, + "loss": 0.35, "step": 127895 }, { - "epoch": 4.5, - "learning_rate": 1.308589171225133e-06, - "loss": 0.2731, + "epoch": 4.609507334126212, + "grad_norm": 0.35373973846435547, + "learning_rate": 8.020880956387672e-07, + "loss": 0.3654, "step": 127900 }, { - "epoch": 4.5, - "learning_rate": 1.307679751541091e-06, - "loss": 0.2482, + "epoch": 4.609687533787437, + "grad_norm": 0.3000072240829468, + "learning_rate": 8.013550147594945e-07, + "loss": 0.3785, "step": 127905 }, { - "epoch": 4.5, - "learning_rate": 1.3067706394862156e-06, - "loss": 0.2195, + "epoch": 4.609867733448661, + "grad_norm": 0.258821576833725, + "learning_rate": 8.006222635820792e-07, + "loss": 0.3589, "step": 127910 }, { - "epoch": 4.5, - "learning_rate": 1.3058618350723085e-06, - "loss": 0.2456, + "epoch": 4.610047933109886, + "grad_norm": 0.2523718774318695, + "learning_rate": 7.998898421165025e-07, + "loss": 0.3906, "step": 127915 }, { - "epoch": 4.5, - "learning_rate": 1.3049533383111685e-06, - "loss": 0.251, + "epoch": 4.610228132771111, + "grad_norm": 0.2900547385215759, + "learning_rate": 7.991577503727366e-07, + "loss": 0.3752, "step": 127920 }, { - "epoch": 4.5, - "learning_rate": 1.3040451492145972e-06, - "loss": 0.2635, + "epoch": 4.6104083324323355, + "grad_norm": 0.23753364384174347, + "learning_rate": 7.984259883607653e-07, + "loss": 0.343, "step": 127925 }, { - "epoch": 4.5, - "learning_rate": 1.3031372677943827e-06, - "loss": 0.2577, + "epoch": 4.610588532093559, + "grad_norm": 0.24349209666252136, + "learning_rate": 7.97694556090553e-07, + "loss": 0.3692, "step": 127930 }, { - "epoch": 4.5, - "learning_rate": 1.3022296940623153e-06, - "loss": 0.2534, + "epoch": 4.610768731754784, + "grad_norm": 0.2928297519683838, + "learning_rate": 7.969634535720639e-07, + "loss": 0.3934, "step": 127935 }, { - "epoch": 4.5, - "learning_rate": 1.3013224280301745e-06, - "loss": 0.2489, + 
"epoch": 4.610948931416009, + "grad_norm": 0.29633548855781555, + "learning_rate": 7.962326808152621e-07, + "loss": 0.3514, "step": 127940 }, { - "epoch": 4.5, - "learning_rate": 1.3004154697097454e-06, - "loss": 0.2611, + "epoch": 4.6111291310772335, + "grad_norm": 0.26136893033981323, + "learning_rate": 7.955022378301064e-07, + "loss": 0.3772, "step": 127945 }, { - "epoch": 4.5, - "learning_rate": 1.2995088191128046e-06, - "loss": 0.2496, + "epoch": 4.611309330738458, + "grad_norm": 0.24420562386512756, + "learning_rate": 7.947721246265472e-07, + "loss": 0.352, "step": 127950 }, { - "epoch": 4.5, - "learning_rate": 1.2986024762511235e-06, - "loss": 0.2583, + "epoch": 4.611489530399683, + "grad_norm": 0.24144157767295837, + "learning_rate": 7.940423412145292e-07, + "loss": 0.4035, "step": 127955 }, { - "epoch": 4.5, - "learning_rate": 1.2976964411364644e-06, - "loss": 0.2547, + "epoch": 4.611669730060908, + "grad_norm": 0.19324125349521637, + "learning_rate": 7.933128876039974e-07, + "loss": 0.3594, "step": 127960 }, { - "epoch": 4.5, - "learning_rate": 1.296790713780599e-06, - "loss": 0.2385, + "epoch": 4.6118499297221325, + "grad_norm": 0.2407853752374649, + "learning_rate": 7.925837638048967e-07, + "loss": 0.394, "step": 127965 }, { - "epoch": 4.5, - "learning_rate": 1.2958852941952843e-06, - "loss": 0.2369, + "epoch": 4.612030129383356, + "grad_norm": 0.2682248055934906, + "learning_rate": 7.918549698271494e-07, + "loss": 0.36, "step": 127970 }, { - "epoch": 4.5, - "learning_rate": 1.2949801823922802e-06, - "loss": 0.2403, + "epoch": 4.612210329044581, + "grad_norm": 0.21902526915073395, + "learning_rate": 7.911265056806921e-07, + "loss": 0.3495, "step": 127975 }, { - "epoch": 4.5, - "learning_rate": 1.2940753783833303e-06, - "loss": 0.2501, + "epoch": 4.612390528705806, + "grad_norm": 0.25772321224212646, + "learning_rate": 7.903983713754503e-07, + "loss": 0.3791, "step": 127980 }, { - "epoch": 4.5, - "learning_rate": 1.2931708821801946e-06, - "loss": 0.2313, + "epoch": 4.6125707283670305, + "grad_norm": 0.23041696846485138, + "learning_rate": 7.896705669213411e-07, + "loss": 0.4039, "step": 127985 }, { - "epoch": 4.5, - "learning_rate": 1.292266693794611e-06, - "loss": 0.2635, + "epoch": 4.612750928028255, + "grad_norm": 0.2209273725748062, + "learning_rate": 7.889430923282898e-07, + "loss": 0.3744, "step": 127990 }, { - "epoch": 4.5, - "learning_rate": 1.2913628132383226e-06, - "loss": 0.2462, + "epoch": 4.61293112768948, + "grad_norm": 0.23536401987075806, + "learning_rate": 7.882159476061968e-07, + "loss": 0.3478, "step": 127995 }, { - "epoch": 4.5, - "learning_rate": 1.2904592405230565e-06, - "loss": 0.2526, + "epoch": 4.613111327350705, + "grad_norm": 0.29315903782844543, + "learning_rate": 7.874891327649709e-07, + "loss": 0.4023, "step": 128000 }, { - "epoch": 4.5, - "eval_loss": 0.24867750704288483, - "eval_runtime": 10.5298, - "eval_samples_per_second": 9.497, - "eval_steps_per_second": 9.497, + "epoch": 4.613111327350705, + "eval_loss": 0.42875730991363525, + "eval_runtime": 3.5339, + "eval_samples_per_second": 28.298, + "eval_steps_per_second": 7.074, "step": 128000 }, { - "epoch": 4.5, - "learning_rate": 1.2895559756605586e-06, - "loss": 0.255, + "epoch": 4.6132915270119295, + "grad_norm": 0.25371137261390686, + "learning_rate": 7.867626478145235e-07, + "loss": 0.3724, "step": 128005 }, { - "epoch": 4.5, - "learning_rate": 1.2886530186625473e-06, - "loss": 0.2401, + "epoch": 4.613471726673154, + "grad_norm": 0.2467198222875595, + "learning_rate": 7.86036492764744e-07, + 
"loss": 0.386, "step": 128010 }, { - "epoch": 4.5, - "learning_rate": 1.2877503695407495e-06, - "loss": 0.2524, + "epoch": 4.613651926334379, + "grad_norm": 0.36411789059638977, + "learning_rate": 7.853106676255328e-07, + "loss": 0.3526, "step": 128015 }, { - "epoch": 4.5, - "learning_rate": 1.286848028306889e-06, - "loss": 0.2278, + "epoch": 4.613832125995603, + "grad_norm": 0.31409749388694763, + "learning_rate": 7.84585172406771e-07, + "loss": 0.4179, "step": 128020 }, { - "epoch": 4.5, - "learning_rate": 1.285945994972676e-06, - "loss": 0.2532, + "epoch": 4.6140123256568275, + "grad_norm": 0.23237422108650208, + "learning_rate": 7.838600071183478e-07, + "loss": 0.3654, "step": 128025 }, { - "epoch": 4.5, - "learning_rate": 1.2850442695498287e-06, - "loss": 0.2428, + "epoch": 4.614192525318052, + "grad_norm": 0.2930370271205902, + "learning_rate": 7.831351717701469e-07, + "loss": 0.3874, "step": 128030 }, { - "epoch": 4.5, - "learning_rate": 1.2841428520500492e-06, - "loss": 0.2495, + "epoch": 4.614372724979277, + "grad_norm": 0.24373428523540497, + "learning_rate": 7.82410666372041e-07, + "loss": 0.3839, "step": 128035 }, { - "epoch": 4.5, - "learning_rate": 1.28324174248505e-06, - "loss": 0.2322, + "epoch": 4.614552924640502, + "grad_norm": 0.31576600670814514, + "learning_rate": 7.816864909339e-07, + "loss": 0.384, "step": 128040 }, { - "epoch": 4.5, - "learning_rate": 1.282340940866525e-06, - "loss": 0.2704, + "epoch": 4.614733124301726, + "grad_norm": 0.2532004714012146, + "learning_rate": 7.809626454655911e-07, + "loss": 0.3523, "step": 128045 }, { - "epoch": 4.51, - "learning_rate": 1.2814404472061752e-06, - "loss": 0.2555, + "epoch": 4.614913323962951, + "grad_norm": 0.2986234724521637, + "learning_rate": 7.802391299769784e-07, + "loss": 0.3987, "step": 128050 }, { - "epoch": 4.51, - "learning_rate": 1.2805402615156837e-06, - "loss": 0.2477, + "epoch": 4.615093523624176, + "grad_norm": 0.24925264716148376, + "learning_rate": 7.795159444779154e-07, + "loss": 0.3647, "step": 128055 }, { - "epoch": 4.51, - "learning_rate": 1.2796403838067517e-06, - "loss": 0.2378, + "epoch": 4.6152737232854, + "grad_norm": 0.19276399910449982, + "learning_rate": 7.787930889782663e-07, + "loss": 0.3356, "step": 128060 }, { - "epoch": 4.51, - "learning_rate": 1.2787408140910535e-06, - "loss": 0.2618, + "epoch": 4.6154539229466245, + "grad_norm": 0.23550952970981598, + "learning_rate": 7.780705634878621e-07, + "loss": 0.4043, "step": 128065 }, { - "epoch": 4.51, - "learning_rate": 1.2778415523802767e-06, - "loss": 0.2622, + "epoch": 4.615634122607849, + "grad_norm": 0.29155659675598145, + "learning_rate": 7.773483680165617e-07, + "loss": 0.3733, "step": 128070 }, { - "epoch": 4.51, - "learning_rate": 1.2769425986860873e-06, - "loss": 0.253, + "epoch": 4.615814322269074, + "grad_norm": 0.2625998854637146, + "learning_rate": 7.766265025742014e-07, + "loss": 0.3544, "step": 128075 }, { - "epoch": 4.51, - "learning_rate": 1.2760439530201645e-06, - "loss": 0.2437, + "epoch": 4.615994521930299, + "grad_norm": 0.26157450675964355, + "learning_rate": 7.759049671706125e-07, + "loss": 0.3456, "step": 128080 }, { - "epoch": 4.51, - "learning_rate": 1.27514561539418e-06, - "loss": 0.2546, + "epoch": 4.616174721591523, + "grad_norm": 0.2234465479850769, + "learning_rate": 7.751837618156316e-07, + "loss": 0.4037, "step": 128085 }, { - "epoch": 4.51, - "learning_rate": 1.2742475858197961e-06, - "loss": 0.2535, + "epoch": 4.616354921252748, + "grad_norm": 0.2910038232803345, + "learning_rate": 7.744628865190784e-07, 
+ "loss": 0.384, "step": 128090 }, { - "epoch": 4.51, - "learning_rate": 1.2733498643086677e-06, - "loss": 0.2389, + "epoch": 4.616535120913973, + "grad_norm": 0.28692278265953064, + "learning_rate": 7.737423412907785e-07, + "loss": 0.3577, "step": 128095 }, { - "epoch": 4.51, - "learning_rate": 1.272452450872458e-06, - "loss": 0.2523, + "epoch": 4.616715320575198, + "grad_norm": 0.23942017555236816, + "learning_rate": 7.730221261405518e-07, + "loss": 0.3879, "step": 128100 }, { - "epoch": 4.51, - "learning_rate": 1.2715553455228151e-06, - "loss": 0.243, + "epoch": 4.616895520236422, + "grad_norm": 0.23349037766456604, + "learning_rate": 7.723022410782044e-07, + "loss": 0.3775, "step": 128105 }, { - "epoch": 4.51, - "learning_rate": 1.2706585482713889e-06, - "loss": 0.2398, + "epoch": 4.617075719897647, + "grad_norm": 0.2587268352508545, + "learning_rate": 7.715826861135505e-07, + "loss": 0.4072, "step": 128110 }, { - "epoch": 4.51, - "learning_rate": 1.2697620591298199e-06, - "loss": 0.2464, + "epoch": 4.617255919558871, + "grad_norm": 0.24212481081485748, + "learning_rate": 7.708634612563936e-07, + "loss": 0.3423, "step": 128115 }, { - "epoch": 4.51, - "learning_rate": 1.268865878109754e-06, - "loss": 0.243, + "epoch": 4.617436119220096, + "grad_norm": 0.2429906278848648, + "learning_rate": 7.701445665165258e-07, + "loss": 0.3702, "step": 128120 }, { - "epoch": 4.51, - "learning_rate": 1.2679700052228265e-06, - "loss": 0.26, + "epoch": 4.61761631888132, + "grad_norm": 0.2274543046951294, + "learning_rate": 7.694260019037502e-07, + "loss": 0.3496, "step": 128125 }, { - "epoch": 4.51, - "learning_rate": 1.2670744404806645e-06, - "loss": 0.2382, + "epoch": 4.617796518542545, + "grad_norm": 0.27825266122817993, + "learning_rate": 7.687077674278564e-07, + "loss": 0.4096, "step": 128130 }, { - "epoch": 4.51, - "learning_rate": 1.2661791838949028e-06, - "loss": 0.2534, + "epoch": 4.61797671820377, + "grad_norm": 0.2594509720802307, + "learning_rate": 7.679898630986254e-07, + "loss": 0.3866, "step": 128135 }, { - "epoch": 4.51, - "learning_rate": 1.2652842354771627e-06, - "loss": 0.2408, + "epoch": 4.618156917864995, + "grad_norm": 0.2519370913505554, + "learning_rate": 7.67272288925841e-07, + "loss": 0.3612, "step": 128140 }, { - "epoch": 4.51, - "learning_rate": 1.2643895952390682e-06, - "loss": 0.2671, + "epoch": 4.618337117526219, + "grad_norm": 0.26362189650535583, + "learning_rate": 7.665550449192788e-07, + "loss": 0.3569, "step": 128145 }, { - "epoch": 4.51, - "learning_rate": 1.263495263192227e-06, - "loss": 0.2214, + "epoch": 4.618517317187444, + "grad_norm": 0.20409050583839417, + "learning_rate": 7.658381310887142e-07, + "loss": 0.4079, "step": 128150 }, { - "epoch": 4.51, - "learning_rate": 1.26260123934826e-06, - "loss": 0.2392, + "epoch": 4.618697516848668, + "grad_norm": 0.24394989013671875, + "learning_rate": 7.651215474439172e-07, + "loss": 0.3567, "step": 128155 }, { - "epoch": 4.51, - "learning_rate": 1.2617075237187748e-06, - "loss": 0.2451, + "epoch": 4.618877716509893, + "grad_norm": 0.2227751463651657, + "learning_rate": 7.644052939946411e-07, + "loss": 0.4044, "step": 128160 }, { - "epoch": 4.51, - "learning_rate": 1.260814116315373e-06, - "loss": 0.2509, + "epoch": 4.619057916171117, + "grad_norm": 0.2579880952835083, + "learning_rate": 7.636893707506532e-07, + "loss": 0.3804, "step": 128165 }, { - "epoch": 4.51, - "learning_rate": 1.2599210171496484e-06, - "loss": 0.2544, + "epoch": 4.619238115832342, + "grad_norm": 0.2594741880893707, + "learning_rate": 
7.629737777217011e-07, + "loss": 0.3805, "step": 128170 }, { - "epoch": 4.51, - "learning_rate": 1.2590282262332109e-06, - "loss": 0.2544, + "epoch": 4.619418315493567, + "grad_norm": 0.2516389787197113, + "learning_rate": 7.622585149175465e-07, + "loss": 0.3612, "step": 128175 }, { - "epoch": 4.51, - "learning_rate": 1.2581357435776458e-06, - "loss": 0.2139, + "epoch": 4.619598515154792, + "grad_norm": 0.24397249519824982, + "learning_rate": 7.615435823479233e-07, + "loss": 0.3967, "step": 128180 }, { - "epoch": 4.51, - "learning_rate": 1.2572435691945355e-06, - "loss": 0.2569, + "epoch": 4.619778714816016, + "grad_norm": 0.2816333472728729, + "learning_rate": 7.608289800225737e-07, + "loss": 0.4114, "step": 128185 }, { - "epoch": 4.51, - "learning_rate": 1.2563517030954735e-06, - "loss": 0.2502, + "epoch": 4.619958914477241, + "grad_norm": 0.19909071922302246, + "learning_rate": 7.601147079512399e-07, + "loss": 0.3857, "step": 128190 }, { - "epoch": 4.51, - "learning_rate": 1.2554601452920394e-06, - "loss": 0.2751, + "epoch": 4.620139114138466, + "grad_norm": 0.2591770589351654, + "learning_rate": 7.594007661436475e-07, + "loss": 0.3494, "step": 128195 }, { - "epoch": 4.51, - "learning_rate": 1.2545688957958046e-06, - "loss": 0.2483, + "epoch": 4.6203193137996905, + "grad_norm": 0.2472904771566391, + "learning_rate": 7.586871546095303e-07, + "loss": 0.3945, "step": 128200 }, { - "epoch": 4.51, - "learning_rate": 1.2536779546183485e-06, - "loss": 0.2366, + "epoch": 4.620499513460914, + "grad_norm": 0.32332003116607666, + "learning_rate": 7.579738733586056e-07, + "loss": 0.3713, "step": 128205 }, { - "epoch": 4.51, - "learning_rate": 1.2527873217712287e-06, - "loss": 0.2486, + "epoch": 4.620679713122139, + "grad_norm": 0.25053974986076355, + "learning_rate": 7.572609224005905e-07, + "loss": 0.3696, "step": 128210 }, { - "epoch": 4.51, - "learning_rate": 1.2518969972660194e-06, - "loss": 0.2633, + "epoch": 4.620859912783364, + "grad_norm": 0.2518098056316376, + "learning_rate": 7.565483017452024e-07, + "loss": 0.3672, "step": 128215 }, { - "epoch": 4.51, - "learning_rate": 1.2510069811142749e-06, - "loss": 0.2338, + "epoch": 4.621040112444589, + "grad_norm": 0.25849613547325134, + "learning_rate": 7.558360114021528e-07, + "loss": 0.3614, "step": 128220 }, { - "epoch": 4.51, - "learning_rate": 1.2501172733275557e-06, - "loss": 0.2527, + "epoch": 4.621220312105813, + "grad_norm": 0.19711540639400482, + "learning_rate": 7.551240513811425e-07, + "loss": 0.3363, "step": 128225 }, { - "epoch": 4.51, - "learning_rate": 1.2492278739174079e-06, - "loss": 0.2317, + "epoch": 4.621400511767038, + "grad_norm": 0.28743451833724976, + "learning_rate": 7.544124216918746e-07, + "loss": 0.3525, "step": 128230 }, { - "epoch": 4.51, - "learning_rate": 1.248338782895389e-06, - "loss": 0.2424, + "epoch": 4.621580711428263, + "grad_norm": 0.2442832887172699, + "learning_rate": 7.537011223440415e-07, + "loss": 0.3821, "step": 128235 }, { - "epoch": 4.51, - "learning_rate": 1.2474500002730316e-06, - "loss": 0.2638, + "epoch": 4.6217609110894875, + "grad_norm": 0.23622441291809082, + "learning_rate": 7.529901533473355e-07, + "loss": 0.3843, "step": 128240 }, { - "epoch": 4.51, - "learning_rate": 1.2465615260618874e-06, - "loss": 0.2505, + "epoch": 4.621941110750711, + "grad_norm": 0.274114191532135, + "learning_rate": 7.522795147114486e-07, + "loss": 0.364, "step": 128245 }, { - "epoch": 4.51, - "learning_rate": 1.245673360273486e-06, - "loss": 0.2524, + "epoch": 4.622121310411936, + "grad_norm": 
0.28449776768684387, + "learning_rate": 7.515692064460539e-07, + "loss": 0.3848, "step": 128250 }, { - "epoch": 4.51, - "learning_rate": 1.2447855029193628e-06, - "loss": 0.2336, + "epoch": 4.622301510073161, + "grad_norm": 0.2581150531768799, + "learning_rate": 7.508592285608351e-07, + "loss": 0.3404, "step": 128255 }, { - "epoch": 4.51, - "learning_rate": 1.2438979540110445e-06, - "loss": 0.2408, + "epoch": 4.6224817097343855, + "grad_norm": 0.25341930985450745, + "learning_rate": 7.501495810654679e-07, + "loss": 0.3227, "step": 128260 }, { - "epoch": 4.51, - "learning_rate": 1.2430107135600527e-06, - "loss": 0.249, + "epoch": 4.62266190939561, + "grad_norm": 0.24932630360126495, + "learning_rate": 7.494402639696113e-07, + "loss": 0.3482, "step": 128265 }, { - "epoch": 4.51, - "learning_rate": 1.2421237815779112e-06, - "loss": 0.2256, + "epoch": 4.622842109056835, + "grad_norm": 0.2252834588289261, + "learning_rate": 7.487312772829436e-07, + "loss": 0.356, "step": 128270 }, { - "epoch": 4.51, - "learning_rate": 1.241237158076139e-06, - "loss": 0.2579, + "epoch": 4.62302230871806, + "grad_norm": 0.23649568855762482, + "learning_rate": 7.480226210151098e-07, + "loss": 0.3956, "step": 128275 }, { - "epoch": 4.51, - "learning_rate": 1.240350843066243e-06, - "loss": 0.2842, + "epoch": 4.6232025083792845, + "grad_norm": 0.20822374522686005, + "learning_rate": 7.473142951757772e-07, + "loss": 0.3501, "step": 128280 }, { - "epoch": 4.51, - "learning_rate": 1.2394648365597283e-06, - "loss": 0.2446, + "epoch": 4.623382708040509, + "grad_norm": 0.2828695476055145, + "learning_rate": 7.466062997745909e-07, + "loss": 0.356, "step": 128285 }, { - "epoch": 4.51, - "learning_rate": 1.2385791385681105e-06, - "loss": 0.2459, + "epoch": 4.623562907701734, + "grad_norm": 0.2521428167819977, + "learning_rate": 7.458986348211988e-07, + "loss": 0.4002, "step": 128290 }, { - "epoch": 4.51, - "learning_rate": 1.2376937491028778e-06, - "loss": 0.2724, + "epoch": 4.623743107362958, + "grad_norm": 0.25071465969085693, + "learning_rate": 7.45191300325243e-07, + "loss": 0.4166, "step": 128295 }, { - "epoch": 4.51, - "learning_rate": 1.2368086681755347e-06, - "loss": 0.2479, + "epoch": 4.6239233070241825, + "grad_norm": 0.2567957937717438, + "learning_rate": 7.444842962963577e-07, + "loss": 0.3731, "step": 128300 }, { - "epoch": 4.51, - "learning_rate": 1.2359238957975695e-06, - "loss": 0.2443, + "epoch": 4.624103506685407, + "grad_norm": 0.2071949541568756, + "learning_rate": 7.437776227441767e-07, + "loss": 0.3451, "step": 128305 }, { - "epoch": 4.51, - "learning_rate": 1.2350394319804726e-06, - "loss": 0.2539, + "epoch": 4.624283706346632, + "grad_norm": 0.2758256793022156, + "learning_rate": 7.430712796783312e-07, + "loss": 0.3545, "step": 128310 }, { - "epoch": 4.51, - "learning_rate": 1.2341552767357296e-06, - "loss": 0.2659, + "epoch": 4.624463906007857, + "grad_norm": 0.26326316595077515, + "learning_rate": 7.423652671084441e-07, + "loss": 0.3581, "step": 128315 }, { - "epoch": 4.51, - "learning_rate": 1.2332714300748144e-06, - "loss": 0.267, + "epoch": 4.6246441056690815, + "grad_norm": 0.28544881939888, + "learning_rate": 7.416595850441327e-07, + "loss": 0.3611, "step": 128320 }, { - "epoch": 4.51, - "learning_rate": 1.232387892009207e-06, - "loss": 0.283, + "epoch": 4.624824305330306, + "grad_norm": 0.2547943592071533, + "learning_rate": 7.409542334950143e-07, + "loss": 0.3622, "step": 128325 }, { - "epoch": 4.52, - "learning_rate": 1.2315046625503785e-06, - "loss": 0.2495, + "epoch": 4.625004504991531, + 
"grad_norm": 0.24495618045330048, + "learning_rate": 7.402492124706922e-07, + "loss": 0.3712, "step": 128330 }, { - "epoch": 4.52, - "learning_rate": 1.2306217417098004e-06, - "loss": 0.2439, + "epoch": 4.625184704652755, + "grad_norm": 0.33649563789367676, + "learning_rate": 7.395445219807839e-07, + "loss": 0.3987, "step": 128335 }, { - "epoch": 4.52, - "learning_rate": 1.2297391294989357e-06, - "loss": 0.2537, + "epoch": 4.6253649043139795, + "grad_norm": 0.2949081361293793, + "learning_rate": 7.388401620348844e-07, + "loss": 0.3823, "step": 128340 }, { - "epoch": 4.52, - "learning_rate": 1.2288568259292394e-06, - "loss": 0.2471, + "epoch": 4.625545103975204, + "grad_norm": 0.23140224814414978, + "learning_rate": 7.381361326425862e-07, + "loss": 0.3957, "step": 128345 }, { - "epoch": 4.52, - "learning_rate": 1.2279748310121742e-06, - "loss": 0.246, + "epoch": 4.625725303636429, + "grad_norm": 0.2955685257911682, + "learning_rate": 7.374324338134897e-07, + "loss": 0.3733, "step": 128350 }, { - "epoch": 4.52, - "learning_rate": 1.2270931447591838e-06, - "loss": 0.2501, + "epoch": 4.625905503297654, + "grad_norm": 0.2494993358850479, + "learning_rate": 7.367290655571762e-07, + "loss": 0.3525, "step": 128355 }, { - "epoch": 4.52, - "learning_rate": 1.2262117671817286e-06, - "loss": 0.2576, + "epoch": 4.626085702958878, + "grad_norm": 0.21933454275131226, + "learning_rate": 7.360260278832353e-07, + "loss": 0.3365, "step": 128360 }, { - "epoch": 4.52, - "learning_rate": 1.225330698291241e-06, - "loss": 0.2712, + "epoch": 4.626265902620103, + "grad_norm": 0.26655271649360657, + "learning_rate": 7.3532332080124e-07, + "loss": 0.3693, "step": 128365 }, { - "epoch": 4.52, - "learning_rate": 1.2244499380991704e-06, - "loss": 0.2883, + "epoch": 4.626446102281328, + "grad_norm": 0.25093698501586914, + "learning_rate": 7.346209443207657e-07, + "loss": 0.364, "step": 128370 }, { - "epoch": 4.52, - "learning_rate": 1.223569486616949e-06, - "loss": 0.2656, + "epoch": 4.626626301942553, + "grad_norm": 0.2572937309741974, + "learning_rate": 7.339188984513828e-07, + "loss": 0.369, "step": 128375 }, { - "epoch": 4.52, - "learning_rate": 1.2226893438560038e-06, - "loss": 0.2379, + "epoch": 4.626806501603777, + "grad_norm": 0.2407890409231186, + "learning_rate": 7.332171832026586e-07, + "loss": 0.3885, "step": 128380 }, { - "epoch": 4.52, - "learning_rate": 1.2218095098277733e-06, - "loss": 0.2576, + "epoch": 4.626986701265002, + "grad_norm": 0.2307957410812378, + "learning_rate": 7.325157985841518e-07, + "loss": 0.3407, "step": 128385 }, { - "epoch": 4.52, - "learning_rate": 1.2209299845436727e-06, - "loss": 0.2686, + "epoch": 4.627166900926226, + "grad_norm": 0.3046651780605316, + "learning_rate": 7.318147446054191e-07, + "loss": 0.398, "step": 128390 }, { - "epoch": 4.52, - "learning_rate": 1.2200507680151268e-06, - "loss": 0.2639, + "epoch": 4.627347100587451, + "grad_norm": 0.2512384355068207, + "learning_rate": 7.311140212760109e-07, + "loss": 0.3883, "step": 128395 }, { - "epoch": 4.52, - "learning_rate": 1.2191718602535484e-06, - "loss": 0.2408, + "epoch": 4.627527300248675, + "grad_norm": 0.33214032649993896, + "learning_rate": 7.304136286054753e-07, + "loss": 0.3687, "step": 128400 }, { - "epoch": 4.52, - "learning_rate": 1.2182932612703507e-06, - "loss": 0.2649, + "epoch": 4.6277074999099, + "grad_norm": 0.22773124277591705, + "learning_rate": 7.297135666033572e-07, + "loss": 0.382, "step": 128405 }, { - "epoch": 4.52, - "learning_rate": 1.217414971076941e-06, - "loss": 0.2392, + "epoch": 
4.627887699571125, + "grad_norm": 0.28903457522392273, + "learning_rate": 7.290138352791936e-07, + "loss": 0.365, "step": 128410 }, { - "epoch": 4.52, - "learning_rate": 1.2165369896847302e-06, - "loss": 0.2655, + "epoch": 4.62806789923235, + "grad_norm": 0.26188942790031433, + "learning_rate": 7.283144346425158e-07, + "loss": 0.3868, "step": 128415 }, { - "epoch": 4.52, - "learning_rate": 1.2156593171051055e-06, - "loss": 0.2692, + "epoch": 4.628248098893574, + "grad_norm": 0.2626602053642273, + "learning_rate": 7.27615364702855e-07, + "loss": 0.3717, "step": 128420 }, { - "epoch": 4.52, - "learning_rate": 1.2147819533494753e-06, - "loss": 0.2493, + "epoch": 4.628428298554799, + "grad_norm": 0.21673083305358887, + "learning_rate": 7.269166254697313e-07, + "loss": 0.405, "step": 128425 }, { - "epoch": 4.52, - "learning_rate": 1.2139048984292273e-06, - "loss": 0.2598, + "epoch": 4.628608498216023, + "grad_norm": 0.2583613991737366, + "learning_rate": 7.26218216952676e-07, + "loss": 0.372, "step": 128430 }, { - "epoch": 4.52, - "learning_rate": 1.2130281523557468e-06, - "loss": 0.2561, + "epoch": 4.628788697877248, + "grad_norm": 0.2558477818965912, + "learning_rate": 7.255201391611954e-07, + "loss": 0.3432, "step": 128435 }, { - "epoch": 4.52, - "learning_rate": 1.2121517151404193e-06, - "loss": 0.2514, + "epoch": 4.628968897538472, + "grad_norm": 0.23397044837474823, + "learning_rate": 7.248223921048014e-07, + "loss": 0.3572, "step": 128440 }, { - "epoch": 4.52, - "learning_rate": 1.2112755867946246e-06, - "loss": 0.2554, + "epoch": 4.629149097199697, + "grad_norm": 0.23054839670658112, + "learning_rate": 7.241249757930057e-07, + "loss": 0.352, "step": 128445 }, { - "epoch": 4.52, - "learning_rate": 1.2103997673297423e-06, - "loss": 0.2531, + "epoch": 4.629329296860922, + "grad_norm": 0.20861117541790009, + "learning_rate": 7.234278902353036e-07, + "loss": 0.3908, "step": 128450 }, { - "epoch": 4.52, - "learning_rate": 1.2095242567571358e-06, - "loss": 0.2466, + "epoch": 4.629509496522147, + "grad_norm": 0.25009605288505554, + "learning_rate": 7.227311354412014e-07, + "loss": 0.3717, "step": 128455 }, { - "epoch": 4.52, - "learning_rate": 1.2086490550881818e-06, - "loss": 0.2515, + "epoch": 4.629689696183371, + "grad_norm": 0.20757490396499634, + "learning_rate": 7.220347114201803e-07, + "loss": 0.3404, "step": 128460 }, { - "epoch": 4.52, - "learning_rate": 1.2077741623342354e-06, - "loss": 0.2532, + "epoch": 4.629869895844596, + "grad_norm": 0.21684591472148895, + "learning_rate": 7.213386181817411e-07, + "loss": 0.3976, "step": 128465 }, { - "epoch": 4.52, - "learning_rate": 1.2068995785066678e-06, - "loss": 0.282, + "epoch": 4.630050095505821, + "grad_norm": 0.2429099678993225, + "learning_rate": 7.206428557353623e-07, + "loss": 0.3773, "step": 128470 }, { - "epoch": 4.52, - "learning_rate": 1.2060253036168284e-06, - "loss": 0.2421, + "epoch": 4.6302302951670455, + "grad_norm": 0.22892560064792633, + "learning_rate": 7.199474240905224e-07, + "loss": 0.3536, "step": 128475 }, { - "epoch": 4.52, - "learning_rate": 1.2051513376760664e-06, - "loss": 0.2655, + "epoch": 4.630410494828269, + "grad_norm": 0.2792302072048187, + "learning_rate": 7.192523232567e-07, + "loss": 0.3563, "step": 128480 }, { - "epoch": 4.52, - "learning_rate": 1.2042776806957368e-06, - "loss": 0.2595, + "epoch": 4.630590694489494, + "grad_norm": 0.3034243881702423, + "learning_rate": 7.185575532433625e-07, + "loss": 0.3868, "step": 128485 }, { - "epoch": 4.52, - "learning_rate": 1.2034043326871775e-06, - "loss": 
0.2237, + "epoch": 4.630770894150719, + "grad_norm": 0.2696426212787628, + "learning_rate": 7.178631140599746e-07, + "loss": 0.3712, "step": 128490 }, { - "epoch": 4.52, - "learning_rate": 1.2025312936617295e-06, - "loss": 0.2359, + "epoch": 4.630951093811944, + "grad_norm": 0.2592863440513611, + "learning_rate": 7.171690057160035e-07, + "loss": 0.3662, "step": 128495 }, { - "epoch": 4.52, - "learning_rate": 1.201658563630731e-06, - "loss": 0.2484, + "epoch": 4.631131293473168, + "grad_norm": 0.27280524373054504, + "learning_rate": 7.164752282209031e-07, + "loss": 0.381, "step": 128500 }, { - "epoch": 4.52, - "eval_loss": 0.24868270754814148, - "eval_runtime": 10.5541, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, + "epoch": 4.631131293473168, + "eval_loss": 0.42883244156837463, + "eval_runtime": 3.5159, + "eval_samples_per_second": 28.443, + "eval_steps_per_second": 7.111, "step": 128500 }, { - "epoch": 4.52, - "learning_rate": 1.2007861426055146e-06, - "loss": 0.2519, + "epoch": 4.631311493134393, + "grad_norm": 0.27146467566490173, + "learning_rate": 7.157817815841267e-07, + "loss": 0.3605, "step": 128505 }, { - "epoch": 4.52, - "learning_rate": 1.1999140305974043e-06, - "loss": 0.243, + "epoch": 4.631491692795618, + "grad_norm": 0.2853062152862549, + "learning_rate": 7.150886658151224e-07, + "loss": 0.3779, "step": 128510 }, { - "epoch": 4.52, - "learning_rate": 1.1990422276177221e-06, - "loss": 0.2495, + "epoch": 4.6316718924568425, + "grad_norm": 0.28604602813720703, + "learning_rate": 7.143958809233297e-07, + "loss": 0.3426, "step": 128515 }, { - "epoch": 4.52, - "learning_rate": 1.1981707336777893e-06, - "loss": 0.2229, + "epoch": 4.631852092118066, + "grad_norm": 0.230341836810112, + "learning_rate": 7.137034269181969e-07, + "loss": 0.3912, "step": 128520 }, { - "epoch": 4.52, - "learning_rate": 1.1972995487889299e-06, - "loss": 0.255, + "epoch": 4.632032291779291, + "grad_norm": 0.24526306986808777, + "learning_rate": 7.130113038091524e-07, + "loss": 0.386, "step": 128525 }, { - "epoch": 4.52, - "learning_rate": 1.1964286729624492e-06, - "loss": 0.2373, + "epoch": 4.632212491440516, + "grad_norm": 0.2500110864639282, + "learning_rate": 7.12319511605622e-07, + "loss": 0.3485, "step": 128530 }, { - "epoch": 4.52, - "learning_rate": 1.1955581062096488e-06, - "loss": 0.2358, + "epoch": 4.632392691101741, + "grad_norm": 0.23886556923389435, + "learning_rate": 7.116280503170398e-07, + "loss": 0.381, "step": 128535 }, { - "epoch": 4.52, - "learning_rate": 1.194687848541845e-06, - "loss": 0.2546, + "epoch": 4.632572890762965, + "grad_norm": 0.19476771354675293, + "learning_rate": 7.109369199528177e-07, + "loss": 0.3623, "step": 128540 }, { - "epoch": 4.52, - "learning_rate": 1.1938178999703287e-06, - "loss": 0.2521, + "epoch": 4.63275309042419, + "grad_norm": 0.23774556815624237, + "learning_rate": 7.102461205223843e-07, + "loss": 0.3641, "step": 128545 }, { - "epoch": 4.52, - "learning_rate": 1.1929482605064018e-06, - "loss": 0.2572, + "epoch": 4.632933290085415, + "grad_norm": 0.26775142550468445, + "learning_rate": 7.095556520351432e-07, + "loss": 0.3751, "step": 128550 }, { - "epoch": 4.52, - "learning_rate": 1.1920789301613443e-06, - "loss": 0.2352, + "epoch": 4.6331134897466395, + "grad_norm": 0.24131718277931213, + "learning_rate": 7.088655145005008e-07, + "loss": 0.3709, "step": 128555 }, { - "epoch": 4.52, - "learning_rate": 1.1912099089464584e-06, - "loss": 0.2382, + "epoch": 4.633293689407864, + "grad_norm": 0.2844558358192444, + "learning_rate": 
7.081757079278662e-07, + "loss": 0.3807, "step": 128560 }, { - "epoch": 4.52, - "learning_rate": 1.190341196873021e-06, - "loss": 0.2519, + "epoch": 4.633473889069089, + "grad_norm": 0.303622305393219, + "learning_rate": 7.074862323266318e-07, + "loss": 0.3549, "step": 128565 }, { - "epoch": 4.52, - "learning_rate": 1.1894727939523093e-06, - "loss": 0.2644, + "epoch": 4.633654088730313, + "grad_norm": 0.21193251013755798, + "learning_rate": 7.067970877061958e-07, + "loss": 0.3675, "step": 128570 }, { - "epoch": 4.52, - "learning_rate": 1.1886047001956003e-06, - "loss": 0.2342, + "epoch": 4.6338342883915375, + "grad_norm": 0.2827516496181488, + "learning_rate": 7.061082740759478e-07, + "loss": 0.3776, "step": 128575 }, { - "epoch": 4.52, - "learning_rate": 1.1879104477958952e-06, - "loss": 0.2639, + "epoch": 4.634014488052762, + "grad_norm": 0.2597757577896118, + "learning_rate": 7.054197914452664e-07, + "loss": 0.3736, "step": 128580 }, { - "epoch": 4.52, - "learning_rate": 1.187042910562794e-06, - "loss": 0.2323, + "epoch": 4.634194687713987, + "grad_norm": 0.23663856089115143, + "learning_rate": 7.047316398235387e-07, + "loss": 0.3633, "step": 128585 }, { - "epoch": 4.52, - "learning_rate": 1.1861756825252534e-06, - "loss": 0.2592, + "epoch": 4.634374887375212, + "grad_norm": 0.22754615545272827, + "learning_rate": 7.040438192201403e-07, + "loss": 0.3843, "step": 128590 }, { - "epoch": 4.52, - "learning_rate": 1.185308763694526e-06, - "loss": 0.2256, + "epoch": 4.6345550870364365, + "grad_norm": 0.28135913610458374, + "learning_rate": 7.033563296444417e-07, + "loss": 0.354, "step": 128595 }, { - "epoch": 4.52, - "learning_rate": 1.1844421540818662e-06, - "loss": 0.2473, + "epoch": 4.634735286697661, + "grad_norm": 0.31850412487983704, + "learning_rate": 7.026691711058076e-07, + "loss": 0.3579, "step": 128600 }, { - "epoch": 4.52, - "learning_rate": 1.1835758536985347e-06, - "loss": 0.2467, + "epoch": 4.634915486358886, + "grad_norm": 0.24080584943294525, + "learning_rate": 7.019823436136024e-07, + "loss": 0.3839, "step": 128605 }, { - "epoch": 4.52, - "learning_rate": 1.1827098625557698e-06, - "loss": 0.2349, + "epoch": 4.63509568602011, + "grad_norm": 0.2874895930290222, + "learning_rate": 7.012958471771803e-07, + "loss": 0.3642, "step": 128610 }, { - "epoch": 4.53, - "learning_rate": 1.1818441806648262e-06, - "loss": 0.2594, + "epoch": 4.6352758856813345, + "grad_norm": 0.24868187308311462, + "learning_rate": 7.006096818059e-07, + "loss": 0.3436, "step": 128615 }, { - "epoch": 4.53, - "learning_rate": 1.1809788080369393e-06, - "loss": 0.2482, + "epoch": 4.635456085342559, + "grad_norm": 0.24118977785110474, + "learning_rate": 6.999238475091096e-07, + "loss": 0.3891, "step": 128620 }, { - "epoch": 4.53, - "learning_rate": 1.1801137446833422e-06, - "loss": 0.2539, + "epoch": 4.635636285003784, + "grad_norm": 0.202513188123703, + "learning_rate": 6.99238344296152e-07, + "loss": 0.3655, "step": 128625 }, { - "epoch": 4.53, - "learning_rate": 1.1792489906152726e-06, - "loss": 0.2359, + "epoch": 4.635816484665009, + "grad_norm": 0.24123841524124146, + "learning_rate": 6.985531721763666e-07, + "loss": 0.4058, "step": 128630 }, { - "epoch": 4.53, - "learning_rate": 1.1783845458439552e-06, - "loss": 0.2332, + "epoch": 4.6359966843262335, + "grad_norm": 0.2426537424325943, + "learning_rate": 6.978683311590877e-07, + "loss": 0.3598, "step": 128635 }, { - "epoch": 4.53, - "learning_rate": 1.177520410380617e-06, - "loss": 0.2727, + "epoch": 4.636176883987458, + "grad_norm": 0.2862418591976166, + 
"learning_rate": 6.971838212536524e-07, + "loss": 0.3541, "step": 128640 }, { - "epoch": 4.53, - "learning_rate": 1.176656584236474e-06, - "loss": 0.246, + "epoch": 4.636357083648683, + "grad_norm": 0.24865272641181946, + "learning_rate": 6.96499642469381e-07, + "loss": 0.3766, "step": 128645 }, { - "epoch": 4.53, - "learning_rate": 1.175793067422748e-06, - "loss": 0.2481, + "epoch": 4.636537283309908, + "grad_norm": 0.31950682401657104, + "learning_rate": 6.958157948155936e-07, + "loss": 0.347, "step": 128650 }, { - "epoch": 4.53, - "learning_rate": 1.1749298599506464e-06, - "loss": 0.2231, + "epoch": 4.636717482971132, + "grad_norm": 0.2226303666830063, + "learning_rate": 6.951322783016107e-07, + "loss": 0.3515, "step": 128655 }, { - "epoch": 4.53, - "learning_rate": 1.174066961831377e-06, - "loss": 0.2451, + "epoch": 4.636897682632357, + "grad_norm": 0.22242146730422974, + "learning_rate": 6.944490929367469e-07, + "loss": 0.3747, "step": 128660 }, { - "epoch": 4.53, - "learning_rate": 1.1732043730761478e-06, - "loss": 0.2431, + "epoch": 4.637077882293581, + "grad_norm": 0.3434227705001831, + "learning_rate": 6.93766238730309e-07, + "loss": 0.399, "step": 128665 }, { - "epoch": 4.53, - "learning_rate": 1.1723420936961576e-06, - "loss": 0.2217, + "epoch": 4.637258081954806, + "grad_norm": 0.27048400044441223, + "learning_rate": 6.930837156916004e-07, + "loss": 0.3702, "step": 128670 }, { - "epoch": 4.53, - "learning_rate": 1.1714801237026036e-06, - "loss": 0.252, + "epoch": 4.6374382816160304, + "grad_norm": 0.2766757011413574, + "learning_rate": 6.924015238299164e-07, + "loss": 0.373, "step": 128675 }, { - "epoch": 4.53, - "learning_rate": 1.1706184631066737e-06, - "loss": 0.2514, + "epoch": 4.637618481277255, + "grad_norm": 0.22949692606925964, + "learning_rate": 6.91719663154558e-07, + "loss": 0.3402, "step": 128680 }, { - "epoch": 4.53, - "learning_rate": 1.1697571119195643e-06, - "loss": 0.229, + "epoch": 4.63779868093848, + "grad_norm": 0.2594715654850006, + "learning_rate": 6.910381336748123e-07, + "loss": 0.3437, "step": 128685 }, { - "epoch": 4.53, - "learning_rate": 1.1688960701524503e-06, - "loss": 0.2412, + "epoch": 4.637978880599705, + "grad_norm": 0.24005542695522308, + "learning_rate": 6.903569353999661e-07, + "loss": 0.3574, "step": 128690 }, { - "epoch": 4.53, - "learning_rate": 1.1680353378165172e-06, - "loss": 0.2556, + "epoch": 4.638159080260929, + "grad_norm": 0.2983018755912781, + "learning_rate": 6.896760683393011e-07, + "loss": 0.366, "step": 128695 }, { - "epoch": 4.53, - "learning_rate": 1.167174914922936e-06, - "loss": 0.2428, + "epoch": 4.638339279922154, + "grad_norm": 0.2457517385482788, + "learning_rate": 6.889955325020903e-07, + "loss": 0.3647, "step": 128700 }, { - "epoch": 4.53, - "learning_rate": 1.1663148014828873e-06, - "loss": 0.2401, + "epoch": 4.638519479583378, + "grad_norm": 0.27278947830200195, + "learning_rate": 6.883153278976123e-07, + "loss": 0.3514, "step": 128705 }, { - "epoch": 4.53, - "learning_rate": 1.1654549975075314e-06, - "loss": 0.2426, + "epoch": 4.638699679244603, + "grad_norm": 0.3145797848701477, + "learning_rate": 6.876354545351293e-07, + "loss": 0.3964, "step": 128710 }, { - "epoch": 4.53, - "learning_rate": 1.1645955030080313e-06, - "loss": 0.2618, + "epoch": 4.638879878905827, + "grad_norm": 0.26128581166267395, + "learning_rate": 6.869559124239061e-07, + "loss": 0.3822, "step": 128715 }, { - "epoch": 4.53, - "learning_rate": 1.1637363179955535e-06, - "loss": 0.2434, + "epoch": 4.639060078567052, + "grad_norm": 
0.2640693187713623, + "learning_rate": 6.862767015732019e-07, + "loss": 0.3944, "step": 128720 }, { - "epoch": 4.53, - "learning_rate": 1.1628774424812528e-06, - "loss": 0.2437, + "epoch": 4.639240278228277, + "grad_norm": 0.24374765157699585, + "learning_rate": 6.855978219922676e-07, + "loss": 0.3894, "step": 128725 }, { - "epoch": 4.53, - "learning_rate": 1.1620188764762785e-06, - "loss": 0.2514, + "epoch": 4.639420477889502, + "grad_norm": 0.24732322990894318, + "learning_rate": 6.849192736903598e-07, + "loss": 0.3685, "step": 128730 }, { - "epoch": 4.53, - "learning_rate": 1.1611606199917773e-06, - "loss": 0.2378, + "epoch": 4.639600677550726, + "grad_norm": 0.307091623544693, + "learning_rate": 6.84241056676721e-07, + "loss": 0.4219, "step": 128735 }, { - "epoch": 4.53, - "learning_rate": 1.1603026730388962e-06, - "loss": 0.2316, + "epoch": 4.639780877211951, + "grad_norm": 0.2447475790977478, + "learning_rate": 6.835631709605856e-07, + "loss": 0.3844, "step": 128740 }, { - "epoch": 4.53, - "learning_rate": 1.1594450356287727e-06, - "loss": 0.2539, + "epoch": 4.639961076873176, + "grad_norm": 0.21926872432231903, + "learning_rate": 6.828856165511932e-07, + "loss": 0.3567, "step": 128745 }, { - "epoch": 4.53, - "learning_rate": 1.158587707772546e-06, - "loss": 0.2615, + "epoch": 4.640141276534401, + "grad_norm": 0.27030664682388306, + "learning_rate": 6.822083934577811e-07, + "loss": 0.4035, "step": 128750 }, { - "epoch": 4.53, - "learning_rate": 1.1577306894813483e-06, - "loss": 0.2508, + "epoch": 4.640321476195624, + "grad_norm": 0.28935354948043823, + "learning_rate": 6.815315016895696e-07, + "loss": 0.3833, "step": 128755 }, { - "epoch": 4.53, - "learning_rate": 1.1568739807662988e-06, - "loss": 0.2585, + "epoch": 4.640501675856849, + "grad_norm": 0.25917571783065796, + "learning_rate": 6.808549412557819e-07, + "loss": 0.3641, "step": 128760 }, { - "epoch": 4.53, - "learning_rate": 1.1560175816385298e-06, - "loss": 0.2464, + "epoch": 4.640681875518074, + "grad_norm": 0.2617967426776886, + "learning_rate": 6.801787121656355e-07, + "loss": 0.3447, "step": 128765 }, { - "epoch": 4.53, - "learning_rate": 1.155161492109158e-06, - "loss": 0.253, + "epoch": 4.640862075179299, + "grad_norm": 0.23066262900829315, + "learning_rate": 6.795028144283483e-07, + "loss": 0.3868, "step": 128770 }, { - "epoch": 4.53, - "learning_rate": 1.154305712189302e-06, - "loss": 0.2679, + "epoch": 4.641042274840523, + "grad_norm": 0.20748838782310486, + "learning_rate": 6.788272480531266e-07, + "loss": 0.3589, "step": 128775 }, { - "epoch": 4.53, - "learning_rate": 1.1534502418900667e-06, - "loss": 0.2551, + "epoch": 4.641222474501748, + "grad_norm": 0.2675074636936188, + "learning_rate": 6.781520130491742e-07, + "loss": 0.3952, "step": 128780 }, { - "epoch": 4.53, - "learning_rate": 1.152595081222571e-06, - "loss": 0.2369, + "epoch": 4.641402674162973, + "grad_norm": 0.25951436161994934, + "learning_rate": 6.774771094256949e-07, + "loss": 0.3544, "step": 128785 }, { - "epoch": 4.53, - "learning_rate": 1.1517402301979092e-06, - "loss": 0.233, + "epoch": 4.6415828738241975, + "grad_norm": 0.23791882395744324, + "learning_rate": 6.768025371918785e-07, + "loss": 0.379, "step": 128790 }, { - "epoch": 4.53, - "learning_rate": 1.150885688827183e-06, - "loss": 0.2555, + "epoch": 4.641763073485421, + "grad_norm": 0.2497406154870987, + "learning_rate": 6.761282963569149e-07, + "loss": 0.3833, "step": 128795 }, { - "epoch": 4.53, - "learning_rate": 1.1500314571214921e-06, - "loss": 0.2557, + "epoch": 4.641943273146646, 
+ "grad_norm": 0.2554693818092346, + "learning_rate": 6.754543869299996e-07, + "loss": 0.3814, "step": 128800 }, { - "epoch": 4.53, - "learning_rate": 1.149177535091922e-06, - "loss": 0.2527, + "epoch": 4.642123472807871, + "grad_norm": 0.22991681098937988, + "learning_rate": 6.747808089203056e-07, + "loss": 0.3698, "step": 128805 }, { - "epoch": 4.53, - "learning_rate": 1.1483239227495668e-06, - "loss": 0.2333, + "epoch": 4.642303672469096, + "grad_norm": 0.2903478741645813, + "learning_rate": 6.741075623370147e-07, + "loss": 0.3504, "step": 128810 }, { - "epoch": 4.53, - "learning_rate": 1.1474706201055007e-06, - "loss": 0.2304, + "epoch": 4.64248387213032, + "grad_norm": 0.21784360706806183, + "learning_rate": 6.734346471893e-07, + "loss": 0.3714, "step": 128815 }, { - "epoch": 4.53, - "learning_rate": 1.146617627170815e-06, - "loss": 0.2461, + "epoch": 4.642664071791545, + "grad_norm": 0.2957072854042053, + "learning_rate": 6.727620634863235e-07, + "loss": 0.3464, "step": 128820 }, { - "epoch": 4.53, - "learning_rate": 1.1457649439565733e-06, - "loss": 0.2539, + "epoch": 4.64284427145277, + "grad_norm": 0.2421649545431137, + "learning_rate": 6.720898112372614e-07, + "loss": 0.354, "step": 128825 }, { - "epoch": 4.53, - "learning_rate": 1.1449125704738606e-06, - "loss": 0.2542, + "epoch": 4.6430244711139945, + "grad_norm": 0.21480095386505127, + "learning_rate": 6.714178904512619e-07, + "loss": 0.3823, "step": 128830 }, { - "epoch": 4.53, - "learning_rate": 1.1440605067337295e-06, - "loss": 0.2524, + "epoch": 4.643204670775219, + "grad_norm": 0.22311148047447205, + "learning_rate": 6.707463011374815e-07, + "loss": 0.3979, "step": 128835 }, { - "epoch": 4.53, - "learning_rate": 1.1432087527472574e-06, - "loss": 0.2413, + "epoch": 4.643384870436444, + "grad_norm": 0.2693723440170288, + "learning_rate": 6.70075043305074e-07, + "loss": 0.3854, "step": 128840 }, { - "epoch": 4.53, - "learning_rate": 1.1423573085254963e-06, - "loss": 0.2529, + "epoch": 4.643565070097668, + "grad_norm": 0.29857292771339417, + "learning_rate": 6.69404116963182e-07, + "loss": 0.3685, "step": 128845 }, { - "epoch": 4.53, - "learning_rate": 1.1415061740795014e-06, - "loss": 0.2289, + "epoch": 4.643745269758893, + "grad_norm": 0.2625799775123596, + "learning_rate": 6.68733522120954e-07, + "loss": 0.3603, "step": 128850 }, { - "epoch": 4.53, - "learning_rate": 1.1406553494203247e-06, - "loss": 0.2514, + "epoch": 4.643925469420117, + "grad_norm": 0.2390749603509903, + "learning_rate": 6.680632587875185e-07, + "loss": 0.3602, "step": 128855 }, { - "epoch": 4.53, - "learning_rate": 1.1398048345590162e-06, - "loss": 0.255, + "epoch": 4.644105669081342, + "grad_norm": 0.28638026118278503, + "learning_rate": 6.673933269720072e-07, + "loss": 0.3673, "step": 128860 }, { - "epoch": 4.53, - "learning_rate": 1.1389546295066166e-06, - "loss": 0.2527, + "epoch": 4.644285868742567, + "grad_norm": 0.2660246789455414, + "learning_rate": 6.667237266835519e-07, + "loss": 0.3327, "step": 128865 }, { - "epoch": 4.53, - "learning_rate": 1.1381047342741674e-06, - "loss": 0.2329, + "epoch": 4.6444660684037915, + "grad_norm": 0.218661829829216, + "learning_rate": 6.660544579312755e-07, + "loss": 0.3471, "step": 128870 }, { - "epoch": 4.53, - "learning_rate": 1.1372551488726985e-06, - "loss": 0.2546, + "epoch": 4.644646268065016, + "grad_norm": 0.2507161498069763, + "learning_rate": 6.65385520724296e-07, + "loss": 0.3942, "step": 128875 }, { - "epoch": 4.53, - "learning_rate": 1.1364058733132483e-06, - "loss": 0.2448, + "epoch": 
4.644826467726241, + "grad_norm": 0.2658718526363373, + "learning_rate": 6.64716915071728e-07, + "loss": 0.3769, "step": 128880 }, { - "epoch": 4.53, - "learning_rate": 1.1355569076068411e-06, - "loss": 0.2494, + "epoch": 4.645006667387465, + "grad_norm": 0.24641652405261993, + "learning_rate": 6.640486409826785e-07, + "loss": 0.3482, "step": 128885 }, { - "epoch": 4.53, - "learning_rate": 1.134708251764502e-06, - "loss": 0.2349, + "epoch": 4.6451868670486895, + "grad_norm": 0.27895164489746094, + "learning_rate": 6.633806984662566e-07, + "loss": 0.3843, "step": 128890 }, { - "epoch": 4.53, - "learning_rate": 1.1338599057972437e-06, - "loss": 0.245, + "epoch": 4.645367066709914, + "grad_norm": 0.2246672362089157, + "learning_rate": 6.627130875315607e-07, + "loss": 0.3708, "step": 128895 }, { - "epoch": 4.54, - "learning_rate": 1.1330118697160914e-06, - "loss": 0.2504, + "epoch": 4.645547266371139, + "grad_norm": 0.2589907944202423, + "learning_rate": 6.620458081876891e-07, + "loss": 0.364, "step": 128900 }, { - "epoch": 4.54, - "learning_rate": 1.1321641435320495e-06, - "loss": 0.2522, + "epoch": 4.645727466032364, + "grad_norm": 0.31070613861083984, + "learning_rate": 6.613788604437288e-07, + "loss": 0.3812, "step": 128905 }, { - "epoch": 4.54, - "learning_rate": 1.1313167272561238e-06, - "loss": 0.242, + "epoch": 4.6459076656935885, + "grad_norm": 0.24168938398361206, + "learning_rate": 6.607122443087672e-07, + "loss": 0.3644, "step": 128910 }, { - "epoch": 4.54, - "learning_rate": 1.1304696208993215e-06, - "loss": 0.2493, + "epoch": 4.646087865354813, + "grad_norm": 0.24737270176410675, + "learning_rate": 6.60045959791894e-07, + "loss": 0.361, "step": 128915 }, { - "epoch": 4.54, - "learning_rate": 1.1296228244726426e-06, - "loss": 0.2431, + "epoch": 4.646268065016038, + "grad_norm": 0.24914908409118652, + "learning_rate": 6.593800069021827e-07, + "loss": 0.3888, "step": 128920 }, { - "epoch": 4.54, - "learning_rate": 1.1287763379870809e-06, - "loss": 0.2553, + "epoch": 4.646448264677263, + "grad_norm": 0.24401962757110596, + "learning_rate": 6.587143856487011e-07, + "loss": 0.3582, "step": 128925 }, { - "epoch": 4.54, - "learning_rate": 1.127930161453622e-06, - "loss": 0.2363, + "epoch": 4.646628464338487, + "grad_norm": 0.24380113184452057, + "learning_rate": 6.58049096040525e-07, + "loss": 0.3765, "step": 128930 }, { - "epoch": 4.54, - "learning_rate": 1.1270842948832599e-06, - "loss": 0.2522, + "epoch": 4.646808663999712, + "grad_norm": 0.241029292345047, + "learning_rate": 6.573841380867197e-07, + "loss": 0.3522, "step": 128935 }, { - "epoch": 4.54, - "learning_rate": 1.1262387382869778e-06, - "loss": 0.254, + "epoch": 4.646988863660936, + "grad_norm": 0.229360893368721, + "learning_rate": 6.567195117963415e-07, + "loss": 0.36, "step": 128940 }, { - "epoch": 4.54, - "learning_rate": 1.1253934916757496e-06, - "loss": 0.2429, + "epoch": 4.647169063322161, + "grad_norm": 0.2620939016342163, + "learning_rate": 6.560552171784473e-07, + "loss": 0.4135, "step": 128945 }, { - "epoch": 4.54, - "learning_rate": 1.1245485550605534e-06, - "loss": 0.2606, + "epoch": 4.6473492629833855, + "grad_norm": 0.2812032997608185, + "learning_rate": 6.553912542420826e-07, + "loss": 0.3603, "step": 128950 }, { - "epoch": 4.54, - "learning_rate": 1.1237039284523605e-06, - "loss": 0.2483, + "epoch": 4.64752946264461, + "grad_norm": 0.27785706520080566, + "learning_rate": 6.54727622996304e-07, + "loss": 0.3793, "step": 128955 }, { - "epoch": 4.54, - "learning_rate": 1.122859611862137e-06, - "loss": 0.2245, + 
"epoch": 4.647709662305835, + "grad_norm": 0.2647446095943451, + "learning_rate": 6.540643234501487e-07, + "loss": 0.3661, "step": 128960 }, { - "epoch": 4.54, - "learning_rate": 1.1220156053008468e-06, - "loss": 0.2884, + "epoch": 4.64788986196706, + "grad_norm": 0.28079158067703247, + "learning_rate": 6.534013556126511e-07, + "loss": 0.3658, "step": 128965 }, { - "epoch": 4.54, - "learning_rate": 1.1211719087794447e-06, - "loss": 0.2425, + "epoch": 4.648070061628284, + "grad_norm": 0.291545569896698, + "learning_rate": 6.527387194928486e-07, + "loss": 0.3871, "step": 128970 }, { - "epoch": 4.54, - "learning_rate": 1.1203285223088915e-06, - "loss": 0.2316, + "epoch": 4.648250261289509, + "grad_norm": 0.24639180302619934, + "learning_rate": 6.520764150997644e-07, + "loss": 0.3754, "step": 128975 }, { - "epoch": 4.54, - "learning_rate": 1.119485445900137e-06, - "loss": 0.2686, + "epoch": 4.648430460950733, + "grad_norm": 0.1946740746498108, + "learning_rate": 6.514144424424246e-07, + "loss": 0.3622, "step": 128980 }, { - "epoch": 4.54, - "learning_rate": 1.1186426795641192e-06, - "loss": 0.2521, + "epoch": 4.648610660611958, + "grad_norm": 0.24029649794101715, + "learning_rate": 6.507528015298525e-07, + "loss": 0.3713, "step": 128985 }, { - "epoch": 4.54, - "learning_rate": 1.1178002233117934e-06, - "loss": 0.2504, + "epoch": 4.6487908602731824, + "grad_norm": 0.22794067859649658, + "learning_rate": 6.500914923710577e-07, + "loss": 0.3872, "step": 128990 }, { - "epoch": 4.54, - "learning_rate": 1.11695807715409e-06, - "loss": 0.2675, + "epoch": 4.648971059934407, + "grad_norm": 0.2424376904964447, + "learning_rate": 6.494305149750524e-07, + "loss": 0.3783, "step": 128995 }, { - "epoch": 4.54, - "learning_rate": 1.1161162411019471e-06, - "loss": 0.2357, + "epoch": 4.649151259595632, + "grad_norm": 0.27544233202934265, + "learning_rate": 6.487698693508432e-07, + "loss": 0.3789, "step": 129000 }, { - "epoch": 4.54, - "eval_loss": 0.24872316420078278, - "eval_runtime": 10.5448, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, + "epoch": 4.649151259595632, + "eval_loss": 0.4288814663887024, + "eval_runtime": 3.5274, + "eval_samples_per_second": 28.349, + "eval_steps_per_second": 7.087, "step": 129000 }, { - "epoch": 4.54, - "learning_rate": 1.115274715166298e-06, - "loss": 0.2395, + "epoch": 4.649331459256857, + "grad_norm": 0.23391346633434296, + "learning_rate": 6.481095555074257e-07, + "loss": 0.3823, "step": 129005 }, { - "epoch": 4.54, - "learning_rate": 1.1144334993580613e-06, - "loss": 0.2523, + "epoch": 4.649511658918081, + "grad_norm": 0.3378024697303772, + "learning_rate": 6.474495734538038e-07, + "loss": 0.381, "step": 129010 }, { - "epoch": 4.54, - "learning_rate": 1.1135925936881674e-06, - "loss": 0.2347, + "epoch": 4.649691858579306, + "grad_norm": 0.23834291100502014, + "learning_rate": 6.467899231989704e-07, + "loss": 0.3852, "step": 129015 }, { - "epoch": 4.54, - "learning_rate": 1.1127519981675299e-06, - "loss": 0.2594, + "epoch": 4.649872058240531, + "grad_norm": 0.2810116410255432, + "learning_rate": 6.461306047519017e-07, + "loss": 0.3648, "step": 129020 }, { - "epoch": 4.54, - "learning_rate": 1.1119117128070649e-06, - "loss": 0.2511, + "epoch": 4.650052257901756, + "grad_norm": 0.2411430925130844, + "learning_rate": 6.45471618121593e-07, + "loss": 0.3573, "step": 129025 }, { - "epoch": 4.54, - "learning_rate": 1.111071737617686e-06, - "loss": 0.2486, + "epoch": 4.650232457562979, + "grad_norm": 0.22861650586128235, + "learning_rate": 
6.448129633170153e-07, + "loss": 0.3441, "step": 129030 }, { - "epoch": 4.54, - "learning_rate": 1.1102320726102983e-06, - "loss": 0.2397, + "epoch": 4.650412657224204, + "grad_norm": 0.29254698753356934, + "learning_rate": 6.441546403471499e-07, + "loss": 0.3794, "step": 129035 }, { - "epoch": 4.54, - "learning_rate": 1.1093927177957985e-06, - "loss": 0.241, + "epoch": 4.650592856885429, + "grad_norm": 0.2996603548526764, + "learning_rate": 6.434966492209621e-07, + "loss": 0.375, "step": 129040 }, { - "epoch": 4.54, - "learning_rate": 1.1085536731850947e-06, - "loss": 0.2557, + "epoch": 4.650773056546654, + "grad_norm": 0.29424965381622314, + "learning_rate": 6.428389899474113e-07, + "loss": 0.3789, "step": 129045 }, { - "epoch": 4.54, - "learning_rate": 1.10771493878907e-06, - "loss": 0.2746, + "epoch": 4.650953256207878, + "grad_norm": 0.22816717624664307, + "learning_rate": 6.421816625354682e-07, + "loss": 0.3889, "step": 129050 }, { - "epoch": 4.54, - "learning_rate": 1.1068765146186266e-06, - "loss": 0.2543, + "epoch": 4.651133455869103, + "grad_norm": 0.29083213210105896, + "learning_rate": 6.415246669940838e-07, + "loss": 0.3914, "step": 129055 }, { - "epoch": 4.54, - "learning_rate": 1.1060384006846475e-06, - "loss": 0.2526, + "epoch": 4.651313655530328, + "grad_norm": 0.24897891283035278, + "learning_rate": 6.408680033322096e-07, + "loss": 0.382, "step": 129060 }, { - "epoch": 4.54, - "learning_rate": 1.1052005969980073e-06, - "loss": 0.2341, + "epoch": 4.651493855191553, + "grad_norm": 0.2584581971168518, + "learning_rate": 6.402116715587908e-07, + "loss": 0.41, "step": 129065 }, { - "epoch": 4.54, - "learning_rate": 1.1043631035695918e-06, - "loss": 0.2538, + "epoch": 4.651674054852776, + "grad_norm": 0.26626285910606384, + "learning_rate": 6.395556716827705e-07, + "loss": 0.3482, "step": 129070 }, { - "epoch": 4.54, - "learning_rate": 1.1035259204102756e-06, - "loss": 0.2729, + "epoch": 4.651854254514001, + "grad_norm": 0.29574957489967346, + "learning_rate": 6.389000037130916e-07, + "loss": 0.3603, "step": 129075 }, { - "epoch": 4.54, - "learning_rate": 1.1026890475309277e-06, - "loss": 0.2602, + "epoch": 4.652034454175226, + "grad_norm": 0.21474981307983398, + "learning_rate": 6.382446676586828e-07, + "loss": 0.3585, "step": 129080 }, { - "epoch": 4.54, - "learning_rate": 1.1018524849424089e-06, - "loss": 0.2441, + "epoch": 4.652214653836451, + "grad_norm": 0.25126680731773376, + "learning_rate": 6.375896635284734e-07, + "loss": 0.385, "step": 129085 }, { - "epoch": 4.54, - "learning_rate": 1.1010162326555912e-06, - "loss": 0.2472, + "epoch": 4.652394853497675, + "grad_norm": 0.20911748707294464, + "learning_rate": 6.369349913313893e-07, + "loss": 0.3573, "step": 129090 }, { - "epoch": 4.54, - "learning_rate": 1.100180290681327e-06, - "loss": 0.2702, + "epoch": 4.6525750531589, + "grad_norm": 0.24984316527843475, + "learning_rate": 6.362806510763458e-07, + "loss": 0.3599, "step": 129095 }, { - "epoch": 4.54, - "learning_rate": 1.0993446590304658e-06, - "loss": 0.257, + "epoch": 4.652755252820125, + "grad_norm": 0.20926958322525024, + "learning_rate": 6.356266427722663e-07, + "loss": 0.37, "step": 129100 }, { - "epoch": 4.54, - "learning_rate": 1.0985093377138656e-06, - "loss": 0.2181, + "epoch": 4.6529354524813495, + "grad_norm": 0.23557382822036743, + "learning_rate": 6.349729664280546e-07, + "loss": 0.3507, "step": 129105 }, { - "epoch": 4.54, - "learning_rate": 1.0976743267423705e-06, - "loss": 0.2417, + "epoch": 4.653115652142574, + "grad_norm": 0.27930939197540283, + 
"learning_rate": 6.343196220526176e-07, + "loss": 0.3653, "step": 129110 }, { - "epoch": 4.54, - "learning_rate": 1.096839626126825e-06, - "loss": 0.2334, + "epoch": 4.653295851803799, + "grad_norm": 0.20511141419410706, + "learning_rate": 6.336666096548593e-07, + "loss": 0.3598, "step": 129115 }, { - "epoch": 4.54, - "learning_rate": 1.0960052358780615e-06, - "loss": 0.2583, + "epoch": 4.653476051465024, + "grad_norm": 0.2338797003030777, + "learning_rate": 6.330139292436782e-07, + "loss": 0.3693, "step": 129120 }, { - "epoch": 4.54, - "learning_rate": 1.0951711560069161e-06, - "loss": 0.254, + "epoch": 4.653656251126248, + "grad_norm": 0.29057690501213074, + "learning_rate": 6.323615808279587e-07, + "loss": 0.3527, "step": 129125 }, { - "epoch": 4.54, - "learning_rate": 1.0943373865242218e-06, - "loss": 0.2625, + "epoch": 4.653836450787472, + "grad_norm": 0.26193034648895264, + "learning_rate": 6.317095644166021e-07, + "loss": 0.379, "step": 129130 }, { - "epoch": 4.54, - "learning_rate": 1.0935039274408003e-06, - "loss": 0.2485, + "epoch": 4.654016650448697, + "grad_norm": 0.2552226781845093, + "learning_rate": 6.310578800184791e-07, + "loss": 0.3968, "step": 129135 }, { - "epoch": 4.54, - "learning_rate": 1.0926707787674767e-06, - "loss": 0.2437, + "epoch": 4.654196850109922, + "grad_norm": 0.22633424401283264, + "learning_rate": 6.30406527642477e-07, + "loss": 0.3954, "step": 129140 }, { - "epoch": 4.54, - "learning_rate": 1.091837940515064e-06, - "loss": 0.2593, + "epoch": 4.6543770497711465, + "grad_norm": 0.2168351113796234, + "learning_rate": 6.297555072974692e-07, + "loss": 0.4123, "step": 129145 }, { - "epoch": 4.54, - "learning_rate": 1.0910054126943846e-06, - "loss": 0.2597, + "epoch": 4.654557249432371, + "grad_norm": 0.2150443196296692, + "learning_rate": 6.291048189923238e-07, + "loss": 0.3593, "step": 129150 }, { - "epoch": 4.54, - "learning_rate": 1.090173195316238e-06, - "loss": 0.2385, + "epoch": 4.654737449093596, + "grad_norm": 0.2785971462726593, + "learning_rate": 6.284544627359057e-07, + "loss": 0.365, "step": 129155 }, { - "epoch": 4.54, - "learning_rate": 1.0893412883914378e-06, - "loss": 0.2366, + "epoch": 4.654917648754821, + "grad_norm": 0.28096702694892883, + "learning_rate": 6.278044385370774e-07, + "loss": 0.3482, "step": 129160 }, { - "epoch": 4.54, - "learning_rate": 1.0885096919307835e-06, - "loss": 0.2441, + "epoch": 4.655097848416045, + "grad_norm": 0.2804014980792999, + "learning_rate": 6.271547464046928e-07, + "loss": 0.3895, "step": 129165 }, { - "epoch": 4.54, - "learning_rate": 1.0876784059450722e-06, - "loss": 0.2596, + "epoch": 4.655278048077269, + "grad_norm": 0.26343923807144165, + "learning_rate": 6.265053863476089e-07, + "loss": 0.4001, "step": 129170 }, { - "epoch": 4.54, - "learning_rate": 1.086847430445101e-06, - "loss": 0.25, + "epoch": 4.655458247738494, + "grad_norm": 0.23427803814411163, + "learning_rate": 6.258563583746713e-07, + "loss": 0.3745, "step": 129175 }, { - "epoch": 4.54, - "learning_rate": 1.0860167654416525e-06, - "loss": 0.2292, + "epoch": 4.655638447399719, + "grad_norm": 0.24222131073474884, + "learning_rate": 6.252076624947201e-07, + "loss": 0.3622, "step": 129180 }, { - "epoch": 4.55, - "learning_rate": 1.0851864109455184e-06, - "loss": 0.2542, + "epoch": 4.6558186470609435, + "grad_norm": 0.25545430183410645, + "learning_rate": 6.245592987165955e-07, + "loss": 0.388, "step": 129185 }, { - "epoch": 4.55, - "learning_rate": 1.084356366967479e-06, - "loss": 0.2411, + "epoch": 4.655998846722168, + "grad_norm": 
0.30208820104599, + "learning_rate": 6.239112670491293e-07, + "loss": 0.4266, "step": 129190 }, { - "epoch": 4.55, - "learning_rate": 1.0835266335183113e-06, - "loss": 0.257, + "epoch": 4.656179046383393, + "grad_norm": 0.26093167066574097, + "learning_rate": 6.232635675011562e-07, + "loss": 0.3532, "step": 129195 }, { - "epoch": 4.55, - "learning_rate": 1.082697210608785e-06, - "loss": 0.2618, + "epoch": 4.656359246044618, + "grad_norm": 0.3261514902114868, + "learning_rate": 6.226162000814967e-07, + "loss": 0.3773, "step": 129200 }, { - "epoch": 4.55, - "learning_rate": 1.0818680982496749e-06, - "loss": 0.2573, + "epoch": 4.656539445705842, + "grad_norm": 0.22967924177646637, + "learning_rate": 6.219691647989689e-07, + "loss": 0.3761, "step": 129205 }, { - "epoch": 4.55, - "learning_rate": 1.081039296451744e-06, - "loss": 0.2352, + "epoch": 4.656719645367067, + "grad_norm": 0.23772135376930237, + "learning_rate": 6.213224616623964e-07, + "loss": 0.3674, "step": 129210 }, { - "epoch": 4.55, - "learning_rate": 1.0802108052257592e-06, - "loss": 0.2664, + "epoch": 4.656899845028291, + "grad_norm": 0.24498851597309113, + "learning_rate": 6.206760906805803e-07, + "loss": 0.3609, "step": 129215 }, { - "epoch": 4.55, - "learning_rate": 1.0793826245824674e-06, - "loss": 0.2425, + "epoch": 4.657080044689516, + "grad_norm": 0.22295355796813965, + "learning_rate": 6.200300518623387e-07, + "loss": 0.3527, "step": 129220 }, { - "epoch": 4.55, - "learning_rate": 1.0785547545326318e-06, - "loss": 0.2493, + "epoch": 4.6572602443507405, + "grad_norm": 0.2808474600315094, + "learning_rate": 6.193843452164616e-07, + "loss": 0.3905, "step": 129225 }, { - "epoch": 4.55, - "learning_rate": 1.0777271950869972e-06, - "loss": 0.245, + "epoch": 4.657440444011965, + "grad_norm": 0.2916402816772461, + "learning_rate": 6.187389707517532e-07, + "loss": 0.385, "step": 129230 }, { - "epoch": 4.55, - "learning_rate": 1.0768999462563128e-06, - "loss": 0.2364, + "epoch": 4.65762064367319, + "grad_norm": 0.26787781715393066, + "learning_rate": 6.180939284770093e-07, + "loss": 0.3638, "step": 129235 }, { - "epoch": 4.55, - "learning_rate": 1.0760730080513093e-06, - "loss": 0.2364, + "epoch": 4.657800843334415, + "grad_norm": 0.29607486724853516, + "learning_rate": 6.174492184010117e-07, + "loss": 0.3698, "step": 129240 }, { - "epoch": 4.55, - "learning_rate": 1.0752463804827361e-06, - "loss": 0.2683, + "epoch": 4.657981042995639, + "grad_norm": 0.2748851478099823, + "learning_rate": 6.168048405325505e-07, + "loss": 0.3977, "step": 129245 }, { - "epoch": 4.55, - "learning_rate": 1.074420063561324e-06, - "loss": 0.2563, + "epoch": 4.658161242656864, + "grad_norm": 0.2305481731891632, + "learning_rate": 6.161607948804021e-07, + "loss": 0.3268, "step": 129250 }, { - "epoch": 4.55, - "learning_rate": 1.073594057297797e-06, - "loss": 0.2401, + "epoch": 4.658341442318088, + "grad_norm": 0.20638981461524963, + "learning_rate": 6.155170814533401e-07, + "loss": 0.3884, "step": 129255 }, { - "epoch": 4.55, - "learning_rate": 1.0727683617028805e-06, - "loss": 0.2371, + "epoch": 4.658521641979313, + "grad_norm": 0.25829213857650757, + "learning_rate": 6.148737002601379e-07, + "loss": 0.4005, "step": 129260 }, { - "epoch": 4.55, - "learning_rate": 1.071942976787299e-06, - "loss": 0.2316, + "epoch": 4.6587018416405375, + "grad_norm": 0.28095561265945435, + "learning_rate": 6.142306513095608e-07, + "loss": 0.373, "step": 129265 }, { - "epoch": 4.55, - "learning_rate": 1.0711179025617719e-06, - "loss": 0.249, + "epoch": 4.658882041301762, + 
"grad_norm": 0.22143559157848358, + "learning_rate": 6.135879346103712e-07, + "loss": 0.359, "step": 129270 }, { - "epoch": 4.55, - "learning_rate": 1.070293139037004e-06, - "loss": 0.2508, + "epoch": 4.659062240962987, + "grad_norm": 0.24161243438720703, + "learning_rate": 6.129455501713233e-07, + "loss": 0.3702, "step": 129275 }, { - "epoch": 4.55, - "learning_rate": 1.0694686862237152e-06, - "loss": 0.2644, + "epoch": 4.659242440624212, + "grad_norm": 0.20303134620189667, + "learning_rate": 6.123034980011683e-07, + "loss": 0.3933, "step": 129280 }, { - "epoch": 4.55, - "learning_rate": 1.068644544132602e-06, - "loss": 0.2427, + "epoch": 4.659422640285436, + "grad_norm": 0.29990285634994507, + "learning_rate": 6.116617781086603e-07, + "loss": 0.387, "step": 129285 }, { - "epoch": 4.55, - "learning_rate": 1.06782071277437e-06, - "loss": 0.2476, + "epoch": 4.659602839946661, + "grad_norm": 0.24720498919487, + "learning_rate": 6.110203905025397e-07, + "loss": 0.3566, "step": 129290 }, { - "epoch": 4.55, - "learning_rate": 1.0669971921597105e-06, - "loss": 0.263, + "epoch": 4.659783039607886, + "grad_norm": 0.2561245560646057, + "learning_rate": 6.10379335191541e-07, + "loss": 0.3665, "step": 129295 }, { - "epoch": 4.55, - "learning_rate": 1.0661739822993234e-06, - "loss": 0.2553, + "epoch": 4.659963239269111, + "grad_norm": 0.24636390805244446, + "learning_rate": 6.097386121844045e-07, + "loss": 0.3752, "step": 129300 }, { - "epoch": 4.55, - "learning_rate": 1.0653510832038943e-06, - "loss": 0.2611, + "epoch": 4.6601434389303344, + "grad_norm": 0.22616252303123474, + "learning_rate": 6.090982214898567e-07, + "loss": 0.3541, "step": 129305 }, { - "epoch": 4.55, - "learning_rate": 1.0645284948841067e-06, - "loss": 0.2628, + "epoch": 4.660323638591559, + "grad_norm": 0.2326437532901764, + "learning_rate": 6.084581631166208e-07, + "loss": 0.4103, "step": 129310 }, { - "epoch": 4.55, - "learning_rate": 1.0637062173506406e-06, - "loss": 0.2506, + "epoch": 4.660503838252784, + "grad_norm": 0.26222485303878784, + "learning_rate": 6.078184370734236e-07, + "loss": 0.3368, "step": 129315 }, { - "epoch": 4.55, - "learning_rate": 1.062884250614174e-06, - "loss": 0.2878, + "epoch": 4.660684037914009, + "grad_norm": 0.20036746561527252, + "learning_rate": 6.071790433689744e-07, + "loss": 0.3811, "step": 129320 }, { - "epoch": 4.55, - "learning_rate": 1.0620625946853841e-06, - "loss": 0.2495, + "epoch": 4.660864237575233, + "grad_norm": 0.3011195659637451, + "learning_rate": 6.065399820119888e-07, + "loss": 0.3701, "step": 129325 }, { - "epoch": 4.55, - "learning_rate": 1.0612412495749348e-06, - "loss": 0.2503, + "epoch": 4.661044437236458, + "grad_norm": 0.3017103672027588, + "learning_rate": 6.059012530111763e-07, + "loss": 0.3723, "step": 129330 }, { - "epoch": 4.55, - "learning_rate": 1.0604202152934895e-06, - "loss": 0.2626, + "epoch": 4.661224636897683, + "grad_norm": 0.2928502857685089, + "learning_rate": 6.052628563752355e-07, + "loss": 0.4019, "step": 129335 }, { - "epoch": 4.55, - "learning_rate": 1.0595994918517121e-06, - "loss": 0.237, + "epoch": 4.661404836558908, + "grad_norm": 0.26223188638687134, + "learning_rate": 6.046247921128623e-07, + "loss": 0.376, "step": 129340 }, { - "epoch": 4.55, - "learning_rate": 1.0587790792602608e-06, - "loss": 0.2439, + "epoch": 4.661585036220131, + "grad_norm": 0.25763529539108276, + "learning_rate": 6.039870602327552e-07, + "loss": 0.383, "step": 129345 }, { - "epoch": 4.55, - "learning_rate": 1.0579589775297827e-06, - "loss": 0.2621, + "epoch": 
4.661765235881356, + "grad_norm": 0.3097740709781647, + "learning_rate": 6.03349660743599e-07, + "loss": 0.3831, "step": 129350 }, { - "epoch": 4.55, - "learning_rate": 1.0571391866709275e-06, - "loss": 0.2516, + "epoch": 4.661945435542581, + "grad_norm": 0.21880759298801422, + "learning_rate": 6.027125936540839e-07, + "loss": 0.325, "step": 129355 }, { - "epoch": 4.55, - "learning_rate": 1.0563197066943447e-06, - "loss": 0.2473, + "epoch": 4.662125635203806, + "grad_norm": 0.27281638979911804, + "learning_rate": 6.020758589728864e-07, + "loss": 0.3572, "step": 129360 }, { - "epoch": 4.55, - "learning_rate": 1.0555005376106708e-06, - "loss": 0.2242, + "epoch": 4.66230583486503, + "grad_norm": 0.26208505034446716, + "learning_rate": 6.014394567086801e-07, + "loss": 0.3569, "step": 129365 }, { - "epoch": 4.55, - "learning_rate": 1.0546816794305358e-06, - "loss": 0.263, + "epoch": 4.662486034526255, + "grad_norm": 0.2751930058002472, + "learning_rate": 6.008033868701357e-07, + "loss": 0.3527, "step": 129370 }, { - "epoch": 4.55, - "learning_rate": 1.0538631321645815e-06, - "loss": 0.2499, + "epoch": 4.66266623418748, + "grad_norm": 0.2358606606721878, + "learning_rate": 6.001676494659214e-07, + "loss": 0.3303, "step": 129375 }, { - "epoch": 4.55, - "learning_rate": 1.0530448958234378e-06, - "loss": 0.2622, + "epoch": 4.662846433848705, + "grad_norm": 0.22096312046051025, + "learning_rate": 5.995322445047025e-07, + "loss": 0.3917, "step": 129380 }, { - "epoch": 4.55, - "learning_rate": 1.0522269704177218e-06, - "loss": 0.2467, + "epoch": 4.663026633509929, + "grad_norm": 0.2579168677330017, + "learning_rate": 5.988971719951331e-07, + "loss": 0.41, "step": 129385 }, { - "epoch": 4.55, - "learning_rate": 1.0514093559580552e-06, - "loss": 0.2427, + "epoch": 4.663206833171154, + "grad_norm": 0.29768040776252747, + "learning_rate": 5.982624319458618e-07, + "loss": 0.3481, "step": 129390 }, { - "epoch": 4.55, - "learning_rate": 1.0505920524550571e-06, - "loss": 0.2242, + "epoch": 4.663387032832379, + "grad_norm": 0.24240122735500336, + "learning_rate": 5.976280243655403e-07, + "loss": 0.3567, "step": 129395 }, { - "epoch": 4.55, - "learning_rate": 1.049775059919339e-06, - "loss": 0.251, + "epoch": 4.663567232493603, + "grad_norm": 0.3015746772289276, + "learning_rate": 5.96993949262814e-07, + "loss": 0.3612, "step": 129400 }, { - "epoch": 4.55, - "learning_rate": 1.0489583783615086e-06, - "loss": 0.2533, + "epoch": 4.663747432154827, + "grad_norm": 0.2500225901603699, + "learning_rate": 5.963602066463236e-07, + "loss": 0.3912, "step": 129405 }, { - "epoch": 4.55, - "learning_rate": 1.0481420077921634e-06, - "loss": 0.2605, + "epoch": 4.663927631816052, + "grad_norm": 0.25465816259384155, + "learning_rate": 5.957267965246982e-07, + "loss": 0.3754, "step": 129410 }, { - "epoch": 4.55, - "learning_rate": 1.047325948221914e-06, - "loss": 0.206, + "epoch": 4.664107831477277, + "grad_norm": 0.24899233877658844, + "learning_rate": 5.95093718906567e-07, + "loss": 0.3582, "step": 129415 }, { - "epoch": 4.55, - "learning_rate": 1.0465101996613518e-06, - "loss": 0.2622, + "epoch": 4.6642880311385015, + "grad_norm": 0.22887267172336578, + "learning_rate": 5.944609738005618e-07, + "loss": 0.3409, "step": 129420 }, { - "epoch": 4.55, - "learning_rate": 1.045694762121066e-06, - "loss": 0.2663, + "epoch": 4.664468230799726, + "grad_norm": 0.26909008622169495, + "learning_rate": 5.93828561215301e-07, + "loss": 0.3867, "step": 129425 }, { - "epoch": 4.55, - "learning_rate": 1.0448796356116508e-06, - "loss": 0.2672, + 
"epoch": 4.664648430460951, + "grad_norm": 0.2556828558444977, + "learning_rate": 5.931964811594026e-07, + "loss": 0.356, "step": 129430 }, { - "epoch": 4.55, - "learning_rate": 1.0440648201436836e-06, - "loss": 0.2433, + "epoch": 4.664828630122176, + "grad_norm": 0.25458240509033203, + "learning_rate": 5.925647336414735e-07, + "loss": 0.4164, "step": 129435 }, { - "epoch": 4.55, - "learning_rate": 1.043250315727748e-06, - "loss": 0.2424, + "epoch": 4.6650088297834, + "grad_norm": 0.26027604937553406, + "learning_rate": 5.919333186701265e-07, + "loss": 0.3915, "step": 129440 }, { - "epoch": 4.55, - "learning_rate": 1.042436122374421e-06, - "loss": 0.2538, + "epoch": 4.665189029444624, + "grad_norm": 0.23065000772476196, + "learning_rate": 5.9130223625396e-07, + "loss": 0.3516, "step": 129445 }, { - "epoch": 4.55, - "learning_rate": 1.0416222400942698e-06, - "loss": 0.2515, + "epoch": 4.665369229105849, + "grad_norm": 0.2661188542842865, + "learning_rate": 5.906714864015783e-07, + "loss": 0.3747, "step": 129450 }, { - "epoch": 4.55, - "learning_rate": 1.0408086688978663e-06, - "loss": 0.2418, + "epoch": 4.665549428767074, + "grad_norm": 0.25803518295288086, + "learning_rate": 5.900410691215719e-07, + "loss": 0.3264, "step": 129455 }, { - "epoch": 4.55, - "learning_rate": 1.0399954087957714e-06, - "loss": 0.2414, + "epoch": 4.6657296284282985, + "grad_norm": 0.24731706082820892, + "learning_rate": 5.894109844225281e-07, + "loss": 0.3546, "step": 129460 }, { - "epoch": 4.55, - "learning_rate": 1.039182459798549e-06, - "loss": 0.2363, + "epoch": 4.665909828089523, + "grad_norm": 0.21868941187858582, + "learning_rate": 5.88781232313032e-07, + "loss": 0.3743, "step": 129465 }, { - "epoch": 4.56, - "learning_rate": 1.0383698219167459e-06, - "loss": 0.2648, + "epoch": 4.666090027750748, + "grad_norm": 0.2019900679588318, + "learning_rate": 5.881518128016655e-07, + "loss": 0.3616, "step": 129470 }, { - "epoch": 4.56, - "learning_rate": 1.0375574951609234e-06, - "loss": 0.2565, + "epoch": 4.666270227411973, + "grad_norm": 0.37569132447242737, + "learning_rate": 5.875227258970078e-07, + "loss": 0.3833, "step": 129475 }, { - "epoch": 4.56, - "learning_rate": 1.0367454795416205e-06, - "loss": 0.2511, + "epoch": 4.6664504270731975, + "grad_norm": 0.2718876600265503, + "learning_rate": 5.868939716076244e-07, + "loss": 0.3947, "step": 129480 }, { - "epoch": 4.56, - "learning_rate": 1.0359337750693893e-06, - "loss": 0.242, + "epoch": 4.666630626734422, + "grad_norm": 0.23850391805171967, + "learning_rate": 5.86265549942086e-07, + "loss": 0.3544, "step": 129485 }, { - "epoch": 4.56, - "learning_rate": 1.0351223817547633e-06, - "loss": 0.2405, + "epoch": 4.666810826395646, + "grad_norm": 0.28313085436820984, + "learning_rate": 5.856374609089526e-07, + "loss": 0.3506, "step": 129490 }, { - "epoch": 4.56, - "learning_rate": 1.0343112996082815e-06, - "loss": 0.2332, + "epoch": 4.666991026056871, + "grad_norm": 0.21799631416797638, + "learning_rate": 5.850097045167785e-07, + "loss": 0.3657, "step": 129495 }, { - "epoch": 4.56, - "learning_rate": 1.0335005286404742e-06, - "loss": 0.2607, + "epoch": 4.6671712257180955, + "grad_norm": 0.25782206654548645, + "learning_rate": 5.843822807741261e-07, + "loss": 0.3886, "step": 129500 }, { - "epoch": 4.56, - "eval_loss": 0.24865983426570892, - "eval_runtime": 10.5527, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 4.6671712257180955, + "eval_loss": 0.4288308322429657, + "eval_runtime": 3.5293, + "eval_samples_per_second": 28.335, + 
"eval_steps_per_second": 7.084, "step": 129500 }, { - "epoch": 4.56, - "learning_rate": 1.0326900688618635e-06, - "loss": 0.256, + "epoch": 4.66735142537932, + "grad_norm": 0.2763291001319885, + "learning_rate": 5.837551896895305e-07, + "loss": 0.373, "step": 129505 }, { - "epoch": 4.56, - "learning_rate": 1.0318799202829827e-06, - "loss": 0.2483, + "epoch": 4.667531625040545, + "grad_norm": 0.3070751130580902, + "learning_rate": 5.831284312715485e-07, + "loss": 0.3634, "step": 129510 }, { - "epoch": 4.56, - "learning_rate": 1.0310700829143427e-06, - "loss": 0.238, + "epoch": 4.66771182470177, + "grad_norm": 0.3483225703239441, + "learning_rate": 5.82502005528715e-07, + "loss": 0.3704, "step": 129515 }, { - "epoch": 4.56, - "learning_rate": 1.0302605567664631e-06, - "loss": 0.2675, + "epoch": 4.667892024362994, + "grad_norm": 0.20731894671916962, + "learning_rate": 5.818759124695622e-07, + "loss": 0.3615, "step": 129520 }, { - "epoch": 4.56, - "learning_rate": 1.029451341849852e-06, - "loss": 0.2344, + "epoch": 4.668072224024219, + "grad_norm": 0.25448137521743774, + "learning_rate": 5.81250152102622e-07, + "loss": 0.3756, "step": 129525 }, { - "epoch": 4.56, - "learning_rate": 1.0286424381750175e-06, - "loss": 0.267, + "epoch": 4.668252423685443, + "grad_norm": 0.3217816948890686, + "learning_rate": 5.806247244364238e-07, + "loss": 0.4025, "step": 129530 }, { - "epoch": 4.56, - "learning_rate": 1.0278338457524655e-06, - "loss": 0.2232, + "epoch": 4.668432623346668, + "grad_norm": 0.21159349381923676, + "learning_rate": 5.799996294794801e-07, + "loss": 0.382, "step": 129535 }, { - "epoch": 4.56, - "learning_rate": 1.0270255645926902e-06, - "loss": 0.2512, + "epoch": 4.6686128230078925, + "grad_norm": 0.2236149162054062, + "learning_rate": 5.793748672403204e-07, + "loss": 0.3662, "step": 129540 }, { - "epoch": 4.56, - "learning_rate": 1.0262175947061886e-06, - "loss": 0.276, + "epoch": 4.668793022669117, + "grad_norm": 0.262952595949173, + "learning_rate": 5.78750437727446e-07, + "loss": 0.3876, "step": 129545 }, { - "epoch": 4.56, - "learning_rate": 1.0254099361034553e-06, - "loss": 0.2621, + "epoch": 4.668973222330342, + "grad_norm": 0.27944156527519226, + "learning_rate": 5.781263409493698e-07, + "loss": 0.3889, "step": 129550 }, { - "epoch": 4.56, - "learning_rate": 1.0246025887949734e-06, - "loss": 0.2586, + "epoch": 4.669153421991567, + "grad_norm": 0.24670124053955078, + "learning_rate": 5.775025769145959e-07, + "loss": 0.3498, "step": 129555 }, { - "epoch": 4.56, - "learning_rate": 1.0237955527912264e-06, - "loss": 0.2416, + "epoch": 4.669333621652791, + "grad_norm": 0.2797219455242157, + "learning_rate": 5.768791456316175e-07, + "loss": 0.3675, "step": 129560 }, { - "epoch": 4.56, - "learning_rate": 1.0229888281026916e-06, - "loss": 0.2527, + "epoch": 4.669513821314016, + "grad_norm": 0.19079472124576569, + "learning_rate": 5.762560471089334e-07, + "loss": 0.3674, "step": 129565 }, { - "epoch": 4.56, - "learning_rate": 1.0221824147398474e-06, - "loss": 0.2497, + "epoch": 4.669694020975241, + "grad_norm": 0.20952509343624115, + "learning_rate": 5.756332813550369e-07, + "loss": 0.3384, "step": 129570 }, { - "epoch": 4.56, - "learning_rate": 1.0213763127131625e-06, - "loss": 0.2474, + "epoch": 4.669874220636466, + "grad_norm": 0.2480088174343109, + "learning_rate": 5.750108483784017e-07, + "loss": 0.3942, "step": 129575 }, { - "epoch": 4.56, - "learning_rate": 1.0205705220331042e-06, - "loss": 0.2455, + "epoch": 4.6700544202976895, + "grad_norm": 0.19479909539222717, + 
"learning_rate": 5.743887481875182e-07, + "loss": 0.3673, "step": 129580 }, { - "epoch": 4.56, - "learning_rate": 1.0197650427101302e-06, - "loss": 0.2321, + "epoch": 4.670234619958914, + "grad_norm": 0.27210843563079834, + "learning_rate": 5.737669807908546e-07, + "loss": 0.374, "step": 129585 }, { - "epoch": 4.56, - "learning_rate": 1.0189598747547074e-06, - "loss": 0.2573, + "epoch": 4.670414819620139, + "grad_norm": 0.25238466262817383, + "learning_rate": 5.731455461968932e-07, + "loss": 0.3692, "step": 129590 }, { - "epoch": 4.56, - "learning_rate": 1.0181550181772858e-06, - "loss": 0.2412, + "epoch": 4.670595019281364, + "grad_norm": 0.3293459117412567, + "learning_rate": 5.725244444140937e-07, + "loss": 0.365, "step": 129595 }, { - "epoch": 4.56, - "learning_rate": 1.0173504729883182e-06, - "loss": 0.2519, + "epoch": 4.670775218942588, + "grad_norm": 0.25170597434043884, + "learning_rate": 5.719036754509161e-07, + "loss": 0.4202, "step": 129600 }, { - "epoch": 4.56, - "learning_rate": 1.0165462391982432e-06, - "loss": 0.2473, + "epoch": 4.670955418603813, + "grad_norm": 0.22425323724746704, + "learning_rate": 5.712832393158229e-07, + "loss": 0.3636, "step": 129605 }, { - "epoch": 4.56, - "learning_rate": 1.0157423168175167e-06, - "loss": 0.2461, + "epoch": 4.671135618265038, + "grad_norm": 0.3065611720085144, + "learning_rate": 5.706631360172659e-07, + "loss": 0.3638, "step": 129610 }, { - "epoch": 4.56, - "learning_rate": 1.014938705856569e-06, - "loss": 0.2548, + "epoch": 4.671315817926263, + "grad_norm": 0.2282622754573822, + "learning_rate": 5.700433655636939e-07, + "loss": 0.3536, "step": 129615 }, { - "epoch": 4.56, - "learning_rate": 1.0141354063258335e-06, - "loss": 0.2538, + "epoch": 4.6714960175874864, + "grad_norm": 0.22002369165420532, + "learning_rate": 5.694239279635527e-07, + "loss": 0.3777, "step": 129620 }, { - "epoch": 4.56, - "learning_rate": 1.0133324182357434e-06, - "loss": 0.2641, + "epoch": 4.671676217248711, + "grad_norm": 0.263820081949234, + "learning_rate": 5.688048232252746e-07, + "loss": 0.3732, "step": 129625 }, { - "epoch": 4.56, - "learning_rate": 1.012529741596724e-06, - "loss": 0.2556, + "epoch": 4.671856416909936, + "grad_norm": 0.24897272884845734, + "learning_rate": 5.681860513573084e-07, + "loss": 0.3497, "step": 129630 }, { - "epoch": 4.56, - "learning_rate": 1.0117273764191998e-06, - "loss": 0.2564, + "epoch": 4.672036616571161, + "grad_norm": 0.2410743087530136, + "learning_rate": 5.675676123680724e-07, + "loss": 0.3739, "step": 129635 }, { - "epoch": 4.56, - "learning_rate": 1.0109253227135824e-06, - "loss": 0.2577, + "epoch": 4.672216816232385, + "grad_norm": 0.28402799367904663, + "learning_rate": 5.669495062660013e-07, + "loss": 0.3734, "step": 129640 }, { - "epoch": 4.56, - "learning_rate": 1.010123580490291e-06, - "loss": 0.2703, + "epoch": 4.67239701589361, + "grad_norm": 0.22168543934822083, + "learning_rate": 5.663317330595108e-07, + "loss": 0.3817, "step": 129645 }, { - "epoch": 4.56, - "learning_rate": 1.0093221497597367e-06, - "loss": 0.2258, + "epoch": 4.672577215554835, + "grad_norm": 0.25576576590538025, + "learning_rate": 5.657142927570163e-07, + "loss": 0.4007, "step": 129650 }, { - "epoch": 4.56, - "learning_rate": 1.0085210305323223e-06, - "loss": 0.2672, + "epoch": 4.67275741521606, + "grad_norm": 0.21974904835224152, + "learning_rate": 5.65097185366939e-07, + "loss": 0.399, "step": 129655 }, { - "epoch": 4.56, - "learning_rate": 1.0077202228184507e-06, - "loss": 0.25, + "epoch": 4.672937614877284, + "grad_norm": 
0.3177945911884308, + "learning_rate": 5.644804108976804e-07, + "loss": 0.3589, "step": 129660 }, { - "epoch": 4.56, - "learning_rate": 1.0069197266285245e-06, - "loss": 0.2373, + "epoch": 4.673117814538509, + "grad_norm": 0.3027752637863159, + "learning_rate": 5.638639693576447e-07, + "loss": 0.3754, "step": 129665 }, { - "epoch": 4.56, - "learning_rate": 1.0061195419729302e-06, - "loss": 0.2499, + "epoch": 4.673298014199734, + "grad_norm": 0.3068021237850189, + "learning_rate": 5.63247860755231e-07, + "loss": 0.3751, "step": 129670 }, { - "epoch": 4.56, - "learning_rate": 1.0053196688620647e-06, - "loss": 0.2464, + "epoch": 4.673478213860958, + "grad_norm": 0.2814277410507202, + "learning_rate": 5.626320850988353e-07, + "loss": 0.3641, "step": 129675 }, { - "epoch": 4.56, - "learning_rate": 1.004520107306306e-06, - "loss": 0.2409, + "epoch": 4.673658413522182, + "grad_norm": 0.2657163441181183, + "learning_rate": 5.620166423968454e-07, + "loss": 0.3845, "step": 129680 }, { - "epoch": 4.56, - "learning_rate": 1.0037208573160428e-06, - "loss": 0.2213, + "epoch": 4.673838613183407, + "grad_norm": 0.21708977222442627, + "learning_rate": 5.614015326576489e-07, + "loss": 0.3781, "step": 129685 }, { - "epoch": 4.56, - "learning_rate": 1.0029219189016503e-06, - "loss": 0.2502, + "epoch": 4.674018812844632, + "grad_norm": 0.2891923189163208, + "learning_rate": 5.607867558896224e-07, + "loss": 0.3721, "step": 129690 }, { - "epoch": 4.56, - "learning_rate": 1.0021232920735008e-06, - "loss": 0.2466, + "epoch": 4.674199012505857, + "grad_norm": 0.25727081298828125, + "learning_rate": 5.601723121011482e-07, + "loss": 0.32, "step": 129695 }, { - "epoch": 4.56, - "learning_rate": 1.0013249768419635e-06, - "loss": 0.2498, + "epoch": 4.674379212167081, + "grad_norm": 0.18455742299556732, + "learning_rate": 5.595582013005918e-07, + "loss": 0.3469, "step": 129700 }, { - "epoch": 4.56, - "learning_rate": 1.0005269732174055e-06, - "loss": 0.2472, + "epoch": 4.674559411828306, + "grad_norm": 0.27723222970962524, + "learning_rate": 5.589444234963214e-07, + "loss": 0.3841, "step": 129705 }, { - "epoch": 4.56, - "learning_rate": 9.997292812101905e-07, - "loss": 0.2651, + "epoch": 4.674739611489531, + "grad_norm": 0.2724769115447998, + "learning_rate": 5.583309786967084e-07, + "loss": 0.3844, "step": 129710 }, { - "epoch": 4.56, - "learning_rate": 9.98931900830677e-07, - "loss": 0.235, + "epoch": 4.674919811150755, + "grad_norm": 0.25393152236938477, + "learning_rate": 5.577178669100986e-07, + "loss": 0.4236, "step": 129715 }, { - "epoch": 4.56, - "learning_rate": 9.981348320892093e-07, - "loss": 0.2582, + "epoch": 4.675100010811979, + "grad_norm": 0.2612922489643097, + "learning_rate": 5.571050881448492e-07, + "loss": 0.3766, "step": 129720 }, { - "epoch": 4.56, - "learning_rate": 9.97338074996146e-07, - "loss": 0.2462, + "epoch": 4.675280210473204, + "grad_norm": 0.2447577714920044, + "learning_rate": 5.56492642409312e-07, + "loss": 0.3888, "step": 129725 }, { - "epoch": 4.56, - "learning_rate": 9.965416295618313e-07, - "loss": 0.2511, + "epoch": 4.675460410134429, + "grad_norm": 0.30114662647247314, + "learning_rate": 5.55880529711833e-07, + "loss": 0.3967, "step": 129730 }, { - "epoch": 4.56, - "learning_rate": 9.957454957966017e-07, - "loss": 0.2487, + "epoch": 4.6756406097956535, + "grad_norm": 0.24598009884357452, + "learning_rate": 5.552687500607473e-07, + "loss": 0.376, "step": 129735 }, { - "epoch": 4.56, - "learning_rate": 9.949496737107988e-07, - "loss": 0.2294, + "epoch": 4.675820809456878, + 
"grad_norm": 0.23333467543125153, + "learning_rate": 5.546573034643926e-07, + "loss": 0.373, "step": 129740 }, { - "epoch": 4.56, - "learning_rate": 9.941541633147562e-07, - "loss": 0.2585, + "epoch": 4.676001009118103, + "grad_norm": 0.3097272515296936, + "learning_rate": 5.540461899310956e-07, + "loss": 0.4054, "step": 129745 }, { - "epoch": 4.56, - "learning_rate": 9.933589646187985e-07, - "loss": 0.2421, + "epoch": 4.676181208779328, + "grad_norm": 0.2813061773777008, + "learning_rate": 5.534354094691912e-07, + "loss": 0.3875, "step": 129750 }, { - "epoch": 4.57, - "learning_rate": 9.92564077633254e-07, - "loss": 0.2785, + "epoch": 4.6763614084405525, + "grad_norm": 0.27265501022338867, + "learning_rate": 5.528249620869952e-07, + "loss": 0.3326, "step": 129755 }, { - "epoch": 4.57, - "learning_rate": 9.91769502368442e-07, - "loss": 0.2468, + "epoch": 4.676541608101777, + "grad_norm": 0.23373085260391235, + "learning_rate": 5.522148477928257e-07, + "loss": 0.355, "step": 129760 }, { - "epoch": 4.57, - "learning_rate": 9.90975238834682e-07, - "loss": 0.2277, + "epoch": 4.676721807763001, + "grad_norm": 0.2653125822544098, + "learning_rate": 5.516050665949956e-07, + "loss": 0.374, "step": 129765 }, { - "epoch": 4.57, - "learning_rate": 9.901812870422884e-07, - "loss": 0.2453, + "epoch": 4.676902007424226, + "grad_norm": 0.2680438160896301, + "learning_rate": 5.509956185018123e-07, + "loss": 0.3668, "step": 129770 }, { - "epoch": 4.57, - "learning_rate": 9.893876470015661e-07, - "loss": 0.2453, + "epoch": 4.6770822070854505, + "grad_norm": 0.24834401905536652, + "learning_rate": 5.503865035215799e-07, + "loss": 0.3672, "step": 129775 }, { - "epoch": 4.57, - "learning_rate": 9.885943187228214e-07, - "loss": 0.2807, + "epoch": 4.677262406746675, + "grad_norm": 0.2544165551662445, + "learning_rate": 5.497777216626004e-07, + "loss": 0.3523, "step": 129780 }, { - "epoch": 4.57, - "learning_rate": 9.878013022163567e-07, - "loss": 0.2355, + "epoch": 4.6774426064079, + "grad_norm": 0.2632615864276886, + "learning_rate": 5.491692729331643e-07, + "loss": 0.3629, "step": 129785 }, { - "epoch": 4.57, - "learning_rate": 9.870085974924698e-07, - "loss": 0.2383, + "epoch": 4.677622806069125, + "grad_norm": 0.27221032977104187, + "learning_rate": 5.485611573415622e-07, + "loss": 0.3558, "step": 129790 }, { - "epoch": 4.57, - "learning_rate": 9.862162045614436e-07, - "loss": 0.2452, + "epoch": 4.6778030057303495, + "grad_norm": 0.23004166781902313, + "learning_rate": 5.479533748960819e-07, + "loss": 0.3677, "step": 129795 }, { - "epoch": 4.57, - "learning_rate": 9.854241234335814e-07, - "loss": 0.265, + "epoch": 4.677983205391574, + "grad_norm": 0.23140105605125427, + "learning_rate": 5.473459256050029e-07, + "loss": 0.3682, "step": 129800 }, { - "epoch": 4.57, - "learning_rate": 9.846323541191582e-07, - "loss": 0.2254, + "epoch": 4.678163405052798, + "grad_norm": 0.25781282782554626, + "learning_rate": 5.46738809476599e-07, + "loss": 0.3737, "step": 129805 }, { - "epoch": 4.57, - "learning_rate": 9.838408966284545e-07, - "loss": 0.2428, + "epoch": 4.678343604714023, + "grad_norm": 0.28814247250556946, + "learning_rate": 5.461320265191445e-07, + "loss": 0.4094, "step": 129810 }, { - "epoch": 4.57, - "learning_rate": 9.830497509717511e-07, - "loss": 0.2269, + "epoch": 4.6785238043752475, + "grad_norm": 0.2671186327934265, + "learning_rate": 5.455255767409101e-07, + "loss": 0.3495, "step": 129815 }, { - "epoch": 4.57, - "learning_rate": 9.822589171593206e-07, - "loss": 0.2352, + "epoch": 4.678704004036472, + 
"grad_norm": 0.25495436787605286, + "learning_rate": 5.449194601501534e-07, + "loss": 0.4176, "step": 129820 }, { - "epoch": 4.57, - "learning_rate": 9.814683952014292e-07, - "loss": 0.2565, + "epoch": 4.678884203697697, + "grad_norm": 0.24299226701259613, + "learning_rate": 5.443136767551343e-07, + "loss": 0.3457, "step": 129825 }, { - "epoch": 4.57, - "learning_rate": 9.806781851083414e-07, - "loss": 0.2418, + "epoch": 4.679064403358922, + "grad_norm": 0.2552716135978699, + "learning_rate": 5.437082265641075e-07, + "loss": 0.4135, "step": 129830 }, { - "epoch": 4.57, - "learning_rate": 9.79888286890318e-07, - "loss": 0.2701, + "epoch": 4.679244603020146, + "grad_norm": 0.2803386151790619, + "learning_rate": 5.431031095853189e-07, + "loss": 0.3884, "step": 129835 }, { - "epoch": 4.57, - "learning_rate": 9.790987005576152e-07, - "loss": 0.2583, + "epoch": 4.679424802681371, + "grad_norm": 0.20010852813720703, + "learning_rate": 5.42498325827015e-07, + "loss": 0.3422, "step": 129840 }, { - "epoch": 4.57, - "learning_rate": 9.783094261204884e-07, - "loss": 0.2461, + "epoch": 4.679605002342596, + "grad_norm": 0.28240904211997986, + "learning_rate": 5.41893875297439e-07, + "loss": 0.3968, "step": 129845 }, { - "epoch": 4.57, - "learning_rate": 9.775204635891766e-07, - "loss": 0.2528, + "epoch": 4.679785202003821, + "grad_norm": 0.28269240260124207, + "learning_rate": 5.412897580048231e-07, + "loss": 0.3503, "step": 129850 }, { - "epoch": 4.57, - "learning_rate": 9.767318129739355e-07, - "loss": 0.2441, + "epoch": 4.6799654016650445, + "grad_norm": 0.295452743768692, + "learning_rate": 5.406859739574e-07, + "loss": 0.3895, "step": 129855 }, { - "epoch": 4.57, - "learning_rate": 9.759434742849988e-07, - "loss": 0.263, + "epoch": 4.680145601326269, + "grad_norm": 0.2563684284687042, + "learning_rate": 5.400825231633932e-07, + "loss": 0.38, "step": 129860 }, { - "epoch": 4.57, - "learning_rate": 9.751554475326025e-07, - "loss": 0.2548, + "epoch": 4.680325800987494, + "grad_norm": 0.22655951976776123, + "learning_rate": 5.394794056310243e-07, + "loss": 0.3844, "step": 129865 }, { - "epoch": 4.57, - "learning_rate": 9.743677327269802e-07, - "loss": 0.2489, + "epoch": 4.680506000648719, + "grad_norm": 0.2583646774291992, + "learning_rate": 5.38876621368517e-07, + "loss": 0.3703, "step": 129870 }, { - "epoch": 4.57, - "learning_rate": 9.735803298783569e-07, - "loss": 0.2698, + "epoch": 4.680686200309943, + "grad_norm": 0.24790343642234802, + "learning_rate": 5.382741703840788e-07, + "loss": 0.3648, "step": 129875 }, { - "epoch": 4.57, - "learning_rate": 9.727932389969636e-07, - "loss": 0.2638, + "epoch": 4.680866399971168, + "grad_norm": 0.2448999434709549, + "learning_rate": 5.376720526859197e-07, + "loss": 0.3749, "step": 129880 }, { - "epoch": 4.57, - "learning_rate": 9.72006460093014e-07, - "loss": 0.2491, + "epoch": 4.681046599632393, + "grad_norm": 0.2374943196773529, + "learning_rate": 5.370702682822415e-07, + "loss": 0.3917, "step": 129885 }, { - "epoch": 4.57, - "learning_rate": 9.712199931767223e-07, - "loss": 0.2499, + "epoch": 4.681226799293618, + "grad_norm": 0.23962445557117462, + "learning_rate": 5.364688171812432e-07, + "loss": 0.3672, "step": 129890 }, { - "epoch": 4.57, - "learning_rate": 9.704338382583084e-07, - "loss": 0.2712, + "epoch": 4.6814069989548415, + "grad_norm": 0.2775137424468994, + "learning_rate": 5.358676993911238e-07, + "loss": 0.391, "step": 129895 }, { - "epoch": 4.57, - "learning_rate": 9.69647995347972e-07, - "loss": 0.2672, + "epoch": 4.681587198616066, + 
"grad_norm": 0.23689554631710052, + "learning_rate": 5.352669149200711e-07, + "loss": 0.3584, "step": 129900 }, { - "epoch": 4.57, - "learning_rate": 9.688624644559218e-07, - "loss": 0.2716, + "epoch": 4.681767398277291, + "grad_norm": 0.22408610582351685, + "learning_rate": 5.34666463776265e-07, + "loss": 0.363, "step": 129905 }, { - "epoch": 4.57, - "learning_rate": 9.680772455923526e-07, - "loss": 0.2433, + "epoch": 4.681947597938516, + "grad_norm": 0.2599274218082428, + "learning_rate": 5.340663459678958e-07, + "loss": 0.3446, "step": 129910 }, { - "epoch": 4.57, - "learning_rate": 9.672923387674644e-07, - "loss": 0.2583, + "epoch": 4.68212779759974, + "grad_norm": 0.2526589632034302, + "learning_rate": 5.334665615031376e-07, + "loss": 0.3397, "step": 129915 }, { - "epoch": 4.57, - "learning_rate": 9.665077439914461e-07, - "loss": 0.2455, + "epoch": 4.682307997260965, + "grad_norm": 0.23241139948368073, + "learning_rate": 5.328671103901589e-07, + "loss": 0.3638, "step": 129920 }, { - "epoch": 4.57, - "learning_rate": 9.65723461274487e-07, - "loss": 0.2558, + "epoch": 4.68248819692219, + "grad_norm": 0.2700398862361908, + "learning_rate": 5.322679926371282e-07, + "loss": 0.3878, "step": 129925 }, { - "epoch": 4.57, - "learning_rate": 9.649394906267677e-07, - "loss": 0.25, + "epoch": 4.682668396583415, + "grad_norm": 0.24214357137680054, + "learning_rate": 5.316692082522057e-07, + "loss": 0.3821, "step": 129930 }, { - "epoch": 4.57, - "learning_rate": 9.64155832058472e-07, - "loss": 0.2606, + "epoch": 4.682848596244639, + "grad_norm": 0.1930561661720276, + "learning_rate": 5.310707572435569e-07, + "loss": 0.3813, "step": 129935 }, { - "epoch": 4.57, - "learning_rate": 9.63372485579775e-07, - "loss": 0.2532, + "epoch": 4.683028795905864, + "grad_norm": 0.28772681951522827, + "learning_rate": 5.30472639619331e-07, + "loss": 0.3628, "step": 129940 }, { - "epoch": 4.57, - "learning_rate": 9.625894512008432e-07, - "loss": 0.2402, + "epoch": 4.683208995567089, + "grad_norm": 0.2775450050830841, + "learning_rate": 5.298748553876797e-07, + "loss": 0.4035, "step": 129945 }, { - "epoch": 4.57, - "learning_rate": 9.618067289318412e-07, - "loss": 0.2522, + "epoch": 4.683389195228313, + "grad_norm": 0.3269951045513153, + "learning_rate": 5.292774045567439e-07, + "loss": 0.3668, "step": 129950 }, { - "epoch": 4.57, - "learning_rate": 9.610243187829438e-07, - "loss": 0.2268, + "epoch": 4.683569394889537, + "grad_norm": 0.27343136072158813, + "learning_rate": 5.286802871346641e-07, + "loss": 0.3986, "step": 129955 }, { - "epoch": 4.57, - "learning_rate": 9.602422207643013e-07, - "loss": 0.2497, + "epoch": 4.683749594550762, + "grad_norm": 0.2822220027446747, + "learning_rate": 5.280835031295811e-07, + "loss": 0.3583, "step": 129960 }, { - "epoch": 4.57, - "learning_rate": 9.59460434886067e-07, - "loss": 0.2595, + "epoch": 4.683929794211987, + "grad_norm": 0.26671725511550903, + "learning_rate": 5.274870525496245e-07, + "loss": 0.3761, "step": 129965 }, { - "epoch": 4.57, - "learning_rate": 9.58678961158399e-07, - "loss": 0.2403, + "epoch": 4.684109993873212, + "grad_norm": 0.2931014597415924, + "learning_rate": 5.268909354029127e-07, + "loss": 0.3933, "step": 129970 }, { - "epoch": 4.57, - "learning_rate": 9.578977995914424e-07, - "loss": 0.2467, + "epoch": 4.684290193534436, + "grad_norm": 0.218561589717865, + "learning_rate": 5.262951516975756e-07, + "loss": 0.4029, "step": 129975 }, { - "epoch": 4.57, - "learning_rate": 9.57116950195336e-07, - "loss": 0.2558, + "epoch": 4.684470393195661, + 
"grad_norm": 0.25005042552948, + "learning_rate": 5.256997014417315e-07, + "loss": 0.3756, "step": 129980 }, { - "epoch": 4.57, - "learning_rate": 9.563364129802187e-07, - "loss": 0.2688, + "epoch": 4.684650592856886, + "grad_norm": 0.2712259590625763, + "learning_rate": 5.251045846434876e-07, + "loss": 0.3682, "step": 129985 }, { - "epoch": 4.57, - "learning_rate": 9.55556187956233e-07, - "loss": 0.2545, + "epoch": 4.68483079251811, + "grad_norm": 0.2651374936103821, + "learning_rate": 5.245098013109573e-07, + "loss": 0.3631, "step": 129990 }, { - "epoch": 4.57, - "learning_rate": 9.547762751335038e-07, - "loss": 0.2514, + "epoch": 4.685010992179334, + "grad_norm": 0.27646490931510925, + "learning_rate": 5.239153514522393e-07, + "loss": 0.3945, "step": 129995 }, { - "epoch": 4.57, - "learning_rate": 9.539966745221563e-07, - "loss": 0.251, + "epoch": 4.685191191840559, + "grad_norm": 0.24400193989276886, + "learning_rate": 5.233212350754385e-07, + "loss": 0.3355, "step": 130000 }, { - "epoch": 4.57, - "eval_loss": 0.24862876534461975, - "eval_runtime": 10.5504, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 4.685191191840559, + "eval_loss": 0.4288380742073059, + "eval_runtime": 3.5366, + "eval_samples_per_second": 28.276, + "eval_steps_per_second": 7.069, "step": 130000 }, { - "epoch": 4.57, - "learning_rate": 9.532173861323129e-07, - "loss": 0.2545, + "epoch": 4.685371391501784, + "grad_norm": 0.22407753765583038, + "learning_rate": 5.227274521886483e-07, + "loss": 0.3887, "step": 130005 }, { - "epoch": 4.57, - "learning_rate": 9.524384099740991e-07, - "loss": 0.2479, + "epoch": 4.685551591163009, + "grad_norm": 0.2578125298023224, + "learning_rate": 5.221340027999566e-07, + "loss": 0.3554, "step": 130010 }, { - "epoch": 4.57, - "learning_rate": 9.516597460576232e-07, - "loss": 0.2336, + "epoch": 4.685731790824233, + "grad_norm": 0.26388269662857056, + "learning_rate": 5.215408869174487e-07, + "loss": 0.3703, "step": 130015 }, { - "epoch": 4.57, - "learning_rate": 9.508813943929967e-07, - "loss": 0.2482, + "epoch": 4.685911990485458, + "grad_norm": 0.3199911117553711, + "learning_rate": 5.20948104549207e-07, + "loss": 0.3835, "step": 130020 }, { - "epoch": 4.57, - "learning_rate": 9.501033549903227e-07, - "loss": 0.2513, + "epoch": 4.686092190146683, + "grad_norm": 0.2785996198654175, + "learning_rate": 5.203556557033085e-07, + "loss": 0.3619, "step": 130025 }, { - "epoch": 4.57, - "learning_rate": 9.493256278597124e-07, - "loss": 0.2543, + "epoch": 4.6862723898079075, + "grad_norm": 0.2726043462753296, + "learning_rate": 5.197635403878243e-07, + "loss": 0.3626, "step": 130030 }, { - "epoch": 4.57, - "learning_rate": 9.485482130112549e-07, - "loss": 0.2624, + "epoch": 4.686452589469132, + "grad_norm": 0.2747058868408203, + "learning_rate": 5.19171758610823e-07, + "loss": 0.3786, "step": 130035 }, { - "epoch": 4.58, - "learning_rate": 9.477711104550507e-07, - "loss": 0.2444, + "epoch": 4.686632789130356, + "grad_norm": 0.2753622233867645, + "learning_rate": 5.185803103803677e-07, + "loss": 0.4035, "step": 130040 }, { - "epoch": 4.58, - "learning_rate": 9.46994320201186e-07, - "loss": 0.2581, + "epoch": 4.686812988791581, + "grad_norm": 0.3083060383796692, + "learning_rate": 5.179891957045158e-07, + "loss": 0.3657, "step": 130045 }, { - "epoch": 4.58, - "learning_rate": 9.4621784225975e-07, - "loss": 0.2551, + "epoch": 4.6869931884528055, + "grad_norm": 0.23582348227500916, + "learning_rate": 5.173984145913191e-07, + "loss": 0.3944, "step": 130050 }, { - 
"epoch": 4.58, - "learning_rate": 9.454416766408236e-07, - "loss": 0.2578, + "epoch": 4.68717338811403, + "grad_norm": 0.2400096207857132, + "learning_rate": 5.168079670488296e-07, + "loss": 0.3639, "step": 130055 }, { - "epoch": 4.58, - "learning_rate": 9.446658233544847e-07, - "loss": 0.2461, + "epoch": 4.687353587775255, + "grad_norm": 0.2639099657535553, + "learning_rate": 5.162178530850937e-07, + "loss": 0.3944, "step": 130060 }, { - "epoch": 4.58, - "learning_rate": 9.438902824108059e-07, - "loss": 0.2454, + "epoch": 4.68753378743648, + "grad_norm": 0.21031659841537476, + "learning_rate": 5.156280727081492e-07, + "loss": 0.3775, "step": 130065 }, { - "epoch": 4.58, - "learning_rate": 9.431150538198597e-07, - "loss": 0.2429, + "epoch": 4.6877139870977045, + "grad_norm": 0.2745669484138489, + "learning_rate": 5.150386259260314e-07, + "loss": 0.3419, "step": 130070 }, { - "epoch": 4.58, - "learning_rate": 9.423401375917101e-07, - "loss": 0.2445, + "epoch": 4.687894186758929, + "grad_norm": 0.25483670830726624, + "learning_rate": 5.144495127467675e-07, + "loss": 0.3432, "step": 130075 }, { - "epoch": 4.58, - "learning_rate": 9.41565533736416e-07, - "loss": 0.2653, + "epoch": 4.688074386420153, + "grad_norm": 0.27415239810943604, + "learning_rate": 5.138607331783951e-07, + "loss": 0.4003, "step": 130080 }, { - "epoch": 4.58, - "learning_rate": 9.407912422640414e-07, - "loss": 0.2538, + "epoch": 4.688254586081378, + "grad_norm": 0.33515846729278564, + "learning_rate": 5.132722872289275e-07, + "loss": 0.3553, "step": 130085 }, { - "epoch": 4.58, - "learning_rate": 9.400172631846337e-07, - "loss": 0.2458, + "epoch": 4.6884347857426025, + "grad_norm": 0.2727743089199066, + "learning_rate": 5.126841749063805e-07, + "loss": 0.3815, "step": 130090 }, { - "epoch": 4.58, - "learning_rate": 9.392435965082491e-07, - "loss": 0.2516, + "epoch": 4.688614985403827, + "grad_norm": 0.239381343126297, + "learning_rate": 5.120963962187753e-07, + "loss": 0.373, "step": 130095 }, { - "epoch": 4.58, - "learning_rate": 9.384702422449265e-07, - "loss": 0.2256, + "epoch": 4.688795185065052, + "grad_norm": 0.23639050126075745, + "learning_rate": 5.115089511741139e-07, + "loss": 0.3469, "step": 130100 }, { - "epoch": 4.58, - "learning_rate": 9.376972004047136e-07, - "loss": 0.2582, + "epoch": 4.688975384726277, + "grad_norm": 0.2537215054035187, + "learning_rate": 5.109218397804011e-07, + "loss": 0.3599, "step": 130105 }, { - "epoch": 4.58, - "learning_rate": 9.369244709976465e-07, - "loss": 0.239, + "epoch": 4.6891555843875015, + "grad_norm": 0.268621027469635, + "learning_rate": 5.103350620456387e-07, + "loss": 0.4099, "step": 130110 }, { - "epoch": 4.58, - "learning_rate": 9.361520540337564e-07, - "loss": 0.2414, + "epoch": 4.689335784048726, + "grad_norm": 0.22903098165988922, + "learning_rate": 5.097486179778177e-07, + "loss": 0.3794, "step": 130115 }, { - "epoch": 4.58, - "learning_rate": 9.353799495230687e-07, - "loss": 0.2354, + "epoch": 4.689515983709951, + "grad_norm": 0.2540788948535919, + "learning_rate": 5.091625075849316e-07, + "loss": 0.3508, "step": 130120 }, { - "epoch": 4.58, - "learning_rate": 9.346081574756166e-07, - "loss": 0.2258, + "epoch": 4.689696183371176, + "grad_norm": 0.2338607758283615, + "learning_rate": 5.085767308749629e-07, + "loss": 0.366, "step": 130125 }, { - "epoch": 4.58, - "learning_rate": 9.338366779014174e-07, - "loss": 0.2356, + "epoch": 4.6898763830324, + "grad_norm": 0.2733408808708191, + "learning_rate": 5.079912878558968e-07, + "loss": 0.3504, "step": 130130 }, { - 
"epoch": 4.58, - "learning_rate": 9.330655108104907e-07, - "loss": 0.237, + "epoch": 4.690056582693624, + "grad_norm": 0.26694294810295105, + "learning_rate": 5.074061785357076e-07, + "loss": 0.3895, "step": 130135 }, { - "epoch": 4.58, - "learning_rate": 9.322946562128426e-07, - "loss": 0.2546, + "epoch": 4.690236782354849, + "grad_norm": 0.2584874629974365, + "learning_rate": 5.068214029223639e-07, + "loss": 0.376, "step": 130140 }, { - "epoch": 4.58, - "learning_rate": 9.3152411411849e-07, - "loss": 0.2514, + "epoch": 4.690416982016074, + "grad_norm": 0.2830393314361572, + "learning_rate": 5.062369610238399e-07, + "loss": 0.3739, "step": 130145 }, { - "epoch": 4.58, - "learning_rate": 9.307538845374358e-07, - "loss": 0.2565, + "epoch": 4.690597181677298, + "grad_norm": 0.2988766133785248, + "learning_rate": 5.056528528480958e-07, + "loss": 0.3467, "step": 130150 }, { - "epoch": 4.58, - "learning_rate": 9.299839674796806e-07, - "loss": 0.2609, + "epoch": 4.690777381338523, + "grad_norm": 0.2446690797805786, + "learning_rate": 5.050690784030892e-07, + "loss": 0.393, "step": 130155 }, { - "epoch": 4.58, - "learning_rate": 9.292143629552191e-07, - "loss": 0.2513, + "epoch": 4.690957580999748, + "grad_norm": 0.23521770536899567, + "learning_rate": 5.044856376967721e-07, + "loss": 0.387, "step": 130160 }, { - "epoch": 4.58, - "learning_rate": 9.284450709740461e-07, - "loss": 0.2503, + "epoch": 4.691137780660973, + "grad_norm": 0.26140204071998596, + "learning_rate": 5.039025307370965e-07, + "loss": 0.3775, "step": 130165 }, { - "epoch": 4.58, - "learning_rate": 9.276760915461508e-07, - "loss": 0.2504, + "epoch": 4.691317980322197, + "grad_norm": 0.3232240676879883, + "learning_rate": 5.033197575320059e-07, + "loss": 0.3765, "step": 130170 }, { - "epoch": 4.58, - "learning_rate": 9.269074246815195e-07, - "loss": 0.2378, + "epoch": 4.691498179983421, + "grad_norm": 0.2332460880279541, + "learning_rate": 5.027373180894441e-07, + "loss": 0.375, "step": 130175 }, { - "epoch": 4.58, - "learning_rate": 9.261390703901251e-07, - "loss": 0.2432, + "epoch": 4.691678379644646, + "grad_norm": 0.25555670261383057, + "learning_rate": 5.021552124173379e-07, + "loss": 0.3493, "step": 130180 }, { - "epoch": 4.58, - "learning_rate": 9.25371028681954e-07, - "loss": 0.2401, + "epoch": 4.691858579305871, + "grad_norm": 0.2844722270965576, + "learning_rate": 5.015734405236284e-07, + "loss": 0.3675, "step": 130185 }, { - "epoch": 4.58, - "learning_rate": 9.246032995669728e-07, - "loss": 0.2513, + "epoch": 4.692038778967095, + "grad_norm": 0.282840758562088, + "learning_rate": 5.009920024162368e-07, + "loss": 0.4054, "step": 130190 }, { - "epoch": 4.58, - "learning_rate": 9.238358830551519e-07, - "loss": 0.2554, + "epoch": 4.69221897862832, + "grad_norm": 0.22690938413143158, + "learning_rate": 5.004108981030847e-07, + "loss": 0.3435, "step": 130195 }, { - "epoch": 4.58, - "learning_rate": 9.230687791564524e-07, - "loss": 0.2605, + "epoch": 4.692399178289545, + "grad_norm": 0.24626252055168152, + "learning_rate": 4.998301275920936e-07, + "loss": 0.3498, "step": 130200 }, { - "epoch": 4.58, - "learning_rate": 9.223019878808415e-07, - "loss": 0.2507, + "epoch": 4.69257937795077, + "grad_norm": 0.23748280107975006, + "learning_rate": 4.99249690891171e-07, + "loss": 0.401, "step": 130205 }, { - "epoch": 4.58, - "learning_rate": 9.215355092382694e-07, - "loss": 0.2304, + "epoch": 4.692759577611994, + "grad_norm": 0.2302643209695816, + "learning_rate": 4.98669588008227e-07, + "loss": 0.3868, "step": 130210 }, { - "epoch": 
4.58, - "learning_rate": 9.207693432386893e-07, - "loss": 0.248, + "epoch": 4.692939777273219, + "grad_norm": 0.2521742582321167, + "learning_rate": 4.980898189511668e-07, + "loss": 0.3973, "step": 130215 }, { - "epoch": 4.58, - "learning_rate": 9.200034898920546e-07, - "loss": 0.2407, + "epoch": 4.693119976934444, + "grad_norm": 0.24064558744430542, + "learning_rate": 4.975103837278921e-07, + "loss": 0.3831, "step": 130220 }, { - "epoch": 4.58, - "learning_rate": 9.192379492083042e-07, - "loss": 0.2477, + "epoch": 4.693300176595668, + "grad_norm": 0.233879953622818, + "learning_rate": 4.969312823462912e-07, + "loss": 0.3357, "step": 130225 }, { - "epoch": 4.58, - "learning_rate": 9.184727211973776e-07, - "loss": 0.2558, + "epoch": 4.693480376256892, + "grad_norm": 0.27745091915130615, + "learning_rate": 4.963525148142606e-07, + "loss": 0.3951, "step": 130230 }, { - "epoch": 4.58, - "learning_rate": 9.177078058692112e-07, - "loss": 0.2516, + "epoch": 4.693660575918117, + "grad_norm": 0.2301134169101715, + "learning_rate": 4.9577408113968e-07, + "loss": 0.381, "step": 130235 }, { - "epoch": 4.58, - "learning_rate": 9.169432032337416e-07, - "loss": 0.2311, + "epoch": 4.693840775579342, + "grad_norm": 0.2147936373949051, + "learning_rate": 4.951959813304346e-07, + "loss": 0.3665, "step": 130240 }, { - "epoch": 4.58, - "learning_rate": 9.161789133008914e-07, - "loss": 0.2586, + "epoch": 4.694020975240567, + "grad_norm": 0.24145711958408356, + "learning_rate": 4.94618215394399e-07, + "loss": 0.4068, "step": 130245 }, { - "epoch": 4.58, - "learning_rate": 9.15414936080583e-07, - "loss": 0.2382, + "epoch": 4.694201174901791, + "grad_norm": 0.23923490941524506, + "learning_rate": 4.940407833394473e-07, + "loss": 0.4088, "step": 130250 }, { - "epoch": 4.58, - "learning_rate": 9.146512715827421e-07, - "loss": 0.2545, + "epoch": 4.694381374563016, + "grad_norm": 0.24783268570899963, + "learning_rate": 4.934636851734453e-07, + "loss": 0.3628, "step": 130255 }, { - "epoch": 4.58, - "learning_rate": 9.1388791981728e-07, - "loss": 0.237, + "epoch": 4.694561574224241, + "grad_norm": 0.2156895250082016, + "learning_rate": 4.928869209042536e-07, + "loss": 0.3927, "step": 130260 }, { - "epoch": 4.58, - "learning_rate": 9.13124880794114e-07, - "loss": 0.2335, + "epoch": 4.694741773885465, + "grad_norm": 0.28466811776161194, + "learning_rate": 4.923104905397379e-07, + "loss": 0.4006, "step": 130265 }, { - "epoch": 4.58, - "learning_rate": 9.12362154523147e-07, - "loss": 0.2433, + "epoch": 4.694921973546689, + "grad_norm": 0.25431156158447266, + "learning_rate": 4.917343940877423e-07, + "loss": 0.3977, "step": 130270 }, { - "epoch": 4.58, - "learning_rate": 9.115997410142796e-07, - "loss": 0.2211, + "epoch": 4.695102173207914, + "grad_norm": 0.3024144470691681, + "learning_rate": 4.911586315561212e-07, + "loss": 0.3901, "step": 130275 }, { - "epoch": 4.58, - "learning_rate": 9.108376402774177e-07, - "loss": 0.2445, + "epoch": 4.695282372869139, + "grad_norm": 0.29389986395835876, + "learning_rate": 4.905832029527186e-07, + "loss": 0.3691, "step": 130280 }, { - "epoch": 4.58, - "learning_rate": 9.100758523224534e-07, - "loss": 0.2432, + "epoch": 4.695462572530364, + "grad_norm": 0.26747405529022217, + "learning_rate": 4.900081082853753e-07, + "loss": 0.4077, "step": 130285 }, { - "epoch": 4.58, - "learning_rate": 9.09314377159276e-07, - "loss": 0.2565, + "epoch": 4.695642772191588, + "grad_norm": 0.259396493434906, + "learning_rate": 4.894333475619295e-07, + "loss": 0.3603, "step": 130290 }, { - "epoch": 4.58, - 
"learning_rate": 9.085532147977749e-07, - "loss": 0.2818, + "epoch": 4.695822971852813, + "grad_norm": 0.25687748193740845, + "learning_rate": 4.888589207902056e-07, + "loss": 0.374, "step": 130295 }, { - "epoch": 4.58, - "learning_rate": 9.077923652478365e-07, - "loss": 0.2358, + "epoch": 4.696003171514038, + "grad_norm": 0.2638847231864929, + "learning_rate": 4.882848279780306e-07, + "loss": 0.4057, "step": 130300 }, { - "epoch": 4.58, - "learning_rate": 9.070318285193336e-07, - "loss": 0.2563, + "epoch": 4.6961833711752625, + "grad_norm": 0.24750365316867828, + "learning_rate": 4.877110691332343e-07, + "loss": 0.3794, "step": 130305 }, { - "epoch": 4.58, - "learning_rate": 9.06271604622147e-07, - "loss": 0.2659, + "epoch": 4.696363570836487, + "grad_norm": 0.22299759089946747, + "learning_rate": 4.871376442636272e-07, + "loss": 0.379, "step": 130310 }, { - "epoch": 4.58, - "learning_rate": 9.055116935661439e-07, - "loss": 0.2431, + "epoch": 4.696543770497711, + "grad_norm": 0.24523359537124634, + "learning_rate": 4.865645533770224e-07, + "loss": 0.3825, "step": 130315 }, { - "epoch": 4.59, - "learning_rate": 9.047520953611943e-07, - "loss": 0.2275, + "epoch": 4.696723970158936, + "grad_norm": 0.24802939593791962, + "learning_rate": 4.859917964812305e-07, + "loss": 0.4062, "step": 130320 }, { - "epoch": 4.59, - "learning_rate": 9.039928100171569e-07, - "loss": 0.2477, + "epoch": 4.696904169820161, + "grad_norm": 0.25451767444610596, + "learning_rate": 4.854193735840534e-07, + "loss": 0.3949, "step": 130325 }, { - "epoch": 4.59, - "learning_rate": 9.032338375438931e-07, - "loss": 0.2529, + "epoch": 4.697084369481385, + "grad_norm": 0.2552180588245392, + "learning_rate": 4.848472846932933e-07, + "loss": 0.3683, "step": 130330 }, { - "epoch": 4.59, - "learning_rate": 9.024751779512592e-07, - "loss": 0.255, + "epoch": 4.69726456914261, + "grad_norm": 0.2642126679420471, + "learning_rate": 4.842755298167412e-07, + "loss": 0.3984, "step": 130335 }, { - "epoch": 4.59, - "learning_rate": 9.017168312491053e-07, - "loss": 0.23, + "epoch": 4.697444768803835, + "grad_norm": 0.2669326961040497, + "learning_rate": 4.837041089621908e-07, + "loss": 0.3565, "step": 130340 }, { - "epoch": 4.59, - "learning_rate": 9.009587974472767e-07, - "loss": 0.2536, + "epoch": 4.6976249684650595, + "grad_norm": 0.27341264486312866, + "learning_rate": 4.831330221374248e-07, + "loss": 0.4032, "step": 130345 }, { - "epoch": 4.59, - "learning_rate": 9.002010765556124e-07, - "loss": 0.23, + "epoch": 4.697805168126284, + "grad_norm": 0.2548832297325134, + "learning_rate": 4.82562269350223e-07, + "loss": 0.3906, "step": 130350 }, { - "epoch": 4.59, - "learning_rate": 8.994436685839602e-07, - "loss": 0.2501, + "epoch": 4.697985367787508, + "grad_norm": 0.29564833641052246, + "learning_rate": 4.819918506083627e-07, + "loss": 0.3535, "step": 130355 }, { - "epoch": 4.59, - "learning_rate": 8.986865735421429e-07, - "loss": 0.2553, + "epoch": 4.698165567448733, + "grad_norm": 0.29607510566711426, + "learning_rate": 4.814217659196207e-07, + "loss": 0.339, "step": 130360 }, { - "epoch": 4.59, - "learning_rate": 8.979297914400026e-07, - "loss": 0.2381, + "epoch": 4.6983457671099575, + "grad_norm": 0.2381865680217743, + "learning_rate": 4.80852015291755e-07, + "loss": 0.3721, "step": 130365 }, { - "epoch": 4.59, - "learning_rate": 8.971733222873563e-07, - "loss": 0.2512, + "epoch": 4.698525966771182, + "grad_norm": 0.2766072154045105, + "learning_rate": 4.802825987325371e-07, + "loss": 0.3651, "step": 130370 }, { - "epoch": 4.59, - 
"learning_rate": 8.964171660940323e-07, - "loss": 0.2411, + "epoch": 4.698706166432407, + "grad_norm": 0.26471713185310364, + "learning_rate": 4.797135162497219e-07, + "loss": 0.3566, "step": 130375 }, { - "epoch": 4.59, - "learning_rate": 8.956613228698479e-07, - "loss": 0.253, + "epoch": 4.698886366093632, + "grad_norm": 0.2708904445171356, + "learning_rate": 4.791447678510613e-07, + "loss": 0.39, "step": 130380 }, { - "epoch": 4.59, - "learning_rate": 8.949057926246146e-07, - "loss": 0.2265, + "epoch": 4.6990665657548565, + "grad_norm": 0.23361581563949585, + "learning_rate": 4.785763535443078e-07, + "loss": 0.3869, "step": 130385 }, { - "epoch": 4.59, - "learning_rate": 8.941505753681412e-07, - "loss": 0.2592, + "epoch": 4.699246765416081, + "grad_norm": 0.19293129444122314, + "learning_rate": 4.780082733372021e-07, + "loss": 0.3327, "step": 130390 }, { - "epoch": 4.59, - "learning_rate": 8.933956711102393e-07, - "loss": 0.2514, + "epoch": 4.699426965077306, + "grad_norm": 0.2304544597864151, + "learning_rate": 4.774405272374827e-07, + "loss": 0.3904, "step": 130395 }, { - "epoch": 4.59, - "learning_rate": 8.926410798607066e-07, - "loss": 0.2592, + "epoch": 4.699607164738531, + "grad_norm": 0.2673514187335968, + "learning_rate": 4.768731152528933e-07, + "loss": 0.4132, "step": 130400 }, { - "epoch": 4.59, - "learning_rate": 8.918868016293435e-07, - "loss": 0.2251, + "epoch": 4.699787364399755, + "grad_norm": 0.27514368295669556, + "learning_rate": 4.763060373911582e-07, + "loss": 0.3417, "step": 130405 }, { - "epoch": 4.59, - "learning_rate": 8.911328364259397e-07, - "loss": 0.2537, + "epoch": 4.699967564060979, + "grad_norm": 0.24293185770511627, + "learning_rate": 4.7573929366000746e-07, + "loss": 0.3843, "step": 130410 }, { - "epoch": 4.59, - "learning_rate": 8.903791842602899e-07, - "loss": 0.2395, + "epoch": 4.700147763722204, + "grad_norm": 0.27645057439804077, + "learning_rate": 4.751728840671599e-07, + "loss": 0.3684, "step": 130415 }, { - "epoch": 4.59, - "learning_rate": 8.896258451421752e-07, - "loss": 0.2495, + "epoch": 4.700327963383429, + "grad_norm": 0.23572184145450592, + "learning_rate": 4.7460680862033146e-07, + "loss": 0.3592, "step": 130420 }, { - "epoch": 4.59, - "learning_rate": 8.888728190813822e-07, - "loss": 0.2558, + "epoch": 4.7005081630446535, + "grad_norm": 0.26540112495422363, + "learning_rate": 4.740410673272383e-07, + "loss": 0.3695, "step": 130425 }, { - "epoch": 4.59, - "learning_rate": 8.881201060876837e-07, - "loss": 0.2303, + "epoch": 4.700688362705878, + "grad_norm": 0.29115819931030273, + "learning_rate": 4.7347566019558807e-07, + "loss": 0.3774, "step": 130430 }, { - "epoch": 4.59, - "learning_rate": 8.873677061708551e-07, - "loss": 0.2421, + "epoch": 4.700868562367103, + "grad_norm": 0.239894837141037, + "learning_rate": 4.72910587233083e-07, + "loss": 0.3866, "step": 130435 }, { - "epoch": 4.59, - "learning_rate": 8.866156193406694e-07, - "loss": 0.2329, + "epoch": 4.701048762028328, + "grad_norm": 0.2407197803258896, + "learning_rate": 4.723458484474225e-07, + "loss": 0.3989, "step": 130440 }, { - "epoch": 4.59, - "learning_rate": 8.858638456068852e-07, - "loss": 0.2282, + "epoch": 4.701228961689552, + "grad_norm": 0.20911681652069092, + "learning_rate": 4.717814438462975e-07, + "loss": 0.3579, "step": 130445 }, { - "epoch": 4.59, - "learning_rate": 8.851123849792698e-07, - "loss": 0.2595, + "epoch": 4.701409161350776, + "grad_norm": 0.3005968928337097, + "learning_rate": 4.712173734374048e-07, + "loss": 0.3647, "step": 130450 }, { - "epoch": 
4.59, - "learning_rate": 8.843612374675764e-07, - "loss": 0.2371, + "epoch": 4.701589361012001, + "grad_norm": 0.24574686586856842, + "learning_rate": 4.706536372284298e-07, + "loss": 0.3752, "step": 130455 }, { - "epoch": 4.59, - "learning_rate": 8.836104030815612e-07, - "loss": 0.2655, + "epoch": 4.701769560673226, + "grad_norm": 0.33672651648521423, + "learning_rate": 4.700902352270442e-07, + "loss": 0.4029, "step": 130460 }, { - "epoch": 4.59, - "learning_rate": 8.828598818309663e-07, - "loss": 0.248, + "epoch": 4.7019497603344504, + "grad_norm": 0.22763417661190033, + "learning_rate": 4.695271674409335e-07, + "loss": 0.3716, "step": 130465 }, { - "epoch": 4.59, - "learning_rate": 8.82109673725548e-07, - "loss": 0.2432, + "epoch": 4.702129959995675, + "grad_norm": 0.24694277346134186, + "learning_rate": 4.689644338777638e-07, + "loss": 0.3485, "step": 130470 }, { - "epoch": 4.59, - "learning_rate": 8.813597787750372e-07, - "loss": 0.2633, + "epoch": 4.7023101596569, + "grad_norm": 0.21548400819301605, + "learning_rate": 4.684020345452067e-07, + "loss": 0.3963, "step": 130475 }, { - "epoch": 4.59, - "learning_rate": 8.806101969891761e-07, - "loss": 0.2779, + "epoch": 4.702490359318125, + "grad_norm": 0.260619580745697, + "learning_rate": 4.678399694509228e-07, + "loss": 0.4, "step": 130480 }, { - "epoch": 4.59, - "learning_rate": 8.79860928377696e-07, - "loss": 0.2375, + "epoch": 4.702670558979349, + "grad_norm": 0.31047627329826355, + "learning_rate": 4.67278238602567e-07, + "loss": 0.3997, "step": 130485 }, { - "epoch": 4.59, - "learning_rate": 8.791119729503278e-07, - "loss": 0.2311, + "epoch": 4.702850758640574, + "grad_norm": 0.2056533545255661, + "learning_rate": 4.6671684200779717e-07, + "loss": 0.3476, "step": 130490 }, { - "epoch": 4.59, - "learning_rate": 8.783633307167943e-07, - "loss": 0.2546, + "epoch": 4.703030958301799, + "grad_norm": 0.24057982861995697, + "learning_rate": 4.6615577967425996e-07, + "loss": 0.3588, "step": 130495 }, { - "epoch": 4.59, - "learning_rate": 8.776150016868157e-07, - "loss": 0.2328, + "epoch": 4.703211157963023, + "grad_norm": 0.25371670722961426, + "learning_rate": 4.6559505160959916e-07, + "loss": 0.4112, "step": 130500 }, { - "epoch": 4.59, - "eval_loss": 0.24856102466583252, - "eval_runtime": 10.5535, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, + "epoch": 4.703211157963023, + "eval_loss": 0.42873358726501465, + "eval_runtime": 3.5322, + "eval_samples_per_second": 28.311, + "eval_steps_per_second": 7.078, "step": 130500 }, { - "epoch": 4.59, - "learning_rate": 8.768669858701061e-07, - "loss": 0.2428, + "epoch": 4.703391357624247, + "grad_norm": 0.3009524345397949, + "learning_rate": 4.6503465782145873e-07, + "loss": 0.3473, "step": 130505 }, { - "epoch": 4.59, - "learning_rate": 8.761192832763859e-07, - "loss": 0.2483, + "epoch": 4.703571557285472, + "grad_norm": 0.27046623826026917, + "learning_rate": 4.6447459831746586e-07, + "loss": 0.395, "step": 130510 }, { - "epoch": 4.59, - "learning_rate": 8.753718939153555e-07, - "loss": 0.2551, + "epoch": 4.703751756946697, + "grad_norm": 0.2447580248117447, + "learning_rate": 4.639148731052617e-07, + "loss": 0.3678, "step": 130515 }, { - "epoch": 4.59, - "learning_rate": 8.746248177967209e-07, - "loss": 0.2682, + "epoch": 4.703931956607922, + "grad_norm": 0.28649699687957764, + "learning_rate": 4.6335548219246517e-07, + "loss": 0.3448, "step": 130520 }, { - "epoch": 4.59, - "learning_rate": 8.738780549301884e-07, - "loss": 0.2427, + "epoch": 4.704112156269146, + 
"grad_norm": 0.28706684708595276, + "learning_rate": 4.6279642558670064e-07, + "loss": 0.3887, "step": 130525 }, { - "epoch": 4.59, - "learning_rate": 8.731316053254446e-07, - "loss": 0.2679, + "epoch": 4.704292355930371, + "grad_norm": 0.20302236080169678, + "learning_rate": 4.622377032955816e-07, + "loss": 0.3479, "step": 130530 }, { - "epoch": 4.59, - "learning_rate": 8.723854689921901e-07, - "loss": 0.2462, + "epoch": 4.704472555591596, + "grad_norm": 0.2422175258398056, + "learning_rate": 4.616793153267268e-07, + "loss": 0.3707, "step": 130535 }, { - "epoch": 4.59, - "learning_rate": 8.716396459401088e-07, - "loss": 0.2441, + "epoch": 4.70465275525282, + "grad_norm": 0.25601354241371155, + "learning_rate": 4.6112126168773587e-07, + "loss": 0.3576, "step": 130540 }, { - "epoch": 4.59, - "learning_rate": 8.708941361788847e-07, - "loss": 0.2305, + "epoch": 4.704832954914044, + "grad_norm": 0.2792492210865021, + "learning_rate": 4.60563542386222e-07, + "loss": 0.3739, "step": 130545 }, { - "epoch": 4.59, - "learning_rate": 8.701489397181989e-07, - "loss": 0.2483, + "epoch": 4.705013154575269, + "grad_norm": 0.22106266021728516, + "learning_rate": 4.600061574297765e-07, + "loss": 0.3579, "step": 130550 }, { - "epoch": 4.59, - "learning_rate": 8.694040565677297e-07, - "loss": 0.238, + "epoch": 4.705193354236494, + "grad_norm": 0.2654285430908203, + "learning_rate": 4.5944910682599597e-07, + "loss": 0.3411, "step": 130555 }, { - "epoch": 4.59, - "learning_rate": 8.686594867371417e-07, - "loss": 0.2371, + "epoch": 4.705373553897719, + "grad_norm": 0.32557204365730286, + "learning_rate": 4.588923905824688e-07, + "loss": 0.3711, "step": 130560 }, { - "epoch": 4.59, - "learning_rate": 8.679152302361104e-07, - "loss": 0.2677, + "epoch": 4.705553753558943, + "grad_norm": 0.308724969625473, + "learning_rate": 4.583360087067834e-07, + "loss": 0.3891, "step": 130565 }, { - "epoch": 4.59, - "learning_rate": 8.671712870742948e-07, - "loss": 0.2529, + "epoch": 4.705733953220168, + "grad_norm": 0.2696959376335144, + "learning_rate": 4.577799612065198e-07, + "loss": 0.3632, "step": 130570 }, { - "epoch": 4.59, - "learning_rate": 8.664276572613595e-07, - "loss": 0.2513, + "epoch": 4.705914152881393, + "grad_norm": 0.280472069978714, + "learning_rate": 4.572242480892497e-07, + "loss": 0.3685, "step": 130575 }, { - "epoch": 4.59, - "learning_rate": 8.656843408069493e-07, - "loss": 0.2493, + "epoch": 4.7060943525426175, + "grad_norm": 0.26516270637512207, + "learning_rate": 4.566688693625476e-07, + "loss": 0.3442, "step": 130580 }, { - "epoch": 4.59, - "learning_rate": 8.64941337720726e-07, - "loss": 0.2527, + "epoch": 4.706274552203842, + "grad_norm": 0.2824823558330536, + "learning_rate": 4.5611382503398246e-07, + "loss": 0.372, "step": 130585 }, { - "epoch": 4.59, - "learning_rate": 8.641986480123349e-07, - "loss": 0.2418, + "epoch": 4.706454751865066, + "grad_norm": 0.2615325450897217, + "learning_rate": 4.5555911511111495e-07, + "loss": 0.3817, "step": 130590 }, { - "epoch": 4.59, - "learning_rate": 8.634562716914179e-07, - "loss": 0.2504, + "epoch": 4.706634951526291, + "grad_norm": 0.2668514549732208, + "learning_rate": 4.5500473960150004e-07, + "loss": 0.3485, "step": 130595 }, { - "epoch": 4.59, - "learning_rate": 8.627142087676121e-07, - "loss": 0.257, + "epoch": 4.706815151187516, + "grad_norm": 0.286246657371521, + "learning_rate": 4.5445069851269564e-07, + "loss": 0.4019, "step": 130600 }, { - "epoch": 4.6, - "learning_rate": 8.619724592505596e-07, - "loss": 0.2459, + "epoch": 4.70699535084874, + 
"grad_norm": 0.26146531105041504, + "learning_rate": 4.5389699185224564e-07, + "loss": 0.3399, "step": 130605 }, { - "epoch": 4.6, - "learning_rate": 8.61231023149886e-07, - "loss": 0.2586, + "epoch": 4.707175550509965, + "grad_norm": 0.27129244804382324, + "learning_rate": 4.5334361962769965e-07, + "loss": 0.384, "step": 130610 }, { - "epoch": 4.6, - "learning_rate": 8.604899004752198e-07, - "loss": 0.2216, + "epoch": 4.70735575017119, + "grad_norm": 0.21053126454353333, + "learning_rate": 4.527905818465933e-07, + "loss": 0.3281, "step": 130615 }, { - "epoch": 4.6, - "learning_rate": 8.59749091236181e-07, - "loss": 0.2588, + "epoch": 4.7075359498324145, + "grad_norm": 0.287487655878067, + "learning_rate": 4.5223787851646215e-07, + "loss": 0.3927, "step": 130620 }, { - "epoch": 4.6, - "learning_rate": 8.590085954423955e-07, - "loss": 0.2517, + "epoch": 4.707716149493639, + "grad_norm": 0.25808537006378174, + "learning_rate": 4.5168550964483915e-07, + "loss": 0.3529, "step": 130625 }, { - "epoch": 4.6, - "learning_rate": 8.582684131034718e-07, - "loss": 0.2439, + "epoch": 4.707896349154863, + "grad_norm": 0.21509693562984467, + "learning_rate": 4.51133475239246e-07, + "loss": 0.3739, "step": 130630 }, { - "epoch": 4.6, - "learning_rate": 8.575285442290193e-07, - "loss": 0.2547, + "epoch": 4.708076548816088, + "grad_norm": 0.3704529404640198, + "learning_rate": 4.5058177530720724e-07, + "loss": 0.3907, "step": 130635 }, { - "epoch": 4.6, - "learning_rate": 8.567889888286495e-07, - "loss": 0.265, + "epoch": 4.708256748477313, + "grad_norm": 0.25836381316185, + "learning_rate": 4.500304098562419e-07, + "loss": 0.3895, "step": 130640 }, { - "epoch": 4.6, - "learning_rate": 8.56049746911966e-07, - "loss": 0.2511, + "epoch": 4.708436948138537, + "grad_norm": 0.21811331808567047, + "learning_rate": 4.4947937889385507e-07, + "loss": 0.382, "step": 130645 }, { - "epoch": 4.6, - "learning_rate": 8.553108184885666e-07, - "loss": 0.2476, + "epoch": 4.708617147799762, + "grad_norm": 0.24641865491867065, + "learning_rate": 4.489286824275601e-07, + "loss": 0.3452, "step": 130650 }, { - "epoch": 4.6, - "learning_rate": 8.545722035680409e-07, - "loss": 0.2505, + "epoch": 4.708797347460987, + "grad_norm": 0.23483498394489288, + "learning_rate": 4.483783204648595e-07, + "loss": 0.3834, "step": 130655 }, { - "epoch": 4.6, - "learning_rate": 8.538339021599839e-07, - "loss": 0.2566, + "epoch": 4.7089775471222115, + "grad_norm": 0.28817734122276306, + "learning_rate": 4.4782829301324993e-07, + "loss": 0.3866, "step": 130660 }, { - "epoch": 4.6, - "learning_rate": 8.530959142739797e-07, - "loss": 0.2655, + "epoch": 4.709157746783436, + "grad_norm": 0.2239607572555542, + "learning_rate": 4.4738851190498645e-07, + "loss": 0.3863, "step": 130665 }, { - "epoch": 4.6, - "learning_rate": 8.523582399196151e-07, - "loss": 0.2626, + "epoch": 4.709337946444661, + "grad_norm": 0.22553732991218567, + "learning_rate": 4.4683908659222396e-07, + "loss": 0.3521, "step": 130670 }, { - "epoch": 4.6, - "learning_rate": 8.5162087910646e-07, - "loss": 0.2215, + "epoch": 4.709518146105886, + "grad_norm": 0.21854856610298157, + "learning_rate": 4.4628999581152463e-07, + "loss": 0.3594, "step": 130675 }, { - "epoch": 4.6, - "learning_rate": 8.508838318440959e-07, - "loss": 0.2521, + "epoch": 4.70969834576711, + "grad_norm": 0.22778260707855225, + "learning_rate": 4.457412395703714e-07, + "loss": 0.3684, "step": 130680 }, { - "epoch": 4.6, - "learning_rate": 8.501470981420928e-07, - "loss": 0.26, + "epoch": 4.709878545428334, + 
"grad_norm": 0.20570991933345795, + "learning_rate": 4.451928178762416e-07, + "loss": 0.3913, "step": 130685 }, { - "epoch": 4.6, - "learning_rate": 8.494106780100098e-07, - "loss": 0.229, + "epoch": 4.710058745089559, + "grad_norm": 0.26178503036499023, + "learning_rate": 4.446447307366042e-07, + "loss": 0.3578, "step": 130690 }, { - "epoch": 4.6, - "learning_rate": 8.48674571457414e-07, - "loss": 0.2412, + "epoch": 4.710238944750784, + "grad_norm": 0.2906634211540222, + "learning_rate": 4.4409697815892827e-07, + "loss": 0.3814, "step": 130695 }, { - "epoch": 4.6, - "learning_rate": 8.479387784938647e-07, - "loss": 0.2515, + "epoch": 4.7104191444120085, + "grad_norm": 0.24634572863578796, + "learning_rate": 4.435495601506745e-07, + "loss": 0.3709, "step": 130700 }, { - "epoch": 4.6, - "learning_rate": 8.472032991289125e-07, - "loss": 0.2331, + "epoch": 4.710599344073233, + "grad_norm": 0.205975741147995, + "learning_rate": 4.430024767193064e-07, + "loss": 0.3627, "step": 130705 }, { - "epoch": 4.6, - "learning_rate": 8.464681333721109e-07, - "loss": 0.2427, + "epoch": 4.710779543734458, + "grad_norm": 0.24753928184509277, + "learning_rate": 4.4245572787227905e-07, + "loss": 0.3751, "step": 130710 }, { - "epoch": 4.6, - "learning_rate": 8.457332812329966e-07, - "loss": 0.26, + "epoch": 4.710959743395683, + "grad_norm": 0.24739667773246765, + "learning_rate": 4.41909313617031e-07, + "loss": 0.3678, "step": 130715 }, { - "epoch": 4.6, - "learning_rate": 8.449987427211231e-07, - "loss": 0.2556, + "epoch": 4.711139943056907, + "grad_norm": 0.19426771998405457, + "learning_rate": 4.4136323396101743e-07, + "loss": 0.3344, "step": 130720 }, { - "epoch": 4.6, - "learning_rate": 8.442645178460217e-07, - "loss": 0.2508, + "epoch": 4.711320142718131, + "grad_norm": 0.2439907044172287, + "learning_rate": 4.408174889116712e-07, + "loss": 0.3633, "step": 130725 }, { - "epoch": 4.6, - "learning_rate": 8.435306066172238e-07, - "loss": 0.2447, + "epoch": 4.711500342379356, + "grad_norm": 0.2416488081216812, + "learning_rate": 4.402720784764391e-07, + "loss": 0.3851, "step": 130730 }, { - "epoch": 4.6, - "learning_rate": 8.427970090442605e-07, - "loss": 0.2612, + "epoch": 4.711680542040581, + "grad_norm": 0.23560959100723267, + "learning_rate": 4.3972700266274035e-07, + "loss": 0.3564, "step": 130735 }, { - "epoch": 4.6, - "learning_rate": 8.420637251366604e-07, - "loss": 0.2698, + "epoch": 4.7118607417018055, + "grad_norm": 0.24532419443130493, + "learning_rate": 4.3918226147800503e-07, + "loss": 0.3914, "step": 130740 }, { - "epoch": 4.6, - "learning_rate": 8.413307549039406e-07, - "loss": 0.2544, + "epoch": 4.71204094136303, + "grad_norm": 0.25437724590301514, + "learning_rate": 4.386378549296577e-07, + "loss": 0.3805, "step": 130745 }, { - "epoch": 4.6, - "learning_rate": 8.405980983556216e-07, - "loss": 0.2537, + "epoch": 4.712221141024255, + "grad_norm": 0.2842262089252472, + "learning_rate": 4.380937830251175e-07, + "loss": 0.3494, "step": 130750 }, { - "epoch": 4.6, - "learning_rate": 8.398657555012123e-07, - "loss": 0.2554, + "epoch": 4.71240134068548, + "grad_norm": 0.21695636212825775, + "learning_rate": 4.375500457717896e-07, + "loss": 0.3494, "step": 130755 }, { - "epoch": 4.6, - "learning_rate": 8.391337263502247e-07, - "loss": 0.2342, + "epoch": 4.712581540346704, + "grad_norm": 0.26148954033851624, + "learning_rate": 4.3700664317709026e-07, + "loss": 0.3834, "step": 130760 }, { - "epoch": 4.6, - "learning_rate": 8.384020109121649e-07, - "loss": 0.2464, + "epoch": 4.712761740007929, + 
"grad_norm": 0.31055474281311035, + "learning_rate": 4.364635752484164e-07, + "loss": 0.3661, "step": 130765 }, { - "epoch": 4.6, - "learning_rate": 8.376706091965309e-07, - "loss": 0.2523, + "epoch": 4.712941939669154, + "grad_norm": 0.32295408844947815, + "learning_rate": 4.3592084199317316e-07, + "loss": 0.3913, "step": 130770 }, { - "epoch": 4.6, - "learning_rate": 8.369395212128206e-07, - "loss": 0.2362, + "epoch": 4.713122139330378, + "grad_norm": 0.239763081073761, + "learning_rate": 4.353784434187519e-07, + "loss": 0.3283, "step": 130775 }, { - "epoch": 4.6, - "learning_rate": 8.362087469705265e-07, - "loss": 0.2474, + "epoch": 4.7133023389916024, + "grad_norm": 0.2873198986053467, + "learning_rate": 4.3483637953254384e-07, + "loss": 0.3751, "step": 130780 }, { - "epoch": 4.6, - "learning_rate": 8.354782864791383e-07, - "loss": 0.2297, + "epoch": 4.713482538652827, + "grad_norm": 0.25636720657348633, + "learning_rate": 4.3429465034193485e-07, + "loss": 0.3496, "step": 130785 }, { - "epoch": 4.6, - "learning_rate": 8.347481397481344e-07, - "loss": 0.2467, + "epoch": 4.713662738314052, + "grad_norm": 0.29088860750198364, + "learning_rate": 4.33753255854305e-07, + "loss": 0.3963, "step": 130790 }, { - "epoch": 4.6, - "learning_rate": 8.340183067870072e-07, - "loss": 0.2582, + "epoch": 4.713842937975277, + "grad_norm": 0.2415892779827118, + "learning_rate": 4.3321219607702634e-07, + "loss": 0.3655, "step": 130795 }, { - "epoch": 4.6, - "learning_rate": 8.332887876052215e-07, - "loss": 0.2395, + "epoch": 4.714023137636501, + "grad_norm": 0.2409953773021698, + "learning_rate": 4.3267147101747896e-07, + "loss": 0.3836, "step": 130800 }, { - "epoch": 4.6, - "learning_rate": 8.325595822122556e-07, - "loss": 0.273, + "epoch": 4.714203337297726, + "grad_norm": 0.2819431722164154, + "learning_rate": 4.321310806830292e-07, + "loss": 0.3651, "step": 130805 }, { - "epoch": 4.6, - "learning_rate": 8.318306906175771e-07, - "loss": 0.2424, + "epoch": 4.714383536958951, + "grad_norm": 0.24938331544399261, + "learning_rate": 4.3159102508103233e-07, + "loss": 0.3752, "step": 130810 }, { - "epoch": 4.6, - "learning_rate": 8.311021128306506e-07, - "loss": 0.235, + "epoch": 4.714563736620175, + "grad_norm": 0.2521766424179077, + "learning_rate": 4.3105130421885463e-07, + "loss": 0.3826, "step": 130815 }, { - "epoch": 4.6, - "learning_rate": 8.303738488609353e-07, - "loss": 0.2673, + "epoch": 4.714743936281399, + "grad_norm": 0.25512444972991943, + "learning_rate": 4.3051191810384304e-07, + "loss": 0.3638, "step": 130820 }, { - "epoch": 4.6, - "learning_rate": 8.296458987178873e-07, - "loss": 0.2706, + "epoch": 4.714924135942624, + "grad_norm": 0.2716846466064453, + "learning_rate": 4.299728667433528e-07, + "loss": 0.3844, "step": 130825 }, { - "epoch": 4.6, - "learning_rate": 8.289182624109549e-07, - "loss": 0.275, + "epoch": 4.715104335603849, + "grad_norm": 0.2582548260688782, + "learning_rate": 4.2943415014472523e-07, + "loss": 0.3979, "step": 130830 }, { - "epoch": 4.6, - "learning_rate": 8.28190939949594e-07, - "loss": 0.2573, + "epoch": 4.715284535265074, + "grad_norm": 0.2618710398674011, + "learning_rate": 4.288957683152961e-07, + "loss": 0.3923, "step": 130835 }, { - "epoch": 4.6, - "learning_rate": 8.274639313432419e-07, - "loss": 0.2329, + "epoch": 4.715464734926298, + "grad_norm": 0.24619565904140472, + "learning_rate": 4.283577212624096e-07, + "loss": 0.389, "step": 130840 }, { - "epoch": 4.6, - "learning_rate": 8.267372366013409e-07, - "loss": 0.2559, + "epoch": 4.715644934587523, + 
"grad_norm": 0.28034818172454834, + "learning_rate": 4.2782000899339035e-07, + "loss": 0.3825, "step": 130845 }, { - "epoch": 4.6, - "learning_rate": 8.260108557333251e-07, - "loss": 0.2578, + "epoch": 4.715825134248748, + "grad_norm": 0.27038708329200745, + "learning_rate": 4.272826315155659e-07, + "loss": 0.398, "step": 130850 }, { - "epoch": 4.6, - "learning_rate": 8.252847887486287e-07, - "loss": 0.2443, + "epoch": 4.716005333909973, + "grad_norm": 0.2420760691165924, + "learning_rate": 4.267455888362581e-07, + "loss": 0.387, "step": 130855 }, { - "epoch": 4.6, - "learning_rate": 8.245590356566745e-07, - "loss": 0.253, + "epoch": 4.716185533571197, + "grad_norm": 0.26630204916000366, + "learning_rate": 4.2620888096278346e-07, + "loss": 0.4072, "step": 130860 }, { - "epoch": 4.6, - "learning_rate": 8.238335964668914e-07, - "loss": 0.2273, + "epoch": 4.716365733232421, + "grad_norm": 0.2816385328769684, + "learning_rate": 4.256725079024554e-07, + "loss": 0.372, "step": 130865 }, { - "epoch": 4.6, - "learning_rate": 8.231084711886939e-07, - "loss": 0.2434, + "epoch": 4.716545932893646, + "grad_norm": 0.23565801978111267, + "learning_rate": 4.25136469662582e-07, + "loss": 0.3533, "step": 130870 }, { - "epoch": 4.6, - "learning_rate": 8.223836598315049e-07, - "loss": 0.2605, + "epoch": 4.716726132554871, + "grad_norm": 0.2744382321834564, + "learning_rate": 4.2460076625046865e-07, + "loss": 0.377, "step": 130875 }, { - "epoch": 4.6, - "learning_rate": 8.216591624047282e-07, - "loss": 0.2579, + "epoch": 4.716906332216095, + "grad_norm": 0.28961044549942017, + "learning_rate": 4.2406539767340934e-07, + "loss": 0.3608, "step": 130880 }, { - "epoch": 4.6, - "learning_rate": 8.209349789177728e-07, - "loss": 0.2405, + "epoch": 4.71708653187732, + "grad_norm": 0.24323128163814545, + "learning_rate": 4.2353036393870116e-07, + "loss": 0.3578, "step": 130885 }, { - "epoch": 4.61, - "learning_rate": 8.202111093800396e-07, - "loss": 0.236, + "epoch": 4.717266731538545, + "grad_norm": 0.2941163182258606, + "learning_rate": 4.2299566505363266e-07, + "loss": 0.3813, "step": 130890 }, { - "epoch": 4.61, - "learning_rate": 8.194875538009322e-07, - "loss": 0.2623, + "epoch": 4.7174469311997695, + "grad_norm": 0.28495699167251587, + "learning_rate": 4.224613010254952e-07, + "loss": 0.4055, "step": 130895 }, { - "epoch": 4.61, - "learning_rate": 8.187643121898458e-07, - "loss": 0.259, + "epoch": 4.717627130860994, + "grad_norm": 0.36021918058395386, + "learning_rate": 4.21927271861558e-07, + "loss": 0.3711, "step": 130900 }, { - "epoch": 4.61, - "learning_rate": 8.180413845561646e-07, - "loss": 0.2347, + "epoch": 4.717807330522218, + "grad_norm": 0.23534350097179413, + "learning_rate": 4.213935775691041e-07, + "loss": 0.3555, "step": 130905 }, { - "epoch": 4.61, - "learning_rate": 8.173187709092811e-07, - "loss": 0.2467, + "epoch": 4.717987530183443, + "grad_norm": 0.2570185959339142, + "learning_rate": 4.2086021815540553e-07, + "loss": 0.3603, "step": 130910 }, { - "epoch": 4.61, - "learning_rate": 8.165964712585739e-07, - "loss": 0.2806, + "epoch": 4.718167729844668, + "grad_norm": 0.207231804728508, + "learning_rate": 4.2032719362772864e-07, + "loss": 0.3445, "step": 130915 }, { - "epoch": 4.61, - "learning_rate": 8.158744856134271e-07, - "loss": 0.2558, + "epoch": 4.718347929505892, + "grad_norm": 0.24607731401920319, + "learning_rate": 4.1979450399333434e-07, + "loss": 0.3545, "step": 130920 }, { - "epoch": 4.61, - "learning_rate": 8.151528139832109e-07, - "loss": 0.25, + "epoch": 4.718528129167117, + 
"grad_norm": 0.19284264743328094, + "learning_rate": 4.1926214925948057e-07, + "loss": 0.3441, "step": 130925 }, { - "epoch": 4.61, - "learning_rate": 8.144314563772987e-07, - "loss": 0.2682, + "epoch": 4.718708328828342, + "grad_norm": 0.2840045988559723, + "learning_rate": 4.1873012943342007e-07, + "loss": 0.3717, "step": 130930 }, { - "epoch": 4.61, - "learning_rate": 8.137104128050549e-07, - "loss": 0.2685, + "epoch": 4.7188885284895665, + "grad_norm": 0.26149991154670715, + "learning_rate": 4.181984445224052e-07, + "loss": 0.352, "step": 130935 }, { - "epoch": 4.61, - "learning_rate": 8.129896832758416e-07, - "loss": 0.2251, + "epoch": 4.719068728150791, + "grad_norm": 0.2628280818462372, + "learning_rate": 4.176670945336747e-07, + "loss": 0.3827, "step": 130940 }, { - "epoch": 4.61, - "learning_rate": 8.122692677990151e-07, - "loss": 0.2476, + "epoch": 4.719248927812016, + "grad_norm": 0.29585734009742737, + "learning_rate": 4.1713607947447276e-07, + "loss": 0.3666, "step": 130945 }, { - "epoch": 4.61, - "learning_rate": 8.115491663839348e-07, - "loss": 0.2486, + "epoch": 4.719429127473241, + "grad_norm": 0.26492440700531006, + "learning_rate": 4.166053993520297e-07, + "loss": 0.3757, "step": 130950 }, { - "epoch": 4.61, - "learning_rate": 8.108293790399485e-07, - "loss": 0.2653, + "epoch": 4.7196093271344655, + "grad_norm": 0.24920345842838287, + "learning_rate": 4.160750541735814e-07, + "loss": 0.3477, "step": 130955 }, { - "epoch": 4.61, - "learning_rate": 8.101099057763989e-07, - "loss": 0.2658, + "epoch": 4.719789526795689, + "grad_norm": 0.2741335928440094, + "learning_rate": 4.1554504394634984e-07, + "loss": 0.3818, "step": 130960 }, { - "epoch": 4.61, - "learning_rate": 8.093907466026313e-07, - "loss": 0.2512, + "epoch": 4.719969726456914, + "grad_norm": 0.2235456109046936, + "learning_rate": 4.1501536867755706e-07, + "loss": 0.3711, "step": 130965 }, { - "epoch": 4.61, - "learning_rate": 8.086719015279825e-07, - "loss": 0.2522, + "epoch": 4.720149926118139, + "grad_norm": 0.27255016565322876, + "learning_rate": 4.144860283744223e-07, + "loss": 0.381, "step": 130970 }, { - "epoch": 4.61, - "learning_rate": 8.07953370561787e-07, - "loss": 0.2256, + "epoch": 4.7203301257793635, + "grad_norm": 0.3171219229698181, + "learning_rate": 4.1395702304415364e-07, + "loss": 0.3889, "step": 130975 }, { - "epoch": 4.61, - "learning_rate": 8.072351537133732e-07, - "loss": 0.275, + "epoch": 4.720510325440588, + "grad_norm": 0.2652469873428345, + "learning_rate": 4.134283526939592e-07, + "loss": 0.3715, "step": 130980 }, { - "epoch": 4.61, - "learning_rate": 8.065172509920671e-07, - "loss": 0.2418, + "epoch": 4.720690525101813, + "grad_norm": 0.2219124734401703, + "learning_rate": 4.129000173310443e-07, + "loss": 0.3463, "step": 130985 }, { - "epoch": 4.61, - "learning_rate": 8.057996624071917e-07, - "loss": 0.2487, + "epoch": 4.720870724763038, + "grad_norm": 0.24788452684879303, + "learning_rate": 4.123720169626061e-07, + "loss": 0.3595, "step": 130990 }, { - "epoch": 4.61, - "learning_rate": 8.050823879680619e-07, - "loss": 0.2264, + "epoch": 4.721050924424262, + "grad_norm": 0.21029269695281982, + "learning_rate": 4.1184435159584143e-07, + "loss": 0.3676, "step": 130995 }, { - "epoch": 4.61, - "learning_rate": 8.043654276839951e-07, - "loss": 0.2602, + "epoch": 4.721231124085486, + "grad_norm": 0.28664112091064453, + "learning_rate": 4.1131702123793357e-07, + "loss": 0.3637, "step": 131000 }, { - "epoch": 4.61, - "eval_loss": 0.24853986501693726, - "eval_runtime": 10.534, - 
"eval_samples_per_second": 9.493, - "eval_steps_per_second": 9.493, + "epoch": 4.721231124085486, + "eval_loss": 0.42884311079978943, + "eval_runtime": 3.5394, + "eval_samples_per_second": 28.253, + "eval_steps_per_second": 7.063, "step": 131000 }, { - "epoch": 4.61, - "learning_rate": 8.036487815642923e-07, - "loss": 0.2605, + "epoch": 4.721411323746711, + "grad_norm": 0.3436447083950043, + "learning_rate": 4.107900258960712e-07, + "loss": 0.39, "step": 131005 }, { - "epoch": 4.61, - "learning_rate": 8.029324496182683e-07, - "loss": 0.2415, + "epoch": 4.721591523407936, + "grad_norm": 0.20727182924747467, + "learning_rate": 4.102633655774374e-07, + "loss": 0.3699, "step": 131010 }, { - "epoch": 4.61, - "learning_rate": 8.022164318552184e-07, - "loss": 0.2655, + "epoch": 4.7217717230691605, + "grad_norm": 0.24989864230155945, + "learning_rate": 4.0973704028920423e-07, + "loss": 0.3706, "step": 131015 }, { - "epoch": 4.61, - "learning_rate": 8.015007282844405e-07, - "loss": 0.2641, + "epoch": 4.721951922730385, + "grad_norm": 0.24569204449653625, + "learning_rate": 4.092110500385382e-07, + "loss": 0.3611, "step": 131020 }, { - "epoch": 4.61, - "learning_rate": 8.007853389152276e-07, - "loss": 0.2411, + "epoch": 4.72213212239161, + "grad_norm": 0.2474668025970459, + "learning_rate": 4.0868539483261413e-07, + "loss": 0.3665, "step": 131025 }, { - "epoch": 4.61, - "learning_rate": 8.000702637568719e-07, - "loss": 0.2311, + "epoch": 4.722312322052835, + "grad_norm": 0.2459608018398285, + "learning_rate": 4.081600746785874e-07, + "loss": 0.3716, "step": 131030 }, { - "epoch": 4.61, - "learning_rate": 7.99355502818655e-07, - "loss": 0.2515, + "epoch": 4.722492521714059, + "grad_norm": 0.2828958034515381, + "learning_rate": 4.076350895836245e-07, + "loss": 0.3439, "step": 131035 }, { - "epoch": 4.61, - "learning_rate": 7.986410561098556e-07, - "loss": 0.2698, + "epoch": 4.722672721375284, + "grad_norm": 0.4490903317928314, + "learning_rate": 4.0711043955486974e-07, + "loss": 0.3709, "step": 131040 }, { - "epoch": 4.61, - "learning_rate": 7.979269236397552e-07, - "loss": 0.2797, + "epoch": 4.722852921036509, + "grad_norm": 0.30186668038368225, + "learning_rate": 4.065861245994701e-07, + "loss": 0.3289, "step": 131045 }, { - "epoch": 4.61, - "learning_rate": 7.97213105417624e-07, - "loss": 0.2432, + "epoch": 4.723033120697733, + "grad_norm": 0.2187676876783371, + "learning_rate": 4.0606214472457826e-07, + "loss": 0.3377, "step": 131050 }, { - "epoch": 4.61, - "learning_rate": 7.964996014527298e-07, - "loss": 0.2413, + "epoch": 4.7232133203589575, + "grad_norm": 0.21422752737998962, + "learning_rate": 4.055384999373246e-07, + "loss": 0.3654, "step": 131055 }, { - "epoch": 4.61, - "learning_rate": 7.957864117543373e-07, - "loss": 0.2622, + "epoch": 4.723393520020182, + "grad_norm": 0.20950256288051605, + "learning_rate": 4.050151902448507e-07, + "loss": 0.375, "step": 131060 }, { - "epoch": 4.61, - "learning_rate": 7.950735363317085e-07, - "loss": 0.2344, + "epoch": 4.723573719681407, + "grad_norm": 0.23349595069885254, + "learning_rate": 4.044922156542813e-07, + "loss": 0.371, "step": 131065 }, { - "epoch": 4.61, - "learning_rate": 7.943609751940972e-07, - "loss": 0.2349, + "epoch": 4.723753919342632, + "grad_norm": 0.2319730669260025, + "learning_rate": 4.0396957617274134e-07, + "loss": 0.3757, "step": 131070 }, { - "epoch": 4.61, - "learning_rate": 7.936487283507543e-07, - "loss": 0.2351, + "epoch": 4.723934119003856, + "grad_norm": 0.22478604316711426, + "learning_rate": 4.034472718073556e-07, + 
"loss": 0.3753, "step": 131075 }, { - "epoch": 4.61, - "learning_rate": 7.929367958109307e-07, - "loss": 0.2361, + "epoch": 4.724114318665081, + "grad_norm": 0.19446243345737457, + "learning_rate": 4.02925302565238e-07, + "loss": 0.3447, "step": 131080 }, { - "epoch": 4.61, - "learning_rate": 7.922251775838718e-07, - "loss": 0.2419, + "epoch": 4.724294518326306, + "grad_norm": 0.2668735682964325, + "learning_rate": 4.0240366845350485e-07, + "loss": 0.3687, "step": 131085 }, { - "epoch": 4.61, - "learning_rate": 7.915138736788147e-07, - "loss": 0.261, + "epoch": 4.72447471798753, + "grad_norm": 0.22886505722999573, + "learning_rate": 4.018823694792562e-07, + "loss": 0.3632, "step": 131090 }, { - "epoch": 4.61, - "learning_rate": 7.908028841049992e-07, - "loss": 0.2685, + "epoch": 4.7246549176487544, + "grad_norm": 0.25273075699806213, + "learning_rate": 4.0136140564959735e-07, + "loss": 0.3876, "step": 131095 }, { - "epoch": 4.61, - "learning_rate": 7.900922088716483e-07, - "loss": 0.2618, + "epoch": 4.724835117309979, + "grad_norm": 0.2730422914028168, + "learning_rate": 4.0084077697162826e-07, + "loss": 0.4057, "step": 131100 }, { - "epoch": 4.61, - "learning_rate": 7.893818479879966e-07, - "loss": 0.2682, + "epoch": 4.725015316971204, + "grad_norm": 0.21895499527454376, + "learning_rate": 4.003204834524432e-07, + "loss": 0.3848, "step": 131105 }, { - "epoch": 4.61, - "learning_rate": 7.886718014632699e-07, - "loss": 0.2335, + "epoch": 4.725195516632429, + "grad_norm": 0.23920688033103943, + "learning_rate": 3.9980052509912267e-07, + "loss": 0.3527, "step": 131110 }, { - "epoch": 4.61, - "learning_rate": 7.879620693066803e-07, - "loss": 0.2395, + "epoch": 4.725375716293653, + "grad_norm": 0.2694016695022583, + "learning_rate": 3.99280901918761e-07, + "loss": 0.3571, "step": 131115 }, { - "epoch": 4.61, - "learning_rate": 7.872526515274481e-07, - "loss": 0.268, + "epoch": 4.725555915954878, + "grad_norm": 0.2753653824329376, + "learning_rate": 3.987616139184358e-07, + "loss": 0.3968, "step": 131120 }, { - "epoch": 4.61, - "learning_rate": 7.865435481347827e-07, - "loss": 0.2562, + "epoch": 4.725736115616103, + "grad_norm": 0.24241603910923004, + "learning_rate": 3.9824266110521924e-07, + "loss": 0.3743, "step": 131125 }, { - "epoch": 4.61, - "learning_rate": 7.858347591378907e-07, - "loss": 0.228, + "epoch": 4.725916315277328, + "grad_norm": 0.28032630681991577, + "learning_rate": 3.977240434861834e-07, + "loss": 0.382, "step": 131130 }, { - "epoch": 4.61, - "learning_rate": 7.851262845459756e-07, - "loss": 0.2396, + "epoch": 4.726096514938552, + "grad_norm": 0.2554890215396881, + "learning_rate": 3.972057610683921e-07, + "loss": 0.4077, "step": 131135 }, { - "epoch": 4.61, - "learning_rate": 7.844181243682414e-07, - "loss": 0.2498, + "epoch": 4.726276714599777, + "grad_norm": 0.27855169773101807, + "learning_rate": 3.9668781385891194e-07, + "loss": 0.3812, "step": 131140 }, { - "epoch": 4.61, - "learning_rate": 7.837102786138806e-07, - "loss": 0.2512, + "epoch": 4.726456914261001, + "grad_norm": 0.23848845064640045, + "learning_rate": 3.9617020186479557e-07, + "loss": 0.3784, "step": 131145 }, { - "epoch": 4.61, - "learning_rate": 7.830027472920803e-07, - "loss": 0.2554, + "epoch": 4.726637113922226, + "grad_norm": 0.2614748477935791, + "learning_rate": 3.956529250930985e-07, + "loss": 0.3939, "step": 131150 }, { - "epoch": 4.61, - "learning_rate": 7.822955304120277e-07, - "loss": 0.2474, + "epoch": 4.72681731358345, + "grad_norm": 0.22815276682376862, + "learning_rate": 
3.95135983550865e-07, + "loss": 0.3932, "step": 131155 }, { - "epoch": 4.61, - "learning_rate": 7.815886279829098e-07, - "loss": 0.2653, + "epoch": 4.726997513244675, + "grad_norm": 0.28664857149124146, + "learning_rate": 3.9461937724514233e-07, + "loss": 0.369, "step": 131160 }, { - "epoch": 4.61, - "learning_rate": 7.808820400139028e-07, - "loss": 0.2367, + "epoch": 4.7271777129059, + "grad_norm": 0.25510892271995544, + "learning_rate": 3.941031061829636e-07, + "loss": 0.3868, "step": 131165 }, { - "epoch": 4.61, - "learning_rate": 7.801757665141824e-07, - "loss": 0.2574, + "epoch": 4.727357912567125, + "grad_norm": 0.227322056889534, + "learning_rate": 3.9358717037136786e-07, + "loss": 0.357, "step": 131170 }, { - "epoch": 4.62, - "learning_rate": 7.794698074929163e-07, - "loss": 0.2576, + "epoch": 4.727538112228349, + "grad_norm": 0.2149222195148468, + "learning_rate": 3.930715698173826e-07, + "loss": 0.3443, "step": 131175 }, { - "epoch": 4.62, - "learning_rate": 7.787641629592752e-07, - "loss": 0.2521, + "epoch": 4.727718311889574, + "grad_norm": 0.2911987900733948, + "learning_rate": 3.925563045280328e-07, + "loss": 0.385, "step": 131180 }, { - "epoch": 4.62, - "learning_rate": 7.780588329224154e-07, - "loss": 0.2378, + "epoch": 4.727898511550798, + "grad_norm": 0.22979766130447388, + "learning_rate": 3.9204137451034076e-07, + "loss": 0.3557, "step": 131185 }, { - "epoch": 4.62, - "learning_rate": 7.773538173915018e-07, - "loss": 0.2399, + "epoch": 4.728078711212023, + "grad_norm": 0.24125829339027405, + "learning_rate": 3.9152677977131734e-07, + "loss": 0.3766, "step": 131190 }, { - "epoch": 4.62, - "learning_rate": 7.766491163756828e-07, - "loss": 0.2391, + "epoch": 4.728258910873247, + "grad_norm": 0.270080029964447, + "learning_rate": 3.910125203179821e-07, + "loss": 0.3529, "step": 131195 }, { - "epoch": 4.62, - "learning_rate": 7.759447298841149e-07, - "loss": 0.2725, + "epoch": 4.728439110534472, + "grad_norm": 0.20879442989826202, + "learning_rate": 3.904985961573349e-07, + "loss": 0.3869, "step": 131200 }, { - "epoch": 4.62, - "learning_rate": 7.75240657925938e-07, - "loss": 0.257, + "epoch": 4.728619310195697, + "grad_norm": 0.23659254610538483, + "learning_rate": 3.899850072963784e-07, + "loss": 0.3665, "step": 131205 }, { - "epoch": 4.62, - "learning_rate": 7.745369005102976e-07, - "loss": 0.2435, + "epoch": 4.7287995098569215, + "grad_norm": 0.23854908347129822, + "learning_rate": 3.8947175374211274e-07, + "loss": 0.3918, "step": 131210 }, { - "epoch": 4.62, - "learning_rate": 7.73833457646328e-07, - "loss": 0.247, + "epoch": 4.728979709518146, + "grad_norm": 0.2165476530790329, + "learning_rate": 3.889588355015267e-07, + "loss": 0.3698, "step": 131215 }, { - "epoch": 4.62, - "learning_rate": 7.731303293431691e-07, - "loss": 0.2392, + "epoch": 4.729159909179371, + "grad_norm": 0.2346785068511963, + "learning_rate": 3.8844625258161737e-07, + "loss": 0.3508, "step": 131220 }, { - "epoch": 4.62, - "learning_rate": 7.72427515609947e-07, - "loss": 0.2595, + "epoch": 4.729340108840596, + "grad_norm": 0.25256285071372986, + "learning_rate": 3.879340049893598e-07, + "loss": 0.373, "step": 131225 }, { - "epoch": 4.62, - "learning_rate": 7.717250164557849e-07, - "loss": 0.2273, + "epoch": 4.7295203085018205, + "grad_norm": 0.22178764641284943, + "learning_rate": 3.874220927317346e-07, + "loss": 0.3534, "step": 131230 }, { - "epoch": 4.62, - "learning_rate": 7.710228318898033e-07, - "loss": 0.254, + "epoch": 4.729700508163044, + "grad_norm": 0.27392688393592834, + "learning_rate": 
3.8691051581571933e-07, + "loss": 0.3442, "step": 131235 }, { - "epoch": 4.62, - "learning_rate": 7.703209619211254e-07, - "loss": 0.2277, + "epoch": 4.729880707824269, + "grad_norm": 0.26804807782173157, + "learning_rate": 3.8639927424828635e-07, + "loss": 0.3818, "step": 131240 }, { - "epoch": 4.62, - "learning_rate": 7.696194065588635e-07, - "loss": 0.2502, + "epoch": 4.730060907485494, + "grad_norm": 0.2786375880241394, + "learning_rate": 3.8588836803639397e-07, + "loss": 0.4149, "step": 131245 }, { - "epoch": 4.62, - "learning_rate": 7.689181658121214e-07, - "loss": 0.2546, + "epoch": 4.7302411071467185, + "grad_norm": 0.2680922746658325, + "learning_rate": 3.853777971870087e-07, + "loss": 0.3972, "step": 131250 }, { - "epoch": 4.62, - "learning_rate": 7.682172396900111e-07, - "loss": 0.2075, + "epoch": 4.730421306807943, + "grad_norm": 0.2476295381784439, + "learning_rate": 3.848675617070835e-07, + "loss": 0.4079, "step": 131255 }, { - "epoch": 4.62, - "learning_rate": 7.67516628201631e-07, - "loss": 0.2491, + "epoch": 4.730601506469168, + "grad_norm": 0.2629091143608093, + "learning_rate": 3.843576616035738e-07, + "loss": 0.3755, "step": 131260 }, { - "epoch": 4.62, - "learning_rate": 7.668163313560795e-07, - "loss": 0.2517, + "epoch": 4.730781706130393, + "grad_norm": 0.26332563161849976, + "learning_rate": 3.8384809688342684e-07, + "loss": 0.3467, "step": 131265 }, { - "epoch": 4.62, - "learning_rate": 7.661163491624434e-07, - "loss": 0.2478, + "epoch": 4.7309619057916175, + "grad_norm": 0.21108414232730865, + "learning_rate": 3.833388675535815e-07, + "loss": 0.3619, "step": 131270 }, { - "epoch": 4.62, - "learning_rate": 7.654166816298159e-07, - "loss": 0.239, + "epoch": 4.731142105452841, + "grad_norm": 0.24823954701423645, + "learning_rate": 3.828299736209795e-07, + "loss": 0.3529, "step": 131275 }, { - "epoch": 4.62, - "learning_rate": 7.647173287672865e-07, - "loss": 0.2577, + "epoch": 4.731322305114066, + "grad_norm": 0.261547714471817, + "learning_rate": 3.8232141509255137e-07, + "loss": 0.3579, "step": 131280 }, { - "epoch": 4.62, - "learning_rate": 7.640182905839261e-07, - "loss": 0.2768, + "epoch": 4.731502504775291, + "grad_norm": 0.22427648305892944, + "learning_rate": 3.818131919752277e-07, + "loss": 0.3685, "step": 131285 }, { - "epoch": 4.62, - "learning_rate": 7.633195670888161e-07, - "loss": 0.2681, + "epoch": 4.7316827044365155, + "grad_norm": 0.27016377449035645, + "learning_rate": 3.8130530427593626e-07, + "loss": 0.3812, "step": 131290 }, { - "epoch": 4.62, - "learning_rate": 7.626211582910326e-07, - "loss": 0.2349, + "epoch": 4.73186290409774, + "grad_norm": 0.2747372090816498, + "learning_rate": 3.8079775200159096e-07, + "loss": 0.3558, "step": 131295 }, { - "epoch": 4.62, - "learning_rate": 7.61923064199635e-07, - "loss": 0.2773, + "epoch": 4.732043103758965, + "grad_norm": 0.2099730223417282, + "learning_rate": 3.802905351591113e-07, + "loss": 0.3624, "step": 131300 }, { - "epoch": 4.62, - "learning_rate": 7.612252848236967e-07, - "loss": 0.2587, + "epoch": 4.73222330342019, + "grad_norm": 0.2557778060436249, + "learning_rate": 3.797836537554056e-07, + "loss": 0.3553, "step": 131305 }, { - "epoch": 4.62, - "learning_rate": 7.605278201722715e-07, - "loss": 0.2307, + "epoch": 4.732403503081414, + "grad_norm": 0.26313820481300354, + "learning_rate": 3.792771077973795e-07, + "loss": 0.3633, "step": 131310 }, { - "epoch": 4.62, - "learning_rate": 7.598306702544217e-07, - "loss": 0.2553, + "epoch": 4.732583702742639, + "grad_norm": 0.25919729471206665, + 
"learning_rate": 3.787708972919385e-07, + "loss": 0.4063, "step": 131315 }, { - "epoch": 4.62, - "learning_rate": 7.591338350791954e-07, - "loss": 0.2531, + "epoch": 4.732763902403864, + "grad_norm": 0.26296132802963257, + "learning_rate": 3.7826502224597716e-07, + "loss": 0.3512, "step": 131320 }, { - "epoch": 4.62, - "learning_rate": 7.584373146556411e-07, - "loss": 0.2391, + "epoch": 4.732944102065088, + "grad_norm": 0.23905587196350098, + "learning_rate": 3.7775948266638717e-07, + "loss": 0.389, "step": 131325 }, { - "epoch": 4.62, - "learning_rate": 7.577411089927989e-07, - "loss": 0.2578, + "epoch": 4.7331243017263125, + "grad_norm": 0.21689580380916595, + "learning_rate": 3.7725427856005747e-07, + "loss": 0.3284, "step": 131330 }, { - "epoch": 4.62, - "learning_rate": 7.570452180997139e-07, - "loss": 0.2569, + "epoch": 4.733304501387537, + "grad_norm": 0.28716617822647095, + "learning_rate": 3.767494099338714e-07, + "loss": 0.3821, "step": 131335 }, { - "epoch": 4.62, - "learning_rate": 7.56349641985421e-07, - "loss": 0.2361, + "epoch": 4.733484701048762, + "grad_norm": 0.21327391266822815, + "learning_rate": 3.762448767947069e-07, + "loss": 0.3663, "step": 131340 }, { - "epoch": 4.62, - "learning_rate": 7.556543806589489e-07, - "loss": 0.2516, + "epoch": 4.733664900709987, + "grad_norm": 0.2822578251361847, + "learning_rate": 3.757406791494389e-07, + "loss": 0.3799, "step": 131345 }, { - "epoch": 4.62, - "learning_rate": 7.549594341293292e-07, - "loss": 0.2621, + "epoch": 4.733845100371211, + "grad_norm": 0.23147007822990417, + "learning_rate": 3.752368170049342e-07, + "loss": 0.3403, "step": 131350 }, { - "epoch": 4.62, - "learning_rate": 7.542648024055826e-07, - "loss": 0.2666, + "epoch": 4.734025300032436, + "grad_norm": 0.2217051088809967, + "learning_rate": 3.747332903680623e-07, + "loss": 0.3961, "step": 131355 }, { - "epoch": 4.62, - "learning_rate": 7.535704854967296e-07, - "loss": 0.2504, + "epoch": 4.734205499693661, + "grad_norm": 0.23157884180545807, + "learning_rate": 3.742300992456843e-07, + "loss": 0.3447, "step": 131360 }, { - "epoch": 4.62, - "learning_rate": 7.528764834117824e-07, - "loss": 0.235, + "epoch": 4.734385699354885, + "grad_norm": 0.3138419985771179, + "learning_rate": 3.737272436446504e-07, + "loss": 0.4007, "step": 131365 }, { - "epoch": 4.62, - "learning_rate": 7.521827961597561e-07, - "loss": 0.2512, + "epoch": 4.7345658990161095, + "grad_norm": 0.2702825963497162, + "learning_rate": 3.7322472357181615e-07, + "loss": 0.3663, "step": 131370 }, { - "epoch": 4.62, - "learning_rate": 7.514894237496573e-07, - "loss": 0.2873, + "epoch": 4.734746098677334, + "grad_norm": 0.22164510190486908, + "learning_rate": 3.7272253903402607e-07, + "loss": 0.328, "step": 131375 }, { - "epoch": 4.62, - "learning_rate": 7.507963661904871e-07, - "loss": 0.2377, + "epoch": 4.734926298338559, + "grad_norm": 0.267595499753952, + "learning_rate": 3.722206900381248e-07, + "loss": 0.368, "step": 131380 }, { - "epoch": 4.62, - "learning_rate": 7.50103623491244e-07, - "loss": 0.2497, + "epoch": 4.735106497999784, + "grad_norm": 0.2832372486591339, + "learning_rate": 3.7171917659094834e-07, + "loss": 0.3878, "step": 131385 }, { - "epoch": 4.62, - "learning_rate": 7.494111956609262e-07, - "loss": 0.2483, + "epoch": 4.735286697661008, + "grad_norm": 0.22526337206363678, + "learning_rate": 3.7121799869933026e-07, + "loss": 0.3577, "step": 131390 }, { - "epoch": 4.62, - "learning_rate": 7.48719082708521e-07, - "loss": 0.2678, + "epoch": 4.735466897322233, + "grad_norm": 
0.3088375926017761, + "learning_rate": 3.707171563700956e-07, + "loss": 0.4084, "step": 131395 }, { - "epoch": 4.62, - "learning_rate": 7.480272846430159e-07, - "loss": 0.2581, + "epoch": 4.735647096983458, + "grad_norm": 0.3013952374458313, + "learning_rate": 3.702166496100723e-07, + "loss": 0.3899, "step": 131400 }, { - "epoch": 4.62, - "learning_rate": 7.473358014733949e-07, - "loss": 0.2626, + "epoch": 4.735827296644683, + "grad_norm": 0.24282796680927277, + "learning_rate": 3.697164784260826e-07, + "loss": 0.3846, "step": 131405 }, { - "epoch": 4.62, - "learning_rate": 7.466446332086347e-07, - "loss": 0.2685, + "epoch": 4.736007496305907, + "grad_norm": 0.21715568006038666, + "learning_rate": 3.6921664282493493e-07, + "loss": 0.345, "step": 131410 }, { - "epoch": 4.62, - "learning_rate": 7.459537798577109e-07, - "loss": 0.2595, + "epoch": 4.736187695967132, + "grad_norm": 0.23071317374706268, + "learning_rate": 3.687171428134406e-07, + "loss": 0.3733, "step": 131415 }, { - "epoch": 4.62, - "learning_rate": 7.452632414295946e-07, - "loss": 0.2587, + "epoch": 4.736367895628356, + "grad_norm": 0.22580982744693756, + "learning_rate": 3.68217978398408e-07, + "loss": 0.3632, "step": 131420 }, { - "epoch": 4.62, - "learning_rate": 7.445730179332478e-07, - "loss": 0.2841, + "epoch": 4.736548095289581, + "grad_norm": 0.2745913863182068, + "learning_rate": 3.6771914958663723e-07, + "loss": 0.3948, "step": 131425 }, { - "epoch": 4.62, - "learning_rate": 7.438831093776383e-07, - "loss": 0.2513, + "epoch": 4.736728294950805, + "grad_norm": 0.22211065888404846, + "learning_rate": 3.6722065638492287e-07, + "loss": 0.3529, "step": 131430 }, { - "epoch": 4.62, - "learning_rate": 7.431935157717201e-07, - "loss": 0.2596, + "epoch": 4.73690849461203, + "grad_norm": 0.2897282838821411, + "learning_rate": 3.667224988000567e-07, + "loss": 0.3725, "step": 131435 }, { - "epoch": 4.62, - "learning_rate": 7.425042371244472e-07, - "loss": 0.2492, + "epoch": 4.737088694273255, + "grad_norm": 0.2873408794403076, + "learning_rate": 3.6622467683882777e-07, + "loss": 0.3692, "step": 131440 }, { - "epoch": 4.62, - "learning_rate": 7.418152734447708e-07, - "loss": 0.2612, + "epoch": 4.73726889393448, + "grad_norm": 0.24957486987113953, + "learning_rate": 3.657271905080195e-07, + "loss": 0.3915, "step": 131445 }, { - "epoch": 4.62, - "learning_rate": 7.411266247416365e-07, - "loss": 0.2343, + "epoch": 4.737449093595704, + "grad_norm": 0.24464282393455505, + "learning_rate": 3.652300398144098e-07, + "loss": 0.3876, "step": 131450 }, { - "epoch": 4.62, - "learning_rate": 7.40438291023987e-07, - "loss": 0.244, + "epoch": 4.737629293256929, + "grad_norm": 0.3088054358959198, + "learning_rate": 3.647332247647711e-07, + "loss": 0.3674, "step": 131455 }, { - "epoch": 4.63, - "learning_rate": 7.397502723007571e-07, - "loss": 0.254, + "epoch": 4.737809492918153, + "grad_norm": 0.26032140851020813, + "learning_rate": 3.6423674536587015e-07, + "loss": 0.387, "step": 131460 }, { - "epoch": 4.63, - "learning_rate": 7.390625685808811e-07, - "loss": 0.2319, + "epoch": 4.737989692579378, + "grad_norm": 0.2570837736129761, + "learning_rate": 3.6374060162447653e-07, + "loss": 0.3837, "step": 131465 }, { - "epoch": 4.63, - "learning_rate": 7.383751798732907e-07, - "loss": 0.2528, + "epoch": 4.738169892240602, + "grad_norm": 0.19201694428920746, + "learning_rate": 3.6324479354734595e-07, + "loss": 0.3706, "step": 131470 }, { - "epoch": 4.63, - "learning_rate": 7.376881061869096e-07, - "loss": 0.2371, + "epoch": 4.738350091901827, + 
"grad_norm": 0.28280356526374817, + "learning_rate": 3.627493211412342e-07, + "loss": 0.4057, "step": 131475 }, { - "epoch": 4.63, - "learning_rate": 7.370013475306581e-07, - "loss": 0.2413, + "epoch": 4.738530291563052, + "grad_norm": 0.26518356800079346, + "learning_rate": 3.6225418441289684e-07, + "loss": 0.4072, "step": 131480 }, { - "epoch": 4.63, - "learning_rate": 7.363149039134542e-07, - "loss": 0.2594, + "epoch": 4.738710491224277, + "grad_norm": 0.261747270822525, + "learning_rate": 3.61759383369073e-07, + "loss": 0.3822, "step": 131485 }, { - "epoch": 4.63, - "learning_rate": 7.356287753442131e-07, - "loss": 0.269, + "epoch": 4.738890690885501, + "grad_norm": 0.2915024757385254, + "learning_rate": 3.612649180165101e-07, + "loss": 0.3657, "step": 131490 }, { - "epoch": 4.63, - "learning_rate": 7.349429618318416e-07, - "loss": 0.2414, + "epoch": 4.739070890546726, + "grad_norm": 0.2875518202781677, + "learning_rate": 3.607707883619388e-07, + "loss": 0.3783, "step": 131495 }, { - "epoch": 4.63, - "learning_rate": 7.342574633852406e-07, - "loss": 0.2318, + "epoch": 4.739251090207951, + "grad_norm": 0.2599601745605469, + "learning_rate": 3.6027699441209815e-07, + "loss": 0.3707, "step": 131500 }, { - "epoch": 4.63, - "eval_loss": 0.24853749573230743, - "eval_runtime": 10.5675, - "eval_samples_per_second": 9.463, - "eval_steps_per_second": 9.463, + "epoch": 4.739251090207951, + "eval_loss": 0.42884549498558044, + "eval_runtime": 3.5372, + "eval_samples_per_second": 28.271, + "eval_steps_per_second": 7.068, "step": 131500 }, { - "epoch": 4.63, - "learning_rate": 7.335722800133171e-07, - "loss": 0.2536, + "epoch": 4.7394312898691755, + "grad_norm": 0.23225200176239014, + "learning_rate": 3.5978353617371065e-07, + "loss": 0.3767, "step": 131505 }, { - "epoch": 4.63, - "learning_rate": 7.328874117249668e-07, - "loss": 0.2504, + "epoch": 4.739611489530399, + "grad_norm": 0.2974475920200348, + "learning_rate": 3.5929041365350413e-07, + "loss": 0.3841, "step": 131510 }, { - "epoch": 4.63, - "learning_rate": 7.322028585290768e-07, - "loss": 0.2521, + "epoch": 4.739791689191624, + "grad_norm": 0.28682082891464233, + "learning_rate": 3.587976268581955e-07, + "loss": 0.3533, "step": 131515 }, { - "epoch": 4.63, - "learning_rate": 7.315186204345403e-07, - "loss": 0.2433, + "epoch": 4.739971888852849, + "grad_norm": 0.2352001816034317, + "learning_rate": 3.5830517579449605e-07, + "loss": 0.3648, "step": 131520 }, { - "epoch": 4.63, - "learning_rate": 7.308346974502444e-07, - "loss": 0.2541, + "epoch": 4.7401520885140735, + "grad_norm": 0.25316321849823, + "learning_rate": 3.5781306046911986e-07, + "loss": 0.3353, "step": 131525 }, { - "epoch": 4.63, - "learning_rate": 7.301510895850655e-07, - "loss": 0.2189, + "epoch": 4.740332288175298, + "grad_norm": 0.26786184310913086, + "learning_rate": 3.573212808887699e-07, + "loss": 0.3874, "step": 131530 }, { - "epoch": 4.63, - "learning_rate": 7.294677968478797e-07, - "loss": 0.2477, + "epoch": 4.740512487836523, + "grad_norm": 0.27805742621421814, + "learning_rate": 3.568298370601436e-07, + "loss": 0.4236, "step": 131535 }, { - "epoch": 4.63, - "learning_rate": 7.287848192475577e-07, - "loss": 0.234, + "epoch": 4.740692687497748, + "grad_norm": 0.3236870765686035, + "learning_rate": 3.5633872898994113e-07, + "loss": 0.4053, "step": 131540 }, { - "epoch": 4.63, - "learning_rate": 7.281021567929703e-07, - "loss": 0.2328, + "epoch": 4.7408728871589725, + "grad_norm": 0.2795065641403198, + "learning_rate": 3.5584795668485435e-07, + "loss": 0.3755, "step": 
131545 }, { - "epoch": 4.63, - "learning_rate": 7.274198094929824e-07, - "loss": 0.2481, + "epoch": 4.741053086820196, + "grad_norm": 0.23501679301261902, + "learning_rate": 3.553575201515669e-07, + "loss": 0.3576, "step": 131550 }, { - "epoch": 4.63, - "learning_rate": 7.267377773564509e-07, - "loss": 0.2676, + "epoch": 4.741233286481421, + "grad_norm": 0.2597077190876007, + "learning_rate": 3.5486741939676215e-07, + "loss": 0.3755, "step": 131555 }, { - "epoch": 4.63, - "learning_rate": 7.2605606039223e-07, - "loss": 0.2516, + "epoch": 4.741413486142646, + "grad_norm": 0.2638082504272461, + "learning_rate": 3.5437765442711545e-07, + "loss": 0.3753, "step": 131560 }, { - "epoch": 4.63, - "learning_rate": 7.253746586091792e-07, - "loss": 0.2411, + "epoch": 4.7415936858038705, + "grad_norm": 0.32323339581489563, + "learning_rate": 3.53888225249302e-07, + "loss": 0.3974, "step": 131565 }, { - "epoch": 4.63, - "learning_rate": 7.246935720161358e-07, - "loss": 0.2535, + "epoch": 4.741773885465095, + "grad_norm": 0.26547950506210327, + "learning_rate": 3.5339913186999416e-07, + "loss": 0.3688, "step": 131570 }, { - "epoch": 4.63, - "learning_rate": 7.24012800621951e-07, - "loss": 0.2354, + "epoch": 4.74195408512632, + "grad_norm": 0.3076135516166687, + "learning_rate": 3.5291037429584784e-07, + "loss": 0.3674, "step": 131575 }, { - "epoch": 4.63, - "learning_rate": 7.233323444354595e-07, - "loss": 0.2391, + "epoch": 4.742134284787545, + "grad_norm": 0.2687076926231384, + "learning_rate": 3.5242195253352426e-07, + "loss": 0.335, "step": 131580 }, { - "epoch": 4.63, - "learning_rate": 7.226522034655015e-07, - "loss": 0.2697, + "epoch": 4.7423144844487695, + "grad_norm": 0.23814557492733002, + "learning_rate": 3.519338665896821e-07, + "loss": 0.3682, "step": 131585 }, { - "epoch": 4.63, - "learning_rate": 7.219723777209058e-07, - "loss": 0.277, + "epoch": 4.742494684109994, + "grad_norm": 0.21820604801177979, + "learning_rate": 3.514461164709687e-07, + "loss": 0.3579, "step": 131590 }, { - "epoch": 4.63, - "learning_rate": 7.212928672104963e-07, - "loss": 0.2566, + "epoch": 4.742674883771219, + "grad_norm": 0.23697353899478912, + "learning_rate": 3.509587021840316e-07, + "loss": 0.337, "step": 131595 }, { - "epoch": 4.63, - "learning_rate": 7.206136719430989e-07, - "loss": 0.2366, + "epoch": 4.742855083432443, + "grad_norm": 0.36326366662979126, + "learning_rate": 3.5047162373550433e-07, + "loss": 0.4014, "step": 131600 }, { - "epoch": 4.63, - "learning_rate": 7.199347919275345e-07, - "loss": 0.2788, + "epoch": 4.7430352830936675, + "grad_norm": 0.2886950373649597, + "learning_rate": 3.499848811320344e-07, + "loss": 0.3733, "step": 131605 }, { - "epoch": 4.63, - "learning_rate": 7.192562271726155e-07, - "loss": 0.2382, + "epoch": 4.743215482754892, + "grad_norm": 0.2526874840259552, + "learning_rate": 3.494984743802443e-07, + "loss": 0.3709, "step": 131610 }, { - "epoch": 4.63, - "learning_rate": 7.185779776871487e-07, - "loss": 0.2471, + "epoch": 4.743395682416117, + "grad_norm": 0.3060406744480133, + "learning_rate": 3.4901240348676756e-07, + "loss": 0.368, "step": 131615 }, { - "epoch": 4.63, - "learning_rate": 7.179000434799493e-07, - "loss": 0.2557, + "epoch": 4.743575882077342, + "grad_norm": 0.23225359618663788, + "learning_rate": 3.485266684582239e-07, + "loss": 0.3697, "step": 131620 }, { - "epoch": 4.63, - "learning_rate": 7.172224245598102e-07, - "loss": 0.2397, + "epoch": 4.743756081738566, + "grad_norm": 0.2793751060962677, + "learning_rate": 3.4804126930123026e-07, + "loss": 0.3779, 
"step": 131625 }, { - "epoch": 4.63, - "learning_rate": 7.165451209355411e-07, - "loss": 0.2317, + "epoch": 4.743936281399791, + "grad_norm": 0.25954195857048035, + "learning_rate": 3.4755620602240357e-07, + "loss": 0.3558, "step": 131630 }, { - "epoch": 4.63, - "learning_rate": 7.158681326159239e-07, - "loss": 0.2725, + "epoch": 4.744116481061016, + "grad_norm": 0.224226713180542, + "learning_rate": 3.4707147862834965e-07, + "loss": 0.3742, "step": 131635 }, { - "epoch": 4.63, - "learning_rate": 7.151914596097597e-07, - "loss": 0.2184, + "epoch": 4.74429668072224, + "grad_norm": 0.22823956608772278, + "learning_rate": 3.4658708712567433e-07, + "loss": 0.3546, "step": 131640 }, { - "epoch": 4.63, - "learning_rate": 7.145151019258306e-07, - "loss": 0.249, + "epoch": 4.7444768803834645, + "grad_norm": 0.27476179599761963, + "learning_rate": 3.461030315209779e-07, + "loss": 0.3922, "step": 131645 }, { - "epoch": 4.63, - "learning_rate": 7.138390595729183e-07, - "loss": 0.2474, + "epoch": 4.744657080044689, + "grad_norm": 0.26549628376960754, + "learning_rate": 3.4561931182085225e-07, + "loss": 0.3772, "step": 131650 }, { - "epoch": 4.63, - "learning_rate": 7.131633325597964e-07, - "loss": 0.2322, + "epoch": 4.744837279705914, + "grad_norm": 0.29317188262939453, + "learning_rate": 3.4513592803188944e-07, + "loss": 0.3897, "step": 131655 }, { - "epoch": 4.63, - "learning_rate": 7.124879208952468e-07, - "loss": 0.2616, + "epoch": 4.745017479367139, + "grad_norm": 0.2582772970199585, + "learning_rate": 3.446528801606785e-07, + "loss": 0.3766, "step": 131660 }, { - "epoch": 4.63, - "learning_rate": 7.118128245880346e-07, - "loss": 0.2338, + "epoch": 4.745197679028363, + "grad_norm": 0.2573517858982086, + "learning_rate": 3.4417016821379765e-07, + "loss": 0.3822, "step": 131665 }, { - "epoch": 4.63, - "learning_rate": 7.111380436469279e-07, - "loss": 0.2484, + "epoch": 4.745377878689588, + "grad_norm": 0.2170310914516449, + "learning_rate": 3.436877921978249e-07, + "loss": 0.3824, "step": 131670 }, { - "epoch": 4.63, - "learning_rate": 7.104635780806834e-07, - "loss": 0.2542, + "epoch": 4.745558078350813, + "grad_norm": 0.2496137022972107, + "learning_rate": 3.432057521193327e-07, + "loss": 0.3849, "step": 131675 }, { - "epoch": 4.63, - "learning_rate": 7.09789427898061e-07, - "loss": 0.2621, + "epoch": 4.745738278012038, + "grad_norm": 0.285085529088974, + "learning_rate": 3.427240479848881e-07, + "loss": 0.3786, "step": 131680 }, { - "epoch": 4.63, - "learning_rate": 7.091155931078175e-07, - "loss": 0.2529, + "epoch": 4.745918477673262, + "grad_norm": 0.2879694104194641, + "learning_rate": 3.4224267980105807e-07, + "loss": 0.3835, "step": 131685 }, { - "epoch": 4.63, - "learning_rate": 7.084420737187014e-07, - "loss": 0.2661, + "epoch": 4.746098677334487, + "grad_norm": 0.26048213243484497, + "learning_rate": 3.417616475743929e-07, + "loss": 0.3415, "step": 131690 }, { - "epoch": 4.63, - "learning_rate": 7.07768869739453e-07, - "loss": 0.2306, + "epoch": 4.746278876995711, + "grad_norm": 0.29208385944366455, + "learning_rate": 3.412809513114512e-07, + "loss": 0.362, "step": 131695 }, { - "epoch": 4.63, - "learning_rate": 7.070959811788181e-07, - "loss": 0.2482, + "epoch": 4.746459076656936, + "grad_norm": 0.23016516864299774, + "learning_rate": 3.4080059101878614e-07, + "loss": 0.3496, "step": 131700 }, { - "epoch": 4.63, - "learning_rate": 7.06423408045534e-07, - "loss": 0.2536, + "epoch": 4.74663927631816, + "grad_norm": 0.22892868518829346, + "learning_rate": 3.403205667029369e-07, + "loss": 
0.37, "step": 131705 }, { - "epoch": 4.63, - "learning_rate": 7.057511503483299e-07, - "loss": 0.2493, + "epoch": 4.746819475979385, + "grad_norm": 0.2265908122062683, + "learning_rate": 3.398408783704454e-07, + "loss": 0.3758, "step": 131710 }, { - "epoch": 4.63, - "learning_rate": 7.050792080959351e-07, - "loss": 0.2561, + "epoch": 4.74699967564061, + "grad_norm": 0.22055870294570923, + "learning_rate": 3.393615260278482e-07, + "loss": 0.4055, "step": 131715 }, { - "epoch": 4.63, - "learning_rate": 7.044075812970785e-07, - "loss": 0.261, + "epoch": 4.747179875301835, + "grad_norm": 0.22395636141300201, + "learning_rate": 3.3888250968167335e-07, + "loss": 0.4186, "step": 131720 }, { - "epoch": 4.63, - "learning_rate": 7.037362699604783e-07, - "loss": 0.2364, + "epoch": 4.747360074963059, + "grad_norm": 0.24537676572799683, + "learning_rate": 3.3840382933845174e-07, + "loss": 0.3678, "step": 131725 }, { - "epoch": 4.63, - "learning_rate": 7.030652740948496e-07, - "loss": 0.2581, + "epoch": 4.747540274624284, + "grad_norm": 0.28544852137565613, + "learning_rate": 3.379254850047031e-07, + "loss": 0.3729, "step": 131730 }, { - "epoch": 4.63, - "learning_rate": 7.023945937089077e-07, - "loss": 0.2381, + "epoch": 4.747720474285508, + "grad_norm": 0.2334669977426529, + "learning_rate": 3.3744747668694456e-07, + "loss": 0.366, "step": 131735 }, { - "epoch": 4.63, - "learning_rate": 7.01724228811354e-07, - "loss": 0.2489, + "epoch": 4.747900673946733, + "grad_norm": 0.26452332735061646, + "learning_rate": 3.369698043916902e-07, + "loss": 0.3512, "step": 131740 }, { - "epoch": 4.64, - "learning_rate": 7.010541794109038e-07, - "loss": 0.2608, + "epoch": 4.748080873607957, + "grad_norm": 0.2582310140132904, + "learning_rate": 3.364924681254461e-07, + "loss": 0.3748, "step": 131745 }, { - "epoch": 4.64, - "learning_rate": 7.003844455162473e-07, - "loss": 0.2626, + "epoch": 4.748261073269182, + "grad_norm": 0.23445448279380798, + "learning_rate": 3.36015467894718e-07, + "loss": 0.3843, "step": 131750 }, { - "epoch": 4.64, - "learning_rate": 6.997150271360886e-07, - "loss": 0.2393, + "epoch": 4.748441272930407, + "grad_norm": 0.25989097356796265, + "learning_rate": 3.355388037060064e-07, + "loss": 0.3922, "step": 131755 }, { - "epoch": 4.64, - "learning_rate": 6.990459242791125e-07, - "loss": 0.2539, + "epoch": 4.748621472591632, + "grad_norm": 0.24597260355949402, + "learning_rate": 3.350624755658005e-07, + "loss": 0.3349, "step": 131760 }, { - "epoch": 4.64, - "learning_rate": 6.983771369540148e-07, - "loss": 0.2579, + "epoch": 4.748801672252856, + "grad_norm": 0.2952145040035248, + "learning_rate": 3.345864834805951e-07, + "loss": 0.3894, "step": 131765 }, { - "epoch": 4.64, - "learning_rate": 6.977086651694692e-07, - "loss": 0.2256, + "epoch": 4.748981871914081, + "grad_norm": 0.25986775755882263, + "learning_rate": 3.341108274568683e-07, + "loss": 0.3596, "step": 131770 }, { - "epoch": 4.64, - "learning_rate": 6.97040508934163e-07, - "loss": 0.2416, + "epoch": 4.749162071575306, + "grad_norm": 0.2798271179199219, + "learning_rate": 3.336355075011122e-07, + "loss": 0.3774, "step": 131775 }, { - "epoch": 4.64, - "learning_rate": 6.963726682567701e-07, - "loss": 0.2481, + "epoch": 4.7493422712365305, + "grad_norm": 0.25875961780548096, + "learning_rate": 3.3316052361979655e-07, + "loss": 0.3871, "step": 131780 }, { - "epoch": 4.64, - "learning_rate": 6.95705143145961e-07, - "loss": 0.2529, + "epoch": 4.749522470897754, + "grad_norm": 0.32598257064819336, + "learning_rate": 3.326858758193885e-07, + 
"loss": 0.394, "step": 131785 }, { - "epoch": 4.64, - "learning_rate": 6.95037933610404e-07, - "loss": 0.2361, + "epoch": 4.749702670558979, + "grad_norm": 0.2517693042755127, + "learning_rate": 3.3221156410636335e-07, + "loss": 0.3926, "step": 131790 }, { - "epoch": 4.64, - "learning_rate": 6.943710396587589e-07, - "loss": 0.2571, + "epoch": 4.749882870220204, + "grad_norm": 0.21797238290309906, + "learning_rate": 3.3173758848717705e-07, + "loss": 0.3232, "step": 131795 }, { - "epoch": 4.64, - "learning_rate": 6.937044612996906e-07, - "loss": 0.2358, + "epoch": 4.750063069881429, + "grad_norm": 0.1989823281764984, + "learning_rate": 3.312639489682939e-07, + "loss": 0.3844, "step": 131800 }, { - "epoch": 4.64, - "learning_rate": 6.93038198541851e-07, - "loss": 0.2303, + "epoch": 4.750243269542653, + "grad_norm": 0.22494913637638092, + "learning_rate": 3.307906455561616e-07, + "loss": 0.3855, "step": 131805 }, { - "epoch": 4.64, - "learning_rate": 6.923722513938912e-07, - "loss": 0.2525, + "epoch": 4.750423469203878, + "grad_norm": 0.22416415810585022, + "learning_rate": 3.303176782572276e-07, + "loss": 0.3918, "step": 131810 }, { - "epoch": 4.64, - "learning_rate": 6.917066198644601e-07, - "loss": 0.2429, + "epoch": 4.750603668865103, + "grad_norm": 0.2782622277736664, + "learning_rate": 3.2984504707794237e-07, + "loss": 0.3599, "step": 131815 }, { - "epoch": 4.64, - "learning_rate": 6.910413039621977e-07, - "loss": 0.2485, + "epoch": 4.7507838685263275, + "grad_norm": 0.23325783014297485, + "learning_rate": 3.2937275202473974e-07, + "loss": 0.3743, "step": 131820 }, { - "epoch": 4.64, - "learning_rate": 6.903763036957445e-07, - "loss": 0.2254, + "epoch": 4.750964068187551, + "grad_norm": 0.2581305503845215, + "learning_rate": 3.2890079310405885e-07, + "loss": 0.3498, "step": 131825 }, { - "epoch": 4.64, - "learning_rate": 6.897116190737296e-07, - "loss": 0.2648, + "epoch": 4.751144267848776, + "grad_norm": 0.30278897285461426, + "learning_rate": 3.2842917032233076e-07, + "loss": 0.3916, "step": 131830 }, { - "epoch": 4.64, - "learning_rate": 6.890472501047934e-07, - "loss": 0.2625, + "epoch": 4.751324467510001, + "grad_norm": 0.26210495829582214, + "learning_rate": 3.279578836859754e-07, + "loss": 0.3663, "step": 131835 }, { - "epoch": 4.64, - "learning_rate": 6.88383196797554e-07, - "loss": 0.2564, + "epoch": 4.7515046671712255, + "grad_norm": 0.3384738862514496, + "learning_rate": 3.274869332014152e-07, + "loss": 0.379, "step": 131840 }, { - "epoch": 4.64, - "learning_rate": 6.877194591606406e-07, - "loss": 0.2375, + "epoch": 4.75168486683245, + "grad_norm": 0.3079512119293213, + "learning_rate": 3.27016318875073e-07, + "loss": 0.4015, "step": 131845 }, { - "epoch": 4.64, - "learning_rate": 6.870560372026629e-07, - "loss": 0.2494, + "epoch": 4.751865066493675, + "grad_norm": 0.2897801995277405, + "learning_rate": 3.265460407133547e-07, + "loss": 0.382, "step": 131850 }, { - "epoch": 4.64, - "learning_rate": 6.86392930932242e-07, - "loss": 0.233, + "epoch": 4.7520452661549, + "grad_norm": 0.2497911900281906, + "learning_rate": 3.260760987226691e-07, + "loss": 0.3617, "step": 131855 }, { - "epoch": 4.64, - "learning_rate": 6.857301403579875e-07, - "loss": 0.2484, + "epoch": 4.7522254658161245, + "grad_norm": 0.23933720588684082, + "learning_rate": 3.2560649290942216e-07, + "loss": 0.3815, "step": 131860 }, { - "epoch": 4.64, - "learning_rate": 6.850676654885008e-07, - "loss": 0.2281, + "epoch": 4.752405665477349, + "grad_norm": 0.24069957435131073, + "learning_rate": 
3.2513722328000595e-07, + "loss": 0.363, "step": 131865 }, { - "epoch": 4.64, - "learning_rate": 6.84405506332389e-07, - "loss": 0.2505, + "epoch": 4.752585865138574, + "grad_norm": 0.3069649636745453, + "learning_rate": 3.2466828984082096e-07, + "loss": 0.3922, "step": 131870 }, { - "epoch": 4.64, - "learning_rate": 6.837436628982452e-07, - "loss": 0.2339, + "epoch": 4.752766064799798, + "grad_norm": 0.21737022697925568, + "learning_rate": 3.24199692598251e-07, + "loss": 0.3574, "step": 131875 }, { - "epoch": 4.64, - "learning_rate": 6.830821351946654e-07, - "loss": 0.2606, + "epoch": 4.7529462644610225, + "grad_norm": 0.22841662168502808, + "learning_rate": 3.2373143155868255e-07, + "loss": 0.344, "step": 131880 }, { - "epoch": 4.64, - "learning_rate": 6.82420923230237e-07, - "loss": 0.2293, + "epoch": 4.753126464122247, + "grad_norm": 0.25134187936782837, + "learning_rate": 3.232635067284995e-07, + "loss": 0.3917, "step": 131885 }, { - "epoch": 4.64, - "learning_rate": 6.817600270135504e-07, - "loss": 0.2381, + "epoch": 4.753306663783472, + "grad_norm": 0.21516527235507965, + "learning_rate": 3.2279591811406884e-07, + "loss": 0.3816, "step": 131890 }, { - "epoch": 4.64, - "learning_rate": 6.810994465531822e-07, - "loss": 0.2254, + "epoch": 4.753486863444697, + "grad_norm": 0.20590807497501373, + "learning_rate": 3.223286657217717e-07, + "loss": 0.3616, "step": 131895 }, { - "epoch": 4.64, - "learning_rate": 6.804391818577089e-07, - "loss": 0.2608, + "epoch": 4.7536670631059215, + "grad_norm": 0.2603895962238312, + "learning_rate": 3.21861749557964e-07, + "loss": 0.3731, "step": 131900 }, { - "epoch": 4.64, - "learning_rate": 6.797792329357039e-07, - "loss": 0.2643, + "epoch": 4.753847262767146, + "grad_norm": 0.2670796513557434, + "learning_rate": 3.2139516962901297e-07, + "loss": 0.3868, "step": 131905 }, { - "epoch": 4.64, - "learning_rate": 6.791195997957439e-07, - "loss": 0.2592, + "epoch": 4.754027462428371, + "grad_norm": 0.2581007480621338, + "learning_rate": 3.2092892594127454e-07, + "loss": 0.382, "step": 131910 }, { - "epoch": 4.64, - "learning_rate": 6.784602824463859e-07, - "loss": 0.2448, + "epoch": 4.754207662089595, + "grad_norm": 0.29957446455955505, + "learning_rate": 3.2046301850110484e-07, + "loss": 0.3801, "step": 131915 }, { - "epoch": 4.64, - "learning_rate": 6.778012808961925e-07, - "loss": 0.246, + "epoch": 4.7543878617508195, + "grad_norm": 0.21832120418548584, + "learning_rate": 3.199974473148459e-07, + "loss": 0.3558, "step": 131920 }, { - "epoch": 4.64, - "learning_rate": 6.77142595153718e-07, - "loss": 0.236, + "epoch": 4.754568061412044, + "grad_norm": 0.3172387182712555, + "learning_rate": 3.195322123888428e-07, + "loss": 0.3774, "step": 131925 }, { - "epoch": 4.64, - "learning_rate": 6.764842252275195e-07, - "loss": 0.249, + "epoch": 4.754748261073269, + "grad_norm": 0.2534702718257904, + "learning_rate": 3.1906731372943476e-07, + "loss": 0.386, "step": 131930 }, { - "epoch": 4.64, - "learning_rate": 6.758261711261455e-07, - "loss": 0.2457, + "epoch": 4.754928460734494, + "grad_norm": 0.21878264844417572, + "learning_rate": 3.186027513429585e-07, + "loss": 0.3593, "step": 131935 }, { - "epoch": 4.64, - "learning_rate": 6.751684328581364e-07, - "loss": 0.2405, + "epoch": 4.7551086603957184, + "grad_norm": 0.2851618528366089, + "learning_rate": 3.181385252357394e-07, + "loss": 0.4047, "step": 131940 }, { - "epoch": 4.64, - "learning_rate": 6.745110104320329e-07, - "loss": 0.2463, + "epoch": 4.755288860056943, + "grad_norm": 0.24602822959423065, + 
"learning_rate": 3.176746354141058e-07, + "loss": 0.3892, "step": 131945 }, { - "epoch": 4.64, - "learning_rate": 6.738539038563752e-07, - "loss": 0.2291, + "epoch": 4.755469059718168, + "grad_norm": 0.25003623962402344, + "learning_rate": 3.172110818843749e-07, + "loss": 0.3674, "step": 131950 }, { - "epoch": 4.64, - "learning_rate": 6.731971131396897e-07, - "loss": 0.2499, + "epoch": 4.755649259379393, + "grad_norm": 0.282927930355072, + "learning_rate": 3.167478646528638e-07, + "loss": 0.3198, "step": 131955 }, { - "epoch": 4.64, - "learning_rate": 6.725406382905086e-07, - "loss": 0.2429, + "epoch": 4.755829459040617, + "grad_norm": 0.23673711717128754, + "learning_rate": 3.162849837258869e-07, + "loss": 0.3578, "step": 131960 }, { - "epoch": 4.64, - "learning_rate": 6.718844793173585e-07, - "loss": 0.2376, + "epoch": 4.756009658701842, + "grad_norm": 0.2531015872955322, + "learning_rate": 3.158224391097503e-07, + "loss": 0.3597, "step": 131965 }, { - "epoch": 4.64, - "learning_rate": 6.712286362287545e-07, - "loss": 0.2402, + "epoch": 4.756189858363066, + "grad_norm": 0.25352203845977783, + "learning_rate": 3.1536023081074894e-07, + "loss": 0.4158, "step": 131970 }, { - "epoch": 4.64, - "learning_rate": 6.705731090332123e-07, - "loss": 0.2352, + "epoch": 4.756370058024291, + "grad_norm": 0.278499037027359, + "learning_rate": 3.1489835883518615e-07, + "loss": 0.3489, "step": 131975 }, { - "epoch": 4.64, - "learning_rate": 6.699178977392446e-07, - "loss": 0.2627, + "epoch": 4.756550257685515, + "grad_norm": 0.27848923206329346, + "learning_rate": 3.144368231893541e-07, + "loss": 0.4042, "step": 131980 }, { - "epoch": 4.64, - "learning_rate": 6.692630023553581e-07, - "loss": 0.2318, + "epoch": 4.75673045734674, + "grad_norm": 0.25936684012413025, + "learning_rate": 3.1397562387954214e-07, + "loss": 0.3477, "step": 131985 }, { - "epoch": 4.64, - "learning_rate": 6.686084228900602e-07, - "loss": 0.2494, + "epoch": 4.756910657007965, + "grad_norm": 0.21595986187458038, + "learning_rate": 3.1351476091203427e-07, + "loss": 0.3695, "step": 131990 }, { - "epoch": 4.64, - "learning_rate": 6.679541593518468e-07, - "loss": 0.2371, + "epoch": 4.75709085666919, + "grad_norm": 0.2597789466381073, + "learning_rate": 3.1305423429310587e-07, + "loss": 0.3714, "step": 131995 }, { - "epoch": 4.64, - "learning_rate": 6.673002117492083e-07, - "loss": 0.262, + "epoch": 4.757271056330414, + "grad_norm": 0.22275707125663757, + "learning_rate": 3.1259404402903536e-07, + "loss": 0.3491, "step": 132000 }, { - "epoch": 4.64, - "eval_loss": 0.24853284657001495, - "eval_runtime": 10.5548, - "eval_samples_per_second": 9.474, - "eval_steps_per_second": 9.474, + "epoch": 4.757271056330414, + "eval_loss": 0.4288484454154968, + "eval_runtime": 3.5292, + "eval_samples_per_second": 28.335, + "eval_steps_per_second": 7.084, "step": 132000 }, { - "epoch": 4.64, - "learning_rate": 6.666465800906435e-07, - "loss": 0.2598, + "epoch": 4.757451255991639, + "grad_norm": 0.25196489691734314, + "learning_rate": 3.121341901260899e-07, + "loss": 0.3829, "step": 132005 }, { - "epoch": 4.64, - "learning_rate": 6.659932643846345e-07, - "loss": 0.2609, + "epoch": 4.757631455652863, + "grad_norm": 0.21652761101722717, + "learning_rate": 3.1167467259053675e-07, + "loss": 0.3318, "step": 132010 }, { - "epoch": 4.64, - "learning_rate": 6.653402646396717e-07, - "loss": 0.2532, + "epoch": 4.757811655314088, + "grad_norm": 0.2428998500108719, + "learning_rate": 3.112154914286375e-07, + "loss": 0.357, "step": 132015 }, { - "epoch": 4.64, - 
"learning_rate": 6.646875808642233e-07, - "loss": 0.2528, + "epoch": 4.757991854975312, + "grad_norm": 0.2448771446943283, + "learning_rate": 3.1075664664664827e-07, + "loss": 0.3707, "step": 132020 }, { - "epoch": 4.65, - "learning_rate": 6.640352130667716e-07, - "loss": 0.26, + "epoch": 4.758172054636537, + "grad_norm": 0.2827889621257782, + "learning_rate": 3.1029813825081687e-07, + "loss": 0.393, "step": 132025 }, { - "epoch": 4.65, - "learning_rate": 6.633831612557844e-07, - "loss": 0.2451, + "epoch": 4.758352254297762, + "grad_norm": 0.21509301662445068, + "learning_rate": 3.098399662473939e-07, + "loss": 0.3583, "step": 132030 }, { - "epoch": 4.65, - "learning_rate": 6.627314254397276e-07, - "loss": 0.2449, + "epoch": 4.758532453958987, + "grad_norm": 0.23091164231300354, + "learning_rate": 3.0938213064262143e-07, + "loss": 0.3593, "step": 132035 }, { - "epoch": 4.65, - "learning_rate": 6.620800056270638e-07, - "loss": 0.2424, + "epoch": 4.758712653620211, + "grad_norm": 0.3308013677597046, + "learning_rate": 3.089246314427391e-07, + "loss": 0.383, "step": 132040 }, { - "epoch": 4.65, - "learning_rate": 6.614289018262526e-07, - "loss": 0.2506, + "epoch": 4.758892853281436, + "grad_norm": 0.2075890302658081, + "learning_rate": 3.084674686539779e-07, + "loss": 0.3666, "step": 132045 }, { - "epoch": 4.65, - "learning_rate": 6.607781140457487e-07, - "loss": 0.2562, + "epoch": 4.759073052942661, + "grad_norm": 0.2468065321445465, + "learning_rate": 3.080106422825635e-07, + "loss": 0.3804, "step": 132050 }, { - "epoch": 4.65, - "learning_rate": 6.601276422940006e-07, - "loss": 0.2633, + "epoch": 4.7592532526038855, + "grad_norm": 0.24998079240322113, + "learning_rate": 3.0755415233472693e-07, + "loss": 0.4156, "step": 132055 }, { - "epoch": 4.65, - "learning_rate": 6.594774865794517e-07, - "loss": 0.2432, + "epoch": 4.759433452265109, + "grad_norm": 0.23068219423294067, + "learning_rate": 3.0709799881668554e-07, + "loss": 0.3924, "step": 132060 }, { - "epoch": 4.65, - "learning_rate": 6.588276469105481e-07, - "loss": 0.2413, + "epoch": 4.759613651926334, + "grad_norm": 0.2239476889371872, + "learning_rate": 3.066421817346482e-07, + "loss": 0.3956, "step": 132065 }, { - "epoch": 4.65, - "learning_rate": 6.581781232957273e-07, - "loss": 0.2464, + "epoch": 4.759793851587559, + "grad_norm": 0.2508222162723541, + "learning_rate": 3.0618670109483494e-07, + "loss": 0.3624, "step": 132070 }, { - "epoch": 4.65, - "learning_rate": 6.575289157434189e-07, - "loss": 0.2366, + "epoch": 4.759974051248784, + "grad_norm": 0.21215388178825378, + "learning_rate": 3.0573155690344355e-07, + "loss": 0.3786, "step": 132075 }, { - "epoch": 4.65, - "learning_rate": 6.568800242620577e-07, - "loss": 0.2498, + "epoch": 4.760154250910008, + "grad_norm": 0.23385365307331085, + "learning_rate": 3.05276749166683e-07, + "loss": 0.3711, "step": 132080 }, { - "epoch": 4.65, - "learning_rate": 6.562314488600674e-07, - "loss": 0.2505, + "epoch": 4.760334450571233, + "grad_norm": 0.26072898507118225, + "learning_rate": 3.048222778907428e-07, + "loss": 0.3787, "step": 132085 }, { - "epoch": 4.65, - "learning_rate": 6.555831895458691e-07, - "loss": 0.256, + "epoch": 4.760514650232458, + "grad_norm": 0.27673956751823425, + "learning_rate": 3.0436814308181794e-07, + "loss": 0.3841, "step": 132090 }, { - "epoch": 4.65, - "learning_rate": 6.549352463278757e-07, - "loss": 0.2644, + "epoch": 4.7606948498936825, + "grad_norm": 0.2762385308742523, + "learning_rate": 3.0391434474609516e-07, + "loss": 0.3801, "step": 132095 }, { - 
"epoch": 4.65, - "learning_rate": 6.542876192145053e-07, - "loss": 0.2679, + "epoch": 4.760875049554906, + "grad_norm": 0.27955085039138794, + "learning_rate": 3.0346088288976117e-07, + "loss": 0.3601, "step": 132100 }, { - "epoch": 4.65, - "learning_rate": 6.536403082141679e-07, - "loss": 0.2297, + "epoch": 4.761055249216131, + "grad_norm": 0.2552419900894165, + "learning_rate": 3.030077575189888e-07, + "loss": 0.363, "step": 132105 }, { - "epoch": 4.65, - "learning_rate": 6.529933133352678e-07, - "loss": 0.256, + "epoch": 4.761235448877356, + "grad_norm": 0.30773216485977173, + "learning_rate": 3.0255496863995646e-07, + "loss": 0.3942, "step": 132110 }, { - "epoch": 4.65, - "learning_rate": 6.523466345861984e-07, - "loss": 0.2407, + "epoch": 4.761415648538581, + "grad_norm": 0.22618763148784637, + "learning_rate": 3.021025162588259e-07, + "loss": 0.391, "step": 132115 }, { - "epoch": 4.65, - "learning_rate": 6.51700271975364e-07, - "loss": 0.2212, + "epoch": 4.761595848199805, + "grad_norm": 0.2810955047607422, + "learning_rate": 3.016504003817727e-07, + "loss": 0.3841, "step": 132120 }, { - "epoch": 4.65, - "learning_rate": 6.510542255111579e-07, - "loss": 0.2716, + "epoch": 4.76177604786103, + "grad_norm": 0.25989750027656555, + "learning_rate": 3.0119862101494754e-07, + "loss": 0.3548, "step": 132125 }, { - "epoch": 4.65, - "learning_rate": 6.504084952019651e-07, - "loss": 0.2682, + "epoch": 4.761956247522255, + "grad_norm": 0.2658855617046356, + "learning_rate": 3.0074717816451214e-07, + "loss": 0.3629, "step": 132130 }, { - "epoch": 4.65, - "learning_rate": 6.497630810561706e-07, - "loss": 0.2413, + "epoch": 4.7621364471834795, + "grad_norm": 0.27331972122192383, + "learning_rate": 3.002960718366116e-07, + "loss": 0.3689, "step": 132135 }, { - "epoch": 4.65, - "learning_rate": 6.491179830821537e-07, - "loss": 0.2548, + "epoch": 4.762316646844704, + "grad_norm": 0.22226150333881378, + "learning_rate": 2.998453020373965e-07, + "loss": 0.4006, "step": 132140 }, { - "epoch": 4.65, - "learning_rate": 6.484732012882966e-07, - "loss": 0.2452, + "epoch": 4.762496846505929, + "grad_norm": 0.24564702808856964, + "learning_rate": 2.9939486877300636e-07, + "loss": 0.3452, "step": 132145 }, { - "epoch": 4.65, - "learning_rate": 6.478287356829649e-07, - "loss": 0.2513, + "epoch": 4.762677046167154, + "grad_norm": 0.3113086521625519, + "learning_rate": 2.989447720495808e-07, + "loss": 0.3813, "step": 132150 }, { - "epoch": 4.65, - "learning_rate": 6.47184586274524e-07, - "loss": 0.2597, + "epoch": 4.7628572458283776, + "grad_norm": 0.24232225120067596, + "learning_rate": 2.9849501187324823e-07, + "loss": 0.3843, "step": 132155 }, { - "epoch": 4.65, - "learning_rate": 6.465407530713452e-07, - "loss": 0.2343, + "epoch": 4.763037445489602, + "grad_norm": 0.21868427097797394, + "learning_rate": 2.980455882501398e-07, + "loss": 0.3898, "step": 132160 }, { - "epoch": 4.65, - "learning_rate": 6.458972360817883e-07, - "loss": 0.2566, + "epoch": 4.763217645150827, + "grad_norm": 0.2546680271625519, + "learning_rate": 2.975965011863785e-07, + "loss": 0.318, "step": 132165 }, { - "epoch": 4.65, - "learning_rate": 6.452540353141995e-07, - "loss": 0.249, + "epoch": 4.763397844812052, + "grad_norm": 0.2764042615890503, + "learning_rate": 2.971477506880815e-07, + "loss": 0.3723, "step": 132170 }, { - "epoch": 4.65, - "learning_rate": 6.446111507769414e-07, - "loss": 0.2572, + "epoch": 4.7635780444732765, + "grad_norm": 0.23360565304756165, + "learning_rate": 2.9669933676136354e-07, + "loss": 0.3803, "step": 
132175 }, { - "epoch": 4.65, - "learning_rate": 6.439685824783548e-07, - "loss": 0.2377, + "epoch": 4.763758244134501, + "grad_norm": 0.258620023727417, + "learning_rate": 2.9625125941233345e-07, + "loss": 0.4101, "step": 132180 }, { - "epoch": 4.65, - "learning_rate": 6.433263304267855e-07, - "loss": 0.2539, + "epoch": 4.763938443795726, + "grad_norm": 0.2147550880908966, + "learning_rate": 2.9580351864710034e-07, + "loss": 0.362, "step": 132185 }, { - "epoch": 4.65, - "learning_rate": 6.426843946305716e-07, - "loss": 0.2579, + "epoch": 4.764118643456951, + "grad_norm": 0.2418157309293747, + "learning_rate": 2.9535611447175924e-07, + "loss": 0.358, "step": 132190 }, { - "epoch": 4.65, - "learning_rate": 6.420427750980479e-07, - "loss": 0.245, + "epoch": 4.7642988431181745, + "grad_norm": 0.26910528540611267, + "learning_rate": 2.9490904689240815e-07, + "loss": 0.4377, "step": 132195 }, { - "epoch": 4.65, - "learning_rate": 6.414014718375494e-07, - "loss": 0.2459, + "epoch": 4.764479042779399, + "grad_norm": 0.26641741394996643, + "learning_rate": 2.944623159151394e-07, + "loss": 0.3293, "step": 132200 }, { - "epoch": 4.65, - "learning_rate": 6.407604848574001e-07, - "loss": 0.2422, + "epoch": 4.764659242440624, + "grad_norm": 0.2724882960319519, + "learning_rate": 2.9401592154603694e-07, + "loss": 0.3949, "step": 132205 }, { - "epoch": 4.65, - "learning_rate": 6.401198141659182e-07, - "loss": 0.2429, + "epoch": 4.764839442101849, + "grad_norm": 0.245937317609787, + "learning_rate": 2.935698637911849e-07, + "loss": 0.3721, "step": 132210 }, { - "epoch": 4.65, - "learning_rate": 6.394794597714304e-07, - "loss": 0.2692, + "epoch": 4.7650196417630735, + "grad_norm": 0.25321248173713684, + "learning_rate": 2.9312414265665886e-07, + "loss": 0.3822, "step": 132215 }, { - "epoch": 4.65, - "learning_rate": 6.388394216822469e-07, - "loss": 0.2535, + "epoch": 4.765199841424298, + "grad_norm": 0.27613797783851624, + "learning_rate": 2.9267875814853463e-07, + "loss": 0.3634, "step": 132220 }, { - "epoch": 4.65, - "learning_rate": 6.381996999066775e-07, - "loss": 0.2592, + "epoch": 4.765380041085523, + "grad_norm": 0.2703777849674225, + "learning_rate": 2.922337102728795e-07, + "loss": 0.3705, "step": 132225 }, { - "epoch": 4.65, - "learning_rate": 6.375602944530295e-07, - "loss": 0.247, + "epoch": 4.765560240746748, + "grad_norm": 0.23466283082962036, + "learning_rate": 2.917889990357553e-07, + "loss": 0.3601, "step": 132230 }, { - "epoch": 4.65, - "learning_rate": 6.369212053296076e-07, - "loss": 0.2495, + "epoch": 4.765740440407972, + "grad_norm": 0.26712194085121155, + "learning_rate": 2.9134462444322106e-07, + "loss": 0.3573, "step": 132235 }, { - "epoch": 4.65, - "learning_rate": 6.362824325447076e-07, - "loss": 0.2421, + "epoch": 4.765920640069197, + "grad_norm": 0.23665735125541687, + "learning_rate": 2.909005865013331e-07, + "loss": 0.3711, "step": 132240 }, { - "epoch": 4.65, - "learning_rate": 6.35643976106623e-07, - "loss": 0.2519, + "epoch": 4.766100839730421, + "grad_norm": 0.26551133394241333, + "learning_rate": 2.904568852161449e-07, + "loss": 0.3894, "step": 132245 }, { - "epoch": 4.65, - "learning_rate": 6.350058360236443e-07, - "loss": 0.2391, + "epoch": 4.766281039391646, + "grad_norm": 0.21451817452907562, + "learning_rate": 2.900135205936932e-07, + "loss": 0.3891, "step": 132250 }, { - "epoch": 4.65, - "learning_rate": 6.343680123040597e-07, - "loss": 0.2558, + "epoch": 4.7664612390528704, + "grad_norm": 0.256231427192688, + "learning_rate": 2.8957049264002334e-07, + "loss": 
0.3673, "step": 132255 }, { - "epoch": 4.65, - "learning_rate": 6.337305049561481e-07, - "loss": 0.2499, + "epoch": 4.766641438714095, + "grad_norm": 0.2589702904224396, + "learning_rate": 2.89127801361172e-07, + "loss": 0.4113, "step": 132260 }, { - "epoch": 4.65, - "learning_rate": 6.330933139881895e-07, - "loss": 0.2328, + "epoch": 4.76682163837532, + "grad_norm": 0.25339174270629883, + "learning_rate": 2.886854467631733e-07, + "loss": 0.3935, "step": 132265 }, { - "epoch": 4.65, - "learning_rate": 6.32456439408452e-07, - "loss": 0.2415, + "epoch": 4.767001838036545, + "grad_norm": 0.23474030196666718, + "learning_rate": 2.882434288520475e-07, + "loss": 0.3563, "step": 132270 }, { - "epoch": 4.65, - "learning_rate": 6.318198812252124e-07, - "loss": 0.244, + "epoch": 4.767182037697769, + "grad_norm": 0.2822888195514679, + "learning_rate": 2.8780174763382296e-07, + "loss": 0.3655, "step": 132275 }, { - "epoch": 4.65, - "learning_rate": 6.31183639446728e-07, - "loss": 0.2548, + "epoch": 4.767362237358994, + "grad_norm": 0.32862353324890137, + "learning_rate": 2.8736040311451163e-07, + "loss": 0.3644, "step": 132280 }, { - "epoch": 4.65, - "learning_rate": 6.305477140812699e-07, - "loss": 0.2634, + "epoch": 4.767542437020218, + "grad_norm": 0.2945389151573181, + "learning_rate": 2.869193953001337e-07, + "loss": 0.3493, "step": 132285 }, { - "epoch": 4.65, - "learning_rate": 6.299121051370843e-07, - "loss": 0.2385, + "epoch": 4.767722636681443, + "grad_norm": 0.20633207261562347, + "learning_rate": 2.8647872419669544e-07, + "loss": 0.3379, "step": 132290 }, { - "epoch": 4.65, - "learning_rate": 6.292768126224341e-07, - "loss": 0.2506, + "epoch": 4.767902836342667, + "grad_norm": 0.25659510493278503, + "learning_rate": 2.8603838981019757e-07, + "loss": 0.3413, "step": 132295 }, { - "epoch": 4.65, - "learning_rate": 6.286418365455626e-07, - "loss": 0.2482, + "epoch": 4.768083036003892, + "grad_norm": 0.24695339798927307, + "learning_rate": 2.8559839214664094e-07, + "loss": 0.3909, "step": 132300 }, { - "epoch": 4.65, - "learning_rate": 6.280071769147134e-07, - "loss": 0.2356, + "epoch": 4.768263235665117, + "grad_norm": 0.2526165544986725, + "learning_rate": 2.8515873121202076e-07, + "loss": 0.3805, "step": 132305 }, { - "epoch": 4.66, - "learning_rate": 6.273728337381296e-07, - "loss": 0.2472, + "epoch": 4.768443435326342, + "grad_norm": 0.31589943170547485, + "learning_rate": 2.8471940701233216e-07, + "loss": 0.4108, "step": 132310 }, { - "epoch": 4.66, - "learning_rate": 6.267388070240493e-07, - "loss": 0.2518, + "epoch": 4.768623634987566, + "grad_norm": 0.24592305719852448, + "learning_rate": 2.8428041955355365e-07, + "loss": 0.3881, "step": 132315 }, { - "epoch": 4.66, - "learning_rate": 6.261050967807047e-07, - "loss": 0.2369, + "epoch": 4.768803834648791, + "grad_norm": 0.30194249749183655, + "learning_rate": 2.8384176884166947e-07, + "loss": 0.3638, "step": 132320 }, { - "epoch": 4.66, - "learning_rate": 6.254717030163171e-07, - "loss": 0.2795, + "epoch": 4.768984034310016, + "grad_norm": 0.26434436440467834, + "learning_rate": 2.834034548826553e-07, + "loss": 0.3949, "step": 132325 }, { - "epoch": 4.66, - "learning_rate": 6.248386257391187e-07, - "loss": 0.2486, + "epoch": 4.769164233971241, + "grad_norm": 0.24037402868270874, + "learning_rate": 2.8296547768248415e-07, + "loss": 0.3672, "step": 132330 }, { - "epoch": 4.66, - "learning_rate": 6.242058649573279e-07, - "loss": 0.247, + "epoch": 4.769344433632464, + "grad_norm": 0.2738417088985443, + "learning_rate": 
2.825278372471263e-07, + "loss": 0.3967, "step": 132335 }, { - "epoch": 4.66, - "learning_rate": 6.235734206791577e-07, - "loss": 0.264, + "epoch": 4.769524633293689, + "grad_norm": 0.2520720958709717, + "learning_rate": 2.82090533582538e-07, + "loss": 0.3668, "step": 132340 }, { - "epoch": 4.66, - "learning_rate": 6.229412929128209e-07, - "loss": 0.2515, + "epoch": 4.769704832954914, + "grad_norm": 0.2859429717063904, + "learning_rate": 2.8165356669468126e-07, + "loss": 0.3258, "step": 132345 }, { - "epoch": 4.66, - "learning_rate": 6.223094816665304e-07, - "loss": 0.2593, + "epoch": 4.769885032616139, + "grad_norm": 0.2699842154979706, + "learning_rate": 2.8121693658950955e-07, + "loss": 0.352, "step": 132350 }, { - "epoch": 4.66, - "learning_rate": 6.216779869484823e-07, - "loss": 0.2431, + "epoch": 4.770065232277363, + "grad_norm": 0.29695814847946167, + "learning_rate": 2.807806432729682e-07, + "loss": 0.3885, "step": 132355 }, { - "epoch": 4.66, - "learning_rate": 6.210468087668842e-07, - "loss": 0.2436, + "epoch": 4.770245431938588, + "grad_norm": 0.2401593029499054, + "learning_rate": 2.803446867510079e-07, + "loss": 0.3437, "step": 132360 }, { - "epoch": 4.66, - "learning_rate": 6.204159471299209e-07, - "loss": 0.2392, + "epoch": 4.770425631599813, + "grad_norm": 0.26505574584007263, + "learning_rate": 2.7990906702956566e-07, + "loss": 0.3474, "step": 132365 }, { - "epoch": 4.66, - "learning_rate": 6.197854020457944e-07, - "loss": 0.2254, + "epoch": 4.7706058312610375, + "grad_norm": 0.7576366662979126, + "learning_rate": 2.7947378411457557e-07, + "loss": 0.3716, "step": 132370 }, { - "epoch": 4.66, - "learning_rate": 6.191551735226869e-07, - "loss": 0.2355, + "epoch": 4.770786030922261, + "grad_norm": 0.3026728332042694, + "learning_rate": 2.7903883801196906e-07, + "loss": 0.342, "step": 132375 }, { - "epoch": 4.66, - "learning_rate": 6.185252615687809e-07, - "loss": 0.2576, + "epoch": 4.770966230583486, + "grad_norm": 0.3232538104057312, + "learning_rate": 2.786042287276719e-07, + "loss": 0.3823, "step": 132380 }, { - "epoch": 4.66, - "learning_rate": 6.178956661922531e-07, - "loss": 0.2521, + "epoch": 4.771146430244711, + "grad_norm": 0.24834728240966797, + "learning_rate": 2.781699562676071e-07, + "loss": 0.3594, "step": 132385 }, { - "epoch": 4.66, - "learning_rate": 6.172663874012857e-07, - "loss": 0.2453, + "epoch": 4.771326629905936, + "grad_norm": 0.23225435614585876, + "learning_rate": 2.7773602063768946e-07, + "loss": 0.3594, "step": 132390 }, { - "epoch": 4.66, - "learning_rate": 6.166374252040418e-07, - "loss": 0.2552, + "epoch": 4.77150682956716, + "grad_norm": 0.27175524830818176, + "learning_rate": 2.773024218438308e-07, + "loss": 0.3937, "step": 132395 }, { - "epoch": 4.66, - "learning_rate": 6.160087796086955e-07, - "loss": 0.234, + "epoch": 4.771687029228385, + "grad_norm": 0.250227153301239, + "learning_rate": 2.768691598919432e-07, + "loss": 0.3693, "step": 132400 }, { - "epoch": 4.66, - "learning_rate": 6.15380450623404e-07, - "loss": 0.2648, + "epoch": 4.77186722888961, + "grad_norm": 0.23589470982551575, + "learning_rate": 2.7643623478792456e-07, + "loss": 0.369, "step": 132405 }, { - "epoch": 4.66, - "learning_rate": 6.147524382563275e-07, - "loss": 0.2542, + "epoch": 4.7720474285508345, + "grad_norm": 0.32625946402549744, + "learning_rate": 2.7600364653767584e-07, + "loss": 0.3857, "step": 132410 }, { - "epoch": 4.66, - "learning_rate": 6.141247425156204e-07, - "loss": 0.22, + "epoch": 4.772227628212059, + "grad_norm": 0.29416128993034363, + 
"learning_rate": 2.755713951470923e-07, + "loss": 0.3979, "step": 132415 }, { - "epoch": 4.66, - "learning_rate": 6.134973634094321e-07, - "loss": 0.2267, + "epoch": 4.772407827873284, + "grad_norm": 0.22298985719680786, + "learning_rate": 2.7513948062205807e-07, + "loss": 0.383, "step": 132420 }, { - "epoch": 4.66, - "learning_rate": 6.128703009459114e-07, - "loss": 0.25, + "epoch": 4.772588027534509, + "grad_norm": 0.23173697292804718, + "learning_rate": 2.747079029684657e-07, + "loss": 0.335, "step": 132425 }, { - "epoch": 4.66, - "learning_rate": 6.122435551331962e-07, - "loss": 0.2679, + "epoch": 4.772768227195733, + "grad_norm": 0.3118121922016144, + "learning_rate": 2.7427666219219104e-07, + "loss": 0.411, "step": 132430 }, { - "epoch": 4.66, - "learning_rate": 6.116171259794273e-07, - "loss": 0.2352, + "epoch": 4.772948426856957, + "grad_norm": 0.23155774176120758, + "learning_rate": 2.738457582991072e-07, + "loss": 0.3765, "step": 132435 }, { - "epoch": 4.66, - "learning_rate": 6.109910134927344e-07, - "loss": 0.2364, + "epoch": 4.773128626518182, + "grad_norm": 0.25789666175842285, + "learning_rate": 2.734151912950872e-07, + "loss": 0.3977, "step": 132440 }, { - "epoch": 4.66, - "learning_rate": 6.103652176812524e-07, - "loss": 0.2653, + "epoch": 4.773308826179407, + "grad_norm": 0.24202270805835724, + "learning_rate": 2.729849611859986e-07, + "loss": 0.3611, "step": 132445 }, { - "epoch": 4.66, - "learning_rate": 6.097397385531029e-07, - "loss": 0.2623, + "epoch": 4.7734890258406315, + "grad_norm": 0.26847487688064575, + "learning_rate": 2.7255506797770615e-07, + "loss": 0.3389, "step": 132450 }, { - "epoch": 4.66, - "learning_rate": 6.091145761164125e-07, - "loss": 0.264, + "epoch": 4.773669225501856, + "grad_norm": 0.23348930478096008, + "learning_rate": 2.721255116760607e-07, + "loss": 0.3824, "step": 132455 }, { - "epoch": 4.66, - "learning_rate": 6.084897303792914e-07, - "loss": 0.2523, + "epoch": 4.773849425163081, + "grad_norm": 0.23757703602313995, + "learning_rate": 2.7169629228691597e-07, + "loss": 0.3796, "step": 132460 }, { - "epoch": 4.66, - "learning_rate": 6.078652013498609e-07, - "loss": 0.2527, + "epoch": 4.774029624824306, + "grad_norm": 0.2990122139453888, + "learning_rate": 2.7126740981612555e-07, + "loss": 0.4045, "step": 132465 }, { - "epoch": 4.66, - "learning_rate": 6.07240989036223e-07, - "loss": 0.2597, + "epoch": 4.7742098244855296, + "grad_norm": 0.2594546675682068, + "learning_rate": 2.708388642695237e-07, + "loss": 0.3844, "step": 132470 }, { - "epoch": 4.66, - "learning_rate": 6.066170934464877e-07, - "loss": 0.2469, + "epoch": 4.774390024146754, + "grad_norm": 0.2652234435081482, + "learning_rate": 2.7041065565296117e-07, + "loss": 0.3808, "step": 132475 }, { - "epoch": 4.66, - "learning_rate": 6.059935145887485e-07, - "loss": 0.2414, + "epoch": 4.774570223807979, + "grad_norm": 0.22843249142169952, + "learning_rate": 2.6998278397226404e-07, + "loss": 0.3518, "step": 132480 }, { - "epoch": 4.66, - "learning_rate": 6.05370252471113e-07, - "loss": 0.2587, + "epoch": 4.774750423469204, + "grad_norm": 0.24594981968402863, + "learning_rate": 2.695552492332609e-07, + "loss": 0.3952, "step": 132485 }, { - "epoch": 4.66, - "learning_rate": 6.047473071016663e-07, - "loss": 0.26, + "epoch": 4.7749306231304285, + "grad_norm": 0.20578601956367493, + "learning_rate": 2.6912805144178046e-07, + "loss": 0.3949, "step": 132490 }, { - "epoch": 4.66, - "learning_rate": 6.041246784884991e-07, - "loss": 0.2351, + "epoch": 4.775110822791653, + "grad_norm": 
0.22884593904018402, + "learning_rate": 2.6870119060364295e-07, + "loss": 0.366, "step": 132495 }, { - "epoch": 4.66, - "learning_rate": 6.035023666396938e-07, - "loss": 0.2501, + "epoch": 4.775291022452878, + "grad_norm": 0.2841814458370209, + "learning_rate": 2.682746667246633e-07, + "loss": 0.339, "step": 132500 }, { - "epoch": 4.66, - "eval_loss": 0.24850960075855255, - "eval_runtime": 10.5477, - "eval_samples_per_second": 9.481, - "eval_steps_per_second": 9.481, + "epoch": 4.775291022452878, + "eval_loss": 0.4288337230682373, + "eval_runtime": 3.5448, + "eval_samples_per_second": 28.21, + "eval_steps_per_second": 7.053, "step": 132500 }, { - "epoch": 4.66, - "learning_rate": 6.028803715633302e-07, - "loss": 0.2474, + "epoch": 4.775471222114103, + "grad_norm": 0.216835618019104, + "learning_rate": 2.678484798106534e-07, + "loss": 0.381, "step": 132505 }, { - "epoch": 4.66, - "learning_rate": 6.022586932674906e-07, - "loss": 0.2318, + "epoch": 4.775651421775327, + "grad_norm": 0.28074148297309875, + "learning_rate": 2.6742262986741706e-07, + "loss": 0.3725, "step": 132510 }, { - "epoch": 4.66, - "learning_rate": 6.016373317602436e-07, - "loss": 0.2425, + "epoch": 4.775831621436552, + "grad_norm": 0.3115323781967163, + "learning_rate": 2.669971169007607e-07, + "loss": 0.3841, "step": 132515 }, { - "epoch": 4.66, - "learning_rate": 6.010162870496522e-07, - "loss": 0.24, + "epoch": 4.776011821097776, + "grad_norm": 0.31163284182548523, + "learning_rate": 2.6657194091648243e-07, + "loss": 0.3959, "step": 132520 }, { - "epoch": 4.66, - "learning_rate": 6.003955591437904e-07, - "loss": 0.2605, + "epoch": 4.776192020759001, + "grad_norm": 0.25598570704460144, + "learning_rate": 2.661471019203693e-07, + "loss": 0.3576, "step": 132525 }, { - "epoch": 4.66, - "learning_rate": 5.997751480507102e-07, - "loss": 0.2381, + "epoch": 4.7763722204202255, + "grad_norm": 0.25508904457092285, + "learning_rate": 2.657225999182167e-07, + "loss": 0.3747, "step": 132530 }, { - "epoch": 4.66, - "learning_rate": 5.991550537784663e-07, - "loss": 0.2376, + "epoch": 4.77655242008145, + "grad_norm": 0.24659277498722076, + "learning_rate": 2.652984349158033e-07, + "loss": 0.3652, "step": 132535 }, { - "epoch": 4.66, - "learning_rate": 5.98535276335116e-07, - "loss": 0.2289, + "epoch": 4.776732619742675, + "grad_norm": 0.2608989179134369, + "learning_rate": 2.6487460691890787e-07, + "loss": 0.381, "step": 132540 }, { - "epoch": 4.66, - "learning_rate": 5.979158157287057e-07, - "loss": 0.2477, + "epoch": 4.7769128194039, + "grad_norm": 0.22622261941432953, + "learning_rate": 2.6445111593331187e-07, + "loss": 0.3443, "step": 132545 }, { - "epoch": 4.66, - "learning_rate": 5.972966719672762e-07, - "loss": 0.2415, + "epoch": 4.777093019065124, + "grad_norm": 0.23941785097122192, + "learning_rate": 2.640279619647773e-07, + "loss": 0.3482, "step": 132550 }, { - "epoch": 4.66, - "learning_rate": 5.966778450588628e-07, - "loss": 0.2521, + "epoch": 4.777273218726349, + "grad_norm": 0.25073572993278503, + "learning_rate": 2.6360514501907183e-07, + "loss": 0.3629, "step": 132555 }, { - "epoch": 4.66, - "learning_rate": 5.96059335011509e-07, - "loss": 0.2563, + "epoch": 4.777453418387573, + "grad_norm": 0.2828969359397888, + "learning_rate": 2.631826651019603e-07, + "loss": 0.3381, "step": 132560 }, { - "epoch": 4.66, - "learning_rate": 5.954411418332418e-07, - "loss": 0.255, + "epoch": 4.777633618048798, + "grad_norm": 0.29804477095603943, + "learning_rate": 2.627605222191937e-07, + "loss": 0.373, "step": 132565 }, { - "epoch": 
4.66, - "learning_rate": 5.948232655320879e-07, - "loss": 0.2468, + "epoch": 4.7778138177100224, + "grad_norm": 0.21889689564704895, + "learning_rate": 2.6233871637652563e-07, + "loss": 0.3954, "step": 132570 }, { - "epoch": 4.66, - "learning_rate": 5.942057061160688e-07, - "loss": 0.267, + "epoch": 4.777994017371247, + "grad_norm": 0.32417771220207214, + "learning_rate": 2.619172475797044e-07, + "loss": 0.4157, "step": 132575 }, { - "epoch": 4.66, - "learning_rate": 5.935884635932087e-07, - "loss": 0.2616, + "epoch": 4.778174217032472, + "grad_norm": 0.24297624826431274, + "learning_rate": 2.6149611583446975e-07, + "loss": 0.374, "step": 132580 }, { - "epoch": 4.66, - "learning_rate": 5.929715379715151e-07, - "loss": 0.2581, + "epoch": 4.778354416693697, + "grad_norm": 0.2629621922969818, + "learning_rate": 2.6107532114656166e-07, + "loss": 0.3859, "step": 132585 }, { - "epoch": 4.66, - "learning_rate": 5.923549292590008e-07, - "loss": 0.2606, + "epoch": 4.778534616354921, + "grad_norm": 0.2679719924926758, + "learning_rate": 2.6065486352171153e-07, + "loss": 0.366, "step": 132590 }, { - "epoch": 4.67, - "learning_rate": 5.917386374636735e-07, - "loss": 0.2622, + "epoch": 4.778714816016146, + "grad_norm": 0.25725606083869934, + "learning_rate": 2.6023474296565096e-07, + "loss": 0.4017, "step": 132595 }, { - "epoch": 4.67, - "learning_rate": 5.911226625935351e-07, - "loss": 0.236, + "epoch": 4.778895015677371, + "grad_norm": 0.26632368564605713, + "learning_rate": 2.598149594841004e-07, + "loss": 0.412, "step": 132600 }, { - "epoch": 4.67, - "learning_rate": 5.905070046565819e-07, - "loss": 0.2401, + "epoch": 4.779075215338596, + "grad_norm": 0.20511886477470398, + "learning_rate": 2.59395513082783e-07, + "loss": 0.3194, "step": 132605 }, { - "epoch": 4.67, - "learning_rate": 5.898916636608077e-07, - "loss": 0.2449, + "epoch": 4.779255414999819, + "grad_norm": 0.26528292894363403, + "learning_rate": 2.589764037674108e-07, + "loss": 0.3529, "step": 132610 }, { - "epoch": 4.67, - "learning_rate": 5.892766396142058e-07, - "loss": 0.2527, + "epoch": 4.779435614661044, + "grad_norm": 0.2384311705827713, + "learning_rate": 2.5855763154369604e-07, + "loss": 0.3273, "step": 132615 }, { - "epoch": 4.67, - "learning_rate": 5.886619325247561e-07, - "loss": 0.271, + "epoch": 4.779615814322269, + "grad_norm": 0.28175267577171326, + "learning_rate": 2.581391964173424e-07, + "loss": 0.3562, "step": 132620 }, { - "epoch": 4.67, - "learning_rate": 5.880475424004467e-07, - "loss": 0.2544, + "epoch": 4.779796013983494, + "grad_norm": 0.2850888967514038, + "learning_rate": 2.5772109839405367e-07, + "loss": 0.39, "step": 132625 }, { - "epoch": 4.67, - "learning_rate": 5.874334692492517e-07, - "loss": 0.2472, + "epoch": 4.779976213644718, + "grad_norm": 0.2696197032928467, + "learning_rate": 2.573033374795225e-07, + "loss": 0.3607, "step": 132630 }, { - "epoch": 4.67, - "learning_rate": 5.868197130791397e-07, - "loss": 0.2511, + "epoch": 4.780156413305943, + "grad_norm": 0.2529081702232361, + "learning_rate": 2.5688591367944437e-07, + "loss": 0.3348, "step": 132635 }, { - "epoch": 4.67, - "learning_rate": 5.862062738980906e-07, - "loss": 0.2521, + "epoch": 4.780336612967168, + "grad_norm": 0.2832430601119995, + "learning_rate": 2.5646882699950634e-07, + "loss": 0.3933, "step": 132640 }, { - "epoch": 4.67, - "learning_rate": 5.855931517140617e-07, - "loss": 0.249, + "epoch": 4.780516812628393, + "grad_norm": 0.22554835677146912, + "learning_rate": 2.5605207744538727e-07, + "loss": 0.3706, "step": 132645 }, { - 
"epoch": 4.67, - "learning_rate": 5.849803465350162e-07, - "loss": 0.261, + "epoch": 4.780697012289616, + "grad_norm": 0.24301104247570038, + "learning_rate": 2.556356650227687e-07, + "loss": 0.3657, "step": 132650 }, { - "epoch": 4.67, - "learning_rate": 5.843678583689116e-07, - "loss": 0.2515, + "epoch": 4.780877211950841, + "grad_norm": 0.2747708559036255, + "learning_rate": 2.5521958973732383e-07, + "loss": 0.3806, "step": 132655 }, { - "epoch": 4.67, - "learning_rate": 5.837556872236999e-07, - "loss": 0.248, + "epoch": 4.781057411612066, + "grad_norm": 0.2549690008163452, + "learning_rate": 2.548038515947232e-07, + "loss": 0.3701, "step": 132660 }, { - "epoch": 4.67, - "learning_rate": 5.831438331073275e-07, - "loss": 0.2506, + "epoch": 4.781237611273291, + "grad_norm": 0.24783384799957275, + "learning_rate": 2.5438845060062887e-07, + "loss": 0.3806, "step": 132665 }, { - "epoch": 4.67, - "learning_rate": 5.825322960277435e-07, - "loss": 0.2607, + "epoch": 4.781417810934515, + "grad_norm": 0.2004653662443161, + "learning_rate": 2.5397338676069746e-07, + "loss": 0.3775, "step": 132670 }, { - "epoch": 4.67, - "learning_rate": 5.819210759928833e-07, - "loss": 0.2541, + "epoch": 4.78159801059574, + "grad_norm": 0.2571395933628082, + "learning_rate": 2.5355866008059117e-07, + "loss": 0.3793, "step": 132675 }, { - "epoch": 4.67, - "learning_rate": 5.813101730106879e-07, - "loss": 0.249, + "epoch": 4.781778210256965, + "grad_norm": 0.2348882555961609, + "learning_rate": 2.5314427056595537e-07, + "loss": 0.391, "step": 132680 }, { - "epoch": 4.67, - "learning_rate": 5.806995870890869e-07, - "loss": 0.2453, + "epoch": 4.7819584099181895, + "grad_norm": 0.26653140783309937, + "learning_rate": 2.527302182224384e-07, + "loss": 0.3878, "step": 132685 }, { - "epoch": 4.67, - "learning_rate": 5.800893182360073e-07, - "loss": 0.2662, + "epoch": 4.782138609579414, + "grad_norm": 0.2474595308303833, + "learning_rate": 2.523165030556829e-07, + "loss": 0.3423, "step": 132690 }, { - "epoch": 4.67, - "learning_rate": 5.794793664593762e-07, - "loss": 0.2356, + "epoch": 4.782318809240639, + "grad_norm": 0.2751745879650116, + "learning_rate": 2.519031250713205e-07, + "loss": 0.3803, "step": 132695 }, { - "epoch": 4.67, - "learning_rate": 5.788697317671149e-07, - "loss": 0.2409, + "epoch": 4.782499008901864, + "grad_norm": 0.27088475227355957, + "learning_rate": 2.5149008427498567e-07, + "loss": 0.3678, "step": 132700 }, { - "epoch": 4.67, - "learning_rate": 5.78260414167131e-07, - "loss": 0.2677, + "epoch": 4.782679208563088, + "grad_norm": 0.2260597050189972, + "learning_rate": 2.510773806723099e-07, + "loss": 0.3565, "step": 132705 }, { - "epoch": 4.67, - "learning_rate": 5.776514136673433e-07, - "loss": 0.257, + "epoch": 4.782859408224312, + "grad_norm": 0.23380564153194427, + "learning_rate": 2.5066501426891377e-07, + "loss": 0.3805, "step": 132710 }, { - "epoch": 4.67, - "learning_rate": 5.770427302756564e-07, - "loss": 0.2392, + "epoch": 4.783039607885537, + "grad_norm": 0.22910314798355103, + "learning_rate": 2.5025298507041494e-07, + "loss": 0.325, "step": 132715 }, { - "epoch": 4.67, - "learning_rate": 5.764343639999725e-07, - "loss": 0.2663, + "epoch": 4.783219807546762, + "grad_norm": 0.2516254782676697, + "learning_rate": 2.498412930824257e-07, + "loss": 0.372, "step": 132720 }, { - "epoch": 4.67, - "learning_rate": 5.758263148481963e-07, - "loss": 0.2483, + "epoch": 4.7834000072079865, + "grad_norm": 0.24351219832897186, + "learning_rate": 2.4942993831055803e-07, + "loss": 0.3611, "step": 132725 
}, { - "epoch": 4.67, - "learning_rate": 5.752185828282159e-07, - "loss": 0.2377, + "epoch": 4.783580206869211, + "grad_norm": 0.3001359701156616, + "learning_rate": 2.49018920760416e-07, + "loss": 0.4113, "step": 132730 }, { - "epoch": 4.67, - "learning_rate": 5.746111679479277e-07, - "loss": 0.2485, + "epoch": 4.783760406530436, + "grad_norm": 0.256946861743927, + "learning_rate": 2.486082404375978e-07, + "loss": 0.3766, "step": 132735 }, { - "epoch": 4.67, - "learning_rate": 5.740040702152172e-07, - "loss": 0.2491, + "epoch": 4.783940606191661, + "grad_norm": 0.25258302688598633, + "learning_rate": 2.4819789734770174e-07, + "loss": 0.3731, "step": 132740 }, { - "epoch": 4.67, - "learning_rate": 5.733972896379669e-07, - "loss": 0.2482, + "epoch": 4.784120805852885, + "grad_norm": 0.24291817843914032, + "learning_rate": 2.4778789149631773e-07, + "loss": 0.3833, "step": 132745 }, { - "epoch": 4.67, - "learning_rate": 5.727908262240539e-07, - "loss": 0.2355, + "epoch": 4.784301005514109, + "grad_norm": 0.25857844948768616, + "learning_rate": 2.4737822288902756e-07, + "loss": 0.3858, "step": 132750 }, { - "epoch": 4.67, - "learning_rate": 5.721846799813524e-07, - "loss": 0.2578, + "epoch": 4.784481205175334, + "grad_norm": 0.276473730802536, + "learning_rate": 2.4696889153142386e-07, + "loss": 0.3602, "step": 132755 }, { - "epoch": 4.67, - "learning_rate": 5.715788509177367e-07, - "loss": 0.2684, + "epoch": 4.784661404836559, + "grad_norm": 0.2200738787651062, + "learning_rate": 2.465598974290717e-07, + "loss": 0.3473, "step": 132760 }, { - "epoch": 4.67, - "learning_rate": 5.709733390410698e-07, - "loss": 0.281, + "epoch": 4.7848416044977835, + "grad_norm": 0.22857971489429474, + "learning_rate": 2.461512405875471e-07, + "loss": 0.3452, "step": 132765 }, { - "epoch": 4.67, - "learning_rate": 5.703681443592152e-07, - "loss": 0.2661, + "epoch": 4.785021804159008, + "grad_norm": 0.2619878351688385, + "learning_rate": 2.4574292101242347e-07, + "loss": 0.3811, "step": 132770 }, { - "epoch": 4.67, - "learning_rate": 5.697632668800302e-07, - "loss": 0.2444, + "epoch": 4.785202003820233, + "grad_norm": 0.2593579888343811, + "learning_rate": 2.453349387092574e-07, + "loss": 0.3729, "step": 132775 }, { - "epoch": 4.67, - "learning_rate": 5.691587066113668e-07, - "loss": 0.2518, + "epoch": 4.785382203481458, + "grad_norm": 0.22074739634990692, + "learning_rate": 2.449272936836111e-07, + "loss": 0.3484, "step": 132780 }, { - "epoch": 4.67, - "learning_rate": 5.685544635610773e-07, - "loss": 0.256, + "epoch": 4.785562403142682, + "grad_norm": 0.25108104944229126, + "learning_rate": 2.4451998594103854e-07, + "loss": 0.3631, "step": 132785 }, { - "epoch": 4.67, - "learning_rate": 5.679505377370109e-07, - "loss": 0.2404, + "epoch": 4.785742602803907, + "grad_norm": 0.2209988385438919, + "learning_rate": 2.441130154870852e-07, + "loss": 0.3648, "step": 132790 }, { - "epoch": 4.67, - "learning_rate": 5.673469291470029e-07, - "loss": 0.2286, + "epoch": 4.785922802465131, + "grad_norm": 0.2671109735965729, + "learning_rate": 2.437063823273023e-07, + "loss": 0.3766, "step": 132795 }, { - "epoch": 4.67, - "learning_rate": 5.667436377988944e-07, - "loss": 0.2347, + "epoch": 4.786103002126356, + "grad_norm": 0.25873270630836487, + "learning_rate": 2.4330008646722693e-07, + "loss": 0.4221, "step": 132800 }, { - "epoch": 4.67, - "learning_rate": 5.661406637005151e-07, - "loss": 0.2382, + "epoch": 4.7862832017875805, + "grad_norm": 0.24569159746170044, + "learning_rate": 2.428941279123936e-07, + "loss": 0.349, 
"step": 132805 }, { - "epoch": 4.67, - "learning_rate": 5.655380068597005e-07, - "loss": 0.2313, + "epoch": 4.786463401448805, + "grad_norm": 0.22799314558506012, + "learning_rate": 2.424885066683341e-07, + "loss": 0.3881, "step": 132810 }, { - "epoch": 4.67, - "learning_rate": 5.649356672842721e-07, - "loss": 0.2519, + "epoch": 4.78664360111003, + "grad_norm": 0.258485347032547, + "learning_rate": 2.420832227405745e-07, + "loss": 0.3969, "step": 132815 }, { - "epoch": 4.67, - "learning_rate": 5.643336449820485e-07, - "loss": 0.2517, + "epoch": 4.786823800771255, + "grad_norm": 0.208878755569458, + "learning_rate": 2.416782761346409e-07, + "loss": 0.361, "step": 132820 }, { - "epoch": 4.67, - "learning_rate": 5.637319399608487e-07, - "loss": 0.2417, + "epoch": 4.787004000432479, + "grad_norm": 0.2455977350473404, + "learning_rate": 2.4127366685604834e-07, + "loss": 0.3483, "step": 132825 }, { - "epoch": 4.67, - "learning_rate": 5.631305522284858e-07, - "loss": 0.2561, + "epoch": 4.787184200093704, + "grad_norm": 0.25862938165664673, + "learning_rate": 2.408693949103036e-07, + "loss": 0.3815, "step": 132830 }, { - "epoch": 4.67, - "learning_rate": 5.625294817927673e-07, - "loss": 0.2748, + "epoch": 4.787364399754928, + "grad_norm": 0.2472279667854309, + "learning_rate": 2.4046546030292437e-07, + "loss": 0.3725, "step": 132835 }, { - "epoch": 4.67, - "learning_rate": 5.619287286615011e-07, - "loss": 0.2655, + "epoch": 4.787544599416153, + "grad_norm": 0.2628474533557892, + "learning_rate": 2.400618630394064e-07, + "loss": 0.3722, "step": 132840 }, { - "epoch": 4.67, - "learning_rate": 5.613282928424806e-07, - "loss": 0.2467, + "epoch": 4.7877247990773775, + "grad_norm": 0.2300945371389389, + "learning_rate": 2.3965860312525344e-07, + "loss": 0.3503, "step": 132845 }, { - "epoch": 4.67, - "learning_rate": 5.607281743435112e-07, - "loss": 0.2592, + "epoch": 4.787904998738602, + "grad_norm": 0.27947181463241577, + "learning_rate": 2.392556805659585e-07, + "loss": 0.3705, "step": 132850 }, { - "epoch": 4.67, - "learning_rate": 5.601283731723806e-07, - "loss": 0.2365, + "epoch": 4.788085198399827, + "grad_norm": 0.22649262845516205, + "learning_rate": 2.3885309536700874e-07, + "loss": 0.3519, "step": 132855 }, { - "epoch": 4.67, - "learning_rate": 5.595288893368744e-07, - "loss": 0.25, + "epoch": 4.788265398061052, + "grad_norm": 0.2503747045993805, + "learning_rate": 2.3845084753389426e-07, + "loss": 0.3673, "step": 132860 }, { - "epoch": 4.67, - "learning_rate": 5.589297228447781e-07, - "loss": 0.2686, + "epoch": 4.788445597722276, + "grad_norm": 0.2534830570220947, + "learning_rate": 2.3804893707209396e-07, + "loss": 0.3949, "step": 132865 }, { - "epoch": 4.67, - "learning_rate": 5.583308737038745e-07, - "loss": 0.2597, + "epoch": 4.788625797383501, + "grad_norm": 0.2691611349582672, + "learning_rate": 2.3764736398708133e-07, + "loss": 0.3847, "step": 132870 }, { - "epoch": 4.67, - "learning_rate": 5.577323419219349e-07, - "loss": 0.2482, + "epoch": 4.788805997044726, + "grad_norm": 0.23171448707580566, + "learning_rate": 2.3724612828432968e-07, + "loss": 0.3372, "step": 132875 }, { - "epoch": 4.68, - "learning_rate": 5.571341275067338e-07, - "loss": 0.2365, + "epoch": 4.788986196705951, + "grad_norm": 0.20922648906707764, + "learning_rate": 2.3684522996930137e-07, + "loss": 0.3914, "step": 132880 }, { - "epoch": 4.68, - "learning_rate": 5.565362304660371e-07, - "loss": 0.2679, + "epoch": 4.7891663963671744, + "grad_norm": 0.24962696433067322, + "learning_rate": 2.3644466904746697e-07, + 
"loss": 0.3608, "step": 132885 }, { - "epoch": 4.68, - "learning_rate": 5.559386508076053e-07, - "loss": 0.2361, + "epoch": 4.789346596028399, + "grad_norm": 0.3583545982837677, + "learning_rate": 2.3604444552427773e-07, + "loss": 0.3603, "step": 132890 }, { - "epoch": 4.68, - "learning_rate": 5.553413885392044e-07, - "loss": 0.2445, + "epoch": 4.789526795689624, + "grad_norm": 0.26689061522483826, + "learning_rate": 2.3564455940519037e-07, + "loss": 0.3372, "step": 132895 }, { - "epoch": 4.68, - "learning_rate": 5.547444436685866e-07, - "loss": 0.2383, + "epoch": 4.789706995350849, + "grad_norm": 0.24085666239261627, + "learning_rate": 2.3524501069565053e-07, + "loss": 0.3906, "step": 132900 }, { - "epoch": 4.68, - "learning_rate": 5.541478162035013e-07, - "loss": 0.2589, + "epoch": 4.789887195012073, + "grad_norm": 0.2830328345298767, + "learning_rate": 2.3484579940110385e-07, + "loss": 0.3628, "step": 132905 }, { - "epoch": 4.68, - "learning_rate": 5.535515061516977e-07, - "loss": 0.2598, + "epoch": 4.790067394673298, + "grad_norm": 0.30713436007499695, + "learning_rate": 2.3444692552698488e-07, + "loss": 0.3912, "step": 132910 }, { - "epoch": 4.68, - "learning_rate": 5.529555135209142e-07, - "loss": 0.2544, + "epoch": 4.790247594334523, + "grad_norm": 0.21235887706279755, + "learning_rate": 2.340483890787337e-07, + "loss": 0.3317, "step": 132915 }, { - "epoch": 4.68, - "learning_rate": 5.523598383188916e-07, - "loss": 0.2531, + "epoch": 4.790427793995748, + "grad_norm": 0.2915695011615753, + "learning_rate": 2.3365019006177936e-07, + "loss": 0.3714, "step": 132920 }, { - "epoch": 4.68, - "learning_rate": 5.517644805533684e-07, - "loss": 0.2508, + "epoch": 4.790607993656971, + "grad_norm": 0.29393500089645386, + "learning_rate": 2.3325232848154522e-07, + "loss": 0.3477, "step": 132925 }, { - "epoch": 4.68, - "learning_rate": 5.511694402320716e-07, - "loss": 0.2439, + "epoch": 4.790788193318196, + "grad_norm": 0.2399698942899704, + "learning_rate": 2.3285480434345474e-07, + "loss": 0.3784, "step": 132930 }, { - "epoch": 4.68, - "learning_rate": 5.505747173627257e-07, - "loss": 0.2572, + "epoch": 4.790968392979421, + "grad_norm": 0.25534698367118835, + "learning_rate": 2.324576176529203e-07, + "loss": 0.3805, "step": 132935 }, { - "epoch": 4.68, - "learning_rate": 5.499803119530522e-07, - "loss": 0.2283, + "epoch": 4.791148592640646, + "grad_norm": 0.23360496759414673, + "learning_rate": 2.3206076841535696e-07, + "loss": 0.3425, "step": 132940 }, { - "epoch": 4.68, - "learning_rate": 5.4938622401077e-07, - "loss": 0.2578, + "epoch": 4.79132879230187, + "grad_norm": 0.2366493046283722, + "learning_rate": 2.3166425663617154e-07, + "loss": 0.3638, "step": 132945 }, { - "epoch": 4.68, - "learning_rate": 5.487924535435979e-07, - "loss": 0.2574, + "epoch": 4.791508991963095, + "grad_norm": 0.23220838606357574, + "learning_rate": 2.3126808232076247e-07, + "loss": 0.367, "step": 132950 }, { - "epoch": 4.68, - "learning_rate": 5.481990005592435e-07, - "loss": 0.2252, + "epoch": 4.79168919162432, + "grad_norm": 0.22431600093841553, + "learning_rate": 2.30872245474531e-07, + "loss": 0.3759, "step": 132955 }, { - "epoch": 4.68, - "learning_rate": 5.476058650654065e-07, - "loss": 0.2651, + "epoch": 4.791869391285545, + "grad_norm": 0.24509382247924805, + "learning_rate": 2.3047674610287007e-07, + "loss": 0.3951, "step": 132960 }, { - "epoch": 4.68, - "learning_rate": 5.470130470697943e-07, - "loss": 0.2429, + "epoch": 4.792049590946769, + "grad_norm": 0.3701198101043701, + "learning_rate": 
2.3008158421116977e-07, + "loss": 0.3721, "step": 132965 }, { - "epoch": 4.68, - "learning_rate": 5.464205465801037e-07, - "loss": 0.2431, + "epoch": 4.792229790607994, + "grad_norm": 0.2326425015926361, + "learning_rate": 2.2968675980480913e-07, + "loss": 0.3701, "step": 132970 }, { - "epoch": 4.68, - "learning_rate": 5.458283636040257e-07, - "loss": 0.2552, + "epoch": 4.792409990269219, + "grad_norm": 0.25888702273368835, + "learning_rate": 2.2929227288917278e-07, + "loss": 0.3776, "step": 132975 }, { - "epoch": 4.68, - "learning_rate": 5.452364981492486e-07, - "loss": 0.2313, + "epoch": 4.792590189930443, + "grad_norm": 0.2808634638786316, + "learning_rate": 2.2889812346963134e-07, + "loss": 0.3671, "step": 132980 }, { - "epoch": 4.68, - "learning_rate": 5.446449502234635e-07, - "loss": 0.246, + "epoch": 4.792770389591667, + "grad_norm": 0.24987556040287018, + "learning_rate": 2.2850431155156116e-07, + "loss": 0.3775, "step": 132985 }, { - "epoch": 4.68, - "learning_rate": 5.440537198343449e-07, - "loss": 0.2323, + "epoch": 4.792950589252892, + "grad_norm": 0.3451884388923645, + "learning_rate": 2.2811083714032177e-07, + "loss": 0.3949, "step": 132990 }, { - "epoch": 4.68, - "learning_rate": 5.434628069895698e-07, - "loss": 0.2413, + "epoch": 4.793130788914117, + "grad_norm": 0.23816993832588196, + "learning_rate": 2.2771770024127559e-07, + "loss": 0.3669, "step": 132995 }, { - "epoch": 4.68, - "learning_rate": 5.428722116968127e-07, - "loss": 0.263, + "epoch": 4.7933109885753415, + "grad_norm": 0.28144383430480957, + "learning_rate": 2.2732490085977943e-07, + "loss": 0.3463, "step": 133000 }, { - "epoch": 4.68, - "eval_loss": 0.24850255250930786, - "eval_runtime": 10.5511, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 4.7933109885753415, + "eval_loss": 0.428801029920578, + "eval_runtime": 3.5412, + "eval_samples_per_second": 28.239, + "eval_steps_per_second": 7.06, "step": 133000 }, { - "epoch": 4.68, - "learning_rate": 5.422819339637453e-07, - "loss": 0.2391, + "epoch": 4.793491188236566, + "grad_norm": 0.29074543714523315, + "learning_rate": 2.2693243900118454e-07, + "loss": 0.3782, "step": 133005 }, { - "epoch": 4.68, - "learning_rate": 5.41691973798028e-07, - "loss": 0.2412, + "epoch": 4.793671387897791, + "grad_norm": 0.25907793641090393, + "learning_rate": 2.2654031467084225e-07, + "loss": 0.3529, "step": 133010 }, { - "epoch": 4.68, - "learning_rate": 5.411023312073188e-07, - "loss": 0.2447, + "epoch": 4.793851587559016, + "grad_norm": 0.2522429823875427, + "learning_rate": 2.261485278740899e-07, + "loss": 0.3729, "step": 133015 }, { - "epoch": 4.68, - "learning_rate": 5.405130061992808e-07, - "loss": 0.2358, + "epoch": 4.79403178722024, + "grad_norm": 0.2468443363904953, + "learning_rate": 2.257570786162677e-07, + "loss": 0.3641, "step": 133020 }, { - "epoch": 4.68, - "learning_rate": 5.399239987815607e-07, - "loss": 0.2476, + "epoch": 4.794211986881464, + "grad_norm": 0.2643374502658844, + "learning_rate": 2.2536596690270751e-07, + "loss": 0.3705, "step": 133025 }, { - "epoch": 4.68, - "learning_rate": 5.393353089618081e-07, - "loss": 0.2601, + "epoch": 4.794392186542689, + "grad_norm": 0.34247177839279175, + "learning_rate": 2.2497519273874113e-07, + "loss": 0.3723, "step": 133030 }, { - "epoch": 4.68, - "learning_rate": 5.387469367476638e-07, - "loss": 0.2679, + "epoch": 4.794572386203914, + "grad_norm": 0.28834789991378784, + "learning_rate": 2.245847561296921e-07, + "loss": 0.367, "step": 133035 }, { - "epoch": 4.68, - "learning_rate": 
5.38158882146772e-07, - "loss": 0.2602, + "epoch": 4.7947525858651385, + "grad_norm": 0.2463623285293579, + "learning_rate": 2.241946570808756e-07, + "loss": 0.3654, "step": 133040 }, { - "epoch": 4.68, - "learning_rate": 5.375711451667653e-07, - "loss": 0.2363, + "epoch": 4.794932785526363, + "grad_norm": 0.24550725519657135, + "learning_rate": 2.2380489559761242e-07, + "loss": 0.3917, "step": 133045 }, { - "epoch": 4.68, - "learning_rate": 5.369837258152766e-07, - "loss": 0.2532, + "epoch": 4.795112985187588, + "grad_norm": 0.23969918489456177, + "learning_rate": 2.2341547168521215e-07, + "loss": 0.3357, "step": 133050 }, { - "epoch": 4.68, - "learning_rate": 5.363966240999329e-07, - "loss": 0.2625, + "epoch": 4.795293184848813, + "grad_norm": 0.1967889666557312, + "learning_rate": 2.230263853489789e-07, + "loss": 0.3329, "step": 133055 }, { - "epoch": 4.68, - "learning_rate": 5.358098400283534e-07, - "loss": 0.2409, + "epoch": 4.7954733845100375, + "grad_norm": 0.2610912322998047, + "learning_rate": 2.2263763659421123e-07, + "loss": 0.3567, "step": 133060 }, { - "epoch": 4.68, - "learning_rate": 5.352233736081652e-07, - "loss": 0.2583, + "epoch": 4.795653584171262, + "grad_norm": 0.26562637090682983, + "learning_rate": 2.222492254262104e-07, + "loss": 0.348, "step": 133065 }, { - "epoch": 4.68, - "learning_rate": 5.346372248469761e-07, - "loss": 0.2374, + "epoch": 4.795833783832486, + "grad_norm": 0.2973119020462036, + "learning_rate": 2.2186115185026668e-07, + "loss": 0.4043, "step": 133070 }, { - "epoch": 4.68, - "learning_rate": 5.340513937523994e-07, - "loss": 0.2515, + "epoch": 4.796013983493711, + "grad_norm": 0.28032979369163513, + "learning_rate": 2.2147341587166748e-07, + "loss": 0.3818, "step": 133075 }, { - "epoch": 4.68, - "learning_rate": 5.33465880332043e-07, - "loss": 0.2455, + "epoch": 4.7961941831549355, + "grad_norm": 0.2844066917896271, + "learning_rate": 2.2108601749569467e-07, + "loss": 0.3509, "step": 133080 }, { - "epoch": 4.68, - "learning_rate": 5.328806845935091e-07, - "loss": 0.2345, + "epoch": 4.79637438281616, + "grad_norm": 0.25133219361305237, + "learning_rate": 2.206989567276302e-07, + "loss": 0.3783, "step": 133085 }, { - "epoch": 4.68, - "learning_rate": 5.322958065443944e-07, - "loss": 0.2455, + "epoch": 4.796554582477385, + "grad_norm": 0.2511965036392212, + "learning_rate": 2.2031223357274477e-07, + "loss": 0.3864, "step": 133090 }, { - "epoch": 4.68, - "learning_rate": 5.317112461922902e-07, - "loss": 0.2656, + "epoch": 4.79673478213861, + "grad_norm": 0.27423515915870667, + "learning_rate": 2.1992584803630368e-07, + "loss": 0.3573, "step": 133095 }, { - "epoch": 4.68, - "learning_rate": 5.311270035447957e-07, - "loss": 0.2337, + "epoch": 4.796914981799834, + "grad_norm": 0.24834780395030975, + "learning_rate": 2.1953980012357767e-07, + "loss": 0.3601, "step": 133100 }, { - "epoch": 4.68, - "learning_rate": 5.305430786094856e-07, - "loss": 0.2505, + "epoch": 4.797095181461059, + "grad_norm": 0.20566605031490326, + "learning_rate": 2.1915408983982643e-07, + "loss": 0.3793, "step": 133105 }, { - "epoch": 4.68, - "learning_rate": 5.29959471393951e-07, - "loss": 0.2346, + "epoch": 4.797275381122283, + "grad_norm": 0.30788904428482056, + "learning_rate": 2.1876871719030135e-07, + "loss": 0.3647, "step": 133110 }, { - "epoch": 4.68, - "learning_rate": 5.293761819057663e-07, - "loss": 0.2234, + "epoch": 4.797455580783508, + "grad_norm": 0.2940426766872406, + "learning_rate": 2.1838368218025374e-07, + "loss": 0.3761, "step": 133115 }, { - "epoch": 4.68, - 
"learning_rate": 5.287932101525061e-07, - "loss": 0.2616, + "epoch": 4.7976357804447325, + "grad_norm": 0.24170757830142975, + "learning_rate": 2.1799898481493219e-07, + "loss": 0.3499, "step": 133120 }, { - "epoch": 4.68, - "learning_rate": 5.282105561417366e-07, - "loss": 0.2709, + "epoch": 4.797815980105957, + "grad_norm": 0.19300325214862823, + "learning_rate": 2.1761462509957698e-07, + "loss": 0.3965, "step": 133125 }, { - "epoch": 4.68, - "learning_rate": 5.276282198810267e-07, - "loss": 0.2789, + "epoch": 4.797996179767182, + "grad_norm": 0.2832929491996765, + "learning_rate": 2.1723060303942278e-07, + "loss": 0.3731, "step": 133130 }, { - "epoch": 4.68, - "learning_rate": 5.270462013779371e-07, - "loss": 0.2462, + "epoch": 4.798176379428407, + "grad_norm": 0.2704760730266571, + "learning_rate": 2.1684691863970152e-07, + "loss": 0.3728, "step": 133135 }, { - "epoch": 4.68, - "learning_rate": 5.264645006400254e-07, - "loss": 0.2529, + "epoch": 4.798356579089631, + "grad_norm": 0.2666027247905731, + "learning_rate": 2.1646357190564514e-07, + "loss": 0.3435, "step": 133140 }, { - "epoch": 4.68, - "learning_rate": 5.258831176748413e-07, - "loss": 0.2318, + "epoch": 4.798536778750856, + "grad_norm": 0.2326856553554535, + "learning_rate": 2.1608056284247725e-07, + "loss": 0.3331, "step": 133145 }, { - "epoch": 4.68, - "learning_rate": 5.253020524899343e-07, - "loss": 0.2744, + "epoch": 4.798716978412081, + "grad_norm": 0.23978599905967712, + "learning_rate": 2.1569789145541031e-07, + "loss": 0.3723, "step": 133150 }, { - "epoch": 4.68, - "learning_rate": 5.248374291488805e-07, - "loss": 0.2483, + "epoch": 4.798897178073306, + "grad_norm": 0.2229306995868683, + "learning_rate": 2.1531555774966238e-07, + "loss": 0.3922, "step": 133155 }, { - "epoch": 4.68, - "learning_rate": 5.242569359874882e-07, - "loss": 0.235, + "epoch": 4.79907737773453, + "grad_norm": 0.26587215065956116, + "learning_rate": 2.1493356173043765e-07, + "loss": 0.3805, "step": 133160 }, { - "epoch": 4.69, - "learning_rate": 5.236767606274867e-07, - "loss": 0.2279, + "epoch": 4.799257577395754, + "grad_norm": 0.21258600056171417, + "learning_rate": 2.1455190340294863e-07, + "loss": 0.3379, "step": 133165 }, { - "epoch": 4.69, - "learning_rate": 5.230969030764143e-07, - "loss": 0.2417, + "epoch": 4.799437777056979, + "grad_norm": 0.2616501450538635, + "learning_rate": 2.141705827723911e-07, + "loss": 0.3754, "step": 133170 }, { - "epoch": 4.69, - "learning_rate": 5.225173633417985e-07, - "loss": 0.2398, + "epoch": 4.799617976718204, + "grad_norm": 0.21897155046463013, + "learning_rate": 2.1378959984395818e-07, + "loss": 0.3835, "step": 133175 }, { - "epoch": 4.69, - "learning_rate": 5.219381414311636e-07, - "loss": 0.2554, + "epoch": 4.799798176379428, + "grad_norm": 0.26772749423980713, + "learning_rate": 2.1340895462284571e-07, + "loss": 0.3344, "step": 133180 }, { - "epoch": 4.69, - "learning_rate": 5.21359237352026e-07, - "loss": 0.2657, + "epoch": 4.799978376040653, + "grad_norm": 0.3039627969264984, + "learning_rate": 2.1302864711423564e-07, + "loss": 0.4117, "step": 133185 }, { - "epoch": 4.69, - "learning_rate": 5.207806511119101e-07, - "loss": 0.2319, + "epoch": 4.800158575701878, + "grad_norm": 0.27932974696159363, + "learning_rate": 2.1264867732331272e-07, + "loss": 0.3639, "step": 133190 }, { - "epoch": 4.69, - "learning_rate": 5.202023827183239e-07, - "loss": 0.242, + "epoch": 4.800338775363103, + "grad_norm": 0.23750893771648407, + "learning_rate": 2.1226904525525336e-07, + "loss": 0.3618, "step": 133195 }, { 
- "epoch": 4.69, - "learning_rate": 5.19624432178778e-07, - "loss": 0.2349, + "epoch": 4.800518975024327, + "grad_norm": 0.26062673330307007, + "learning_rate": 2.1188975091522568e-07, + "loss": 0.372, "step": 133200 }, { - "epoch": 4.69, - "learning_rate": 5.190467995007747e-07, - "loss": 0.2436, + "epoch": 4.800699174685551, + "grad_norm": 0.250861257314682, + "learning_rate": 2.1151079430840325e-07, + "loss": 0.393, "step": 133205 }, { - "epoch": 4.69, - "learning_rate": 5.184694846918164e-07, - "loss": 0.2547, + "epoch": 4.800879374346776, + "grad_norm": 0.2910214364528656, + "learning_rate": 2.111321754399487e-07, + "loss": 0.3412, "step": 133210 }, { - "epoch": 4.69, - "learning_rate": 5.178924877593971e-07, - "loss": 0.2507, + "epoch": 4.801059574008001, + "grad_norm": 0.23345068097114563, + "learning_rate": 2.1075389431501613e-07, + "loss": 0.3914, "step": 133215 }, { - "epoch": 4.69, - "learning_rate": 5.17315808711008e-07, - "loss": 0.2255, + "epoch": 4.801239773669225, + "grad_norm": 0.22395312786102295, + "learning_rate": 2.1037595093876538e-07, + "loss": 0.3657, "step": 133220 }, { - "epoch": 4.69, - "learning_rate": 5.167394475541404e-07, - "loss": 0.2469, + "epoch": 4.80141997333045, + "grad_norm": 0.21458613872528076, + "learning_rate": 2.0999834531633955e-07, + "loss": 0.3573, "step": 133225 }, { - "epoch": 4.69, - "learning_rate": 5.16163404296277e-07, - "loss": 0.2552, + "epoch": 4.801600172991675, + "grad_norm": 0.2688688635826111, + "learning_rate": 2.0962107745288727e-07, + "loss": 0.3631, "step": 133230 }, { - "epoch": 4.69, - "learning_rate": 5.155876789448955e-07, - "loss": 0.2525, + "epoch": 4.8017803726529, + "grad_norm": 0.2928563952445984, + "learning_rate": 2.0924414735355168e-07, + "loss": 0.3654, "step": 133235 }, { - "epoch": 4.69, - "learning_rate": 5.150122715074701e-07, - "loss": 0.2423, + "epoch": 4.801960572314124, + "grad_norm": 0.360637903213501, + "learning_rate": 2.08867555023462e-07, + "loss": 0.3995, "step": 133240 }, { - "epoch": 4.69, - "learning_rate": 5.144371819914757e-07, - "loss": 0.2567, + "epoch": 4.802140771975349, + "grad_norm": 0.3004918694496155, + "learning_rate": 2.08491300467753e-07, + "loss": 0.3326, "step": 133245 }, { - "epoch": 4.69, - "learning_rate": 5.138624104043754e-07, - "loss": 0.2347, + "epoch": 4.802320971636574, + "grad_norm": 0.25441572070121765, + "learning_rate": 2.0811538369155115e-07, + "loss": 0.4041, "step": 133250 }, { - "epoch": 4.69, - "learning_rate": 5.132879567536386e-07, - "loss": 0.2455, + "epoch": 4.802501171297798, + "grad_norm": 0.22652685642242432, + "learning_rate": 2.0773980469997456e-07, + "loss": 0.3531, "step": 133255 }, { - "epoch": 4.69, - "learning_rate": 5.127138210467175e-07, - "loss": 0.2377, + "epoch": 4.802681370959022, + "grad_norm": 0.2670671343803406, + "learning_rate": 2.0736456349814416e-07, + "loss": 0.3764, "step": 133260 }, { - "epoch": 4.69, - "learning_rate": 5.1214000329107e-07, - "loss": 0.2489, + "epoch": 4.802861570620247, + "grad_norm": 0.2373109608888626, + "learning_rate": 2.069896600911725e-07, + "loss": 0.3733, "step": 133265 }, { - "epoch": 4.69, - "learning_rate": 5.115665034941486e-07, - "loss": 0.2669, + "epoch": 4.803041770281472, + "grad_norm": 0.2623651921749115, + "learning_rate": 2.066150944841666e-07, + "loss": 0.3785, "step": 133270 }, { - "epoch": 4.69, - "learning_rate": 5.109933216633972e-07, - "loss": 0.2368, + "epoch": 4.803221969942697, + "grad_norm": 0.26846858859062195, + "learning_rate": 2.0624086668223074e-07, + "loss": 0.366, "step": 133275 }, { 
- "epoch": 4.69, - "learning_rate": 5.104204578062571e-07, - "loss": 0.2481, + "epoch": 4.803402169603921, + "grad_norm": 0.2998198866844177, + "learning_rate": 2.0586697669046083e-07, + "loss": 0.399, "step": 133280 }, { - "epoch": 4.69, - "learning_rate": 5.098479119301697e-07, - "loss": 0.2422, + "epoch": 4.803582369265146, + "grad_norm": 0.29226166009902954, + "learning_rate": 2.054934245139528e-07, + "loss": 0.3444, "step": 133285 }, { - "epoch": 4.69, - "learning_rate": 5.092756840425678e-07, - "loss": 0.257, + "epoch": 4.803762568926371, + "grad_norm": 0.24952854216098785, + "learning_rate": 2.0512021015779704e-07, + "loss": 0.3419, "step": 133290 }, { - "epoch": 4.69, - "learning_rate": 5.087037741508816e-07, - "loss": 0.2429, + "epoch": 4.803942768587595, + "grad_norm": 0.28424960374832153, + "learning_rate": 2.0474733362708109e-07, + "loss": 0.3488, "step": 133295 }, { - "epoch": 4.69, - "learning_rate": 5.081321822625329e-07, - "loss": 0.2547, + "epoch": 4.804122968248819, + "grad_norm": 0.3396574854850769, + "learning_rate": 2.0437479492687872e-07, + "loss": 0.3507, "step": 133300 }, { - "epoch": 4.69, - "learning_rate": 5.075609083849491e-07, - "loss": 0.2409, + "epoch": 4.804303167910044, + "grad_norm": 0.23657868802547455, + "learning_rate": 2.040025940622692e-07, + "loss": 0.3767, "step": 133305 }, { - "epoch": 4.69, - "learning_rate": 5.069899525255467e-07, - "loss": 0.2559, + "epoch": 4.804483367571269, + "grad_norm": 0.24988289177417755, + "learning_rate": 2.0363073103832619e-07, + "loss": 0.3637, "step": 133310 }, { - "epoch": 4.69, - "learning_rate": 5.064193146917362e-07, - "loss": 0.2601, + "epoch": 4.8046635672324935, + "grad_norm": 0.26382821798324585, + "learning_rate": 2.032592058601124e-07, + "loss": 0.3908, "step": 133315 }, { - "epoch": 4.69, - "learning_rate": 5.058489948909312e-07, - "loss": 0.24, + "epoch": 4.804843766893718, + "grad_norm": 0.24887283146381378, + "learning_rate": 2.0288801853269036e-07, + "loss": 0.4073, "step": 133320 }, { - "epoch": 4.69, - "learning_rate": 5.052789931305341e-07, - "loss": 0.2504, + "epoch": 4.805023966554943, + "grad_norm": 0.27872660756111145, + "learning_rate": 2.0251716906111718e-07, + "loss": 0.3677, "step": 133325 }, { - "epoch": 4.69, - "learning_rate": 5.047093094179472e-07, - "loss": 0.2488, + "epoch": 4.805204166216168, + "grad_norm": 0.22657455503940582, + "learning_rate": 2.0214665745044714e-07, + "loss": 0.3901, "step": 133330 }, { - "epoch": 4.69, - "learning_rate": 5.041399437605649e-07, - "loss": 0.251, + "epoch": 4.8053843658773925, + "grad_norm": 0.22239211201667786, + "learning_rate": 2.0177648370573176e-07, + "loss": 0.3766, "step": 133335 }, { - "epoch": 4.69, - "learning_rate": 5.035708961657836e-07, - "loss": 0.2721, + "epoch": 4.805564565538617, + "grad_norm": 0.28701508045196533, + "learning_rate": 2.014066478320087e-07, + "loss": 0.4004, "step": 133340 }, { - "epoch": 4.69, - "learning_rate": 5.030021666409895e-07, - "loss": 0.2156, + "epoch": 4.805744765199841, + "grad_norm": 0.31527572870254517, + "learning_rate": 2.0103714983431553e-07, + "loss": 0.3964, "step": 133345 }, { - "epoch": 4.69, - "learning_rate": 5.02433755193571e-07, - "loss": 0.2312, + "epoch": 4.805924964861066, + "grad_norm": 0.27578893303871155, + "learning_rate": 2.0066798971769273e-07, + "loss": 0.3874, "step": 133350 }, { - "epoch": 4.69, - "learning_rate": 5.018656618309025e-07, - "loss": 0.266, + "epoch": 4.8061051645222905, + "grad_norm": 0.25750282406806946, + "learning_rate": 2.0029916748716683e-07, + "loss": 0.3622, 
"step": 133355 }, { - "epoch": 4.69, - "learning_rate": 5.012978865603646e-07, - "loss": 0.2396, + "epoch": 4.806285364183515, + "grad_norm": 0.2739262580871582, + "learning_rate": 1.999306831477643e-07, + "loss": 0.3426, "step": 133360 }, { - "epoch": 4.69, - "learning_rate": 5.007304293893262e-07, - "loss": 0.2459, + "epoch": 4.80646556384474, + "grad_norm": 0.25903549790382385, + "learning_rate": 1.995625367045062e-07, + "loss": 0.3852, "step": 133365 }, { - "epoch": 4.69, - "learning_rate": 5.001632903251619e-07, - "loss": 0.2543, + "epoch": 4.806645763505965, + "grad_norm": 0.19403640925884247, + "learning_rate": 1.9919472816240237e-07, + "loss": 0.3446, "step": 133370 }, { - "epoch": 4.69, - "learning_rate": 4.99596469375227e-07, - "loss": 0.2621, + "epoch": 4.8068259631671895, + "grad_norm": 0.22920994460582733, + "learning_rate": 1.9882725752647102e-07, + "loss": 0.3628, "step": 133375 }, { - "epoch": 4.69, - "learning_rate": 4.990299665468879e-07, - "loss": 0.2688, + "epoch": 4.807006162828414, + "grad_norm": 0.24877864122390747, + "learning_rate": 1.984601248017165e-07, + "loss": 0.3912, "step": 133380 }, { - "epoch": 4.69, - "learning_rate": 4.984637818474996e-07, - "loss": 0.2269, + "epoch": 4.807186362489638, + "grad_norm": 0.22861328721046448, + "learning_rate": 1.9809332999314311e-07, + "loss": 0.3505, "step": 133385 }, { - "epoch": 4.69, - "learning_rate": 4.978979152844093e-07, - "loss": 0.2381, + "epoch": 4.807366562150863, + "grad_norm": 0.2941596806049347, + "learning_rate": 1.9772687310574412e-07, + "loss": 0.3911, "step": 133390 }, { - "epoch": 4.69, - "learning_rate": 4.973323668649665e-07, - "loss": 0.2326, + "epoch": 4.8075467618120875, + "grad_norm": 0.24592210352420807, + "learning_rate": 1.973607541445155e-07, + "loss": 0.4009, "step": 133395 }, { - "epoch": 4.69, - "learning_rate": 4.967671365965181e-07, - "loss": 0.276, + "epoch": 4.807726961473312, + "grad_norm": 0.26915445923805237, + "learning_rate": 1.9699497311444493e-07, + "loss": 0.381, "step": 133400 }, { - "epoch": 4.69, - "learning_rate": 4.962022244863973e-07, - "loss": 0.2454, + "epoch": 4.807907161134537, + "grad_norm": 0.2586546838283539, + "learning_rate": 1.9662953002051455e-07, + "loss": 0.3685, "step": 133405 }, { - "epoch": 4.69, - "learning_rate": 4.956376305419425e-07, - "loss": 0.2617, + "epoch": 4.808087360795762, + "grad_norm": 0.21903952956199646, + "learning_rate": 1.9626442486770647e-07, + "loss": 0.4036, "step": 133410 }, { - "epoch": 4.69, - "learning_rate": 4.950733547704866e-07, - "loss": 0.2602, + "epoch": 4.8082675604569864, + "grad_norm": 0.2535596191883087, + "learning_rate": 1.9589965766099172e-07, + "loss": 0.3819, "step": 133415 }, { - "epoch": 4.69, - "learning_rate": 4.94509397179349e-07, - "loss": 0.2497, + "epoch": 4.808447760118211, + "grad_norm": 0.28337281942367554, + "learning_rate": 1.9553522840534133e-07, + "loss": 0.3731, "step": 133420 }, { - "epoch": 4.69, - "learning_rate": 4.939457577758627e-07, - "loss": 0.2693, + "epoch": 4.808627959779436, + "grad_norm": 0.2324288785457611, + "learning_rate": 1.9517113710572354e-07, + "loss": 0.3678, "step": 133425 }, { - "epoch": 4.69, - "learning_rate": 4.933824365673356e-07, - "loss": 0.2398, + "epoch": 4.808808159440661, + "grad_norm": 0.31472426652908325, + "learning_rate": 1.948073837670955e-07, + "loss": 0.3801, "step": 133430 }, { - "epoch": 4.69, - "learning_rate": 4.928194335610897e-07, - "loss": 0.2697, + "epoch": 4.808988359101885, + "grad_norm": 0.2125542312860489, + "learning_rate": 1.9444396839441436e-07, + 
"loss": 0.3243, "step": 133435 }, { - "epoch": 4.69, - "learning_rate": 4.922567487644331e-07, - "loss": 0.2417, + "epoch": 4.809168558763109, + "grad_norm": 0.27946358919143677, + "learning_rate": 1.9408089099263172e-07, + "loss": 0.3759, "step": 133440 }, { - "epoch": 4.69, - "learning_rate": 4.916943821846737e-07, - "loss": 0.2278, + "epoch": 4.809348758424334, + "grad_norm": 0.2657468914985657, + "learning_rate": 1.9371815156669358e-07, + "loss": 0.385, "step": 133445 }, { - "epoch": 4.7, - "learning_rate": 4.911323338291057e-07, - "loss": 0.2754, + "epoch": 4.809528958085559, + "grad_norm": 0.31190598011016846, + "learning_rate": 1.9335575012154327e-07, + "loss": 0.4146, "step": 133450 }, { - "epoch": 4.7, - "learning_rate": 4.905706037050345e-07, - "loss": 0.2477, + "epoch": 4.809709157746783, + "grad_norm": 0.2902090549468994, + "learning_rate": 1.9299368666211847e-07, + "loss": 0.3892, "step": 133455 }, { - "epoch": 4.7, - "learning_rate": 4.900091918197541e-07, - "loss": 0.2517, + "epoch": 4.809889357408008, + "grad_norm": 0.27345964312553406, + "learning_rate": 1.9263196119335413e-07, + "loss": 0.3975, "step": 133460 }, { - "epoch": 4.7, - "learning_rate": 4.894480981805505e-07, - "loss": 0.2734, + "epoch": 4.810069557069233, + "grad_norm": 0.2101958841085434, + "learning_rate": 1.9227057372017132e-07, + "loss": 0.3399, "step": 133465 }, { - "epoch": 4.7, - "learning_rate": 4.888873227947066e-07, - "loss": 0.2455, + "epoch": 4.810249756730458, + "grad_norm": 0.24289266765117645, + "learning_rate": 1.9190952424750496e-07, + "loss": 0.3661, "step": 133470 }, { - "epoch": 4.7, - "learning_rate": 4.883268656695111e-07, - "loss": 0.2346, + "epoch": 4.810429956391682, + "grad_norm": 0.27102965116500854, + "learning_rate": 1.9154881278026504e-07, + "loss": 0.3835, "step": 133475 }, { - "epoch": 4.7, - "learning_rate": 4.87766726812236e-07, - "loss": 0.2484, + "epoch": 4.810610156052906, + "grad_norm": 0.2354983687400818, + "learning_rate": 1.9118843932336982e-07, + "loss": 0.3508, "step": 133480 }, { - "epoch": 4.7, - "learning_rate": 4.872069062301588e-07, - "loss": 0.2498, + "epoch": 4.810790355714131, + "grad_norm": 0.2682526707649231, + "learning_rate": 1.908284038817293e-07, + "loss": 0.3447, "step": 133485 }, { - "epoch": 4.7, - "learning_rate": 4.866474039305402e-07, - "loss": 0.2736, + "epoch": 4.810970555375356, + "grad_norm": 0.2339244782924652, + "learning_rate": 1.9046870646024785e-07, + "loss": 0.373, "step": 133490 }, { - "epoch": 4.7, - "learning_rate": 4.860882199206523e-07, - "loss": 0.252, + "epoch": 4.81115075503658, + "grad_norm": 0.2050226777791977, + "learning_rate": 1.901093470638271e-07, + "loss": 0.368, "step": 133495 }, { - "epoch": 4.7, - "learning_rate": 4.85529354207756e-07, - "loss": 0.2395, + "epoch": 4.811330954697805, + "grad_norm": 0.21445462107658386, + "learning_rate": 1.8975032569736318e-07, + "loss": 0.3391, "step": 133500 }, { - "epoch": 4.7, - "eval_loss": 0.24847909808158875, - "eval_runtime": 10.5573, - "eval_samples_per_second": 9.472, - "eval_steps_per_second": 9.472, + "epoch": 4.811330954697805, + "eval_loss": 0.4288221001625061, + "eval_runtime": 3.5381, + "eval_samples_per_second": 28.264, + "eval_steps_per_second": 7.066, "step": 133500 }, { - "epoch": 4.7, - "learning_rate": 4.849708067991038e-07, - "loss": 0.2376, + "epoch": 4.81151115435903, + "grad_norm": 0.23659485578536987, + "learning_rate": 1.8939164236574658e-07, + "loss": 0.3578, "step": 133505 }, { - "epoch": 4.7, - "learning_rate": 4.844125777019481e-07, - "loss": 0.2373, + 
"epoch": 4.811691354020255, + "grad_norm": 0.2790174186229706, + "learning_rate": 1.8903329707386785e-07, + "loss": 0.3562, "step": 133510 }, { - "epoch": 4.7, - "learning_rate": 4.838546669235416e-07, - "loss": 0.2411, + "epoch": 4.811871553681479, + "grad_norm": 0.2516117990016937, + "learning_rate": 1.8867528982660643e-07, + "loss": 0.3731, "step": 133515 }, { - "epoch": 4.7, - "learning_rate": 4.832970744711257e-07, - "loss": 0.2565, + "epoch": 4.812051753342704, + "grad_norm": 0.2900271713733673, + "learning_rate": 1.8831762062883896e-07, + "loss": 0.3736, "step": 133520 }, { - "epoch": 4.7, - "learning_rate": 4.827398003519363e-07, - "loss": 0.2392, + "epoch": 4.812231953003929, + "grad_norm": 0.22807054221630096, + "learning_rate": 1.8796028948544209e-07, + "loss": 0.3544, "step": 133525 }, { - "epoch": 4.7, - "learning_rate": 4.821828445732147e-07, - "loss": 0.2737, + "epoch": 4.812412152665153, + "grad_norm": 0.26726317405700684, + "learning_rate": 1.8760329640128139e-07, + "loss": 0.3583, "step": 133530 }, { - "epoch": 4.7, - "learning_rate": 4.816262071421912e-07, - "loss": 0.2578, + "epoch": 4.812592352326377, + "grad_norm": 0.23728644847869873, + "learning_rate": 1.8724664138122238e-07, + "loss": 0.3509, "step": 133535 }, { - "epoch": 4.7, - "learning_rate": 4.810698880660935e-07, - "loss": 0.2449, + "epoch": 4.812772551987602, + "grad_norm": 0.27613380551338196, + "learning_rate": 1.8689032443012234e-07, + "loss": 0.3841, "step": 133540 }, { - "epoch": 4.7, - "learning_rate": 4.805138873521436e-07, - "loss": 0.2407, + "epoch": 4.812952751648827, + "grad_norm": 0.23071031272411346, + "learning_rate": 1.8653434555284123e-07, + "loss": 0.3659, "step": 133545 }, { - "epoch": 4.7, - "learning_rate": 4.799582050075635e-07, - "loss": 0.2536, + "epoch": 4.813132951310052, + "grad_norm": 0.2540687024593353, + "learning_rate": 1.8617870475422238e-07, + "loss": 0.374, "step": 133550 }, { - "epoch": 4.7, - "learning_rate": 4.794028410395695e-07, - "loss": 0.2539, + "epoch": 4.813313150971276, + "grad_norm": 0.2460535317659378, + "learning_rate": 1.8582340203911475e-07, + "loss": 0.3813, "step": 133555 }, { - "epoch": 4.7, - "learning_rate": 4.788477954553672e-07, - "loss": 0.2721, + "epoch": 4.813493350632501, + "grad_norm": 0.18833312392234802, + "learning_rate": 1.8546843741236163e-07, + "loss": 0.3373, "step": 133560 }, { - "epoch": 4.7, - "learning_rate": 4.782930682621644e-07, - "loss": 0.2498, + "epoch": 4.813673550293726, + "grad_norm": 0.2607569694519043, + "learning_rate": 1.851138108787953e-07, + "loss": 0.3953, "step": 133565 }, { - "epoch": 4.7, - "learning_rate": 4.777386594671668e-07, - "loss": 0.2588, + "epoch": 4.81385374995495, + "grad_norm": 0.2937115430831909, + "learning_rate": 1.84759522443248e-07, + "loss": 0.3231, "step": 133570 }, { - "epoch": 4.7, - "learning_rate": 4.771845690775739e-07, - "loss": 0.2659, + "epoch": 4.814033949616174, + "grad_norm": 0.27710872888565063, + "learning_rate": 1.844055721105492e-07, + "loss": 0.389, "step": 133575 }, { - "epoch": 4.7, - "learning_rate": 4.7663079710057455e-07, - "loss": 0.2421, + "epoch": 4.814214149277399, + "grad_norm": 0.19749537110328674, + "learning_rate": 1.8405195988552003e-07, + "loss": 0.3749, "step": 133580 }, { - "epoch": 4.7, - "learning_rate": 4.7607734354336296e-07, - "loss": 0.2476, + "epoch": 4.814394348938624, + "grad_norm": 0.20698542892932892, + "learning_rate": 1.8369868577297612e-07, + "loss": 0.3486, "step": 133585 }, { - "epoch": 4.7, - "learning_rate": 4.7552420841312786e-07, - "loss": 0.2486, 
+ "epoch": 4.814574548599849, + "grad_norm": 0.22517277300357819, + "learning_rate": 1.8334574977773577e-07, + "loss": 0.3636, "step": 133590 }, { - "epoch": 4.7, - "learning_rate": 4.7497139171704954e-07, - "loss": 0.2392, + "epoch": 4.814754748261073, + "grad_norm": 0.2940255105495453, + "learning_rate": 1.8299315190460352e-07, + "loss": 0.388, "step": 133595 }, { - "epoch": 4.7, - "learning_rate": 4.7441889346230284e-07, - "loss": 0.2418, + "epoch": 4.814934947922298, + "grad_norm": 0.21540936827659607, + "learning_rate": 1.8264089215838386e-07, + "loss": 0.414, "step": 133600 }, { - "epoch": 4.7, - "learning_rate": 4.738667136560654e-07, - "loss": 0.2407, + "epoch": 4.815115147583523, + "grad_norm": 0.2051449418067932, + "learning_rate": 1.8228897054388128e-07, + "loss": 0.3345, "step": 133605 }, { - "epoch": 4.7, - "learning_rate": 4.7331485230550363e-07, - "loss": 0.2377, + "epoch": 4.8152953472447475, + "grad_norm": 0.2661552429199219, + "learning_rate": 1.8193738706588082e-07, + "loss": 0.3696, "step": 133610 }, { - "epoch": 4.7, - "learning_rate": 4.727633094177869e-07, - "loss": 0.2626, + "epoch": 4.815475546905972, + "grad_norm": 0.26067331433296204, + "learning_rate": 1.815861417291842e-07, + "loss": 0.3499, "step": 133615 }, { - "epoch": 4.7, - "learning_rate": 4.722120850000733e-07, - "loss": 0.2364, + "epoch": 4.815655746567196, + "grad_norm": 0.23874883353710175, + "learning_rate": 1.8123523453856816e-07, + "loss": 0.3581, "step": 133620 }, { - "epoch": 4.7, - "learning_rate": 4.71661179059521e-07, - "loss": 0.2398, + "epoch": 4.815835946228421, + "grad_norm": 0.264144629240036, + "learning_rate": 1.8088466549881778e-07, + "loss": 0.3731, "step": 133625 }, { - "epoch": 4.7, - "learning_rate": 4.711105916032854e-07, - "loss": 0.2548, + "epoch": 4.8160161458896455, + "grad_norm": 0.2632257342338562, + "learning_rate": 1.8053443461470698e-07, + "loss": 0.4175, "step": 133630 }, { - "epoch": 4.7, - "learning_rate": 4.70560322638508e-07, - "loss": 0.2547, + "epoch": 4.81619634555087, + "grad_norm": 0.2465040385723114, + "learning_rate": 1.8018454189101254e-07, + "loss": 0.3609, "step": 133635 }, { - "epoch": 4.7, - "learning_rate": 4.7001037217234424e-07, - "loss": 0.2344, + "epoch": 4.816376545212095, + "grad_norm": 0.2356046885251999, + "learning_rate": 1.7983498733249725e-07, + "loss": 0.3647, "step": 133640 }, { - "epoch": 4.7, - "learning_rate": 4.6946074021193e-07, - "loss": 0.2463, + "epoch": 4.81655674487332, + "grad_norm": 0.2914571166038513, + "learning_rate": 1.7948577094392405e-07, + "loss": 0.3861, "step": 133645 }, { - "epoch": 4.7, - "learning_rate": 4.6891142676440125e-07, - "loss": 0.2588, + "epoch": 4.8167369445345445, + "grad_norm": 0.2516320049762726, + "learning_rate": 1.7913689273005018e-07, + "loss": 0.389, "step": 133650 }, { - "epoch": 4.7, - "learning_rate": 4.6836243183689123e-07, - "loss": 0.2377, + "epoch": 4.816917144195769, + "grad_norm": 0.2756977379322052, + "learning_rate": 1.78788352695633e-07, + "loss": 0.397, "step": 133655 }, { - "epoch": 4.7, - "learning_rate": 4.6781375543652475e-07, - "loss": 0.2665, + "epoch": 4.817097343856993, + "grad_norm": 0.31001341342926025, + "learning_rate": 1.7844015084542143e-07, + "loss": 0.3965, "step": 133660 }, { - "epoch": 4.7, - "learning_rate": 4.672653975704322e-07, - "loss": 0.2636, + "epoch": 4.817277543518218, + "grad_norm": 0.31293249130249023, + "learning_rate": 1.780922871841534e-07, + "loss": 0.3876, "step": 133665 }, { - "epoch": 4.7, - "learning_rate": 4.667173582457329e-07, - "loss": 0.2751, + 
"epoch": 4.8174577431794425, + "grad_norm": 0.23081745207309723, + "learning_rate": 1.7774476171657229e-07, + "loss": 0.3338, "step": 133670 }, { - "epoch": 4.7, - "learning_rate": 4.661696374695379e-07, - "loss": 0.2446, + "epoch": 4.817637942840667, + "grad_norm": 0.229642853140831, + "learning_rate": 1.7739757444741323e-07, + "loss": 0.4005, "step": 133675 }, { - "epoch": 4.7, - "learning_rate": 4.656222352489609e-07, - "loss": 0.2516, + "epoch": 4.817818142501892, + "grad_norm": 0.29115715622901917, + "learning_rate": 1.7705072538140856e-07, + "loss": 0.3731, "step": 133680 }, { - "epoch": 4.7, - "learning_rate": 4.650751515911128e-07, - "loss": 0.2605, + "epoch": 4.817998342163117, + "grad_norm": 0.27003014087677, + "learning_rate": 1.7670421452328224e-07, + "loss": 0.3786, "step": 133685 }, { - "epoch": 4.7, - "learning_rate": 4.645283865030936e-07, - "loss": 0.2383, + "epoch": 4.8181785418243415, + "grad_norm": 0.2552355229854584, + "learning_rate": 1.7635804187775273e-07, + "loss": 0.3423, "step": 133690 }, { - "epoch": 4.7, - "learning_rate": 4.639819399920031e-07, - "loss": 0.2682, + "epoch": 4.818358741485566, + "grad_norm": 0.29862159490585327, + "learning_rate": 1.760122074495385e-07, + "loss": 0.373, "step": 133695 }, { - "epoch": 4.7, - "learning_rate": 4.6343581206493844e-07, - "loss": 0.251, + "epoch": 4.818538941146791, + "grad_norm": 0.2521141469478607, + "learning_rate": 1.7566671124335242e-07, + "loss": 0.3318, "step": 133700 }, { - "epoch": 4.7, - "learning_rate": 4.628900027289884e-07, - "loss": 0.2662, + "epoch": 4.818719140808016, + "grad_norm": 0.23437343537807465, + "learning_rate": 1.7532155326390464e-07, + "loss": 0.373, "step": 133705 }, { - "epoch": 4.7, - "learning_rate": 4.623445119912445e-07, - "loss": 0.2338, + "epoch": 4.81889934046924, + "grad_norm": 0.24322153627872467, + "learning_rate": 1.749767335158914e-07, + "loss": 0.3533, "step": 133710 }, { - "epoch": 4.7, - "learning_rate": 4.617993398587844e-07, - "loss": 0.2535, + "epoch": 4.819079540130464, + "grad_norm": 0.29220882058143616, + "learning_rate": 1.7463225200401167e-07, + "loss": 0.3859, "step": 133715 }, { - "epoch": 4.7, - "learning_rate": 4.6125448633868586e-07, - "loss": 0.274, + "epoch": 4.819259739791689, + "grad_norm": 0.19493746757507324, + "learning_rate": 1.7428810873296453e-07, + "loss": 0.3794, "step": 133720 }, { - "epoch": 4.7, - "learning_rate": 4.607099514380292e-07, - "loss": 0.253, + "epoch": 4.819439939452914, + "grad_norm": 0.23761507868766785, + "learning_rate": 1.739443037074351e-07, + "loss": 0.3677, "step": 133725 }, { - "epoch": 4.7, - "learning_rate": 4.601657351638811e-07, - "loss": 0.2562, + "epoch": 4.8196201391141384, + "grad_norm": 0.31488823890686035, + "learning_rate": 1.736008369321085e-07, + "loss": 0.3997, "step": 133730 }, { - "epoch": 4.71, - "learning_rate": 4.59621837523308e-07, - "loss": 0.2455, + "epoch": 4.819800338775363, + "grad_norm": 0.23459258675575256, + "learning_rate": 1.7325770841166156e-07, + "loss": 0.3629, "step": 133735 }, { - "epoch": 4.71, - "learning_rate": 4.5907825852337106e-07, - "loss": 0.2697, + "epoch": 4.819980538436588, + "grad_norm": 0.29168611764907837, + "learning_rate": 1.7291491815077388e-07, + "loss": 0.3874, "step": 133740 }, { - "epoch": 4.71, - "learning_rate": 4.585349981711312e-07, - "loss": 0.2471, + "epoch": 4.820160738097813, + "grad_norm": 0.28949815034866333, + "learning_rate": 1.7257246615411393e-07, + "loss": 0.3727, "step": 133745 }, { - "epoch": 4.71, - "learning_rate": 4.5799205647364116e-07, - "loss": 
0.2394, + "epoch": 4.820340937759037, + "grad_norm": 0.23711396753787994, + "learning_rate": 1.7223035242634467e-07, + "loss": 0.3764, "step": 133750 }, { - "epoch": 4.71, - "learning_rate": 4.5744943343795364e-07, - "loss": 0.2628, + "epoch": 4.820521137420261, + "grad_norm": 0.26176103949546814, + "learning_rate": 1.7188857697213178e-07, + "loss": 0.3257, "step": 133755 }, { - "epoch": 4.71, - "learning_rate": 4.5690712907110465e-07, - "loss": 0.2439, + "epoch": 4.820701337081486, + "grad_norm": 0.3358304798603058, + "learning_rate": 1.715471397961327e-07, + "loss": 0.3996, "step": 133760 }, { - "epoch": 4.71, - "learning_rate": 4.563651433801469e-07, - "loss": 0.2465, + "epoch": 4.820881536742711, + "grad_norm": 0.32944080233573914, + "learning_rate": 1.7120604090299363e-07, + "loss": 0.3569, "step": 133765 }, { - "epoch": 4.71, - "learning_rate": 4.5582347637211367e-07, - "loss": 0.2499, + "epoch": 4.821061736403935, + "grad_norm": 0.25890570878982544, + "learning_rate": 1.708652802973637e-07, + "loss": 0.3859, "step": 133770 }, { - "epoch": 4.71, - "learning_rate": 4.55390372222117e-07, - "loss": 0.2433, + "epoch": 4.82124193606516, + "grad_norm": 0.23768959939479828, + "learning_rate": 1.7052485798389196e-07, + "loss": 0.3734, "step": 133775 }, { - "epoch": 4.71, - "learning_rate": 4.548492788610637e-07, - "loss": 0.2583, + "epoch": 4.821422135726385, + "grad_norm": 0.2727597951889038, + "learning_rate": 1.70184773967208e-07, + "loss": 0.3652, "step": 133780 }, { - "epoch": 4.71, - "learning_rate": 4.5430850420261904e-07, - "loss": 0.2323, + "epoch": 4.82160233538761, + "grad_norm": 0.31934478878974915, + "learning_rate": 1.698450282519526e-07, + "loss": 0.3821, "step": 133785 }, { - "epoch": 4.71, - "learning_rate": 4.537680482537998e-07, - "loss": 0.2464, + "epoch": 4.821782535048834, + "grad_norm": 0.2313583940267563, + "learning_rate": 1.695056208427498e-07, + "loss": 0.3477, "step": 133790 }, { - "epoch": 4.71, - "learning_rate": 4.5322791102162807e-07, - "loss": 0.2542, + "epoch": 4.821962734710059, + "grad_norm": 0.28459036350250244, + "learning_rate": 1.691665517442237e-07, + "loss": 0.3909, "step": 133795 }, { - "epoch": 4.71, - "learning_rate": 4.5268809251311496e-07, - "loss": 0.259, + "epoch": 4.822142934371284, + "grad_norm": 0.2991600036621094, + "learning_rate": 1.6882782096099836e-07, + "loss": 0.3633, "step": 133800 }, { - "epoch": 4.71, - "learning_rate": 4.5214859273527146e-07, - "loss": 0.2701, + "epoch": 4.822323134032508, + "grad_norm": 0.2798830270767212, + "learning_rate": 1.684894284976868e-07, + "loss": 0.3536, "step": 133805 }, { - "epoch": 4.71, - "learning_rate": 4.5160941169509476e-07, - "loss": 0.2358, + "epoch": 4.822503333693732, + "grad_norm": 0.2202059030532837, + "learning_rate": 1.681513743588964e-07, + "loss": 0.3627, "step": 133810 }, { - "epoch": 4.71, - "learning_rate": 4.510705493995987e-07, - "loss": 0.2536, + "epoch": 4.822683533354957, + "grad_norm": 0.2335967868566513, + "learning_rate": 1.6781365854924014e-07, + "loss": 0.3384, "step": 133815 }, { - "epoch": 4.71, - "learning_rate": 4.5053200585576937e-07, - "loss": 0.2644, + "epoch": 4.822863733016182, + "grad_norm": 0.21627481281757355, + "learning_rate": 1.674762810733116e-07, + "loss": 0.3793, "step": 133820 }, { - "epoch": 4.71, - "learning_rate": 4.49993781070604e-07, - "loss": 0.2591, + "epoch": 4.823043932677407, + "grad_norm": 0.2823790907859802, + "learning_rate": 1.671392419357126e-07, + "loss": 0.3695, "step": 133825 }, { - "epoch": 4.71, - "learning_rate": 
4.494558750510913e-07, - "loss": 0.2571, + "epoch": 4.823224132338631, + "grad_norm": 0.2549593448638916, + "learning_rate": 1.66802541141034e-07, + "loss": 0.342, "step": 133830 }, { - "epoch": 4.71, - "learning_rate": 4.4891828780421194e-07, - "loss": 0.2387, + "epoch": 4.823404331999856, + "grad_norm": 0.21052326261997223, + "learning_rate": 1.6646617869386095e-07, + "loss": 0.3486, "step": 133835 }, { - "epoch": 4.71, - "learning_rate": 4.483810193369492e-07, - "loss": 0.2582, + "epoch": 4.823584531661081, + "grad_norm": 0.24714645743370056, + "learning_rate": 1.6613015459877868e-07, + "loss": 0.377, "step": 133840 }, { - "epoch": 4.71, - "learning_rate": 4.4784406965628077e-07, - "loss": 0.2423, + "epoch": 4.823764731322305, + "grad_norm": 0.24139024317264557, + "learning_rate": 1.6579446886036687e-07, + "loss": 0.3834, "step": 133845 }, { - "epoch": 4.71, - "learning_rate": 4.4730743876917614e-07, - "loss": 0.2634, + "epoch": 4.823944930983529, + "grad_norm": 0.29688748717308044, + "learning_rate": 1.6545912148319687e-07, + "loss": 0.4207, "step": 133850 }, { - "epoch": 4.71, - "learning_rate": 4.4677112668260467e-07, - "loss": 0.2468, + "epoch": 4.824125130644754, + "grad_norm": 0.26805707812309265, + "learning_rate": 1.6512411247183724e-07, + "loss": 0.3626, "step": 133855 }, { - "epoch": 4.71, - "learning_rate": 4.4623513340352753e-07, - "loss": 0.2499, + "epoch": 4.824305330305979, + "grad_norm": 0.2496339976787567, + "learning_rate": 1.6478944183085376e-07, + "loss": 0.3988, "step": 133860 }, { - "epoch": 4.71, - "learning_rate": 4.4569945893890576e-07, - "loss": 0.265, + "epoch": 4.824485529967204, + "grad_norm": 0.20922985672950745, + "learning_rate": 1.644551095648067e-07, + "loss": 0.3854, "step": 133865 }, { - "epoch": 4.71, - "learning_rate": 4.451641032956949e-07, - "loss": 0.2362, + "epoch": 4.824665729628428, + "grad_norm": 0.2915142774581909, + "learning_rate": 1.6412111567825074e-07, + "loss": 0.3817, "step": 133870 }, { - "epoch": 4.71, - "learning_rate": 4.44629066480845e-07, - "loss": 0.2777, + "epoch": 4.824845929289653, + "grad_norm": 0.25543341040611267, + "learning_rate": 1.6378746017573222e-07, + "loss": 0.3862, "step": 133875 }, { - "epoch": 4.71, - "learning_rate": 4.4409434850130607e-07, - "loss": 0.2626, + "epoch": 4.825026128950878, + "grad_norm": 0.2814326882362366, + "learning_rate": 1.6345414306180584e-07, + "loss": 0.3492, "step": 133880 }, { - "epoch": 4.71, - "learning_rate": 4.435599493640169e-07, - "loss": 0.2508, + "epoch": 4.8252063286121025, + "grad_norm": 0.2885473370552063, + "learning_rate": 1.6312116434100412e-07, + "loss": 0.3942, "step": 133885 }, { - "epoch": 4.71, - "learning_rate": 4.4302586907591926e-07, - "loss": 0.2379, + "epoch": 4.825386528273327, + "grad_norm": 0.239909365773201, + "learning_rate": 1.6278852401787336e-07, + "loss": 0.4162, "step": 133890 }, { - "epoch": 4.71, - "learning_rate": 4.4249210764394643e-07, - "loss": 0.2597, + "epoch": 4.825566727934551, + "grad_norm": 0.22957023978233337, + "learning_rate": 1.6245622209693778e-07, + "loss": 0.3542, "step": 133895 }, { - "epoch": 4.71, - "learning_rate": 4.419586650750318e-07, - "loss": 0.2523, + "epoch": 4.825746927595776, + "grad_norm": 0.22540193796157837, + "learning_rate": 1.621242585827243e-07, + "loss": 0.3753, "step": 133900 }, { - "epoch": 4.71, - "learning_rate": 4.414255413760976e-07, - "loss": 0.2325, + "epoch": 4.825927127257001, + "grad_norm": 0.22572508454322815, + "learning_rate": 1.617926334797626e-07, + "loss": 0.3706, "step": 133905 }, { - "epoch": 
4.71, - "learning_rate": 4.408927365540688e-07, - "loss": 0.2461, + "epoch": 4.826107326918225, + "grad_norm": 0.2843494117259979, + "learning_rate": 1.6146134679256574e-07, + "loss": 0.399, "step": 133910 }, { - "epoch": 4.71, - "learning_rate": 4.4036025061586216e-07, - "loss": 0.254, + "epoch": 4.82628752657945, + "grad_norm": 0.2644733190536499, + "learning_rate": 1.6113039852565237e-07, + "loss": 0.3451, "step": 133915 }, { - "epoch": 4.71, - "learning_rate": 4.3982808356839434e-07, - "loss": 0.2598, + "epoch": 4.826467726240675, + "grad_norm": 0.25811052322387695, + "learning_rate": 1.6079978868352442e-07, + "loss": 0.388, "step": 133920 }, { - "epoch": 4.71, - "learning_rate": 4.3929623541856813e-07, - "loss": 0.244, + "epoch": 4.8266479259018995, + "grad_norm": 0.2605394721031189, + "learning_rate": 1.6046951727069214e-07, + "loss": 0.3679, "step": 133925 }, { - "epoch": 4.71, - "learning_rate": 4.3876470617330026e-07, - "loss": 0.2638, + "epoch": 4.826828125563124, + "grad_norm": 0.29219627380371094, + "learning_rate": 1.60139584291652e-07, + "loss": 0.3613, "step": 133930 }, { - "epoch": 4.71, - "learning_rate": 4.382334958394824e-07, - "loss": 0.25, + "epoch": 4.827008325224348, + "grad_norm": 0.24204973876476288, + "learning_rate": 1.598099897509031e-07, + "loss": 0.3677, "step": 133935 }, { - "epoch": 4.71, - "learning_rate": 4.377026044240173e-07, - "loss": 0.2512, + "epoch": 4.827188524885573, + "grad_norm": 0.2921901047229767, + "learning_rate": 1.5948073365293358e-07, + "loss": 0.3456, "step": 133940 }, { - "epoch": 4.71, - "learning_rate": 4.3717203193379686e-07, - "loss": 0.2599, + "epoch": 4.8273687245467976, + "grad_norm": 0.2787991166114807, + "learning_rate": 1.5915181600222872e-07, + "loss": 0.3672, "step": 133945 }, { - "epoch": 4.71, - "learning_rate": 4.366417783757071e-07, - "loss": 0.2406, + "epoch": 4.827548924208022, + "grad_norm": 0.2651299238204956, + "learning_rate": 1.5882323680327104e-07, + "loss": 0.3701, "step": 133950 }, { - "epoch": 4.71, - "learning_rate": 4.3611184375663973e-07, - "loss": 0.2614, + "epoch": 4.827729123869247, + "grad_norm": 0.27234312891960144, + "learning_rate": 1.5849499606053753e-07, + "loss": 0.3357, "step": 133955 }, { - "epoch": 4.71, - "learning_rate": 4.3558222808346707e-07, - "loss": 0.2675, + "epoch": 4.827909323530472, + "grad_norm": 0.2102580964565277, + "learning_rate": 1.5816709377849957e-07, + "loss": 0.3899, "step": 133960 }, { - "epoch": 4.71, - "learning_rate": 4.3505293136307523e-07, - "loss": 0.2617, + "epoch": 4.8280895231916965, + "grad_norm": 0.29363569617271423, + "learning_rate": 1.5783952996162864e-07, + "loss": 0.3687, "step": 133965 }, { - "epoch": 4.71, - "learning_rate": 4.34523953602331e-07, - "loss": 0.2576, + "epoch": 4.828269722852921, + "grad_norm": 0.22374111413955688, + "learning_rate": 1.5751230461438228e-07, + "loss": 0.3412, "step": 133970 }, { - "epoch": 4.71, - "learning_rate": 4.339952948081066e-07, - "loss": 0.2348, + "epoch": 4.828449922514146, + "grad_norm": 0.2635408341884613, + "learning_rate": 1.5718541774122076e-07, + "loss": 0.4143, "step": 133975 }, { - "epoch": 4.71, - "learning_rate": 4.3346695498725776e-07, - "loss": 0.2357, + "epoch": 4.828630122175371, + "grad_norm": 0.25199902057647705, + "learning_rate": 1.568588693465961e-07, + "loss": 0.375, "step": 133980 }, { - "epoch": 4.71, - "learning_rate": 4.3293893414665667e-07, - "loss": 0.2417, + "epoch": 4.828810321836595, + "grad_norm": 0.21928834915161133, + "learning_rate": 1.5653265943496587e-07, + "loss": 0.3635, "step": 
133985 }, { - "epoch": 4.71, - "learning_rate": 4.324112322931506e-07, - "loss": 0.2454, + "epoch": 4.828990521497819, + "grad_norm": 0.25685426592826843, + "learning_rate": 1.562067880107626e-07, + "loss": 0.3996, "step": 133990 }, { - "epoch": 4.71, - "learning_rate": 4.318838494335925e-07, - "loss": 0.2458, + "epoch": 4.829170721159044, + "grad_norm": 0.24886153638362885, + "learning_rate": 1.5588125507843275e-07, + "loss": 0.3769, "step": 133995 }, { - "epoch": 4.71, - "learning_rate": 4.313567855748324e-07, - "loss": 0.2546, + "epoch": 4.829350920820269, + "grad_norm": 0.2590484321117401, + "learning_rate": 1.5555606064241168e-07, + "loss": 0.3867, "step": 134000 }, { - "epoch": 4.71, - "eval_loss": 0.24852041900157928, - "eval_runtime": 10.5387, - "eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 4.829350920820269, + "eval_loss": 0.42881202697753906, + "eval_runtime": 3.5296, + "eval_samples_per_second": 28.332, + "eval_steps_per_second": 7.083, "step": 134000 }, { - "epoch": 4.71, - "learning_rate": 4.3083004072371203e-07, - "loss": 0.2659, + "epoch": 4.8295311204814935, + "grad_norm": 0.21452315151691437, + "learning_rate": 1.5523120470712915e-07, + "loss": 0.3729, "step": 134005 }, { - "epoch": 4.71, - "learning_rate": 4.303036148870759e-07, - "loss": 0.2601, + "epoch": 4.829711320142718, + "grad_norm": 0.2168400138616562, + "learning_rate": 1.5490668727701217e-07, + "loss": 0.3683, "step": 134010 }, { - "epoch": 4.72, - "learning_rate": 4.29777508071752e-07, - "loss": 0.2358, + "epoch": 4.829891519803943, + "grad_norm": 0.3310934603214264, + "learning_rate": 1.5458250835648225e-07, + "loss": 0.3743, "step": 134015 }, { - "epoch": 4.72, - "learning_rate": 4.2925172028457364e-07, - "loss": 0.2433, + "epoch": 4.830071719465168, + "grad_norm": 0.2809125781059265, + "learning_rate": 1.5425866794995247e-07, + "loss": 0.377, "step": 134020 }, { - "epoch": 4.72, - "learning_rate": 4.2872625153237144e-07, - "loss": 0.2395, + "epoch": 4.830251919126392, + "grad_norm": 0.2371680587530136, + "learning_rate": 1.5393516606183878e-07, + "loss": 0.3484, "step": 134025 }, { - "epoch": 4.72, - "learning_rate": 4.282011018219678e-07, - "loss": 0.2385, + "epoch": 4.830432118787616, + "grad_norm": 0.22989128530025482, + "learning_rate": 1.5361200269655153e-07, + "loss": 0.3637, "step": 134030 }, { - "epoch": 4.72, - "learning_rate": 4.2767627116017674e-07, - "loss": 0.2473, + "epoch": 4.830612318448841, + "grad_norm": 0.250777930021286, + "learning_rate": 1.5328917785848719e-07, + "loss": 0.3924, "step": 134035 }, { - "epoch": 4.72, - "learning_rate": 4.271517595538149e-07, - "loss": 0.2381, + "epoch": 4.830792518110066, + "grad_norm": 0.26133477687835693, + "learning_rate": 1.5296669155204778e-07, + "loss": 0.3637, "step": 134040 }, { - "epoch": 4.72, - "learning_rate": 4.266275670096936e-07, - "loss": 0.243, + "epoch": 4.8309727177712904, + "grad_norm": 0.28973469138145447, + "learning_rate": 1.526445437816243e-07, + "loss": 0.3738, "step": 134045 }, { - "epoch": 4.72, - "learning_rate": 4.261036935346213e-07, - "loss": 0.2607, + "epoch": 4.831152917432515, + "grad_norm": 0.2642209529876709, + "learning_rate": 1.5232273455161316e-07, + "loss": 0.3481, "step": 134050 }, { - "epoch": 4.72, - "learning_rate": 4.255801391353953e-07, - "loss": 0.2362, + "epoch": 4.83133311709374, + "grad_norm": 0.259695827960968, + "learning_rate": 1.5200126386639148e-07, + "loss": 0.3832, "step": 134055 }, { - "epoch": 4.72, - "learning_rate": 4.250569038188129e-07, - "loss": 0.256, + 
"epoch": 4.831513316754965, + "grad_norm": 0.2560080885887146, + "learning_rate": 1.516801317303418e-07, + "loss": 0.3417, "step": 134060 }, { - "epoch": 4.72, - "learning_rate": 4.2453398759167704e-07, - "loss": 0.2454, + "epoch": 4.831693516416189, + "grad_norm": 0.24287177622318268, + "learning_rate": 1.5135933814783844e-07, + "loss": 0.3666, "step": 134065 }, { - "epoch": 4.72, - "learning_rate": 4.240113904607684e-07, - "loss": 0.2605, + "epoch": 4.831873716077414, + "grad_norm": 0.2676023840904236, + "learning_rate": 1.5103888312325566e-07, + "loss": 0.3918, "step": 134070 }, { - "epoch": 4.72, - "learning_rate": 4.23489112432876e-07, - "loss": 0.2446, + "epoch": 4.832053915738639, + "grad_norm": 0.35573047399520874, + "learning_rate": 1.5071876666095385e-07, + "loss": 0.3883, "step": 134075 }, { - "epoch": 4.72, - "learning_rate": 4.2296715351478335e-07, - "loss": 0.2687, + "epoch": 4.832234115399863, + "grad_norm": 0.26907774806022644, + "learning_rate": 1.503989887653018e-07, + "loss": 0.3785, "step": 134080 }, { - "epoch": 4.72, - "learning_rate": 4.224455137132627e-07, - "loss": 0.2663, + "epoch": 4.832414315061087, + "grad_norm": 0.269654244184494, + "learning_rate": 1.500795494406515e-07, + "loss": 0.3896, "step": 134085 }, { - "epoch": 4.72, - "learning_rate": 4.21924193035092e-07, - "loss": 0.2411, + "epoch": 4.832594514722312, + "grad_norm": 0.3965383470058441, + "learning_rate": 1.497604486913551e-07, + "loss": 0.377, "step": 134090 }, { - "epoch": 4.72, - "learning_rate": 4.2140319148703535e-07, - "loss": 0.2379, + "epoch": 4.832774714383537, + "grad_norm": 0.22493110597133636, + "learning_rate": 1.4944168652176183e-07, + "loss": 0.3609, "step": 134095 }, { - "epoch": 4.72, - "learning_rate": 4.208825090758622e-07, - "loss": 0.2565, + "epoch": 4.832954914044762, + "grad_norm": 0.2781234681606293, + "learning_rate": 1.4912326293621549e-07, + "loss": 0.3761, "step": 134100 }, { - "epoch": 4.72, - "learning_rate": 4.203621458083312e-07, - "loss": 0.2397, + "epoch": 4.833135113705986, + "grad_norm": 0.3429318070411682, + "learning_rate": 1.4880517793905148e-07, + "loss": 0.3934, "step": 134105 }, { - "epoch": 4.72, - "learning_rate": 4.19842101691198e-07, - "loss": 0.2508, + "epoch": 4.833315313367211, + "grad_norm": 0.2164117842912674, + "learning_rate": 1.4848743153460522e-07, + "loss": 0.3973, "step": 134110 }, { - "epoch": 4.72, - "learning_rate": 4.1932237673121545e-07, - "loss": 0.2467, + "epoch": 4.833495513028436, + "grad_norm": 0.32656198740005493, + "learning_rate": 1.4817002372720935e-07, + "loss": 0.3912, "step": 134115 }, { - "epoch": 4.72, - "learning_rate": 4.1880297093513654e-07, - "loss": 0.2712, + "epoch": 4.83367571268966, + "grad_norm": 0.25910449028015137, + "learning_rate": 1.4785295452118264e-07, + "loss": 0.3827, "step": 134120 }, { - "epoch": 4.72, - "learning_rate": 4.1828388430969755e-07, - "loss": 0.254, + "epoch": 4.833855912350884, + "grad_norm": 0.20214806497097015, + "learning_rate": 1.4753622392084943e-07, + "loss": 0.3517, "step": 134125 }, { - "epoch": 4.72, - "learning_rate": 4.177651168616459e-07, - "loss": 0.2601, + "epoch": 4.834036112012109, + "grad_norm": 0.27396053075790405, + "learning_rate": 1.472198319305229e-07, + "loss": 0.3526, "step": 134130 }, { - "epoch": 4.72, - "learning_rate": 4.172466685977094e-07, - "loss": 0.2542, + "epoch": 4.834216311673334, + "grad_norm": 0.20608721673488617, + "learning_rate": 1.4690377855451353e-07, + "loss": 0.3631, "step": 134135 }, { - "epoch": 4.72, - "learning_rate": 4.167285395246273e-07, - 
"loss": 0.2537, + "epoch": 4.834396511334559, + "grad_norm": 0.26418453454971313, + "learning_rate": 1.4658806379712897e-07, + "loss": 0.3475, "step": 134140 }, { - "epoch": 4.72, - "learning_rate": 4.1621072964912464e-07, - "loss": 0.2534, + "epoch": 4.834576710995783, + "grad_norm": 0.2818256616592407, + "learning_rate": 1.4627268766267133e-07, + "loss": 0.36, "step": 134145 }, { - "epoch": 4.72, - "learning_rate": 4.1569323897792387e-07, - "loss": 0.2512, + "epoch": 4.834756910657008, + "grad_norm": 0.30141425132751465, + "learning_rate": 1.4595765015543715e-07, + "loss": 0.373, "step": 134150 }, { - "epoch": 4.72, - "learning_rate": 4.151760675177446e-07, - "loss": 0.2476, + "epoch": 4.834937110318233, + "grad_norm": 0.309048056602478, + "learning_rate": 1.456429512797175e-07, + "loss": 0.4023, "step": 134155 }, { - "epoch": 4.72, - "learning_rate": 4.1465921527530095e-07, - "loss": 0.2304, + "epoch": 4.8351173099794575, + "grad_norm": 0.28096258640289307, + "learning_rate": 1.4532859103980056e-07, + "loss": 0.3722, "step": 134160 }, { - "epoch": 4.72, - "learning_rate": 4.141426822573041e-07, - "loss": 0.2545, + "epoch": 4.835297509640682, + "grad_norm": 0.30201297998428345, + "learning_rate": 1.4501456943996628e-07, + "loss": 0.3901, "step": 134165 }, { - "epoch": 4.72, - "learning_rate": 4.136264684704655e-07, - "loss": 0.2259, + "epoch": 4.835477709301906, + "grad_norm": 0.2812051773071289, + "learning_rate": 1.447008864845001e-07, + "loss": 0.4076, "step": 134170 }, { - "epoch": 4.72, - "learning_rate": 4.1311057392147966e-07, - "loss": 0.248, + "epoch": 4.835657908963131, + "grad_norm": 0.3078659176826477, + "learning_rate": 1.443875421776736e-07, + "loss": 0.3531, "step": 134175 }, { - "epoch": 4.72, - "learning_rate": 4.1259499861705243e-07, - "loss": 0.2331, + "epoch": 4.835838108624356, + "grad_norm": 0.2857990264892578, + "learning_rate": 1.4407453652375002e-07, + "loss": 0.389, "step": 134180 }, { - "epoch": 4.72, - "learning_rate": 4.120797425638756e-07, - "loss": 0.2626, + "epoch": 4.83601830828558, + "grad_norm": 0.23130671679973602, + "learning_rate": 1.43761869527001e-07, + "loss": 0.4073, "step": 134185 }, { - "epoch": 4.72, - "learning_rate": 4.115648057686411e-07, - "loss": 0.2415, + "epoch": 4.836198507946805, + "grad_norm": 0.2094799429178238, + "learning_rate": 1.434495411916842e-07, + "loss": 0.3676, "step": 134190 }, { - "epoch": 4.72, - "learning_rate": 4.1105018823802966e-07, - "loss": 0.247, + "epoch": 4.83637870760803, + "grad_norm": 0.26732537150382996, + "learning_rate": 1.4313755152205455e-07, + "loss": 0.3971, "step": 134195 }, { - "epoch": 4.72, - "learning_rate": 4.105358899787276e-07, - "loss": 0.2723, + "epoch": 4.8365589072692545, + "grad_norm": 0.2994996905326843, + "learning_rate": 1.4282590052236423e-07, + "loss": 0.3768, "step": 134200 }, { - "epoch": 4.72, - "learning_rate": 4.1002191099741296e-07, - "loss": 0.2401, + "epoch": 4.836739106930479, + "grad_norm": 0.22868464887142181, + "learning_rate": 1.4251458819685704e-07, + "loss": 0.3547, "step": 134205 }, { - "epoch": 4.72, - "learning_rate": 4.095082513007581e-07, - "loss": 0.2487, + "epoch": 4.836919306591704, + "grad_norm": 0.26898080110549927, + "learning_rate": 1.4220361454977682e-07, + "loss": 0.3795, "step": 134210 }, { - "epoch": 4.72, - "learning_rate": 4.0899491089543276e-07, - "loss": 0.228, + "epoch": 4.837099506252928, + "grad_norm": 0.2844333350658417, + "learning_rate": 1.4189297958536185e-07, + "loss": 0.4158, "step": 134215 }, { - "epoch": 4.72, - "learning_rate": 
4.08481889788101e-07, - "loss": 0.238, + "epoch": 4.837279705914153, + "grad_norm": 0.22372843325138092, + "learning_rate": 1.415826833078393e-07, + "loss": 0.3845, "step": 134220 }, { - "epoch": 4.72, - "learning_rate": 4.0796918798542694e-07, - "loss": 0.2298, + "epoch": 4.837459905575377, + "grad_norm": 0.3177548050880432, + "learning_rate": 1.412727257214419e-07, + "loss": 0.4199, "step": 134225 }, { - "epoch": 4.72, - "learning_rate": 4.074568054940636e-07, - "loss": 0.2284, + "epoch": 4.837640105236602, + "grad_norm": 0.22313030064105988, + "learning_rate": 1.409631068303885e-07, + "loss": 0.3804, "step": 134230 }, { - "epoch": 4.72, - "learning_rate": 4.069447423206696e-07, - "loss": 0.2346, + "epoch": 4.837820304897827, + "grad_norm": 0.21491451561450958, + "learning_rate": 1.4065382663890347e-07, + "loss": 0.3386, "step": 134235 }, { - "epoch": 4.72, - "learning_rate": 4.064329984718923e-07, - "loss": 0.2512, + "epoch": 4.8380005045590515, + "grad_norm": 0.24090984463691711, + "learning_rate": 1.403448851511946e-07, + "loss": 0.3939, "step": 134240 }, { - "epoch": 4.72, - "learning_rate": 4.059215739543737e-07, - "loss": 0.239, + "epoch": 4.838180704220276, + "grad_norm": 0.3148958384990692, + "learning_rate": 1.4003628237147238e-07, + "loss": 0.3783, "step": 134245 }, { - "epoch": 4.72, - "learning_rate": 4.0541046877475285e-07, - "loss": 0.2488, + "epoch": 4.838360903881501, + "grad_norm": 0.32658737897872925, + "learning_rate": 1.3972801830394732e-07, + "loss": 0.403, "step": 134250 }, { - "epoch": 4.72, - "learning_rate": 4.0489968293966896e-07, - "loss": 0.2513, + "epoch": 4.838541103542726, + "grad_norm": 0.26316890120506287, + "learning_rate": 1.3942009295281056e-07, + "loss": 0.3716, "step": 134255 }, { - "epoch": 4.72, - "learning_rate": 4.0438921645575555e-07, - "loss": 0.2548, + "epoch": 4.83872130320395, + "grad_norm": 0.3213282823562622, + "learning_rate": 1.3911250632226146e-07, + "loss": 0.3641, "step": 134260 }, { - "epoch": 4.72, - "learning_rate": 4.038790693296407e-07, - "loss": 0.2413, + "epoch": 4.838901502865174, + "grad_norm": 0.2604977786540985, + "learning_rate": 1.3880525841649673e-07, + "loss": 0.361, "step": 134265 }, { - "epoch": 4.72, - "learning_rate": 4.033692415679413e-07, - "loss": 0.265, + "epoch": 4.839081702526399, + "grad_norm": 0.22943316400051117, + "learning_rate": 1.3849834923969073e-07, + "loss": 0.3516, "step": 134270 }, { - "epoch": 4.72, - "learning_rate": 4.028597331772854e-07, - "loss": 0.2508, + "epoch": 4.839261902187624, + "grad_norm": 0.29876038432121277, + "learning_rate": 1.3819177879603462e-07, + "loss": 0.3684, "step": 134275 }, { - "epoch": 4.72, - "learning_rate": 4.023505441642872e-07, - "loss": 0.225, + "epoch": 4.8394421018488485, + "grad_norm": 0.2451476752758026, + "learning_rate": 1.378855470897028e-07, + "loss": 0.3882, "step": 134280 }, { - "epoch": 4.72, - "learning_rate": 4.018416745355552e-07, - "loss": 0.2656, + "epoch": 4.839622301510073, + "grad_norm": 0.2849496603012085, + "learning_rate": 1.3757965412486696e-07, + "loss": 0.3678, "step": 134285 }, { - "epoch": 4.72, - "learning_rate": 4.013331242976981e-07, - "loss": 0.2575, + "epoch": 4.839802501171298, + "grad_norm": 0.19778481125831604, + "learning_rate": 1.3727409990569319e-07, + "loss": 0.3739, "step": 134290 }, { - "epoch": 4.72, - "learning_rate": 4.008248934573189e-07, - "loss": 0.2526, + "epoch": 4.839982700832523, + "grad_norm": 0.22249376773834229, + "learning_rate": 1.369688844363448e-07, + "loss": 0.357, "step": 134295 }, { - "epoch": 4.73, - 
"learning_rate": 4.0031698202101784e-07, - "loss": 0.2413, + "epoch": 4.840162900493747, + "grad_norm": 0.2733737528324127, + "learning_rate": 1.366640077209852e-07, + "loss": 0.3621, "step": 134300 }, { - "epoch": 4.73, - "learning_rate": 3.9980938999538975e-07, - "loss": 0.2438, + "epoch": 4.840343100154971, + "grad_norm": 0.21390418708324432, + "learning_rate": 1.363594697637638e-07, + "loss": 0.3495, "step": 134305 }, { - "epoch": 4.73, - "learning_rate": 3.9930211738702374e-07, - "loss": 0.2287, + "epoch": 4.840523299816196, + "grad_norm": 0.3072846233844757, + "learning_rate": 1.3605527056883004e-07, + "loss": 0.4137, "step": 134310 }, { - "epoch": 4.73, - "learning_rate": 3.987951642025062e-07, - "loss": 0.2725, + "epoch": 4.840703499477421, + "grad_norm": 0.25589028000831604, + "learning_rate": 1.3575141014032788e-07, + "loss": 0.4014, "step": 134315 }, { - "epoch": 4.73, - "learning_rate": 3.982885304484235e-07, - "loss": 0.2506, + "epoch": 4.8408836991386455, + "grad_norm": 0.2160835713148117, + "learning_rate": 1.354478884824012e-07, + "loss": 0.3557, "step": 134320 }, { - "epoch": 4.73, - "learning_rate": 3.9778221613134823e-07, - "loss": 0.2348, + "epoch": 4.84106389879987, + "grad_norm": 0.2661241888999939, + "learning_rate": 1.351447055991828e-07, + "loss": 0.3846, "step": 134325 }, { - "epoch": 4.73, - "learning_rate": 3.972762212578557e-07, - "loss": 0.2513, + "epoch": 4.841244098461095, + "grad_norm": 0.23646846413612366, + "learning_rate": 1.3484186149480272e-07, + "loss": 0.3725, "step": 134330 }, { - "epoch": 4.73, - "learning_rate": 3.967705458345211e-07, - "loss": 0.2566, + "epoch": 4.84142429812232, + "grad_norm": 0.24086101353168488, + "learning_rate": 1.3453935617339099e-07, + "loss": 0.3693, "step": 134335 }, { - "epoch": 4.73, - "learning_rate": 3.962651898679087e-07, - "loss": 0.2325, + "epoch": 4.841604497783544, + "grad_norm": 0.19936615228652954, + "learning_rate": 1.3423718963906374e-07, + "loss": 0.3585, "step": 134340 }, { - "epoch": 4.73, - "learning_rate": 3.957601533645744e-07, - "loss": 0.2445, + "epoch": 4.841784697444769, + "grad_norm": 0.2506243884563446, + "learning_rate": 1.339353618959427e-07, + "loss": 0.3775, "step": 134345 }, { - "epoch": 4.73, - "learning_rate": 3.9525543633108233e-07, - "loss": 0.2519, + "epoch": 4.841964897105994, + "grad_norm": 0.22023634612560272, + "learning_rate": 1.3363387294813568e-07, + "loss": 0.3357, "step": 134350 }, { - "epoch": 4.73, - "learning_rate": 3.9475103877398e-07, - "loss": 0.2446, + "epoch": 4.842145096767218, + "grad_norm": 0.27518585324287415, + "learning_rate": 1.3333272279975328e-07, + "loss": 0.3789, "step": 134355 }, { - "epoch": 4.73, - "learning_rate": 3.9424696069982334e-07, - "loss": 0.2499, + "epoch": 4.8423252964284424, + "grad_norm": 0.20245154201984406, + "learning_rate": 1.3303191145490057e-07, + "loss": 0.359, "step": 134360 }, { - "epoch": 4.73, - "learning_rate": 3.937432021151516e-07, - "loss": 0.255, + "epoch": 4.842505496089667, + "grad_norm": 0.29023030400276184, + "learning_rate": 1.3273143891767147e-07, + "loss": 0.3642, "step": 134365 }, { - "epoch": 4.73, - "learning_rate": 3.9323976302650946e-07, - "loss": 0.268, + "epoch": 4.842685695750892, + "grad_norm": 0.23865652084350586, + "learning_rate": 1.3243130519216274e-07, + "loss": 0.3794, "step": 134370 }, { - "epoch": 4.73, - "learning_rate": 3.9273664344043337e-07, - "loss": 0.2577, + "epoch": 4.842865895412117, + "grad_norm": 0.222527414560318, + "learning_rate": 1.3213151028246273e-07, + "loss": 0.3462, "step": 134375 }, 
{ - "epoch": 4.73, - "learning_rate": 3.9223384336345424e-07, - "loss": 0.2665, + "epoch": 4.843046095073341, + "grad_norm": 0.23358391225337982, + "learning_rate": 1.3183205419265708e-07, + "loss": 0.3391, "step": 134380 }, { - "epoch": 4.73, - "learning_rate": 3.91731362802103e-07, - "loss": 0.26, + "epoch": 4.843226294734566, + "grad_norm": 0.25445687770843506, + "learning_rate": 1.315329369268231e-07, + "loss": 0.3423, "step": 134385 }, { - "epoch": 4.73, - "learning_rate": 3.9122920176289934e-07, - "loss": 0.2487, + "epoch": 4.843406494395791, + "grad_norm": 0.312282532453537, + "learning_rate": 1.3123415848903809e-07, + "loss": 0.3769, "step": 134390 }, { - "epoch": 4.73, - "learning_rate": 3.9072736025236866e-07, - "loss": 0.2559, + "epoch": 4.843586694057015, + "grad_norm": 0.25823071599006653, + "learning_rate": 1.3093571888337652e-07, + "loss": 0.3843, "step": 134395 }, { - "epoch": 4.73, - "learning_rate": 3.902258382770252e-07, - "loss": 0.2562, + "epoch": 4.843766893718239, + "grad_norm": 0.24276918172836304, + "learning_rate": 1.3063761811389906e-07, + "loss": 0.3791, "step": 134400 }, { - "epoch": 4.73, - "learning_rate": 3.897246358433804e-07, - "loss": 0.2477, + "epoch": 4.843947093379464, + "grad_norm": 0.25537583231925964, + "learning_rate": 1.3033985618466914e-07, + "loss": 0.4017, "step": 134405 }, { - "epoch": 4.73, - "learning_rate": 3.892237529579429e-07, - "loss": 0.2262, + "epoch": 4.844127293040689, + "grad_norm": 0.2588360011577606, + "learning_rate": 1.3004243309974184e-07, + "loss": 0.3756, "step": 134410 }, { - "epoch": 4.73, - "learning_rate": 3.8872318962721586e-07, - "loss": 0.2661, + "epoch": 4.844307492701914, + "grad_norm": 0.3001175820827484, + "learning_rate": 1.2974534886317225e-07, + "loss": 0.3726, "step": 134415 }, { - "epoch": 4.73, - "learning_rate": 3.882229458576997e-07, - "loss": 0.2361, + "epoch": 4.844487692363138, + "grad_norm": 0.3025680184364319, + "learning_rate": 1.294486034790099e-07, + "loss": 0.3936, "step": 134420 }, { - "epoch": 4.73, - "learning_rate": 3.877230216558864e-07, - "loss": 0.2421, + "epoch": 4.844667892024363, + "grad_norm": 0.27103391289711, + "learning_rate": 1.2915219695129321e-07, + "loss": 0.3672, "step": 134425 }, { - "epoch": 4.73, - "learning_rate": 3.872234170282707e-07, - "loss": 0.2562, + "epoch": 4.844848091685588, + "grad_norm": 0.27774909138679504, + "learning_rate": 1.2885612928406342e-07, + "loss": 0.3652, "step": 134430 }, { - "epoch": 4.73, - "learning_rate": 3.867241319813364e-07, - "loss": 0.2601, + "epoch": 4.845028291346813, + "grad_norm": 0.2541874945163727, + "learning_rate": 1.2856040048135342e-07, + "loss": 0.3296, "step": 134435 }, { - "epoch": 4.73, - "learning_rate": 3.8622516652156993e-07, - "loss": 0.2331, + "epoch": 4.845208491008037, + "grad_norm": 0.2172260284423828, + "learning_rate": 1.2826501054719054e-07, + "loss": 0.3267, "step": 134440 }, { - "epoch": 4.73, - "learning_rate": 3.857265206554495e-07, - "loss": 0.2541, + "epoch": 4.845388690669262, + "grad_norm": 0.177218958735466, + "learning_rate": 1.2796995948560487e-07, + "loss": 0.349, "step": 134445 }, { - "epoch": 4.73, - "learning_rate": 3.8522819438944757e-07, - "loss": 0.2329, + "epoch": 4.845568890330486, + "grad_norm": 0.26676860451698303, + "learning_rate": 1.2767524730061263e-07, + "loss": 0.4027, "step": 134450 }, { - "epoch": 4.73, - "learning_rate": 3.8473018773003967e-07, - "loss": 0.2487, + "epoch": 4.845749089991711, + "grad_norm": 0.27540668845176697, + "learning_rate": 1.2738087399622733e-07, + "loss": 0.3745, 
"step": 134455 }, { - "epoch": 4.73, - "learning_rate": 3.8423250068368165e-07, - "loss": 0.2591, + "epoch": 4.845929289652935, + "grad_norm": 0.25069594383239746, + "learning_rate": 1.2708683957646238e-07, + "loss": 0.3432, "step": 134460 }, { - "epoch": 4.73, - "learning_rate": 3.8373513325684897e-07, - "loss": 0.25, + "epoch": 4.84610948931416, + "grad_norm": 0.29599523544311523, + "learning_rate": 1.2679314404532572e-07, + "loss": 0.3566, "step": 134465 }, { - "epoch": 4.73, - "learning_rate": 3.8323808545598915e-07, - "loss": 0.2557, + "epoch": 4.846289688975385, + "grad_norm": 0.2683459520339966, + "learning_rate": 1.264997874068169e-07, + "loss": 0.3686, "step": 134470 }, { - "epoch": 4.73, - "learning_rate": 3.82741357287561e-07, - "loss": 0.2413, + "epoch": 4.8464698886366095, + "grad_norm": 0.250180184841156, + "learning_rate": 1.2620676966493272e-07, + "loss": 0.3944, "step": 134475 }, { - "epoch": 4.73, - "learning_rate": 3.822449487580121e-07, - "loss": 0.2602, + "epoch": 4.846650088297834, + "grad_norm": 0.27705875039100647, + "learning_rate": 1.2591409082366445e-07, + "loss": 0.3683, "step": 134480 }, { - "epoch": 4.73, - "learning_rate": 3.8174885987378725e-07, - "loss": 0.2503, + "epoch": 4.846830287959059, + "grad_norm": 0.267696350812912, + "learning_rate": 1.2562175088700057e-07, + "loss": 0.3831, "step": 134485 }, { - "epoch": 4.73, - "learning_rate": 3.812530906413342e-07, - "loss": 0.2533, + "epoch": 4.847010487620283, + "grad_norm": 0.26516005396842957, + "learning_rate": 1.2532974985892398e-07, + "loss": 0.3802, "step": 134490 }, { - "epoch": 4.73, - "learning_rate": 3.80757641067081e-07, - "loss": 0.24, + "epoch": 4.847190687281508, + "grad_norm": 0.22032172977924347, + "learning_rate": 1.2503808774341486e-07, + "loss": 0.346, "step": 134495 }, { - "epoch": 4.73, - "learning_rate": 3.8026251115746427e-07, - "loss": 0.2486, + "epoch": 4.847370886942732, + "grad_norm": 0.25521814823150635, + "learning_rate": 1.2474676454444778e-07, + "loss": 0.3357, "step": 134500 }, { - "epoch": 4.73, - "eval_loss": 0.24848829209804535, - "eval_runtime": 10.5443, - "eval_samples_per_second": 9.484, - "eval_steps_per_second": 9.484, + "epoch": 4.847370886942732, + "eval_loss": 0.4288318157196045, + "eval_runtime": 3.5358, + "eval_samples_per_second": 28.282, + "eval_steps_per_second": 7.07, "step": 134500 }, { - "epoch": 4.73, - "learning_rate": 3.7976770091891766e-07, - "loss": 0.2469, + "epoch": 4.847551086603957, + "grad_norm": 0.25918158888816833, + "learning_rate": 1.2445578026598903e-07, + "loss": 0.3733, "step": 134505 }, { - "epoch": 4.73, - "learning_rate": 3.7927321035786113e-07, - "loss": 0.2598, + "epoch": 4.847731286265182, + "grad_norm": 0.2234797328710556, + "learning_rate": 1.241651349120021e-07, + "loss": 0.3643, "step": 134510 }, { - "epoch": 4.73, - "learning_rate": 3.7877903948071445e-07, - "loss": 0.2534, + "epoch": 4.8479114859264065, + "grad_norm": 0.30298492312431335, + "learning_rate": 1.238748284864505e-07, + "loss": 0.361, "step": 134515 }, { - "epoch": 4.73, - "learning_rate": 3.782851882938976e-07, - "loss": 0.2452, + "epoch": 4.848091685587631, + "grad_norm": 0.2233441025018692, + "learning_rate": 1.2358486099328658e-07, + "loss": 0.3642, "step": 134520 }, { - "epoch": 4.73, - "learning_rate": 3.777916568038192e-07, - "loss": 0.2526, + "epoch": 4.848271885248856, + "grad_norm": 0.26897063851356506, + "learning_rate": 1.2329523243646556e-07, + "loss": 0.3709, "step": 134525 }, { - "epoch": 4.73, - "learning_rate": 3.772984450168937e-07, - "loss": 0.2298, 
+ "epoch": 4.848452084910081, + "grad_norm": 0.2564341723918915, + "learning_rate": 1.2300594281992872e-07, + "loss": 0.3741, "step": 134530 }, { - "epoch": 4.73, - "learning_rate": 3.768055529395187e-07, - "loss": 0.2505, + "epoch": 4.8486322845713055, + "grad_norm": 0.1938348412513733, + "learning_rate": 1.227169921476201e-07, + "loss": 0.3406, "step": 134535 }, { - "epoch": 4.73, - "learning_rate": 3.763129805780946e-07, - "loss": 0.2471, + "epoch": 4.848812484232529, + "grad_norm": 0.25928547978401184, + "learning_rate": 1.224283804234755e-07, + "loss": 0.3673, "step": 134540 }, { - "epoch": 4.73, - "learning_rate": 3.758207279390191e-07, - "loss": 0.2605, + "epoch": 4.848992683893754, + "grad_norm": 0.3056635558605194, + "learning_rate": 1.221401076514306e-07, + "loss": 0.3775, "step": 134545 }, { - "epoch": 4.73, - "learning_rate": 3.7532879502868433e-07, - "loss": 0.2401, + "epoch": 4.849172883554979, + "grad_norm": 0.2699172794818878, + "learning_rate": 1.2185217383540725e-07, + "loss": 0.3643, "step": 134550 }, { - "epoch": 4.73, - "learning_rate": 3.7483718185347683e-07, - "loss": 0.2561, + "epoch": 4.8493530832162035, + "grad_norm": 0.2580404579639435, + "learning_rate": 1.2156457897933014e-07, + "loss": 0.3634, "step": 134555 }, { - "epoch": 4.73, - "learning_rate": 3.7434588841978034e-07, - "loss": 0.2412, + "epoch": 4.849533282877428, + "grad_norm": 0.2560892701148987, + "learning_rate": 1.2127732308712114e-07, + "loss": 0.3698, "step": 134560 }, { - "epoch": 4.73, - "learning_rate": 3.738549147339732e-07, - "loss": 0.2506, + "epoch": 4.849713482538653, + "grad_norm": 0.2861708998680115, + "learning_rate": 1.2099040616269374e-07, + "loss": 0.3928, "step": 134565 }, { - "epoch": 4.73, - "learning_rate": 3.733642608024307e-07, - "loss": 0.2368, + "epoch": 4.849893682199878, + "grad_norm": 0.21710778772830963, + "learning_rate": 1.207038282099532e-07, + "loss": 0.384, "step": 134570 }, { - "epoch": 4.73, - "learning_rate": 3.7287392663152286e-07, - "loss": 0.2515, + "epoch": 4.850073881861102, + "grad_norm": 0.2481117993593216, + "learning_rate": 1.2041758923280465e-07, + "loss": 0.3562, "step": 134575 }, { - "epoch": 4.73, - "learning_rate": 3.7238391222761957e-07, - "loss": 0.2606, + "epoch": 4.850254081522326, + "grad_norm": 0.24015110731124878, + "learning_rate": 1.2013168923515338e-07, + "loss": 0.3729, "step": 134580 }, { - "epoch": 4.74, - "learning_rate": 3.7189421759708245e-07, - "loss": 0.2508, + "epoch": 4.850434281183551, + "grad_norm": 0.24284091591835022, + "learning_rate": 1.1990321330488386e-07, + "loss": 0.3809, "step": 134585 }, { - "epoch": 4.74, - "learning_rate": 3.7140484274626466e-07, - "loss": 0.265, + "epoch": 4.850614480844776, + "grad_norm": 0.23821890354156494, + "learning_rate": 1.196179234801309e-07, + "loss": 0.3453, "step": 134590 }, { - "epoch": 4.74, - "learning_rate": 3.70915787681525e-07, - "loss": 0.263, + "epoch": 4.8507946805060005, + "grad_norm": 0.2183995395898819, + "learning_rate": 1.1933297264576927e-07, + "loss": 0.3285, "step": 134595 }, { - "epoch": 4.74, - "learning_rate": 3.7042705240921127e-07, - "loss": 0.2538, + "epoch": 4.850974880167225, + "grad_norm": 0.22939856350421906, + "learning_rate": 1.1904836080567638e-07, + "loss": 0.3785, "step": 134600 }, { - "epoch": 4.74, - "learning_rate": 3.699386369356711e-07, - "loss": 0.2524, + "epoch": 4.85115507982845, + "grad_norm": 0.27934911847114563, + "learning_rate": 1.187640879637325e-07, + "loss": 0.3638, "step": 134605 }, { - "epoch": 4.74, - "learning_rate": 
3.6945054126724945e-07, - "loss": 0.2637, + "epoch": 4.851335279489675, + "grad_norm": 0.263106107711792, + "learning_rate": 1.184801541238123e-07, + "loss": 0.3625, "step": 134610 }, { - "epoch": 4.74, - "learning_rate": 3.689627654102773e-07, - "loss": 0.2613, + "epoch": 4.851515479150899, + "grad_norm": 0.31745436787605286, + "learning_rate": 1.181965592897849e-07, + "loss": 0.3788, "step": 134615 }, { - "epoch": 4.74, - "learning_rate": 3.6847530937109133e-07, - "loss": 0.2424, + "epoch": 4.851695678812124, + "grad_norm": 0.297881543636322, + "learning_rate": 1.1791330346550832e-07, + "loss": 0.3824, "step": 134620 }, { - "epoch": 4.74, - "learning_rate": 3.679881731560225e-07, - "loss": 0.2353, + "epoch": 4.851875878473349, + "grad_norm": 0.2612767219543457, + "learning_rate": 1.1763038665484616e-07, + "loss": 0.3582, "step": 134625 }, { - "epoch": 4.74, - "learning_rate": 3.675013567713936e-07, - "loss": 0.2516, + "epoch": 4.852056078134573, + "grad_norm": 0.27403807640075684, + "learning_rate": 1.1734780886165364e-07, + "loss": 0.4069, "step": 134630 }, { - "epoch": 4.74, - "learning_rate": 3.670148602235246e-07, - "loss": 0.252, + "epoch": 4.8522362777957975, + "grad_norm": 0.250141978263855, + "learning_rate": 1.1706557008978048e-07, + "loss": 0.3807, "step": 134635 }, { - "epoch": 4.74, - "learning_rate": 3.665286835187354e-07, - "loss": 0.2366, + "epoch": 4.852416477457022, + "grad_norm": 0.24126245081424713, + "learning_rate": 1.167836703430708e-07, + "loss": 0.3707, "step": 134640 }, { - "epoch": 4.74, - "learning_rate": 3.6604282666333754e-07, - "loss": 0.2468, + "epoch": 4.852596677118247, + "grad_norm": 0.2205542027950287, + "learning_rate": 1.1650210962536323e-07, + "loss": 0.3328, "step": 134645 }, { - "epoch": 4.74, - "learning_rate": 3.6555728966363725e-07, - "loss": 0.2369, + "epoch": 4.852776876779472, + "grad_norm": 0.23354937136173248, + "learning_rate": 1.1622088794049912e-07, + "loss": 0.3718, "step": 134650 }, { - "epoch": 4.74, - "learning_rate": 3.6507207252594046e-07, - "loss": 0.2279, + "epoch": 4.852957076440696, + "grad_norm": 0.25610846281051636, + "learning_rate": 1.1594000529230875e-07, + "loss": 0.3912, "step": 134655 }, { - "epoch": 4.74, - "learning_rate": 3.6458717525654773e-07, - "loss": 0.2566, + "epoch": 4.853137276101921, + "grad_norm": 0.3403148055076599, + "learning_rate": 1.1565946168461684e-07, + "loss": 0.3726, "step": 134660 }, { - "epoch": 4.74, - "learning_rate": 3.641025978617568e-07, - "loss": 0.2486, + "epoch": 4.853317475763146, + "grad_norm": 0.2914038598537445, + "learning_rate": 1.1537925712124809e-07, + "loss": 0.3737, "step": 134665 }, { - "epoch": 4.74, - "learning_rate": 3.6361834034785714e-07, - "loss": 0.2552, + "epoch": 4.85349767542437, + "grad_norm": 0.3024619519710541, + "learning_rate": 1.150993916060189e-07, + "loss": 0.3859, "step": 134670 }, { - "epoch": 4.74, - "learning_rate": 3.631344027211381e-07, - "loss": 0.2381, + "epoch": 4.8536778750855945, + "grad_norm": 0.2520431876182556, + "learning_rate": 1.1481986514274012e-07, + "loss": 0.3754, "step": 134675 }, { - "epoch": 4.74, - "learning_rate": 3.6265078498788353e-07, - "loss": 0.2367, + "epoch": 4.853858074746819, + "grad_norm": 0.23025912046432495, + "learning_rate": 1.1454067773522537e-07, + "loss": 0.3815, "step": 134680 }, { - "epoch": 4.74, - "learning_rate": 3.6216748715437176e-07, - "loss": 0.2521, + "epoch": 4.854038274408044, + "grad_norm": 0.2676393687725067, + "learning_rate": 1.1426182938727714e-07, + "loss": 0.369, "step": 134685 }, { - "epoch": 
4.74, - "learning_rate": 3.616845092268756e-07, - "loss": 0.2516, + "epoch": 4.854218474069269, + "grad_norm": 0.2332090437412262, + "learning_rate": 1.1398332010269241e-07, + "loss": 0.3455, "step": 134690 }, { - "epoch": 4.74, - "learning_rate": 3.612018512116705e-07, - "loss": 0.2339, + "epoch": 4.854398673730493, + "grad_norm": 0.23634545505046844, + "learning_rate": 1.1370514988526538e-07, + "loss": 0.4167, "step": 134695 }, { - "epoch": 4.74, - "learning_rate": 3.6071951311502374e-07, - "loss": 0.2278, + "epoch": 4.854578873391718, + "grad_norm": 0.25552475452423096, + "learning_rate": 1.1342731873878742e-07, + "loss": 0.4016, "step": 134700 }, { - "epoch": 4.74, - "learning_rate": 3.602374949431941e-07, - "loss": 0.2603, + "epoch": 4.854759073052943, + "grad_norm": 0.21695470809936523, + "learning_rate": 1.1314982666704444e-07, + "loss": 0.3786, "step": 134705 }, { - "epoch": 4.74, - "learning_rate": 3.5975579670244065e-07, - "loss": 0.2373, + "epoch": 4.854939272714168, + "grad_norm": 0.27159446477890015, + "learning_rate": 1.1287267367381671e-07, + "loss": 0.396, "step": 134710 }, { - "epoch": 4.74, - "learning_rate": 3.592744183990221e-07, - "loss": 0.2669, + "epoch": 4.855119472375392, + "grad_norm": 0.2568185329437256, + "learning_rate": 1.1259585976288179e-07, + "loss": 0.3537, "step": 134715 }, { - "epoch": 4.74, - "learning_rate": 3.587933600391863e-07, - "loss": 0.2454, + "epoch": 4.855299672036617, + "grad_norm": 0.29152026772499084, + "learning_rate": 1.1231938493800887e-07, + "loss": 0.341, "step": 134720 }, { - "epoch": 4.74, - "learning_rate": 3.583126216291782e-07, - "loss": 0.2355, + "epoch": 4.855479871697841, + "grad_norm": 0.2555713951587677, + "learning_rate": 1.1204324920296716e-07, + "loss": 0.3395, "step": 134725 }, { - "epoch": 4.74, - "learning_rate": 3.5783220317524293e-07, - "loss": 0.2398, + "epoch": 4.855660071359066, + "grad_norm": 0.2556825280189514, + "learning_rate": 1.1176745256151755e-07, + "loss": 0.3501, "step": 134730 }, { - "epoch": 4.74, - "learning_rate": 3.5735210468361426e-07, - "loss": 0.2453, + "epoch": 4.85584027102029, + "grad_norm": 0.30311036109924316, + "learning_rate": 1.1149199501741537e-07, + "loss": 0.3778, "step": 134735 }, { - "epoch": 4.74, - "learning_rate": 3.5687232616053166e-07, - "loss": 0.2474, + "epoch": 4.856020470681515, + "grad_norm": 0.23065444827079773, + "learning_rate": 1.1121687657441871e-07, + "loss": 0.373, "step": 134740 }, { - "epoch": 4.74, - "learning_rate": 3.563928676122208e-07, - "loss": 0.2625, + "epoch": 4.85620067034274, + "grad_norm": 0.26827138662338257, + "learning_rate": 1.1094209723627458e-07, + "loss": 0.3632, "step": 134745 }, { - "epoch": 4.74, - "learning_rate": 3.559137290449044e-07, - "loss": 0.2214, + "epoch": 4.856380870003965, + "grad_norm": 0.29760560393333435, + "learning_rate": 1.1066765700672166e-07, + "loss": 0.3913, "step": 134750 }, { - "epoch": 4.74, - "learning_rate": 3.5543491046481084e-07, - "loss": 0.2508, + "epoch": 4.856561069665189, + "grad_norm": 0.27333009243011475, + "learning_rate": 1.1039355588950695e-07, + "loss": 0.3704, "step": 134755 }, { - "epoch": 4.74, - "learning_rate": 3.549564118781518e-07, - "loss": 0.2269, + "epoch": 4.856741269326414, + "grad_norm": 0.244674414396286, + "learning_rate": 1.101197938883608e-07, + "loss": 0.3931, "step": 134760 }, { - "epoch": 4.74, - "learning_rate": 3.544782332911417e-07, - "loss": 0.2652, + "epoch": 4.856921468987638, + "grad_norm": 0.2720971405506134, + "learning_rate": 1.0984637100701078e-07, + "loss": 0.3967, "step": 
134765 }, { - "epoch": 4.74, - "learning_rate": 3.540003747099896e-07, - "loss": 0.2392, + "epoch": 4.857101668648863, + "grad_norm": 0.17154434323310852, + "learning_rate": 1.0957328724918725e-07, + "loss": 0.3374, "step": 134770 }, { - "epoch": 4.74, - "learning_rate": 3.535228361409015e-07, - "loss": 0.2644, + "epoch": 4.857281868310087, + "grad_norm": 0.27293217182159424, + "learning_rate": 1.0930054261860668e-07, + "loss": 0.3667, "step": 134775 }, { - "epoch": 4.74, - "learning_rate": 3.5304561759007813e-07, - "loss": 0.2172, + "epoch": 4.857462067971312, + "grad_norm": 0.24722565710544586, + "learning_rate": 1.0902813711899107e-07, + "loss": 0.4101, "step": 134780 }, { - "epoch": 4.74, - "learning_rate": 3.525687190637117e-07, - "loss": 0.2304, + "epoch": 4.857642267632537, + "grad_norm": 0.3114294707775116, + "learning_rate": 1.0875607075404582e-07, + "loss": 0.3313, "step": 134785 }, { - "epoch": 4.74, - "learning_rate": 3.5209214056799997e-07, - "loss": 0.2548, + "epoch": 4.8578224672937615, + "grad_norm": 0.3135533332824707, + "learning_rate": 1.0848434352747905e-07, + "loss": 0.3961, "step": 134790 }, { - "epoch": 4.74, - "learning_rate": 3.5161588210912697e-07, - "loss": 0.2445, + "epoch": 4.858002666954986, + "grad_norm": 0.30858752131462097, + "learning_rate": 1.0821295544299337e-07, + "loss": 0.4051, "step": 134795 }, { - "epoch": 4.74, - "learning_rate": 3.511399436932766e-07, - "loss": 0.2448, + "epoch": 4.858182866616211, + "grad_norm": 0.25825393199920654, + "learning_rate": 1.0794190650429137e-07, + "loss": 0.3879, "step": 134800 }, { - "epoch": 4.74, - "learning_rate": 3.5066432532663005e-07, - "loss": 0.2474, + "epoch": 4.858363066277436, + "grad_norm": 0.31138578057289124, + "learning_rate": 1.0767119671505899e-07, + "loss": 0.3673, "step": 134805 }, { - "epoch": 4.74, - "learning_rate": 3.5018902701536296e-07, - "loss": 0.2618, + "epoch": 4.8585432659386605, + "grad_norm": 0.21300721168518066, + "learning_rate": 1.0740082607898494e-07, + "loss": 0.3528, "step": 134810 }, { - "epoch": 4.74, - "learning_rate": 3.4971404876564806e-07, - "loss": 0.2629, + "epoch": 4.858723465599884, + "grad_norm": 0.23539184033870697, + "learning_rate": 1.0713079459975795e-07, + "loss": 0.3364, "step": 134815 }, { - "epoch": 4.74, - "learning_rate": 3.4923939058364994e-07, - "loss": 0.2302, + "epoch": 4.858903665261109, + "grad_norm": 0.2637368440628052, + "learning_rate": 1.0686110228105284e-07, + "loss": 0.3895, "step": 134820 }, { - "epoch": 4.74, - "learning_rate": 3.4876505247553315e-07, - "loss": 0.237, + "epoch": 4.859083864922334, + "grad_norm": 0.24945858120918274, + "learning_rate": 1.0659174912655001e-07, + "loss": 0.3608, "step": 134825 }, { - "epoch": 4.74, - "learning_rate": 3.4829103444745383e-07, - "loss": 0.242, + "epoch": 4.8592640645835585, + "grad_norm": 0.2507854402065277, + "learning_rate": 1.0632273513991042e-07, + "loss": 0.3788, "step": 134830 }, { - "epoch": 4.74, - "learning_rate": 3.4781733650557366e-07, - "loss": 0.2565, + "epoch": 4.859444264244783, + "grad_norm": 0.2690359950065613, + "learning_rate": 1.0605406032480614e-07, + "loss": 0.3606, "step": 134835 }, { - "epoch": 4.74, - "learning_rate": 3.4734395865603787e-07, - "loss": 0.2636, + "epoch": 4.859624463906008, + "grad_norm": 0.2506677210330963, + "learning_rate": 1.0578572468489534e-07, + "loss": 0.4075, "step": 134840 }, { - "epoch": 4.74, - "learning_rate": 3.468709009049914e-07, - "loss": 0.2631, + "epoch": 4.859804663567233, + "grad_norm": 0.2608148455619812, + "learning_rate": 
1.0551772822383343e-07, + "loss": 0.336, "step": 134845 }, { - "epoch": 4.74, - "learning_rate": 3.463981632585822e-07, - "loss": 0.2821, + "epoch": 4.8599848632284575, + "grad_norm": 0.26057958602905273, + "learning_rate": 1.0525007094527305e-07, + "loss": 0.3707, "step": 134850 }, { - "epoch": 4.74, - "learning_rate": 3.459257457229442e-07, - "loss": 0.2558, + "epoch": 4.860165062889681, + "grad_norm": 0.3034597337245941, + "learning_rate": 1.049827528528613e-07, + "loss": 0.3569, "step": 134855 }, { - "epoch": 4.74, - "learning_rate": 3.454536483042142e-07, - "loss": 0.2601, + "epoch": 4.860345262550906, + "grad_norm": 0.25797319412231445, + "learning_rate": 1.047157739502369e-07, + "loss": 0.3628, "step": 134860 }, { - "epoch": 4.74, - "learning_rate": 3.449818710085179e-07, - "loss": 0.26, + "epoch": 4.860525462212131, + "grad_norm": 0.2869213819503784, + "learning_rate": 1.0444913424104418e-07, + "loss": 0.4191, "step": 134865 }, { - "epoch": 4.75, - "learning_rate": 3.4451041384198635e-07, - "loss": 0.2316, + "epoch": 4.8607056618733555, + "grad_norm": 0.2047075480222702, + "learning_rate": 1.041828337289108e-07, + "loss": 0.3554, "step": 134870 }, { - "epoch": 4.75, - "learning_rate": 3.440392768107342e-07, - "loss": 0.254, + "epoch": 4.86088586153458, + "grad_norm": 0.2842639088630676, + "learning_rate": 1.0391687241746717e-07, + "loss": 0.3925, "step": 134875 }, { - "epoch": 4.75, - "learning_rate": 3.4356845992088715e-07, - "loss": 0.2416, + "epoch": 4.861066061195805, + "grad_norm": 0.2997124493122101, + "learning_rate": 1.036512503103354e-07, + "loss": 0.3753, "step": 134880 }, { - "epoch": 4.75, - "learning_rate": 3.430979631785514e-07, - "loss": 0.2333, + "epoch": 4.86124626085703, + "grad_norm": 0.23771364986896515, + "learning_rate": 1.0338596741113204e-07, + "loss": 0.3682, "step": 134885 }, { - "epoch": 4.75, - "learning_rate": 3.4262778658984416e-07, - "loss": 0.2421, + "epoch": 4.8614264605182544, + "grad_norm": 0.2713213264942169, + "learning_rate": 1.0312102372347921e-07, + "loss": 0.3753, "step": 134890 }, { - "epoch": 4.75, - "learning_rate": 3.421579301608607e-07, - "loss": 0.2682, + "epoch": 4.861606660179479, + "grad_norm": 0.28782111406326294, + "learning_rate": 1.0285641925097956e-07, + "loss": 0.3416, "step": 134895 }, { - "epoch": 4.75, - "learning_rate": 3.4168839389770725e-07, - "loss": 0.2293, + "epoch": 4.861786859840704, + "grad_norm": 0.21895797550678253, + "learning_rate": 1.0259215399724132e-07, + "loss": 0.3613, "step": 134900 }, { - "epoch": 4.75, - "learning_rate": 3.4121917780648163e-07, - "loss": 0.2583, + "epoch": 4.861967059501928, + "grad_norm": 0.21650052070617676, + "learning_rate": 1.0232822796586716e-07, + "loss": 0.3367, "step": 134905 }, { - "epoch": 4.75, - "learning_rate": 3.407502818932734e-07, - "loss": 0.2395, + "epoch": 4.8621472591631525, + "grad_norm": 0.22131061553955078, + "learning_rate": 1.0206464116044868e-07, + "loss": 0.4108, "step": 134910 }, { - "epoch": 4.75, - "learning_rate": 3.402817061641722e-07, - "loss": 0.2444, + "epoch": 4.862327458824377, + "grad_norm": 0.2649054527282715, + "learning_rate": 1.0180139358458018e-07, + "loss": 0.3535, "step": 134915 }, { - "epoch": 4.75, - "learning_rate": 3.398134506252593e-07, - "loss": 0.2272, + "epoch": 4.862507658485602, + "grad_norm": 0.1987181305885315, + "learning_rate": 1.0153848524184494e-07, + "loss": 0.4098, "step": 134920 }, { - "epoch": 4.75, - "learning_rate": 3.393455152826186e-07, - "loss": 0.2682, + "epoch": 4.862687858146827, + "grad_norm": 
0.22203120589256287, + "learning_rate": 1.0127591613582899e-07, + "loss": 0.369, "step": 134925 }, { - "epoch": 4.75, - "learning_rate": 3.388779001423259e-07, - "loss": 0.2648, + "epoch": 4.862868057808051, + "grad_norm": 0.28719407320022583, + "learning_rate": 1.010136862701072e-07, + "loss": 0.3782, "step": 134930 }, { - "epoch": 4.75, - "learning_rate": 3.3841060521044856e-07, - "loss": 0.2421, + "epoch": 4.863048257469276, + "grad_norm": 0.2688189148902893, + "learning_rate": 1.0075179564825455e-07, + "loss": 0.3805, "step": 134935 }, { - "epoch": 4.75, - "learning_rate": 3.3794363049305667e-07, - "loss": 0.2558, + "epoch": 4.863228457130501, + "grad_norm": 0.22982627153396606, + "learning_rate": 1.0049024427383758e-07, + "loss": 0.3613, "step": 134940 }, { - "epoch": 4.75, - "learning_rate": 3.3747697599621764e-07, - "loss": 0.2466, + "epoch": 4.863408656791725, + "grad_norm": 0.18782635033130646, + "learning_rate": 1.0022903215042012e-07, + "loss": 0.3668, "step": 134945 }, { - "epoch": 4.75, - "learning_rate": 3.3701064172598497e-07, - "loss": 0.2599, + "epoch": 4.8635888564529495, + "grad_norm": 0.26904863119125366, + "learning_rate": 9.996815928156044e-08, + "loss": 0.371, "step": 134950 }, { - "epoch": 4.75, - "learning_rate": 3.365446276884149e-07, - "loss": 0.2467, + "epoch": 4.863769056114174, + "grad_norm": 0.29253089427948, + "learning_rate": 9.970762567081404e-08, + "loss": 0.3492, "step": 134955 }, { - "epoch": 4.75, - "learning_rate": 3.3607893388955815e-07, - "loss": 0.2531, + "epoch": 4.863949255775399, + "grad_norm": 0.25840044021606445, + "learning_rate": 9.944743132173085e-08, + "loss": 0.3703, "step": 134960 }, { - "epoch": 4.75, - "learning_rate": 3.3561356033546266e-07, - "loss": 0.2548, + "epoch": 4.864129455436624, + "grad_norm": 0.27990660071372986, + "learning_rate": 9.918757623785802e-08, + "loss": 0.3802, "step": 134965 }, { - "epoch": 4.75, - "learning_rate": 3.351485070321736e-07, - "loss": 0.2526, + "epoch": 4.864309655097848, + "grad_norm": 0.24384404718875885, + "learning_rate": 9.892806042273162e-08, + "loss": 0.3459, "step": 134970 }, { - "epoch": 4.75, - "learning_rate": 3.34683773985725e-07, - "loss": 0.2468, + "epoch": 4.864489854759073, + "grad_norm": 0.189422607421875, + "learning_rate": 9.866888387988771e-08, + "loss": 0.3627, "step": 134975 }, { - "epoch": 4.75, - "learning_rate": 3.34219361202151e-07, - "loss": 0.232, + "epoch": 4.864670054420298, + "grad_norm": 0.2541491985321045, + "learning_rate": 9.841004661285957e-08, + "loss": 0.3354, "step": 134980 }, { - "epoch": 4.75, - "learning_rate": 3.3375526868748284e-07, - "loss": 0.2708, + "epoch": 4.864850254081523, + "grad_norm": 0.24088044464588165, + "learning_rate": 9.815154862517495e-08, + "loss": 0.3643, "step": 134985 }, { - "epoch": 4.75, - "learning_rate": 3.3329149644774904e-07, - "loss": 0.2592, + "epoch": 4.865030453742747, + "grad_norm": 0.19148464500904083, + "learning_rate": 9.789338992035324e-08, + "loss": 0.3547, "step": 134990 }, { - "epoch": 4.75, - "learning_rate": 3.3282804448896697e-07, - "loss": 0.2699, + "epoch": 4.865210653403972, + "grad_norm": 0.2473505735397339, + "learning_rate": 9.763557050191385e-08, + "loss": 0.3399, "step": 134995 }, { - "epoch": 4.75, - "learning_rate": 3.3236491281715685e-07, - "loss": 0.2617, + "epoch": 4.865390853065196, + "grad_norm": 0.2551048696041107, + "learning_rate": 9.737809037336787e-08, + "loss": 0.3505, "step": 135000 }, { - "epoch": 4.75, - "eval_loss": 0.24845683574676514, - "eval_runtime": 10.5389, - 
"eval_samples_per_second": 9.489, - "eval_steps_per_second": 9.489, + "epoch": 4.865390853065196, + "eval_loss": 0.42877399921417236, + "eval_runtime": 3.5351, + "eval_samples_per_second": 28.288, + "eval_steps_per_second": 7.072, "step": 135000 }, { - "epoch": 4.75, - "learning_rate": 3.3190210143833336e-07, - "loss": 0.235, + "epoch": 4.865571052726421, + "grad_norm": 0.24867664277553558, + "learning_rate": 9.71209495382236e-08, + "loss": 0.3733, "step": 135005 }, { - "epoch": 4.75, - "learning_rate": 3.3143961035850545e-07, - "loss": 0.2608, + "epoch": 4.865751252387645, + "grad_norm": 0.33623263239860535, + "learning_rate": 9.686414799998656e-08, + "loss": 0.4158, "step": 135010 }, { - "epoch": 4.75, - "learning_rate": 3.30977439583674e-07, - "loss": 0.2415, + "epoch": 4.86593145204887, + "grad_norm": 0.301376074552536, + "learning_rate": 9.660768576215396e-08, + "loss": 0.3439, "step": 135015 }, { - "epoch": 4.75, - "learning_rate": 3.3051558911984247e-07, - "loss": 0.2501, + "epoch": 4.866111651710095, + "grad_norm": 0.3229968845844269, + "learning_rate": 9.635156282822023e-08, + "loss": 0.3624, "step": 135020 }, { - "epoch": 4.75, - "learning_rate": 3.3005405897301165e-07, - "loss": 0.2308, + "epoch": 4.86629185137132, + "grad_norm": 0.2948048710823059, + "learning_rate": 9.609577920167422e-08, + "loss": 0.3791, "step": 135025 }, { - "epoch": 4.75, - "learning_rate": 3.295928491491684e-07, - "loss": 0.2542, + "epoch": 4.866472051032544, + "grad_norm": 0.26417121291160583, + "learning_rate": 9.584033488600208e-08, + "loss": 0.357, "step": 135030 }, { - "epoch": 4.75, - "learning_rate": 3.2913195965430245e-07, - "loss": 0.2402, + "epoch": 4.866652250693769, + "grad_norm": 0.234276682138443, + "learning_rate": 9.558522988468432e-08, + "loss": 0.3794, "step": 135035 }, { - "epoch": 4.75, - "learning_rate": 3.2867139049440334e-07, - "loss": 0.246, + "epoch": 4.866832450354993, + "grad_norm": 0.22871927917003632, + "learning_rate": 9.533046420119318e-08, + "loss": 0.3673, "step": 135040 }, { - "epoch": 4.75, - "learning_rate": 3.282111416754441e-07, - "loss": 0.2597, + "epoch": 4.867012650016218, + "grad_norm": 0.2720992863178253, + "learning_rate": 9.507603783900366e-08, + "loss": 0.3691, "step": 135045 }, { - "epoch": 4.75, - "learning_rate": 3.277512132034033e-07, - "loss": 0.2654, + "epoch": 4.867192849677442, + "grad_norm": 0.187312513589859, + "learning_rate": 9.482195080158517e-08, + "loss": 0.3576, "step": 135050 }, { - "epoch": 4.75, - "learning_rate": 3.2729160508425396e-07, - "loss": 0.2464, + "epoch": 4.867373049338667, + "grad_norm": 0.2163442075252533, + "learning_rate": 9.456820309239333e-08, + "loss": 0.329, "step": 135055 }, { - "epoch": 4.75, - "learning_rate": 3.268323173239635e-07, - "loss": 0.2554, + "epoch": 4.867553248999892, + "grad_norm": 0.2720244526863098, + "learning_rate": 9.431479471488647e-08, + "loss": 0.364, "step": 135060 }, { - "epoch": 4.75, - "learning_rate": 3.2637334992849656e-07, - "loss": 0.2478, + "epoch": 4.867733448661117, + "grad_norm": 0.23891782760620117, + "learning_rate": 9.406172567252015e-08, + "loss": 0.3779, "step": 135065 }, { - "epoch": 4.75, - "learning_rate": 3.259147029038123e-07, - "loss": 0.2364, + "epoch": 4.867913648322341, + "grad_norm": 0.21805892884731293, + "learning_rate": 9.380899596873882e-08, + "loss": 0.3617, "step": 135070 }, { - "epoch": 4.75, - "learning_rate": 3.2545637625586154e-07, - "loss": 0.2548, + "epoch": 4.868093847983566, + "grad_norm": 0.23532462120056152, + "learning_rate": 9.355660560699254e-08, + 
"loss": 0.3854, "step": 135075 }, { - "epoch": 4.75, - "learning_rate": 3.2499836999059786e-07, - "loss": 0.2601, + "epoch": 4.868274047644791, + "grad_norm": 0.3134779632091522, + "learning_rate": 9.330455459071185e-08, + "loss": 0.3714, "step": 135080 }, { - "epoch": 4.75, - "learning_rate": 3.24540684113972e-07, - "loss": 0.2602, + "epoch": 4.8684542473060155, + "grad_norm": 0.24097296595573425, + "learning_rate": 9.305284292333572e-08, + "loss": 0.3793, "step": 135085 }, { - "epoch": 4.75, - "learning_rate": 3.2408331863192363e-07, - "loss": 0.2577, + "epoch": 4.868634446967239, + "grad_norm": 0.24849741160869598, + "learning_rate": 9.280147060829192e-08, + "loss": 0.3469, "step": 135090 }, { - "epoch": 4.75, - "learning_rate": 3.2362627355038975e-07, - "loss": 0.241, + "epoch": 4.868814646628464, + "grad_norm": 0.23304787278175354, + "learning_rate": 9.25504376490055e-08, + "loss": 0.3691, "step": 135095 }, { - "epoch": 4.75, - "learning_rate": 3.231695488753045e-07, - "loss": 0.2389, + "epoch": 4.868994846289689, + "grad_norm": 0.2561473548412323, + "learning_rate": 9.229974404889875e-08, + "loss": 0.3874, "step": 135100 }, { - "epoch": 4.75, - "learning_rate": 3.2271314461260195e-07, - "loss": 0.2322, + "epoch": 4.8691750459509135, + "grad_norm": 0.23009338974952698, + "learning_rate": 9.204938981138277e-08, + "loss": 0.3618, "step": 135105 }, { - "epoch": 4.75, - "learning_rate": 3.2225706076820796e-07, - "loss": 0.2624, + "epoch": 4.869355245612138, + "grad_norm": 0.22427034378051758, + "learning_rate": 9.179937493987434e-08, + "loss": 0.4131, "step": 135110 }, { - "epoch": 4.75, - "learning_rate": 3.2180129734804e-07, - "loss": 0.2631, + "epoch": 4.869535445273363, + "grad_norm": 0.30147793889045715, + "learning_rate": 9.154969943777625e-08, + "loss": 0.4054, "step": 135115 }, { - "epoch": 4.75, - "learning_rate": 3.2134585435802113e-07, - "loss": 0.2308, + "epoch": 4.869715644934588, + "grad_norm": 0.2412918359041214, + "learning_rate": 9.130036330849134e-08, + "loss": 0.3642, "step": 135120 }, { - "epoch": 4.75, - "learning_rate": 3.208907318040605e-07, - "loss": 0.2534, + "epoch": 4.8698958445958125, + "grad_norm": 0.2309122383594513, + "learning_rate": 9.105136655541691e-08, + "loss": 0.3581, "step": 135125 }, { - "epoch": 4.75, - "learning_rate": 3.2043592969207005e-07, - "loss": 0.22, + "epoch": 4.870076044257036, + "grad_norm": 0.29338333010673523, + "learning_rate": 9.080270918194466e-08, + "loss": 0.3669, "step": 135130 }, { - "epoch": 4.75, - "learning_rate": 3.1998144802795616e-07, - "loss": 0.2446, + "epoch": 4.870256243918261, + "grad_norm": 0.2538418471813202, + "learning_rate": 9.05543911914608e-08, + "loss": 0.3738, "step": 135135 }, { - "epoch": 4.75, - "learning_rate": 3.195272868176169e-07, - "loss": 0.2606, + "epoch": 4.870436443579486, + "grad_norm": 0.2361038327217102, + "learning_rate": 9.03064125873515e-08, + "loss": 0.3925, "step": 135140 }, { - "epoch": 4.75, - "learning_rate": 3.190734460669531e-07, - "loss": 0.2732, + "epoch": 4.8706166432407105, + "grad_norm": 0.27490556240081787, + "learning_rate": 9.00587733729974e-08, + "loss": 0.3542, "step": 135145 }, { - "epoch": 4.75, - "learning_rate": 3.186199257818517e-07, - "loss": 0.2327, + "epoch": 4.870796842901935, + "grad_norm": 0.21929150819778442, + "learning_rate": 8.9811473551768e-08, + "loss": 0.35, "step": 135150 }, { - "epoch": 4.76, - "learning_rate": 3.1816672596820527e-07, - "loss": 0.2567, + "epoch": 4.87097704256316, + "grad_norm": 0.234662726521492, + "learning_rate": 
8.956451312703562e-08, + "loss": 0.3558, "step": 135155 }, { - "epoch": 4.76, - "learning_rate": 3.1771384663190075e-07, - "loss": 0.2659, + "epoch": 4.871157242224385, + "grad_norm": 0.2809460759162903, + "learning_rate": 8.931789210216146e-08, + "loss": 0.4014, "step": 135160 }, { - "epoch": 4.76, - "learning_rate": 3.172612877788139e-07, - "loss": 0.2662, + "epoch": 4.8713374418856095, + "grad_norm": 0.2753678560256958, + "learning_rate": 8.907161048050949e-08, + "loss": 0.3782, "step": 135165 }, { - "epoch": 4.76, - "learning_rate": 3.168090494148207e-07, - "loss": 0.2372, + "epoch": 4.871517641546834, + "grad_norm": 0.3030547499656677, + "learning_rate": 8.882566826543259e-08, + "loss": 0.4204, "step": 135170 }, { - "epoch": 4.76, - "learning_rate": 3.163571315457997e-07, - "loss": 0.268, + "epoch": 4.871697841208059, + "grad_norm": 0.2812124788761139, + "learning_rate": 8.858006546028641e-08, + "loss": 0.3582, "step": 135175 }, { - "epoch": 4.76, - "learning_rate": 3.1590553417761294e-07, - "loss": 0.2421, + "epoch": 4.871878040869283, + "grad_norm": 0.24554339051246643, + "learning_rate": 8.833480206840994e-08, + "loss": 0.3686, "step": 135180 }, { - "epoch": 4.76, - "learning_rate": 3.154542573161251e-07, - "loss": 0.2636, + "epoch": 4.8720582405305075, + "grad_norm": 0.24936611950397491, + "learning_rate": 8.80898780931505e-08, + "loss": 0.3608, "step": 135185 }, { - "epoch": 4.76, - "learning_rate": 3.150033009671927e-07, - "loss": 0.2537, + "epoch": 4.872238440191732, + "grad_norm": 0.26903268694877625, + "learning_rate": 8.784529353784432e-08, + "loss": 0.3633, "step": 135190 }, { - "epoch": 4.76, - "learning_rate": 3.145526651366776e-07, - "loss": 0.2599, + "epoch": 4.872418639852957, + "grad_norm": 0.2582688629627228, + "learning_rate": 8.760104840582207e-08, + "loss": 0.3629, "step": 135195 }, { - "epoch": 4.76, - "learning_rate": 3.1410234983042796e-07, - "loss": 0.2666, + "epoch": 4.872598839514182, + "grad_norm": 0.24837349355220795, + "learning_rate": 8.735714270041162e-08, + "loss": 0.3739, "step": 135200 }, { - "epoch": 4.76, - "learning_rate": 3.136523550542891e-07, - "loss": 0.2494, + "epoch": 4.8727790391754064, + "grad_norm": 0.2944589853286743, + "learning_rate": 8.711357642493812e-08, + "loss": 0.3869, "step": 135205 }, { - "epoch": 4.76, - "learning_rate": 3.1320268081410365e-07, - "loss": 0.2346, + "epoch": 4.872959238836631, + "grad_norm": 0.2535194456577301, + "learning_rate": 8.687034958271833e-08, + "loss": 0.3885, "step": 135210 }, { - "epoch": 4.76, - "learning_rate": 3.127533271157168e-07, - "loss": 0.2193, + "epoch": 4.873139438497856, + "grad_norm": 0.2899385392665863, + "learning_rate": 8.66274621770663e-08, + "loss": 0.3827, "step": 135215 }, { - "epoch": 4.76, - "learning_rate": 3.123042939649573e-07, - "loss": 0.2747, + "epoch": 4.873319638159081, + "grad_norm": 0.23525474965572357, + "learning_rate": 8.638491421129325e-08, + "loss": 0.4044, "step": 135220 }, { - "epoch": 4.76, - "learning_rate": 3.1185558136765656e-07, - "loss": 0.2646, + "epoch": 4.8734998378203045, + "grad_norm": 0.2679741084575653, + "learning_rate": 8.614270568869931e-08, + "loss": 0.3682, "step": 135225 }, { - "epoch": 4.76, - "learning_rate": 3.114071893296378e-07, - "loss": 0.2353, + "epoch": 4.873680037481529, + "grad_norm": 0.2726483643054962, + "learning_rate": 8.590083661259019e-08, + "loss": 0.4052, "step": 135230 }, { - "epoch": 4.76, - "learning_rate": 3.1095911785672946e-07, - "loss": 0.2685, + "epoch": 4.873860237142754, + "grad_norm": 0.2760205864906311, + 
"learning_rate": 8.565930698625769e-08, + "loss": 0.3704, "step": 135235 }, { - "epoch": 4.76, - "learning_rate": 3.105113669547466e-07, - "loss": 0.2474, + "epoch": 4.874040436803979, + "grad_norm": 0.21959510445594788, + "learning_rate": 8.54181168129936e-08, + "loss": 0.3618, "step": 135240 }, { - "epoch": 4.76, - "learning_rate": 3.1006393662950106e-07, - "loss": 0.2549, + "epoch": 4.874220636465203, + "grad_norm": 0.2475598305463791, + "learning_rate": 8.517726609608424e-08, + "loss": 0.3569, "step": 135245 }, { - "epoch": 4.76, - "learning_rate": 3.096168268868049e-07, - "loss": 0.2491, + "epoch": 4.874400836126428, + "grad_norm": 0.3407549560070038, + "learning_rate": 8.493675483881303e-08, + "loss": 0.3741, "step": 135250 }, { - "epoch": 4.76, - "learning_rate": 3.091700377324647e-07, - "loss": 0.242, + "epoch": 4.874581035787653, + "grad_norm": 0.2598182260990143, + "learning_rate": 8.46965830444496e-08, + "loss": 0.373, "step": 135255 }, { - "epoch": 4.76, - "learning_rate": 3.0872356917227565e-07, - "loss": 0.2686, + "epoch": 4.874761235448878, + "grad_norm": 0.20785191655158997, + "learning_rate": 8.445675071627468e-08, + "loss": 0.3365, "step": 135260 }, { - "epoch": 4.76, - "learning_rate": 3.082774212120415e-07, - "loss": 0.2521, + "epoch": 4.874941435110102, + "grad_norm": 0.33232206106185913, + "learning_rate": 8.421725785755508e-08, + "loss": 0.3491, "step": 135265 }, { - "epoch": 4.76, - "learning_rate": 3.078315938575521e-07, - "loss": 0.2656, + "epoch": 4.875121634771327, + "grad_norm": 0.26161181926727295, + "learning_rate": 8.397810447154653e-08, + "loss": 0.3811, "step": 135270 }, { - "epoch": 4.76, - "learning_rate": 3.073860871145945e-07, - "loss": 0.2623, + "epoch": 4.875301834432551, + "grad_norm": 0.2408609241247177, + "learning_rate": 8.373929056151586e-08, + "loss": 0.3769, "step": 135275 }, { - "epoch": 4.76, - "learning_rate": 3.0694090098895846e-07, - "loss": 0.2488, + "epoch": 4.875482034093776, + "grad_norm": 0.23764941096305847, + "learning_rate": 8.350081613071326e-08, + "loss": 0.3892, "step": 135280 }, { - "epoch": 4.76, - "learning_rate": 3.064960354864199e-07, - "loss": 0.2451, + "epoch": 4.875662233755, + "grad_norm": 0.2595727741718292, + "learning_rate": 8.326268118238612e-08, + "loss": 0.3808, "step": 135285 }, { - "epoch": 4.76, - "learning_rate": 3.0605149061275763e-07, - "loss": 0.2686, + "epoch": 4.875842433416225, + "grad_norm": 0.25060775876045227, + "learning_rate": 8.302488571978185e-08, + "loss": 0.3857, "step": 135290 }, { - "epoch": 4.76, - "learning_rate": 3.056072663737447e-07, - "loss": 0.2509, + "epoch": 4.87602263307745, + "grad_norm": 0.26845234632492065, + "learning_rate": 8.278742974613951e-08, + "loss": 0.4005, "step": 135295 }, { - "epoch": 4.76, - "learning_rate": 3.0516336277514323e-07, - "loss": 0.2616, + "epoch": 4.876202832738675, + "grad_norm": 0.2484569400548935, + "learning_rate": 8.255031326469542e-08, + "loss": 0.3708, "step": 135300 }, { - "epoch": 4.76, - "learning_rate": 3.047197798227236e-07, - "loss": 0.257, + "epoch": 4.876383032399899, + "grad_norm": 0.22691169381141663, + "learning_rate": 8.231353627867755e-08, + "loss": 0.379, "step": 135305 }, { - "epoch": 4.76, - "learning_rate": 3.042765175222423e-07, - "loss": 0.2525, + "epoch": 4.876563232061124, + "grad_norm": 0.28611308336257935, + "learning_rate": 8.207709879131386e-08, + "loss": 0.3717, "step": 135310 }, { - "epoch": 4.76, - "learning_rate": 3.0383357587945307e-07, - "loss": 0.2538, + "epoch": 4.876743431722348, + "grad_norm": 
0.21973417699337006, + "learning_rate": 8.184100080582679e-08, + "loss": 0.3596, "step": 135315 }, { - "epoch": 4.76, - "learning_rate": 3.0339095490011236e-07, - "loss": 0.2413, + "epoch": 4.876923631383573, + "grad_norm": 0.3248337209224701, + "learning_rate": 8.160524232543043e-08, + "loss": 0.3808, "step": 135320 }, { - "epoch": 4.76, - "learning_rate": 3.0294865458996293e-07, - "loss": 0.2461, + "epoch": 4.877103831044797, + "grad_norm": 0.29214221239089966, + "learning_rate": 8.136982335333887e-08, + "loss": 0.3813, "step": 135325 }, { - "epoch": 4.76, - "learning_rate": 3.0250667495475005e-07, - "loss": 0.2333, + "epoch": 4.877284030706022, + "grad_norm": 0.26933974027633667, + "learning_rate": 8.113474389275788e-08, + "loss": 0.3874, "step": 135330 }, { - "epoch": 4.76, - "learning_rate": 3.020650160002109e-07, - "loss": 0.2433, + "epoch": 4.877464230367247, + "grad_norm": 0.2623523473739624, + "learning_rate": 8.090000394689324e-08, + "loss": 0.4208, "step": 135335 }, { - "epoch": 4.76, - "learning_rate": 3.0162367773208244e-07, - "loss": 0.236, + "epoch": 4.877644430028472, + "grad_norm": 0.2307843416929245, + "learning_rate": 8.06656035189396e-08, + "loss": 0.3574, "step": 135340 }, { - "epoch": 4.76, - "learning_rate": 3.011826601560935e-07, - "loss": 0.262, + "epoch": 4.877824629689696, + "grad_norm": 0.26475924253463745, + "learning_rate": 8.04315426120944e-08, + "loss": 0.3531, "step": 135345 }, { - "epoch": 4.76, - "learning_rate": 3.007419632779701e-07, - "loss": 0.2551, + "epoch": 4.878004829350921, + "grad_norm": 0.2883531153202057, + "learning_rate": 8.0197821229544e-08, + "loss": 0.3882, "step": 135350 }, { - "epoch": 4.76, - "learning_rate": 3.0030158710343537e-07, - "loss": 0.2543, + "epoch": 4.878185029012146, + "grad_norm": 0.2579416036605835, + "learning_rate": 7.996443937447196e-08, + "loss": 0.3979, "step": 135355 }, { - "epoch": 4.76, - "learning_rate": 2.9986153163820696e-07, - "loss": 0.2461, + "epoch": 4.8783652286733705, + "grad_norm": 0.21452683210372925, + "learning_rate": 7.973139705006183e-08, + "loss": 0.3527, "step": 135360 }, { - "epoch": 4.76, - "learning_rate": 2.994217968879942e-07, - "loss": 0.2425, + "epoch": 4.878545428334594, + "grad_norm": 0.2238248735666275, + "learning_rate": 7.949869425948609e-08, + "loss": 0.3585, "step": 135365 }, { - "epoch": 4.76, - "learning_rate": 2.9898238285851756e-07, - "loss": 0.2627, + "epoch": 4.878725627995819, + "grad_norm": 0.2821867763996124, + "learning_rate": 7.926633100591719e-08, + "loss": 0.3877, "step": 135370 }, { - "epoch": 4.76, - "learning_rate": 2.985432895554724e-07, - "loss": 0.2568, + "epoch": 4.878905827657044, + "grad_norm": 0.2567353844642639, + "learning_rate": 7.903430729251926e-08, + "loss": 0.3633, "step": 135375 }, { - "epoch": 4.76, - "learning_rate": 2.9810451698456254e-07, - "loss": 0.2551, + "epoch": 4.879086027318269, + "grad_norm": 0.26188910007476807, + "learning_rate": 7.880262312245368e-08, + "loss": 0.3812, "step": 135380 }, { - "epoch": 4.76, - "learning_rate": 2.97666065151489e-07, - "loss": 0.2592, + "epoch": 4.879266226979493, + "grad_norm": 0.2313767969608307, + "learning_rate": 7.857127849887624e-08, + "loss": 0.3708, "step": 135385 }, { - "epoch": 4.76, - "learning_rate": 2.972279340619416e-07, - "loss": 0.2342, + "epoch": 4.879446426640718, + "grad_norm": 0.318751722574234, + "learning_rate": 7.834027342494277e-08, + "loss": 0.3856, "step": 135390 }, { - "epoch": 4.76, - "learning_rate": 2.967901237216103e-07, - "loss": 0.2552, + "epoch": 4.879626626301943, + 
"grad_norm": 0.2602209150791168, + "learning_rate": 7.810960790379517e-08, + "loss": 0.3871, "step": 135395 }, { - "epoch": 4.76, - "learning_rate": 2.9635263413617664e-07, - "loss": 0.2617, + "epoch": 4.8798068259631675, + "grad_norm": 0.2891538441181183, + "learning_rate": 7.787928193858096e-08, + "loss": 0.3592, "step": 135400 }, { - "epoch": 4.76, - "learning_rate": 2.9591546531132773e-07, - "loss": 0.2556, + "epoch": 4.879987025624391, + "grad_norm": 0.30516698956489563, + "learning_rate": 7.76492955324365e-08, + "loss": 0.3672, "step": 135405 }, { - "epoch": 4.76, - "learning_rate": 2.9547861725273405e-07, - "loss": 0.2481, + "epoch": 4.880167225285616, + "grad_norm": 0.2555546462535858, + "learning_rate": 7.741964868849539e-08, + "loss": 0.3664, "step": 135410 }, { - "epoch": 4.76, - "learning_rate": 2.9504208996606885e-07, - "loss": 0.281, + "epoch": 4.880347424946841, + "grad_norm": 0.25104543566703796, + "learning_rate": 7.719034140988569e-08, + "loss": 0.3555, "step": 135415 }, { - "epoch": 4.76, - "learning_rate": 2.946058834569998e-07, - "loss": 0.2579, + "epoch": 4.8805276246080656, + "grad_norm": 0.2774941027164459, + "learning_rate": 7.696137369973266e-08, + "loss": 0.3659, "step": 135420 }, { - "epoch": 4.76, - "learning_rate": 2.9416999773119456e-07, - "loss": 0.2726, + "epoch": 4.88070782426929, + "grad_norm": 0.31660085916519165, + "learning_rate": 7.673274556115328e-08, + "loss": 0.3489, "step": 135425 }, { - "epoch": 4.76, - "learning_rate": 2.937344327943098e-07, - "loss": 0.2413, + "epoch": 4.880888023930515, + "grad_norm": 0.32428672909736633, + "learning_rate": 7.650445699727005e-08, + "loss": 0.4014, "step": 135430 }, { - "epoch": 4.76, - "learning_rate": 2.9329918865199925e-07, - "loss": 0.2489, + "epoch": 4.88106822359174, + "grad_norm": 0.2850460112094879, + "learning_rate": 7.627650801118325e-08, + "loss": 0.3708, "step": 135435 }, { - "epoch": 4.77, - "learning_rate": 2.928642653099167e-07, - "loss": 0.245, + "epoch": 4.8812484232529645, + "grad_norm": 0.24217717349529266, + "learning_rate": 7.604889860600706e-08, + "loss": 0.3699, "step": 135440 }, { - "epoch": 4.77, - "learning_rate": 2.924296627737133e-07, - "loss": 0.2226, + "epoch": 4.881428622914189, + "grad_norm": 0.2553984224796295, + "learning_rate": 7.582162878483623e-08, + "loss": 0.4042, "step": 135445 }, { - "epoch": 4.77, - "learning_rate": 2.9199538104902335e-07, - "loss": 0.2536, + "epoch": 4.881608822575414, + "grad_norm": 0.23735560476779938, + "learning_rate": 7.559469855077383e-08, + "loss": 0.3997, "step": 135450 }, { - "epoch": 4.77, - "learning_rate": 2.9156142014149237e-07, - "loss": 0.2443, + "epoch": 4.881789022236639, + "grad_norm": 0.2347259223461151, + "learning_rate": 7.53681079069063e-08, + "loss": 0.3606, "step": 135455 }, { - "epoch": 4.77, - "learning_rate": 2.911277800567519e-07, - "loss": 0.2464, + "epoch": 4.8819692218978625, + "grad_norm": 0.24018234014511108, + "learning_rate": 7.51418568563228e-08, + "loss": 0.3564, "step": 135460 }, { - "epoch": 4.77, - "learning_rate": 2.906944608004336e-07, - "loss": 0.2466, + "epoch": 4.882149421559087, + "grad_norm": 0.26346853375434875, + "learning_rate": 7.491594540210423e-08, + "loss": 0.3791, "step": 135465 }, { - "epoch": 4.77, - "learning_rate": 2.9026146237816354e-07, - "loss": 0.2641, + "epoch": 4.882329621220312, + "grad_norm": 0.29124143719673157, + "learning_rate": 7.469037354733144e-08, + "loss": 0.3546, "step": 135470 }, { - "epoch": 4.77, - "learning_rate": 2.89828784795565e-07, - "loss": 0.2644, + "epoch": 
4.882509820881537, + "grad_norm": 0.24946916103363037, + "learning_rate": 7.446514129507698e-08, + "loss": 0.4008, "step": 135475 }, { - "epoch": 4.77, - "learning_rate": 2.8939642805825564e-07, - "loss": 0.271, + "epoch": 4.8826900205427615, + "grad_norm": 0.23744343221187592, + "learning_rate": 7.424024864841061e-08, + "loss": 0.3892, "step": 135480 }, { - "epoch": 4.77, - "learning_rate": 2.889643921718449e-07, - "loss": 0.2541, + "epoch": 4.882870220203986, + "grad_norm": 0.25719282031059265, + "learning_rate": 7.401569561039379e-08, + "loss": 0.3679, "step": 135485 }, { - "epoch": 4.77, - "learning_rate": 2.885326771419505e-07, - "loss": 0.2691, + "epoch": 4.883050419865211, + "grad_norm": 0.27071502804756165, + "learning_rate": 7.379148218408516e-08, + "loss": 0.3677, "step": 135490 }, { - "epoch": 4.77, - "learning_rate": 2.8810128297417083e-07, - "loss": 0.2418, + "epoch": 4.883230619526436, + "grad_norm": 0.28900146484375, + "learning_rate": 7.356760837254617e-08, + "loss": 0.3915, "step": 135495 }, { - "epoch": 4.77, - "learning_rate": 2.876702096741124e-07, - "loss": 0.262, + "epoch": 4.8834108191876595, + "grad_norm": 0.2625620663166046, + "learning_rate": 7.334407417881883e-08, + "loss": 0.389, "step": 135500 }, { - "epoch": 4.77, - "eval_loss": 0.24846675992012024, - "eval_runtime": 10.5434, - "eval_samples_per_second": 9.485, - "eval_steps_per_second": 9.485, + "epoch": 4.8834108191876595, + "eval_loss": 0.4288046956062317, + "eval_runtime": 3.533, + "eval_samples_per_second": 28.305, + "eval_steps_per_second": 7.076, "step": 135500 }, { - "epoch": 4.77, - "learning_rate": 2.87239457247368e-07, - "loss": 0.2446, + "epoch": 4.883591018848884, + "grad_norm": 0.21090403199195862, + "learning_rate": 7.312087960595348e-08, + "loss": 0.3617, "step": 135505 }, { - "epoch": 4.77, - "learning_rate": 2.8680902569953595e-07, - "loss": 0.2705, + "epoch": 4.883771218510109, + "grad_norm": 0.300222784280777, + "learning_rate": 7.289802465698936e-08, + "loss": 0.3612, "step": 135510 }, { - "epoch": 4.77, - "learning_rate": 2.8637891503620064e-07, - "loss": 0.2406, + "epoch": 4.883951418171334, + "grad_norm": 0.2370145618915558, + "learning_rate": 7.267550933496569e-08, + "loss": 0.3661, "step": 135515 }, { - "epoch": 4.77, - "learning_rate": 2.8594912526294925e-07, - "loss": 0.2783, + "epoch": 4.8841316178325584, + "grad_norm": 0.3078131377696991, + "learning_rate": 7.245333364291063e-08, + "loss": 0.3926, "step": 135520 }, { - "epoch": 4.77, - "learning_rate": 2.855196563853607e-07, - "loss": 0.2388, + "epoch": 4.884311817493783, + "grad_norm": 0.30718788504600525, + "learning_rate": 7.223149758385506e-08, + "loss": 0.3866, "step": 135525 }, { - "epoch": 4.77, - "learning_rate": 2.850905084090111e-07, - "loss": 0.2583, + "epoch": 4.884492017155008, + "grad_norm": 0.251179575920105, + "learning_rate": 7.201000116081602e-08, + "loss": 0.3607, "step": 135530 }, { - "epoch": 4.77, - "learning_rate": 2.8466168133947367e-07, - "loss": 0.2553, + "epoch": 4.884672216816233, + "grad_norm": 0.21268831193447113, + "learning_rate": 7.17888443768161e-08, + "loss": 0.3675, "step": 135535 }, { - "epoch": 4.77, - "learning_rate": 2.842331751823135e-07, - "loss": 0.2385, + "epoch": 4.884852416477457, + "grad_norm": 0.22792141139507294, + "learning_rate": 7.156802723486678e-08, + "loss": 0.3899, "step": 135540 }, { - "epoch": 4.77, - "learning_rate": 2.8380498994310113e-07, - "loss": 0.2284, + "epoch": 4.885032616138682, + "grad_norm": 0.2687564790248871, + "learning_rate": 7.134754973797674e-08, + "loss": 
0.3699, "step": 135545 }, { - "epoch": 4.77, - "learning_rate": 2.8337712562739316e-07, - "loss": 0.2567, + "epoch": 4.885212815799906, + "grad_norm": 0.2454073429107666, + "learning_rate": 7.112741188915195e-08, + "loss": 0.3885, "step": 135550 }, { - "epoch": 4.77, - "learning_rate": 2.8294958224074073e-07, - "loss": 0.2624, + "epoch": 4.885393015461131, + "grad_norm": 0.2492527812719345, + "learning_rate": 7.09076136913872e-08, + "loss": 0.3554, "step": 135555 }, { - "epoch": 4.77, - "learning_rate": 2.8252235978870054e-07, - "loss": 0.2405, + "epoch": 4.885573215122355, + "grad_norm": 0.24872152507305145, + "learning_rate": 7.068815514768013e-08, + "loss": 0.3674, "step": 135560 }, { - "epoch": 4.77, - "learning_rate": 2.820954582768209e-07, - "loss": 0.2596, + "epoch": 4.88575341478358, + "grad_norm": 0.3966946005821228, + "learning_rate": 7.046903626101997e-08, + "loss": 0.3429, "step": 135565 }, { - "epoch": 4.77, - "learning_rate": 2.816688777106391e-07, - "loss": 0.2357, + "epoch": 4.885933614444805, + "grad_norm": 0.2777992784976959, + "learning_rate": 7.025025703439325e-08, + "loss": 0.3934, "step": 135570 }, { - "epoch": 4.77, - "learning_rate": 2.8124261809569785e-07, - "loss": 0.2375, + "epoch": 4.88611381410603, + "grad_norm": 0.21319259703159332, + "learning_rate": 7.003181747078091e-08, + "loss": 0.373, "step": 135575 }, { - "epoch": 4.77, - "learning_rate": 2.8081667943752885e-07, - "loss": 0.2561, + "epoch": 4.886294013767254, + "grad_norm": 0.22784800827503204, + "learning_rate": 6.981371757315835e-08, + "loss": 0.3694, "step": 135580 }, { - "epoch": 4.77, - "learning_rate": 2.803910617416666e-07, - "loss": 0.2425, + "epoch": 4.886474213428479, + "grad_norm": 0.3045668601989746, + "learning_rate": 6.959595734449542e-08, + "loss": 0.4188, "step": 135585 }, { - "epoch": 4.77, - "learning_rate": 2.7996576501363445e-07, - "loss": 0.239, + "epoch": 4.886654413089703, + "grad_norm": 0.22848592698574066, + "learning_rate": 6.937853678776474e-08, + "loss": 0.3826, "step": 135590 }, { - "epoch": 4.77, - "learning_rate": 2.795407892589558e-07, - "loss": 0.2516, + "epoch": 4.886834612750928, + "grad_norm": 0.17041075229644775, + "learning_rate": 6.916145590592227e-08, + "loss": 0.3806, "step": 135595 }, { - "epoch": 4.77, - "learning_rate": 2.791161344831511e-07, - "loss": 0.2466, + "epoch": 4.887014812412152, + "grad_norm": 0.3228053152561188, + "learning_rate": 6.894471470192676e-08, + "loss": 0.3686, "step": 135600 }, { - "epoch": 4.77, - "learning_rate": 2.7869180069172997e-07, - "loss": 0.2325, + "epoch": 4.887195012073377, + "grad_norm": 0.26596736907958984, + "learning_rate": 6.872831317873418e-08, + "loss": 0.3763, "step": 135605 }, { - "epoch": 4.77, - "learning_rate": 2.782677878902018e-07, - "loss": 0.2447, + "epoch": 4.887375211734602, + "grad_norm": 0.2538110017776489, + "learning_rate": 6.851225133929217e-08, + "loss": 0.3434, "step": 135610 }, { - "epoch": 4.77, - "learning_rate": 2.778440960840761e-07, - "loss": 0.2456, + "epoch": 4.887555411395827, + "grad_norm": 0.23110675811767578, + "learning_rate": 6.829652918654284e-08, + "loss": 0.3542, "step": 135615 }, { - "epoch": 4.77, - "learning_rate": 2.774207252788541e-07, - "loss": 0.2659, + "epoch": 4.887735611057051, + "grad_norm": 0.24243219196796417, + "learning_rate": 6.808114672342825e-08, + "loss": 0.3663, "step": 135620 }, { - "epoch": 4.77, - "learning_rate": 2.769976754800313e-07, - "loss": 0.2339, + "epoch": 4.887915810718276, + "grad_norm": 0.27579575777053833, + "learning_rate": 6.78661039528794e-08, + 
"loss": 0.3798, "step": 135625 }, { - "epoch": 4.77, - "learning_rate": 2.7657494669309783e-07, - "loss": 0.2383, + "epoch": 4.888096010379501, + "grad_norm": 0.2889721393585205, + "learning_rate": 6.765140087782729e-08, + "loss": 0.3903, "step": 135630 }, { - "epoch": 4.77, - "learning_rate": 2.7615253892354643e-07, - "loss": 0.2633, + "epoch": 4.8882762100407255, + "grad_norm": 0.22426536679267883, + "learning_rate": 6.743703750120011e-08, + "loss": 0.3345, "step": 135635 }, { - "epoch": 4.77, - "learning_rate": 2.757304521768617e-07, - "loss": 0.2628, + "epoch": 4.888456409701949, + "grad_norm": 0.3247759938240051, + "learning_rate": 6.722301382591223e-08, + "loss": 0.4081, "step": 135640 }, { - "epoch": 4.77, - "learning_rate": 2.753086864585197e-07, - "loss": 0.2574, + "epoch": 4.888636609363174, + "grad_norm": 0.24855536222457886, + "learning_rate": 6.700932985488628e-08, + "loss": 0.4023, "step": 135645 }, { - "epoch": 4.77, - "learning_rate": 2.748872417740023e-07, - "loss": 0.2639, + "epoch": 4.888816809024399, + "grad_norm": 0.26974859833717346, + "learning_rate": 6.679598559103107e-08, + "loss": 0.3685, "step": 135650 }, { - "epoch": 4.77, - "learning_rate": 2.7446611812877997e-07, - "loss": 0.2581, + "epoch": 4.888997008685624, + "grad_norm": 0.2807910442352295, + "learning_rate": 6.658298103725258e-08, + "loss": 0.3518, "step": 135655 }, { - "epoch": 4.77, - "learning_rate": 2.740453155283207e-07, - "loss": 0.2548, + "epoch": 4.889177208346848, + "grad_norm": 0.271327406167984, + "learning_rate": 6.637031619645684e-08, + "loss": 0.3636, "step": 135660 }, { - "epoch": 4.77, - "learning_rate": 2.7362483397808667e-07, - "loss": 0.255, + "epoch": 4.889357408008073, + "grad_norm": 0.2790520489215851, + "learning_rate": 6.61579910715332e-08, + "loss": 0.3721, "step": 135665 }, { - "epoch": 4.77, - "learning_rate": 2.732046734835403e-07, - "loss": 0.2429, + "epoch": 4.889537607669298, + "grad_norm": 0.2329198122024536, + "learning_rate": 6.594600566538212e-08, + "loss": 0.356, "step": 135670 }, { - "epoch": 4.77, - "learning_rate": 2.7278483405013545e-07, - "loss": 0.2406, + "epoch": 4.8897178073305225, + "grad_norm": 0.23520605266094208, + "learning_rate": 6.573435998089018e-08, + "loss": 0.3887, "step": 135675 }, { - "epoch": 4.77, - "learning_rate": 2.7236531568332334e-07, - "loss": 0.2339, + "epoch": 4.889898006991746, + "grad_norm": 0.27827075123786926, + "learning_rate": 6.552305402093838e-08, + "loss": 0.3742, "step": 135680 }, { - "epoch": 4.77, - "learning_rate": 2.7194611838855243e-07, - "loss": 0.2244, + "epoch": 4.890078206652971, + "grad_norm": 0.2520049214363098, + "learning_rate": 6.531208778841059e-08, + "loss": 0.379, "step": 135685 }, { - "epoch": 4.77, - "learning_rate": 2.715272421712628e-07, - "loss": 0.2684, + "epoch": 4.890258406314196, + "grad_norm": 0.27177202701568604, + "learning_rate": 6.510146128617389e-08, + "loss": 0.3425, "step": 135690 }, { - "epoch": 4.77, - "learning_rate": 2.711086870368973e-07, - "loss": 0.2579, + "epoch": 4.890438605975421, + "grad_norm": 0.2742045819759369, + "learning_rate": 6.48911745171038e-08, + "loss": 0.4138, "step": 135695 }, { - "epoch": 4.77, - "learning_rate": 2.706904529908849e-07, - "loss": 0.2527, + "epoch": 4.890618805636645, + "grad_norm": 0.2716894745826721, + "learning_rate": 6.468122748406469e-08, + "loss": 0.3458, "step": 135700 }, { - "epoch": 4.77, - "learning_rate": 2.70272540038663e-07, - "loss": 0.2475, + "epoch": 4.89079900529787, + "grad_norm": 0.2410261183977127, + "learning_rate": 
6.447162018991537e-08, + "loss": 0.3588, "step": 135705 }, { - "epoch": 4.77, - "learning_rate": 2.6985494818565215e-07, - "loss": 0.2505, + "epoch": 4.890979204959095, + "grad_norm": 0.2710227072238922, + "learning_rate": 6.426235263751468e-08, + "loss": 0.3844, "step": 135710 }, { - "epoch": 4.77, - "learning_rate": 2.6943767743728145e-07, - "loss": 0.2667, + "epoch": 4.8911594046203195, + "grad_norm": 0.27327826619148254, + "learning_rate": 6.405342482970755e-08, + "loss": 0.3764, "step": 135715 }, { - "epoch": 4.78, - "learning_rate": 2.6902072779896314e-07, - "loss": 0.2407, + "epoch": 4.891339604281544, + "grad_norm": 0.2611567974090576, + "learning_rate": 6.384483676934727e-08, + "loss": 0.3772, "step": 135720 }, { - "epoch": 4.78, - "learning_rate": 2.686040992761096e-07, - "loss": 0.2654, + "epoch": 4.891519803942769, + "grad_norm": 0.23226478695869446, + "learning_rate": 6.363658845927323e-08, + "loss": 0.3606, "step": 135725 }, { - "epoch": 4.78, - "learning_rate": 2.6818779187413324e-07, - "loss": 0.2428, + "epoch": 4.891700003603994, + "grad_norm": 0.28792256116867065, + "learning_rate": 6.342867990232481e-08, + "loss": 0.3684, "step": 135730 }, { - "epoch": 4.78, - "learning_rate": 2.677718055984435e-07, - "loss": 0.249, + "epoch": 4.8918802032652176, + "grad_norm": 0.24476850032806396, + "learning_rate": 6.322111110133033e-08, + "loss": 0.3505, "step": 135735 }, { - "epoch": 4.78, - "learning_rate": 2.6735614045443615e-07, - "loss": 0.2608, + "epoch": 4.892060402926442, + "grad_norm": 0.25403422117233276, + "learning_rate": 6.301388205912084e-08, + "loss": 0.3466, "step": 135740 }, { - "epoch": 4.78, - "learning_rate": 2.669407964475068e-07, - "loss": 0.2707, + "epoch": 4.892240602587667, + "grad_norm": 0.2334025502204895, + "learning_rate": 6.28069927785191e-08, + "loss": 0.3667, "step": 135745 }, { - "epoch": 4.78, - "learning_rate": 2.66525773583054e-07, - "loss": 0.2643, + "epoch": 4.892420802248892, + "grad_norm": 0.30650824308395386, + "learning_rate": 6.260044326234227e-08, + "loss": 0.3441, "step": 135750 }, { - "epoch": 4.78, - "learning_rate": 2.661110718664622e-07, - "loss": 0.2534, + "epoch": 4.8926010019101165, + "grad_norm": 0.2807537317276001, + "learning_rate": 6.239423351341034e-08, + "loss": 0.3514, "step": 135755 }, { - "epoch": 4.78, - "learning_rate": 2.656966913031217e-07, - "loss": 0.2655, + "epoch": 4.892781201571341, + "grad_norm": 0.28259849548339844, + "learning_rate": 6.218836353452662e-08, + "loss": 0.3651, "step": 135760 }, { - "epoch": 4.78, - "learning_rate": 2.6528263189840583e-07, - "loss": 0.2535, + "epoch": 4.892961401232566, + "grad_norm": 0.2717583179473877, + "learning_rate": 6.198283332849719e-08, + "loss": 0.3725, "step": 135765 }, { - "epoch": 4.78, - "learning_rate": 2.6486889365769374e-07, - "loss": 0.2756, + "epoch": 4.893141600893791, + "grad_norm": 0.24983961880207062, + "learning_rate": 6.177764289812538e-08, + "loss": 0.3589, "step": 135770 }, { - "epoch": 4.78, - "learning_rate": 2.644554765863616e-07, - "loss": 0.2767, + "epoch": 4.8933218005550145, + "grad_norm": 0.2629464566707611, + "learning_rate": 6.157279224620616e-08, + "loss": 0.361, "step": 135775 }, { - "epoch": 4.78, - "learning_rate": 2.6404238068977184e-07, - "loss": 0.2491, + "epoch": 4.893502000216239, + "grad_norm": 0.22154061496257782, + "learning_rate": 6.136828137552619e-08, + "loss": 0.3872, "step": 135780 }, { - "epoch": 4.78, - "learning_rate": 2.6362960597328954e-07, - "loss": 0.2487, + "epoch": 4.893682199877464, + "grad_norm": 0.294659823179245, + 
"learning_rate": 6.11641102888777e-08, + "loss": 0.3613, "step": 135785 }, { - "epoch": 4.78, - "learning_rate": 2.6321715244227443e-07, - "loss": 0.2472, + "epoch": 4.893862399538689, + "grad_norm": 0.20993182063102722, + "learning_rate": 6.0960278989039e-08, + "loss": 0.3675, "step": 135790 }, { - "epoch": 4.78, - "learning_rate": 2.6280502010208594e-07, - "loss": 0.2413, + "epoch": 4.8940425991999135, + "grad_norm": 0.21128609776496887, + "learning_rate": 6.075678747878567e-08, + "loss": 0.3701, "step": 135795 }, { - "epoch": 4.78, - "learning_rate": 2.6239320895806997e-07, - "loss": 0.2438, + "epoch": 4.894222798861138, + "grad_norm": 0.22223599255084991, + "learning_rate": 6.055363576089601e-08, + "loss": 0.3529, "step": 135800 }, { - "epoch": 4.78, - "learning_rate": 2.6198171901557213e-07, - "loss": 0.2769, + "epoch": 4.894402998522363, + "grad_norm": 0.24055111408233643, + "learning_rate": 6.035082383813451e-08, + "loss": 0.3486, "step": 135805 }, { - "epoch": 4.78, - "learning_rate": 2.615705502799437e-07, - "loss": 0.2551, + "epoch": 4.894583198183588, + "grad_norm": 0.30938276648521423, + "learning_rate": 6.014835171326283e-08, + "loss": 0.3945, "step": 135810 }, { - "epoch": 4.78, - "learning_rate": 2.611597027565138e-07, - "loss": 0.2648, + "epoch": 4.894763397844812, + "grad_norm": 0.2910751700401306, + "learning_rate": 5.994621938904266e-08, + "loss": 0.337, "step": 135815 }, { - "epoch": 4.78, - "learning_rate": 2.6074917645062537e-07, - "loss": 0.2381, + "epoch": 4.894943597506037, + "grad_norm": 0.2789898216724396, + "learning_rate": 5.974442686822457e-08, + "loss": 0.3547, "step": 135820 }, { - "epoch": 4.78, - "learning_rate": 2.6033897136760186e-07, - "loss": 0.2514, + "epoch": 4.895123797167261, + "grad_norm": 0.2609977424144745, + "learning_rate": 5.9542974153561916e-08, + "loss": 0.3549, "step": 135825 }, { - "epoch": 4.78, - "learning_rate": 2.599290875127752e-07, - "loss": 0.2506, + "epoch": 4.895303996828486, + "grad_norm": 0.2318691611289978, + "learning_rate": 5.934186124779972e-08, + "loss": 0.352, "step": 135830 }, { - "epoch": 4.78, - "learning_rate": 2.5951952489146613e-07, - "loss": 0.2424, + "epoch": 4.8954841964897104, + "grad_norm": 0.24600878357887268, + "learning_rate": 5.914108815367192e-08, + "loss": 0.3723, "step": 135835 }, { - "epoch": 4.78, - "learning_rate": 2.5911028350898704e-07, - "loss": 0.2303, + "epoch": 4.895664396150935, + "grad_norm": 0.2980553209781647, + "learning_rate": 5.894065487392075e-08, + "loss": 0.3656, "step": 135840 }, { - "epoch": 4.78, - "learning_rate": 2.5870136337065863e-07, - "loss": 0.2387, + "epoch": 4.89584459581216, + "grad_norm": 0.2521979808807373, + "learning_rate": 5.8740561411271824e-08, + "loss": 0.3641, "step": 135845 }, { - "epoch": 4.78, - "learning_rate": 2.5829276448179063e-07, - "loss": 0.284, + "epoch": 4.896024795473385, + "grad_norm": 0.31992876529693604, + "learning_rate": 5.854080776845627e-08, + "loss": 0.3926, "step": 135850 }, { - "epoch": 4.78, - "learning_rate": 2.578844868476815e-07, - "loss": 0.2156, + "epoch": 4.896204995134609, + "grad_norm": 0.28693369030952454, + "learning_rate": 5.8341393948194154e-08, + "loss": 0.3604, "step": 135855 }, { - "epoch": 4.78, - "learning_rate": 2.574765304736382e-07, - "loss": 0.2494, + "epoch": 4.896385194795834, + "grad_norm": 0.2383836805820465, + "learning_rate": 5.814231995319996e-08, + "loss": 0.3979, "step": 135860 }, { - "epoch": 4.78, - "learning_rate": 2.570688953649564e-07, - "loss": 0.2516, + "epoch": 4.896565394457058, + "grad_norm": 
0.32336336374282837, + "learning_rate": 5.794358578618819e-08, + "loss": 0.3494, "step": 135865 }, { - "epoch": 4.78, - "learning_rate": 2.5666158152692907e-07, - "loss": 0.2611, + "epoch": 4.896745594118283, + "grad_norm": 0.2836247682571411, + "learning_rate": 5.774519144986501e-08, + "loss": 0.3809, "step": 135870 }, { - "epoch": 4.78, - "learning_rate": 2.562545889648438e-07, - "loss": 0.2535, + "epoch": 4.896925793779507, + "grad_norm": 0.2685668170452118, + "learning_rate": 5.754713694693659e-08, + "loss": 0.3776, "step": 135875 }, { - "epoch": 4.78, - "learning_rate": 2.558479176839851e-07, - "loss": 0.2218, + "epoch": 4.897105993440732, + "grad_norm": 0.28975343704223633, + "learning_rate": 5.734942228009799e-08, + "loss": 0.3755, "step": 135880 }, { - "epoch": 4.78, - "learning_rate": 2.554415676896349e-07, - "loss": 0.2752, + "epoch": 4.897286193101957, + "grad_norm": 0.3143594264984131, + "learning_rate": 5.715204745204428e-08, + "loss": 0.3687, "step": 135885 }, { - "epoch": 4.78, - "learning_rate": 2.550355389870723e-07, - "loss": 0.2576, + "epoch": 4.897466392763182, + "grad_norm": 0.21849270164966583, + "learning_rate": 5.695501246546775e-08, + "loss": 0.3852, "step": 135890 }, { - "epoch": 4.78, - "learning_rate": 2.546298315815626e-07, - "loss": 0.2479, + "epoch": 4.897646592424406, + "grad_norm": 0.26976725459098816, + "learning_rate": 5.6758317323046815e-08, + "loss": 0.3919, "step": 135895 }, { - "epoch": 4.78, - "learning_rate": 2.5422444547837376e-07, - "loss": 0.2629, + "epoch": 4.897826792085631, + "grad_norm": 0.32330963015556335, + "learning_rate": 5.656196202746544e-08, + "loss": 0.3806, "step": 135900 }, { - "epoch": 4.78, - "learning_rate": 2.5381938068277654e-07, - "loss": 0.2659, + "epoch": 4.898006991746856, + "grad_norm": 0.26278913021087646, + "learning_rate": 5.6365946581399265e-08, + "loss": 0.3952, "step": 135905 }, { - "epoch": 4.78, - "learning_rate": 2.534146372000251e-07, - "loss": 0.2494, + "epoch": 4.898187191408081, + "grad_norm": 0.28790876269340515, + "learning_rate": 5.617027098751559e-08, + "loss": 0.3506, "step": 135910 }, { - "epoch": 4.78, - "learning_rate": 2.530102150353764e-07, - "loss": 0.2525, + "epoch": 4.898367391069304, + "grad_norm": 0.26349297165870667, + "learning_rate": 5.597493524848452e-08, + "loss": 0.3843, "step": 135915 }, { - "epoch": 4.78, - "learning_rate": 2.5260611419408163e-07, - "loss": 0.2237, + "epoch": 4.898547590730529, + "grad_norm": 0.2596375644207001, + "learning_rate": 5.577993936696502e-08, + "loss": 0.3871, "step": 135920 }, { - "epoch": 4.78, - "learning_rate": 2.52202334681384e-07, - "loss": 0.2504, + "epoch": 4.898727790391754, + "grad_norm": 0.22803239524364471, + "learning_rate": 5.5585283345613304e-08, + "loss": 0.3691, "step": 135925 }, { - "epoch": 4.78, - "learning_rate": 2.5179887650253476e-07, - "loss": 0.232, + "epoch": 4.898907990052979, + "grad_norm": 0.22493457794189453, + "learning_rate": 5.5390967187085585e-08, + "loss": 0.37, "step": 135930 }, { - "epoch": 4.78, - "learning_rate": 2.513957396627631e-07, - "loss": 0.253, + "epoch": 4.899088189714203, + "grad_norm": 0.2446439415216446, + "learning_rate": 5.519699089402419e-08, + "loss": 0.3802, "step": 135935 }, { - "epoch": 4.78, - "learning_rate": 2.5099292416730925e-07, - "loss": 0.231, + "epoch": 4.899268389375428, + "grad_norm": 0.2472543567419052, + "learning_rate": 5.5003354469077006e-08, + "loss": 0.3364, "step": 135940 }, { - "epoch": 4.78, - "learning_rate": 2.5059043002140235e-07, - "loss": 0.2375, + "epoch": 4.899448589036653, + 
"grad_norm": 0.23050671815872192, + "learning_rate": 5.481005791487526e-08, + "loss": 0.3832, "step": 135945 }, { - "epoch": 4.78, - "learning_rate": 2.5018825723026884e-07, - "loss": 0.2539, + "epoch": 4.8996287886978775, + "grad_norm": 0.3573814034461975, + "learning_rate": 5.461710123406128e-08, + "loss": 0.368, "step": 135950 }, { - "epoch": 4.78, - "learning_rate": 2.4978640579912947e-07, - "loss": 0.2531, + "epoch": 4.899808988359101, + "grad_norm": 0.24702803790569305, + "learning_rate": 5.442448442925796e-08, + "loss": 0.3933, "step": 135955 }, { - "epoch": 4.78, - "learning_rate": 2.4938487573320234e-07, - "loss": 0.2811, + "epoch": 4.899989188020326, + "grad_norm": 0.20937135815620422, + "learning_rate": 5.423220750309099e-08, + "loss": 0.3516, "step": 135960 }, { - "epoch": 4.78, - "learning_rate": 2.489836670377027e-07, - "loss": 0.2536, + "epoch": 4.900169387681551, + "grad_norm": 0.30278798937797546, + "learning_rate": 5.4040270458180496e-08, + "loss": 0.3488, "step": 135965 }, { - "epoch": 4.78, - "learning_rate": 2.485827797178375e-07, - "loss": 0.2492, + "epoch": 4.900349587342776, + "grad_norm": 0.2523956298828125, + "learning_rate": 5.384867329714105e-08, + "loss": 0.3533, "step": 135970 }, { - "epoch": 4.78, - "learning_rate": 2.481822137788137e-07, - "loss": 0.2356, + "epoch": 4.900529787004, + "grad_norm": 0.30250051617622375, + "learning_rate": 5.365741602258445e-08, + "loss": 0.3883, "step": 135975 }, { - "epoch": 4.78, - "learning_rate": 2.4778196922582984e-07, - "loss": 0.2409, + "epoch": 4.900709986665225, + "grad_norm": 0.21715682744979858, + "learning_rate": 5.3466498637116945e-08, + "loss": 0.3592, "step": 135980 }, { - "epoch": 4.78, - "learning_rate": 2.4738204606408465e-07, - "loss": 0.2646, + "epoch": 4.90089018632645, + "grad_norm": 0.2727758586406708, + "learning_rate": 5.327592114333646e-08, + "loss": 0.3527, "step": 135985 }, { - "epoch": 4.78, - "learning_rate": 2.469824442987739e-07, - "loss": 0.2426, + "epoch": 4.9010703859876745, + "grad_norm": 0.33506885170936584, + "learning_rate": 5.308568354384369e-08, + "loss": 0.4078, "step": 135990 }, { - "epoch": 4.78, - "learning_rate": 2.465831639350824e-07, - "loss": 0.2461, + "epoch": 4.901250585648899, + "grad_norm": 0.2240988165140152, + "learning_rate": 5.289578584122823e-08, + "loss": 0.3615, "step": 135995 }, { - "epoch": 4.78, - "learning_rate": 2.4618420497819484e-07, - "loss": 0.2414, + "epoch": 4.901430785310124, + "grad_norm": 0.2511599361896515, + "learning_rate": 5.27062280380769e-08, + "loss": 0.3921, "step": 136000 }, { - "epoch": 4.78, - "eval_loss": 0.2484423816204071, - "eval_runtime": 10.5512, - "eval_samples_per_second": 9.478, - "eval_steps_per_second": 9.478, + "epoch": 4.901430785310124, + "eval_loss": 0.4287901222705841, + "eval_runtime": 3.5335, + "eval_samples_per_second": 28.3, + "eval_steps_per_second": 7.075, "step": 136000 }, { - "epoch": 4.79, - "learning_rate": 2.457855674332932e-07, - "loss": 0.2498, + "epoch": 4.901610984971349, + "grad_norm": 0.27596327662467957, + "learning_rate": 5.251701013697374e-08, + "loss": 0.3939, "step": 136005 }, { - "epoch": 4.79, - "learning_rate": 2.453872513055511e-07, - "loss": 0.2329, + "epoch": 4.901791184632573, + "grad_norm": 0.3502033054828644, + "learning_rate": 5.2328132140497255e-08, + "loss": 0.3705, "step": 136010 }, { - "epoch": 4.79, - "learning_rate": 2.449892566001394e-07, - "loss": 0.2374, + "epoch": 4.901971384293797, + "grad_norm": 0.269580602645874, + "learning_rate": 5.21395940512176e-08, + "loss": 0.3879, "step": 
136015 }, { - "epoch": 4.79, - "learning_rate": 2.4459158332223173e-07, - "loss": 0.2284, + "epoch": 4.902151583955022, + "grad_norm": 0.24801914393901825, + "learning_rate": 5.195139587170772e-08, + "loss": 0.3859, "step": 136020 }, { - "epoch": 4.79, - "learning_rate": 2.4419423147698793e-07, - "loss": 0.244, + "epoch": 4.902331783616247, + "grad_norm": 0.2341059297323227, + "learning_rate": 5.1763537604529453e-08, + "loss": 0.371, "step": 136025 }, { - "epoch": 4.79, - "learning_rate": 2.437972010695649e-07, - "loss": 0.2669, + "epoch": 4.9025119832774715, + "grad_norm": 0.21445782482624054, + "learning_rate": 5.1576019252241866e-08, + "loss": 0.3202, "step": 136030 }, { - "epoch": 4.79, - "learning_rate": 2.434004921051197e-07, - "loss": 0.2353, + "epoch": 4.902692182938696, + "grad_norm": 0.1884135752916336, + "learning_rate": 5.138884081740125e-08, + "loss": 0.3182, "step": 136035 }, { - "epoch": 4.79, - "learning_rate": 2.4300410458880654e-07, - "loss": 0.2529, + "epoch": 4.902872382599921, + "grad_norm": 0.24446237087249756, + "learning_rate": 5.120200230255834e-08, + "loss": 0.3877, "step": 136040 }, { - "epoch": 4.79, - "learning_rate": 2.426080385257684e-07, - "loss": 0.26, + "epoch": 4.903052582261146, + "grad_norm": 0.26809754967689514, + "learning_rate": 5.101550371025832e-08, + "loss": 0.3798, "step": 136045 }, { - "epoch": 4.79, - "learning_rate": 2.422122939211513e-07, - "loss": 0.2691, + "epoch": 4.9032327819223696, + "grad_norm": 0.2706208825111389, + "learning_rate": 5.082934504304082e-08, + "loss": 0.3462, "step": 136050 }, { - "epoch": 4.79, - "learning_rate": 2.418168707800872e-07, - "loss": 0.2767, + "epoch": 4.903412981583594, + "grad_norm": 0.2580229341983795, + "learning_rate": 5.0643526303445485e-08, + "loss": 0.3824, "step": 136055 }, { - "epoch": 4.79, - "learning_rate": 2.414217691077192e-07, - "loss": 0.2778, + "epoch": 4.903593181244819, + "grad_norm": 0.24854421615600586, + "learning_rate": 5.045804749399807e-08, + "loss": 0.3535, "step": 136060 }, { - "epoch": 4.79, - "learning_rate": 2.41026988909171e-07, - "loss": 0.2329, + "epoch": 4.903773380906044, + "grad_norm": 0.22989878058433533, + "learning_rate": 5.0272908617229885e-08, + "loss": 0.3991, "step": 136065 }, { - "epoch": 4.79, - "learning_rate": 2.40632530189569e-07, - "loss": 0.2373, + "epoch": 4.9039535805672685, + "grad_norm": 0.24265380203723907, + "learning_rate": 5.0088109675666684e-08, + "loss": 0.3802, "step": 136070 }, { - "epoch": 4.79, - "learning_rate": 2.402383929540397e-07, - "loss": 0.2503, + "epoch": 4.904133780228493, + "grad_norm": 0.2347709685564041, + "learning_rate": 4.990365067181757e-08, + "loss": 0.3465, "step": 136075 }, { - "epoch": 4.79, - "learning_rate": 2.398445772076957e-07, - "loss": 0.2404, + "epoch": 4.904313979889718, + "grad_norm": 0.18331743776798248, + "learning_rate": 4.9719531608205527e-08, + "loss": 0.3867, "step": 136080 }, { - "epoch": 4.79, - "learning_rate": 2.394510829556523e-07, - "loss": 0.2588, + "epoch": 4.904494179550943, + "grad_norm": 0.3420829772949219, + "learning_rate": 4.9535752487331335e-08, + "loss": 0.3615, "step": 136085 }, { - "epoch": 4.79, - "learning_rate": 2.390579102030166e-07, - "loss": 0.2546, + "epoch": 4.904674379212167, + "grad_norm": 0.31512126326560974, + "learning_rate": 4.9352313311701317e-08, + "loss": 0.3836, "step": 136090 }, { - "epoch": 4.79, - "learning_rate": 2.3866505895489833e-07, - "loss": 0.2649, + "epoch": 4.904854578873392, + "grad_norm": 0.3358932137489319, + "learning_rate": 4.916921408381625e-08, + "loss": 
0.3708, "step": 136095 }, { - "epoch": 4.79, - "learning_rate": 2.382725292163962e-07, - "loss": 0.2536, + "epoch": 4.905034778534616, + "grad_norm": 0.21517984569072723, + "learning_rate": 4.898645480617137e-08, + "loss": 0.3639, "step": 136100 }, { - "epoch": 4.79, - "learning_rate": 2.3788032099260893e-07, - "loss": 0.2585, + "epoch": 4.905214978195841, + "grad_norm": 0.28815269470214844, + "learning_rate": 4.880403548125356e-08, + "loss": 0.3626, "step": 136105 }, { - "epoch": 4.79, - "learning_rate": 2.374884342886241e-07, - "loss": 0.237, + "epoch": 4.9053951778570655, + "grad_norm": 0.2533737123012543, + "learning_rate": 4.8621956111549735e-08, + "loss": 0.3567, "step": 136110 }, { - "epoch": 4.79, - "learning_rate": 2.370968691095321e-07, - "loss": 0.2487, + "epoch": 4.90557537751829, + "grad_norm": 0.2487945407629013, + "learning_rate": 4.8440216699544015e-08, + "loss": 0.3604, "step": 136115 }, { - "epoch": 4.79, - "learning_rate": 2.3670562546042053e-07, - "loss": 0.2351, + "epoch": 4.905755577179515, + "grad_norm": 0.22873222827911377, + "learning_rate": 4.8258817247706646e-08, + "loss": 0.346, "step": 136120 }, { - "epoch": 4.79, - "learning_rate": 2.3631470334636584e-07, - "loss": 0.2577, + "epoch": 4.90593577684074, + "grad_norm": 0.2279924899339676, + "learning_rate": 4.807775775851342e-08, + "loss": 0.3948, "step": 136125 }, { - "epoch": 4.79, - "learning_rate": 2.359241027724418e-07, - "loss": 0.2367, + "epoch": 4.906115976501964, + "grad_norm": 0.24945561587810516, + "learning_rate": 4.7897038234429035e-08, + "loss": 0.3805, "step": 136130 }, { - "epoch": 4.79, - "learning_rate": 2.355338237437277e-07, - "loss": 0.2543, + "epoch": 4.906296176163189, + "grad_norm": 0.19843456149101257, + "learning_rate": 4.7716658677918194e-08, + "loss": 0.3494, "step": 136135 }, { - "epoch": 4.79, - "learning_rate": 2.3514386626528328e-07, - "loss": 0.2224, + "epoch": 4.906476375824413, + "grad_norm": 0.2849428057670593, + "learning_rate": 4.753661909143448e-08, + "loss": 0.4041, "step": 136140 }, { - "epoch": 4.79, - "learning_rate": 2.3475423034217402e-07, - "loss": 0.256, + "epoch": 4.906656575485638, + "grad_norm": 0.2670860290527344, + "learning_rate": 4.735691947743703e-08, + "loss": 0.3781, "step": 136145 }, { - "epoch": 4.79, - "learning_rate": 2.3436491597946253e-07, - "loss": 0.2579, + "epoch": 4.9068367751468625, + "grad_norm": 0.2657918930053711, + "learning_rate": 4.717755983836836e-08, + "loss": 0.3338, "step": 136150 }, { - "epoch": 4.79, - "learning_rate": 2.3397592318220308e-07, - "loss": 0.2449, + "epoch": 4.907016974808087, + "grad_norm": 0.2555624544620514, + "learning_rate": 4.699854017667371e-08, + "loss": 0.3745, "step": 136155 }, { - "epoch": 4.79, - "learning_rate": 2.3358725195544162e-07, - "loss": 0.2583, + "epoch": 4.907197174469312, + "grad_norm": 0.2596377432346344, + "learning_rate": 4.6819860494792814e-08, + "loss": 0.3466, "step": 136160 }, { - "epoch": 4.79, - "learning_rate": 2.3319890230422968e-07, - "loss": 0.2448, + "epoch": 4.907377374130537, + "grad_norm": 0.21930861473083496, + "learning_rate": 4.6641520795162596e-08, + "loss": 0.3511, "step": 136165 }, { - "epoch": 4.79, - "learning_rate": 2.328108742336077e-07, - "loss": 0.2609, + "epoch": 4.907557573791761, + "grad_norm": 0.2237590104341507, + "learning_rate": 4.6463521080208906e-08, + "loss": 0.3618, "step": 136170 }, { - "epoch": 4.79, - "learning_rate": 2.3242316774861604e-07, - "loss": 0.2442, + "epoch": 4.907737773452986, + "grad_norm": 0.2339451014995575, + "learning_rate": 
4.6285861352357574e-08, + "loss": 0.3686, "step": 136175 }, { - "epoch": 4.79, - "learning_rate": 2.3203578285428406e-07, - "loss": 0.2456, + "epoch": 4.907917973114211, + "grad_norm": 0.27476266026496887, + "learning_rate": 4.610854161403166e-08, + "loss": 0.3819, "step": 136180 }, { - "epoch": 4.79, - "learning_rate": 2.316487195556466e-07, - "loss": 0.2538, + "epoch": 4.908098172775436, + "grad_norm": 0.2562359869480133, + "learning_rate": 4.593156186764591e-08, + "loss": 0.3852, "step": 136185 }, { - "epoch": 4.79, - "learning_rate": 2.3126197785772742e-07, - "loss": 0.2818, + "epoch": 4.908278372436659, + "grad_norm": 0.2553425133228302, + "learning_rate": 4.57549221156095e-08, + "loss": 0.3646, "step": 136190 }, { - "epoch": 4.79, - "learning_rate": 2.3087555776554747e-07, - "loss": 0.2886, + "epoch": 4.908458572097884, + "grad_norm": 0.25126445293426514, + "learning_rate": 4.5578622360334386e-08, + "loss": 0.3564, "step": 136195 }, { - "epoch": 4.79, - "learning_rate": 2.3048945928412502e-07, - "loss": 0.2577, + "epoch": 4.908638771759109, + "grad_norm": 0.2578040659427643, + "learning_rate": 4.540266260421588e-08, + "loss": 0.346, "step": 136200 }, { - "epoch": 4.79, - "learning_rate": 2.3010368241847268e-07, - "loss": 0.2543, + "epoch": 4.908818971420334, + "grad_norm": 0.2958472967147827, + "learning_rate": 4.5227042849654844e-08, + "loss": 0.4071, "step": 136205 }, { - "epoch": 4.79, - "learning_rate": 2.2971822717360038e-07, - "loss": 0.2468, + "epoch": 4.908999171081558, + "grad_norm": 0.2669653594493866, + "learning_rate": 4.505176309904657e-08, + "loss": 0.3787, "step": 136210 }, { - "epoch": 4.79, - "learning_rate": 2.2933309355451515e-07, - "loss": 0.253, + "epoch": 4.909179370742783, + "grad_norm": 0.294811874628067, + "learning_rate": 4.4876823354775275e-08, + "loss": 0.3732, "step": 136215 }, { - "epoch": 4.79, - "learning_rate": 2.2894828156621305e-07, - "loss": 0.2237, + "epoch": 4.909359570404008, + "grad_norm": 0.2965461015701294, + "learning_rate": 4.470222361922516e-08, + "loss": 0.3712, "step": 136220 }, { - "epoch": 4.79, - "learning_rate": 2.2856379121369288e-07, - "loss": 0.2673, + "epoch": 4.909539770065233, + "grad_norm": 0.24471497535705566, + "learning_rate": 4.452796389477487e-08, + "loss": 0.355, "step": 136225 }, { - "epoch": 4.79, - "learning_rate": 2.281796225019478e-07, - "loss": 0.2708, + "epoch": 4.909719969726457, + "grad_norm": 0.2504851520061493, + "learning_rate": 4.435404418380029e-08, + "loss": 0.3638, "step": 136230 }, { - "epoch": 4.79, - "learning_rate": 2.2779577543596276e-07, - "loss": 0.2639, + "epoch": 4.909900169387681, + "grad_norm": 0.26970013976097107, + "learning_rate": 4.418046448867175e-08, + "loss": 0.3881, "step": 136235 }, { - "epoch": 4.79, - "learning_rate": 2.274122500207282e-07, - "loss": 0.2538, + "epoch": 4.910080369048906, + "grad_norm": 0.31252387166023254, + "learning_rate": 4.400722481175401e-08, + "loss": 0.4242, "step": 136240 }, { - "epoch": 4.79, - "learning_rate": 2.2702904626121514e-07, - "loss": 0.2511, + "epoch": 4.910260568710131, + "grad_norm": 0.27864494919776917, + "learning_rate": 4.3834325155403535e-08, + "loss": 0.3557, "step": 136245 }, { - "epoch": 4.79, - "learning_rate": 2.266461641624057e-07, - "loss": 0.2426, + "epoch": 4.910440768371355, + "grad_norm": 0.2759137451648712, + "learning_rate": 4.366176552197676e-08, + "loss": 0.3641, "step": 136250 }, { - "epoch": 4.79, - "learning_rate": 2.2626360372927092e-07, - "loss": 0.269, + "epoch": 4.91062096803258, + "grad_norm": 0.23578329384326935, + 
"learning_rate": 4.348954591383014e-08, + "loss": 0.3712, "step": 136255 }, { - "epoch": 4.79, - "learning_rate": 2.2588136496677625e-07, - "loss": 0.2443, + "epoch": 4.910801167693805, + "grad_norm": 0.2522645592689514, + "learning_rate": 4.331766633330625e-08, + "loss": 0.3695, "step": 136260 }, { - "epoch": 4.79, - "learning_rate": 2.2549944787988163e-07, - "loss": 0.2544, + "epoch": 4.9109813673550295, + "grad_norm": 0.300750195980072, + "learning_rate": 4.3146126782747655e-08, + "loss": 0.3721, "step": 136265 }, { - "epoch": 4.79, - "learning_rate": 2.2511785247355533e-07, - "loss": 0.2678, + "epoch": 4.911161567016254, + "grad_norm": 0.26035916805267334, + "learning_rate": 4.29749272644886e-08, + "loss": 0.3666, "step": 136270 }, { - "epoch": 4.79, - "learning_rate": 2.2473657875274334e-07, - "loss": 0.2303, + "epoch": 4.911341766677479, + "grad_norm": 0.2753373086452484, + "learning_rate": 4.28040677808661e-08, + "loss": 0.372, "step": 136275 }, { - "epoch": 4.79, - "learning_rate": 2.2435562672240006e-07, - "loss": 0.2569, + "epoch": 4.911521966338704, + "grad_norm": 0.24623210728168488, + "learning_rate": 4.263354833420607e-08, + "loss": 0.3655, "step": 136280 }, { - "epoch": 4.79, - "learning_rate": 2.2397499638747155e-07, - "loss": 0.2722, + "epoch": 4.911702165999928, + "grad_norm": 0.25700467824935913, + "learning_rate": 4.246336892683445e-08, + "loss": 0.3749, "step": 136285 }, { - "epoch": 4.8, - "learning_rate": 2.2359468775289827e-07, - "loss": 0.2658, + "epoch": 4.911882365661152, + "grad_norm": 0.21152786910533905, + "learning_rate": 4.229352956106603e-08, + "loss": 0.3809, "step": 136290 }, { - "epoch": 4.8, - "learning_rate": 2.2321470082362072e-07, - "loss": 0.2705, + "epoch": 4.912062565322377, + "grad_norm": 0.3113376200199127, + "learning_rate": 4.2124030239212855e-08, + "loss": 0.3877, "step": 136295 }, { - "epoch": 4.8, - "learning_rate": 2.2283503560457108e-07, - "loss": 0.2751, + "epoch": 4.912242764983602, + "grad_norm": 0.22789722681045532, + "learning_rate": 4.195487096359252e-08, + "loss": 0.3668, "step": 136300 }, { - "epoch": 4.8, - "learning_rate": 2.2245569210068152e-07, - "loss": 0.2432, + "epoch": 4.9124229646448265, + "grad_norm": 0.22643516957759857, + "learning_rate": 4.178605173650318e-08, + "loss": 0.3519, "step": 136305 }, { - "epoch": 4.8, - "learning_rate": 2.2207667031687307e-07, - "loss": 0.246, + "epoch": 4.912603164306051, + "grad_norm": 0.1881403774023056, + "learning_rate": 4.161757256024579e-08, + "loss": 0.3422, "step": 136310 }, { - "epoch": 4.8, - "learning_rate": 2.2169797025807239e-07, - "loss": 0.2459, + "epoch": 4.912783363967276, + "grad_norm": 0.2479417622089386, + "learning_rate": 4.144943343711849e-08, + "loss": 0.4007, "step": 136315 }, { - "epoch": 4.8, - "learning_rate": 2.2131959192919215e-07, - "loss": 0.2376, + "epoch": 4.912963563628501, + "grad_norm": 0.23649077117443085, + "learning_rate": 4.128163436941113e-08, + "loss": 0.3453, "step": 136320 }, { - "epoch": 4.8, - "learning_rate": 2.2094153533514794e-07, - "loss": 0.253, + "epoch": 4.913143763289725, + "grad_norm": 0.25313544273376465, + "learning_rate": 4.111417535940798e-08, + "loss": 0.4058, "step": 136325 }, { - "epoch": 4.8, - "learning_rate": 2.2056380048084968e-07, - "loss": 0.2476, + "epoch": 4.913323962950949, + "grad_norm": 0.2785737216472626, + "learning_rate": 4.094705640939056e-08, + "loss": 0.3633, "step": 136330 }, { - "epoch": 4.8, - "learning_rate": 2.2018638737119902e-07, - "loss": 0.2345, + "epoch": 4.913504162612174, + "grad_norm": 
0.292338490486145, + "learning_rate": 4.0780277521640374e-08, + "loss": 0.3945, "step": 136335 }, { - "epoch": 4.8, - "learning_rate": 2.1980929601109756e-07, - "loss": 0.2509, + "epoch": 4.913684362273399, + "grad_norm": 0.2278224229812622, + "learning_rate": 4.061383869842506e-08, + "loss": 0.3593, "step": 136340 }, { - "epoch": 4.8, - "learning_rate": 2.1943252640544143e-07, - "loss": 0.2467, + "epoch": 4.9138645619346235, + "grad_norm": 0.2170296460390091, + "learning_rate": 4.044773994201223e-08, + "loss": 0.3742, "step": 136345 }, { - "epoch": 4.8, - "learning_rate": 2.1905607855912115e-07, - "loss": 0.2424, + "epoch": 4.914044761595848, + "grad_norm": 0.2683648467063904, + "learning_rate": 4.0281981254669535e-08, + "loss": 0.4156, "step": 136350 }, { - "epoch": 4.8, - "learning_rate": 2.186799524770272e-07, - "loss": 0.2398, + "epoch": 4.914224961257073, + "grad_norm": 0.2352461963891983, + "learning_rate": 4.011656263865071e-08, + "loss": 0.3855, "step": 136355 }, { - "epoch": 4.8, - "learning_rate": 2.1830414816404466e-07, - "loss": 0.2574, + "epoch": 4.914405160918298, + "grad_norm": 0.28996291756629944, + "learning_rate": 3.99514840962123e-08, + "loss": 0.4141, "step": 136360 }, { - "epoch": 4.8, - "learning_rate": 2.179286656250501e-07, - "loss": 0.2354, + "epoch": 4.914585360579522, + "grad_norm": 0.22475266456604004, + "learning_rate": 3.9786745629602494e-08, + "loss": 0.3708, "step": 136365 }, { - "epoch": 4.8, - "learning_rate": 2.175535048649202e-07, - "loss": 0.2391, + "epoch": 4.914765560240747, + "grad_norm": 0.19460433721542358, + "learning_rate": 3.962234724106395e-08, + "loss": 0.3589, "step": 136370 }, { - "epoch": 4.8, - "learning_rate": 2.171786658885233e-07, - "loss": 0.2304, + "epoch": 4.914945759901971, + "grad_norm": 0.2615772485733032, + "learning_rate": 3.945828893284209e-08, + "loss": 0.3617, "step": 136375 }, { - "epoch": 4.8, - "learning_rate": 2.1680414870073053e-07, - "loss": 0.2502, + "epoch": 4.915125959563196, + "grad_norm": 0.26762276887893677, + "learning_rate": 3.929457070716569e-08, + "loss": 0.3672, "step": 136380 }, { - "epoch": 4.8, - "learning_rate": 2.164299533064046e-07, - "loss": 0.2468, + "epoch": 4.9153061592244205, + "grad_norm": 0.40078943967819214, + "learning_rate": 3.913119256626907e-08, + "loss": 0.3814, "step": 136385 }, { - "epoch": 4.8, - "learning_rate": 2.1605607971040276e-07, - "loss": 0.2589, + "epoch": 4.915486358885645, + "grad_norm": 0.2580387592315674, + "learning_rate": 3.896815451237823e-08, + "loss": 0.3731, "step": 136390 }, { - "epoch": 4.8, - "learning_rate": 2.156825279175767e-07, - "loss": 0.2593, + "epoch": 4.91566655854687, + "grad_norm": 0.2312445044517517, + "learning_rate": 3.880545654771084e-08, + "loss": 0.3327, "step": 136395 }, { - "epoch": 4.8, - "learning_rate": 2.1530929793278088e-07, - "loss": 0.2649, + "epoch": 4.915846758208095, + "grad_norm": 0.22005638480186462, + "learning_rate": 3.864309867449012e-08, + "loss": 0.3817, "step": 136400 }, { - "epoch": 4.8, - "learning_rate": 2.149363897608614e-07, - "loss": 0.2745, + "epoch": 4.916026957869319, + "grad_norm": 0.2570466995239258, + "learning_rate": 3.848108089492264e-08, + "loss": 0.3434, "step": 136405 }, { - "epoch": 4.8, - "learning_rate": 2.145638034066588e-07, - "loss": 0.2492, + "epoch": 4.916207157530544, + "grad_norm": 0.2323221117258072, + "learning_rate": 3.8319403211217744e-08, + "loss": 0.3523, "step": 136410 }, { - "epoch": 4.8, - "learning_rate": 2.141915388750082e-07, - "loss": 0.2456, + "epoch": 4.916387357191768, + "grad_norm": 
0.26294851303100586, + "learning_rate": 3.815806562557644e-08, + "loss": 0.3646, "step": 136415 }, { - "epoch": 4.8, - "learning_rate": 2.1381959617074732e-07, - "loss": 0.2561, + "epoch": 4.916567556852993, + "grad_norm": 0.29720500111579895, + "learning_rate": 3.799706814020254e-08, + "loss": 0.3537, "step": 136420 }, { - "epoch": 4.8, - "learning_rate": 2.1344797529870565e-07, - "loss": 0.2624, + "epoch": 4.9167477565142175, + "grad_norm": 0.27285248041152954, + "learning_rate": 3.783641075728317e-08, + "loss": 0.3456, "step": 136425 }, { - "epoch": 4.8, - "learning_rate": 2.130766762637071e-07, - "loss": 0.2427, + "epoch": 4.916927956175442, + "grad_norm": 0.2373269498348236, + "learning_rate": 3.767609347901102e-08, + "loss": 0.3396, "step": 136430 }, { - "epoch": 4.8, - "learning_rate": 2.1270569907057004e-07, - "loss": 0.2615, + "epoch": 4.917108155836667, + "grad_norm": 0.232501819729805, + "learning_rate": 3.751611630756768e-08, + "loss": 0.3594, "step": 136435 }, { - "epoch": 4.8, - "learning_rate": 2.123350437241156e-07, - "loss": 0.2741, + "epoch": 4.917288355497892, + "grad_norm": 0.24298062920570374, + "learning_rate": 3.735647924513475e-08, + "loss": 0.3462, "step": 136440 }, { - "epoch": 4.8, - "learning_rate": 2.1196471022915665e-07, - "loss": 0.2443, + "epoch": 4.917468555159116, + "grad_norm": 0.2818912863731384, + "learning_rate": 3.719718229388824e-08, + "loss": 0.3911, "step": 136445 }, { - "epoch": 4.8, - "learning_rate": 2.1159469859049764e-07, - "loss": 0.2567, + "epoch": 4.917648754820341, + "grad_norm": 0.2380588948726654, + "learning_rate": 3.703822545599589e-08, + "loss": 0.424, "step": 136450 }, { - "epoch": 4.8, - "learning_rate": 2.1122500881294582e-07, - "loss": 0.262, + "epoch": 4.917828954481566, + "grad_norm": 0.23889237642288208, + "learning_rate": 3.687960873362539e-08, + "loss": 0.3688, "step": 136455 }, { - "epoch": 4.8, - "learning_rate": 2.1085564090130294e-07, - "loss": 0.2325, + "epoch": 4.918009154142791, + "grad_norm": 0.2520101070404053, + "learning_rate": 3.672133212893891e-08, + "loss": 0.3466, "step": 136460 }, { - "epoch": 4.8, - "learning_rate": 2.104865948603596e-07, - "loss": 0.2378, + "epoch": 4.918189353804015, + "grad_norm": 0.19939465820789337, + "learning_rate": 3.6563395644087504e-08, + "loss": 0.3633, "step": 136465 }, { - "epoch": 4.8, - "learning_rate": 2.1011787069491195e-07, - "loss": 0.2405, + "epoch": 4.918369553465239, + "grad_norm": 0.21489644050598145, + "learning_rate": 3.6405799281230556e-08, + "loss": 0.373, "step": 136470 }, { - "epoch": 4.8, - "learning_rate": 2.0974946840974786e-07, - "loss": 0.2429, + "epoch": 4.918549753126464, + "grad_norm": 0.22894160449504852, + "learning_rate": 3.6248543042508023e-08, + "loss": 0.3401, "step": 136475 }, { - "epoch": 4.8, - "learning_rate": 2.093813880096468e-07, - "loss": 0.2424, + "epoch": 4.918729952787689, + "grad_norm": 0.2765687108039856, + "learning_rate": 3.609162693006818e-08, + "loss": 0.3808, "step": 136480 }, { - "epoch": 4.8, - "learning_rate": 2.0901362949939384e-07, - "loss": 0.2447, + "epoch": 4.918910152448913, + "grad_norm": 0.2676806151866913, + "learning_rate": 3.5935050946045434e-08, + "loss": 0.3748, "step": 136485 }, { - "epoch": 4.8, - "learning_rate": 2.086461928837602e-07, - "loss": 0.2492, + "epoch": 4.919090352110138, + "grad_norm": 0.1796978861093521, + "learning_rate": 3.5778815092576965e-08, + "loss": 0.364, "step": 136490 }, { - "epoch": 4.8, - "learning_rate": 2.0827907816751702e-07, - "loss": 0.2411, + "epoch": 4.919270551771363, + 
"grad_norm": 0.2699376344680786, + "learning_rate": 3.562291937178608e-08, + "loss": 0.3574, "step": 136495 }, { - "epoch": 4.8, - "learning_rate": 2.0791228535543272e-07, - "loss": 0.2492, + "epoch": 4.919450751432588, + "grad_norm": 0.2322850078344345, + "learning_rate": 3.546736378580162e-08, + "loss": 0.375, "step": 136500 }, { - "epoch": 4.8, - "eval_loss": 0.2484273761510849, - "eval_runtime": 10.5494, - "eval_samples_per_second": 9.479, - "eval_steps_per_second": 9.479, + "epoch": 4.919450751432588, + "eval_loss": 0.42881008982658386, + "eval_runtime": 3.537, + "eval_samples_per_second": 28.272, + "eval_steps_per_second": 7.068, "step": 136500 }, { - "epoch": 4.8, - "learning_rate": 2.075458144522674e-07, - "loss": 0.2473, + "epoch": 4.919630951093812, + "grad_norm": 0.266796350479126, + "learning_rate": 3.531214833673857e-08, + "loss": 0.4061, "step": 136505 }, { - "epoch": 4.8, - "learning_rate": 2.071796654627839e-07, - "loss": 0.2616, + "epoch": 4.919811150755036, + "grad_norm": 0.23469606041908264, + "learning_rate": 3.515727302671468e-08, + "loss": 0.3704, "step": 136510 }, { - "epoch": 4.8, - "learning_rate": 2.0681383839172842e-07, - "loss": 0.2791, + "epoch": 4.919991350416261, + "grad_norm": 0.27841997146606445, + "learning_rate": 3.500273785784214e-08, + "loss": 0.3688, "step": 136515 }, { - "epoch": 4.8, - "learning_rate": 2.0644833324386104e-07, - "loss": 0.2526, + "epoch": 4.920171550077486, + "grad_norm": 0.23662640154361725, + "learning_rate": 3.4848542832222056e-08, + "loss": 0.3902, "step": 136520 }, { - "epoch": 4.8, - "learning_rate": 2.0608315002391687e-07, - "loss": 0.2583, + "epoch": 4.92035174973871, + "grad_norm": 0.3038434386253357, + "learning_rate": 3.46946879519583e-08, + "loss": 0.3871, "step": 136525 }, { - "epoch": 4.8, - "learning_rate": 2.0571828873664767e-07, - "loss": 0.2542, + "epoch": 4.920531949399935, + "grad_norm": 0.2633965015411377, + "learning_rate": 3.454117321914363e-08, + "loss": 0.3661, "step": 136530 }, { - "epoch": 4.8, - "learning_rate": 2.0535374938678575e-07, - "loss": 0.2544, + "epoch": 4.92071214906116, + "grad_norm": 0.3141336441040039, + "learning_rate": 3.438799863587361e-08, + "loss": 0.3908, "step": 136535 }, { - "epoch": 4.8, - "learning_rate": 2.0498953197906344e-07, - "loss": 0.2393, + "epoch": 4.920892348722385, + "grad_norm": 0.21846546232700348, + "learning_rate": 3.423516420423545e-08, + "loss": 0.3788, "step": 136540 }, { - "epoch": 4.8, - "learning_rate": 2.046256365182131e-07, - "loss": 0.246, + "epoch": 4.921072548383609, + "grad_norm": 0.27012789249420166, + "learning_rate": 3.408266992630804e-08, + "loss": 0.3757, "step": 136545 }, { - "epoch": 4.8, - "learning_rate": 2.042620630089559e-07, - "loss": 0.2716, + "epoch": 4.921252748044834, + "grad_norm": 0.24852919578552246, + "learning_rate": 3.393051580417028e-08, + "loss": 0.3594, "step": 136550 }, { - "epoch": 4.8, - "learning_rate": 2.0389881145601863e-07, - "loss": 0.2491, + "epoch": 4.921432947706059, + "grad_norm": 0.2667607069015503, + "learning_rate": 3.377870183989551e-08, + "loss": 0.3766, "step": 136555 }, { - "epoch": 4.8, - "learning_rate": 2.0353588186411143e-07, - "loss": 0.2638, + "epoch": 4.921613147367283, + "grad_norm": 0.26542437076568604, + "learning_rate": 3.3627228035551515e-08, + "loss": 0.368, "step": 136560 }, { - "epoch": 4.8, - "learning_rate": 2.0317327423794995e-07, - "loss": 0.2367, + "epoch": 4.921793347028507, + "grad_norm": 0.3026573657989502, + "learning_rate": 3.3476094393203313e-08, + "loss": 0.3884, "step": 136565 }, { - 
"epoch": 4.8, - "learning_rate": 2.0281098858224157e-07, - "loss": 0.2575, + "epoch": 4.921973546689732, + "grad_norm": 0.26088666915893555, + "learning_rate": 3.3325300914910374e-08, + "loss": 0.3654, "step": 136570 }, { - "epoch": 4.81, - "learning_rate": 2.0244902490169081e-07, - "loss": 0.2421, + "epoch": 4.922153746350957, + "grad_norm": 0.22487777471542358, + "learning_rate": 3.31748476027266e-08, + "loss": 0.3848, "step": 136575 }, { - "epoch": 4.81, - "learning_rate": 2.0208738320099675e-07, - "loss": 0.2673, + "epoch": 4.9223339460121815, + "grad_norm": 0.2889478802680969, + "learning_rate": 3.302473445870313e-08, + "loss": 0.4139, "step": 136580 }, { - "epoch": 4.81, - "learning_rate": 2.0172606348485556e-07, - "loss": 0.2437, + "epoch": 4.922514145673406, + "grad_norm": 0.21475286781787872, + "learning_rate": 3.2874961484882785e-08, + "loss": 0.3401, "step": 136585 }, { - "epoch": 4.81, - "learning_rate": 2.0136506575795798e-07, - "loss": 0.2469, + "epoch": 4.922694345334631, + "grad_norm": 0.27408671379089355, + "learning_rate": 3.272552868330558e-08, + "loss": 0.4108, "step": 136590 }, { - "epoch": 4.81, - "learning_rate": 2.0100439002499473e-07, - "loss": 0.258, + "epoch": 4.922874544995856, + "grad_norm": 0.2363937646150589, + "learning_rate": 3.257643605601157e-08, + "loss": 0.3962, "step": 136595 }, { - "epoch": 4.81, - "learning_rate": 2.0064403629064533e-07, - "loss": 0.2409, + "epoch": 4.92305474465708, + "grad_norm": 0.22098080813884735, + "learning_rate": 3.2427683605026905e-08, + "loss": 0.3596, "step": 136600 }, { - "epoch": 4.81, - "learning_rate": 2.002840045595894e-07, - "loss": 0.2655, + "epoch": 4.923234944318304, + "grad_norm": 0.2587352693080902, + "learning_rate": 3.227927133238329e-08, + "loss": 0.4166, "step": 136605 }, { - "epoch": 4.81, - "learning_rate": 1.9992429483650098e-07, - "loss": 0.2792, + "epoch": 4.923415143979529, + "grad_norm": 0.3030674159526825, + "learning_rate": 3.213119924010133e-08, + "loss": 0.3743, "step": 136610 }, { - "epoch": 4.81, - "learning_rate": 1.9956490712605135e-07, - "loss": 0.2716, + "epoch": 4.923595343640754, + "grad_norm": 0.24243710935115814, + "learning_rate": 3.1983467330196084e-08, + "loss": 0.372, "step": 136615 }, { - "epoch": 4.81, - "learning_rate": 1.9920584143290898e-07, - "loss": 0.2667, + "epoch": 4.9237755433019785, + "grad_norm": 0.2680684030056, + "learning_rate": 3.1836075604685375e-08, + "loss": 0.4072, "step": 136620 }, { - "epoch": 4.81, - "learning_rate": 1.9884709776173404e-07, - "loss": 0.2351, + "epoch": 4.923955742963203, + "grad_norm": 0.2981574237346649, + "learning_rate": 3.1689024065570397e-08, + "loss": 0.3586, "step": 136625 }, { - "epoch": 4.81, - "learning_rate": 1.9848867611718393e-07, - "loss": 0.2529, + "epoch": 4.924135942624428, + "grad_norm": 0.24394196271896362, + "learning_rate": 3.1542312714860635e-08, + "loss": 0.3844, "step": 136630 }, { - "epoch": 4.81, - "learning_rate": 1.981305765039132e-07, - "loss": 0.2524, + "epoch": 4.924316142285653, + "grad_norm": 0.26692622900009155, + "learning_rate": 3.139594155455172e-08, + "loss": 0.3926, "step": 136635 }, { - "epoch": 4.81, - "learning_rate": 1.9777279892657375e-07, - "loss": 0.2502, + "epoch": 4.9244963419468775, + "grad_norm": 0.3153393268585205, + "learning_rate": 3.1249910586639286e-08, + "loss": 0.3689, "step": 136640 }, { - "epoch": 4.81, - "learning_rate": 1.9741534338980906e-07, - "loss": 0.2542, + "epoch": 4.924676541608102, + "grad_norm": 0.2018764615058899, + "learning_rate": 3.11042198131134e-08, + "loss": 0.3361, 
"step": 136645 }, { - "epoch": 4.81, - "learning_rate": 1.9705820989825984e-07, - "loss": 0.2528, + "epoch": 4.924856741269326, + "grad_norm": 0.24366877973079681, + "learning_rate": 3.095886923595581e-08, + "loss": 0.3578, "step": 136650 }, { - "epoch": 4.81, - "learning_rate": 1.967013984565641e-07, - "loss": 0.2565, + "epoch": 4.925036940930551, + "grad_norm": 0.21055671572685242, + "learning_rate": 3.081385885715105e-08, + "loss": 0.3606, "step": 136655 }, { - "epoch": 4.81, - "learning_rate": 1.9634490906935698e-07, - "loss": 0.2407, + "epoch": 4.9252171405917755, + "grad_norm": 0.25222551822662354, + "learning_rate": 3.06691886786753e-08, + "loss": 0.3939, "step": 136660 }, { - "epoch": 4.81, - "learning_rate": 1.9598874174126258e-07, - "loss": 0.2541, + "epoch": 4.925397340253, + "grad_norm": 0.18292470276355743, + "learning_rate": 3.052485870249644e-08, + "loss": 0.3271, "step": 136665 }, { - "epoch": 4.81, - "learning_rate": 1.9563289647690775e-07, - "loss": 0.2327, + "epoch": 4.925577539914225, + "grad_norm": 0.2845352590084076, + "learning_rate": 3.038086893057956e-08, + "loss": 0.366, "step": 136670 }, { - "epoch": 4.81, - "learning_rate": 1.9527737328091378e-07, - "loss": 0.2477, + "epoch": 4.92575773957545, + "grad_norm": 0.26779624819755554, + "learning_rate": 3.0237219364892544e-08, + "loss": 0.3505, "step": 136675 }, { - "epoch": 4.81, - "learning_rate": 1.9492217215789644e-07, - "loss": 0.2629, + "epoch": 4.9259379392366744, + "grad_norm": 0.24947002530097961, + "learning_rate": 3.009391000738659e-08, + "loss": 0.3692, "step": 136680 }, { - "epoch": 4.81, - "learning_rate": 1.945672931124659e-07, - "loss": 0.2566, + "epoch": 4.926118138897899, + "grad_norm": 0.2781367301940918, + "learning_rate": 2.995094086001848e-08, + "loss": 0.3752, "step": 136685 }, { - "epoch": 4.81, - "learning_rate": 1.9421273614923518e-07, - "loss": 0.261, + "epoch": 4.926298338559123, + "grad_norm": 0.24780642986297607, + "learning_rate": 2.980831192473388e-08, + "loss": 0.3734, "step": 136690 }, { - "epoch": 4.81, - "learning_rate": 1.938585012728006e-07, - "loss": 0.2365, + "epoch": 4.926478538220348, + "grad_norm": 0.2541256248950958, + "learning_rate": 2.9666023203475667e-08, + "loss": 0.3417, "step": 136695 }, { - "epoch": 4.81, - "learning_rate": 1.9350458848776675e-07, - "loss": 0.2402, + "epoch": 4.9266587378815725, + "grad_norm": 0.2498217672109604, + "learning_rate": 2.9524074698186743e-08, + "loss": 0.3829, "step": 136700 }, { - "epoch": 4.81, - "learning_rate": 1.9315099779872726e-07, - "loss": 0.2434, + "epoch": 4.926838937542797, + "grad_norm": 0.2726345360279083, + "learning_rate": 2.938246641079334e-08, + "loss": 0.3742, "step": 136705 }, { - "epoch": 4.81, - "learning_rate": 1.927977292102756e-07, - "loss": 0.2461, + "epoch": 4.927019137204022, + "grad_norm": 0.29105934500694275, + "learning_rate": 2.9241198343232802e-08, + "loss": 0.3852, "step": 136710 }, { - "epoch": 4.81, - "learning_rate": 1.9244478272699705e-07, - "loss": 0.2583, + "epoch": 4.927199336865247, + "grad_norm": 0.24921295046806335, + "learning_rate": 2.910027049742581e-08, + "loss": 0.3763, "step": 136715 }, { - "epoch": 4.81, - "learning_rate": 1.9209215835347404e-07, - "loss": 0.2627, + "epoch": 4.927379536526471, + "grad_norm": 0.3070908784866333, + "learning_rate": 2.8959682875293047e-08, + "loss": 0.3878, "step": 136720 }, { - "epoch": 4.81, - "learning_rate": 1.9173985609428348e-07, - "loss": 0.2547, + "epoch": 4.927559736187696, + "grad_norm": 0.319961279630661, + "learning_rate": 
2.8819435478749657e-08, + "loss": 0.4065, "step": 136725 }, { - "epoch": 4.81, - "learning_rate": 1.913878759540022e-07, - "loss": 0.253, + "epoch": 4.927739935848921, + "grad_norm": 0.3225443959236145, + "learning_rate": 2.8679528309705217e-08, + "loss": 0.4024, "step": 136730 }, { - "epoch": 4.81, - "learning_rate": 1.9103621793719885e-07, - "loss": 0.246, + "epoch": 4.927920135510146, + "grad_norm": 0.2788389325141907, + "learning_rate": 2.8539961370069314e-08, + "loss": 0.3578, "step": 136735 }, { - "epoch": 4.81, - "learning_rate": 1.9068488204844193e-07, - "loss": 0.256, + "epoch": 4.92810033517137, + "grad_norm": 0.2546929717063904, + "learning_rate": 2.84007346617432e-08, + "loss": 0.3585, "step": 136740 }, { - "epoch": 4.81, - "learning_rate": 1.903338682922917e-07, - "loss": 0.2588, + "epoch": 4.928280534832594, + "grad_norm": 0.23796042799949646, + "learning_rate": 2.826184818661981e-08, + "loss": 0.3595, "step": 136745 }, { - "epoch": 4.81, - "learning_rate": 1.899831766733057e-07, - "loss": 0.2472, + "epoch": 4.928460734493819, + "grad_norm": 0.22895626723766327, + "learning_rate": 2.8123301946594847e-08, + "loss": 0.3904, "step": 136750 }, { - "epoch": 4.81, - "learning_rate": 1.896328071960385e-07, - "loss": 0.265, + "epoch": 4.928640934155044, + "grad_norm": 0.2509956657886505, + "learning_rate": 2.7985095943555696e-08, + "loss": 0.4023, "step": 136755 }, { - "epoch": 4.81, - "learning_rate": 1.892827598650393e-07, - "loss": 0.2555, + "epoch": 4.928821133816268, + "grad_norm": 0.34779104590415955, + "learning_rate": 2.7847230179384176e-08, + "loss": 0.3417, "step": 136760 }, { - "epoch": 4.81, - "learning_rate": 1.889330346848489e-07, - "loss": 0.2616, + "epoch": 4.929001333477493, + "grad_norm": 0.19889460504055023, + "learning_rate": 2.7709704655959344e-08, + "loss": 0.342, "step": 136765 }, { - "epoch": 4.81, - "learning_rate": 1.8858363166001646e-07, - "loss": 0.2351, + "epoch": 4.929181533138718, + "grad_norm": 0.2757225036621094, + "learning_rate": 2.757251937515748e-08, + "loss": 0.3928, "step": 136770 }, { - "epoch": 4.81, - "learning_rate": 1.8823455079507168e-07, - "loss": 0.2585, + "epoch": 4.929361732799943, + "grad_norm": 0.32304972410202026, + "learning_rate": 2.7435674338843752e-08, + "loss": 0.379, "step": 136775 }, { - "epoch": 4.81, - "learning_rate": 1.8788579209454982e-07, - "loss": 0.2486, + "epoch": 4.929541932461167, + "grad_norm": 0.21971245110034943, + "learning_rate": 2.729916954888334e-08, + "loss": 0.3514, "step": 136780 }, { - "epoch": 4.81, - "learning_rate": 1.8753735556298057e-07, - "loss": 0.2458, + "epoch": 4.929722132122391, + "grad_norm": 0.2933555841445923, + "learning_rate": 2.7163005007135867e-08, + "loss": 0.3583, "step": 136785 }, { - "epoch": 4.81, - "learning_rate": 1.8718924120488536e-07, - "loss": 0.2521, + "epoch": 4.929902331783616, + "grad_norm": 0.25747615098953247, + "learning_rate": 2.7027180715458177e-08, + "loss": 0.3979, "step": 136790 }, { - "epoch": 4.81, - "learning_rate": 1.8684144902478552e-07, - "loss": 0.2684, + "epoch": 4.930082531444841, + "grad_norm": 0.27800336480140686, + "learning_rate": 2.689169667570157e-08, + "loss": 0.3521, "step": 136795 }, { - "epoch": 4.81, - "learning_rate": 1.864939790271969e-07, - "loss": 0.2521, + "epoch": 4.930262731106065, + "grad_norm": 0.24476049840450287, + "learning_rate": 2.675655288970902e-08, + "loss": 0.3589, "step": 136800 }, { - "epoch": 4.81, - "learning_rate": 1.8614683121663257e-07, - "loss": 0.2539, + "epoch": 4.93044293076729, + "grad_norm": 0.21203723549842834, + 
"learning_rate": 2.6621749359326263e-08, + "loss": 0.3473, "step": 136805 }, { - "epoch": 4.81, - "learning_rate": 1.8580000559759726e-07, - "loss": 0.2546, + "epoch": 4.930623130428515, + "grad_norm": 0.25764790177345276, + "learning_rate": 2.6487286086385177e-08, + "loss": 0.3886, "step": 136810 }, { - "epoch": 4.81, - "learning_rate": 1.8545350217459846e-07, - "loss": 0.2537, + "epoch": 4.93080333008974, + "grad_norm": 0.2776869535446167, + "learning_rate": 2.63531630727204e-08, + "loss": 0.4101, "step": 136815 }, { - "epoch": 4.81, - "learning_rate": 1.851073209521298e-07, - "loss": 0.2552, + "epoch": 4.930983529750964, + "grad_norm": 0.23723161220550537, + "learning_rate": 2.6219380320158248e-08, + "loss": 0.3742, "step": 136820 }, { - "epoch": 4.81, - "learning_rate": 1.8476146193468768e-07, - "loss": 0.242, + "epoch": 4.931163729412189, + "grad_norm": 0.28152310848236084, + "learning_rate": 2.6085937830522266e-08, + "loss": 0.3533, "step": 136825 }, { - "epoch": 4.81, - "learning_rate": 1.8441592512676853e-07, - "loss": 0.2568, + "epoch": 4.931343929073414, + "grad_norm": 0.3157213032245636, + "learning_rate": 2.595283560562767e-08, + "loss": 0.3813, "step": 136830 }, { - "epoch": 4.81, - "learning_rate": 1.8407071053285208e-07, - "loss": 0.2641, + "epoch": 4.931524128734638, + "grad_norm": 0.22716785967350006, + "learning_rate": 2.582007364729522e-08, + "loss": 0.3488, "step": 136835 }, { - "epoch": 4.81, - "learning_rate": 1.8372581815742085e-07, - "loss": 0.2589, + "epoch": 4.931704328395862, + "grad_norm": 0.27288514375686646, + "learning_rate": 2.568765195732625e-08, + "loss": 0.3763, "step": 136840 }, { - "epoch": 4.81, - "learning_rate": 1.8338124800496014e-07, - "loss": 0.2387, + "epoch": 4.931884528057087, + "grad_norm": 0.22473356127738953, + "learning_rate": 2.5555570537527663e-08, + "loss": 0.3811, "step": 136845 }, { - "epoch": 4.81, - "learning_rate": 1.8303700007993585e-07, - "loss": 0.2526, + "epoch": 4.932064727718312, + "grad_norm": 0.25203046202659607, + "learning_rate": 2.5423829389700783e-08, + "loss": 0.3522, "step": 136850 }, { - "epoch": 4.81, - "learning_rate": 1.8269307438682214e-07, - "loss": 0.2391, + "epoch": 4.932244927379537, + "grad_norm": 0.24768032133579254, + "learning_rate": 2.5292428515638622e-08, + "loss": 0.3684, "step": 136855 }, { - "epoch": 4.82, - "learning_rate": 1.8234947093008214e-07, - "loss": 0.2204, + "epoch": 4.932425127040761, + "grad_norm": 0.23223379254341125, + "learning_rate": 2.5161367917131416e-08, + "loss": 0.3809, "step": 136860 }, { - "epoch": 4.82, - "learning_rate": 1.8200618971417892e-07, - "loss": 0.2157, + "epoch": 4.932605326701986, + "grad_norm": 0.24357058107852936, + "learning_rate": 2.5030647595963853e-08, + "loss": 0.3799, "step": 136865 }, { - "epoch": 4.82, - "learning_rate": 1.8166323074357284e-07, - "loss": 0.2753, + "epoch": 4.932785526363211, + "grad_norm": 0.3189919590950012, + "learning_rate": 2.4900267553920608e-08, + "loss": 0.3666, "step": 136870 }, { - "epoch": 4.82, - "learning_rate": 1.813205940227103e-07, - "loss": 0.2546, + "epoch": 4.932965726024435, + "grad_norm": 0.2528684437274933, + "learning_rate": 2.4770227792775268e-08, + "loss": 0.3559, "step": 136875 }, { - "epoch": 4.82, - "learning_rate": 1.8097827955604608e-07, - "loss": 0.2641, + "epoch": 4.933145925685659, + "grad_norm": 0.2626766562461853, + "learning_rate": 2.464052831429864e-08, + "loss": 0.4019, "step": 136880 }, { - "epoch": 4.82, - "learning_rate": 1.8063628734802107e-07, - "loss": 0.2514, + "epoch": 4.933326125346884, + 
"grad_norm": 0.2691434323787689, + "learning_rate": 2.4511169120261524e-08, + "loss": 0.3848, "step": 136885 }, { - "epoch": 4.82, - "learning_rate": 1.8036292561080804e-07, - "loss": 0.222, + "epoch": 4.933506325008109, + "grad_norm": 0.26981598138809204, + "learning_rate": 2.4382150212423625e-08, + "loss": 0.3692, "step": 136890 }, { - "epoch": 4.82, - "learning_rate": 1.8002151347952245e-07, - "loss": 0.2635, + "epoch": 4.9336865246693336, + "grad_norm": 0.2676762342453003, + "learning_rate": 2.425347159254465e-08, + "loss": 0.3717, "step": 136895 }, { - "epoch": 4.82, - "learning_rate": 1.7968042361930227e-07, - "loss": 0.2581, + "epoch": 4.933866724330558, + "grad_norm": 0.2530960738658905, + "learning_rate": 2.4125133262373202e-08, + "loss": 0.3849, "step": 136900 }, { - "epoch": 4.82, - "learning_rate": 1.7933965603457447e-07, - "loss": 0.2634, + "epoch": 4.934046923991783, + "grad_norm": 0.2379455268383026, + "learning_rate": 2.3997135223663424e-08, + "loss": 0.3896, "step": 136905 }, { - "epoch": 4.82, - "learning_rate": 1.7899921072976332e-07, - "loss": 0.2852, + "epoch": 4.934227123653008, + "grad_norm": 0.29824960231781006, + "learning_rate": 2.3869477478158374e-08, + "loss": 0.3574, "step": 136910 }, { - "epoch": 4.82, - "learning_rate": 1.7865908770929306e-07, - "loss": 0.2385, + "epoch": 4.9344073233142325, + "grad_norm": 0.21034584939479828, + "learning_rate": 2.374216002759555e-08, + "loss": 0.3759, "step": 136915 }, { - "epoch": 4.82, - "learning_rate": 1.7831928697757404e-07, - "loss": 0.2633, + "epoch": 4.934587522975457, + "grad_norm": 0.28804388642311096, + "learning_rate": 2.3615182873709673e-08, + "loss": 0.386, "step": 136920 }, { - "epoch": 4.82, - "learning_rate": 1.7797980853901942e-07, - "loss": 0.2316, + "epoch": 4.934767722636681, + "grad_norm": 0.25442805886268616, + "learning_rate": 2.3488546018232692e-08, + "loss": 0.39, "step": 136925 }, { - "epoch": 4.82, - "learning_rate": 1.776406523980395e-07, - "loss": 0.2528, + "epoch": 4.934947922297906, + "grad_norm": 0.2353426218032837, + "learning_rate": 2.3362249462885454e-08, + "loss": 0.3843, "step": 136930 }, { - "epoch": 4.82, - "learning_rate": 1.7730181855903916e-07, - "loss": 0.264, + "epoch": 4.9351281219591305, + "grad_norm": 0.22822360694408417, + "learning_rate": 2.3236293209394354e-08, + "loss": 0.3774, "step": 136935 }, { - "epoch": 4.82, - "learning_rate": 1.7696330702641482e-07, - "loss": 0.2626, + "epoch": 4.935308321620355, + "grad_norm": 0.21662981808185577, + "learning_rate": 2.311067725947469e-08, + "loss": 0.3724, "step": 136940 }, { - "epoch": 4.82, - "learning_rate": 1.7662511780456304e-07, - "loss": 0.2429, + "epoch": 4.93548852128158, + "grad_norm": 0.2427043616771698, + "learning_rate": 2.2985401614833422e-08, + "loss": 0.3599, "step": 136945 }, { - "epoch": 4.82, - "learning_rate": 1.762872508978719e-07, - "loss": 0.256, + "epoch": 4.935668720942805, + "grad_norm": 0.2436215728521347, + "learning_rate": 2.2860466277180303e-08, + "loss": 0.3463, "step": 136950 }, { - "epoch": 4.82, - "learning_rate": 1.7594970631073515e-07, - "loss": 0.2664, + "epoch": 4.9358489206040295, + "grad_norm": 0.18670439720153809, + "learning_rate": 2.273587124821952e-08, + "loss": 0.3419, "step": 136955 }, { - "epoch": 4.82, - "learning_rate": 1.756124840475326e-07, - "loss": 0.238, + "epoch": 4.936029120265254, + "grad_norm": 0.23611606657505035, + "learning_rate": 2.2611616529644164e-08, + "loss": 0.374, "step": 136960 }, { - "epoch": 4.82, - "learning_rate": 1.7527558411263856e-07, - "loss": 0.232, + 
"epoch": 4.936209319926478, + "grad_norm": 0.27556440234184265, + "learning_rate": 2.2487702123152877e-08, + "loss": 0.3414, "step": 136965 }, { - "epoch": 4.82, - "learning_rate": 1.7493900651043282e-07, - "loss": 0.2588, + "epoch": 4.936389519587703, + "grad_norm": 0.2503463625907898, + "learning_rate": 2.2364128030430422e-08, + "loss": 0.3384, "step": 136970 }, { - "epoch": 4.82, - "learning_rate": 1.7460275124528413e-07, - "loss": 0.2515, + "epoch": 4.9365697192489275, + "grad_norm": 0.2716796398162842, + "learning_rate": 2.2240894253158785e-08, + "loss": 0.3669, "step": 136975 }, { - "epoch": 4.82, - "learning_rate": 1.7426681832155844e-07, - "loss": 0.2641, + "epoch": 4.936749918910152, + "grad_norm": 0.2600260376930237, + "learning_rate": 2.2118000793019956e-08, + "loss": 0.4075, "step": 136980 }, { - "epoch": 4.82, - "learning_rate": 1.7393120774361338e-07, - "loss": 0.2433, + "epoch": 4.936930118571377, + "grad_norm": 0.20133672654628754, + "learning_rate": 2.1995447651687595e-08, + "loss": 0.3881, "step": 136985 }, { - "epoch": 4.82, - "learning_rate": 1.7359591951581489e-07, - "loss": 0.259, + "epoch": 4.937110318232602, + "grad_norm": 0.268351674079895, + "learning_rate": 2.187323483083259e-08, + "loss": 0.355, "step": 136990 }, { - "epoch": 4.82, - "learning_rate": 1.7326095364251226e-07, - "loss": 0.2382, + "epoch": 4.9372905178938264, + "grad_norm": 0.24614043533802032, + "learning_rate": 2.1751362332117496e-08, + "loss": 0.3716, "step": 136995 }, { - "epoch": 4.82, - "learning_rate": 1.7292631012805204e-07, - "loss": 0.2548, + "epoch": 4.937470717555051, + "grad_norm": 0.24108512699604034, + "learning_rate": 2.162983015720488e-08, + "loss": 0.3567, "step": 137000 }, { - "epoch": 4.82, - "eval_loss": 0.24844510853290558, - "eval_runtime": 10.5423, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 4.937470717555051, + "eval_loss": 0.4288177788257599, + "eval_runtime": 3.5288, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 7.085, "step": 137000 }, { - "epoch": 4.82, - "learning_rate": 1.7259198897678352e-07, - "loss": 0.2178, + "epoch": 4.937650917216276, + "grad_norm": 0.2266797125339508, + "learning_rate": 2.150863830774896e-08, + "loss": 0.3821, "step": 137005 }, { - "epoch": 4.82, - "learning_rate": 1.722579901930449e-07, - "loss": 0.2575, + "epoch": 4.937831116877501, + "grad_norm": 0.2394786775112152, + "learning_rate": 2.138778678540121e-08, + "loss": 0.3445, "step": 137010 }, { - "epoch": 4.82, - "learning_rate": 1.7192431378117714e-07, - "loss": 0.2518, + "epoch": 4.938011316538725, + "grad_norm": 0.2852950394153595, + "learning_rate": 2.12672755918103e-08, + "loss": 0.3695, "step": 137015 }, { - "epoch": 4.82, - "learning_rate": 1.7159095974550731e-07, - "loss": 0.2554, + "epoch": 4.938191516199949, + "grad_norm": 0.270437628030777, + "learning_rate": 2.1147104728616584e-08, + "loss": 0.3827, "step": 137020 }, { - "epoch": 4.82, - "learning_rate": 1.7125792809036533e-07, - "loss": 0.2502, + "epoch": 4.938371715861174, + "grad_norm": 0.26983991265296936, + "learning_rate": 2.1027274197457646e-08, + "loss": 0.3773, "step": 137025 }, { - "epoch": 4.82, - "learning_rate": 1.7092521882007827e-07, - "loss": 0.2334, + "epoch": 4.938551915522399, + "grad_norm": 0.2832012474536896, + "learning_rate": 2.0907783999965515e-08, + "loss": 0.3651, "step": 137030 }, { - "epoch": 4.82, - "learning_rate": 1.7059283193896492e-07, - "loss": 0.2553, + "epoch": 4.938732115183623, + "grad_norm": 0.28883448243141174, + "learning_rate": 
2.0788634137769437e-08, + "loss": 0.3767, "step": 137035 }, { - "epoch": 4.82, - "learning_rate": 1.7026076745134123e-07, - "loss": 0.2533, + "epoch": 4.938912314844848, + "grad_norm": 0.29116883873939514, + "learning_rate": 2.066982461249034e-08, + "loss": 0.3505, "step": 137040 }, { - "epoch": 4.82, - "learning_rate": 1.699290253615149e-07, - "loss": 0.2623, + "epoch": 4.939092514506073, + "grad_norm": 0.2609765827655792, + "learning_rate": 2.0551355425749154e-08, + "loss": 0.3741, "step": 137045 }, { - "epoch": 4.82, - "learning_rate": 1.6959760567379912e-07, - "loss": 0.2666, + "epoch": 4.939272714167298, + "grad_norm": 0.23762352764606476, + "learning_rate": 2.0433226579161247e-08, + "loss": 0.402, "step": 137050 }, { - "epoch": 4.82, - "learning_rate": 1.6926650839249325e-07, - "loss": 0.245, + "epoch": 4.939452913828522, + "grad_norm": 0.23419687151908875, + "learning_rate": 2.031543807433367e-08, + "loss": 0.3582, "step": 137055 }, { - "epoch": 4.82, - "learning_rate": 1.6893573352189663e-07, - "loss": 0.2505, + "epoch": 4.939633113489746, + "grad_norm": 0.23861204087734222, + "learning_rate": 2.0197989912873473e-08, + "loss": 0.3914, "step": 137060 }, { - "epoch": 4.82, - "learning_rate": 1.686052810663058e-07, - "loss": 0.2386, + "epoch": 4.939813313150971, + "grad_norm": 0.2407989501953125, + "learning_rate": 2.0080882096376595e-08, + "loss": 0.3529, "step": 137065 }, { - "epoch": 4.82, - "learning_rate": 1.682751510300118e-07, - "loss": 0.2312, + "epoch": 4.939993512812196, + "grad_norm": 0.2740454375743866, + "learning_rate": 1.9964114626441764e-08, + "loss": 0.3948, "step": 137070 }, { - "epoch": 4.82, - "learning_rate": 1.6794534341730006e-07, - "loss": 0.2437, + "epoch": 4.94017371247342, + "grad_norm": 0.28621092438697815, + "learning_rate": 1.984768750466215e-08, + "loss": 0.3807, "step": 137075 }, { - "epoch": 4.82, - "learning_rate": 1.676158582324533e-07, - "loss": 0.2555, + "epoch": 4.940353912134645, + "grad_norm": 0.26705020666122437, + "learning_rate": 1.9731600732619816e-08, + "loss": 0.4056, "step": 137080 }, { - "epoch": 4.82, - "learning_rate": 1.672866954797514e-07, - "loss": 0.2692, + "epoch": 4.94053411179587, + "grad_norm": 0.27954116463661194, + "learning_rate": 1.961585431189683e-08, + "loss": 0.3494, "step": 137085 }, { - "epoch": 4.82, - "learning_rate": 1.6695785516346318e-07, - "loss": 0.2638, + "epoch": 4.940714311457095, + "grad_norm": 0.2583678662776947, + "learning_rate": 1.9500448244072487e-08, + "loss": 0.4074, "step": 137090 }, { - "epoch": 4.82, - "learning_rate": 1.6662933728786302e-07, - "loss": 0.2389, + "epoch": 4.940894511118319, + "grad_norm": 0.27501171827316284, + "learning_rate": 1.9385382530717755e-08, + "loss": 0.3644, "step": 137095 }, { - "epoch": 4.82, - "learning_rate": 1.6630114185721413e-07, - "loss": 0.2585, + "epoch": 4.941074710779544, + "grad_norm": 0.2305932641029358, + "learning_rate": 1.9270657173403593e-08, + "loss": 0.3497, "step": 137100 }, { - "epoch": 4.82, - "learning_rate": 1.659732688757798e-07, - "loss": 0.2204, + "epoch": 4.941254910440769, + "grad_norm": 0.23129644989967346, + "learning_rate": 1.9156272173687094e-08, + "loss": 0.3861, "step": 137105 }, { - "epoch": 4.82, - "learning_rate": 1.6564571834781495e-07, - "loss": 0.2584, + "epoch": 4.941435110101993, + "grad_norm": 0.295730322599411, + "learning_rate": 1.9042227533130897e-08, + "loss": 0.3406, "step": 137110 }, { - "epoch": 4.82, - "learning_rate": 1.6531849027757728e-07, - "loss": 0.2436, + "epoch": 4.941615309763217, + "grad_norm": 
0.2581028938293457, + "learning_rate": 1.8928523253286536e-08, + "loss": 0.3465, "step": 137115 }, { - "epoch": 4.82, - "learning_rate": 1.6499158466930786e-07, - "loss": 0.2404, + "epoch": 4.941795509424442, + "grad_norm": 0.21017776429653168, + "learning_rate": 1.881515933570832e-08, + "loss": 0.3547, "step": 137120 }, { - "epoch": 4.82, - "learning_rate": 1.6466500152725882e-07, - "loss": 0.225, + "epoch": 4.941975709085667, + "grad_norm": 0.24874837696552277, + "learning_rate": 1.8702135781933917e-08, + "loss": 0.3988, "step": 137125 }, { - "epoch": 4.82, - "learning_rate": 1.6433874085566848e-07, - "loss": 0.2519, + "epoch": 4.942155908746892, + "grad_norm": 0.2943408191204071, + "learning_rate": 1.858945259350653e-08, + "loss": 0.3855, "step": 137130 }, { - "epoch": 4.82, - "learning_rate": 1.640128026587695e-07, - "loss": 0.2528, + "epoch": 4.942336108408116, + "grad_norm": 0.27087777853012085, + "learning_rate": 1.8477109771963817e-08, + "loss": 0.381, "step": 137135 }, { - "epoch": 4.82, - "learning_rate": 1.6368718694079744e-07, - "loss": 0.2454, + "epoch": 4.942516308069341, + "grad_norm": 0.2375209629535675, + "learning_rate": 1.8365107318829568e-08, + "loss": 0.3586, "step": 137140 }, { - "epoch": 4.83, - "learning_rate": 1.633618937059822e-07, - "loss": 0.2469, + "epoch": 4.942696507730566, + "grad_norm": 0.2501397430896759, + "learning_rate": 1.8253445235638654e-08, + "loss": 0.3695, "step": 137145 }, { - "epoch": 4.83, - "learning_rate": 1.6303692295854268e-07, - "loss": 0.2677, + "epoch": 4.94287670739179, + "grad_norm": 0.2996070981025696, + "learning_rate": 1.8142123523903765e-08, + "loss": 0.4018, "step": 137150 }, { - "epoch": 4.83, - "learning_rate": 1.6271227470270046e-07, - "loss": 0.2444, + "epoch": 4.943056907053014, + "grad_norm": 0.3034278154373169, + "learning_rate": 1.8031142185148674e-08, + "loss": 0.3636, "step": 137155 }, { - "epoch": 4.83, - "learning_rate": 1.6238794894267163e-07, - "loss": 0.2543, + "epoch": 4.943237106714239, + "grad_norm": 0.2226463109254837, + "learning_rate": 1.792050122088329e-08, + "loss": 0.3754, "step": 137160 }, { - "epoch": 4.83, - "learning_rate": 1.6206394568266393e-07, - "loss": 0.2486, + "epoch": 4.943417306375464, + "grad_norm": 0.24234063923358917, + "learning_rate": 1.7810200632611963e-08, + "loss": 0.3599, "step": 137165 }, { - "epoch": 4.83, - "learning_rate": 1.6174026492689065e-07, - "loss": 0.2558, + "epoch": 4.943597506036689, + "grad_norm": 0.25728529691696167, + "learning_rate": 1.7700240421841817e-08, + "loss": 0.3817, "step": 137170 }, { - "epoch": 4.83, - "learning_rate": 1.6141690667954846e-07, - "loss": 0.2419, + "epoch": 4.943777705697913, + "grad_norm": 0.22455373406410217, + "learning_rate": 1.759062059006611e-08, + "loss": 0.3744, "step": 137175 }, { - "epoch": 4.83, - "learning_rate": 1.6109387094483953e-07, - "loss": 0.259, + "epoch": 4.943957905359138, + "grad_norm": 0.2268945425748825, + "learning_rate": 1.7481341138783634e-08, + "loss": 0.3862, "step": 137180 }, { - "epoch": 4.83, - "learning_rate": 1.6077115772695773e-07, - "loss": 0.247, + "epoch": 4.944138105020363, + "grad_norm": 0.27768200635910034, + "learning_rate": 1.7372402069482097e-08, + "loss": 0.3813, "step": 137185 }, { - "epoch": 4.83, - "learning_rate": 1.6044876703009414e-07, - "loss": 0.2441, + "epoch": 4.9443183046815875, + "grad_norm": 0.256106436252594, + "learning_rate": 1.726380338364364e-08, + "loss": 0.3873, "step": 137190 }, { - "epoch": 4.83, - "learning_rate": 1.6012669885843158e-07, - "loss": 0.2417, + "epoch": 
4.944498504342812, + "grad_norm": 0.21937619149684906, + "learning_rate": 1.7155545082750413e-08, + "loss": 0.3762, "step": 137195 }, { - "epoch": 4.83, - "learning_rate": 1.5980495321615273e-07, - "loss": 0.2485, + "epoch": 4.944678704004036, + "grad_norm": 0.2467588484287262, + "learning_rate": 1.704762716827346e-08, + "loss": 0.3593, "step": 137200 }, { - "epoch": 4.83, - "learning_rate": 1.5948353010743765e-07, - "loss": 0.2444, + "epoch": 4.944858903665261, + "grad_norm": 0.282312273979187, + "learning_rate": 1.6940049641686605e-08, + "loss": 0.3672, "step": 137205 }, { - "epoch": 4.83, - "learning_rate": 1.59162429536458e-07, - "loss": 0.2523, + "epoch": 4.9450391033264856, + "grad_norm": 0.2458559274673462, + "learning_rate": 1.683281250445534e-08, + "loss": 0.3546, "step": 137210 }, { - "epoch": 4.83, - "learning_rate": 1.5884165150738538e-07, - "loss": 0.2692, + "epoch": 4.94521930298771, + "grad_norm": 0.2797468602657318, + "learning_rate": 1.6725915758039612e-08, + "loss": 0.3572, "step": 137215 }, { - "epoch": 4.83, - "learning_rate": 1.5852119602438043e-07, - "loss": 0.2524, + "epoch": 4.945399502648935, + "grad_norm": 0.2865085005760193, + "learning_rate": 1.661935940389936e-08, + "loss": 0.3809, "step": 137220 }, { - "epoch": 4.83, - "learning_rate": 1.5820106309160642e-07, - "loss": 0.2647, + "epoch": 4.94557970231016, + "grad_norm": 0.2759266495704651, + "learning_rate": 1.6513143443477873e-08, + "loss": 0.3687, "step": 137225 }, { - "epoch": 4.83, - "learning_rate": 1.578812527132212e-07, - "loss": 0.247, + "epoch": 4.9457599019713845, + "grad_norm": 0.2847881615161896, + "learning_rate": 1.6407267878232326e-08, + "loss": 0.3586, "step": 137230 }, { - "epoch": 4.83, - "learning_rate": 1.5756176489337693e-07, - "loss": 0.2237, + "epoch": 4.945940101632609, + "grad_norm": 0.2958735227584839, + "learning_rate": 1.630173270960045e-08, + "loss": 0.3507, "step": 137235 }, { - "epoch": 4.83, - "learning_rate": 1.5724259963622035e-07, - "loss": 0.2317, + "epoch": 4.946120301293834, + "grad_norm": 0.29240357875823975, + "learning_rate": 1.619653793901721e-08, + "loss": 0.3602, "step": 137240 }, { - "epoch": 4.83, - "learning_rate": 1.5692375694589813e-07, - "loss": 0.2366, + "epoch": 4.946300500955058, + "grad_norm": 0.25079208612442017, + "learning_rate": 1.609168356792312e-08, + "loss": 0.3342, "step": 137245 }, { - "epoch": 4.83, - "learning_rate": 1.566052368265486e-07, - "loss": 0.261, + "epoch": 4.9464807006162825, + "grad_norm": 0.2954399287700653, + "learning_rate": 1.598716959773927e-08, + "loss": 0.3812, "step": 137250 }, { - "epoch": 4.83, - "learning_rate": 1.562870392823046e-07, - "loss": 0.2254, + "epoch": 4.946660900277507, + "grad_norm": 0.28976091742515564, + "learning_rate": 1.588299602989507e-08, + "loss": 0.3823, "step": 137255 }, { - "epoch": 4.83, - "learning_rate": 1.5596916431730447e-07, - "loss": 0.2487, + "epoch": 4.946841099938732, + "grad_norm": 0.328617125749588, + "learning_rate": 1.5779162865806053e-08, + "loss": 0.3833, "step": 137260 }, { - "epoch": 4.83, - "learning_rate": 1.556516119356699e-07, - "loss": 0.2459, + "epoch": 4.947021299599957, + "grad_norm": 0.2867159843444824, + "learning_rate": 1.5675670106890527e-08, + "loss": 0.3928, "step": 137265 }, { - "epoch": 4.83, - "learning_rate": 1.553343821415254e-07, - "loss": 0.2599, + "epoch": 4.9472014992611815, + "grad_norm": 0.221615731716156, + "learning_rate": 1.557251775455848e-08, + "loss": 0.3601, "step": 137270 }, { - "epoch": 4.83, - "learning_rate": 1.5501747493899266e-07, - "loss": 
0.2809, + "epoch": 4.947381698922406, + "grad_norm": 0.17245341837406158, + "learning_rate": 1.5469705810211566e-08, + "loss": 0.3856, "step": 137275 }, { - "epoch": 4.83, - "learning_rate": 1.5470089033218504e-07, - "loss": 0.2483, + "epoch": 4.947561898583631, + "grad_norm": 0.23257580399513245, + "learning_rate": 1.5367234275254215e-08, + "loss": 0.391, "step": 137280 }, { - "epoch": 4.83, - "learning_rate": 1.543846283252104e-07, - "loss": 0.2593, + "epoch": 4.947742098244856, + "grad_norm": 0.2740226984024048, + "learning_rate": 1.5265103151079762e-08, + "loss": 0.3524, "step": 137285 }, { - "epoch": 4.83, - "learning_rate": 1.5406868892217928e-07, - "loss": 0.2499, + "epoch": 4.94792229790608, + "grad_norm": 0.23505844175815582, + "learning_rate": 1.5163312439081533e-08, + "loss": 0.3628, "step": 137290 }, { - "epoch": 4.83, - "learning_rate": 1.5375307212719125e-07, - "loss": 0.2366, + "epoch": 4.948102497567304, + "grad_norm": 0.26007720828056335, + "learning_rate": 1.5061862140644535e-08, + "loss": 0.3935, "step": 137295 }, { - "epoch": 4.83, - "learning_rate": 1.5343777794434577e-07, - "loss": 0.2279, + "epoch": 4.948282697228529, + "grad_norm": 0.24692949652671814, + "learning_rate": 1.4960752257153764e-08, + "loss": 0.3707, "step": 137300 }, { - "epoch": 4.83, - "learning_rate": 1.531228063777368e-07, - "loss": 0.2305, + "epoch": 4.948462896889754, + "grad_norm": 0.25878119468688965, + "learning_rate": 1.4859982789985905e-08, + "loss": 0.3508, "step": 137305 }, { - "epoch": 4.83, - "learning_rate": 1.5280815743144993e-07, - "loss": 0.2283, + "epoch": 4.9486430965509784, + "grad_norm": 0.3022497892379761, + "learning_rate": 1.4759553740512078e-08, + "loss": 0.3793, "step": 137310 }, { - "epoch": 4.83, - "learning_rate": 1.5249383110957915e-07, - "loss": 0.2718, + "epoch": 4.948823296212203, + "grad_norm": 0.21404872834682465, + "learning_rate": 1.4659465110103409e-08, + "loss": 0.3658, "step": 137315 }, { - "epoch": 4.83, - "learning_rate": 1.5217982741619897e-07, - "loss": 0.2493, + "epoch": 4.949003495873428, + "grad_norm": 0.23077760636806488, + "learning_rate": 1.4559716900122699e-08, + "loss": 0.3532, "step": 137320 }, { - "epoch": 4.83, - "learning_rate": 1.5186614635538664e-07, - "loss": 0.2549, + "epoch": 4.949183695534653, + "grad_norm": 0.29384756088256836, + "learning_rate": 1.4460309111927194e-08, + "loss": 0.3861, "step": 137325 }, { - "epoch": 4.83, - "learning_rate": 1.5155278793121953e-07, - "loss": 0.2453, + "epoch": 4.949363895195877, + "grad_norm": 0.26955410838127136, + "learning_rate": 1.4361241746874143e-08, + "loss": 0.3721, "step": 137330 }, { - "epoch": 4.83, - "learning_rate": 1.5123975214776098e-07, - "loss": 0.2457, + "epoch": 4.949544094857101, + "grad_norm": 0.3485639989376068, + "learning_rate": 1.426251480630969e-08, + "loss": 0.4169, "step": 137335 }, { - "epoch": 4.83, - "learning_rate": 1.5092703900908e-07, - "loss": 0.2334, + "epoch": 4.949724294518326, + "grad_norm": 0.2870672643184662, + "learning_rate": 1.4164128291582757e-08, + "loss": 0.344, "step": 137340 }, { - "epoch": 4.83, - "learning_rate": 1.5061464851923445e-07, - "loss": 0.2459, + "epoch": 4.949904494179551, + "grad_norm": 0.2651885449886322, + "learning_rate": 1.4066082204031162e-08, + "loss": 0.386, "step": 137345 }, { - "epoch": 4.83, - "learning_rate": 1.503025806822822e-07, - "loss": 0.251, + "epoch": 4.950084693840775, + "grad_norm": 0.21898417174816132, + "learning_rate": 1.3968376544992723e-08, + "loss": 0.3918, "step": 137350 }, { - "epoch": 4.83, - "learning_rate": 
1.4999083550227277e-07, - "loss": 0.2453, + "epoch": 4.950264893502, + "grad_norm": 0.23120352625846863, + "learning_rate": 1.3871011315796934e-08, + "loss": 0.3546, "step": 137355 }, { - "epoch": 4.83, - "learning_rate": 1.496794129832585e-07, - "loss": 0.2514, + "epoch": 4.950445093163225, + "grad_norm": 0.2565666437149048, + "learning_rate": 1.377398651777051e-08, + "loss": 0.3648, "step": 137360 }, { - "epoch": 4.83, - "learning_rate": 1.4936831312927778e-07, - "loss": 0.2441, + "epoch": 4.95062529282445, + "grad_norm": 0.26287904381752014, + "learning_rate": 1.3677302152237393e-08, + "loss": 0.3705, "step": 137365 }, { - "epoch": 4.83, - "learning_rate": 1.4905753594437465e-07, - "loss": 0.2444, + "epoch": 4.950805492485674, + "grad_norm": 0.2553930878639221, + "learning_rate": 1.3580958220513195e-08, + "loss": 0.3652, "step": 137370 }, { - "epoch": 4.83, - "learning_rate": 1.487470814325792e-07, - "loss": 0.2529, + "epoch": 4.950985692146899, + "grad_norm": 0.2285178005695343, + "learning_rate": 1.3484954723910759e-08, + "loss": 0.3703, "step": 137375 }, { - "epoch": 4.83, - "learning_rate": 1.484369495979271e-07, - "loss": 0.2412, + "epoch": 4.951165891808124, + "grad_norm": 0.2720915377140045, + "learning_rate": 1.3389291663737368e-08, + "loss": 0.3932, "step": 137380 }, { - "epoch": 4.83, - "learning_rate": 1.4812714044444286e-07, - "loss": 0.2791, + "epoch": 4.951346091469348, + "grad_norm": 0.26943936944007874, + "learning_rate": 1.3293969041297538e-08, + "loss": 0.3849, "step": 137385 }, { - "epoch": 4.83, - "learning_rate": 1.4781765397614834e-07, - "loss": 0.2492, + "epoch": 4.951526291130572, + "grad_norm": 0.2463103085756302, + "learning_rate": 1.3198986857890227e-08, + "loss": 0.3596, "step": 137390 }, { - "epoch": 4.83, - "learning_rate": 1.4750849019706526e-07, - "loss": 0.2469, + "epoch": 4.951706490791797, + "grad_norm": 0.3053506016731262, + "learning_rate": 1.3104345114808848e-08, + "loss": 0.3752, "step": 137395 }, { - "epoch": 4.83, - "learning_rate": 1.4719964911120433e-07, - "loss": 0.2257, + "epoch": 4.951886690453022, + "grad_norm": 0.24420666694641113, + "learning_rate": 1.301004381334403e-08, + "loss": 0.3704, "step": 137400 }, { - "epoch": 4.83, - "learning_rate": 1.46891130722579e-07, - "loss": 0.2551, + "epoch": 4.952066890114247, + "grad_norm": 0.26522696018218994, + "learning_rate": 1.2916082954780861e-08, + "loss": 0.372, "step": 137405 }, { - "epoch": 4.83, - "learning_rate": 1.465829350351944e-07, - "loss": 0.2612, + "epoch": 4.952247089775471, + "grad_norm": 0.2860383987426758, + "learning_rate": 1.2822462540396096e-08, + "loss": 0.3768, "step": 137410 }, { - "epoch": 4.83, - "learning_rate": 1.462750620530501e-07, - "loss": 0.253, + "epoch": 4.952427289436696, + "grad_norm": 0.21746990084648132, + "learning_rate": 1.2729182571466492e-08, + "loss": 0.3822, "step": 137415 }, { - "epoch": 4.83, - "learning_rate": 1.4596751178014567e-07, - "loss": 0.2282, + "epoch": 4.952607489097921, + "grad_norm": 0.24215134978294373, + "learning_rate": 1.2636243049266028e-08, + "loss": 0.3612, "step": 137420 }, { - "epoch": 4.83, - "learning_rate": 1.4566028422046962e-07, - "loss": 0.265, + "epoch": 4.952787688759145, + "grad_norm": 0.23102861642837524, + "learning_rate": 1.254364397506036e-08, + "loss": 0.3624, "step": 137425 }, { - "epoch": 4.84, - "learning_rate": 1.4535337937801875e-07, - "loss": 0.2416, + "epoch": 4.952967888420369, + "grad_norm": 0.2532614767551422, + "learning_rate": 1.245138535010959e-08, + "loss": 0.3882, "step": 137430 }, { - "epoch": 4.84, 
- "learning_rate": 1.4504679725677594e-07, - "loss": 0.2468, + "epoch": 4.953148088081594, + "grad_norm": 0.25287774205207825, + "learning_rate": 1.235946717566827e-08, + "loss": 0.4318, "step": 137435 }, { - "epoch": 4.84, - "learning_rate": 1.4474053786071862e-07, - "loss": 0.2627, + "epoch": 4.953328287742819, + "grad_norm": 0.26038363575935364, + "learning_rate": 1.226788945299373e-08, + "loss": 0.3708, "step": 137440 }, { - "epoch": 4.84, - "learning_rate": 1.4443460119382413e-07, - "loss": 0.2615, + "epoch": 4.953508487404044, + "grad_norm": 0.2988220453262329, + "learning_rate": 1.2176652183329418e-08, + "loss": 0.3655, "step": 137445 }, { - "epoch": 4.84, - "learning_rate": 1.4412898726006708e-07, - "loss": 0.2619, + "epoch": 4.953688687065268, + "grad_norm": 0.3282366096973419, + "learning_rate": 1.2085755367921558e-08, + "loss": 0.3618, "step": 137450 }, { - "epoch": 4.84, - "learning_rate": 1.4382369606341372e-07, - "loss": 0.2418, + "epoch": 4.953868886726493, + "grad_norm": 0.31220465898513794, + "learning_rate": 1.1995199008008051e-08, + "loss": 0.3505, "step": 137455 }, { - "epoch": 4.84, - "learning_rate": 1.4351872760783036e-07, - "loss": 0.2381, + "epoch": 4.954049086387718, + "grad_norm": 0.32093510031700134, + "learning_rate": 1.1904983104821243e-08, + "loss": 0.3727, "step": 137460 }, { - "epoch": 4.84, - "learning_rate": 1.4321408189727214e-07, - "loss": 0.2456, + "epoch": 4.9542292860489425, + "grad_norm": 0.2403687983751297, + "learning_rate": 1.1815107659590707e-08, + "loss": 0.3588, "step": 137465 }, { - "epoch": 4.84, - "learning_rate": 1.4290975893570258e-07, - "loss": 0.2547, + "epoch": 4.954409485710167, + "grad_norm": 0.21455620229244232, + "learning_rate": 1.172557267354324e-08, + "loss": 0.3521, "step": 137470 }, { - "epoch": 4.84, - "learning_rate": 1.426057587270657e-07, - "loss": 0.2227, + "epoch": 4.954589685371392, + "grad_norm": 0.26999422907829285, + "learning_rate": 1.1636378147897308e-08, + "loss": 0.3487, "step": 137475 }, { - "epoch": 4.84, - "learning_rate": 1.423020812753112e-07, - "loss": 0.2465, + "epoch": 4.954769885032616, + "grad_norm": 0.2714780569076538, + "learning_rate": 1.1547524083865835e-08, + "loss": 0.3968, "step": 137480 }, { - "epoch": 4.84, - "learning_rate": 1.4199872658438308e-07, - "loss": 0.2424, + "epoch": 4.954950084693841, + "grad_norm": 0.3368239104747772, + "learning_rate": 1.1459010482661735e-08, + "loss": 0.3995, "step": 137485 }, { - "epoch": 4.84, - "learning_rate": 1.4169569465821708e-07, - "loss": 0.2548, + "epoch": 4.955130284355065, + "grad_norm": 0.24230121076107025, + "learning_rate": 1.137083734549238e-08, + "loss": 0.3657, "step": 137490 }, { - "epoch": 4.84, - "learning_rate": 1.4139298550075174e-07, - "loss": 0.2368, + "epoch": 4.95531048401629, + "grad_norm": 0.20803499221801758, + "learning_rate": 1.1283004673554032e-08, + "loss": 0.3669, "step": 137495 }, { - "epoch": 4.84, - "learning_rate": 1.4109059911591726e-07, - "loss": 0.2584, + "epoch": 4.955490683677515, + "grad_norm": 0.19922496378421783, + "learning_rate": 1.1195512468048508e-08, + "loss": 0.3611, "step": 137500 }, { - "epoch": 4.84, - "eval_loss": 0.24842242896556854, - "eval_runtime": 10.5421, - "eval_samples_per_second": 9.486, - "eval_steps_per_second": 9.486, + "epoch": 4.955490683677515, + "eval_loss": 0.42881152033805847, + "eval_runtime": 3.5375, + "eval_samples_per_second": 28.268, + "eval_steps_per_second": 7.067, "step": 137500 }, { - "epoch": 4.84, - "learning_rate": 1.4078853550763827e-07, - "loss": 0.2424, + "epoch": 
4.9556708833387395, + "grad_norm": 0.2823123335838318, + "learning_rate": 1.1108360730169299e-08, + "loss": 0.3637, "step": 137505 }, { - "epoch": 4.84, - "learning_rate": 1.404867946798366e-07, - "loss": 0.2437, + "epoch": 4.955851082999964, + "grad_norm": 0.2755952775478363, + "learning_rate": 1.1021549461096015e-08, + "loss": 0.3775, "step": 137510 }, { - "epoch": 4.84, - "learning_rate": 1.4018537663643138e-07, - "loss": 0.2562, + "epoch": 4.956031282661189, + "grad_norm": 0.24063868820667267, + "learning_rate": 1.093507866201937e-08, + "loss": 0.3269, "step": 137515 }, { - "epoch": 4.84, - "learning_rate": 1.3988428138133614e-07, - "loss": 0.2518, + "epoch": 4.956211482322413, + "grad_norm": 0.20716242492198944, + "learning_rate": 1.0848948334113428e-08, + "loss": 0.3577, "step": 137520 }, { - "epoch": 4.84, - "learning_rate": 1.3958350891845884e-07, - "loss": 0.2405, + "epoch": 4.9563916819836376, + "grad_norm": 0.30676987767219543, + "learning_rate": 1.0763158478549474e-08, + "loss": 0.3848, "step": 137525 }, { - "epoch": 4.84, - "learning_rate": 1.3928305925170748e-07, - "loss": 0.265, + "epoch": 4.956571881644862, + "grad_norm": 0.2790820896625519, + "learning_rate": 1.0677709096501565e-08, + "loss": 0.354, "step": 137530 }, { - "epoch": 4.84, - "learning_rate": 1.3898293238498173e-07, - "loss": 0.2407, + "epoch": 4.956752081306087, + "grad_norm": 0.22698865830898285, + "learning_rate": 1.0609594732577389e-08, + "loss": 0.3403, "step": 137535 }, { - "epoch": 4.84, - "learning_rate": 1.3868312832217566e-07, - "loss": 0.243, + "epoch": 4.956932280967312, + "grad_norm": 0.19917847216129303, + "learning_rate": 1.0524758205784913e-08, + "loss": 0.3678, "step": 137540 }, { - "epoch": 4.84, - "learning_rate": 1.3838364706718898e-07, - "loss": 0.2689, + "epoch": 4.9571124806285365, + "grad_norm": 0.2684897482395172, + "learning_rate": 1.0440262155755886e-08, + "loss": 0.3663, "step": 137545 }, { - "epoch": 4.84, - "learning_rate": 1.3808448862390467e-07, - "loss": 0.2632, + "epoch": 4.957292680289761, + "grad_norm": 0.24667024612426758, + "learning_rate": 1.0356106583639391e-08, + "loss": 0.3596, "step": 137550 }, { - "epoch": 4.84, - "learning_rate": 1.377856529962085e-07, - "loss": 0.2625, + "epoch": 4.957472879950986, + "grad_norm": 0.1977359503507614, + "learning_rate": 1.0272291490581731e-08, + "loss": 0.3723, "step": 137555 }, { - "epoch": 4.84, - "learning_rate": 1.3748714018797792e-07, - "loss": 0.2655, + "epoch": 4.957653079612211, + "grad_norm": 0.26738354563713074, + "learning_rate": 1.0188816877726437e-08, + "loss": 0.3911, "step": 137560 }, { - "epoch": 4.84, - "learning_rate": 1.3718895020309598e-07, - "loss": 0.2619, + "epoch": 4.957833279273435, + "grad_norm": 0.2211945503950119, + "learning_rate": 1.0105682746211487e-08, + "loss": 0.3196, "step": 137565 }, { - "epoch": 4.84, - "learning_rate": 1.3689108304542896e-07, - "loss": 0.2728, + "epoch": 4.958013478934659, + "grad_norm": 0.27656158804893494, + "learning_rate": 1.0022889097169308e-08, + "loss": 0.348, "step": 137570 }, { - "epoch": 4.84, - "learning_rate": 1.3659353871884606e-07, - "loss": 0.2463, + "epoch": 4.958193678595884, + "grad_norm": 0.2519603967666626, + "learning_rate": 9.940435931724001e-09, + "loss": 0.3757, "step": 137575 }, { - "epoch": 4.84, - "learning_rate": 1.3629631722721082e-07, - "loss": 0.2852, + "epoch": 4.958373878257109, + "grad_norm": 0.25226572155952454, + "learning_rate": 9.858323251005219e-09, + "loss": 0.3778, "step": 137580 }, { - "epoch": 4.84, - "learning_rate": 
1.3599941857438125e-07, - "loss": 0.2604, + "epoch": 4.9585540779183335, + "grad_norm": 0.20732538402080536, + "learning_rate": 9.77655105612596e-09, + "loss": 0.3805, "step": 137585 }, { - "epoch": 4.84, - "learning_rate": 1.357028427642154e-07, - "loss": 0.2438, + "epoch": 4.958734277579558, + "grad_norm": 0.2726946771144867, + "learning_rate": 9.695119348204773e-09, + "loss": 0.3624, "step": 137590 }, { - "epoch": 4.84, - "learning_rate": 1.354065898005602e-07, - "loss": 0.2587, + "epoch": 4.958914477240783, + "grad_norm": 0.20067138969898224, + "learning_rate": 9.614028128346331e-09, + "loss": 0.333, "step": 137595 }, { - "epoch": 4.84, - "learning_rate": 1.3511065968726255e-07, - "loss": 0.2569, + "epoch": 4.959094676902008, + "grad_norm": 0.2827952206134796, + "learning_rate": 9.533277397660856e-09, + "loss": 0.3483, "step": 137600 }, { - "epoch": 4.84, - "learning_rate": 1.3481505242816661e-07, - "loss": 0.2462, + "epoch": 4.959274876563232, + "grad_norm": 0.23344914615154266, + "learning_rate": 9.452867157247469e-09, + "loss": 0.3538, "step": 137605 }, { - "epoch": 4.84, - "learning_rate": 1.3451976802711376e-07, - "loss": 0.2453, + "epoch": 4.959455076224456, + "grad_norm": 0.2827022969722748, + "learning_rate": 9.372797408196965e-09, + "loss": 0.3309, "step": 137610 }, { - "epoch": 4.84, - "learning_rate": 1.3422480648793146e-07, - "loss": 0.2474, + "epoch": 4.959635275885681, + "grad_norm": 0.2828150987625122, + "learning_rate": 9.293068151605689e-09, + "loss": 0.3894, "step": 137615 }, { - "epoch": 4.84, - "learning_rate": 1.3393016781445278e-07, - "loss": 0.26, + "epoch": 4.959815475546906, + "grad_norm": 0.26365163922309875, + "learning_rate": 9.213679388558882e-09, + "loss": 0.3679, "step": 137620 }, { - "epoch": 4.84, - "learning_rate": 1.336358520105052e-07, - "loss": 0.2556, + "epoch": 4.9599956752081305, + "grad_norm": 0.291981965303421, + "learning_rate": 9.134631120136238e-09, + "loss": 0.3676, "step": 137625 }, { - "epoch": 4.84, - "learning_rate": 1.3334185907990792e-07, - "loss": 0.2798, + "epoch": 4.960175874869355, + "grad_norm": 0.2089691460132599, + "learning_rate": 9.055923347414675e-09, + "loss": 0.3302, "step": 137630 }, { - "epoch": 4.84, - "learning_rate": 1.330481890264801e-07, - "loss": 0.2664, + "epoch": 4.96035607453058, + "grad_norm": 0.33166277408599854, + "learning_rate": 8.977556071471105e-09, + "loss": 0.352, "step": 137635 }, { - "epoch": 4.84, - "learning_rate": 1.3275484185403254e-07, - "loss": 0.2458, + "epoch": 4.960536274191805, + "grad_norm": 0.2414046823978424, + "learning_rate": 8.899529293365794e-09, + "loss": 0.3702, "step": 137640 }, { - "epoch": 4.84, - "learning_rate": 1.324618175663761e-07, - "loss": 0.2679, + "epoch": 4.960716473853029, + "grad_norm": 0.2852274477481842, + "learning_rate": 8.821843014170106e-09, + "loss": 0.3833, "step": 137645 }, { - "epoch": 4.84, - "learning_rate": 1.321691161673133e-07, - "loss": 0.2491, + "epoch": 4.960896673514254, + "grad_norm": 0.23705297708511353, + "learning_rate": 8.744497234935978e-09, + "loss": 0.3855, "step": 137650 }, { - "epoch": 4.84, - "learning_rate": 1.3187673766064667e-07, - "loss": 0.2237, + "epoch": 4.961076873175479, + "grad_norm": 0.28535374999046326, + "learning_rate": 8.66749195671812e-09, + "loss": 0.3653, "step": 137655 }, { - "epoch": 4.84, - "learning_rate": 1.3158468205017038e-07, - "loss": 0.2304, + "epoch": 4.961257072836703, + "grad_norm": 0.23056595027446747, + "learning_rate": 8.590827180571248e-09, + "loss": 0.3719, "step": 137660 }, { - "epoch": 4.84, - 
"learning_rate": 1.3129294933967862e-07, - "loss": 0.2609, + "epoch": 4.961437272497927, + "grad_norm": 0.24574077129364014, + "learning_rate": 8.514502907533418e-09, + "loss": 0.397, "step": 137665 }, { - "epoch": 4.84, - "learning_rate": 1.3100153953296e-07, - "loss": 0.2479, + "epoch": 4.961617472159152, + "grad_norm": 0.24938498437404633, + "learning_rate": 8.438519138645462e-09, + "loss": 0.3871, "step": 137670 }, { - "epoch": 4.84, - "learning_rate": 1.3071045263379765e-07, - "loss": 0.2493, + "epoch": 4.961797671820377, + "grad_norm": 0.35726431012153625, + "learning_rate": 8.362875874945442e-09, + "loss": 0.3636, "step": 137675 }, { - "epoch": 4.84, - "learning_rate": 1.304196886459691e-07, - "loss": 0.2465, + "epoch": 4.961977871481602, + "grad_norm": 0.2681273818016052, + "learning_rate": 8.287573117465863e-09, + "loss": 0.3763, "step": 137680 }, { - "epoch": 4.84, - "learning_rate": 1.3012924757325184e-07, - "loss": 0.2568, + "epoch": 4.962158071142826, + "grad_norm": 0.25855353474617004, + "learning_rate": 8.212610867225356e-09, + "loss": 0.3967, "step": 137685 }, { - "epoch": 4.84, - "learning_rate": 1.2983912941941789e-07, - "loss": 0.2331, + "epoch": 4.962338270804051, + "grad_norm": 0.22436444461345673, + "learning_rate": 8.137989125250877e-09, + "loss": 0.3584, "step": 137690 }, { - "epoch": 4.84, - "learning_rate": 1.295493341882309e-07, - "loss": 0.2528, + "epoch": 4.962518470465276, + "grad_norm": 0.2069372534751892, + "learning_rate": 8.063707892558281e-09, + "loss": 0.3502, "step": 137695 }, { - "epoch": 4.84, - "learning_rate": 1.292598618834573e-07, - "loss": 0.2509, + "epoch": 4.9626986701265, + "grad_norm": 0.23629046976566315, + "learning_rate": 7.989767170157869e-09, + "loss": 0.3882, "step": 137700 }, { - "epoch": 4.84, - "learning_rate": 1.289707125088524e-07, - "loss": 0.2602, + "epoch": 4.962878869787724, + "grad_norm": 0.24616286158561707, + "learning_rate": 7.916166959059944e-09, + "loss": 0.3603, "step": 137705 }, { - "epoch": 4.85, - "learning_rate": 1.2868188606817156e-07, - "loss": 0.2426, + "epoch": 4.963059069448949, + "grad_norm": 0.22220478951931, + "learning_rate": 7.84290726026371e-09, + "loss": 0.3726, "step": 137710 }, { - "epoch": 4.85, - "learning_rate": 1.2839338256516454e-07, - "loss": 0.2452, + "epoch": 4.963239269110174, + "grad_norm": 0.23332755267620087, + "learning_rate": 7.769988074771139e-09, + "loss": 0.3892, "step": 137715 }, { - "epoch": 4.85, - "learning_rate": 1.281052020035811e-07, - "loss": 0.2606, + "epoch": 4.963419468771399, + "grad_norm": 0.2551371455192566, + "learning_rate": 7.697409403573108e-09, + "loss": 0.3594, "step": 137720 }, { - "epoch": 4.85, - "learning_rate": 1.2781734438715998e-07, - "loss": 0.2324, + "epoch": 4.963599668432623, + "grad_norm": 0.26924464106559753, + "learning_rate": 7.625171247657714e-09, + "loss": 0.3503, "step": 137725 }, { - "epoch": 4.85, - "learning_rate": 1.275298097196398e-07, - "loss": 0.2512, + "epoch": 4.963779868093848, + "grad_norm": 0.25808215141296387, + "learning_rate": 7.553273608013057e-09, + "loss": 0.3743, "step": 137730 }, { - "epoch": 4.85, - "learning_rate": 1.2724259800475092e-07, - "loss": 0.2296, + "epoch": 4.963960067755073, + "grad_norm": 0.30064940452575684, + "learning_rate": 7.481716485616131e-09, + "loss": 0.3813, "step": 137735 }, { - "epoch": 4.85, - "learning_rate": 1.2695570924622924e-07, - "loss": 0.247, + "epoch": 4.9641402674162975, + "grad_norm": 0.23391714692115784, + "learning_rate": 7.410499881438382e-09, + "loss": 0.3778, "step": 137740 }, { - 
"epoch": 4.85, - "learning_rate": 1.2666914344779402e-07, - "loss": 0.2445, + "epoch": 4.964320467077522, + "grad_norm": 0.25494813919067383, + "learning_rate": 7.339623796456807e-09, + "loss": 0.3887, "step": 137745 }, { - "epoch": 4.85, - "learning_rate": 1.2638290061316727e-07, - "loss": 0.2427, + "epoch": 4.964500666738747, + "grad_norm": 0.30371445417404175, + "learning_rate": 7.2690882316345245e-09, + "loss": 0.3799, "step": 137750 }, { - "epoch": 4.85, - "learning_rate": 1.2609698074606546e-07, - "loss": 0.2335, + "epoch": 4.964680866399971, + "grad_norm": 0.3035849630832672, + "learning_rate": 7.198893187931877e-09, + "loss": 0.372, "step": 137755 }, { - "epoch": 4.85, - "learning_rate": 1.258113838502023e-07, - "loss": 0.2391, + "epoch": 4.964861066061196, + "grad_norm": 0.7903338670730591, + "learning_rate": 7.129038666306431e-09, + "loss": 0.3787, "step": 137760 }, { - "epoch": 4.85, - "learning_rate": 1.2552610992928594e-07, - "loss": 0.2573, + "epoch": 4.96504126572242, + "grad_norm": 0.24585622549057007, + "learning_rate": 7.059524667707429e-09, + "loss": 0.3528, "step": 137765 }, { - "epoch": 4.85, - "learning_rate": 1.2524115898701893e-07, - "loss": 0.2355, + "epoch": 4.965221465383645, + "grad_norm": 0.6391294002532959, + "learning_rate": 6.99035119308411e-09, + "loss": 0.3562, "step": 137770 }, { - "epoch": 4.85, - "learning_rate": 1.249565310271039e-07, - "loss": 0.2606, + "epoch": 4.96540166504487, + "grad_norm": 0.2620919942855835, + "learning_rate": 6.9215182433773895e-09, + "loss": 0.349, "step": 137775 }, { - "epoch": 4.85, - "learning_rate": 1.2467222605323514e-07, - "loss": 0.2592, + "epoch": 4.9655818647060945, + "grad_norm": 0.25720539689064026, + "learning_rate": 6.85302581952818e-09, + "loss": 0.334, "step": 137780 }, { - "epoch": 4.85, - "learning_rate": 1.243882440691041e-07, - "loss": 0.2403, + "epoch": 4.965762064367319, + "grad_norm": 0.23249337077140808, + "learning_rate": 6.784873922466295e-09, + "loss": 0.3948, "step": 137785 }, { - "epoch": 4.85, - "learning_rate": 1.2410458507839673e-07, - "loss": 0.2583, + "epoch": 4.965942264028544, + "grad_norm": 0.29254230856895447, + "learning_rate": 6.717062553121545e-09, + "loss": 0.3938, "step": 137790 }, { - "epoch": 4.85, - "learning_rate": 1.238212490847962e-07, - "loss": 0.2581, + "epoch": 4.966122463689768, + "grad_norm": 0.27492207288742065, + "learning_rate": 6.649591712420966e-09, + "loss": 0.4013, "step": 137795 }, { - "epoch": 4.85, - "learning_rate": 1.2353823609198567e-07, - "loss": 0.2451, + "epoch": 4.966302663350993, + "grad_norm": 0.24353964626789093, + "learning_rate": 6.5824614012777174e-09, + "loss": 0.3498, "step": 137800 }, { - "epoch": 4.85, - "learning_rate": 1.232555461036372e-07, - "loss": 0.2673, + "epoch": 4.966482863012217, + "grad_norm": 0.2534390389919281, + "learning_rate": 6.515671620610508e-09, + "loss": 0.3779, "step": 137805 }, { - "epoch": 4.85, - "learning_rate": 1.229731791234201e-07, - "loss": 0.2485, + "epoch": 4.966663062673442, + "grad_norm": 0.28155553340911865, + "learning_rate": 6.449222371326946e-09, + "loss": 0.3622, "step": 137810 }, { - "epoch": 4.85, - "learning_rate": 1.2269113515500085e-07, - "loss": 0.2637, + "epoch": 4.966843262334667, + "grad_norm": 0.28064605593681335, + "learning_rate": 6.383113654334638e-09, + "loss": 0.3944, "step": 137815 }, { - "epoch": 4.85, - "learning_rate": 1.224094142020432e-07, - "loss": 0.2276, + "epoch": 4.9670234619958915, + "grad_norm": 0.32647833228111267, + "learning_rate": 6.31734547053564e-09, + "loss": 0.3521, "step": 
137820 }, { - "epoch": 4.85, - "learning_rate": 1.2212801626820536e-07, - "loss": 0.254, + "epoch": 4.967203661657116, + "grad_norm": 0.2887493073940277, + "learning_rate": 6.2519178208209075e-09, + "loss": 0.3774, "step": 137825 }, { - "epoch": 4.85, - "learning_rate": 1.2184694135713714e-07, - "loss": 0.2214, + "epoch": 4.967383861318341, + "grad_norm": 0.2260408252477646, + "learning_rate": 6.186830706086943e-09, + "loss": 0.4083, "step": 137830 }, { - "epoch": 4.85, - "learning_rate": 1.2156618947249397e-07, - "loss": 0.2268, + "epoch": 4.967564060979566, + "grad_norm": 0.2433057725429535, + "learning_rate": 6.122084127216377e-09, + "loss": 0.4013, "step": 137835 }, { - "epoch": 4.85, - "learning_rate": 1.212857606179174e-07, - "loss": 0.2677, + "epoch": 4.96774426064079, + "grad_norm": 0.2385009527206421, + "learning_rate": 6.057678085094609e-09, + "loss": 0.3676, "step": 137840 }, { - "epoch": 4.85, - "learning_rate": 1.2100565479704896e-07, - "loss": 0.2739, + "epoch": 4.967924460302014, + "grad_norm": 0.20035654306411743, + "learning_rate": 5.993612580598717e-09, + "loss": 0.3494, "step": 137845 }, { - "epoch": 4.85, - "learning_rate": 1.2072587201352736e-07, - "loss": 0.2272, + "epoch": 4.968104659963239, + "grad_norm": 0.28439047932624817, + "learning_rate": 5.929887614600227e-09, + "loss": 0.3352, "step": 137850 }, { - "epoch": 4.85, - "learning_rate": 1.2044641227098308e-07, - "loss": 0.2485, + "epoch": 4.968284859624464, + "grad_norm": 0.2770126163959503, + "learning_rate": 5.866503187967886e-09, + "loss": 0.3925, "step": 137855 }, { - "epoch": 4.85, - "learning_rate": 1.2016727557304652e-07, - "loss": 0.2608, + "epoch": 4.9684650592856885, + "grad_norm": 0.33329346776008606, + "learning_rate": 5.803459301564895e-09, + "loss": 0.3521, "step": 137860 }, { - "epoch": 4.85, - "learning_rate": 1.1988846192334257e-07, - "loss": 0.2513, + "epoch": 4.968645258946913, + "grad_norm": 0.27743661403656006, + "learning_rate": 5.7407559562516754e-09, + "loss": 0.3377, "step": 137865 }, { - "epoch": 4.85, - "learning_rate": 1.1960997132548779e-07, - "loss": 0.2354, + "epoch": 4.968825458608138, + "grad_norm": 0.2770087718963623, + "learning_rate": 5.678393152880323e-09, + "loss": 0.3618, "step": 137870 }, { - "epoch": 4.85, - "learning_rate": 1.1933180378310427e-07, - "loss": 0.2553, + "epoch": 4.969005658269363, + "grad_norm": 0.26707491278648376, + "learning_rate": 5.616370892302936e-09, + "loss": 0.3735, "step": 137875 }, { - "epoch": 4.85, - "learning_rate": 1.1905395929979746e-07, - "loss": 0.2457, + "epoch": 4.969185857930587, + "grad_norm": 0.24807052314281464, + "learning_rate": 5.554689175363281e-09, + "loss": 0.3664, "step": 137880 }, { - "epoch": 4.85, - "learning_rate": 1.1877643787917837e-07, - "loss": 0.2558, + "epoch": 4.969366057591811, + "grad_norm": 0.2263546884059906, + "learning_rate": 5.493348002902354e-09, + "loss": 0.3802, "step": 137885 }, { - "epoch": 4.85, - "learning_rate": 1.1849923952484965e-07, - "loss": 0.2364, + "epoch": 4.969546257253036, + "grad_norm": 0.2367497682571411, + "learning_rate": 5.432347375755598e-09, + "loss": 0.3439, "step": 137890 }, { - "epoch": 4.85, - "learning_rate": 1.1822236424041123e-07, - "loss": 0.2577, + "epoch": 4.969726456914261, + "grad_norm": 0.25867414474487305, + "learning_rate": 5.371687294752903e-09, + "loss": 0.3795, "step": 137895 }, { - "epoch": 4.85, - "learning_rate": 1.1794581202945742e-07, - "loss": 0.2297, + "epoch": 4.9699066565754855, + "grad_norm": 0.2083641141653061, + "learning_rate": 5.311367760721387e-09, + 
"loss": 0.3653, "step": 137900 }, { - "epoch": 4.85, - "learning_rate": 1.1766958289557983e-07, - "loss": 0.2403, + "epoch": 4.97008685623671, + "grad_norm": 0.24108898639678955, + "learning_rate": 5.251388774485388e-09, + "loss": 0.3576, "step": 137905 }, { - "epoch": 4.85, - "learning_rate": 1.1739367684236446e-07, - "loss": 0.2338, + "epoch": 4.970267055897935, + "grad_norm": 0.2666069567203522, + "learning_rate": 5.1917503368581474e-09, + "loss": 0.3818, "step": 137910 }, { - "epoch": 4.85, - "learning_rate": 1.1711809387339457e-07, - "loss": 0.2553, + "epoch": 4.97044725555916, + "grad_norm": 0.24382546544075012, + "learning_rate": 5.132452448655678e-09, + "loss": 0.3721, "step": 137915 }, { - "epoch": 4.85, - "learning_rate": 1.1684283399224505e-07, - "loss": 0.2175, + "epoch": 4.970627455220384, + "grad_norm": 0.307109534740448, + "learning_rate": 5.073495110682891e-09, + "loss": 0.4167, "step": 137920 }, { - "epoch": 4.85, - "learning_rate": 1.165678972024964e-07, - "loss": 0.2418, + "epoch": 4.970807654881609, + "grad_norm": 0.2820093631744385, + "learning_rate": 5.014878323744698e-09, + "loss": 0.3609, "step": 137925 }, { - "epoch": 4.85, - "learning_rate": 1.162932835077124e-07, - "loss": 0.2471, + "epoch": 4.970987854542834, + "grad_norm": 0.2773934304714203, + "learning_rate": 4.956602088640461e-09, + "loss": 0.3742, "step": 137930 }, { - "epoch": 4.85, - "learning_rate": 1.1601899291146245e-07, - "loss": 0.2449, + "epoch": 4.971168054204058, + "grad_norm": 0.23025000095367432, + "learning_rate": 4.898666406163988e-09, + "loss": 0.3682, "step": 137935 }, { - "epoch": 4.85, - "learning_rate": 1.157450254173048e-07, - "loss": 0.2599, + "epoch": 4.9713482538652825, + "grad_norm": 0.24413655698299408, + "learning_rate": 4.841071277103537e-09, + "loss": 0.4005, "step": 137940 }, { - "epoch": 4.85, - "learning_rate": 1.154713810288005e-07, - "loss": 0.2583, + "epoch": 4.971528453526507, + "grad_norm": 0.23541654646396637, + "learning_rate": 4.783816702241817e-09, + "loss": 0.3692, "step": 137945 }, { - "epoch": 4.85, - "learning_rate": 1.1519805974949948e-07, - "loss": 0.2593, + "epoch": 4.971708653187732, + "grad_norm": 0.2659054398536682, + "learning_rate": 4.7269026823643095e-09, + "loss": 0.4064, "step": 137950 }, { - "epoch": 4.85, - "learning_rate": 1.1492506158295447e-07, - "loss": 0.2464, + "epoch": 4.971888852848957, + "grad_norm": 0.23656433820724487, + "learning_rate": 4.670329218242619e-09, + "loss": 0.3935, "step": 137955 }, { - "epoch": 4.85, - "learning_rate": 1.1465238653270427e-07, - "loss": 0.2589, + "epoch": 4.972069052510181, + "grad_norm": 0.27703598141670227, + "learning_rate": 4.614096310648352e-09, + "loss": 0.4068, "step": 137960 }, { - "epoch": 4.85, - "learning_rate": 1.1438003460229607e-07, - "loss": 0.2413, + "epoch": 4.972249252171406, + "grad_norm": 0.20020613074302673, + "learning_rate": 4.558203960347562e-09, + "loss": 0.3666, "step": 137965 }, { - "epoch": 4.85, - "learning_rate": 1.1410800579526037e-07, - "loss": 0.2471, + "epoch": 4.972429451832631, + "grad_norm": 0.27739599347114563, + "learning_rate": 4.5026521681035275e-09, + "loss": 0.3525, "step": 137970 }, { - "epoch": 4.85, - "learning_rate": 1.1383630011513325e-07, - "loss": 0.2523, + "epoch": 4.972609651493855, + "grad_norm": 0.31181418895721436, + "learning_rate": 4.4474409346711984e-09, + "loss": 0.412, "step": 137975 }, { - "epoch": 4.85, - "learning_rate": 1.1356491756543964e-07, - "loss": 0.2421, + "epoch": 4.972789851155079, + "grad_norm": 0.2787044048309326, + "learning_rate": 
4.392570260799977e-09, + "loss": 0.3389, "step": 137980 }, { - "epoch": 4.85, - "learning_rate": 1.1329385814970727e-07, - "loss": 0.2348, + "epoch": 4.972970050816304, + "grad_norm": 0.25257426500320435, + "learning_rate": 4.338040147244815e-09, + "loss": 0.3701, "step": 137985 }, { - "epoch": 4.85, - "learning_rate": 1.1302312187145003e-07, - "loss": 0.2566, + "epoch": 4.973150250477529, + "grad_norm": 0.34394869208335876, + "learning_rate": 4.283850594741234e-09, + "loss": 0.3779, "step": 137990 }, { - "epoch": 4.86, - "learning_rate": 1.1275270873419008e-07, - "loss": 0.2439, + "epoch": 4.973330450138754, + "grad_norm": 0.2812308073043823, + "learning_rate": 4.230001604033085e-09, + "loss": 0.3751, "step": 137995 }, { - "epoch": 4.86, - "learning_rate": 1.1248261874143296e-07, - "loss": 0.2405, + "epoch": 4.973510649799978, + "grad_norm": 0.30743271112442017, + "learning_rate": 4.176493175850338e-09, + "loss": 0.392, "step": 138000 }, { - "epoch": 4.86, - "eval_loss": 0.24844183027744293, - "eval_runtime": 10.5317, - "eval_samples_per_second": 9.495, - "eval_steps_per_second": 9.495, + "epoch": 4.973510649799978, + "eval_loss": 0.4287694990634918, + "eval_runtime": 3.5429, + "eval_samples_per_second": 28.226, + "eval_steps_per_second": 7.056, "step": 138000 }, { - "epoch": 4.86, - "learning_rate": 1.1221285189668696e-07, - "loss": 0.2696, + "epoch": 4.973690849461203, + "grad_norm": 0.20221595466136932, + "learning_rate": 4.123325310925741e-09, + "loss": 0.374, "step": 138005 }, { - "epoch": 4.86, - "learning_rate": 1.1194340820345484e-07, - "loss": 0.2456, + "epoch": 4.973871049122428, + "grad_norm": 0.19391195476055145, + "learning_rate": 4.070498009978163e-09, + "loss": 0.3898, "step": 138010 }, { - "epoch": 4.86, - "learning_rate": 1.1167428766523658e-07, - "loss": 0.2656, + "epoch": 4.974051248783653, + "grad_norm": 0.2685374617576599, + "learning_rate": 4.018011273734801e-09, + "loss": 0.3656, "step": 138015 }, { - "epoch": 4.86, - "learning_rate": 1.114054902855266e-07, - "loss": 0.216, + "epoch": 4.974231448444877, + "grad_norm": 0.2981413006782532, + "learning_rate": 3.965865102903421e-09, + "loss": 0.339, "step": 138020 }, { - "epoch": 4.86, - "learning_rate": 1.1113701606781379e-07, - "loss": 0.2607, + "epoch": 4.974411648106102, + "grad_norm": 0.24895471334457397, + "learning_rate": 3.914059498197342e-09, + "loss": 0.3855, "step": 138025 }, { - "epoch": 4.86, - "learning_rate": 1.1086886501558424e-07, - "loss": 0.2399, + "epoch": 4.974591847767326, + "grad_norm": 0.289763867855072, + "learning_rate": 3.862594460324332e-09, + "loss": 0.3667, "step": 138030 }, { - "epoch": 4.86, - "learning_rate": 1.1060103713232128e-07, - "loss": 0.252, + "epoch": 4.974772047428551, + "grad_norm": 0.2891743779182434, + "learning_rate": 3.811469989983829e-09, + "loss": 0.3728, "step": 138035 }, { - "epoch": 4.86, - "learning_rate": 1.1033353242149713e-07, - "loss": 0.2688, + "epoch": 4.974952247089775, + "grad_norm": 0.30129414796829224, + "learning_rate": 3.7606860878725005e-09, + "loss": 0.3678, "step": 138040 }, { - "epoch": 4.86, - "learning_rate": 1.1006635088659234e-07, - "loss": 0.2486, + "epoch": 4.975132446751, + "grad_norm": 0.3116011619567871, + "learning_rate": 3.7102427546842343e-09, + "loss": 0.3739, "step": 138045 }, { - "epoch": 4.86, - "learning_rate": 1.097994925310708e-07, - "loss": 0.2501, + "epoch": 4.975312646412225, + "grad_norm": 0.24255453050136566, + "learning_rate": 3.6601399911018185e-09, + "loss": 0.3675, "step": 138050 }, { - "epoch": 4.86, - "learning_rate": 
1.0953295735840196e-07, - "loss": 0.2465, + "epoch": 4.9754928460734495, + "grad_norm": 0.33993199467658997, + "learning_rate": 3.610377797813591e-09, + "loss": 0.3956, "step": 138055 }, { - "epoch": 4.86, - "learning_rate": 1.0926674537204417e-07, - "loss": 0.2444, + "epoch": 4.975673045734674, + "grad_norm": 0.23433293402194977, + "learning_rate": 3.560956175491237e-09, + "loss": 0.3584, "step": 138060 }, { - "epoch": 4.86, - "learning_rate": 1.09000856575453e-07, - "loss": 0.2591, + "epoch": 4.975853245395899, + "grad_norm": 0.3143203556537628, + "learning_rate": 3.511875124811992e-09, + "loss": 0.3765, "step": 138065 }, { - "epoch": 4.86, - "learning_rate": 1.0873529097208124e-07, - "loss": 0.2573, + "epoch": 4.976033445057123, + "grad_norm": 0.25443294644355774, + "learning_rate": 3.4631346464447658e-09, + "loss": 0.3904, "step": 138070 }, { - "epoch": 4.86, - "learning_rate": 1.084700485653789e-07, - "loss": 0.2582, + "epoch": 4.976213644718348, + "grad_norm": 0.22525587677955627, + "learning_rate": 3.4147347410529164e-09, + "loss": 0.3743, "step": 138075 }, { - "epoch": 4.86, - "learning_rate": 1.0820512935879046e-07, - "loss": 0.2638, + "epoch": 4.976393844379572, + "grad_norm": 0.27883243560791016, + "learning_rate": 3.3666754092970266e-09, + "loss": 0.3883, "step": 138080 }, { - "epoch": 4.86, - "learning_rate": 1.0794053335575205e-07, - "loss": 0.2711, + "epoch": 4.976574044040797, + "grad_norm": 0.2141328603029251, + "learning_rate": 3.3189566518293526e-09, + "loss": 0.3603, "step": 138085 }, { - "epoch": 4.86, - "learning_rate": 1.0767626055969982e-07, - "loss": 0.2571, + "epoch": 4.976754243702022, + "grad_norm": 0.24589018523693085, + "learning_rate": 3.2715784692993747e-09, + "loss": 0.3851, "step": 138090 }, { - "epoch": 4.86, - "learning_rate": 1.074123109740699e-07, - "loss": 0.2652, + "epoch": 4.9769344433632465, + "grad_norm": 0.27227839827537537, + "learning_rate": 3.2245408623565733e-09, + "loss": 0.3899, "step": 138095 }, { - "epoch": 4.86, - "learning_rate": 1.0714868460228456e-07, - "loss": 0.2603, + "epoch": 4.977114643024471, + "grad_norm": 0.2859554588794708, + "learning_rate": 3.1778438316393266e-09, + "loss": 0.3719, "step": 138100 }, { - "epoch": 4.86, - "learning_rate": 1.0688538144776883e-07, - "loss": 0.2508, + "epoch": 4.977294842685696, + "grad_norm": 0.2409016191959381, + "learning_rate": 3.1314873777860133e-09, + "loss": 0.3319, "step": 138105 }, { - "epoch": 4.86, - "learning_rate": 1.0662240151393942e-07, - "loss": 0.2563, + "epoch": 4.977475042346921, + "grad_norm": 0.27938273549079895, + "learning_rate": 3.0854715014266845e-09, + "loss": 0.3947, "step": 138110 }, { - "epoch": 4.86, - "learning_rate": 1.0635974480421585e-07, - "loss": 0.2324, + "epoch": 4.9776552420081455, + "grad_norm": 0.26294222474098206, + "learning_rate": 3.039796203185841e-09, + "loss": 0.3686, "step": 138115 }, { - "epoch": 4.86, - "learning_rate": 1.0609741132200369e-07, - "loss": 0.2727, + "epoch": 4.977835441669369, + "grad_norm": 0.267437607049942, + "learning_rate": 2.994461483690758e-09, + "loss": 0.367, "step": 138120 }, { - "epoch": 4.86, - "learning_rate": 1.0583540107071132e-07, - "loss": 0.2503, + "epoch": 4.978015641330594, + "grad_norm": 0.23144756257534027, + "learning_rate": 2.9494673435548346e-09, + "loss": 0.3605, "step": 138125 }, { - "epoch": 4.86, - "learning_rate": 1.0557371405374162e-07, - "loss": 0.2551, + "epoch": 4.978195840991819, + "grad_norm": 0.24928845465183258, + "learning_rate": 2.9048137833942446e-09, + "loss": 0.3923, "step": 138130 }, { - 
"epoch": 4.86, - "learning_rate": 1.0531235027448904e-07, - "loss": 0.2414, + "epoch": 4.9783760406530435, + "grad_norm": 0.2760869860649109, + "learning_rate": 2.860500803814059e-09, + "loss": 0.3978, "step": 138135 }, { - "epoch": 4.86, - "learning_rate": 1.0505130973635092e-07, - "loss": 0.2511, + "epoch": 4.978556240314268, + "grad_norm": 0.25941580533981323, + "learning_rate": 2.8165284054193498e-09, + "loss": 0.3736, "step": 138140 }, { - "epoch": 4.86, - "learning_rate": 1.0479059244271616e-07, - "loss": 0.2478, + "epoch": 4.978736439975493, + "grad_norm": 0.26913321018218994, + "learning_rate": 2.7728965888124124e-09, + "loss": 0.3906, "step": 138145 }, { - "epoch": 4.86, - "learning_rate": 1.0453019839696543e-07, - "loss": 0.24, + "epoch": 4.978916639636718, + "grad_norm": 0.24921806156635284, + "learning_rate": 2.729605354584441e-09, + "loss": 0.3461, "step": 138150 }, { - "epoch": 4.86, - "learning_rate": 1.0427012760248489e-07, - "loss": 0.2648, + "epoch": 4.9790968392979424, + "grad_norm": 0.2868900001049042, + "learning_rate": 2.68665470332663e-09, + "loss": 0.3779, "step": 138155 }, { - "epoch": 4.86, - "learning_rate": 1.040103800626524e-07, - "loss": 0.2579, + "epoch": 4.979277038959166, + "grad_norm": 0.2325631082057953, + "learning_rate": 2.644044635621845e-09, + "loss": 0.384, "step": 138160 }, { - "epoch": 4.86, - "learning_rate": 1.0375095578083471e-07, - "loss": 0.242, + "epoch": 4.979457238620391, + "grad_norm": 0.24822711944580078, + "learning_rate": 2.601775152052954e-09, + "loss": 0.3653, "step": 138165 }, { - "epoch": 4.86, - "learning_rate": 1.034918547604069e-07, - "loss": 0.2422, + "epoch": 4.979637438281616, + "grad_norm": 0.2686733603477478, + "learning_rate": 2.559846253194498e-09, + "loss": 0.3809, "step": 138170 }, { - "epoch": 4.86, - "learning_rate": 1.032330770047274e-07, - "loss": 0.2489, + "epoch": 4.9798176379428405, + "grad_norm": 0.27422237396240234, + "learning_rate": 2.518257939621016e-09, + "loss": 0.3853, "step": 138175 }, { - "epoch": 4.86, - "learning_rate": 1.029746225171574e-07, - "loss": 0.242, + "epoch": 4.979997837604065, + "grad_norm": 0.24748918414115906, + "learning_rate": 2.4770102118931714e-09, + "loss": 0.4023, "step": 138180 }, { - "epoch": 4.86, - "learning_rate": 1.0271649130105809e-07, - "loss": 0.2356, + "epoch": 4.98017803726529, + "grad_norm": 0.2908730208873749, + "learning_rate": 2.4361030705771782e-09, + "loss": 0.3696, "step": 138185 }, { - "epoch": 4.86, - "learning_rate": 1.0245868335977405e-07, - "loss": 0.2374, + "epoch": 4.980358236926515, + "grad_norm": 0.31870460510253906, + "learning_rate": 2.395536516230923e-09, + "loss": 0.3693, "step": 138190 }, { - "epoch": 4.86, - "learning_rate": 1.0220119869665812e-07, - "loss": 0.2428, + "epoch": 4.980538436587739, + "grad_norm": 0.25807613134384155, + "learning_rate": 2.355310549406742e-09, + "loss": 0.4022, "step": 138195 }, { - "epoch": 4.86, - "learning_rate": 1.0194403731504932e-07, - "loss": 0.2554, + "epoch": 4.980718636248964, + "grad_norm": 0.269734263420105, + "learning_rate": 2.315425170648644e-09, + "loss": 0.3582, "step": 138200 }, { - "epoch": 4.86, - "learning_rate": 1.016871992182894e-07, - "loss": 0.2524, + "epoch": 4.980898835910189, + "grad_norm": 0.22261251509189606, + "learning_rate": 2.27588038050619e-09, + "loss": 0.4311, "step": 138205 }, { - "epoch": 4.86, - "learning_rate": 1.014306844097146e-07, - "loss": 0.2522, + "epoch": 4.981079035571413, + "grad_norm": 0.25113579630851746, + "learning_rate": 2.2366761795150626e-09, + "loss": 0.3689, 
"step": 138210 }, { - "epoch": 4.86, - "learning_rate": 1.0117449289265001e-07, - "loss": 0.2532, + "epoch": 4.9812592352326375, + "grad_norm": 0.21979762613773346, + "learning_rate": 2.1978125682081683e-09, + "loss": 0.3975, "step": 138215 }, { - "epoch": 4.86, - "learning_rate": 1.0091862467042912e-07, - "loss": 0.2641, + "epoch": 4.981439434893862, + "grad_norm": 0.2908676564693451, + "learning_rate": 2.159289547115639e-09, + "loss": 0.3526, "step": 138220 }, { - "epoch": 4.86, - "learning_rate": 1.0066307974636868e-07, - "loss": 0.2501, + "epoch": 4.981619634555087, + "grad_norm": 0.2306397557258606, + "learning_rate": 2.121107116764831e-09, + "loss": 0.3521, "step": 138225 }, { - "epoch": 4.86, - "learning_rate": 1.0040785812379106e-07, - "loss": 0.2482, + "epoch": 4.981799834216312, + "grad_norm": 0.2513945698738098, + "learning_rate": 2.0832652776719974e-09, + "loss": 0.3938, "step": 138230 }, { - "epoch": 4.86, - "learning_rate": 1.001529598060047e-07, - "loss": 0.2367, + "epoch": 4.981980033877536, + "grad_norm": 0.2760458290576935, + "learning_rate": 2.0457640303561676e-09, + "loss": 0.3951, "step": 138235 }, { - "epoch": 4.86, - "learning_rate": 9.989838479632641e-08, - "loss": 0.2477, + "epoch": 4.982160233538761, + "grad_norm": 0.2553420066833496, + "learning_rate": 2.0086033753280442e-09, + "loss": 0.3646, "step": 138240 }, { - "epoch": 4.86, - "learning_rate": 9.964413309805353e-08, - "loss": 0.2497, + "epoch": 4.982340433199986, + "grad_norm": 0.25386497378349304, + "learning_rate": 1.971783313090003e-09, + "loss": 0.3633, "step": 138245 }, { - "epoch": 4.86, - "learning_rate": 9.939020471449456e-08, - "loss": 0.2469, + "epoch": 4.982520632861211, + "grad_norm": 0.20062953233718872, + "learning_rate": 1.9353038441499716e-09, + "loss": 0.3194, "step": 138250 }, { - "epoch": 4.86, - "learning_rate": 9.913659964894406e-08, - "loss": 0.2403, + "epoch": 4.9827008325224345, + "grad_norm": 0.26775026321411133, + "learning_rate": 1.899164968999223e-09, + "loss": 0.4075, "step": 138255 }, { - "epoch": 4.86, - "learning_rate": 9.888331790469107e-08, - "loss": 0.2633, + "epoch": 4.982881032183659, + "grad_norm": 0.273628830909729, + "learning_rate": 1.8633666881345823e-09, + "loss": 0.3616, "step": 138260 }, { - "epoch": 4.86, - "learning_rate": 9.863035948503019e-08, - "loss": 0.255, + "epoch": 4.983061231844884, + "grad_norm": 0.2722027003765106, + "learning_rate": 1.8279090020417721e-09, + "loss": 0.3612, "step": 138265 }, { - "epoch": 4.86, - "learning_rate": 9.837772439324488e-08, - "loss": 0.2596, + "epoch": 4.983241431506109, + "grad_norm": 0.2538944184780121, + "learning_rate": 1.7927919112009639e-09, + "loss": 0.3556, "step": 138270 }, { - "epoch": 4.86, - "learning_rate": 9.812541263261033e-08, - "loss": 0.2755, + "epoch": 4.983421631167333, + "grad_norm": 0.264728844165802, + "learning_rate": 1.7580154160951047e-09, + "loss": 0.386, "step": 138275 }, { - "epoch": 4.87, - "learning_rate": 9.787342420640999e-08, - "loss": 0.2565, + "epoch": 4.983601830828558, + "grad_norm": 0.2643708288669586, + "learning_rate": 1.7235795171988146e-09, + "loss": 0.3676, "step": 138280 }, { - "epoch": 4.87, - "learning_rate": 9.762175911791072e-08, - "loss": 0.2332, + "epoch": 4.983782030489783, + "grad_norm": 0.2401455044746399, + "learning_rate": 1.689484214975612e-09, + "loss": 0.3727, "step": 138285 }, { - "epoch": 4.87, - "learning_rate": 9.73704173703821e-08, - "loss": 0.2405, + "epoch": 4.983962230151008, + "grad_norm": 0.2023637741804123, + "learning_rate": 1.6557295098945658e-09, + 
"loss": 0.4015, "step": 138290 }, { - "epoch": 4.87, - "learning_rate": 9.711939896708544e-08, - "loss": 0.2411, + "epoch": 4.984142429812232, + "grad_norm": 0.26757675409317017, + "learning_rate": 1.622315402416419e-09, + "loss": 0.3644, "step": 138295 }, { - "epoch": 4.87, - "learning_rate": 9.686870391128478e-08, - "loss": 0.233, + "epoch": 4.984322629473457, + "grad_norm": 0.22083128988742828, + "learning_rate": 1.5892418929908114e-09, + "loss": 0.3287, "step": 138300 }, { - "epoch": 4.87, - "learning_rate": 9.661833220623029e-08, - "loss": 0.2342, + "epoch": 4.984502829134681, + "grad_norm": 0.3092709183692932, + "learning_rate": 1.5565089820729351e-09, + "loss": 0.3711, "step": 138305 }, { - "epoch": 4.87, - "learning_rate": 9.636828385517493e-08, - "loss": 0.2569, + "epoch": 4.984683028795906, + "grad_norm": 0.22019417583942413, + "learning_rate": 1.5241166701096543e-09, + "loss": 0.3608, "step": 138310 }, { - "epoch": 4.87, - "learning_rate": 9.611855886136334e-08, - "loss": 0.2464, + "epoch": 4.98486322845713, + "grad_norm": 0.24744448065757751, + "learning_rate": 1.4920649575395073e-09, + "loss": 0.3614, "step": 138315 }, { - "epoch": 4.87, - "learning_rate": 9.58691572280429e-08, - "loss": 0.2324, + "epoch": 4.985043428118355, + "grad_norm": 0.17398010194301605, + "learning_rate": 1.4603538448010323e-09, + "loss": 0.3591, "step": 138320 }, { - "epoch": 4.87, - "learning_rate": 9.562007895844716e-08, - "loss": 0.261, + "epoch": 4.98522362777958, + "grad_norm": 0.22615504264831543, + "learning_rate": 1.428983332321665e-09, + "loss": 0.363, "step": 138325 }, { - "epoch": 4.87, - "learning_rate": 9.537132405581239e-08, - "loss": 0.243, + "epoch": 4.985403827440805, + "grad_norm": 0.25961410999298096, + "learning_rate": 1.397953420537168e-09, + "loss": 0.3894, "step": 138330 }, { - "epoch": 4.87, - "learning_rate": 9.512289252336937e-08, - "loss": 0.2475, + "epoch": 4.985584027102029, + "grad_norm": 0.27555739879608154, + "learning_rate": 1.367264109863875e-09, + "loss": 0.3741, "step": 138335 }, { - "epoch": 4.87, - "learning_rate": 9.487478436434327e-08, - "loss": 0.2221, + "epoch": 4.985764226763254, + "grad_norm": 0.3149866759777069, + "learning_rate": 1.336915400720895e-09, + "loss": 0.4, "step": 138340 }, { - "epoch": 4.87, - "learning_rate": 9.462699958195376e-08, - "loss": 0.2452, + "epoch": 4.985944426424478, + "grad_norm": 0.2356063425540924, + "learning_rate": 1.3069072935217863e-09, + "loss": 0.3838, "step": 138345 }, { - "epoch": 4.87, - "learning_rate": 9.437953817941769e-08, - "loss": 0.2392, + "epoch": 4.986124626085703, + "grad_norm": 0.28220483660697937, + "learning_rate": 1.2772397886773313e-09, + "loss": 0.3788, "step": 138350 }, { - "epoch": 4.87, - "learning_rate": 9.413240015995195e-08, - "loss": 0.2473, + "epoch": 4.986304825746927, + "grad_norm": 0.21516813337802887, + "learning_rate": 1.247912886589986e-09, + "loss": 0.3715, "step": 138355 }, { - "epoch": 4.87, - "learning_rate": 9.388558552676508e-08, - "loss": 0.2585, + "epoch": 4.986485025408152, + "grad_norm": 0.2194688469171524, + "learning_rate": 1.2189265876622058e-09, + "loss": 0.3802, "step": 138360 }, { - "epoch": 4.87, - "learning_rate": 9.363909428305728e-08, - "loss": 0.2457, + "epoch": 4.986665225069377, + "grad_norm": 0.26937180757522583, + "learning_rate": 1.1902808922853447e-09, + "loss": 0.3905, "step": 138365 }, { - "epoch": 4.87, - "learning_rate": 9.339292643203157e-08, - "loss": 0.2499, + "epoch": 4.9868454247306016, + "grad_norm": 0.2718323767185211, + "learning_rate": 
1.1619758008507564e-09, + "loss": 0.3827, "step": 138370 }, { - "epoch": 4.87, - "learning_rate": 9.314708197688537e-08, - "loss": 0.2475, + "epoch": 4.987025624391826, + "grad_norm": 0.31076350808143616, + "learning_rate": 1.1340113137442431e-09, + "loss": 0.3789, "step": 138375 }, { - "epoch": 4.87, - "learning_rate": 9.290156092081059e-08, - "loss": 0.2387, + "epoch": 4.987205824053051, + "grad_norm": 0.26941627264022827, + "learning_rate": 1.1063874313460564e-09, + "loss": 0.3854, "step": 138380 }, { - "epoch": 4.87, - "learning_rate": 9.265636326699634e-08, - "loss": 0.2408, + "epoch": 4.987386023714276, + "grad_norm": 0.2722313702106476, + "learning_rate": 1.0791041540336722e-09, + "loss": 0.3723, "step": 138385 }, { - "epoch": 4.87, - "learning_rate": 9.24114890186234e-08, - "loss": 0.2603, + "epoch": 4.9875662233755005, + "grad_norm": 0.2729116976261139, + "learning_rate": 1.0521614821790148e-09, + "loss": 0.3595, "step": 138390 }, { - "epoch": 4.87, - "learning_rate": 9.21669381788698e-08, - "loss": 0.2541, + "epoch": 4.987746423036724, + "grad_norm": 0.25742781162261963, + "learning_rate": 1.0255594161484583e-09, + "loss": 0.3608, "step": 138395 }, { - "epoch": 4.87, - "learning_rate": 9.19227107509163e-08, - "loss": 0.2484, + "epoch": 4.987926622697949, + "grad_norm": 0.27901482582092285, + "learning_rate": 9.992979563056003e-10, + "loss": 0.3677, "step": 138400 }, { - "epoch": 4.87, - "learning_rate": 9.167880673793261e-08, - "loss": 0.2568, + "epoch": 4.988106822359174, + "grad_norm": 0.20445889234542847, + "learning_rate": 9.785340251255993e-10, + "loss": 0.3751, "step": 138405 }, { - "epoch": 4.87, - "learning_rate": 9.143522614308287e-08, - "loss": 0.251, + "epoch": 4.9882870220203985, + "grad_norm": 0.2854103446006775, + "learning_rate": 9.528856573182144e-10, + "loss": 0.3522, "step": 138410 }, { - "epoch": 4.87, - "learning_rate": 9.11919689695312e-08, - "loss": 0.2694, + "epoch": 4.988467221681623, + "grad_norm": 0.25394073128700256, + "learning_rate": 9.275778966866267e-10, + "loss": 0.3537, "step": 138415 }, { - "epoch": 4.87, - "learning_rate": 9.094903522043618e-08, - "loss": 0.234, + "epoch": 4.988647421342848, + "grad_norm": 0.24267399311065674, + "learning_rate": 9.026107435750053e-10, + "loss": 0.3617, "step": 138420 }, { - "epoch": 4.87, - "learning_rate": 9.070642489895087e-08, - "loss": 0.2431, + "epoch": 4.988827621004073, + "grad_norm": 0.21222999691963196, + "learning_rate": 8.779841983275195e-10, + "loss": 0.3673, "step": 138425 }, { - "epoch": 4.87, - "learning_rate": 9.046413800822828e-08, - "loss": 0.257, + "epoch": 4.9890078206652975, + "grad_norm": 0.24381837248802185, + "learning_rate": 8.53698261277236e-10, + "loss": 0.376, "step": 138430 }, { - "epoch": 4.87, - "learning_rate": 9.022217455141314e-08, - "loss": 0.2599, + "epoch": 4.989188020326521, + "grad_norm": 0.3303523659706116, + "learning_rate": 8.297529327544462e-10, + "loss": 0.3671, "step": 138435 }, { - "epoch": 4.87, - "learning_rate": 8.998053453164734e-08, - "loss": 0.2709, + "epoch": 4.989368219987746, + "grad_norm": 0.21352322399616241, + "learning_rate": 8.061482130894416e-10, + "loss": 0.3689, "step": 138440 }, { - "epoch": 4.87, - "learning_rate": 8.97392179520673e-08, - "loss": 0.2447, + "epoch": 4.989548419648971, + "grad_norm": 0.21397538483142853, + "learning_rate": 7.8288410259586e-10, + "loss": 0.3615, "step": 138445 }, { - "epoch": 4.87, - "learning_rate": 8.94982248158066e-08, - "loss": 0.2296, + "epoch": 4.9897286193101955, + "grad_norm": 0.25124630331993103, + 
"learning_rate": 7.599606015984418e-10, + "loss": 0.3633, "step": 138450 }, { - "epoch": 4.87, - "learning_rate": 8.925755512599609e-08, - "loss": 0.2401, + "epoch": 4.98990881897142, + "grad_norm": 0.31034645438194275, + "learning_rate": 7.373777104052737e-10, + "loss": 0.3662, "step": 138455 }, { - "epoch": 4.87, - "learning_rate": 8.901720888575826e-08, - "loss": 0.2526, + "epoch": 4.990089018632645, + "grad_norm": 0.23552536964416504, + "learning_rate": 7.151354293272183e-10, + "loss": 0.3589, "step": 138460 }, { - "epoch": 4.87, - "learning_rate": 8.877718609821562e-08, - "loss": 0.2359, + "epoch": 4.99026921829387, + "grad_norm": 0.23122437298297882, + "learning_rate": 6.932337586640359e-10, + "loss": 0.3852, "step": 138465 }, { - "epoch": 4.87, - "learning_rate": 8.85374867664851e-08, - "loss": 0.2719, + "epoch": 4.9904494179550944, + "grad_norm": 0.25336378812789917, + "learning_rate": 6.716726987154865e-10, + "loss": 0.3705, "step": 138470 }, { - "epoch": 4.87, - "learning_rate": 8.829811089367811e-08, - "loss": 0.2384, + "epoch": 4.990629617616319, + "grad_norm": 0.2418462485074997, + "learning_rate": 6.504522497757792e-10, + "loss": 0.395, "step": 138475 }, { - "epoch": 4.87, - "learning_rate": 8.805905848290053e-08, - "loss": 0.2477, + "epoch": 4.990809817277544, + "grad_norm": 0.23543506860733032, + "learning_rate": 6.295724121335722e-10, + "loss": 0.352, "step": 138480 }, { - "epoch": 4.87, - "learning_rate": 8.782032953725817e-08, - "loss": 0.2318, + "epoch": 4.990990016938769, + "grad_norm": 0.28832608461380005, + "learning_rate": 6.090331860719722e-10, + "loss": 0.3692, "step": 138485 }, { - "epoch": 4.87, - "learning_rate": 8.758192405985133e-08, - "loss": 0.2362, + "epoch": 4.9911702165999925, + "grad_norm": 0.21373923122882843, + "learning_rate": 5.888345718740862e-10, + "loss": 0.373, "step": 138490 }, { - "epoch": 4.87, - "learning_rate": 8.734384205377755e-08, - "loss": 0.2613, + "epoch": 4.991350416261217, + "grad_norm": 0.28929153084754944, + "learning_rate": 5.689765698119187e-10, + "loss": 0.4071, "step": 138495 }, { - "epoch": 4.87, - "learning_rate": 8.710608352212324e-08, - "loss": 0.2318, + "epoch": 4.991530615922442, + "grad_norm": 0.226441890001297, + "learning_rate": 5.4945918016025e-10, + "loss": 0.398, "step": 138500 }, { - "epoch": 4.87, - "eval_loss": 0.24843639135360718, - "eval_runtime": 10.5531, - "eval_samples_per_second": 9.476, - "eval_steps_per_second": 9.476, + "epoch": 4.991530615922442, + "eval_loss": 0.428780198097229, + "eval_runtime": 3.5301, + "eval_samples_per_second": 28.328, + "eval_steps_per_second": 7.082, "step": 138500 - }, - { - "epoch": 4.87, - "learning_rate": 8.686864846798037e-08, - "loss": 0.2708, - "step": 138505 - }, - { - "epoch": 4.87, - "learning_rate": 8.663153689442982e-08, - "loss": 0.2577, - "step": 138510 - }, - { - "epoch": 4.87, - "learning_rate": 8.639474880454968e-08, - "loss": 0.2459, - "step": 138515 - }, - { - "epoch": 4.87, - "learning_rate": 8.615828420141526e-08, - "loss": 0.2296, - "step": 138520 - }, - { - "epoch": 4.87, - "learning_rate": 8.592214308809632e-08, - "loss": 0.2533, - "step": 138525 - }, - { - "epoch": 4.87, - "learning_rate": 8.568632546765987e-08, - "loss": 0.2483, - "step": 138530 - }, - { - "epoch": 4.87, - "learning_rate": 8.545083134316456e-08, - "loss": 0.2506, - "step": 138535 - }, - { - "epoch": 4.87, - "learning_rate": 8.521566071767462e-08, - "loss": 0.2508, - "step": 138540 - }, - { - "epoch": 4.87, - "learning_rate": 8.49808135942376e-08, - "loss": 0.2545, - "step": 138545 - }, 
- { - "epoch": 4.87, - "learning_rate": 8.474628997590661e-08, - "loss": 0.2407, - "step": 138550 - }, - { - "epoch": 4.87, - "learning_rate": 8.451208986572367e-08, - "loss": 0.2555, - "step": 138555 - }, - { - "epoch": 4.87, - "learning_rate": 8.42782132667308e-08, - "loss": 0.258, - "step": 138560 - }, - { - "epoch": 4.88, - "learning_rate": 8.404466018196722e-08, - "loss": 0.2316, - "step": 138565 - }, - { - "epoch": 4.88, - "learning_rate": 8.381143061446383e-08, - "loss": 0.2609, - "step": 138570 - }, - { - "epoch": 4.88, - "learning_rate": 8.35785245672488e-08, - "loss": 0.2565, - "step": 138575 - }, - { - "epoch": 4.88, - "learning_rate": 8.334594204334467e-08, - "loss": 0.2244, - "step": 138580 - }, - { - "epoch": 4.88, - "learning_rate": 8.311368304577128e-08, - "loss": 0.2621, - "step": 138585 - }, - { - "epoch": 4.88, - "learning_rate": 8.28817475775484e-08, - "loss": 0.2418, - "step": 138590 - }, - { - "epoch": 4.88, - "learning_rate": 8.265013564168478e-08, - "loss": 0.2513, - "step": 138595 - }, - { - "epoch": 4.88, - "learning_rate": 8.241884724118631e-08, - "loss": 0.2521, - "step": 138600 - }, - { - "epoch": 4.88, - "learning_rate": 8.218788237905895e-08, - "loss": 0.2319, - "step": 138605 - }, - { - "epoch": 4.88, - "learning_rate": 8.195724105830305e-08, - "loss": 0.2504, - "step": 138610 - }, - { - "epoch": 4.88, - "learning_rate": 8.172692328190512e-08, - "loss": 0.2492, - "step": 138615 - }, - { - "epoch": 4.88, - "learning_rate": 8.149692905286555e-08, - "loss": 0.2471, - "step": 138620 - }, - { - "epoch": 4.88, - "learning_rate": 8.126725837416527e-08, - "loss": 0.2494, - "step": 138625 - }, - { - "epoch": 4.88, - "learning_rate": 8.103791124878801e-08, - "loss": 0.2642, - "step": 138630 - }, - { - "epoch": 4.88, - "learning_rate": 8.080888767971196e-08, - "loss": 0.2405, - "step": 138635 - }, - { - "epoch": 4.88, - "learning_rate": 8.058018766990972e-08, - "loss": 0.2446, - "step": 138640 - }, - { - "epoch": 4.88, - "learning_rate": 8.035181122234837e-08, - "loss": 0.244, - "step": 138645 - }, - { - "epoch": 4.88, - "learning_rate": 8.012375834000053e-08, - "loss": 0.261, - "step": 138650 - }, - { - "epoch": 4.88, - "learning_rate": 7.989602902581938e-08, - "loss": 0.2475, - "step": 138655 - }, - { - "epoch": 4.88, - "learning_rate": 7.966862328276647e-08, - "loss": 0.2544, - "step": 138660 - }, - { - "epoch": 4.88, - "learning_rate": 7.944154111379498e-08, - "loss": 0.2701, - "step": 138665 - }, - { - "epoch": 4.88, - "learning_rate": 7.921478252184977e-08, - "loss": 0.2404, - "step": 138670 - }, - { - "epoch": 4.88, - "learning_rate": 7.898834750987849e-08, - "loss": 0.2496, - "step": 138675 - }, - { - "epoch": 4.88, - "learning_rate": 7.876223608081767e-08, - "loss": 0.2536, - "step": 138680 - }, - { - "epoch": 4.88, - "learning_rate": 7.853644823760941e-08, - "loss": 0.2337, - "step": 138685 - }, - { - "epoch": 4.88, - "learning_rate": 7.831098398317916e-08, - "loss": 0.2566, - "step": 138690 - }, - { - "epoch": 4.88, - "learning_rate": 7.80858433204551e-08, - "loss": 0.2438, - "step": 138695 - }, - { - "epoch": 4.88, - "learning_rate": 7.786102625236269e-08, - "loss": 0.2492, - "step": 138700 - }, - { - "epoch": 4.88, - "learning_rate": 7.763653278182182e-08, - "loss": 0.2476, - "step": 138705 - }, - { - "epoch": 4.88, - "learning_rate": 7.74123629117468e-08, - "loss": 0.2509, - "step": 138710 - }, - { - "epoch": 4.88, - "learning_rate": 7.718851664504645e-08, - "loss": 0.2566, - "step": 138715 - }, - { - "epoch": 4.88, - "learning_rate": 
7.696499398462953e-08, - "loss": 0.2335, - "step": 138720 - }, - { - "epoch": 4.88, - "learning_rate": 7.67417949333965e-08, - "loss": 0.2551, - "step": 138725 - }, - { - "epoch": 4.88, - "learning_rate": 7.651891949424784e-08, - "loss": 0.2579, - "step": 138730 - }, - { - "epoch": 4.88, - "learning_rate": 7.629636767007287e-08, - "loss": 0.2632, - "step": 138735 - }, - { - "epoch": 4.88, - "learning_rate": 7.607413946376651e-08, - "loss": 0.2468, - "step": 138740 - }, - { - "epoch": 4.88, - "learning_rate": 7.585223487821259e-08, - "loss": 0.2377, - "step": 138745 - }, - { - "epoch": 4.88, - "learning_rate": 7.563065391628932e-08, - "loss": 0.2611, - "step": 138750 - }, - { - "epoch": 4.88, - "learning_rate": 7.540939658087776e-08, - "loss": 0.2642, - "step": 138755 - }, - { - "epoch": 4.88, - "learning_rate": 7.518846287484782e-08, - "loss": 0.2432, - "step": 138760 - }, - { - "epoch": 4.88, - "learning_rate": 7.496785280107222e-08, - "loss": 0.2575, - "step": 138765 - }, - { - "epoch": 4.88, - "learning_rate": 7.474756636240976e-08, - "loss": 0.2462, - "step": 138770 - }, - { - "epoch": 4.88, - "learning_rate": 7.452760356172483e-08, - "loss": 0.2464, - "step": 138775 - }, - { - "epoch": 4.88, - "learning_rate": 7.430796440187349e-08, - "loss": 0.2735, - "step": 138780 - }, - { - "epoch": 4.88, - "learning_rate": 7.408864888570621e-08, - "loss": 0.2807, - "step": 138785 - }, - { - "epoch": 4.88, - "learning_rate": 7.386965701607074e-08, - "loss": 0.2442, - "step": 138790 - }, - { - "epoch": 4.88, - "learning_rate": 7.3650988795812e-08, - "loss": 0.2407, - "step": 138795 - }, - { - "epoch": 4.88, - "learning_rate": 7.343264422776663e-08, - "loss": 0.2234, - "step": 138800 - }, - { - "epoch": 4.88, - "learning_rate": 7.321462331477124e-08, - "loss": 0.2519, - "step": 138805 - }, - { - "epoch": 4.88, - "learning_rate": 7.299692605965691e-08, - "loss": 0.2525, - "step": 138810 - }, - { - "epoch": 4.88, - "learning_rate": 7.277955246524915e-08, - "loss": 0.2468, - "step": 138815 - }, - { - "epoch": 4.88, - "learning_rate": 7.256250253437347e-08, - "loss": 0.2615, - "step": 138820 - }, - { - "epoch": 4.88, - "learning_rate": 7.234577626984151e-08, - "loss": 0.2465, - "step": 138825 - }, - { - "epoch": 4.88, - "learning_rate": 7.212937367447603e-08, - "loss": 0.2319, - "step": 138830 - }, - { - "epoch": 4.88, - "learning_rate": 7.191329475108032e-08, - "loss": 0.2629, - "step": 138835 - }, - { - "epoch": 4.88, - "learning_rate": 7.169753950246605e-08, - "loss": 0.2428, - "step": 138840 - }, - { - "epoch": 4.88, - "learning_rate": 7.14821079314254e-08, - "loss": 0.2525, - "step": 138845 - }, - { - "epoch": 4.89, - "learning_rate": 7.126700004076448e-08, - "loss": 0.234, - "step": 138850 - }, - { - "epoch": 4.89, - "learning_rate": 7.10522158332727e-08, - "loss": 0.2653, - "step": 138855 - }, - { - "epoch": 4.89, - "learning_rate": 7.083775531173675e-08, - "loss": 0.2505, - "step": 138860 - }, - { - "epoch": 4.89, - "learning_rate": 7.062361847894605e-08, - "loss": 0.2414, - "step": 138865 - }, - { - "epoch": 4.89, - "learning_rate": 7.040980533767894e-08, - "loss": 0.2582, - "step": 138870 - }, - { - "epoch": 4.89, - "learning_rate": 7.019631589070818e-08, - "loss": 0.2623, - "step": 138875 - }, - { - "epoch": 4.89, - "learning_rate": 6.998315014081213e-08, - "loss": 0.2364, - "step": 138880 - }, - { - "epoch": 4.89, - "learning_rate": 6.977030809075247e-08, - "loss": 0.2384, - "step": 138885 - }, - { - "epoch": 4.89, - "learning_rate": 6.955778974329919e-08, - "loss": 0.2242, - "step": 138890 
- }, - { - "epoch": 4.89, - "learning_rate": 6.934559510120564e-08, - "loss": 0.235, - "step": 138895 - }, - { - "epoch": 4.89, - "learning_rate": 6.913372416723074e-08, - "loss": 0.2355, - "step": 138900 - }, - { - "epoch": 4.89, - "learning_rate": 6.892217694412505e-08, - "loss": 0.2448, - "step": 138905 - }, - { - "epoch": 4.89, - "learning_rate": 6.871095343463362e-08, - "loss": 0.2716, - "step": 138910 - }, - { - "epoch": 4.89, - "learning_rate": 6.850005364150147e-08, - "loss": 0.2524, - "step": 138915 - }, - { - "epoch": 4.89, - "learning_rate": 6.828947756746252e-08, - "loss": 0.2654, - "step": 138920 - }, - { - "epoch": 4.89, - "learning_rate": 6.807922521525622e-08, - "loss": 0.2407, - "step": 138925 - }, - { - "epoch": 4.89, - "learning_rate": 6.786929658761099e-08, - "loss": 0.2612, - "step": 138930 - }, - { - "epoch": 4.89, - "learning_rate": 6.765969168724962e-08, - "loss": 0.2588, - "step": 138935 - }, - { - "epoch": 4.89, - "learning_rate": 6.745041051689772e-08, - "loss": 0.2408, - "step": 138940 - }, - { - "epoch": 4.89, - "learning_rate": 6.724145307927254e-08, - "loss": 0.2578, - "step": 138945 - }, - { - "epoch": 4.89, - "learning_rate": 6.703281937708306e-08, - "loss": 0.2552, - "step": 138950 - }, - { - "epoch": 4.89, - "learning_rate": 6.682450941304374e-08, - "loss": 0.2701, - "step": 138955 - }, - { - "epoch": 4.89, - "learning_rate": 6.661652318985523e-08, - "loss": 0.2682, - "step": 138960 - }, - { - "epoch": 4.89, - "learning_rate": 6.640886071021813e-08, - "loss": 0.2446, - "step": 138965 - }, - { - "epoch": 4.89, - "learning_rate": 6.620152197683305e-08, - "loss": 0.2478, - "step": 138970 - }, - { - "epoch": 4.89, - "learning_rate": 6.599450699238674e-08, - "loss": 0.241, - "step": 138975 - }, - { - "epoch": 4.89, - "learning_rate": 6.578781575956872e-08, - "loss": 0.2433, - "step": 138980 - }, - { - "epoch": 4.89, - "learning_rate": 6.558144828106572e-08, - "loss": 0.2669, - "step": 138985 - }, - { - "epoch": 4.89, - "learning_rate": 6.537540455955616e-08, - "loss": 0.2576, - "step": 138990 - }, - { - "epoch": 4.89, - "learning_rate": 6.516968459771289e-08, - "loss": 0.2703, - "step": 138995 - }, - { - "epoch": 4.89, - "learning_rate": 6.496428839820878e-08, - "loss": 0.2401, - "step": 139000 - }, - { - "epoch": 4.89, - "eval_loss": 0.24838781356811523, - "eval_runtime": 10.5455, - "eval_samples_per_second": 9.483, - "eval_steps_per_second": 9.483, - "step": 139000 - }, - { - "epoch": 4.89, - "learning_rate": 6.475921596370838e-08, - "loss": 0.2372, - "step": 139005 - }, - { - "epoch": 4.89, - "learning_rate": 6.455446729688175e-08, - "loss": 0.2433, - "step": 139010 - }, - { - "epoch": 4.89, - "learning_rate": 6.435004240037957e-08, - "loss": 0.2662, - "step": 139015 - }, - { - "epoch": 4.89, - "learning_rate": 6.414594127685802e-08, - "loss": 0.282, - "step": 139020 - }, - { - "epoch": 4.89, - "learning_rate": 6.39421639289678e-08, - "loss": 0.2483, - "step": 139025 - }, - { - "epoch": 4.89, - "learning_rate": 6.373871035935675e-08, - "loss": 0.2787, - "step": 139030 - }, - { - "epoch": 4.89, - "learning_rate": 6.353558057066445e-08, - "loss": 0.2755, - "step": 139035 - }, - { - "epoch": 4.89, - "learning_rate": 6.333277456552766e-08, - "loss": 0.2572, - "step": 139040 - }, - { - "epoch": 4.89, - "learning_rate": 6.313029234658042e-08, - "loss": 0.242, - "step": 139045 - }, - { - "epoch": 4.89, - "learning_rate": 6.292813391645391e-08, - "loss": 0.2458, - "step": 139050 - }, - { - "epoch": 4.89, - "learning_rate": 6.272629927777108e-08, - "loss": 0.243, - 
"step": 139055 - }, - { - "epoch": 4.89, - "learning_rate": 6.252478843315479e-08, - "loss": 0.2483, - "step": 139060 - }, - { - "epoch": 4.89, - "learning_rate": 6.232360138521686e-08, - "loss": 0.2451, - "step": 139065 - }, - { - "epoch": 4.89, - "learning_rate": 6.212273813657465e-08, - "loss": 0.2657, - "step": 139070 - }, - { - "epoch": 4.89, - "learning_rate": 6.19221986898344e-08, - "loss": 0.2732, - "step": 139075 - }, - { - "epoch": 4.89, - "learning_rate": 6.17219830475968e-08, - "loss": 0.2429, - "step": 139080 - }, - { - "epoch": 4.89, - "learning_rate": 6.152209121246532e-08, - "loss": 0.2576, - "step": 139085 - }, - { - "epoch": 4.89, - "learning_rate": 6.132252318703513e-08, - "loss": 0.2451, - "step": 139090 - }, - { - "epoch": 4.89, - "learning_rate": 6.11232789738958e-08, - "loss": 0.244, - "step": 139095 - }, - { - "epoch": 4.89, - "learning_rate": 6.092435857563695e-08, - "loss": 0.2442, - "step": 139100 - }, - { - "epoch": 4.89, - "learning_rate": 6.072576199483982e-08, - "loss": 0.2445, - "step": 139105 - }, - { - "epoch": 4.89, - "learning_rate": 6.052748923408291e-08, - "loss": 0.249, - "step": 139110 - }, - { - "epoch": 4.89, - "learning_rate": 6.032954029593917e-08, - "loss": 0.2412, - "step": 139115 - }, - { - "epoch": 4.89, - "learning_rate": 6.013191518298156e-08, - "loss": 0.2595, - "step": 139120 - }, - { - "epoch": 4.89, - "learning_rate": 5.993461389777467e-08, - "loss": 0.2366, - "step": 139125 - }, - { - "epoch": 4.89, - "learning_rate": 5.973763644288033e-08, - "loss": 0.2396, - "step": 139130 - }, - { - "epoch": 4.9, - "learning_rate": 5.954098282085763e-08, - "loss": 0.2491, - "step": 139135 - }, - { - "epoch": 4.9, - "learning_rate": 5.934465303425729e-08, - "loss": 0.2339, - "step": 139140 - }, - { - "epoch": 4.9, - "learning_rate": 5.914864708563283e-08, - "loss": 0.2546, - "step": 139145 - }, - { - "epoch": 4.9, - "learning_rate": 5.895296497752389e-08, - "loss": 0.2587, - "step": 139150 - }, - { - "epoch": 4.9, - "learning_rate": 5.875760671247565e-08, - "loss": 0.252, - "step": 139155 - }, - { - "epoch": 4.9, - "learning_rate": 5.85625722930222e-08, - "loss": 0.2302, - "step": 139160 - }, - { - "epoch": 4.9, - "learning_rate": 5.8367861721697617e-08, - "loss": 0.2826, - "step": 139165 - }, - { - "epoch": 4.9, - "learning_rate": 5.817347500102765e-08, - "loss": 0.2645, - "step": 139170 - }, - { - "epoch": 4.9, - "learning_rate": 5.797941213353808e-08, - "loss": 0.2565, - "step": 139175 - }, - { - "epoch": 4.9, - "learning_rate": 5.77856731217491e-08, - "loss": 0.2418, - "step": 139180 - }, - { - "epoch": 4.9, - "learning_rate": 5.759225796817536e-08, - "loss": 0.25, - "step": 139185 - }, - { - "epoch": 4.9, - "learning_rate": 5.7399166675328745e-08, - "loss": 0.258, - "step": 139190 - }, - { - "epoch": 4.9, - "learning_rate": 5.7206399245715584e-08, - "loss": 0.2421, - "step": 139195 - }, - { - "epoch": 4.9, - "learning_rate": 5.7013955681839426e-08, - "loss": 0.2445, - "step": 139200 - }, - { - "epoch": 4.9, - "learning_rate": 5.682183598620106e-08, - "loss": 0.2674, - "step": 139205 - }, - { - "epoch": 4.9, - "learning_rate": 5.6630040161290145e-08, - "loss": 0.2495, - "step": 139210 - }, - { - "epoch": 4.9, - "learning_rate": 5.643856820960192e-08, - "loss": 0.2737, - "step": 139215 - }, - { - "epoch": 4.9, - "learning_rate": 5.62474201336205e-08, - "loss": 0.262, - "step": 139220 - }, - { - "epoch": 4.9, - "learning_rate": 5.6056595935827236e-08, - "loss": 0.244, - "step": 139225 - }, - { - "epoch": 4.9, - "learning_rate": 
5.586609561869793e-08, - "loss": 0.2663, - "step": 139230 - }, - { - "epoch": 4.9, - "learning_rate": 5.567591918471116e-08, - "loss": 0.254, - "step": 139235 - }, - { - "epoch": 4.9, - "learning_rate": 5.5486066636334374e-08, - "loss": 0.274, - "step": 139240 - }, - { - "epoch": 4.9, - "learning_rate": 5.529653797602952e-08, - "loss": 0.2628, - "step": 139245 - }, - { - "epoch": 4.9, - "learning_rate": 5.5107333206261266e-08, - "loss": 0.2555, - "step": 139250 - }, - { - "epoch": 4.9, - "learning_rate": 5.4918452329486004e-08, - "loss": 0.2574, - "step": 139255 - }, - { - "epoch": 4.9, - "learning_rate": 5.4729895348154534e-08, - "loss": 0.2486, - "step": 139260 - }, - { - "epoch": 4.9, - "learning_rate": 5.454166226471491e-08, - "loss": 0.236, - "step": 139265 - }, - { - "epoch": 4.9, - "learning_rate": 5.435375308161239e-08, - "loss": 0.2445, - "step": 139270 - }, - { - "epoch": 4.9, - "learning_rate": 5.4166167801286697e-08, - "loss": 0.2466, - "step": 139275 - }, - { - "epoch": 4.9, - "learning_rate": 5.397890642617476e-08, - "loss": 0.2489, - "step": 139280 - }, - { - "epoch": 4.9, - "learning_rate": 5.3791968958705195e-08, - "loss": 0.2564, - "step": 139285 - }, - { - "epoch": 4.9, - "learning_rate": 5.360535540130662e-08, - "loss": 0.2539, - "step": 139290 - }, - { - "epoch": 4.9, - "learning_rate": 5.34190657564021e-08, - "loss": 0.2812, - "step": 139295 - }, - { - "epoch": 4.9, - "learning_rate": 5.3233100026411906e-08, - "loss": 0.2725, - "step": 139300 - }, - { - "epoch": 4.9, - "learning_rate": 5.3047458213748015e-08, - "loss": 0.2347, - "step": 139305 - }, - { - "epoch": 4.9, - "learning_rate": 5.286214032082237e-08, - "loss": 0.2598, - "step": 139310 - }, - { - "epoch": 4.9, - "learning_rate": 5.26771463500414e-08, - "loss": 0.2549, - "step": 139315 - }, - { - "epoch": 4.9, - "learning_rate": 5.249247630380594e-08, - "loss": 0.2379, - "step": 139320 - }, - { - "epoch": 4.9, - "learning_rate": 5.230813018451686e-08, - "loss": 0.261, - "step": 139325 - }, - { - "epoch": 4.9, - "learning_rate": 5.212410799456391e-08, - "loss": 0.2443, - "step": 139330 - }, - { - "epoch": 4.9, - "learning_rate": 5.1940409736339626e-08, - "loss": 0.2517, - "step": 139335 - }, - { - "epoch": 4.9, - "learning_rate": 5.175703541222543e-08, - "loss": 0.277, - "step": 139340 - }, - { - "epoch": 4.9, - "learning_rate": 5.1573985024605534e-08, - "loss": 0.2466, - "step": 139345 - }, - { - "epoch": 4.9, - "learning_rate": 5.13912585758558e-08, - "loss": 0.2624, - "step": 139350 - }, - { - "epoch": 4.9, - "learning_rate": 5.120885606834935e-08, - "loss": 0.2759, - "step": 139355 - }, - { - "epoch": 4.9, - "learning_rate": 5.1026777504453724e-08, - "loss": 0.271, - "step": 139360 - }, - { - "epoch": 4.9, - "learning_rate": 5.08450228865337e-08, - "loss": 0.2514, - "step": 139365 - }, - { - "epoch": 4.9, - "learning_rate": 5.0663592216948496e-08, - "loss": 0.2702, - "step": 139370 - }, - { - "epoch": 4.9, - "learning_rate": 5.0482485498054566e-08, - "loss": 0.2748, - "step": 139375 - }, - { - "epoch": 4.9, - "learning_rate": 5.03017027322028e-08, - "loss": 0.2497, - "step": 139380 - }, - { - "epoch": 4.9, - "learning_rate": 5.0121243921741337e-08, - "loss": 0.2428, - "step": 139385 - }, - { - "epoch": 4.9, - "learning_rate": 4.9941109069012726e-08, - "loss": 0.2449, - "step": 139390 - }, - { - "epoch": 4.9, - "learning_rate": 4.976129817635677e-08, - "loss": 0.2438, - "step": 139395 - }, - { - "epoch": 4.9, - "learning_rate": 4.958181124610772e-08, - "loss": 0.2722, - "step": 139400 - }, - { - "epoch": 4.9, 
- "learning_rate": 4.9402648280594265e-08, - "loss": 0.2562, - "step": 139405 - }, - { - "epoch": 4.9, - "learning_rate": 4.922380928214787e-08, - "loss": 0.2715, - "step": 139410 - }, - { - "epoch": 4.91, - "learning_rate": 4.9045294253083354e-08, - "loss": 0.2519, - "step": 139415 - }, - { - "epoch": 4.91, - "learning_rate": 4.886710319572385e-08, - "loss": 0.2516, - "step": 139420 - }, - { - "epoch": 4.91, - "learning_rate": 4.868923611238141e-08, - "loss": 0.269, - "step": 139425 - }, - { - "epoch": 4.91, - "learning_rate": 4.851169300536529e-08, - "loss": 0.2386, - "step": 139430 - }, - { - "epoch": 4.91, - "learning_rate": 4.833447387698198e-08, - "loss": 0.2752, - "step": 139435 - }, - { - "epoch": 4.91, - "learning_rate": 4.815757872952964e-08, - "loss": 0.2407, - "step": 139440 - }, - { - "epoch": 4.91, - "learning_rate": 4.798100756530921e-08, - "loss": 0.2601, - "step": 139445 - }, - { - "epoch": 4.91, - "learning_rate": 4.78047603866133e-08, - "loss": 0.2495, - "step": 139450 - }, - { - "epoch": 4.91, - "learning_rate": 4.762883719572619e-08, - "loss": 0.2573, - "step": 139455 - }, - { - "epoch": 4.91, - "learning_rate": 4.745323799493495e-08, - "loss": 0.2502, - "step": 139460 - }, - { - "epoch": 4.91, - "learning_rate": 4.72779627865183e-08, - "loss": 0.2501, - "step": 139465 - }, - { - "epoch": 4.91, - "learning_rate": 4.7103011572752206e-08, - "loss": 0.2381, - "step": 139470 - }, - { - "epoch": 4.91, - "learning_rate": 4.692838435590985e-08, - "loss": 0.2508, - "step": 139475 - }, - { - "epoch": 4.91, - "learning_rate": 4.675408113825885e-08, - "loss": 0.2582, - "step": 139480 - }, - { - "epoch": 4.91, - "learning_rate": 4.6580101922058527e-08, - "loss": 0.251, - "step": 139485 - }, - { - "epoch": 4.91, - "learning_rate": 4.640644670957372e-08, - "loss": 0.2454, - "step": 139490 - }, - { - "epoch": 4.91, - "learning_rate": 4.623311550305542e-08, - "loss": 0.2563, - "step": 139495 - }, - { - "epoch": 4.91, - "learning_rate": 4.6060108304754604e-08, - "loss": 0.2511, - "step": 139500 - }, - { - "epoch": 4.91, - "eval_loss": 0.24838365614414215, - "eval_runtime": 10.5546, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, - "step": 139500 - }, - { - "epoch": 4.91, - "learning_rate": 4.5887425116916685e-08, - "loss": 0.2367, - "step": 139505 - }, - { - "epoch": 4.91, - "learning_rate": 4.5715065941787095e-08, - "loss": 0.2403, - "step": 139510 - }, - { - "epoch": 4.91, - "learning_rate": 4.554303078160016e-08, - "loss": 0.2501, - "step": 139515 - }, - { - "epoch": 4.91, - "learning_rate": 4.537131963859298e-08, - "loss": 0.2529, - "step": 139520 - }, - { - "epoch": 4.91, - "learning_rate": 4.519993251499155e-08, - "loss": 0.2343, - "step": 139525 - }, - { - "epoch": 4.91, - "learning_rate": 4.502886941302464e-08, - "loss": 0.2539, - "step": 139530 - }, - { - "epoch": 4.91, - "learning_rate": 4.485813033491271e-08, - "loss": 0.2312, - "step": 139535 - }, - { - "epoch": 4.91, - "learning_rate": 4.468771528287064e-08, - "loss": 0.2603, - "step": 139540 - }, - { - "epoch": 4.91, - "learning_rate": 4.451762425911332e-08, - "loss": 0.2469, - "step": 139545 - }, - { - "epoch": 4.91, - "learning_rate": 4.434785726584734e-08, - "loss": 0.2494, - "step": 139550 - }, - { - "epoch": 4.91, - "learning_rate": 4.417841430527925e-08, - "loss": 0.247, - "step": 139555 - }, - { - "epoch": 4.91, - "learning_rate": 4.40092953796073e-08, - "loss": 0.2504, - "step": 139560 - }, - { - "epoch": 4.91, - "learning_rate": 4.3840500491026946e-08, - "loss": 0.2637, - "step": 139565 - }, 
- { - "epoch": 4.91, - "learning_rate": 4.3672029641730896e-08, - "loss": 0.2409, - "step": 139570 - }, - { - "epoch": 4.91, - "learning_rate": 4.350388283390905e-08, - "loss": 0.2413, - "step": 139575 - }, - { - "epoch": 4.91, - "learning_rate": 4.3336060069740224e-08, - "loss": 0.2618, - "step": 139580 - }, - { - "epoch": 4.91, - "learning_rate": 4.3168561351406014e-08, - "loss": 0.2557, - "step": 139585 - }, - { - "epoch": 4.91, - "learning_rate": 4.3001386681082444e-08, - "loss": 0.2262, - "step": 139590 - }, - { - "epoch": 4.91, - "learning_rate": 4.2834536060937235e-08, - "loss": 0.2644, - "step": 139595 - }, - { - "epoch": 4.91, - "learning_rate": 4.26680094931381e-08, - "loss": 0.2536, - "step": 139600 - }, - { - "epoch": 4.91, - "learning_rate": 4.250180697984718e-08, - "loss": 0.2648, - "step": 139605 - }, - { - "epoch": 4.91, - "learning_rate": 4.233592852322388e-08, - "loss": 0.2665, - "step": 139610 - }, - { - "epoch": 4.91, - "learning_rate": 4.217037412541924e-08, - "loss": 0.2514, - "step": 139615 - }, - { - "epoch": 4.91, - "learning_rate": 4.200514378858433e-08, - "loss": 0.2558, - "step": 139620 - }, - { - "epoch": 4.91, - "learning_rate": 4.184023751486743e-08, - "loss": 0.2542, - "step": 139625 - }, - { - "epoch": 4.91, - "learning_rate": 4.167565530640571e-08, - "loss": 0.252, - "step": 139630 - }, - { - "epoch": 4.91, - "learning_rate": 4.151139716533636e-08, - "loss": 0.2377, - "step": 139635 - }, - { - "epoch": 4.91, - "learning_rate": 4.134746309379378e-08, - "loss": 0.2441, - "step": 139640 - }, - { - "epoch": 4.91, - "learning_rate": 4.118385309390682e-08, - "loss": 0.2431, - "step": 139645 - }, - { - "epoch": 4.91, - "learning_rate": 4.102056716779601e-08, - "loss": 0.2303, - "step": 139650 - }, - { - "epoch": 4.91, - "learning_rate": 4.0857605317587424e-08, - "loss": 0.2495, - "step": 139655 - }, - { - "epoch": 4.91, - "learning_rate": 4.069496754539326e-08, - "loss": 0.2484, - "step": 139660 - }, - { - "epoch": 4.91, - "learning_rate": 4.0532653853328495e-08, - "loss": 0.2516, - "step": 139665 - }, - { - "epoch": 4.91, - "learning_rate": 4.037066424349423e-08, - "loss": 0.2341, - "step": 139670 - }, - { - "epoch": 4.91, - "learning_rate": 4.020899871799988e-08, - "loss": 0.2712, - "step": 139675 - }, - { - "epoch": 4.91, - "learning_rate": 4.0047657278943774e-08, - "loss": 0.2612, - "step": 139680 - }, - { - "epoch": 4.91, - "learning_rate": 3.988663992841868e-08, - "loss": 0.2186, - "step": 139685 - }, - { - "epoch": 4.91, - "learning_rate": 3.972594666851459e-08, - "loss": 0.2565, - "step": 139690 - }, - { - "epoch": 4.91, - "learning_rate": 3.9565577501321505e-08, - "loss": 0.253, - "step": 139695 - }, - { - "epoch": 4.92, - "learning_rate": 3.940553242891831e-08, - "loss": 0.25, - "step": 139700 - }, - { - "epoch": 4.92, - "learning_rate": 3.924581145338668e-08, - "loss": 0.2526, - "step": 139705 - }, - { - "epoch": 4.92, - "learning_rate": 3.908641457679718e-08, - "loss": 0.2363, - "step": 139710 - }, - { - "epoch": 4.92, - "learning_rate": 3.892734180122037e-08, - "loss": 0.2487, - "step": 139715 - }, - { - "epoch": 4.92, - "learning_rate": 3.8768593128724054e-08, - "loss": 0.2375, - "step": 139720 - }, - { - "epoch": 4.92, - "learning_rate": 3.861016856136768e-08, - "loss": 0.2468, - "step": 139725 - }, - { - "epoch": 4.92, - "learning_rate": 3.8452068101205165e-08, - "loss": 0.2504, - "step": 139730 - }, - { - "epoch": 4.92, - "learning_rate": 3.82942917502932e-08, - "loss": 0.2475, - "step": 139735 - }, - { - "epoch": 4.92, - "learning_rate": 
3.8136839510680145e-08, - "loss": 0.2352, - "step": 139740 - }, - { - "epoch": 4.92, - "learning_rate": 3.7979711384408814e-08, - "loss": 0.2396, - "step": 139745 - }, - { - "epoch": 4.92, - "learning_rate": 3.782290737352201e-08, - "loss": 0.2344, - "step": 139750 - }, - { - "epoch": 4.92, - "learning_rate": 3.7666427480054224e-08, - "loss": 0.2612, - "step": 139755 - }, - { - "epoch": 4.92, - "learning_rate": 3.751027170603438e-08, - "loss": 0.2375, - "step": 139760 - }, - { - "epoch": 4.92, - "learning_rate": 3.73544400534942e-08, - "loss": 0.2605, - "step": 139765 - }, - { - "epoch": 4.92, - "learning_rate": 3.719893252445705e-08, - "loss": 0.2591, - "step": 139770 - }, - { - "epoch": 4.92, - "learning_rate": 3.7043749120938e-08, - "loss": 0.2505, - "step": 139775 - }, - { - "epoch": 4.92, - "learning_rate": 3.688888984495764e-08, - "loss": 0.2627, - "step": 139780 - }, - { - "epoch": 4.92, - "learning_rate": 3.673435469851994e-08, - "loss": 0.2744, - "step": 139785 - }, - { - "epoch": 4.92, - "learning_rate": 3.658014368363993e-08, - "loss": 0.2597, - "step": 139790 - }, - { - "epoch": 4.92, - "learning_rate": 3.642625680231049e-08, - "loss": 0.2467, - "step": 139795 - }, - { - "epoch": 4.92, - "learning_rate": 3.627269405653832e-08, - "loss": 0.2613, - "step": 139800 - }, - { - "epoch": 4.92, - "learning_rate": 3.6119455448310746e-08, - "loss": 0.2453, - "step": 139805 - }, - { - "epoch": 4.92, - "learning_rate": 3.59665409796206e-08, - "loss": 0.2469, - "step": 139810 - }, - { - "epoch": 4.92, - "learning_rate": 3.581395065245519e-08, - "loss": 0.2677, - "step": 139815 - }, - { - "epoch": 4.92, - "learning_rate": 3.566168446879348e-08, - "loss": 0.2571, - "step": 139820 - }, - { - "epoch": 4.92, - "learning_rate": 3.5509742430608894e-08, - "loss": 0.2689, - "step": 139825 - }, - { - "epoch": 4.92, - "learning_rate": 3.535812453988319e-08, - "loss": 0.2546, - "step": 139830 - }, - { - "epoch": 4.92, - "learning_rate": 3.52068307985759e-08, - "loss": 0.2396, - "step": 139835 - }, - { - "epoch": 4.92, - "learning_rate": 3.505586120865767e-08, - "loss": 0.2477, - "step": 139840 - }, - { - "epoch": 4.92, - "learning_rate": 3.4905215772085275e-08, - "loss": 0.2458, - "step": 139845 - }, - { - "epoch": 4.92, - "learning_rate": 3.475489449081826e-08, - "loss": 0.2482, - "step": 139850 - }, - { - "epoch": 4.92, - "learning_rate": 3.460489736680228e-08, - "loss": 0.2831, - "step": 139855 - }, - { - "epoch": 4.92, - "learning_rate": 3.445522440199134e-08, - "loss": 0.2551, - "step": 139860 - }, - { - "epoch": 4.92, - "learning_rate": 3.4305875598325545e-08, - "loss": 0.2651, - "step": 139865 - }, - { - "epoch": 4.92, - "learning_rate": 3.415685095774779e-08, - "loss": 0.2481, - "step": 139870 - }, - { - "epoch": 4.92, - "learning_rate": 3.400815048218708e-08, - "loss": 0.2391, - "step": 139875 - }, - { - "epoch": 4.92, - "learning_rate": 3.3859774173577996e-08, - "loss": 0.2464, - "step": 139880 - }, - { - "epoch": 4.92, - "learning_rate": 3.371172203384676e-08, - "loss": 0.2561, - "step": 139885 - }, - { - "epoch": 4.92, - "learning_rate": 3.356399406491684e-08, - "loss": 0.264, - "step": 139890 - }, - { - "epoch": 4.92, - "learning_rate": 3.3416590268703364e-08, - "loss": 0.2389, - "step": 139895 - }, - { - "epoch": 4.92, - "learning_rate": 3.3269510647121474e-08, - "loss": 0.2743, - "step": 139900 - }, - { - "epoch": 4.92, - "learning_rate": 3.312275520208352e-08, - "loss": 0.2377, - "step": 139905 - }, - { - "epoch": 4.92, - "learning_rate": 3.297632393549077e-08, - "loss": 0.2321, - 
"step": 139910 - }, - { - "epoch": 4.92, - "learning_rate": 3.283021684924448e-08, - "loss": 0.2727, - "step": 139915 - }, - { - "epoch": 4.92, - "learning_rate": 3.268443394524867e-08, - "loss": 0.2393, - "step": 139920 - }, - { - "epoch": 4.92, - "learning_rate": 3.2538975225387956e-08, - "loss": 0.2432, - "step": 139925 - }, - { - "epoch": 4.92, - "learning_rate": 3.2393840691555266e-08, - "loss": 0.2301, - "step": 139930 - }, - { - "epoch": 4.92, - "learning_rate": 3.22490303456352e-08, - "loss": 0.2793, - "step": 139935 - }, - { - "epoch": 4.92, - "learning_rate": 3.2104544189506816e-08, - "loss": 0.251, - "step": 139940 - }, - { - "epoch": 4.92, - "learning_rate": 3.196038222504638e-08, - "loss": 0.2762, - "step": 139945 - }, - { - "epoch": 4.92, - "learning_rate": 3.181654445412463e-08, - "loss": 0.2547, - "step": 139950 - }, - { - "epoch": 4.92, - "learning_rate": 3.1673030878612286e-08, - "loss": 0.2638, - "step": 139955 - }, - { - "epoch": 4.92, - "learning_rate": 3.1529841500368975e-08, - "loss": 0.2362, - "step": 139960 - }, - { - "epoch": 4.92, - "learning_rate": 3.138697632125709e-08, - "loss": 0.2567, - "step": 139965 - }, - { - "epoch": 4.92, - "learning_rate": 3.1244435343130706e-08, - "loss": 0.2716, - "step": 139970 - }, - { - "epoch": 4.92, - "learning_rate": 3.110221856783835e-08, - "loss": 0.2464, - "step": 139975 - }, - { - "epoch": 4.92, - "learning_rate": 3.096032599723131e-08, - "loss": 0.2455, - "step": 139980 - }, - { - "epoch": 4.93, - "learning_rate": 3.081875763314701e-08, - "loss": 0.2536, - "step": 139985 - }, - { - "epoch": 4.93, - "learning_rate": 3.067751347742842e-08, - "loss": 0.2541, - "step": 139990 - }, - { - "epoch": 4.93, - "learning_rate": 3.053659353190463e-08, - "loss": 0.2264, - "step": 139995 - }, - { - "epoch": 4.93, - "learning_rate": 3.039599779840752e-08, - "loss": 0.2472, - "step": 140000 - }, - { - "epoch": 4.93, - "eval_loss": 0.2483794391155243, - "eval_runtime": 10.5607, - "eval_samples_per_second": 9.469, - "eval_steps_per_second": 9.469, - "step": 140000 - }, - { - "epoch": 4.93, - "learning_rate": 3.025572627876339e-08, - "loss": 0.2355, - "step": 140005 - }, - { - "epoch": 4.93, - "learning_rate": 3.011577897479301e-08, - "loss": 0.2437, - "step": 140010 - }, - { - "epoch": 4.93, - "learning_rate": 2.997615588831437e-08, - "loss": 0.2383, - "step": 140015 - }, - { - "epoch": 4.93, - "learning_rate": 2.9836857021137147e-08, - "loss": 0.2328, - "step": 140020 - }, - { - "epoch": 4.93, - "learning_rate": 2.969788237507376e-08, - "loss": 0.2205, - "step": 140025 - }, - { - "epoch": 4.93, - "learning_rate": 2.9559231951928334e-08, - "loss": 0.2371, - "step": 140030 - }, - { - "epoch": 4.93, - "learning_rate": 2.942090575349943e-08, - "loss": 0.2592, - "step": 140035 - }, - { - "epoch": 4.93, - "learning_rate": 2.9282903781582828e-08, - "loss": 0.2468, - "step": 140040 - }, - { - "epoch": 4.93, - "learning_rate": 2.9145226037974317e-08, - "loss": 0.2362, - "step": 140045 - }, - { - "epoch": 4.93, - "learning_rate": 2.9007872524455805e-08, - "loss": 0.2539, - "step": 140050 - }, - { - "epoch": 4.93, - "learning_rate": 2.887084324281475e-08, - "loss": 0.2358, - "step": 140055 - }, - { - "epoch": 4.93, - "learning_rate": 2.873413819482751e-08, - "loss": 0.2481, - "step": 140060 - }, - { - "epoch": 4.93, - "learning_rate": 2.8597757382273216e-08, - "loss": 0.2433, - "step": 140065 - }, - { - "epoch": 4.93, - "learning_rate": 2.8461700806917123e-08, - "loss": 0.2358, - "step": 140070 - }, - { - "epoch": 4.93, - "learning_rate": 
2.8325968470532816e-08, - "loss": 0.2537, - "step": 140075 - }, - { - "epoch": 4.93, - "learning_rate": 2.8190560374877217e-08, - "loss": 0.2366, - "step": 140080 - }, - { - "epoch": 4.93, - "learning_rate": 2.8055476521710034e-08, - "loss": 0.2612, - "step": 140085 - }, - { - "epoch": 4.93, - "learning_rate": 2.7920716912785416e-08, - "loss": 0.2616, - "step": 140090 - }, - { - "epoch": 4.93, - "learning_rate": 2.7786281549851965e-08, - "loss": 0.2407, - "step": 140095 - }, - { - "epoch": 4.93, - "learning_rate": 2.7652170434658285e-08, - "loss": 0.253, - "step": 140100 - }, - { - "epoch": 4.93, - "learning_rate": 2.7518383568941875e-08, - "loss": 0.2263, - "step": 140105 - }, - { - "epoch": 4.93, - "learning_rate": 2.7384920954443006e-08, - "loss": 0.2594, - "step": 140110 - }, - { - "epoch": 4.93, - "learning_rate": 2.7251782592893627e-08, - "loss": 0.2574, - "step": 140115 - }, - { - "epoch": 4.93, - "learning_rate": 2.7118968486020136e-08, - "loss": 0.2437, - "step": 140120 - }, - { - "epoch": 4.93, - "learning_rate": 2.6986478635551705e-08, - "loss": 0.2568, - "step": 140125 - }, - { - "epoch": 4.93, - "learning_rate": 2.6854313043203627e-08, - "loss": 0.2295, - "step": 140130 - }, - { - "epoch": 4.93, - "learning_rate": 2.672247171069675e-08, - "loss": 0.2409, - "step": 140135 - }, - { - "epoch": 4.93, - "learning_rate": 2.6590954639738043e-08, - "loss": 0.2298, - "step": 140140 - }, - { - "epoch": 4.93, - "learning_rate": 2.645976183204002e-08, - "loss": 0.2613, - "step": 140145 - }, - { - "epoch": 4.93, - "learning_rate": 2.6328893289301326e-08, - "loss": 0.2364, - "step": 140150 - }, - { - "epoch": 4.93, - "learning_rate": 2.6198349013226154e-08, - "loss": 0.2616, - "step": 140155 - }, - { - "epoch": 4.93, - "learning_rate": 2.606812900550759e-08, - "loss": 0.252, - "step": 140160 - }, - { - "epoch": 4.93, - "learning_rate": 2.593823326783318e-08, - "loss": 0.2611, - "step": 140165 - }, - { - "epoch": 4.93, - "learning_rate": 2.580866180189323e-08, - "loss": 0.2567, - "step": 140170 - }, - { - "epoch": 4.93, - "learning_rate": 2.5679414609369734e-08, - "loss": 0.2269, - "step": 140175 - }, - { - "epoch": 4.93, - "learning_rate": 2.55504916919419e-08, - "loss": 0.2433, - "step": 140180 - }, - { - "epoch": 4.93, - "learning_rate": 2.5421893051280622e-08, - "loss": 0.2451, - "step": 140185 - }, - { - "epoch": 4.93, - "learning_rate": 2.5293618689056775e-08, - "loss": 0.2458, - "step": 140190 - }, - { - "epoch": 4.93, - "learning_rate": 2.51656686069357e-08, - "loss": 0.2379, - "step": 140195 - }, - { - "epoch": 4.93, - "learning_rate": 2.5038042806579953e-08, - "loss": 0.2551, - "step": 140200 - }, - { - "epoch": 4.93, - "learning_rate": 2.4910741289646543e-08, - "loss": 0.2582, - "step": 140205 - }, - { - "epoch": 4.93, - "learning_rate": 2.478376405778693e-08, - "loss": 0.243, - "step": 140210 - }, - { - "epoch": 4.93, - "learning_rate": 2.4657111112652566e-08, - "loss": 0.2432, - "step": 140215 - }, - { - "epoch": 4.93, - "learning_rate": 2.4530782455886582e-08, - "loss": 0.2447, - "step": 140220 - }, - { - "epoch": 4.93, - "learning_rate": 2.4404778089126555e-08, - "loss": 0.252, - "step": 140225 - }, - { - "epoch": 4.93, - "learning_rate": 2.4279098014012847e-08, - "loss": 0.2502, - "step": 140230 - }, - { - "epoch": 4.93, - "learning_rate": 2.41537422321747e-08, - "loss": 0.2246, - "step": 140235 - }, - { - "epoch": 4.93, - "learning_rate": 2.4028710745241378e-08, - "loss": 0.2442, - "step": 140240 - }, - { - "epoch": 4.93, - "learning_rate": 2.39040035548338e-08, - "loss": 
0.2282, - "step": 140245 - }, - { - "epoch": 4.93, - "learning_rate": 2.3779620662572888e-08, - "loss": 0.248, - "step": 140250 - }, - { - "epoch": 4.93, - "learning_rate": 2.36555620700768e-08, - "loss": 0.2496, - "step": 140255 - }, - { - "epoch": 4.93, - "learning_rate": 2.35318277789498e-08, - "loss": 0.2691, - "step": 140260 - }, - { - "epoch": 4.93, - "learning_rate": 2.3408417790801716e-08, - "loss": 0.2883, - "step": 140265 - }, - { - "epoch": 4.94, - "learning_rate": 2.3285332107236823e-08, - "loss": 0.2487, - "step": 140270 - }, - { - "epoch": 4.94, - "learning_rate": 2.316257072985384e-08, - "loss": 0.2364, - "step": 140275 - }, - { - "epoch": 4.94, - "learning_rate": 2.304013366024038e-08, - "loss": 0.2636, - "step": 140280 - }, - { - "epoch": 4.94, - "learning_rate": 2.29180208999924e-08, - "loss": 0.2453, - "step": 140285 - }, - { - "epoch": 4.94, - "learning_rate": 2.2796232450691956e-08, - "loss": 0.2455, - "step": 140290 - }, - { - "epoch": 4.94, - "learning_rate": 2.2674768313923898e-08, - "loss": 0.2399, - "step": 140295 - }, - { - "epoch": 4.94, - "learning_rate": 2.255362849126197e-08, - "loss": 0.2482, - "step": 140300 - }, - { - "epoch": 4.94, - "learning_rate": 2.2432812984279905e-08, - "loss": 0.2399, - "step": 140305 - }, - { - "epoch": 4.94, - "learning_rate": 2.2312321794548673e-08, - "loss": 0.2593, - "step": 140310 - }, - { - "epoch": 4.94, - "learning_rate": 2.219215492362814e-08, - "loss": 0.239, - "step": 140315 - }, - { - "epoch": 4.94, - "learning_rate": 2.2072312373083716e-08, - "loss": 0.261, - "step": 140320 - }, - { - "epoch": 4.94, - "learning_rate": 2.1952794144469713e-08, - "loss": 0.2639, - "step": 140325 - }, - { - "epoch": 4.94, - "learning_rate": 2.183360023933767e-08, - "loss": 0.2514, - "step": 140330 - }, - { - "epoch": 4.94, - "learning_rate": 2.1714730659233573e-08, - "loss": 0.2684, - "step": 140335 - }, - { - "epoch": 4.94, - "learning_rate": 2.1596185405703407e-08, - "loss": 0.2716, - "step": 140340 - }, - { - "epoch": 4.94, - "learning_rate": 2.1477964480284827e-08, - "loss": 0.2562, - "step": 140345 - }, - { - "epoch": 4.94, - "learning_rate": 2.1360067884512725e-08, - "loss": 0.248, - "step": 140350 - }, - { - "epoch": 4.94, - "learning_rate": 2.12424956199192e-08, - "loss": 0.239, - "step": 140355 - }, - { - "epoch": 4.94, - "learning_rate": 2.1125247688030812e-08, - "loss": 0.2444, - "step": 140360 - }, - { - "epoch": 4.94, - "learning_rate": 2.1008324090368568e-08, - "loss": 0.2539, - "step": 140365 - }, - { - "epoch": 4.94, - "learning_rate": 2.0891724828453476e-08, - "loss": 0.2366, - "step": 140370 - }, - { - "epoch": 4.94, - "learning_rate": 2.077544990379543e-08, - "loss": 0.2539, - "step": 140375 - }, - { - "epoch": 4.94, - "learning_rate": 2.0659499317907116e-08, - "loss": 0.2419, - "step": 140380 - }, - { - "epoch": 4.94, - "learning_rate": 2.0543873072292887e-08, - "loss": 0.2431, - "step": 140385 - }, - { - "epoch": 4.94, - "learning_rate": 2.0428571168454312e-08, - "loss": 0.2521, - "step": 140390 - }, - { - "epoch": 4.94, - "learning_rate": 2.0313593607887427e-08, - "loss": 0.2359, - "step": 140395 - }, - { - "epoch": 4.94, - "learning_rate": 2.019894039208825e-08, - "loss": 0.2645, - "step": 140400 - }, - { - "epoch": 4.94, - "learning_rate": 2.008461152254171e-08, - "loss": 0.2329, - "step": 140405 - }, - { - "epoch": 4.94, - "learning_rate": 1.9970607000735497e-08, - "loss": 0.2538, - "step": 140410 - }, - { - "epoch": 4.94, - "learning_rate": 1.9856926828148993e-08, - "loss": 0.2494, - "step": 140415 - }, - { - 
"epoch": 4.94, - "learning_rate": 1.9743571006256014e-08, - "loss": 0.2461, - "step": 140420 - }, - { - "epoch": 4.94, - "learning_rate": 1.963053953653038e-08, - "loss": 0.2636, - "step": 140425 - }, - { - "epoch": 4.94, - "learning_rate": 1.9517832420440363e-08, - "loss": 0.25, - "step": 140430 - }, - { - "epoch": 4.94, - "learning_rate": 1.940544965944868e-08, - "loss": 0.2498, - "step": 140435 - }, - { - "epoch": 4.94, - "learning_rate": 1.929339125501528e-08, - "loss": 0.2806, - "step": 140440 - }, - { - "epoch": 4.94, - "learning_rate": 1.9181657208591775e-08, - "loss": 0.2557, - "step": 140445 - }, - { - "epoch": 4.94, - "learning_rate": 1.9070247521632556e-08, - "loss": 0.2491, - "step": 140450 - }, - { - "epoch": 4.94, - "learning_rate": 1.895916219558369e-08, - "loss": 0.2463, - "step": 140455 - }, - { - "epoch": 4.94, - "learning_rate": 1.8848401231885692e-08, - "loss": 0.2464, - "step": 140460 - }, - { - "epoch": 4.94, - "learning_rate": 1.8737964631979078e-08, - "loss": 0.2438, - "step": 140465 - }, - { - "epoch": 4.94, - "learning_rate": 1.8627852397298808e-08, - "loss": 0.2542, - "step": 140470 - }, - { - "epoch": 4.94, - "learning_rate": 1.851806452926874e-08, - "loss": 0.2692, - "step": 140475 - }, - { - "epoch": 4.94, - "learning_rate": 1.840860102932107e-08, - "loss": 0.2289, - "step": 140480 - }, - { - "epoch": 4.94, - "learning_rate": 1.829946189887133e-08, - "loss": 0.2403, - "step": 140485 - }, - { - "epoch": 4.94, - "learning_rate": 1.8190647139343376e-08, - "loss": 0.2488, - "step": 140490 - }, - { - "epoch": 4.94, - "learning_rate": 1.8082156752141642e-08, - "loss": 0.2613, - "step": 140495 - }, - { - "epoch": 4.94, - "learning_rate": 1.7973990738684443e-08, - "loss": 0.2542, - "step": 140500 - }, - { - "epoch": 4.94, - "eval_loss": 0.24837003648281097, - "eval_runtime": 10.5523, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 9.477, - "step": 140500 - }, - { - "epoch": 4.94, - "learning_rate": 1.786614910036788e-08, - "loss": 0.2355, - "step": 140505 - }, - { - "epoch": 4.94, - "learning_rate": 1.7758631838596385e-08, - "loss": 0.2509, - "step": 140510 - }, - { - "epoch": 4.94, - "learning_rate": 1.7651438954763287e-08, - "loss": 0.2399, - "step": 140515 - }, - { - "epoch": 4.94, - "learning_rate": 1.7544570450264697e-08, - "loss": 0.2539, - "step": 140520 - }, - { - "epoch": 4.94, - "learning_rate": 1.7438026326485612e-08, - "loss": 0.262, - "step": 140525 - }, - { - "epoch": 4.94, - "learning_rate": 1.7331806584808263e-08, - "loss": 0.234, - "step": 140530 - }, - { - "epoch": 4.94, - "learning_rate": 1.7225911226614877e-08, - "loss": 0.2576, - "step": 140535 - }, - { - "epoch": 4.94, - "learning_rate": 1.712034025327658e-08, - "loss": 0.2385, - "step": 140540 - }, - { - "epoch": 4.94, - "learning_rate": 1.701509366616727e-08, - "loss": 0.2688, - "step": 140545 - }, - { - "epoch": 4.94, - "learning_rate": 1.69101714666553e-08, - "loss": 0.247, - "step": 140550 - }, - { - "epoch": 4.95, - "learning_rate": 1.6805573656095142e-08, - "loss": 0.2424, - "step": 140555 - }, - { - "epoch": 4.95, - "learning_rate": 1.6701300235855144e-08, - "loss": 0.2788, - "step": 140560 - }, - { - "epoch": 4.95, - "learning_rate": 1.6597351207281452e-08, - "loss": 0.2521, - "step": 140565 - }, - { - "epoch": 4.95, - "learning_rate": 1.6493726571725766e-08, - "loss": 0.2425, - "step": 140570 - }, - { - "epoch": 4.95, - "learning_rate": 1.6390426330534226e-08, - "loss": 0.2437, - "step": 140575 - }, - { - "epoch": 4.95, - "learning_rate": 1.6287450485050205e-08, - "loss": 
0.2269, - "step": 140580 - }, - { - "epoch": 4.95, - "learning_rate": 1.6184799036608745e-08, - "loss": 0.2454, - "step": 140585 - }, - { - "epoch": 4.95, - "learning_rate": 1.6082471986542115e-08, - "loss": 0.2661, - "step": 140590 - }, - { - "epoch": 4.95, - "learning_rate": 1.5980469336179805e-08, - "loss": 0.2495, - "step": 140595 - }, - { - "epoch": 4.95, - "learning_rate": 1.5878791086845758e-08, - "loss": 0.2439, - "step": 140600 - }, - { - "epoch": 4.95, - "learning_rate": 1.5777437239858363e-08, - "loss": 0.248, - "step": 140605 - }, - { - "epoch": 4.95, - "learning_rate": 1.5676407796538784e-08, - "loss": 0.2583, - "step": 140610 - }, - { - "epoch": 4.95, - "learning_rate": 1.557570275819431e-08, - "loss": 0.2516, - "step": 140615 - }, - { - "epoch": 4.95, - "learning_rate": 1.5475322126135006e-08, - "loss": 0.2551, - "step": 140620 - }, - { - "epoch": 4.95, - "learning_rate": 1.5375265901662604e-08, - "loss": 0.2522, - "step": 140625 - }, - { - "epoch": 4.95, - "learning_rate": 1.527553408607607e-08, - "loss": 0.2352, - "step": 140630 - }, - { - "epoch": 4.95, - "learning_rate": 1.5176126680671587e-08, - "loss": 0.2735, - "step": 140635 - }, - { - "epoch": 4.95, - "learning_rate": 1.5077043686739788e-08, - "loss": 0.2506, - "step": 140640 - }, - { - "epoch": 4.95, - "learning_rate": 1.4978285105565758e-08, - "loss": 0.2519, - "step": 140645 - }, - { - "epoch": 4.95, - "learning_rate": 1.487985093843458e-08, - "loss": 0.2482, - "step": 140650 - }, - { - "epoch": 4.95, - "learning_rate": 1.478174118662301e-08, - "loss": 0.2525, - "step": 140655 - }, - { - "epoch": 4.95, - "learning_rate": 1.4683955851402254e-08, - "loss": 0.2385, - "step": 140660 - }, - { - "epoch": 4.95, - "learning_rate": 1.458649493404629e-08, - "loss": 0.2423, - "step": 140665 - }, - { - "epoch": 4.95, - "learning_rate": 1.4489358435818e-08, - "loss": 0.2522, - "step": 140670 - }, - { - "epoch": 4.95, - "learning_rate": 1.4392546357980263e-08, - "loss": 0.2519, - "step": 140675 - }, - { - "epoch": 4.95, - "learning_rate": 1.429605870178763e-08, - "loss": 0.2423, - "step": 140680 - }, - { - "epoch": 4.95, - "learning_rate": 1.4199895468497426e-08, - "loss": 0.2499, - "step": 140685 - }, - { - "epoch": 4.95, - "learning_rate": 1.4104056659353104e-08, - "loss": 0.2446, - "step": 140690 - }, - { - "epoch": 4.95, - "learning_rate": 1.400854227560089e-08, - "loss": 0.2484, - "step": 140695 - }, - { - "epoch": 4.95, - "learning_rate": 1.3913352318481453e-08, - "loss": 0.2521, - "step": 140700 - }, - { - "epoch": 4.95, - "learning_rate": 1.3818486789232699e-08, - "loss": 0.2474, - "step": 140705 - }, - { - "epoch": 4.95, - "learning_rate": 1.372394568908142e-08, - "loss": 0.2445, - "step": 140710 - }, - { - "epoch": 4.95, - "learning_rate": 1.3629729019259962e-08, - "loss": 0.251, - "step": 140715 - }, - { - "epoch": 4.95, - "learning_rate": 1.3535836780989575e-08, - "loss": 0.2597, - "step": 140720 - }, - { - "epoch": 4.95, - "learning_rate": 1.3442268975488726e-08, - "loss": 0.25, - "step": 140725 - }, - { - "epoch": 4.95, - "learning_rate": 1.3349025603975884e-08, - "loss": 0.2393, - "step": 140730 - }, - { - "epoch": 4.95, - "learning_rate": 1.3256106667655644e-08, - "loss": 0.2513, - "step": 140735 - }, - { - "epoch": 4.95, - "learning_rate": 1.3163512167740922e-08, - "loss": 0.2491, - "step": 140740 - }, - { - "epoch": 4.95, - "learning_rate": 1.3071242105427983e-08, - "loss": 0.2626, - "step": 140745 - }, - { - "epoch": 4.95, - "learning_rate": 1.2979296481921422e-08, - "loss": 0.2549, - "step": 140750 - }, - { 
- "epoch": 4.95, - "learning_rate": 1.2887675298409175e-08, - "loss": 0.2489, - "step": 140755 - }, - { - "epoch": 4.95, - "learning_rate": 1.2796378556084732e-08, - "loss": 0.2622, - "step": 140760 - }, - { - "epoch": 4.95, - "learning_rate": 1.270540625613048e-08, - "loss": 0.2323, - "step": 140765 - }, - { - "epoch": 4.95, - "learning_rate": 1.2614758399731585e-08, - "loss": 0.2562, - "step": 140770 - }, - { - "epoch": 4.95, - "learning_rate": 1.2524434988062106e-08, - "loss": 0.2434, - "step": 140775 - }, - { - "epoch": 4.95, - "learning_rate": 1.2434436022293327e-08, - "loss": 0.2521, - "step": 140780 - }, - { - "epoch": 4.95, - "learning_rate": 1.2344761503599312e-08, - "loss": 0.2525, - "step": 140785 - }, - { - "epoch": 4.95, - "learning_rate": 1.2255411433137464e-08, - "loss": 0.2441, - "step": 140790 - }, - { - "epoch": 4.95, - "learning_rate": 1.2166385812073522e-08, - "loss": 0.2572, - "step": 140795 - }, - { - "epoch": 4.95, - "learning_rate": 1.2077684641562114e-08, - "loss": 0.2391, - "step": 140800 - }, - { - "epoch": 4.95, - "learning_rate": 1.1989307922752324e-08, - "loss": 0.2428, - "step": 140805 - }, - { - "epoch": 4.95, - "learning_rate": 1.1901255656796006e-08, - "loss": 0.2312, - "step": 140810 - }, - { - "epoch": 4.95, - "learning_rate": 1.1813527844833916e-08, - "loss": 0.2521, - "step": 140815 - }, - { - "epoch": 4.95, - "learning_rate": 1.1726124488004032e-08, - "loss": 0.2576, - "step": 140820 - }, - { - "epoch": 4.95, - "learning_rate": 1.1639045587441554e-08, - "loss": 0.2525, - "step": 140825 - }, - { - "epoch": 4.95, - "learning_rate": 1.1552291144278915e-08, - "loss": 0.2638, - "step": 140830 - }, - { - "epoch": 4.95, - "learning_rate": 1.1465861159642988e-08, - "loss": 0.2554, - "step": 140835 - }, - { - "epoch": 4.96, - "learning_rate": 1.1379755634655099e-08, - "loss": 0.2675, - "step": 140840 - }, - { - "epoch": 4.96, - "learning_rate": 1.1293974570431021e-08, - "loss": 0.2464, - "step": 140845 - }, - { - "epoch": 4.96, - "learning_rate": 1.120851796808653e-08, - "loss": 0.2727, - "step": 140850 - }, - { - "epoch": 4.96, - "learning_rate": 1.1123385828734622e-08, - "loss": 0.2349, - "step": 140855 - }, - { - "epoch": 4.96, - "learning_rate": 1.103857815347442e-08, - "loss": 0.257, - "step": 140860 - }, - { - "epoch": 4.96, - "learning_rate": 1.0954094943410598e-08, - "loss": 0.2598, - "step": 140865 - }, - { - "epoch": 4.96, - "learning_rate": 1.086993619963672e-08, - "loss": 0.2654, - "step": 140870 - }, - { - "epoch": 4.96, - "learning_rate": 1.0786101923251912e-08, - "loss": 0.2598, - "step": 140875 - }, - { - "epoch": 4.96, - "learning_rate": 1.0702592115338639e-08, - "loss": 0.2388, - "step": 140880 - }, - { - "epoch": 4.96, - "learning_rate": 1.061940677698492e-08, - "loss": 0.2646, - "step": 140885 - }, - { - "epoch": 4.96, - "learning_rate": 1.0536545909270446e-08, - "loss": 0.2582, - "step": 140890 - }, - { - "epoch": 4.96, - "learning_rate": 1.0454009513269359e-08, - "loss": 0.2524, - "step": 140895 - }, - { - "epoch": 4.96, - "learning_rate": 1.0371797590053023e-08, - "loss": 0.2725, - "step": 140900 - }, - { - "epoch": 4.96, - "learning_rate": 1.0289910140692804e-08, - "loss": 0.226, - "step": 140905 - }, - { - "epoch": 4.96, - "learning_rate": 1.020834716624619e-08, - "loss": 0.2708, - "step": 140910 - }, - { - "epoch": 4.96, - "learning_rate": 1.0127108667778994e-08, - "loss": 0.2469, - "step": 140915 - }, - { - "epoch": 4.96, - "learning_rate": 1.0046194646340379e-08, - "loss": 0.2362, - "step": 140920 - }, - { - "epoch": 4.96, - 
"learning_rate": 9.965605102982278e-09, - "loss": 0.2453, - "step": 140925 - }, - { - "epoch": 4.96, - "learning_rate": 9.885340038751079e-09, - "loss": 0.2572, - "step": 140930 - }, - { - "epoch": 4.96, - "learning_rate": 9.805399454690389e-09, - "loss": 0.2479, - "step": 140935 - }, - { - "epoch": 4.96, - "learning_rate": 9.725783351838268e-09, - "loss": 0.2517, - "step": 140940 - }, - { - "epoch": 4.96, - "learning_rate": 9.646491731227225e-09, - "loss": 0.247, - "step": 140945 - }, - { - "epoch": 4.96, - "learning_rate": 9.567524593886989e-09, - "loss": 0.2489, - "step": 140950 - }, - { - "epoch": 4.96, - "learning_rate": 9.488881940844518e-09, - "loss": 0.257, - "step": 140955 - }, - { - "epoch": 4.96, - "learning_rate": 9.410563773118442e-09, - "loss": 0.2491, - "step": 140960 - }, - { - "epoch": 4.96, - "learning_rate": 9.332570091724612e-09, - "loss": 0.2267, - "step": 140965 - }, - { - "epoch": 4.96, - "learning_rate": 9.254900897681662e-09, - "loss": 0.2503, - "step": 140970 - }, - { - "epoch": 4.96, - "learning_rate": 9.177556191991565e-09, - "loss": 0.2487, - "step": 140975 - }, - { - "epoch": 4.96, - "learning_rate": 9.100535975661851e-09, - "loss": 0.2439, - "step": 140980 - }, - { - "epoch": 4.96, - "learning_rate": 9.023840249694492e-09, - "loss": 0.2705, - "step": 140985 - }, - { - "epoch": 4.96, - "learning_rate": 8.947469015080367e-09, - "loss": 0.2442, - "step": 140990 - }, - { - "epoch": 4.96, - "learning_rate": 8.871422272813124e-09, - "loss": 0.2463, - "step": 140995 - }, - { - "epoch": 4.96, - "learning_rate": 8.795700023883634e-09, - "loss": 0.2905, - "step": 141000 - }, - { - "epoch": 4.96, - "eval_loss": 0.24837106466293335, - "eval_runtime": 10.5397, - "eval_samples_per_second": 9.488, - "eval_steps_per_second": 9.488, - "step": 141000 - }, - { - "epoch": 4.96, - "learning_rate": 8.720302269268899e-09, - "loss": 0.2621, - "step": 141005 - }, - { - "epoch": 4.96, - "learning_rate": 8.645229009954236e-09, - "loss": 0.2356, - "step": 141010 - }, - { - "epoch": 4.96, - "learning_rate": 8.570480246908318e-09, - "loss": 0.253, - "step": 141015 - }, - { - "epoch": 4.96, - "learning_rate": 8.496055981105367e-09, - "loss": 0.2589, - "step": 141020 - }, - { - "epoch": 4.96, - "learning_rate": 8.421956213514049e-09, - "loss": 0.2399, - "step": 141025 - }, - { - "epoch": 4.96, - "learning_rate": 8.348180945091932e-09, - "loss": 0.2517, - "step": 141030 - }, - { - "epoch": 4.96, - "learning_rate": 8.274730176796586e-09, - "loss": 0.2442, - "step": 141035 - }, - { - "epoch": 4.96, - "learning_rate": 8.201603909585576e-09, - "loss": 0.2488, - "step": 141040 - }, - { - "epoch": 4.96, - "learning_rate": 8.128802144405367e-09, - "loss": 0.261, - "step": 141045 - }, - { - "epoch": 4.96, - "learning_rate": 8.056324882202425e-09, - "loss": 0.2444, - "step": 141050 - }, - { - "epoch": 4.96, - "learning_rate": 7.984172123920441e-09, - "loss": 0.2682, - "step": 141055 - }, - { - "epoch": 4.96, - "learning_rate": 7.912343870492001e-09, - "loss": 0.2582, - "step": 141060 - }, - { - "epoch": 4.96, - "learning_rate": 7.84084012284969e-09, - "loss": 0.2334, - "step": 141065 - }, - { - "epoch": 4.96, - "learning_rate": 7.7696608819261e-09, - "loss": 0.268, - "step": 141070 - }, - { - "epoch": 4.96, - "learning_rate": 7.698806148639937e-09, - "loss": 0.262, - "step": 141075 - }, - { - "epoch": 4.96, - "learning_rate": 7.628275923918238e-09, - "loss": 0.2562, - "step": 141080 - }, - { - "epoch": 4.96, - "learning_rate": 7.558070208671385e-09, - "loss": 0.2562, - "step": 141085 - }, - { - 
"epoch": 4.96, - "learning_rate": 7.488189003809765e-09, - "loss": 0.2381, - "step": 141090 - }, - { - "epoch": 4.96, - "learning_rate": 7.4186323102465315e-09, - "loss": 0.2648, - "step": 141095 - }, - { - "epoch": 4.96, - "learning_rate": 7.3494001288809676e-09, - "loss": 0.2472, - "step": 141100 - }, - { - "epoch": 4.96, - "learning_rate": 7.2804924606095785e-09, - "loss": 0.2379, - "step": 141105 - }, - { - "epoch": 4.96, - "learning_rate": 7.211909306334419e-09, - "loss": 0.2699, - "step": 141110 - }, - { - "epoch": 4.96, - "learning_rate": 7.1436506669381175e-09, - "loss": 0.2614, - "step": 141115 - }, - { - "epoch": 4.96, - "learning_rate": 7.0757165433144035e-09, - "loss": 0.2625, - "step": 141120 - }, - { - "epoch": 4.97, - "learning_rate": 7.008106936337577e-09, - "loss": 0.2355, - "step": 141125 - }, - { - "epoch": 4.97, - "learning_rate": 6.940821846893042e-09, - "loss": 0.2697, - "step": 141130 - }, - { - "epoch": 4.97, - "learning_rate": 6.873861275849547e-09, - "loss": 0.2454, - "step": 141135 - }, - { - "epoch": 4.97, - "learning_rate": 6.807225224078617e-09, - "loss": 0.2598, - "step": 141140 - }, - { - "epoch": 4.97, - "learning_rate": 6.7409136924434516e-09, - "loss": 0.2537, - "step": 141145 - }, - { - "epoch": 4.97, - "learning_rate": 6.674926681804472e-09, - "loss": 0.2552, - "step": 141150 - }, - { - "epoch": 4.97, - "learning_rate": 6.609264193024878e-09, - "loss": 0.2453, - "step": 141155 - }, - { - "epoch": 4.97, - "learning_rate": 6.543926226948438e-09, - "loss": 0.2443, - "step": 141160 - }, - { - "epoch": 4.97, - "learning_rate": 6.478912784430025e-09, - "loss": 0.241, - "step": 141165 - }, - { - "epoch": 4.97, - "learning_rate": 6.414223866313407e-09, - "loss": 0.2486, - "step": 141170 - }, - { - "epoch": 4.97, - "learning_rate": 6.349859473434028e-09, - "loss": 0.2404, - "step": 141175 - }, - { - "epoch": 4.97, - "learning_rate": 6.285819606630105e-09, - "loss": 0.2474, - "step": 141180 - }, - { - "epoch": 4.97, - "learning_rate": 6.2221042667343075e-09, - "loss": 0.2452, - "step": 141185 - }, - { - "epoch": 4.97, - "learning_rate": 6.15871345457375e-09, - "loss": 0.2747, - "step": 141190 - }, - { - "epoch": 4.97, - "learning_rate": 6.0956471709699985e-09, - "loss": 0.2486, - "step": 141195 - }, - { - "epoch": 4.97, - "learning_rate": 6.032905416741841e-09, - "loss": 0.2398, - "step": 141200 - }, - { - "epoch": 4.97, - "learning_rate": 5.970488192705293e-09, - "loss": 0.2424, - "step": 141205 - }, - { - "epoch": 4.97, - "learning_rate": 5.908395499670815e-09, - "loss": 0.2566, - "step": 141210 - }, - { - "epoch": 4.97, - "learning_rate": 5.846627338443322e-09, - "loss": 0.2596, - "step": 141215 - }, - { - "epoch": 4.97, - "learning_rate": 5.7851837098249486e-09, - "loss": 0.2413, - "step": 141220 - }, - { - "epoch": 4.97, - "learning_rate": 5.724064614615054e-09, - "loss": 0.2422, - "step": 141225 - }, - { - "epoch": 4.97, - "learning_rate": 5.6632700536074505e-09, - "loss": 0.2516, - "step": 141230 - }, - { - "epoch": 4.97, - "learning_rate": 5.602800027587618e-09, - "loss": 0.2625, - "step": 141235 - }, - { - "epoch": 4.97, - "learning_rate": 5.542654537346592e-09, - "loss": 0.2523, - "step": 141240 - }, - { - "epoch": 4.97, - "learning_rate": 5.482833583661529e-09, - "loss": 0.2497, - "step": 141245 - }, - { - "epoch": 4.97, - "learning_rate": 5.423337167309583e-09, - "loss": 0.2619, - "step": 141250 - }, - { - "epoch": 4.97, - "learning_rate": 5.364165289062362e-09, - "loss": 0.253, - "step": 141255 - }, - { - "epoch": 4.97, - "learning_rate": 
5.305317949691468e-09, - "loss": 0.2374, - "step": 141260 - }, - { - "epoch": 4.97, - "learning_rate": 5.246795149957407e-09, - "loss": 0.2619, - "step": 141265 - }, - { - "epoch": 4.97, - "learning_rate": 5.188596890620678e-09, - "loss": 0.2507, - "step": 141270 - }, - { - "epoch": 4.97, - "learning_rate": 5.130723172439011e-09, - "loss": 0.2566, - "step": 141275 - }, - { - "epoch": 4.97, - "learning_rate": 5.073173996161806e-09, - "loss": 0.2527, - "step": 141280 - }, - { - "epoch": 4.97, - "learning_rate": 5.015949362538464e-09, - "loss": 0.2586, - "step": 141285 - }, - { - "epoch": 4.97, - "learning_rate": 4.959049272310057e-09, - "loss": 0.254, - "step": 141290 - }, - { - "epoch": 4.97, - "learning_rate": 4.90247372621766e-09, - "loss": 0.2495, - "step": 141295 - }, - { - "epoch": 4.97, - "learning_rate": 4.846222724991245e-09, - "loss": 0.2651, - "step": 141300 - }, - { - "epoch": 4.97, - "learning_rate": 4.790296269366334e-09, - "loss": 0.2469, - "step": 141305 - }, - { - "epoch": 4.97, - "learning_rate": 4.734694360067349e-09, - "loss": 0.2607, - "step": 141310 - }, - { - "epoch": 4.97, - "learning_rate": 4.679416997813158e-09, - "loss": 0.2666, - "step": 141315 - }, - { - "epoch": 4.97, - "learning_rate": 4.624464183325406e-09, - "loss": 0.2676, - "step": 141320 - }, - { - "epoch": 4.97, - "learning_rate": 4.569835917317411e-09, - "loss": 0.2595, - "step": 141325 - }, - { - "epoch": 4.97, - "learning_rate": 4.5155322004969415e-09, - "loss": 0.2523, - "step": 141330 - }, - { - "epoch": 4.97, - "learning_rate": 4.461553033571764e-09, - "loss": 0.2547, - "step": 141335 - }, - { - "epoch": 4.97, - "learning_rate": 4.407898417238543e-09, - "loss": 0.2426, - "step": 141340 - }, - { - "epoch": 4.97, - "learning_rate": 4.354568352193944e-09, - "loss": 0.25, - "step": 141345 - }, - { - "epoch": 4.97, - "learning_rate": 4.301562839134632e-09, - "loss": 0.2483, - "step": 141350 - }, - { - "epoch": 4.97, - "learning_rate": 4.248881878748945e-09, - "loss": 0.2567, - "step": 141355 - }, - { - "epoch": 4.97, - "learning_rate": 4.196525471714119e-09, - "loss": 0.2535, - "step": 141360 - }, - { - "epoch": 4.97, - "learning_rate": 4.1444936187184925e-09, - "loss": 0.2507, - "step": 141365 - }, - { - "epoch": 4.97, - "learning_rate": 4.092786320430974e-09, - "loss": 0.2381, - "step": 141370 - }, - { - "epoch": 4.97, - "learning_rate": 4.041403577528802e-09, - "loss": 0.2421, - "step": 141375 - }, - { - "epoch": 4.97, - "learning_rate": 3.9903453906725565e-09, - "loss": 0.2582, - "step": 141380 - }, - { - "epoch": 4.97, - "learning_rate": 3.939611760531148e-09, - "loss": 0.2553, - "step": 141385 - }, - { - "epoch": 4.97, - "learning_rate": 3.889202687759607e-09, - "loss": 0.2348, - "step": 141390 - }, - { - "epoch": 4.97, - "learning_rate": 3.839118173015743e-09, - "loss": 0.2817, - "step": 141395 - }, - { - "epoch": 4.97, - "learning_rate": 3.789358216946259e-09, - "loss": 0.2468, - "step": 141400 - }, - { - "epoch": 4.98, - "learning_rate": 3.7399228201978606e-09, - "loss": 0.258, - "step": 141405 - }, - { - "epoch": 4.98, - "learning_rate": 3.6908119834144772e-09, - "loss": 0.2321, - "step": 141410 - }, - { - "epoch": 4.98, - "learning_rate": 3.642025707231711e-09, - "loss": 0.2494, - "step": 141415 - }, - { - "epoch": 4.98, - "learning_rate": 3.593563992285165e-09, - "loss": 0.2506, - "step": 141420 - }, - { - "epoch": 4.98, - "learning_rate": 3.5454268392048906e-09, - "loss": 0.2526, - "step": 141425 - }, - { - "epoch": 4.98, - "learning_rate": 3.4976142486098374e-09, - "loss": 0.2732, - "step": 
141430 - }, - { - "epoch": 4.98, - "learning_rate": 3.450126221127281e-09, - "loss": 0.224, - "step": 141435 - }, - { - "epoch": 4.98, - "learning_rate": 3.40296275737062e-09, - "loss": 0.2584, - "step": 141440 - }, - { - "epoch": 4.98, - "learning_rate": 3.356123857953253e-09, - "loss": 0.2479, - "step": 141445 - }, - { - "epoch": 4.98, - "learning_rate": 3.3096095234858016e-09, - "loss": 0.2733, - "step": 141450 - }, - { - "epoch": 4.98, - "learning_rate": 3.2634197545650115e-09, - "loss": 0.2542, - "step": 141455 - }, - { - "epoch": 4.98, - "learning_rate": 3.217554551798729e-09, - "loss": 0.2575, - "step": 141460 - }, - { - "epoch": 4.98, - "learning_rate": 3.1720139157781493e-09, - "loss": 0.2401, - "step": 141465 - }, - { - "epoch": 4.98, - "learning_rate": 3.126797847094465e-09, - "loss": 0.2382, - "step": 141470 - }, - { - "epoch": 4.98, - "learning_rate": 3.0819063463360943e-09, - "loss": 0.259, - "step": 141475 - }, - { - "epoch": 4.98, - "learning_rate": 3.0373394140859048e-09, - "loss": 0.2539, - "step": 141480 - }, - { - "epoch": 4.98, - "learning_rate": 2.9930970509239874e-09, - "loss": 0.243, - "step": 141485 - }, - { - "epoch": 4.98, - "learning_rate": 2.9491792574221077e-09, - "loss": 0.2471, - "step": 141490 - }, - { - "epoch": 4.98, - "learning_rate": 2.9055860341492546e-09, - "loss": 0.2529, - "step": 141495 - }, - { - "epoch": 4.98, - "learning_rate": 2.8623173816744175e-09, - "loss": 0.2396, - "step": 141500 - }, - { - "epoch": 4.98, - "eval_loss": 0.2483615279197693, - "eval_runtime": 10.5543, - "eval_samples_per_second": 9.475, - "eval_steps_per_second": 9.475, - "step": 141500 - }, - { - "epoch": 4.98, - "learning_rate": 2.8193733005610346e-09, - "loss": 0.2468, - "step": 141505 - }, - { - "epoch": 4.98, - "learning_rate": 2.7767537913614418e-09, - "loss": 0.259, - "step": 141510 - }, - { - "epoch": 4.98, - "learning_rate": 2.734458854633526e-09, - "loss": 0.2438, - "step": 141515 - }, - { - "epoch": 4.98, - "learning_rate": 2.692488490924072e-09, - "loss": 0.2723, - "step": 141520 - }, - { - "epoch": 4.98, - "learning_rate": 2.6508427007798654e-09, - "loss": 0.2448, - "step": 141525 - }, - { - "epoch": 4.98, - "learning_rate": 2.6095214847393635e-09, - "loss": 0.2505, - "step": 141530 - }, - { - "epoch": 4.98, - "learning_rate": 2.568524843341025e-09, - "loss": 0.2619, - "step": 141535 - }, - { - "epoch": 4.98, - "learning_rate": 2.5278527771177562e-09, - "loss": 0.2537, - "step": 141540 - }, - { - "epoch": 4.98, - "learning_rate": 2.4875052865941386e-09, - "loss": 0.264, - "step": 141545 - }, - { - "epoch": 4.98, - "learning_rate": 2.447482372297527e-09, - "loss": 0.2323, - "step": 141550 - }, - { - "epoch": 4.98, - "learning_rate": 2.4077840347441762e-09, - "loss": 0.2491, - "step": 141555 - }, - { - "epoch": 4.98, - "learning_rate": 2.36841027445589e-09, - "loss": 0.2279, - "step": 141560 - }, - { - "epoch": 4.98, - "learning_rate": 2.329361091935045e-09, - "loss": 0.2559, - "step": 141565 - }, - { - "epoch": 4.98, - "learning_rate": 2.290636487697895e-09, - "loss": 0.2693, - "step": 141570 - }, - { - "epoch": 4.98, - "learning_rate": 2.2522364622384885e-09, - "loss": 0.2513, - "step": 141575 - }, - { - "epoch": 4.98, - "learning_rate": 2.2141610160619773e-09, - "loss": 0.26, - "step": 141580 - }, - { - "epoch": 4.98, - "learning_rate": 2.176410149659636e-09, - "loss": 0.2768, - "step": 141585 - }, - { - "epoch": 4.98, - "learning_rate": 2.1389838635227366e-09, - "loss": 0.2432, - "step": 141590 - }, - { - "epoch": 4.98, - "learning_rate": 
2.101882158137003e-09, - "loss": 0.2819, - "step": 141595 - }, - { - "epoch": 4.98, - "learning_rate": 2.0651050339853815e-09, - "loss": 0.2579, - "step": 141600 - }, - { - "epoch": 4.98, - "learning_rate": 2.028652491542493e-09, - "loss": 0.247, - "step": 141605 - }, - { - "epoch": 4.98, - "learning_rate": 1.9925245312829577e-09, - "loss": 0.2499, - "step": 141610 - }, - { - "epoch": 4.98, - "learning_rate": 1.95672115367862e-09, - "loss": 0.2774, - "step": 141615 - }, - { - "epoch": 4.98, - "learning_rate": 1.921242359190223e-09, - "loss": 0.2289, - "step": 141620 - }, - { - "epoch": 4.98, - "learning_rate": 1.886088148278509e-09, - "loss": 0.2339, - "step": 141625 - }, - { - "epoch": 4.98, - "learning_rate": 1.85125852140422e-09, - "loss": 0.251, - "step": 141630 - }, - { - "epoch": 4.98, - "learning_rate": 1.8167534790169971e-09, - "loss": 0.2504, - "step": 141635 - }, - { - "epoch": 4.98, - "learning_rate": 1.782573021563705e-09, - "loss": 0.2926, - "step": 141640 - }, - { - "epoch": 4.98, - "learning_rate": 1.7487171494884325e-09, - "loss": 0.242, - "step": 141645 - }, - { - "epoch": 4.98, - "learning_rate": 1.7151858632324935e-09, - "loss": 0.2579, - "step": 141650 - }, - { - "epoch": 4.98, - "learning_rate": 1.6819791632288751e-09, - "loss": 0.2323, - "step": 141655 - }, - { - "epoch": 4.98, - "learning_rate": 1.6490970499133395e-09, - "loss": 0.2269, - "step": 141660 - }, - { - "epoch": 4.98, - "learning_rate": 1.6165395237077718e-09, - "loss": 0.2234, - "step": 141665 - }, - { - "epoch": 4.98, - "learning_rate": 1.5843065850368321e-09, - "loss": 0.2495, - "step": 141670 - }, - { - "epoch": 4.98, - "learning_rate": 1.5523982343196297e-09, - "loss": 0.2542, - "step": 141675 - }, - { - "epoch": 4.98, - "learning_rate": 1.5208144719697227e-09, - "loss": 0.2523, - "step": 141680 - }, - { - "epoch": 4.98, - "learning_rate": 1.4895552983978934e-09, - "loss": 0.2379, - "step": 141685 - }, - { - "epoch": 4.99, - "learning_rate": 1.4586207140093733e-09, - "loss": 0.2469, - "step": 141690 - }, - { - "epoch": 4.99, - "learning_rate": 1.4280107192066184e-09, - "loss": 0.2736, - "step": 141695 - }, - { - "epoch": 4.99, - "learning_rate": 1.3977253143837577e-09, - "loss": 0.2477, - "step": 141700 - }, - { - "epoch": 4.99, - "learning_rate": 1.3677644999404716e-09, - "loss": 0.2599, - "step": 141705 - }, - { - "epoch": 4.99, - "learning_rate": 1.3381282762597868e-09, - "loss": 0.2482, - "step": 141710 - }, - { - "epoch": 4.99, - "learning_rate": 1.3088166437275062e-09, - "loss": 0.2591, - "step": 141715 - }, - { - "epoch": 4.99, - "learning_rate": 1.2798296027294321e-09, - "loss": 0.264, - "step": 141720 - }, - { - "epoch": 4.99, - "learning_rate": 1.2511671536347136e-09, - "loss": 0.2271, - "step": 141725 - }, - { - "epoch": 4.99, - "learning_rate": 1.2228292968208266e-09, - "loss": 0.2488, - "step": 141730 - }, - { - "epoch": 4.99, - "learning_rate": 1.1948160326513692e-09, - "loss": 0.2753, - "step": 141735 - }, - { - "epoch": 4.99, - "learning_rate": 1.1671273614927147e-09, - "loss": 0.2398, - "step": 141740 - }, - { - "epoch": 4.99, - "learning_rate": 1.139763283705686e-09, - "loss": 0.246, - "step": 141745 - }, - { - "epoch": 4.99, - "learning_rate": 1.1127237996427786e-09, - "loss": 0.257, - "step": 141750 - }, - { - "epoch": 4.99, - "learning_rate": 1.0860089096564884e-09, - "loss": 0.2507, - "step": 141755 - }, - { - "epoch": 4.99, - "learning_rate": 1.0596186140937602e-09, - "loss": 0.2598, - "step": 141760 - }, - { - "epoch": 4.99, - "learning_rate": 1.0335529132987631e-09, - "loss": 
-      "loss": 0.2502,
-      "step": 141765
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 1.007811807604564e-09,
-      "loss": 0.2493,
-      "step": 141770
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 9.823952973525563e-10,
-      "loss": 0.2634,
-      "step": 141775
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 9.573033828674805e-10,
-      "loss": 0.2628,
-      "step": 141780
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 9.325360644768522e-10,
-      "loss": 0.247,
-      "step": 141785
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 9.080933425026361e-10,
-      "loss": 0.2553,
-      "step": 141790
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 8.839752172612459e-10,
-      "loss": 0.2516,
-      "step": 141795
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 8.601816890663195e-10,
-      "loss": 0.2546,
-      "step": 141800
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 8.367127582259438e-10,
-      "loss": 0.2592,
-      "step": 141805
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 8.135684250482056e-10,
-      "loss": 0.2581,
-      "step": 141810
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 7.907486898300898e-10,
-      "loss": 0.251,
-      "step": 141815
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 7.682535528685808e-10,
-      "loss": 0.2412,
-      "step": 141820
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 7.460830144551123e-10,
-      "loss": 0.2584,
-      "step": 141825
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 7.242370748811178e-10,
-      "loss": 0.2558,
-      "step": 141830
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 7.027157344241531e-10,
-      "loss": 0.2496,
-      "step": 141835
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 6.815189933701005e-10,
-      "loss": 0.267,
-      "step": 141840
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 6.606468519909648e-10,
-      "loss": 0.2709,
-      "step": 141845
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 6.400993105587505e-10,
-      "loss": 0.2453,
-      "step": 141850
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 6.198763693399112e-10,
-      "loss": 0.2638,
-      "step": 141855
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 5.999780285953493e-10,
-      "loss": 0.2354,
-      "step": 141860
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 5.804042885859673e-10,
-      "loss": 0.2624,
-      "step": 141865
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 5.611551495643408e-10,
-      "loss": 0.2541,
-      "step": 141870
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 5.4223061178027e-10,
-      "loss": 0.2372,
-      "step": 141875
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 5.236306754807796e-10,
-      "loss": 0.2381,
-      "step": 141880
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 5.053553409045675e-10,
-      "loss": 0.2495,
-      "step": 141885
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 4.874046082931072e-10,
-      "loss": 0.2311,
-      "step": 141890
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 4.6977847787677e-10,
-      "loss": 0.2347,
-      "step": 141895
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 4.524769498859271e-10,
-      "loss": 0.248,
-      "step": 141900
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 4.3550002454539886e-10,
-      "loss": 0.2674,
-      "step": 141905
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 4.1884770207167855e-10,
-      "loss": 0.2447,
-      "step": 141910
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 4.025199826868109e-10,
-      "loss": 0.2451,
-      "step": 141915
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 3.865168665989627e-10,
-      "loss": 0.2551,
-      "step": 141920
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 3.708383540135252e-10,
-      "loss": 0.2753,
-      "step": 141925
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 3.5548444514144075e-10,
-      "loss": 0.2452,
-      "step": 141930
-    },
-    {
-      "epoch": 4.99,
-      "learning_rate": 3.4045514017699845e-10,
-      "loss": 0.2607,
-      "step": 141935
-    },
-    {
-      "epoch": 4.99,
"learning_rate": 3.2575043931448723e-10, - "loss": 0.2621, - "step": 141940 - }, - { - "epoch": 4.99, - "learning_rate": 3.113703427481962e-10, - "loss": 0.2502, - "step": 141945 - }, - { - "epoch": 4.99, - "learning_rate": 2.973148506640877e-10, - "loss": 0.2524, - "step": 141950 - }, - { - "epoch": 4.99, - "learning_rate": 2.8358396324257295e-10, - "loss": 0.2224, - "step": 141955 - }, - { - "epoch": 4.99, - "learning_rate": 2.7017768066406323e-10, - "loss": 0.2429, - "step": 141960 - }, - { - "epoch": 4.99, - "learning_rate": 2.5709600310341864e-10, - "loss": 0.2337, - "step": 141965 - }, - { - "epoch": 4.99, - "learning_rate": 2.443389307271726e-10, - "loss": 0.2236, - "step": 141970 - }, - { - "epoch": 5.0, - "learning_rate": 2.3190646370185865e-10, - "loss": 0.2481, - "step": 141975 - }, - { - "epoch": 5.0, - "learning_rate": 2.197986021912346e-10, - "loss": 0.2354, - "step": 141980 - }, - { - "epoch": 5.0, - "learning_rate": 2.0801534635073172e-10, - "loss": 0.2381, - "step": 141985 - }, - { - "epoch": 5.0, - "learning_rate": 1.9655669633300567e-10, - "loss": 0.2499, - "step": 141990 - }, - { - "epoch": 5.0, - "learning_rate": 1.8542265228793653e-10, - "loss": 0.2834, - "step": 141995 - }, - { - "epoch": 5.0, - "learning_rate": 1.7461321435985333e-10, - "loss": 0.2573, - "step": 142000 - }, - { - "epoch": 5.0, - "eval_loss": 0.24838173389434814, - "eval_runtime": 10.5549, - "eval_samples_per_second": 9.474, - "eval_steps_per_second": 9.474, - "step": 142000 } ], "logging_steps": 5, - "max_steps": 142115, + "max_steps": 138735, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, - "total_flos": 4.632594192202221e+19, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.156594211931134e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null